aligner.h

Go to the documentation of this file.
00001 /*
00002  * String aligner for the Phonetic Distance Package
00003  * + 
00004  * + April 2006 pcomas@lsi.upc.edu
00005  *
00006  */
00007 #ifndef _aligner_h
00008 #define _aligner_h
00009 
00010 #include <iostream>
00011 #include <fstream>
00012 #include <sstream>
00013 #include <map>
00014 #include <set>
00015 #include <string>
00016 #include <math.h>
00017 #include "phd.h"
00018 
00019 #define GLOBAL    1  // The aligner produces global alignments
00020 #define SEMILOCAL 2  // The aligner produces semilocal alignments
00021 #define LOCAL     3  // The aligner produces local alignments
00022 
00023 
00024 using namespace std;
00025 
00026 
00027 
00028 template<typename T=int> class aligner{
00029 
00030 private:
00031     phd<T>* sc;
00032     T score;
00033     int debug;
00034     static inline T max(const T& a, const T& b){ return a > b ? a : b; }
00035 
00036 
00037 public:
00038 
00039   struct alin {
00040     T score;   // score of this alignment
00041     double scoren; // score normalized
00042     double context;
00043     int Psubstitutions;
00044     int Pinserts;
00045     int Pdeletions;
00046     int Wsubstitutions;
00047     int Winserts;
00048     int Wdeletions;
00049     unsigned int kword; //The number of keyword involved in this alignment
00050     int begin; // index in b[] where the alignment starts
00051     int end;   // index in b[] where the alignment ends
00052     int beginW; //number of word in b[] where the alignment starts (each ' ' changes the word)
00053     int endW;   //number of word in b[] where the alignment ends
00054     char* seg; // Segment of b[] with the letters
00055     char* a;   // Alignment of a
00056     char* b;   // Alignment of b
00057     bool good; // Marks if this alignment is selected as relevant
00058 
00059     ~alin(){
00060       delete[](a);
00061       delete[](b);
00062       delete[](seg);
00063     }
00064 
00065     alin(T _score, int _begin, int _end, int _beginW, int _endW, int _substitutions, int _inserts, int _deletions, char* _a, char* _b) :
00066       score(_score), Psubstitutions(_substitutions), Pinserts(_inserts), Pdeletions(_deletions), begin(_begin), end(_end), beginW(_beginW), endW(_endW), seg(NULL), a(_a), b(_b) {}
00067   };
00068 
00069 
00070   aligner(const string fname, int const _debug = 0){
00071     sc = new phd<T>(fname);
00072     debug=_debug;
00073     // if(debug>3){ sc->show(cerr);}
00074 
00075 
00076   } //constructor
00077 
00078   ~aligner(){
00079     delete(sc);
00080   }
00081 
00082 
00083   /*
00084    * Alinia la cadena A contra la cadena B. Conceptualment es considera que A és la query
00085    */
00086   alin* align(const char* a, const int tj, const char* b, const int ti, const int mode = SEMILOCAL){
00087     // a is the short string with length tj
00088     // b is the long string with length ti
00089     // The algorithm searchs for the best match of a against b in a semi-local or global point of view.
00090     // Usage of global matching is discouraged for the sake of coherence :-) because it doesn't try
00091     // to keep the letters together so 'a' will be meaningless scattered through all 'b'
00092 
00093     /*
00094     //  Build the align matrix
00095     */
00096     int const W = ti+1;
00097 
00098     if( ti==0 || tj == 0 ){ return new alin(0,0,0,0,0,0,0,0,0,0); }
00099     int i,j;
00100     int* m = new int[W*(tj+1)+ti+1]; //int m[tj+1][ti+1];
00101 
00102     // Variables for alignment reconstruction
00103     char* answerA = new char[tj+ti];
00104     char* answerB = new char[tj+ti];
00105     int pA = 0;
00106     int pB = 0;
00107     int insertions=0, deletions=0, substitutions=0;
00108     int spacesA=0;
00109 
00110 
00111     // INITIALIZATION STEP
00112 
00113     //þ Decide the number of word that each character belongs to
00114     int nwords=0;
00115     for(int i=0; i<ti; i++){ 
00116       if( b[i] == ' ' || b[i] == '_' ){
00117         // word change at _i_
00118         nwords++;
00119       }
00120     }
00121 
00122     int* words = new int[nwords];
00123     j=0;
00124     for(int i=0; i<ti; i++){ 
00125       if( b[i] == ' ' || b[i] == '_' ){
00126         words[j++] = i;
00127       }
00128     }
00129 
00130     switch(mode){
00131     case GLOBAL:  //Start values are skips
00132       m[0]=0;
00133       for(int j=1; j<=tj; j++){ m[j*W] = m[(j-1)*W] + sc->dSkip(a[j-1]); }
00134       for(int i=1; i<=ti; i++){ m[i]   = m[i-1]     + sc->dSkip(b[i-1]); }
00135       break;
00136     default: //Start values are 0
00137       for(int j=0;j<=tj;j++){ m[j*W] = 0; }
00138       for(int i=1;i<=ti;i++){ m[i] = 0; }
00139       break;
00140     }
00141 
00142 
00143     /*
00144     // Throw the scoring process
00145     */
00146     int i1, i2, i3, i4, i5, indexInit, indexEnd, bestJ, bestI;
00147     indexEnd  = 0;
00148     indexInit = 0;
00149     int initWord  = 0;
00150     int endWord   = 0;
00151     score = -100000000;
00152     bestJ = tj;
00153     bestI = ti;
00154 
00155     // FILL THE FIRST COLUMN & ROW
00156     for(j=1;j<=tj;j++){
00157         m[j*W+1] = max(  m[W*(j-1)]+sc->dSub(a[j-1],b[0]) , m[W*(j-1)+1]+sc->dSkip(a[j-1]) );
00158         if(a[j-1]==' '|| a[j-1]=='_' ){spacesA++;}
00159         if(score < m[W*j+1]){
00160           score = m[W*j+1];
00161           bestJ = j;
00162           bestI = 0;
00163         }
00164     }
00165     for(i=1;i<=ti;i++){
00166         m[ W+i ] = max(  m[ i-1 ]+sc->dSub(a[0],b[i-1])  ,  m[ W+i-1 ]+sc->dSkip(b[i-1])  );
00167         if(score < m[W+i]){
00168           score = m[W+i];
00169           bestJ = 0;
00170           bestI = i;
00171         }
00172     }
00173 
00174     //FILL THE REST OF THE MATRIX
00175     //int step = ti/4;
00176     int lowLimit = 0;
00177     int upperLimit = 0;
00178     int threshold = -10000000;
00179     if( mode == LOCAL ){ threshold = 0; }
00180 
00181     for(j=2;j<=tj;j++){
00182       //if( mode==GLOBAL && tj/ti> 0.75){
00183       //        // If the scoring is GLOBAL we do not fill the entire matrix
00184       //        lowLimit   = max(  2 , j-step );
00185       //        upperLimit = min( ti , j+step );
00186       //        m[W*j+lowLimit+1] = 0;
00187       //      } else {
00188         lowLimit   = 2;
00189         upperLimit = ti;
00190       //      }
00191 
00192       for(i=lowLimit;i<=upperLimit;i++){
00193         i1 = m[W*(j-1)+i-1] + sc->dSub(a[j-1],b[i-1]);
00194         i2 = m[W*j+i-1] + sc->dSkip(b[i-1]);
00195         i3 = m[W*(j-1)+i-2] + sc->dExp(a[j-1],b[i-2],b[i-1]);
00196         i4 = m[W*(j-1)+i] + sc->dSkip(a[j-1]);
00197         i5 = m[W*(j-2)+i-1] + sc->dExp(b[i-1],a[j-2],a[j-1]);
00198         m[W*j+i] = max(threshold,max(i1,max(i1,max(i2,max(i3,max(i4,i5))))));
00199 
00200         if(score < m[W*j+i]){
00201           score = m[W*j+i];
00202           bestJ = j;
00203           bestI = i;
00204         }
00205 
00206       }
00207     }
00208 
00209     //Prints the full score matrix if needed
00210     /*if(debug>5){
00211       cerr << endl << "\t";
00212       for(i=0;i<ti;i++){ cerr << "\t" << b[i]; }
00213       cerr << endl << "\t";
00214       for(j=0;j<=tj;j++){
00215         if(j>0) cerr << a[j-1] << "\t";
00216         for(i=0;i<=ti;i++){
00217           cerr << m[W*j+i] << "\t";
00218         }
00219         cerr << endl;
00220       }
00221     }*/
00222 
00223 
00224     /*
00225     || Reconstruction Step
00226     */
00227     bool final = true;    //Exit condition
00228     int lastScore = score;
00229     int steps = 0; //Number of steps uset to align 'a' with 'b'
00230     int lastChar = 0;
00231 
00232 
00233     int ni=0;
00234     int mymax=0;
00235 
00236     switch(mode){
00237     case SEMILOCAL:
00238       /* In semi-local alignment, the best-path reconstructions doesn't start from the 
00239        * botom-right corner of the matrix but from the best-scoring cell among last
00240        * column and last row scores.
00241        * In this implementation only the last row is taken in account since it is 
00242        * the "long" string
00243        */
00244       for(i=0;i<=ti;i++){
00245         if( mymax < m[W*tj+i] ){
00246           mymax = m[W*tj+i];
00247           ni=i;
00248         }
00249       }
00250       score = mymax;
00251       for(i=ti-1;i>=ni;i--){ // Carry on half of the string for debugging purpouses
00252         answerA[pA++]= '-';
00253         answerB[pB++]= b[i];
00254         insertions++;
00255       }
00256       i=ni; j=tj;
00257       break;
00258     case GLOBAL: // now this is global alignment
00259       i = ti; j = tj;
00260       break;
00261     case LOCAL: 
00262       /* In LOCAL alignment, the best-path reconstruction starts with at the best scoring 
00263        * position in the whole matrix.
00264        */
00265       j = bestJ;
00266       i = bestI;
00267       break;
00268     }
00269 
00270 
00271     while( final ){
00272 
00273       /*if(debug>5){
00274         cerr << "Looking for " << m[ W*j +i] << " in ("<<j<<","<<i<<")"<<endl;
00275         cerr << "m[j-1][i-1]=" << m[ W*(j-1) +i-1] << ": dSub=" << sc->dSub(a[j-1],b[i-1]) << endl;
00276         cerr << "m[j][i-1]=" <<   m[ W*j +i-1] << ": dSkipB=" << sc->dSkip(b[i-1]) << endl;
00277         cerr << "m[j-1][i]=" <<   m[ W*(j-1) +i] << ": dSkipA=" << sc->dSkip(a[j-1]) << endl;
00278         if(j>1){ cerr << "m[j-1][i-2]=" << m[ W*(j-1) +i-2] << ": dExp=" << sc->dExp(a[j-1],b[i-2],b[i-1]) << endl; }
00279         if(i>1){ cerr << "m[j-2][i-1]=" << m[ W*(j-2) +i-1] << ": dExp=" << sc->dExp(b[i-1],a[j-2],a[j-1]) << endl; }
00280       }*/
00281 
00282 
00283       if( j>0 && i>0 && m[W*j+i] ==  m[W*(j-1)+i-1] + sc->dSub(a[j-1],b[i-1]) ){
00284         //Aliniar b[i] amb a[j]
00285         lastScore = sc->dSub(a[j-1],b[i-1]);
00286         i--; j--;
00287         //cerr << "Sub a[" << j << "]=" << a[j] << " b[" << i << "]=" << b[i] << " @ " << pA << endl;
00288         answerA[pA++] = a[j];
00289         answerB[pB++] = b[i];
00290         indexInit= i;
00291         indexEnd = max(i,indexEnd);
00292         if( a[j]!=b[i] ){ substitutions++;  }
00293         steps++;
00294         lastChar = steps;
00295 
00296       } else if ( i>0 && j>0 && m[W*j+i] == m[W*j+i-1] + sc->dSkip(b[i-1]) ) {
00297         lastScore = sc->dSkip(b[i-1]);
00298         i--;
00299         answerA[pA++] = '-';
00300         answerB[pB++] = b[i];
00301         insertions++;
00302         if(steps==0 && b[i]==' ' && b[i]=='_' ){
00303           score = m[W*j+i];
00304         } else {
00305           steps++;
00306         }
00307 
00308       } else if( j>1 && i>0 && m[W*j+i] ==  m[W*(j-2)+i-1] + sc->dExp(b[i-1],a[j-2],a[j-1]) ){
00309         lastScore =  sc->dExp(b[i-1],a[j-2],a[j-1]);
00310         i--; j-=2;
00311         answerA[pA++] = a[j];
00312         answerA[pA++] = a[j+1];
00313         answerB[pB++] = b[i];
00314         answerB[pB++] = '+';
00315         indexInit = i;
00316         indexEnd = max(i,indexEnd);
00317         deletions++;
00318         if( a[j]!=b[i] ){ substitutions++; }
00319         steps+=2;
00320         lastChar = steps;
00321 
00322       } else if( j>0 && i>0 && m[W*j+i] ==  m[W*(j-1)+i] + sc->dSkip(a[j-1]) ){
00323         lastScore = sc->dSkip(a[j-1]);
00324         j--;
00325         answerA[pA++] = a[j];
00326         answerB[pB++] = '-';
00327         indexInit = i;
00328         indexEnd = max(i,indexEnd);
00329         deletions++;
00330         steps++;
00331         lastChar = steps;
00332 
00333       } else if ( i>1 && j>0 && m[W*j+i] == m[ W*(j-1) +i-2 ] + sc->dExp(a[j-1],b[i-2],b[i-1]) ) {
00334         lastScore = sc->dExp(a[j-1],b[i-2],b[i-1]);
00335         j--; i--;
00336         answerA[pA++] = a[j];
00337         answerA[pA++] = '+';
00338         answerB[pB++] = b[i];
00339         answerB[pB++] = b[i-1];
00340         indexInit = i;
00341         indexEnd = max(i,indexEnd);
00342         insertions++;
00343         if( a[j]!=b[i] ){ substitutions++; }
00344         i--;
00345         steps+=2;
00346         lastChar = steps;
00347 
00348       } else if ( j==0 ){
00349         i--;
00350         answerA[pA++] = '-';
00351         answerB[pB++] = b[i];
00352         insertions++;
00353         if(steps!=0 || b[i]!=' ' || b[i]!='_' ) steps++;
00354 
00355       } else if ( i==0 ){
00356         j--;
00357         answerA[pA++] = a[j];
00358         answerB[pB++] = '-';
00359         deletions++;
00360 
00361         if(steps!=0 || b[i]!=' ' || b[i]!='_' ) steps++;
00362         lastChar = steps;
00363         
00364       } else {
00365         cerr << "BOINK! Error at "<< j << "," << i << endl;
00366         break;
00367       }
00368       
00369       // This is the termination condition
00370       switch(mode){
00371         case SEMILOCAL:
00372         final = j!=0;
00373         if(!final){
00374           i--;
00375           while(i>=0){
00376             answerA[pA++] = '-';
00377             answerB[pB++] = b[i--];
00378             insertions++;
00379           }
00380         }
00381         break;
00382 
00383         case GLOBAL:
00384         final = i!=0 || j!=0;
00385         break;
00386 
00387         case LOCAL:
00388         final = m[W*j+i]!=0;
00389         break;
00390       }
00391     }
00392 
00393 
00394     // Computing of the score
00395     // Score= avg.points/phoneme
00396     steps = lastChar;
00397 
00398     switch(mode){
00399     case GLOBAL:
00400       // average of similarity per phonem (without spaces?)
00401       score /= (pA-spacesA);
00402       break;
00403     case SEMILOCAL:
00404       //if(debug>2) cerr << "     Score = " << score << " Last Score = " << lastScore << " steps = " << steps <<  endl;
00405       //score = (score-lastScore) / steps;
00406       score = score / steps;
00407       break;
00408     case LOCAL:
00409       score = score / steps;
00410       break;
00411     }
00412 
00413     // Print the alignment
00414     char* newA = new char[pA+1];
00415     char* newB = new char[pB+1];
00416     newA[pA]=0;
00417     newB[pB]=0;
00418     --pA;
00419     --pB;
00420 
00421     for(int i=0; pA>=0; ++i, --pA){
00422       newA[i] = answerA[pA];
00423     }
00424     for(int i=0; pB>=0; ++i, --pB){
00425       newB[i] = answerB[pB];
00426     }
00427 
00428     delete[](answerA);
00429     delete[](answerB);
00430     delete[] m;
00431 
00432     // Finds in wich word does indexInit and indexEnd fall
00433     for(int i=0; i<nwords; i++){ 
00434       if( words[i] == indexInit ){
00435         initWord = i+1;
00436         break;
00437       } else if( words[i] < indexInit ) {
00438         initWord = i+1;
00439       } else if( words[i] > indexInit ) {
00440         initWord = i;
00441         break;
00442       }
00443 
00444     }
00445 
00446     for(int i=0; i<nwords; i++){ 
00447       if( words[i] == indexEnd ){
00448         endWord = i;
00449         break;
00450       } else if( words[i] < indexEnd ) {
00451         endWord = i+1;
00452       } else if( words[i] > indexEnd ) {
00453         endWord = i;
00454         break;
00455       }
00456 
00457     }
00458 
00459     delete[] words;
00460 
00461     return new alin(score,indexInit,indexEnd,initWord,endWord,substitutions,insertions,deletions,newA,newB);
00462 
00463   }
00464 
00465 };
00466 
00467 
00468 #endif
Generated on Tue Jul 27 16:29:25 2010 for FreeLing by  doxygen 1.6.3