00001
00002
00003
00004
00005
00006
00007 #ifndef _aligner_h
00008 #define _aligner_h
00009
00010 #include <iostream>
00011 #include <fstream>
00012 #include <sstream>
00013 #include <map>
00014 #include <set>
00015 #include <string>
00016 #include <math.h>
00017 #include "phd.h"
00018
// Alignment modes accepted by aligner::align() through its `mode` argument.
00019 #define GLOBAL 1 // The aligner produces global alignments
00020 #define SEMILOCAL 2 // The aligner produces semilocal alignments
00021 #define LOCAL 3 // The aligner produces local alignments
00022
00023
// NOTE(review): a using-directive at header scope is visible in every file
// that includes this header.
00024 using namespace std;
00025
00026
00027
00028 template<typename T=int> class aligner{
00029
00030 private:
00031 phd<T>* sc;
00032 T score;
00033 int debug;
00034 static inline T max(const T& a, const T& b){ return a > b ? a : b; }
00035
00036
00037 public:
00038
00039 struct alin {
00040 T score;
00041 double scoren;
00042 double context;
00043 int Psubstitutions;
00044 int Pinserts;
00045 int Pdeletions;
00046 int Wsubstitutions;
00047 int Winserts;
00048 int Wdeletions;
00049 unsigned int kword;
00050 int begin;
00051 int end;
00052 int beginW;
00053 int endW;
00054 char* seg;
00055 char* a;
00056 char* b;
00057 bool good;
00058
00059 ~alin(){
00060 delete[](a);
00061 delete[](b);
00062 delete[](seg);
00063 }
00064
00065 alin(T _score, int _begin, int _end, int _beginW, int _endW, int _substitutions, int _inserts, int _deletions, char* _a, char* _b) :
00066 score(_score), Psubstitutions(_substitutions), Pinserts(_inserts), Pdeletions(_deletions), begin(_begin), end(_end), beginW(_beginW), endW(_endW), seg(NULL), a(_a), b(_b) {}
00067 };
00068
00069
00070 aligner(const string fname, int const _debug = 0){
00071 sc = new phd<T>(fname);
00072 debug=_debug;
00073
00074
00075
00076 }
00077
00078 ~aligner(){
00079 delete(sc);
00080 }
00081
00082
00083
00084
00085
00086 alin* align(const char* a, const int tj, const char* b, const int ti, const int mode = SEMILOCAL){
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096 int const W = ti+1;
00097
00098 if( ti==0 || tj == 0 ){ return new alin(0,0,0,0,0,0,0,0,0,0); }
00099 int i,j;
00100 int* m = new int[W*(tj+1)+ti+1];
00101
00102
00103 char* answerA = new char[tj+ti];
00104 char* answerB = new char[tj+ti];
00105 int pA = 0;
00106 int pB = 0;
00107 int insertions=0, deletions=0, substitutions=0;
00108 int spacesA=0;
00109
00110
00111
00112
00113
00114 int nwords=0;
00115 for(int i=0; i<ti; i++){
00116 if( b[i] == ' ' || b[i] == '_' ){
00117
00118 nwords++;
00119 }
00120 }
00121
00122 int* words = new int[nwords];
00123 j=0;
00124 for(int i=0; i<ti; i++){
00125 if( b[i] == ' ' || b[i] == '_' ){
00126 words[j++] = i;
00127 }
00128 }
00129
00130 switch(mode){
00131 case GLOBAL:
00132 m[0]=0;
00133 for(int j=1; j<=tj; j++){ m[j*W] = m[(j-1)*W] + sc->dSkip(a[j-1]); }
00134 for(int i=1; i<=ti; i++){ m[i] = m[i-1] + sc->dSkip(b[i-1]); }
00135 break;
00136 default:
00137 for(int j=0;j<=tj;j++){ m[j*W] = 0; }
00138 for(int i=1;i<=ti;i++){ m[i] = 0; }
00139 break;
00140 }
00141
00142
00143
00144
00145
00146 int i1, i2, i3, i4, i5, indexInit, indexEnd, bestJ, bestI;
00147 indexEnd = 0;
00148 indexInit = 0;
00149 int initWord = 0;
00150 int endWord = 0;
00151 score = -100000000;
00152 bestJ = tj;
00153 bestI = ti;
00154
00155
00156 for(j=1;j<=tj;j++){
00157 m[j*W+1] = max( m[W*(j-1)]+sc->dSub(a[j-1],b[0]) , m[W*(j-1)+1]+sc->dSkip(a[j-1]) );
00158 if(a[j-1]==' '|| a[j-1]=='_' ){spacesA++;}
00159 if(score < m[W*j+1]){
00160 score = m[W*j+1];
00161 bestJ = j;
00162 bestI = 0;
00163 }
00164 }
00165 for(i=1;i<=ti;i++){
00166 m[ W+i ] = max( m[ i-1 ]+sc->dSub(a[0],b[i-1]) , m[ W+i-1 ]+sc->dSkip(b[i-1]) );
00167 if(score < m[W+i]){
00168 score = m[W+i];
00169 bestJ = 0;
00170 bestI = i;
00171 }
00172 }
00173
00174
00175
00176 int lowLimit = 0;
00177 int upperLimit = 0;
00178 int threshold = -10000000;
00179 if( mode == LOCAL ){ threshold = 0; }
00180
00181 for(j=2;j<=tj;j++){
00182
00183
00184
00185
00186
00187
00188 lowLimit = 2;
00189 upperLimit = ti;
00190
00191
00192 for(i=lowLimit;i<=upperLimit;i++){
00193 i1 = m[W*(j-1)+i-1] + sc->dSub(a[j-1],b[i-1]);
00194 i2 = m[W*j+i-1] + sc->dSkip(b[i-1]);
00195 i3 = m[W*(j-1)+i-2] + sc->dExp(a[j-1],b[i-2],b[i-1]);
00196 i4 = m[W*(j-1)+i] + sc->dSkip(a[j-1]);
00197 i5 = m[W*(j-2)+i-1] + sc->dExp(b[i-1],a[j-2],a[j-1]);
00198 m[W*j+i] = max(threshold,max(i1,max(i1,max(i2,max(i3,max(i4,i5))))));
00199
00200 if(score < m[W*j+i]){
00201 score = m[W*j+i];
00202 bestJ = j;
00203 bestI = i;
00204 }
00205
00206 }
00207 }
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227 bool final = true;
00228 int lastScore = score;
00229 int steps = 0;
00230 int lastChar = 0;
00231
00232
00233 int ni=0;
00234 int mymax=0;
00235
00236 switch(mode){
00237 case SEMILOCAL:
00238
00239
00240
00241
00242
00243
00244 for(i=0;i<=ti;i++){
00245 if( mymax < m[W*tj+i] ){
00246 mymax = m[W*tj+i];
00247 ni=i;
00248 }
00249 }
00250 score = mymax;
00251 for(i=ti-1;i>=ni;i--){
00252 answerA[pA++]= '-';
00253 answerB[pB++]= b[i];
00254 insertions++;
00255 }
00256 i=ni; j=tj;
00257 break;
00258 case GLOBAL:
00259 i = ti; j = tj;
00260 break;
00261 case LOCAL:
00262
00263
00264
00265 j = bestJ;
00266 i = bestI;
00267 break;
00268 }
00269
00270
00271 while( final ){
00272
00273
00274
00275
00276
00277
00278
00279
00280
00281
00282
00283 if( j>0 && i>0 && m[W*j+i] == m[W*(j-1)+i-1] + sc->dSub(a[j-1],b[i-1]) ){
00284
00285 lastScore = sc->dSub(a[j-1],b[i-1]);
00286 i--; j--;
00287
00288 answerA[pA++] = a[j];
00289 answerB[pB++] = b[i];
00290 indexInit= i;
00291 indexEnd = max(i,indexEnd);
00292 if( a[j]!=b[i] ){ substitutions++; }
00293 steps++;
00294 lastChar = steps;
00295
00296 } else if ( i>0 && j>0 && m[W*j+i] == m[W*j+i-1] + sc->dSkip(b[i-1]) ) {
00297 lastScore = sc->dSkip(b[i-1]);
00298 i--;
00299 answerA[pA++] = '-';
00300 answerB[pB++] = b[i];
00301 insertions++;
00302 if(steps==0 && b[i]==' ' && b[i]=='_' ){
00303 score = m[W*j+i];
00304 } else {
00305 steps++;
00306 }
00307
00308 } else if( j>1 && i>0 && m[W*j+i] == m[W*(j-2)+i-1] + sc->dExp(b[i-1],a[j-2],a[j-1]) ){
00309 lastScore = sc->dExp(b[i-1],a[j-2],a[j-1]);
00310 i--; j-=2;
00311 answerA[pA++] = a[j];
00312 answerA[pA++] = a[j+1];
00313 answerB[pB++] = b[i];
00314 answerB[pB++] = '+';
00315 indexInit = i;
00316 indexEnd = max(i,indexEnd);
00317 deletions++;
00318 if( a[j]!=b[i] ){ substitutions++; }
00319 steps+=2;
00320 lastChar = steps;
00321
00322 } else if( j>0 && i>0 && m[W*j+i] == m[W*(j-1)+i] + sc->dSkip(a[j-1]) ){
00323 lastScore = sc->dSkip(a[j-1]);
00324 j--;
00325 answerA[pA++] = a[j];
00326 answerB[pB++] = '-';
00327 indexInit = i;
00328 indexEnd = max(i,indexEnd);
00329 deletions++;
00330 steps++;
00331 lastChar = steps;
00332
00333 } else if ( i>1 && j>0 && m[W*j+i] == m[ W*(j-1) +i-2 ] + sc->dExp(a[j-1],b[i-2],b[i-1]) ) {
00334 lastScore = sc->dExp(a[j-1],b[i-2],b[i-1]);
00335 j--; i--;
00336 answerA[pA++] = a[j];
00337 answerA[pA++] = '+';
00338 answerB[pB++] = b[i];
00339 answerB[pB++] = b[i-1];
00340 indexInit = i;
00341 indexEnd = max(i,indexEnd);
00342 insertions++;
00343 if( a[j]!=b[i] ){ substitutions++; }
00344 i--;
00345 steps+=2;
00346 lastChar = steps;
00347
00348 } else if ( j==0 ){
00349 i--;
00350 answerA[pA++] = '-';
00351 answerB[pB++] = b[i];
00352 insertions++;
00353 if(steps!=0 || b[i]!=' ' || b[i]!='_' ) steps++;
00354
00355 } else if ( i==0 ){
00356 j--;
00357 answerA[pA++] = a[j];
00358 answerB[pB++] = '-';
00359 deletions++;
00360
00361 if(steps!=0 || b[i]!=' ' || b[i]!='_' ) steps++;
00362 lastChar = steps;
00363
00364 } else {
00365 cerr << "BOINK! Error at "<< j << "," << i << endl;
00366 break;
00367 }
00368
00369
00370 switch(mode){
00371 case SEMILOCAL:
00372 final = j!=0;
00373 if(!final){
00374 i--;
00375 while(i>=0){
00376 answerA[pA++] = '-';
00377 answerB[pB++] = b[i--];
00378 insertions++;
00379 }
00380 }
00381 break;
00382
00383 case GLOBAL:
00384 final = i!=0 || j!=0;
00385 break;
00386
00387 case LOCAL:
00388 final = m[W*j+i]!=0;
00389 break;
00390 }
00391 }
00392
00393
00394
00395
00396 steps = lastChar;
00397
00398 switch(mode){
00399 case GLOBAL:
00400
00401 score /= (pA-spacesA);
00402 break;
00403 case SEMILOCAL:
00404
00405
00406 score = score / steps;
00407 break;
00408 case LOCAL:
00409 score = score / steps;
00410 break;
00411 }
00412
00413
00414 char* newA = new char[pA+1];
00415 char* newB = new char[pB+1];
00416 newA[pA]=0;
00417 newB[pB]=0;
00418 --pA;
00419 --pB;
00420
00421 for(int i=0; pA>=0; ++i, --pA){
00422 newA[i] = answerA[pA];
00423 }
00424 for(int i=0; pB>=0; ++i, --pB){
00425 newB[i] = answerB[pB];
00426 }
00427
00428 delete[](answerA);
00429 delete[](answerB);
00430 delete[] m;
00431
00432
00433 for(int i=0; i<nwords; i++){
00434 if( words[i] == indexInit ){
00435 initWord = i+1;
00436 break;
00437 } else if( words[i] < indexInit ) {
00438 initWord = i+1;
00439 } else if( words[i] > indexInit ) {
00440 initWord = i;
00441 break;
00442 }
00443
00444 }
00445
00446 for(int i=0; i<nwords; i++){
00447 if( words[i] == indexEnd ){
00448 endWord = i;
00449 break;
00450 } else if( words[i] < indexEnd ) {
00451 endWord = i+1;
00452 } else if( words[i] > indexEnd ) {
00453 endWord = i;
00454 break;
00455 }
00456
00457 }
00458
00459 delete[] words;
00460
00461 return new alin(score,indexInit,indexEnd,initWord,endWord,substitutions,insertions,deletions,newA,newB);
00462
00463 }
00464
00465 };
00466
00467
00468 #endif