golem.h

Go to the documentation of this file.
00001 /*
00002  * The Golem is an automaton that recognizes, in a single pass, every substring of a fixed length of a given string.
00003  * It stores a transition matrix of width 97 and does not accept input characters outside the ASCII range 32..126.
00004  * + April 2006 - pcomas@lsi.upc.edu
00005  *
00006  *   "These are not the droids you're looking for"
00007  */
00008 
00009 #ifndef _golem_h
00010 
00011 #define _golem_h
00012 #include <string.h>
00013 #include <cstring>
00014 #include <math.h>
00015 #include <iostream>
00016 #include <stack>
00017 #include <vector>
00018 #include <set>
00019 #include <map>
00020 
#define WIDTH 97   // Width of each row of the DFA table: 2 bookkeeping columns + 95 symbol columns (see below)
#define DESP  30   // Offset subtracted from a character code to obtain its column in the DFA table (see below)

// Kept for backward compatibility: code including this header may rely on it.
using namespace std;



/*
 * golem: automaton that reports, character by character, whether the last
 * t characters read form a substring of length t of the query string.
 *
 * Usage: build it with the query, feed the text with read() one character at
 * a time (read() returns 1 whenever a substring of length t of the query has
 * just been completed) and call reset() to start over.
 *
 * All storage is held in standard containers, so copying, assigning and
 * destroying a golem is safe.
 *
 * NOTE(review): '?' is used internally as the "no transition" marker of the
 * NFA, so a query containing '?' is presumably not matched reliably - confirm
 * before relying on it.
 */
class golem{

  private:

  /* NFA */
  vector<int> nfa;    // 4 ints per NFA state: letter, target, letter, target ('?' means "no transition")
  vector<int> t0;     // Transitions shared by every NFA state, as (letter, target) pairs
  int st;             // State counter used while building the NFA
  int nst;            // Number of different states is 1 + t*(n-t+1)
  int bst;            // Number of "base" states for each substring
  set<int> nfa_final; // Set of final states for the NFA
  /* DFA */
  vector<int> dfa;    // nstates rows of WIDTH columns (layout described below)
  int nstates;        // Number of DFA states (rows of dfa)

  /* OTHERS */
  vector<char> query; // Copy of the query, n characters plus a terminating 0
  int t;              // Length of the substrings to recognize
  int debug;          // Debug traces are written to cerr when > 0
  int status;         // Current DFA state
  int n;              // Length of the query

  /* A pending DFA transition: dstate ---(letter)---> {nstate} */
  struct state {
    int dstate;          // DFA state the transition leaves from
    char letter;         // Symbol labelling the transition
    vector<int> nstate;  // Sorted set of NFA states reached by the transition
  };


  /*
   *  SHAPE OF THE DFA MATRIX:
   *
   *  Width = 97
   *  Column 0: State number
   *  Column 1: Final state flag
   *  Initial state: 0
   *  Input symbols: Characters between ASCII 32..126. The characters are shifted 30 (DESP) positions,
   *                 so that character 32 uses column 2 and character 126 uses column 96.
   *
   */

  /* True when c owns a symbol column of the DFA table (codes DESP+2 .. DESP+WIDTH-1, i.e. 32..126) */
  static bool accepts(char const c){
    const int code = (int)c;
    return code >= DESP+2 && code < DESP+WIDTH;
  }


  /* Creates a NFA automaton for recognizing substrings of length t in query */
  void createNFA(){

    // With t outside 1..n there is no substring to recognize: keep only the
    // start state, which yields an automaton that never accepts.
    bst = (t>=1 && t<=n) ? n-t+1 : 0;   // Number of "base" states for each substring
    nst = 1 + t*bst;                    // Number of different states is 1 + t*(n-t+1)
    int nt  = 2+t;                      // Only reported in the debug trace
    st = bst+1;                         // state counter
    nfa_final.clear();

    if(debug){
      cerr << "n: " << n << endl;
      cerr << "bst: " << bst << endl;
      cerr << "nst: " << nst << endl;
      cerr << "nt: " << nt << endl;
      cerr << "st: " << st << endl;
    }

    //Create common transitions for each state: reading query[i] always allows to enter base state i+1
    t0.assign(2*bst, 0);
    for(int i=0;i<bst;i++){
      t0[2*i]   = (int)query[i];
      t0[2*i+1] = i+1;
    }

    //Create every state without own transitions ('?' --> 0); state 0 stays like that
    nfa.assign(4*nst, 0);
    for(int s=0; s<nst; s++){
      nfa[4*s]   = '?';
      nfa[4*s+2] = '?';
    }

    //Fill the n-t+1 base states and the chain of states hanging from each of them
    for(int i=1; i<=bst; i++){

      if(t==1){
        // A single letter is already a whole substring
        nfa_final.insert(i);
      }else{
        // Transition to the next letter in the substring
        nfa[4*i]   = query[i];
        nfa[4*i+1] = st;

        // Intermediate letters of the substring
        for(int j=i+1; j<i+t-1; j++,st++){
          nfa[4*st]   = query[j];
          nfa[4*st+1] = st+1;
        }

        // The state reached by the last letter is final and has no own transitions
        nfa_final.insert(st);
      }
      st++;

    }

    if(debug){
      cerr << endl << "THE GOLEM" << endl
           << "==========" << endl
           <<  "The commonest:" << endl
           << "\t";
      for(int j=0;j<bst;j++){
        cerr << (char)t0[2*j] <<"-->" << t0[2*j+1] << ",  ";
      }
      cerr << endl;
      for(int i=0;i<nst;i++){
        cerr << "state[" << i <<"]" << endl << "\t";
        for(int j=0;j<2;j++){
          cerr << (char)nfa[4*i+j*2] <<"-->" << nfa[4*i+j*2+1] << ",  ";
        }
        cerr << endl;
      }
    }

  }


  /* Queues the transition from ---(letter)---> targets to be resolved later by createDFA */
  void pushTransition(stack<state>& pending, int from, char letter, const set<int>& targets){
    state e2;
    e2.dstate = from;
    e2.letter = letter;
    e2.nstate.assign(targets.begin(), targets.end());   // a set iterates in increasing order
    pending.push(e2);

    if(debug){
      cerr << "  + Push nou estat " << e2.dstate << "---(" << e2.letter << ")--->[";
      for(size_t k=0; k<e2.nstate.size(); k++) cerr << e2.nstate[k] << ",";
      cerr << "]" << endl;
    }
  }


  /* Determinizes the former NFA automaton (subset construction) and fills the dfa table */
  void createDFA(){

    /*
     * nfa  is the NFA to determinize, each state in nfa has two transitions
     * t0   is the common set of transitions to all states in nfa
     */

    stack<state> pending;                  // Transitions whose target set is still to be numbered
    map<char,set<int> > d0;                // The set of transitions from the t0 state in nfa
    map<vector<int>, int> known;           // Set of nfa states -> number of the dfa state standing for it
    set<int> dfa_final;                    // holds the set of final states for the dfa
    map<int, map<char,int> > transitions;  // dfa state -> letter -> dfa state

    // Create d0. Each char points to a set of states
    for(int i=0;i<bst;i++){ d0[(char)t0[2*i]].insert(t0[2*i+1]); }

    // Start from nfa state 0; the '?' transition is only a placeholder
    state first;
    first.dstate = 0;
    first.letter = '?';
    first.nstate.push_back(0);
    pending.push(first);

    while(!pending.empty()){

      const state e = pending.top();
      pending.pop();
      const int len = (int)e.nstate.size();

      if(debug){
        cerr << endl << "POP ESTAT " << e.dstate
             << "---(" << e.letter << ")-->[";
        for(int i=0;i<len;i++){ cerr << e.nstate[i] << ",";}
        cerr << endl;
      }

   /*
    * 1 Search for e.nstate in the current set of states
    * 2 If e.nstate exists, it already has a number: only the transition is recorded
    */
      map<vector<int>, int>::iterator hit = known.find(e.nstate);
      if( hit != known.end() ){
        if(debug){
          cerr << " - Found that it is state " << hit->second << endl;
          cerr << " + ADD TRANSITION " << e.dstate
               << "---(" << e.letter << ")-->" << hit->second << endl;
        }
        transitions[e.dstate][e.letter] = hit->second;
        continue;
      }

   /*
    * 3 If e.nstate does not exist, create it with the next free number
    */
      const int cur = (int)known.size();
      known[e.nstate] = cur;

      for(int i=0 ; i<len ; i++ ){
        if(nfa_final.find(e.nstate[i])!=nfa_final.end() ) dfa_final.insert(cur);
      }
      transitions[e.dstate][e.letter] = cur;

      if(debug){
        cerr << " + CREATED DFA STATE " << cur << " = [";
        for(int i=0 ; i<len ; i++ ) cerr << e.nstate[i] << ",";
        cerr << "]" << endl;
        cerr << " + ADD TRANSITION " << e.dstate
             << "---(" << e.letter << ")-->" << cur << endl;
      }

   /*
    * 4 For each letter in transitions(e) U d0 push new state cur---(letter)-->nstate
    */

      // Own transitions of the nfa states in e, grouped by letter
      map<char,set<int> > tr_tmp;
      for(int i=0; i<len; i++){
        const int s = e.nstate[i];
        if((char)nfa[4*s] != '?' )
          tr_tmp[ (char)nfa[4*s] ].insert( nfa[4*s+1] );
      }

      // 4A  Firstly add all the transitions from d0
      for(map<char, set<int> >::iterator it=d0.begin(); it!=d0.end(); ++it){

        if(debug) cerr << " + Estat " << e.dstate
                       << ", tractant amb el carącter " << (*it).first << endl;

        // States reached through d0 plus those reached from e.nstate with the same letter
        set<int> reached( (*it).second );
        for(int i=0; i<len; i++){
          const int s = e.nstate[i];
          if( nfa[4*s] == (int)(*it).first ){
            reached.insert(nfa[4*s+1]);
          }
        }

        pushTransition(pending, cur, (*it).first, reached);
        tr_tmp.erase((*it).first);   // already handled together with d0

      }

      // 4B  Secondly add the transitions from e.nstate whose letter is not in d0
      for(map<char, set<int> >::iterator it=tr_tmp.begin(); it!=tr_tmp.end(); ++it){
        pushTransition(pending, cur, (*it).first, (*it).second);
      }

    } // end while(!pending.empty())

    // Dump the transitions into the table. Missing entries stay 0 (back to the initial state).
    nstates = (int)known.size();
    dfa.assign(WIDTH*nstates, 0);

    for(int s=0; s<nstates; s++) dfa[s*WIDTH] = s;
    for(set<int>::iterator f=dfa_final.begin(); f!=dfa_final.end(); ++f) dfa[(*f)*WIDTH+1] = 1;

    for( map<int,map<char,int> >::iterator it = transitions.begin(); it!=transitions.end(); ++it){
      for( map<char,int>::iterator it2 = (*it).second.begin(); it2!=(*it).second.end(); ++it2){
        // Letters without a column (outside 32..126) would land on the bookkeeping
        // columns or outside the row; read() never follows them, so they are dropped.
        if( !accepts((*it2).first) ) continue;
        dfa[ (*it).first*WIDTH + (int)(*it2).first - DESP] = (*it2).second;
      }
    }

  } // end createDFA



public:

  /*
   * Builds the automaton that recognizes the substrings of length _t of _query.
   *   _query  : the string to take the substrings from
   *   _length : number of characters of _query to use (negative values are taken as 0)
   *   _t      : length of the substrings; outside 1.._length the automaton never accepts
   *   _debug  : values > 0 enable debug traces on cerr
   */
  golem(const char* const _query, const int _length, const int _t, int _debug){
    debug= _debug>0 ? _debug : 0;
    t = _t;
    n = _length>0 ? _length : 0;
    query.assign(n+1, 0);                                // keeps the terminating 0
    if(n>0 && _query) strncpy(&query[0],_query,n);
    createNFA();
    createDFA();
    status = 0;
  }

  /* Goes back to the initial state, forgetting every character read so far */
  void reset(){
    status = 0;
  }

  /*
   * Feeds one character to the automaton.
   * Returns 1 when the characters read so far end with a substring of length t
   * of the query, 0 otherwise. Characters outside 32..126 are ignored: they
   * return 0 and leave the current state untouched.
   */
  int read(char const c){
    if(!accepts(c)) return 0;
    //cerr << status;
    status = dfa[ status*WIDTH + (int)c-DESP];
    //cerr <<"->"<<c<< "->"<<status<<" ("<<dfa[status*WIDTH+1]<<")"<<endl;
    return dfa[status*WIDTH+1];
  }

  /* Writes the whole DFA table to cerr; final states are marked with '+' */
  void show(){
    cerr << endl << "AQUEST ES L'AUTOMAT DFA: " << endl;
    for( int i=0; i<nstates; i++){
      cerr << dfa[i*WIDTH];
      if( dfa[i*WIDTH+1] == 1 ) cerr << "+";
      cerr << "\t";
      for(int j=2; j<WIDTH; j++){
        cerr << (char)(j+DESP) << "-" << dfa[i*WIDTH+j] << " ";
      }
      cerr << endl;
    }
  }

};
00421 
00422 #endif
00423 
Generated on Tue Jul 27 16:29:25 2010 for FreeLing by  doxygen 1.6.3