golem.h
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifndef _golem_h
00010
00011 #define _golem_h
00012 #include <string.h>
00013 #include <cstring>
00014 #include <math.h>
00015 #include <iostream>
00016 #include <stack>
00017 #include <vector>
00018 #include <set>
00019 #include <map>
00020
#define WIDTH 97 // Width of one DFA table row: 2 bookkeeping columns + one column per character code 32..126
#define DESP 30  // Number of shifted positions of each character in the DFA table (column = character code - DESP)

// Kept for backward compatibility: code including this header may rely on it.
using namespace std;

/**
 * Matcher that recognises every substring of length t (a "window") of a
 * query string while scanning a text one character at a time.
 *
 * Construction builds an NFA with one chain of states per window and then
 * determinises it (subset construction) into a flat table of WIDTH columns
 * per DFA state:
 *   - column 0 holds the state id,
 *   - column 1 holds 1 when the state is accepting, 0 otherwise,
 *   - column (c - DESP) holds the state reached on character c.
 * A table entry of 0 means "go back to the initial state".
 *
 * Only character codes 32..126 have a column. Other characters are ignored
 * by read(), so a window containing such a character can never be matched.
 * NOTE: '?' is used internally as the "no specific symbol" marker of the NFA;
 * queries containing a literal '?' are handled as the original code did, but
 * that case has not been reviewed in depth.
 */
class golem {

private:

    enum {
        kWildcard = '?',    // symbol of an NFA state without a specific outgoing transition
        kHeaderColumns = 2  // leading bookkeeping columns of each DFA row
    };

    /** One NFA state: reading `symbol` leads to `target`. */
    struct NfaState {
        int symbol;
        int target;
        NfaState() : symbol(kWildcard), target(0) {}
        NfaState(int sym, int tgt) : symbol(sym), target(tgt) {}
    };

    /**
     * Work item of the subset construction: from DFA state `source`,
     * reading `letter`, the NFA ends up in the states listed in `members`
     * (ascending order, no duplicates).
     */
    struct Pending {
        int source;
        char letter;
        std::vector<int> members;
    };

    // ---- NFA -------------------------------------------------------------
    std::vector<NfaState> nfa;   // all NFA states; state 0 is the initial one
    std::vector<int> t0;         // (symbol, target) pairs leaving NFA state 0
    int nst;                     // number of NFA states
    int bst;                     // number of windows (n - t + 1, or 0 if none)
    std::set<int> nfa_final;     // accepting NFA states

    // ---- DFA -------------------------------------------------------------
    std::vector<int> dfa;        // nstates rows of WIDTH columns
    int nstates;                 // number of DFA states

    // ---- configuration and run-time state --------------------------------
    std::vector<char> query;     // NUL-terminated copy of the query
    int t;                       // window length
    int debug;                   // non-zero: trace the construction on cerr
    int status;                  // current DFA state
    int n;                       // query length

    /** True when `column` addresses a character column of a DFA row. */
    static bool isCharacterColumn(const int column) {
        return column >= kHeaderColumns && column < WIDTH;
    }

    /**
     * Builds the NFA: state 0 plus, for every window, a chain of t states
     * whose last one is accepting.
     *
     * When the window length is invalid (t < 1 or t > n) no window exists
     * and the NFA consists of state 0 only, so nothing is ever accepted.
     */
    void createNFA() {

        bst = (t >= 1 && n >= t) ? n - t + 1 : 0;
        nst = 1 + t * bst;
        const int nt = 2 + t;
        int st = bst + 1;   // next free state id after the "first character" states 1..bst

        if (debug) {
            std::cerr << "n: " << n << std::endl;
            std::cerr << "bst: " << bst << std::endl;
            std::cerr << "nst: " << nst << std::endl;
            std::cerr << "nt: " << nt << std::endl;
            std::cerr << "st: " << st << std::endl;
        }

        // Transitions of state 0: the first character of window i leads to state i+1.
        t0.assign(2 * bst, 0);
        for (int i = 0; i < bst; ++i) {
            t0[2 * i] = static_cast<int>(query[i]);
            t0[2 * i + 1] = i + 1;
        }

        // Every state starts as "wildcard --> 0"; only the chain links are overwritten.
        nfa.assign(nst, NfaState());
        nfa_final.clear();

        for (int i = 1; i <= bst; ++i) {

            if (t == 1) {
                // A single character already completes the window.
                nfa_final.insert(i);
            } else {
                nfa[i] = NfaState(query[i], st);
            }

            if (st < nst) {
                // Remaining characters of the window, then its accepting state.
                for (int j = i + 1; j < i + t - 1; ++j, ++st) {
                    nfa[st] = NfaState(query[j], st + 1);
                }
                nfa_final.insert(st);
            }
            ++st;
        }

        if (debug) {
            std::cerr << std::endl << "THE GOLEM" << std::endl
                      << "==========" << std::endl
                      << "The commonest:" << std::endl
                      << "\t";
            for (int j = 0; j < bst; ++j) {
                std::cerr << static_cast<char>(t0[2 * j]) << "-->" << t0[2 * j + 1] << ", ";
            }
            std::cerr << std::endl;
            for (int i = 0; i < nst; ++i) {
                std::cerr << "state[" << i << "]" << std::endl << "\t";
                std::cerr << static_cast<char>(nfa[i].symbol) << "-->" << nfa[i].target << ", ";
                // Second (fallback) transition of every state.
                std::cerr << static_cast<char>(kWildcard) << "-->" << 0 << ", ";
                std::cerr << std::endl;
            }
        }
    }

    /**
     * Stores the DFA transition source --letter--> target.
     * Letters without a column in the table are skipped: read() never
     * follows them, and writing them would corrupt the table.
     */
    void addTransition(const int source, const char letter, const int target) {
        if (debug) {
            std::cerr << " + ADD TRANSITION " << source
                      << "---(" << letter << ")-->" << target << std::endl;
        }
        const int column = static_cast<int>(letter) - DESP;
        if (isCharacterColumn(column)) {
            dfa[source * WIDTH + column] = target;
        }
    }

    /** Queues the NFA subset `reached` as the destination of source --letter-->. */
    void pushPending(std::stack<Pending>& pending, const int source,
                     const char letter, const std::set<int>& reached) {
        Pending item;
        item.source = source;
        item.letter = letter;
        item.members.assign(reached.begin(), reached.end());
        pending.push(item);

        if (debug) {
            std::cerr << " + Push nou estat " << item.source << "---(" << item.letter << ")--->[";
            for (std::size_t k = 0; k < item.members.size(); ++k) {
                std::cerr << item.members[k] << ",";
            }
            std::cerr << "]" << std::endl;
        }
    }

    /**
     * Determinises the NFA into the `dfa` table (subset construction).
     *
     * NFA state 0 is implicitly active at every step: for each symbol leaving
     * state 0 the destination subset is "targets from state 0" plus the targets
     * of the current members on that symbol. Symbols that only continue a
     * window (never start one) lead to the members' targets alone. Any other
     * symbol keeps the default table entry 0, i.e. returns to the initial state.
     */
    void createDFA() {

        // Symbols leaving NFA state 0 and the states they reach.
        std::map<char, std::set<int> > d0;
        for (int i = 0; i < bst; ++i) {
            d0[static_cast<char>(t0[2 * i])].insert(t0[2 * i + 1]);
        }

        std::map<std::vector<int>, int> known;   // NFA subset -> DFA state id
        std::stack<Pending> pending;
        int created = 0;

        dfa.clear();

        // The initial DFA state is the subset {0}.
        Pending start;
        start.source = 0;
        start.letter = static_cast<char>(kWildcard);
        start.members.assign(1, 0);
        pending.push(start);

        while (!pending.empty()) {

            const Pending e = pending.top();
            pending.pop();

            if (debug) {
                std::cerr << std::endl << "POP ESTAT " << e.source
                          << "---(" << e.letter << ")-->[";
                for (std::size_t k = 0; k < e.members.size(); ++k) {
                    std::cerr << e.members[k] << ",";
                }
                std::cerr << std::endl;
            }

            // Subset already has a DFA state: only the transition is new.
            const std::map<std::vector<int>, int>::const_iterator hit = known.find(e.members);
            if (hit != known.end()) {
                if (debug) {
                    std::cerr << " - Found that it is state " << hit->second << std::endl;
                }
                addTransition(e.source, e.letter, hit->second);
                continue;
            }

            // New DFA state: append a zeroed row and fill its bookkeeping columns.
            const int id = created++;
            known[e.members] = id;
            dfa.resize(dfa.size() + WIDTH, 0);
            dfa[id * WIDTH] = id;
            for (std::size_t k = 0; k < e.members.size(); ++k) {
                if (nfa_final.find(e.members[k]) != nfa_final.end()) {
                    dfa[id * WIDTH + 1] = 1;
                }
            }

            if (debug) {
                std::cerr << " + CREATED DFA STATE " << id << " = [";
                for (std::size_t k = 0; k < e.members.size(); ++k) {
                    std::cerr << e.members[k] << ",";
                }
                std::cerr << "]" << std::endl;
            }
            addTransition(e.source, e.letter, id);

            // Transitions offered by the members themselves, grouped by symbol.
            std::map<char, std::set<int> > own;
            for (std::size_t k = 0; k < e.members.size(); ++k) {
                const NfaState& s = nfa[e.members[k]];
                if (static_cast<char>(s.symbol) != static_cast<char>(kWildcard)) {
                    own[static_cast<char>(s.symbol)].insert(s.target);
                }
            }

            // Symbols that start a window: targets from state 0 plus the members' targets.
            for (std::map<char, std::set<int> >::const_iterator it = d0.begin(); it != d0.end(); ++it) {

                if (debug) std::cerr << " + Estat " << e.source
                                     << ", tractant amb el carącter " << it->first << std::endl;

                std::set<int> reached(it->second);
                for (std::size_t k = 0; k < e.members.size(); ++k) {
                    const NfaState& s = nfa[e.members[k]];
                    if (s.symbol == static_cast<int>(it->first)) {
                        reached.insert(s.target);
                    }
                }
                pushPending(pending, id, it->first, reached);
                own.erase(it->first);
            }

            // Symbols that only continue a window already in progress.
            for (std::map<char, std::set<int> >::const_iterator it = own.begin(); it != own.end(); ++it) {
                pushPending(pending, id, it->first, it->second);
            }
        }

        nstates = created;
    }

public:

    /**
     * Builds the automaton for all windows of length _t of the query.
     *
     * @param _query  query text; at most _length characters are copied
     *                (copying stops early at a NUL, as strncpy does).
     * @param _length number of characters of _query to use.
     * @param _t      window length. If _t < 1 or _t > _length the automaton
     *                never accepts.
     * @param _debug  values > 0 enable tracing of the construction on cerr.
     *
     * A null _query or a negative _length is treated as an empty query.
     */
    golem(const char* const _query, const int _length, const int _t, int _debug)
        : nst(0), bst(0), nstates(0),
          t(_t), debug(_debug > 0 ? _debug : 0), status(0),
          n((_query != NULL && _length > 0) ? _length : 0)
    {
        query.assign(static_cast<std::size_t>(n) + 1, '\0');
        if (n > 0) {
            std::strncpy(&query[0], _query, static_cast<std::size_t>(n));
        }
        createNFA();
        createDFA();
    }

    /** Returns the automaton to its initial state. */
    void reset() {
        status = 0;
    }

    /**
     * Feeds one character to the automaton.
     *
     * @return 1 when the characters read so far end with a window of the
     *         query, 0 otherwise.
     *
     * Characters without a column in the table (codes outside 32..126) are
     * ignored: they return 0 and leave the current state untouched.
     */
    int read(char const c) {
        const int column = static_cast<int>(c) - DESP;
        if (!isCharacterColumn(column)) return 0;

        status = dfa[status * WIDTH + column];

        return dfa[status * WIDTH + 1];
    }

    /** Dumps the DFA table on cerr, one state per line ('+' marks accepting states). */
    void show() const {
        std::cerr << std::endl << "AQUEST ES L'AUTOMAT DFA: " << std::endl;
        for (int i = 0; i < nstates; ++i) {
            std::cerr << dfa[i * WIDTH];
            if (dfa[i * WIDTH + 1] == 1) std::cerr << "+";
            std::cerr << "\t";
            for (int j = kHeaderColumns; j < WIDTH; ++j) {
                std::cerr << static_cast<char>(j + DESP) << "-" << dfa[i * WIDTH + j] << " ";
            }
            std::cerr << std::endl;
        }
    }

    // No destructor: every resource is owned by a standard container, which
    // also makes copying and assigning a golem safe.

};
00421
00422 #endif
00423