phd.h

Go to the documentation of this file.
00001 /*
00002  * Phonetic Distance Scorer for the PHAST package
00003  * + See the features.ALL file for input description
00004  * + April 2006 pcomas@lsi.upc.edu
00005  *
00006  */
00007 #ifndef _phd_h
00008 #define _phd_h
00009 
00010 #include <cstdlib>
00011 #include <iostream>
00012 #include <sstream>
00013 #include <fstream>
00014 #include <map>
00015 #include <set>
00016 #include <string>
00017 #include <math.h>
00018 using namespace std;
00019 
00020 #define MAX 128
00021 
/**
 * Phonetic distance scorer.
 *
 * The constructor parses a feature-description file and pre-computes a
 * symmetric table of distances between phonemes (each phoneme is a single
 * char, used as an index into the table). dSkip(), dSub() and dExp() then
 * combine those distances with the constants read from the same file.
 *
 * Directives recognised in the input file (one per line; tokens starting
 * with '#' introduce a comment that runs to the end of the line):
 *   FEATURES:   <name> ...           column order of the features in FON: lines
 *   VALUE:      <name> <number>      numeric value of a multi-valued feature
 *   WEIGHT:     <feature> <number>   salience of a feature
 *   CONSTANT:   <Cskip|Csub|Cexp|Cvowel|Cspace> <number>
 *   VOWELS:     <char> ...           phonemes treated as vowels
 *   CONSONANTS: <char> ...           phonemes treated as consonants
 *   FVOWELS:    <feature> ...        features compared for vowel/vowel pairs
 *   FOTHER:     <feature> ...        features compared for every other pair
 *   FON:        <char> <+|-|value-name> ...   feature vector of one phoneme
 * VALUE: entries must appear before the FON: lines that use them, because
 * value names are resolved while the FON: line is read.
 *
 * @tparam T numeric type used for constants and distances.
 */
template <typename T=int> class phd{

private:
  // Side length of the lookup tables. Uses the file-level MAX macro when it
  // is defined and falls back to the same value (128) otherwise.
#ifdef MAX
  enum { kTableSize = MAX };
#else
  enum { kTableSize = 128 };
#endif

  typedef std::map<std::string, int> IndexMap;  // feature name -> column
  typedef std::map<std::string, T>   ValueMap;  // name -> numeric value
  typedef std::set<std::string>      NameSet;
  typedef std::set<char>             PhonemeSet;

  T csub, cexp, cvowel, cskip, cspace;
  T distance[kTableSize][kTableSize];
  PhonemeSet svowels;      // set of vowel phonemes
  PhonemeSet sconsonants;  // set of consonant phonemes
  int debug;

  /// Distance assigned to pairs of distinct phonemes nothing is known about.
  static T unrelated(){ return static_cast<T>(8000); }

  /// True when `code` is a valid row/column of the tables.
  static bool inTable(int code){ return code >= 0 && code < kTableSize; }

  /// Table index of a phoneme; goes through unsigned char so it is never negative.
  static int code(char c){ return static_cast<unsigned char>(c); }

  /// |a - b| without relying on an abs() overload existing for T.
  static T absDiff(const T& a, const T& b){ return a > b ? a - b : b - a; }

  static T larger(const T& a, const T& b){ return a > b ? a : b; }

  /// Vowel penalty: cvowel when `a` was listed under VOWELS:, 0 otherwise.
  T V(char a) const { return svowels.find(a) != svowels.end() ? cvowel : T(0); }

  /// Bounds-checked table lookup; codes outside the table behave like
  /// phonemes that were never described (0 to itself, unrelated() otherwise).
  T dist(int a, int b) const {
    if( inTable(a) && inTable(b) ) return distance[a][b];
    return a==b ? T(0) : unrelated();
  }

  /// Weighted sum of absolute feature differences between two feature rows,
  /// restricted to the features in `names`. Names that have no column or no
  /// weight contribute nothing.
  static T featureDistance(const T* rowA, const T* rowB, const NameSet& names,
                           const IndexMap& flist, const ValueMap& fweight){
    T total = T(0);
    for( NameSet::const_iterator n = names.begin(); n != names.end(); ++n ){
      IndexMap::const_iterator col = flist.find(*n);
      typename ValueMap::const_iterator w = fweight.find(*n);
      if( col == flist.end() || w == fweight.end() ) continue;
      total += absDiff(rowA[col->second], rowB[col->second]) * w->second;
    }
    return total;
  }

  /// Stores the feature distance between phonemes p and q in both halves of
  /// the table. Phonemes whose code falls outside the table are skipped.
  void link(const T (*features)[kTableSize], char p, char q, const NameSet& names,
            const IndexMap& flist, const ValueMap& fweight){
    const int a = code(p);
    const int b = code(q);
    if( !inTable(a) || !inTable(b) ) return;
    const T d = featureDistance(features[a], features[b], names, flist, fweight);
    distance[a][b] = d;
    distance[b][a] = d;
  }

  /// Reads the rest of the current line of `is` and splits it into chars.
  static void readPhonemes(std::istream& is, PhonemeSet& out){
    std::string rest;
    std::getline(is, rest);
    std::istringstream ss(rest);
    char c;
    while( ss >> c ){ out.insert(c); }
  }

  /// Reads the rest of the current line of `is` and splits it into words.
  static void readNames(std::istream& is, NameSet& out){
    std::string rest;
    std::getline(is, rest);
    std::istringstream ss(rest);
    std::string word;
    while( ss >> word ){ out.insert(word); }
  }

public:

  /**
   * Builds the scorer from the feature file `fname`.
   * If the file cannot be opened, a message is written to cerr and the
   * process exits with status -1. A malformed entry stops the parse with a
   * warning; everything read up to that point is kept.
   */
  phd(const std::string& fname){

    debug = 0;
    csub = T(0);
    cskip = T(0);
    cexp = T(0);
    cvowel = T(0);
    cspace = T(0);   // stays 0 when the file has no "CONSTANT: Cspace"

    // Default table: a phoneme is at distance 0 from itself and "unrelated"
    // to everything else until the file says otherwise.
    for( int i=0; i<kTableSize; ++i ){
      for( int j=0; j<kTableSize; ++j ){
        distance[i][j] = (i==j) ? T(0) : unrelated();
      }
    }

    /**************************************************************
     *
     * READ INPUT FILES, BUILD MATRIX OF FEATURES
     *
     **************************************************************/

    // ifstream reports failure through its state, not through exceptions,
    // so the stream has to be tested explicitly.
    std::ifstream is(fname.c_str());
    if( !is ){
      std::cerr << "Some problem opening " << fname << std::endl;
      std::cerr << "Exiting..." << std::endl;
      std::exit(-1);
    }

    IndexMap flist;      // features' names with their column index
    ValueMap fweight;    // features' saliences
    ValueMap values;     // numerical values of the multivaluated features
    NameSet svfeatures;  // attributes for vowel comparison
    NameSet scfeatures;  // attributes for other comparisons
    T features[kTableSize][kTableSize] = {};  // [phoneme][feature column]

    std::string key;
    // Looping on the extraction itself ends on EOF *and* on any failed read.
    while( is >> key ){

      if( key[0] == '#' ){
        std::string ignored;
        std::getline(is, ignored);

      } else if( key=="FON:" ){
        char phoneme;
        if( !(is >> phoneme) ) break;
        std::string rest;
        std::getline(is, rest);
        const int row = code(phoneme);
        if( !inTable(row) ){
          std::cerr << "PHONEME OUT OF RANGE " << phoneme << std::endl;
          continue;
        }
        std::istringstream ss(rest);
        std::string tok;
        for( int col=0; col<kTableSize && (ss >> tok); ++col ){
          if( tok=="+" ){
            features[row][col] = 100;
          } else if( tok=="-" ){
            features[row][col] = 0;
          } else {  // multivaluated feature; unknown names count as 0
            typename ValueMap::const_iterator v = values.find(tok);
            features[row][col] = (v != values.end()) ? v->second : T(0);
          }
        }

      } else if( key=="VALUE:" ){
        std::string name;
        T t;
        if( is >> name >> t ){ values[name] = t; }

      } else if( key=="WEIGHT:" ){
        std::string name;
        T t;
        if( is >> name >> t ){ fweight[name] = t; }

      } else if( key=="CONSTANT:" ){
        std::string name;
        T t;
        if( is >> name >> t ){
          if     ( name=="Cskip"  ){ cskip  = t; }
          else if( name=="Csub"   ){ csub   = t; }
          else if( name=="Cexp"   ){ cexp   = t; }
          else if( name=="Cvowel" ){ cvowel = t; }
          else if( name=="Cspace" ){ cspace = t; }
          else{ std::cerr << "UNEXPECTED CONSTANT DEFINITION" << name << std::endl; }
        }

      } else if( key=="VOWELS:" ){
        readPhonemes(is, svowels);

      } else if( key=="CONSONANTS:" ){
        readPhonemes(is, sconsonants);

      } else if( key=="FEATURES:" ){
        // position on the line == column inside the features matrix
        std::string rest;
        std::getline(is, rest);
        std::istringstream ss(rest);
        std::string name;
        for( int col=0; col<kTableSize && (ss >> name); ++col ){ flist[name] = col; }

      } else if( key=="FVOWELS:" ){
        readNames(is, svfeatures);

      } else if( key=="FOTHER:" ){
        readNames(is, scfeatures);

      } else {
        //skip
      }

    }

    if( !is.eof() ){
      std::cerr << "Malformed entry in " << fname << ", rest of the file ignored" << std::endl;
    }
    is.close();


    /**************************************************************
     *
     * BUILD MATRIX OF DISTANCES
     *
     **************************************************************/

    typedef PhonemeSet::const_iterator PhIt;

    // vowels vs vowels (each unordered pair once; link() fills both halves)
    for( PhIt v1 = svowels.begin(); v1 != svowels.end(); ++v1 ){
      for( PhIt v2 = svowels.begin(); v2 != v1; ++v2 ){
        link(features, *v1, *v2, svfeatures, flist, fweight);
      }
    }

    // vowels vs consonants
    for( PhIt c = sconsonants.begin(); c != sconsonants.end(); ++c ){
      for( PhIt v = svowels.begin(); v != svowels.end(); ++v ){
        link(features, *v, *c, scfeatures, flist, fweight);
      }
    }

    // consonants vs consonants
    for( PhIt c1 = sconsonants.begin(); c1 != sconsonants.end(); ++c1 ){
      for( PhIt c2 = sconsonants.begin(); c2 != c1; ++c2 ){
        link(features, *c1, *c2, scfeatures, flist, fweight);
      }
    }

    // Developer aid: raw dump of the table from code 85 upwards.
    if( debug>2 ){
      std::cerr << "\t";
      for( int i=85; i<kTableSize; i++ ){
        std::cerr << (char)i << "\t";
      }
      std::cerr << std::endl;

      for( int i=85; i<kTableSize; i++ ){
        std::cerr << (char)i << "\t";
        for( int j=85; j<kTableSize; j++ ){
          std::cerr << distance[i][j] << "\t";
        }
        std::cerr << std::endl;
      }
    }

  } //constructor


  /**
   * Writes the constants and three distance tables to `o`:
   * vowel/vowel (lower triangle), consonant/vowel, and
   * consonant/consonant (lower triangle).
   */
  void show(std::ostream &o) const {

    typedef PhonemeSet::const_iterator PhIt;

    o << "Distances between phonemes" << std::endl << "==========================" << std::endl << std::endl;

    o << "Read values: cskip:" << cskip << ", csub:" << csub << ", cexp:" << cexp << ", cvowel:" << cvowel << std::endl;


    o << "\t";
    for( PhIt v = svowels.begin(); v != svowels.end(); ++v ) o << (*v) << "\t";
    o << std::endl;

    for( PhIt v1 = svowels.begin(); v1 != svowels.end(); ++v1 ){
      o << (*v1) << "\t";
      for( PhIt v2 = svowels.begin(); v2 != v1; ++v2 ){
        o << dist(code(*v1), code(*v2)) << "\t";
      }
      o << std::endl;
    }

    o << std::endl << "\t";
    for( PhIt v = svowels.begin(); v != svowels.end(); ++v ) o << (*v) << "\t";
    o << std::endl;

    // vowels vs consonants
    for( PhIt c = sconsonants.begin(); c != sconsonants.end(); ++c ){
      o << (*c) << "\t";
      for( PhIt v = svowels.begin(); v != svowels.end(); ++v ){
        o << dist(code(*v), code(*c)) << "\t";
      }
      o << std::endl;
    }

    o << std::endl << "\t";
    for( PhIt c = sconsonants.begin(); c != sconsonants.end(); ++c ) o << (*c) << "\t";
    o << std::endl;

    // consonants vs consonants
    for( PhIt c1 = sconsonants.begin(); c1 != sconsonants.end(); ++c1 ){
      o << (*c1) << "\t";
      for( PhIt c2 = sconsonants.begin(); c2 != c1; ++c2 ){
        o << dist(code(*c1), code(*c2)) << "\t";
      }
      o << std::endl;
    }
  }


  /// Value of the Cskip constant read from the file (0 if absent).
  T getCskip() const {
    return cskip;
  }

  /// Score for skipping symbol `c`: Cskip, plus Cspace when `c` is a blank
  /// (' ' or '_').
  T dSkip(int c) const {
    return c==' ' || c=='_' ? cskip+cspace : cskip;
  }

  /// Score for substituting `a` by `b`.
  /// Blank vs blank gives Cspace; blank vs non-blank gives -Cspace/2;
  /// otherwise Csub minus the phoneme distance and the vowel penalties.
  T dSub(int const a, int const b) const {
    const char ca = static_cast<char>(a);
    const char cb = static_cast<char>(b);
    const bool blankA = ( ca==' ' || ca=='_' );
    const bool blankB = ( cb==' ' || cb=='_' );
    if( blankA && blankB ){ return cspace; }
    if( blankA || blankB ){ return -cspace/2; }
    return csub - dist(a,b) - V(ca) - V(cb);
  }

  /// Score for expanding `a` into the pair `b`,`c`: Cexp minus both
  /// distances from `a`, the vowel penalty of `a` and the larger vowel
  /// penalty of `b` and `c`.
  T dExp(int const a, int const b, int const c) const {
    return cexp - dist(a,b) - dist(a,c) - V(static_cast<char>(a))
                - larger(V(static_cast<char>(b)), V(static_cast<char>(c)));
  }

};
00311 
00312 
00313 
00314 #endif
Generated on Tue Jul 27 16:29:25 2010 for FreeLing by  doxygen 1.6.3