• [Compiling Principles] LEX基本功能的实现


    学习编译原理的时候做的一个小型LEX,及词法分析产生器,可以根据输入的正则表达式生成自动机,从中识别出代码中的token串,存入一个二维表,也就是SymbolTable。

    主体代码如下:

    代码
      1 #include<iostream>
      2 #include "stdio.h"
      3 using namespace std;
      4 #include<String>
      5 #include<Stack>
      6 #include<set>
      7 #include<queue>
      8 #include<fstream>
      9 #include<iomanip>
     10 struct path
     11 {
     12     int from;                  //一个path代表从一个状态到另一个状态的路径
     13     int to;
     14     string strPath;            //strPath代表这个结构体处理的路径
     15 };
     16 struct dealStruct
     17 {
     18     int stCount;
     19     struct path* resPtr; 
     20 };
     21 struct setNode
     22 {
     23     set<int> *Dset;
     24     int num;
     25     struct setNode * next;   
     26 };
     27 bool judgeLD( char v )                      //判断正则表达式中是否含有非法字符
     28 {
     29     if(  ( int(v) > 47 &&  int(v) <= 57 ) || 
     30          ( int(v) >= 65 &&  int(v) <= 90 )|| 
     31          ( int(v) >= 97 &&  int(v) <= 122 ) ||
     32            v == '>' || v == '<' || v == '=' ||
     33            v == '`' || v == '+' || v == '-' )
     34         return true;
     35     else
     36         return false;
     37 }
     38 struct dealStruct * dealWithStr( stringint ); //通过正则表达式生成NFA
     39 int jihe( int, stack<struct path*> & );         //处理正则表达式时'*’的运算函数
     40 int lianjie( int, stack<struct path*> & );      //处理正则表达式时'&’(自定义运算符ab相当于a&b)的运算函数 
     41 int huo( int, stack<struct path*> & );;         //处理正则表达式时'|’的运算函数 
     42 int reduceNFA( intint*int );                //NFA->DFA函数 
     43 void BFS( intint );                           //计算一个状态的ξ闭包 
     44 void produceC( intstring * );                 //生成词法分析器的.cpp文件 
     45 int arrForBFS[100= {0};                       //存放BFS函数生成的闭包的数组 
     46 string T[2000][2000];                           //NFA的数组表示 
     47 string recT[2000][2000];                        //DFA的数组表示
     48 int stateCountDFA;                              //DFA状态数量
     49 int reduceState[2][2000];                       //化简DFA用的数组
     50 set<char> tv;                                   //所有终结符
     51 set<char>::iterator Vt;
     52 void reduceDFA();                               //化简DFA
     53 void combine(int,int);                          //化简DFA时合并两个状态
     54 int regCount;                                   //输入正则表达式的个数
     55 
     56 int main()
     57 {
     58     regCount = 0;
     59     int i,j,k;
     60     int temp = 0;
     61     
     62     string * regArr;
     63     string * typeName;
     64     int * typeIndex;
     65     int **resultEnterExt;
     66 
     67 
     68     ifstream inRegular;
     69     inRegular.open("RegularExpressions.txt");
     70     inRegular>>regCount;                          //正则表达式个数
     71     typeName = new string[regCount];
     72     typeIndex = new int[regCount];
     73     resultEnterExt = new int*[regCount];      //定义一个二位动态数组用于存放每个正则表达式所生成的NFA的起始状态和终结状态 
     74     for( i = 0;i < regCount;i++ )
     75         resultEnterExt[i] = new int[2];
     76     regArr = new string[regCount];
     77     for( i = 0;i < regCount;i++ )
     78     {
     79         inRegular>>typeName[i];
     80         inRegular>>regArr[i];
     81     }                                           //读入正则表达式
     82     
     83     int stateCount = -1
     84     for( i = 0;i < regCount;i++ )                 //遍历每个正则表达式
     85     {
     86         for( j = 0;j < regArr[i].length()-1;j++ )
     87            if(   (  judgeLD(regArr[i].at(j) ) && (  judgeLD(regArr[i].at(j+1)) || regArr[i].at(j+1== '(' )  ) ||
     88                  (  regArr[i].at(j) == ')'  &&   ( regArr[i].at(j+1== '(' || judgeLD(regArr[i].at(j+1)) )   ) ||
     89                  (  regArr[i].at(j) == '*'  &&    ( regArr[i].at(j+1== '(' || judgeLD(regArr[i].at(j+1)) )  )   )
     90               regArr[i].insert(j+1,"&");           //预处理正则表达式,添加自定义运算符'&' 
     91 
     92                  
     93         regArr[i].insert(regArr[i].length(),"#%");  //在末尾添加结束符 
     94         cout<<regArr[i]<<endl;                      //输出处理后的正则表达式 
     95         
     96         
     97         j = 0;
     98         struct path *= NULL;
     99         struct dealStruct * result = dealWithStr( regArr[i],stateCount );  //对于每个正则表达式调用NFA生成函数 
    100 
    101         stateCount = result->stCount;
    102         resultEnterExt[i][0= result->resPtr->from;    //记录该正则表达式的起始状态和终结状态 
    103         resultEnterExt[i][1= result->resPtr->to;
    104         typeIndex[i] = resultEnterExt[i][1];
    105 /*        cout<<"The number of states now is "<<stateCount<<endl;
    106         cout<<"The regular perform dealt with just now is "<<result->resPtr->strPath<<endl;
    107         cout<<"The from dealt with just now is "<<resultEnterExt[i][0]<<endl;
    108         cout<<"The to dealt with just now is "<<resultEnterExt[i][1]<<endl;*/
    109     }
    110     
    111     stateCount++;                                  //汇总所有正则表达式的NFA 
    112     cout<<"NFA合并完毕!"<<endl;
    113                                   
    114     for( i = 0;i < regCount;i++ )
    115          T[stateCount][resultEnterExt[i][0]] = '`';      //从公共起始状态出发 
    116 
    117   /*  for( i = 0;i <= stateCount;i++ )       //输出产生的NFA以检验 
    118     {
    119         for( j = 0;j <= stateCount;j++ )
    120         {
    121              if( T[i][j] == "" ) cout<<" |";
    122              else cout<<T[i][j]<<'|';
    123         }
    124         cout<<endl;
    125     }*/
    126 
    127     stateCountDFA = reduceNFA( stateCount, typeIndex, regCount ); //调用NFA->DFA函数生成DFA 
    128     reduceDFA();
    129     
    130     produceC( stateCountDFA, typeName );                           //产生.cpp代码
    131 
    132     system("pause");
    133     
    134     return 0;
    135 }
    136 
    137 /*以下为NFA的生成函数,采用三个栈实现Thompson算法,第一个栈用于读入字符和运算符
    138  第二个栈用于逆序,第三个栈专门用于处理一对括号内部的表达式,其实用两个栈就可以
    139  实现,及把第三个栈舍弃,直接把括号的计算放在第一个栈中进行也可。为了计算更加
    140  清楚不易出错故而多加了一个栈*
    141  算法思路:读入正则表达式放入栈A中,*,&,|的优先级分别定为3,2,1,而表达式末
    142  尾的'#'优先级最低,看做0。用一个三元数组存放当各个运算符的个数,字符均放入栈A中,
    143  读到运算符时,当栈外的运算符的优先级大于等于站内所有运算符时,该运算符进栈,
    144  如果低于所有站内运算符,计算站内所有运算符表的运算,直到栈内没有比栈外更高优先
    145  级的运算符。
    146  读到(的时候,(进栈,之后不论读到什么均进栈直到读到第一个),将这对()中的字符串 
    147  提出,用栈B逆序后放入栈C,在栈C中按上述方式处理直到只有一个元素,将元素返回栈A
    148  栈中存储的是指向path结构体的指针*/
    149 struct dealStruct * dealWithStr( string str, int stateCountV )
    150 {
    151     int j = 0;int temp = 0;
    152     struct path *= NULL;
    153     int currentPri[3= {0};       //用于储存栈中三种运算符(*,&,|)的个数,便于计算
    154     stack<struct path *> stackA;
    155     while( str.at(j) != '%' )
    156     { 
    157             if(  judgeLD(str.at(j))  )   //如果是字符,放入栈A中
    158             {
    159                  tv.insert(str.at(j));   //放入tv中
    160                  stateCountV++;                     
    161                  temp = stateCountV;
    162                  stateCountV++;
    163                  T[temp][stateCountV] += str.at(j);
    164                  p = new path();
    165                  p->from = temp;
    166                  p->to = stateCountV;
    167                  p->strPath += str.at(j);
    168                  stackA.push(p);
    169                  //cout<<"push a new letter "<<p->strPath<<endl;
    170                  j++;             
    171             }
    172             else if( str.at(j) == '*' )  //如果是'*'由于没有比它优先级更高的了, 
    173             {                               //故必然是入栈 
    174                 currentPri[0]++;
    175                 p = new path();
    176                 p->from = -1;
    177                 p->to = -1;
    178                 p->strPath += "*";
    179                 stackA.push(p);
    180                 //cout<<"push *"<<endl;
    181                 j++;
    182             }
    183             else if( str.at(j) == '&' )  //如果是'&',检查之前有没有'*' 
    184             {
    185                 if( currentPri[0== 0 )  //没有的话,入栈 
    186                     currentPri[1]++;
    187                 else
    188                 {
    189                     currentPri[0]--;      //有,计算* 
    190                     currentPri[1]++;
    191                     
    192                     stateCountV = jihe( stateCountV, stackA );      
    193                 }
    194                 p = new path();
    195                 p->from = -2;
    196                 p->to = -2;
    197                 p->strPath = "&";
    198                 stackA.push(p);          //不论有没有'*','&'必然会入栈 
    199                 //cout<<"push &"<<endl;  
    200                 j++;
    201             }
    202             else if( str.at(j) == '|' )   //如果是'|',查看前面有没有*,& 
    203             {
    204                  while( currentPri[0> 0 || currentPri[1> 0 )//一直算直到没有 
    205                  {
    206                       if( currentPri[0> 0 )
    207                       {
    208                           currentPri[0]--;
    209                           stateCountV = jihe( stateCountV, stackA );
    210                       }
    211                       else
    212                       {
    213                           currentPri[1]--;
    214                           stateCountV = lianjie( stateCountV, stackA );
    215                       }
    216                  }
    217                  currentPri[2]++;       //不论有没有*,&,最后'|'必然还是要入栈 
    218                  p = new path();
    219                  p->from = -3;
    220                  p->to = -3;
    221                  p->strPath = "|";
    222                  stackA.push(p);
    223                  //cout<<"push |"<<endl;
    224                  j++;    
    225             }
    226             else if( str.at(j) == '#' )   //如果是'#',则计算前面所有的运算
    227             {
    228                 while( currentPri[0> 0 || currentPri[1> 0 || currentPri[2> 0 )
    229                 {
    230                       if( currentPri[0> 0 )
    231                       {
    232                           currentPri[0]--;
    233                           stateCountV = jihe( stateCountV, stackA );
    234                       }
    235                       else if(  currentPri[1> 0  )
    236                       {
    237                           currentPri[1]--;
    238                           stateCountV = lianjie( stateCountV, stackA );
    239                       }
    240                       else
    241                       {
    242                           currentPri[2]--;
    243                           stateCountV = huo( stateCountV, stackA );
    244                       }
    245                       
    246                 }                
    247                 //cout<<"The End, there is one struct path in stackA: "<<stackA.size()<<endl;                
    248                 j++;
    249                 //cout<<"It is "<<(stackA.top())->strPath<<endl;                
    250             }
    251             else if( str.at(j) == '(' )           //发现左括号,进入括号处理过程
    252             {                 
    253                  //cout<<"发现一个最外层括号"<<endl;           
    254                  int parenthCount = 1;
    255                  bool flag = true;
    256                  while( parenthCount > 0 )
    257                  {
    258                      if( str.at(j) == '(' )
    259                      {                         
    260                          p = new path();
    261                          p->from = -5;
    262                          p->to = -5;
    263                          p->strPath += "(";
    264                          stackA.push(p);
    265                          //cout<<"放入一左个括号"<<endl;
    266                          if!flag )parenthCount++;
    267                          flag = false;
    268                      }   
    269                      else if( str.at(j) == ')' )              //发现第一个右括号 
    270                      {
    271                          //cout<<"发现一个内层右括号"<<endl;
    272                          parenthCount--;
    273                          
    274                          stack<struct path *> stackB;            //定义另两个栈 
    275                          stack<struct path *> stackC;
    276                          
    277                          p = new path();
    278                          p->from = -6;
    279                          p->to = -6;
    280                          p->strPath += "#";
    281                          stackB.push(p);
    282                          while( (stackA.top())->from != -5 )  //栈Apop到第一个( 
    283                          {
    284                              stackB.push( stackA.top() );  //将栈Apop出的放入B 
    285                             // cout<<"stackA弹出一个"<<stackA.top()->strPath<<endl;
    286                              stackA.pop();
    287                              
    288                          }
    289                          //cout<<"stackA想弹出一个左小括号"<<stackA.top()->strPath<<endl; 
    290                          stackA.pop();                         //去掉左小括号
    291                                                 
    292                          int pri[3= {0};
    293                          while!stackB.empty() )       //开始处理栈B内的表达式 
    294                          {
    295                               p = stackB.top();
    296                               stackB.pop(); 
    297                               //cout<<"stackB弹出一个"<<p->from<<endl;
    298                               if( p->from >= 0 )
    299                                   stackC.push(p);      //用栈C处理栈B的表达式 
    300                               else if( p->from == -1 )
    301                               {
    302                                    pri[0]++;
    303                                    stackC.push(p);
    304                               }
    305                               else if( p->from == -2 )
    306                               {
    307                                    if( pri[0> 0 )
    308                                    {
    309                                         pri[0]--;                                       
    310                                         stateCountV = jihe( stateCountV, stackC );
    311                                    }
    312                                    
    313                                    stackC.push(p);
    314                                    pri[1]++;
    315                               }
    316                               else if( p->from == -3 )
    317                               {
    318                                    while( pri[0> 0 || pri[1> 0 )
    319                                    {
    320                                        if( pri[0> 0 )
    321                                        {
    322                                            pri[0]--;                                       
    323                                            stateCountV = jihe( stateCountV, stackC );
    324                                        }
    325                                        else
    326                                        {
    327                                            pri[1]--
    328                                            stateCountV = lianjie( stateCountV, stackC );
    329                                        }
    330                                    
    331                                    }
    332                                    pri[2]++;
    333                                    stackC.push(p);
    334                               }
    335                               else if( p->from == -6 )
    336                               {
    337                                    while( pri[0> 0 || pri[1> 0 || pri[2> 0 )
    338                                    {
    339                                        if( pri[0> 0 )
    340                                        {
    341                                            pri[0]--;
    342                                            stateCountV = jihe( stateCountV, stackC );
    343                                        }
    344                                        else if( pri[1> 0 )  
    345                                        {
    346                                            pri[1]--;
    347                                            stateCountV = lianjie( stateCountV, stackC );
    348                                        }
    349                                        else
    350                                        {
    351                                            pri[2]--;
    352                                            stateCountV = huo( stateCountV, stackC );
    353                                        }
    354                                    }
    355                               }
    356                               else
    357                                   cout<<"括号内含非法字符!"<<endl; 
    358                          }
    359                          //cout<<"stackC 中此时应该有1个东东"<<stackC.size()<<"它是"<<stackC.top()->strPath<<endl;
    360                          
    361                          //cout<<"此时的parenthCount为"<<parenthCount<<endl;           
    362                          stackA.push(stackC.top());    //处理完成,栈C弹出结果 
    363                          stackC.pop();                   //放入栈A 
    364                      }
    365                      else if( str.at(j) == '*' )
    366                      {
    367                          p = new path();
    368                          p->from = -1;
    369                          p->to = -1;
    370                          p->strPath += "*";
    371                          stackA.push(p);
    372                      }
    373                      else if( str.at(j) == '&' )
    374                      {
    375                          p = new path();
    376                          p->from = -2;
    377                          p->to = -2;
    378                          p->strPath += "&";
    379                          stackA.push(p);
    380                      }
    381                      else if( str.at(j) == '|' )
    382                      {
    383                          p = new path();
    384                          p->from = -3;
    385                          p->to = -3;
    386                          p->strPath += "|";
    387                          stackA.push(p);
    388                      }
    389                      else
    390                      {
    391                          stateCountV++;                        
    392                          temp = stateCountV;
    393                          stateCountV++;
    394                          T[temp][stateCountV] += str.at(j);
    395                          p = new path();
    396                          p->from = temp;
    397                          p->to = stateCountV;
    398                          p->strPath += str.at(j);
    399                          stackA.push(p);
    400                      }
    401                      tv.insert(str.at(j));   //放入tv中
    402                      j++;
    403                  }
    404             }
    405             else
    406                 cout<<"含有非法字符!"<<endl;
    407     }
    408     
    409     struct dealStruct * res = new dealStruct();
    410     res->stCount = stateCountV;
    411     res->resPtr = stackA.top();
    412     stackA.pop();
    413     cout<<"NFA生成完毕!"<<endl;
    414     return res;
    415 }
    416 
    417 
    418 int jihe( int stateV, stack<struct path*> &stackA )
    419 {
    420     struct path *= stackA.top();stackA.pop();
    421     struct path *= stackA.top();stackA.pop();
    422     //cout<<"pop the value for * "<<a->strPath<<endl;
    423     //cout<<"pop the sign for * "<<b->strPath<<endl;
    424     
    425     stateV++;
    426     int temp = stateV;
    427     stateV++;
    428     T[temp][a->from] += "`";
    429     T[temp][stateV]  += "`";
    430     T[a->to][stateV] += "`";
    431     T[a->to][a->from] += "`";
    432     
    433     struct path *= new path();
    434     p->from = temp;
    435     p->to = stateV;
    436     p->strPath = "(" + a->strPath + ")*";
    437     stackA.push(p);
    438     //cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
    439     
    440     delete a;delete b;
    441     return stateV;
    442 }
    443 int lianjie( int stateV, stack<struct path*> &stackV  )
    444 {
    445     struct path *= stackV.top();stackV.pop();
    446     //cout<<"pop the second value for & "<<a->strPath<<endl;
    447     struct path *= stackV.top();stackV.pop();
    448     //cout<<"pop the sign for & "<<b->strPath<<endl;
    449     struct path *= stackV.top();stackV.pop();    
    450     //cout<<"pop the first value for & "<<c->strPath<<endl;
    451     T[c->to][a->from] += "`";
    452     
    453     struct path *= new path();
    454     p->from = c->from;
    455     p->to = a->to;
    456     p->strPath = "(" + c->strPath + ")&(" + a->strPath + ")";
    457     stackV.push(p);
    458     //cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
    459     delete a;delete b;delete c;
    460     
    461     return stateV;
    462 }
    463 int huo( int stateV, stack<struct path*> &stackA  )
    464 {
    465     struct path *= stackA.top();stackA.pop();
    466     struct path *= stackA.top();stackA.pop();
    467     struct path *= stackA.top();stackA.pop();
    468     //cout<<"pop the first value for | "<<c->strPath<<endl;
    469     //cout<<"pop the second value for | "<<a->strPath<<endl;
    470     //cout<<"pop the sign for | "<<b->strPath<<endl;
    471     
    472     stateV++;
    473     int temp = stateV;
    474     stateV++;
    475     T[temp][a->from] += "`";
    476     T[temp][c->from] += "`";
    477     T[a->to][stateV] += "`";
    478     T[c->to][stateV] += "`";
    479     
    480     struct path *= new path();
    481     p->from = temp;
    482     p->to = stateV;
    483     p->strPath = "(" + c->strPath + ")|(" + a->strPath + ")";
    484     stackA.push(p);
    485     //cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
    486     delete a;delete b;delete c;
    487     
    488     return stateV;
    489 }
    490 
    491 int reduceNFA( int stateCountV, int *typeIndexV, int typeNumV )    //stateCountV为NFA的起始状态
    492 {
    493     int i,j,k;
    494     struct setNode *head,*setPtr,*end; //用于构造链表的指针,新状态用链表存储 
    495     struct setNode *currentPtr;        //用于遍历链表中元素的指针 
    496                                                   
    497     set<int>::iterator ite_int;        //每个新的状态为一个set,封装到setNode中 
    498     int countNode = 1;
    499     setPtr = new setNode(); 
    500     setPtr->num = 1;
    501     setPtr->Dset = new set<int>();
    502     setPtr->next = NULL;  
    503     BFS(stateCountV, stateCountV);
    504     for( i = 0;arrForBFS[i] >= 0;i++  )//先把起始状态的ε闭包求出作为新起始状态 
    505         setPtr->Dset->insert(arrForBFS[i]);
    506              
    507     head = setPtr;
    508     end = setPtr;
    509     currentPtr = head;
    510     
    511     string letter1[80];                //这里定义了两个数组,一个存放读入的字符 
    512     set<int> letter2[80];              //一个存放读入相应字符后所到达的状态集合 
    513     int up = 0;
    514     
    515     
    516     while( currentPtr != NULL )            //遍历新状态链表
    517     {
    518          //cout<<"currentPtr遍历到"<<currentPtr->num<<endl; 
    519          //下面一个双重循环,第一个循环表示遍历新状态集合的每一个状态 
    520          //第二个循环用于从原NFA找从这个状态出发到达的所有状态 
    521          for( ite_int = currentPtr->Dset->begin(); ite_int != currentPtr->Dset->end(); ++ite_int )
    522              for( i = 0;i <= stateCountV;i++ )
    523                  if( T[*ite_int][i] != "`" && T[*ite_int][i] != "" )
    524                  {
    525                      bool flag2 = 0;
    526                      for( j = 0;j < up;j++ )
    527                          if( letter1[j] == T[*ite_int][i] )
    528                          {
    529                              flag2 = 1;
    530                              break;
    531                          }
    532                      if!flag2 )
    533                      {
    534                          up++;
    535                          letter1[up-1= T[*ite_int][i];  //读入的字符都记录下 
    536                      }
    537                      BFS( i, stateCountV );
    538                      for( k = 0;arrForBFS[k] >= 0;k++ )
    539                              letter2[j].insert(arrForBFS[k]);//所到的状态也记下 
    540                  }
    541          
    542          /*cout<<"从新的状态"<<currentPtr->num<<"生成的矩阵是"<<endl;
    543          for( i = 0;i < up;i++ )
    544          {
    545              cout<<letter1[i]<<"  ";
    546              for( ite_int = letter2[i].begin(); ite_int != letter2[i].end(); ++ite_int )
    547                  cout<<*ite_int<<" ";
    548              cout<<endl;
    549          }//最后letter1,letter2记录了这个新状态读入不同的字符后会到达的状态合集*/ 
    550 
    551      
    552          for( i = 0;i < up;i++ ) //下面开始比较这些状态是不是以前有过的 
    553          {
    554               struct setNode *= head;
    555               while( p != NULL )
    556               {
    557                   if( letter2[i] == *(p->Dset) )
    558                       break;
    559                   p = p->next;
    560               }
    561               if( p != NULL )   //有这个状态,只要加一条路径即可 
    562                   recT[currentPtr->num][p->num] += letter1[i];
    563               else       //没有这个状态,要在链表末加一个节点,并在recT中加路径 
    564               {
    565                   countNode++;
    566                   recT[currentPtr->num][countNode] += letter1[i];
    567                   setPtr = new setNode();
    568                   setPtr->num = countNode;
    569                   setPtr->Dset = new set<int>();
    570                   for( ite_int = letter2[i].begin(); ite_int != letter2[i].end(); ++ite_int )
    571                       setPtr->Dset->insert( *ite_int );
    572                   setPtr->next = NULL;
    573                   end->next = setPtr;
    574                   end = setPtr;
    575                   
    576                   for( k = 0;k < typeNumV;k++ )
    577                   {
    578                       ite_int = setPtr->Dset->find(typeIndexV[k]);   
    579                       if( ite_int != setPtr->Dset->end() || *(setPtr->Dset->end()) == stateCountV )
    580                       {
    581                           char temp[10];
    582                           string str = itoa(k,temp,10);
    583                           recT[setPtr->num][0= "-"+str; //如果是终止状态,做记号
    584                           recT[0][setPtr->num] = "-"+str;
    585                       }
    586                   }
    587               }
    588          }
    589          for( i = 0;i < up;i++ )          //清空letter1,letter2,用于下一轮遍历
    590          {
    591               letter1[i] = "";
    592               letter2[i].clear();
    593          }
    594          up = 0;
    595          currentPtr = currentPtr->next;
    596          
    597          /*cout<<"此时的链表为"<<endl;
    598          struct setNode *p2 = head;
    599          while( p2 != NULL )
    600               {
    601                   cout<<p2->num<<" ";
    602                   p2 = p2->next;
    603               }
    604          cout<<endl;*/
    605     }
    606     
    607     cout<<"DFA生成完毕!"<<endl;
    608     /*cout<<"最后DFA为:"<<endl; 
    609     for( i = 0;i <= countNode;i++,cout<<endl )
    610         for( j = 0; j <= countNode;j++ )
    611         { 
    612            if( recT[i][j] == "" )cout<<"  |";  
    613            else 
    614                cout<<setw(2)<<recT[i][j]<<"|";
    615         }*/
    616     return countNode;
    617     
    618 }
    619 
    620 void BFS( int row, int stateCountV )  //广度搜索法求一个状态的ε闭包 
    621 {
    622     forint i = 0;i < 100;i++ )
    623         arrForBFS[i] = -1;
    624     int ite = -1;
    625     queue<int> que;
    626     que.push(row);
    627     ite++;
    628     arrForBFS[ite] = row;
    629     while!que.empty() )
    630     {
    631         int t = que.front();
    632         que.pop();
    633         forint i = 0;i <= stateCountV;i++ )
    634             if( T[t][i] == "`" )
    635             {                
    636                 ite++;
    637                 arrForBFS[ite] = i;
    638                 que.push(i);
    639             }
    640     }    
    641 }
    642 
    643 void reduceDFA()   //化简DFA算法
    644 {
    645     /*
    646     算法思想:
    647     构造二维数组reduceState[2][stateCountDFA]
    648     矩阵第一行存储各状态当前所属集合标记
    649     矩阵第二行存储个状态读入某个字符后的目标状态所属集合标记
    650     每读入某个字符后,矩阵两行相对应位置都相等的状态为同一集合
    651     第一行相等而第二行不相等的两状态分到不同集合,即将后一个状态的所属集合标记+1
    652     全部字符读完后,扫描矩阵的第一行,即当前所属集合标记,相等的状态合并
    653     简化完成
    654     */
    655     int pos = 1;   //非终结状态所属集合标记
    656     int poe;       //终结状态所属集合标记
    657     for (int i=1;i<=stateCountDFA;i++//该循环用于给所有状态的所属集合赋初值,终结状态为stateCountDFA,非终结状态为1.
    658     {
    659         if(recT[0][i].find('-')!=string::npos)
    660         {
    661             poe=stateCountDFA+atoi(&recT[0][i].at(1));//将不同正则表达式的终结状态标注为不同集合:stateCountDFA+正则表达式序号
    662             reduceState[0][i]=poe;
    663             reduceState[1][i]=poe;
    664         }
    665         else
    666         {
    667             reduceState[0][i]=pos;
    668             reduceState[1][i]=pos;
    669         }
    670     }
    671     poe=stateCountDFA+regCount+1;
    672     bool flag;  //bool标记,读每个字符,调整集合状态后,重新读入该字符,直到不发生集合改变为止
    673     for(Vt=tv.begin();Vt!=tv.end();Vt++)  //读入每个字符的循环
    674     {
    675         //cout<<*Vt;
    676         flag=true;
    677         while(flag)  //该循环即为重新读入当前字符的循环
    678         {
    679             flag = false;
    680             bool item = true;  //用于标记某状态读入当前字符是否产生状态变化
    681             for(int j=1;j<=stateCountDFA;j++
    682             {
    683                 item=false;
    684                 for(int k=1;k<=stateCountDFA;k++)//循环遍历DFA
    685                 {
    686                     if(recT[j][k].find(*Vt)!=string::npos)//若当前状态读入该字符产生状态变化
    687                     {
    688                         item=true;
    689                         reduceState[1][j]=reduceState[0][k];//目标集合赋值为目标状态所属集合
    690                     }
    691                     if(!item)  //不产生状态变化
    692                         reduceState[1][j]=0;  //目标集合标记为0
    693                 }
    694                 
    695             }
    696             bool b1=false;//标记用于判断pos是否加1
    697             bool b2=false;//标记用于判断poe是否加1
    698             for(int p=1;p<=stateCountDFA;p++)//循环扫描reduceState矩阵.第一个标记位置
    699             {
    700                 b1=false;b2=false;
    701                 for(int q=2;q<=stateCountDFA;q++)//循环扫描reduceState矩阵.第二个标记位置
    702                 {                    
    703                     if(reduceState[0][p]==reduceState[0][q]&&reduceState[1][p]!=reduceState[1][q])
    704                     { 
    705                         //如果存在所属集合相等,目标集合不等的情况,将所属集合分开
    706                         if(reduceState[0][q]<stateCountDFA)  //当前状态为非终结状态
    707                         {
    708                             b1=true;
    709                             reduceState[0][q]=pos;
    710                         }
    711                         else  //当前状态为终结状态
    712                         {
    713                             b2=true;
    714                             reduceState[0][q]=poe;
    715                         }
    716                         flag=true;  //发生集合调整,标记为true
    717                     }                    
    718                 }
    719                 if(b1)
    720                     pos++;
    721                 if(b2)
    722                     poe++;
    723             }
    724         }
    725     }
    726 
    727 //    for(int i1=0;i1<stateCountDFA;i1++)
    728 //        cout<<reduceState[0][i1]<<" ";
    729 //    cout<<endl;
    730 //    for(i1=0;i1<stateCountDFA;i1++)
    731 //        cout<<reduceState[1][i1]<<" ";
    732 
    733     for(int j=1;j<stateCountDFA;j++)  //循环扫描reduceState矩阵第一行.第一个标记位置
    734     {
    735         if(reduceState[0][j]!=0)
    736         {
    737             for(int k=j+1;k<=stateCountDFA;k++)  //循环扫描reduceState矩阵第一行.第二个标记位置
    738             {
    739                 if(reduceState[0][j]==reduceState[0][k])//两个标记位置值相等,即所属集合相同
    740                     combine(j,k);  //调用合并函数
    741             }
    742         }
    743     }
    744     /*cout<<"化简后的DFA为:"<<endl; 
    745     for(int m = 0;m <= stateCountDFA;m++,cout<<endl )
    746         for(int n = 0; n <= stateCountDFA;n++ )
    747         { 
    748            if( recT[m][n] == "" )cout<<" |";  
    749            else 
    750                cout<<recT[m][n]<<"|";
    751         }*/
    752         
    753     cout<<"DFA简化完毕!"<<endl;
    754 }
    755 
    756 void combine(int j,int k)  
    757 {
    758     /*
    759     函数用于合并两个状态
    760     把第K状态的相应的行和列的值加到第j状态上
    761     并把k状态所有的值置空
    762     */
    763     for(int i=1;i<=stateCountDFA;i++)
    764     {
    765         recT[j][i]+=recT[k][i];
    766         recT[i][j]+=recT[i][k];
    767     }
    768     for(i=0;i<=stateCountDFA;i++)
    769     {
    770         recT[k][i]="";
    771         recT[i][k]="";
    772     }
    773     reduceState[0][k]=0;
    774 }
    775 
    776 void produceC( int nodeCountV, string *typeNameV )       //产生.cpp文件
    777 {
    778     int i, j ,k;
    779     char buf[1024];                //临时保存读取出来的文件内容
    780     string message = "";
    781     string remainCode = "";
    782     ifstream in;
    783     in.open("Remain.txt");
    784     if(in.is_open())          //文件打开成功,说明曾经写入过东西
    785     {
    786         while(in.good() && !in.eof())
    787         {
    788             memset(buf,0,1024);
    789             in.getline(buf,1204);
    790             message = buf;
    791             message += "\n";
    792             remainCode += message;
    793         }
    794         in.close();
    795     }
    796     
    797     ofstream fout("scanner.cpp",ios::out);
    798     fout<<remainCode;
    799     fout<<"while(flag)\n"<<"{\n";
    800     fout<<"switch(state)\n"<<"{\n";
    801     for( i = 1;i <= nodeCountV;i++ )   //遍历每个新状态 
    802     {
    803          fout<<"case "<<i<<":\n";
    804          fout<<"last++;\n";
    805          fout<<"c = code[last];\n";
    806          if( i == 1 )
    807          {
    808              fout<<"if( c == '%' )\n";  //读到文件末尾,终结 
    809              fout<<"{\n";
    810              fout<<"flag = 0;\n";
    811              fout<<"break;\n}\n";
    812          }
    813          bool flag3 = 0;bool flag2 = 0;
    814          for( j = 1;j <= nodeCountV;j++ )
    815               if( recT[i][j] != "'" && recT[i][j] != "" )
    816               {
    817                   flag3 = 1;                  
    818                   if( flag2 )
    819                       fout<<"else ";
    820                       
    821                   fout<<"if( contain( \""<<recT[i][j]<<"\",c ) ) ";
    822                   fout<<"state = "<<j<<";\n";
    823                   flag2 = 1;
    824               }
    825          if( flag3 )
    826              fout<<"else\n{\n";
    827          //fout<<"{\n"<<"endState.find("<<i<<");\n";
    828          //fout<<"if( ite_int == endState.find && *(endState.end()) != "<<i<<")\n";
    829          if( recT[i][0][0== '-' )
    830          {
    831              string tempIndexStr = recT[i][0].substr(1,recT[i][0].length()-1);
    832              int tempIndex = atoi(tempIndexStr.c_str());
    833              string type = typeNameV[tempIndex];
    834              fout<<"flag2 = true;\n";
    835              fout<<"for( k = 0;k < count;k++ )\n";;
    836              fout<<"{\n"<<"if( SymbolTable[k][1] == code.substr(first,last-first) )\n";
    837              fout<<"{\nflag2 = false;\nbreak;\n}\n}\n";
    838              fout<<"if(flag2)\n{\n";
    839              fout<<"count++;\n";
    840              fout<<"SymbolTable[count-1][0] = \""<<type<<"\";\n";
    841              fout<<"SymbolTable[count-1][1] = code.substr(first,last-first);\n";
    842              fout<<"}\n";
    843 
    844              fout<<"cout<<code.substr(first,last-first)<<endl;\n";
    845              fout<<"last--;\n";           
    846              fout<<"first = last+1;\n";
    847              fout<<"state = 1;\n";
    848          }
    849          else
    850          {
    851              fout<<"cout<<\"编译错误!\\n\";\n";
    852              fout<<"flag = 0;\n"
    853          }
    854          if( flag3 )
    855              fout<<"}\n";
    856          fout<<"break;\n";
    857     }
    858     fout<<"default:\n"<<"break;\n"<<"}\n}\n";
    859     
    860     fout<<"cout<<\"Symbol 表如下:\"<<endl;\n";
    861     fout<<"cout<<setw(3)<<\"序号\"<<setw(10)<<\"token类型\"<<\"     \"<<\"token名字\"<<endl;\n";
    862     fout<<"for( k2 = 0;k2 < count;k2++ )\n";
    863     fout<<"{\n"<<"cout<<setw(3)<<k2<<setw(10)<<SymbolTable[k2][0]<<\"     \"<<SymbolTable[k2][1]<<endl;\n";
    864     fout<<"}\n";
    865     fout<<"system(\"pause\");\n";
    866     fout<<"return 0;\n}\n";
    867 
    868     cout<<"代码生成完毕!见scanner.cpp文件。"<<endl;
    869 }

    正则表达式在RegularExpressions.txt中写明,示例如下:

    4
    Identity
    a(bc)
    *|bc*|d
    Number
    (
    1|2|3|4|5|6|7|8|9|0)*
    key
    if|else|then
    operator
    >(`|=)|<(`|=)|=(=|`)

    第一行为正则表达式的个数,之后输入表达式代表的类型名,回车,表达式,回车,以此类推。

     remain.txt存放着生成的代码文件中固定的代码,生成代码时会先从此文件读取。

     里面的代码如下:

    remain.txt
    #include<iostream>
    using namespace std;
    #include
    <string>
    #include
    <fstream>
    #include
    <set>
    #include
    <iomanip>
    string SymbolTable[1000][2= {""}; 
    bool contain( string str, char v )
    {
         
    forint i = 0;i < str.length();i++ )
             
    if( str[i] == v )
                 
    return true;
         
    return false;
    }
    int main()
    {  
        
    string tempCode = "";
        
    string code = "";
        
        
    int i,j,k,k2=0;
        
    char buf[1024];                //临时保存读取出来的文件内容
        string message = "";
        ifstream infile;
        infile.open(
    "code.txt");
        
    if(infile.is_open())          //文件打开成功,说明曾经写入过东西
        {
            
    while(infile.good() && !infile.eof())
            {
                memset(buf,
    0,1024);
                infile.getline(buf,
    1204);
                message 
    = buf;
                tempCode 
    += message;
            }
            infile.close();
        }
        
    for( i = 0;i < tempCode.length();i++ )
        {
             
    if( tempCode[i] != ' ' && tempCode[i] != '\t' && tempCode[i] != '\n' && tempCode[i] != ';' )
             code 
    += tempCode[i];
        }
        code 
    += "%";
        cout
    <<code<<endl;
        
        
    int first = 0, last = -1;
        
    char c;
        
    string tempStr = "";
        
    int state = 1;
        
    bool flag = 1;
        
    bool flag2 = true;
        
    int count = 0;
        
    //set<int> endState;
        
    //set<int>::iterator ite_int;
    ///////////////////////////////////////////保留代码///////////////////////////////////////////////////

    表达式读入生成的DFA简化后生成.cpp文件,命名为scanner.cpp

    根据上面的那个示例,生成的scanner.cpp文件内容为:

    scanner.txt
    #include<iostream>
    using namespace std;
    #include
    <string>
    #include
    <fstream>
    #include
    <set>
    #include
    <iomanip>
    string SymbolTable[1000][2= {""}; 
    bool contain( string str, char v )
    {
         
    forint i = 0;i < str.length();i++ )
             
    if( str[i] == v )
                 
    return true;
         
    return false;
    }
    int main()
    {  
        
    string tempCode = "";
        
    string code = "";
        
        
    int i,j,k,k2=0;
        
    char buf[1024];                //临时保存读取出来的文件内容
        string message = "";
        ifstream infile;
        infile.open(
    "code.txt");
        
    if(infile.is_open())          //文件打开成功,说明曾经写入过东西
        {
            
    while(infile.good() && !infile.eof())
            {
                memset(buf,
    0,1024);
                infile.getline(buf,
    1204);
                message 
    = buf;
                tempCode 
    += message;
            }
            infile.close();
        }
        
    for( i = 0;i < tempCode.length();i++ )
        {
             
    if( tempCode[i] != ' ' && tempCode[i] != '\t' && tempCode[i] != '\n' && tempCode[i] != ';' )
             code 
    += tempCode[i];
        }
        code 
    += "%";
        cout
    <<code<<endl;
        
        
    int first = 0, last = -1;
        
    char c;
        
    string tempStr = "";
        
    int state = 1;
        
    bool flag = 1;
        
    bool flag2 = true;
        
    int count = 0;
        
    //set<int> endState;
        
    //set<int>::iterator ite_int;
    ///////////////////////////////////////////保留代码///////////////////////////////////////////////////
    while(flag)
    {
    switch(state)
    {
    case 1:
    last
    ++;
    = code[last];
    if( c == '%' )
    {
    flag 
    = 0;
    break;
    }
    if( contain( "a",c ) ) state = 2;
    else if( contain( "b",c ) ) state = 3;
    else if( contain( "d",c ) ) state = 4;
    else if( contain( "1234567890",c ) ) state = 5;
    else if( contain( "i",c ) ) state = 15;
    else if( contain( "e",c ) ) state = 16;
    else if( contain( "t",c ) ) state = 17;
    else if( contain( "><=",c ) ) state = 18;
    else
    {
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    }
    break;
    case 2:
    last
    ++;
    = code[last];
    if( contain( "bb",c ) ) state = 21;
    else
    {
    flag2 
    = true;
    for( k = 0;k < count;k++ )
    {
    if( SymbolTable[k][1== code.substr(first,last-first) )
    {
    flag2 
    = false;
    break;
    }
    }
    if(flag2)
    {
    count
    ++;
    SymbolTable[count
    -1][0= "Identity";
    SymbolTable[count
    -1][1= code.substr(first,last-first);
    }
    cout
    <<code.substr(first,last-first)<<endl;
    last
    --;
    first 
    = last+1;
    state 
    = 1;
    }
    break;
    case 3:
    last
    ++;
    = code[last];
    if( contain( "c",c ) ) state = 3;
    else
    {
    flag2 
    = true;
    for( k = 0;k < count;k++ )
    {
    if( SymbolTable[k][1== code.substr(first,last-first) )
    {
    flag2 
    = false;
    break;
    }
    }
    if(flag2)
    {
    count
    ++;
    SymbolTable[count
    -1][0= "Identity";
    SymbolTable[count
    -1][1= code.substr(first,last-first);
    }
    cout
    <<code.substr(first,last-first)<<endl;
    last
    --;
    first 
    = last+1;
    state 
    = 1;
    }
    break;
    case 4:
    last
    ++;
    = code[last];
    flag2 
    = true;
    for( k = 0;k < count;k++ )
    {
    if( SymbolTable[k][1== code.substr(first,last-first) )
    {
    flag2 
    = false;
    break;
    }
    }
    if(flag2)
    {
    count
    ++;
    SymbolTable[count
    -1][0= "Identity";
    SymbolTable[count
    -1][1= code.substr(first,last-first);
    }
    cout
    <<code.substr(first,last-first)<<endl;
    last
    --;
    first 
    = last+1;
    state 
    = 1;
    break;
    case 5:
    last
    ++;
    = code[last];
    if( contain( "1121233123444123455551234566666123456777777123456788888881234567899999999123456789000000000",c ) ) state = 5;
    else
    {
    flag2 
    = true;
    for( k = 0;k < count;k++ )
    {
    if( SymbolTable[k][1== code.substr(first,last-first) )
    {
    flag2 
    = false;
    break;
    }
    }
    if(flag2)
    {
    count
    ++;
    SymbolTable[count
    -1][0= "Number";
    SymbolTable[count
    -1][1= code.substr(first,last-first);
    }
    cout
    <<code.substr(first,last-first)<<endl;
    last
    --;
    first 
    = last+1;
    state 
    = 1;
    }
    break;
    case 6:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 7:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 8:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 9:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 10:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 11:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 12:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 13:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 14:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 15:
    last
    ++;
    = code[last];
    if( contain( "f",c ) ) state = 23;
    else
    {
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    }
    break;
    case 16:
    last
    ++;
    = code[last];
    if( contain( "l",c ) ) state = 24;
    else
    {
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    }
    break;
    case 17:
    last
    ++;
    = code[last];
    if( contain( "h",c ) ) state = 25;
    else
    {
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    }
    break;
    case 18:
    last
    ++;
    = code[last];
    if( contain( "===",c ) ) state = 26;
    else
    {
    flag2 
    = true;
    for( k = 0;k < count;k++ )
    {
    if( SymbolTable[k][1== code.substr(first,last-first) )
    {
    flag2 
    = false;
    break;
    }
    }
    if(flag2)
    {
    count
    ++;
    SymbolTable[count
    -1][0= "operator";
    SymbolTable[count
    -1][1= code.substr(first,last-first);
    }
    cout
    <<code.substr(first,last-first)<<endl;
    last
    --;
    first 
    = last+1;
    state 
    = 1;
    }
    break;
    case 19:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 20:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 21:
    last
    ++;
    = code[last];
    if( contain( "c",c ) ) state = 2;
    else
    {
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    }
    break;
    case 22:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 23:
    last
    ++;
    = code[last];
    flag2 
    = true;
    for( k = 0;k < count;k++ )
    {
    if( SymbolTable[k][1== code.substr(first,last-first) )
    {
    flag2 
    = false;
    break;
    }
    }
    if(flag2)
    {
    count
    ++;
    SymbolTable[count
    -1][0= "key";
    SymbolTable[count
    -1][1= code.substr(first,last-first);
    }
    cout
    <<code.substr(first,last-first)<<endl;
    last
    --;
    first 
    = last+1;
    state 
    = 1;
    break;
    case 24:
    last
    ++;
    = code[last];
    if( contain( "s",c ) ) state = 30;
    else
    {
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    }
    break;
    case 25:
    last
    ++;
    = code[last];
    if( contain( "e",c ) ) state = 31;
    else
    {
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    }
    break;
    case 26:
    last
    ++;
    = code[last];
    flag2 
    = true;
    for( k = 0;k < count;k++ )
    {
    if( SymbolTable[k][1== code.substr(first,last-first) )
    {
    flag2 
    = false;
    break;
    }
    }
    if(flag2)
    {
    count
    ++;
    SymbolTable[count
    -1][0= "operator";
    SymbolTable[count
    -1][1= code.substr(first,last-first);
    }
    cout
    <<code.substr(first,last-first)<<endl;
    last
    --;
    first 
    = last+1;
    state 
    = 1;
    break;
    case 27:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 28:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 29:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 30:
    last
    ++;
    = code[last];
    if( contain( "e",c ) ) state = 23;
    else
    {
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    }
    break;
    case 31:
    last
    ++;
    = code[last];
    if( contain( "n",c ) ) state = 23;
    else
    {
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    }
    break;
    case 32:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    case 33:
    last
    ++;
    = code[last];
    cout
    <<"编译错误!\n";
    flag 
    = 0;
    break;
    default:
    break;
    }
    }
    cout
    <<"Symbol 表如下:"<<endl;
    cout
    <<setw(3)<<"序号"<<setw(10)<<"token类型"<<"     "<<"token名字"<<endl;
    for( k2 = 0;k2 < count;k2++ )
    {
    cout
    <<setw(3)<<k2<<setw(10)<<SymbolTable[k2][0]<<"     "<<SymbolTable[k2][1]<<endl;
    }
    system(
    "pause");
    return 0;
    }

    等待编译的代码写在code.txt中,这个txt相当于代码输入界面,这里的内容对应着上面的那些正则表实例,内容如下:

    if a >= 3
    bcc 
    = 123;
    else if d < 9
    abc 
    = 12

    运行主体代码后,scanner.cpp里的代码编译后的程序会读入code.txt中的代码,再在运行界面上输出token串和Symbol Table,大功告成!

  • 相关阅读:
    Vue的条件渲染详解
    Vue的style绑定
    Vue的class绑定总结
    v-model详解
    MUI手势锁
    mysql主从搭建
    处理绿盟科技安全评估的系统漏洞
    微信小程序发送ajax
    微信小程序上拉下拉刷新
    微信小程序覆盖自定义组件样式
  • 原文地址:https://www.cnblogs.com/felixfang/p/1648765.html
Copyright © 2020-2023  润新知