[Compiling Principles] LEX基本功能的实现

代码

  1 #include<iostream>
  2 #include "stdio.h"
  3 using namespace std;
  4 #include<String>
  5 #include<Stack>
  6 #include<set>
  7 #include<queue>
  8 #include<fstream>
  9 #include<iomanip>
10 struct path
11 {
12     int from;                  //一个path代表从一个状态到另一个状态的路径
13     int to;
14     string strPath;            //strPath代表这个结构体处理的路径
15 };
16 struct dealStruct
17 {
18     int stCount;
19     struct path* resPtr;
20 };
21 struct setNode
22 {
23     set<int> *Dset;
24     int num;
25     struct setNode * next;
26 };
27 bool judgeLD( char v )                      //判断正则表达式中是否含有非法字符
28 {
29     if(  ( int(v) > 47 &&  int(v) <= 57 ) ||
30          ( int(v) >= 65 &&  int(v) <= 90 )||
31          ( int(v) >= 97 &&  int(v) <= 122 ) ||
32            v == '>' || v == '<' || v == '=' ||
33            v == '`' || v == '+' || v == '-' )
34         return true;
35     else
36         return false;
37 }
38 struct dealStruct * dealWithStr( string, int ); //通过正则表达式生成NFA
39 int jihe( int, stack<struct path*> & );         //处理正则表达式时'*’的运算函数
40 int lianjie( int, stack<struct path*> & );      //处理正则表达式时'&’（自定义运算符ab相当于a&b）的运算函数
41 int huo( int, stack<struct path*> & );;         //处理正则表达式时'|’的运算函数
42 int reduceNFA( int, int*, int );                //NFA->DFA函数
43 void BFS( int, int );                           //计算一个状态的ξ闭包
44 void produceC( int, string * );                 //生成词法分析器的.cpp文件
45 int arrForBFS[100] = {0};                       //存放BFS函数生成的闭包的数组
46 string T[2000][2000];                           //NFA的数组表示
47 string recT[2000][2000];                        //DFA的数组表示
48 int stateCountDFA;                              //DFA状态数量
49 int reduceState[2][2000];                       //化简DFA用的数组
50 set<char> tv;                                   //所有终结符
51 set<char>::iterator Vt;
52 void reduceDFA();                               //化简DFA
53 void combine(int,int);                          //化简DFA时合并两个状态
54 int regCount;                                   //输入正则表达式的个数
55
56 int main()
57 {
58     regCount = 0;
59     int i,j,k;
60     int temp = 0;
61
62     string * regArr;
63     string * typeName;
64     int * typeIndex;
65     int **resultEnterExt;
66
67
68     ifstream inRegular;
69     inRegular.open("RegularExpressions.txt");
70     inRegular>>regCount;                          //正则表达式个数
71     typeName = new string[regCount];
72     typeIndex = new int[regCount];
73     resultEnterExt = new int*[regCount];      //定义一个二位动态数组用于存放每个正则表达式所生成的NFA的起始状态和终结状态
74     for( i = 0;i < regCount;i++ )
75         resultEnterExt[i] = new int[2];
76     regArr = new string[regCount];
77     for( i = 0;i < regCount;i++ )
78     {
79         inRegular>>typeName[i];
80         inRegular>>regArr[i];
81     }                                           //读入正则表达式
82
83     int stateCount = -1;
84     for( i = 0;i < regCount;i++ )                 //遍历每个正则表达式
85     {
86         for( j = 0;j < regArr[i].length()-1;j++ )
87            if(   (  judgeLD(regArr[i].at(j) ) && (  judgeLD(regArr[i].at(j+1)) || regArr[i].at(j+1) == '(' )  ) ||
88                  (  regArr[i].at(j) == ')'  &&   ( regArr[i].at(j+1) == '(' || judgeLD(regArr[i].at(j+1)) )   ) ||
89                  (  regArr[i].at(j) == '*'  &&    ( regArr[i].at(j+1) == '(' || judgeLD(regArr[i].at(j+1)) )  )   )
90               regArr[i].insert(j+1,"&");           //预处理正则表达式，添加自定义运算符'&'
91
92
93         regArr[i].insert(regArr[i].length(),"#%");  //在末尾添加结束符
94         cout<<regArr[i]<<endl;                      //输出处理后的正则表达式
95
96
97         j = 0;
98         struct path *p = NULL;
99         struct dealStruct * result = dealWithStr( regArr[i],stateCount );  //对于每个正则表达式调用NFA生成函数
100
101         stateCount = result->stCount;
102         resultEnterExt[i][0] = result->resPtr->from;    //记录该正则表达式的起始状态和终结状态
103         resultEnterExt[i][1] = result->resPtr->to;
104         typeIndex[i] = resultEnterExt[i][1];
105 /*        cout<<"The number of states now is "<<stateCount<<endl;
106         cout<<"The regular perform dealt with just now is "<<result->resPtr->strPath<<endl;
107         cout<<"The from dealt with just now is "<<resultEnterExt[i][0]<<endl;
108         cout<<"The to dealt with just now is "<<resultEnterExt[i][1]<<endl;*/
109     }
110
111     stateCount++;                                  //汇总所有正则表达式的NFA
112     cout<<"NFA合并完毕！"<<endl;
113
114     for( i = 0;i < regCount;i++ )
115          T[stateCount][resultEnterExt[i][0]] = '`';      //从公共起始状态出发
116
117   /*  for( i = 0;i <= stateCount;i++ )       //输出产生的NFA以检验
118     {
119         for( j = 0;j <= stateCount;j++ )
120         {
121              if( T[i][j] == "" ) cout<<" |";
122              else cout<<T[i][j]<<'|';
123         }
124         cout<<endl;
125     }*/
126
127     stateCountDFA = reduceNFA( stateCount, typeIndex, regCount ); //调用NFA->DFA函数生成DFA
128     reduceDFA();
129
130     produceC( stateCountDFA, typeName );                           //产生.cpp代码
131
132     system("pause");
133
134     return 0;
135 }
136
137 /*以下为NFA的生成函数，采用三个栈实现Thompson算法，第一个栈用于读入字符和运算符
138  第二个栈用于逆序，第三个栈专门用于处理一对括号内部的表达式，其实用两个栈就可以
139  实现，及把第三个栈舍弃，直接把括号的计算放在第一个栈中进行也可。为了计算更加
140  清楚不易出错故而多加了一个栈*
141  算法思路：读入正则表达式放入栈A中，*,&,|的优先级分别定为3,2,1，而表达式末
142  尾的'#'优先级最低，看做0。用一个三元数组存放当各个运算符的个数，字符均放入栈A中，
143  读到运算符时，当栈外的运算符的优先级大于等于站内所有运算符时，该运算符进栈，
144  如果低于所有站内运算符，计算站内所有运算符表的运算，直到栈内没有比栈外更高优先
145  级的运算符。
146  读到(的时候，(进栈，之后不论读到什么均进栈直到读到第一个)，将这对()中的字符串
147  提出，用栈B逆序后放入栈C，在栈C中按上述方式处理直到只有一个元素，将元素返回栈A
148  栈中存储的是指向path结构体的指针*/
149 struct dealStruct * dealWithStr( string str, int stateCountV )
150 {
151     int j = 0;int temp = 0;
152     struct path *p = NULL;
153     int currentPri[3] = {0};       //用于储存栈中三种运算符（*，&，|）的个数，便于计算
154     stack<struct path *> stackA;
155     while( str.at(j) != '%' )
156     {
157             if(  judgeLD(str.at(j))  )   //如果是字符，放入栈A中
158             {
159                  tv.insert(str.at(j));   //放入tv中
160                  stateCountV++;
161                  temp = stateCountV;
162                  stateCountV++;
163                  T[temp][stateCountV] += str.at(j);
164                  p = new path();
165                  p->from = temp;
166                  p->to = stateCountV;
167                  p->strPath += str.at(j);
168                  stackA.push(p);
169                  //cout<<"push a new letter "<<p->strPath<<endl;
170                  j++;
171             }
172             else if( str.at(j) == '*' )  //如果是'*'由于没有比它优先级更高的了，
173             {                               //故必然是入栈
174                 currentPri[0]++;
175                 p = new path();
176                 p->from = -1;
177                 p->to = -1;
178                 p->strPath += "*";
179                 stackA.push(p);
180                 //cout<<"push *"<<endl;
181                 j++;
182             }
183             else if( str.at(j) == '&' )  //如果是'&',检查之前有没有'*'
184             {
185                 if( currentPri[0] == 0 )  //没有的话，入栈
186                     currentPri[1]++;
187                 else
188                 {
189                     currentPri[0]--;      //有，计算*
190                     currentPri[1]++;
191
192                     stateCountV = jihe( stateCountV, stackA );
193                 }
194                 p = new path();
195                 p->from = -2;
196                 p->to = -2;
197                 p->strPath = "&";
198                 stackA.push(p);          //不论有没有'*'，'&'必然会入栈
199                 //cout<<"push &"<<endl;
200                 j++;
201             }
202             else if( str.at(j) == '|' )   //如果是'|'，查看前面有没有*,&
203             {
204                  while( currentPri[0] > 0 || currentPri[1] > 0 )//一直算直到没有
205                  {
206                       if( currentPri[0] > 0 )
207                       {
208                           currentPri[0]--;
209                           stateCountV = jihe( stateCountV, stackA );
210                       }
211                       else
212                       {
213                           currentPri[1]--;
214                           stateCountV = lianjie( stateCountV, stackA );
215                       }
216                  }
217                  currentPri[2]++;       //不论有没有*，&，最后'|'必然还是要入栈
218                  p = new path();
219                  p->from = -3;
220                  p->to = -3;
221                  p->strPath = "|";
222                  stackA.push(p);
223                  //cout<<"push |"<<endl;
224                  j++;
225             }
226             else if( str.at(j) == '#' )   //如果是'#'，则计算前面所有的运算
227             {
228                 while( currentPri[0] > 0 || currentPri[1] > 0 || currentPri[2] > 0 )
229                 {
230                       if( currentPri[0] > 0 )
231                       {
232                           currentPri[0]--;
233                           stateCountV = jihe( stateCountV, stackA );
234                       }
235                       else if(  currentPri[1] > 0  )
236                       {
237                           currentPri[1]--;
238                           stateCountV = lianjie( stateCountV, stackA );
239                       }
240                       else
241                       {
242                           currentPri[2]--;
243                           stateCountV = huo( stateCountV, stackA );
244                       }
245
246                 }
247                 //cout<<"The End, there is one struct path in stackA: "<<stackA.size()<<endl;
248                 j++;
249                 //cout<<"It is "<<(stackA.top())->strPath<<endl;
250             }
251             else if( str.at(j) == '(' )           //发现左括号，进入括号处理过程
252             {
253                  //cout<<"发现一个最外层括号"<<endl;
254                  int parenthCount = 1;
255                  bool flag = true;
256                  while( parenthCount > 0 )
257                  {
258                      if( str.at(j) == '(' )
259                      {
260                          p = new path();
261                          p->from = -5;
262                          p->to = -5;
263                          p->strPath += "(";
264                          stackA.push(p);
265                          //cout<<"放入一左个括号"<<endl;
266                          if( !flag )parenthCount++;
267                          flag = false;
268                      }
269                      else if( str.at(j) == ')' )              //发现第一个右括号
270                      {
271                          //cout<<"发现一个内层右括号"<<endl;
272                          parenthCount--;
273
274                          stack<struct path *> stackB;            //定义另两个栈
275                          stack<struct path *> stackC;
276
277                          p = new path();
278                          p->from = -6;
279                          p->to = -6;
280                          p->strPath += "#";
281                          stackB.push(p);
282                          while( (stackA.top())->from != -5 )  //栈Apop到第一个(
283                          {
284                              stackB.push( stackA.top() );  //将栈Apop出的放入B
285                             // cout<<"stackA弹出一个"<<stackA.top()->strPath<<endl;
286                              stackA.pop();
287
288                          }
289                          //cout<<"stackA想弹出一个左小括号"<<stackA.top()->strPath<<endl;
290                          stackA.pop();                         //去掉左小括号
291
292                          int pri[3] = {0};
293                          while( !stackB.empty() )       //开始处理栈B内的表达式
294                          {
295                               p = stackB.top();
296                               stackB.pop();
297                               //cout<<"stackB弹出一个"<<p->from<<endl;
298                               if( p->from >= 0 )
299                                   stackC.push(p);      //用栈C处理栈B的表达式
300                               else if( p->from == -1 )
301                               {
302                                    pri[0]++;
303                                    stackC.push(p);
304                               }
305                               else if( p->from == -2 )
306                               {
307                                    if( pri[0] > 0 )
308                                    {
309                                         pri[0]--;
310                                         stateCountV = jihe( stateCountV, stackC );
311                                    }
312
313                                    stackC.push(p);
314                                    pri[1]++;
315                               }
316                               else if( p->from == -3 )
317                               {
318                                    while( pri[0] > 0 || pri[1] > 0 )
319                                    {
320                                        if( pri[0] > 0 )
321                                        {
322                                            pri[0]--;
323                                            stateCountV = jihe( stateCountV, stackC );
324                                        }
325                                        else
326                                        {
327                                            pri[1]--;
328                                            stateCountV = lianjie( stateCountV, stackC );
329                                        }
330
331                                    }
332                                    pri[2]++;
333                                    stackC.push(p);
334                               }
335                               else if( p->from == -6 )
336                               {
337                                    while( pri[0] > 0 || pri[1] > 0 || pri[2] > 0 )
338                                    {
339                                        if( pri[0] > 0 )
340                                        {
341                                            pri[0]--;
342                                            stateCountV = jihe( stateCountV, stackC );
343                                        }
344                                        else if( pri[1] > 0 )
345                                        {
346                                            pri[1]--;
347                                            stateCountV = lianjie( stateCountV, stackC );
348                                        }
349                                        else
350                                        {
351                                            pri[2]--;
352                                            stateCountV = huo( stateCountV, stackC );
353                                        }
354                                    }
355                               }
356                               else
357                                   cout<<"括号内含非法字符！"<<endl;
358                          }
359                          //cout<<"stackC 中此时应该有1个东东"<<stackC.size()<<"它是"<<stackC.top()->strPath<<endl;
360
361                          //cout<<"此时的parenthCount为"<<parenthCount<<endl;
362                          stackA.push(stackC.top());    //处理完成，栈C弹出结果
363                          stackC.pop();                   //放入栈A
364                      }
365                      else if( str.at(j) == '*' )
366                      {
367                          p = new path();
368                          p->from = -1;
369                          p->to = -1;
370                          p->strPath += "*";
371                          stackA.push(p);
372                      }
373                      else if( str.at(j) == '&' )
374                      {
375                          p = new path();
376                          p->from = -2;
377                          p->to = -2;
378                          p->strPath += "&";
379                          stackA.push(p);
380                      }
381                      else if( str.at(j) == '|' )
382                      {
383                          p = new path();
384                          p->from = -3;
385                          p->to = -3;
386                          p->strPath += "|";
387                          stackA.push(p);
388                      }
389                      else
390                      {
391                          stateCountV++;
392                          temp = stateCountV;
393                          stateCountV++;
394                          T[temp][stateCountV] += str.at(j);
395                          p = new path();
396                          p->from = temp;
397                          p->to = stateCountV;
398                          p->strPath += str.at(j);
399                          stackA.push(p);
400                      }
401                      tv.insert(str.at(j));   //放入tv中
402                      j++;
403                  }
404             }
405             else
406                 cout<<"含有非法字符！"<<endl;
407     }
408
409     struct dealStruct * res = new dealStruct();
410     res->stCount = stateCountV;
411     res->resPtr = stackA.top();
412     stackA.pop();
413     cout<<"NFA生成完毕！"<<endl;
414     return res;
415 }
416
417
418 int jihe( int stateV, stack<struct path*> &stackA )
419 {
420     struct path *b = stackA.top();stackA.pop();
421     struct path *a = stackA.top();stackA.pop();
422     //cout<<"pop the value for * "<<a->strPath<<endl;
423     //cout<<"pop the sign for * "<<b->strPath<<endl;
424
425     stateV++;
426     int temp = stateV;
427     stateV++;
428     T[temp][a->from] += "`";
429     T[temp][stateV]  += "`";
430     T[a->to][stateV] += "`";
431     T[a->to][a->from] += "`";
432
433     struct path *p = new path();
434     p->from = temp;
435     p->to = stateV;
436     p->strPath = "(" + a->strPath + ")*";
437     stackA.push(p);
438     //cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
439
440     delete a;delete b;
441     return stateV;
442 }
443 int lianjie( int stateV, stack<struct path*> &stackV  )
444 {
445     struct path *a = stackV.top();stackV.pop();
446     //cout<<"pop the second value for & "<<a->strPath<<endl;
447     struct path *b = stackV.top();stackV.pop();
448     //cout<<"pop the sign for & "<<b->strPath<<endl;
449     struct path *c = stackV.top();stackV.pop();
450     //cout<<"pop the first value for & "<<c->strPath<<endl;
451     T[c->to][a->from] += "`";
452
453     struct path *p = new path();
454     p->from = c->from;
455     p->to = a->to;
456     p->strPath = "(" + c->strPath + ")&(" + a->strPath + ")";
457     stackV.push(p);
458     //cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
459     delete a;delete b;delete c;
460
461     return stateV;
462 }
463 int huo( int stateV, stack<struct path*> &stackA  )
464 {
465     struct path *a = stackA.top();stackA.pop();
466     struct path *b = stackA.top();stackA.pop();
467     struct path *c = stackA.top();stackA.pop();
468     //cout<<"pop the first value for | "<<c->strPath<<endl;
469     //cout<<"pop the second value for | "<<a->strPath<<endl;
470     //cout<<"pop the sign for | "<<b->strPath<<endl;
471
472     stateV++;
473     int temp = stateV;
474     stateV++;
475     T[temp][a->from] += "`";
476     T[temp][c->from] += "`";
477     T[a->to][stateV] += "`";
478     T[c->to][stateV] += "`";
479
480     struct path *p = new path();
481     p->from = temp;
482     p->to = stateV;
483     p->strPath = "(" + c->strPath + ")|(" + a->strPath + ")";
484     stackA.push(p);
485     //cout<<"push a new one of \""<<p->strPath<<"\""<<endl;
486     delete a;delete b;delete c;
487
488     return stateV;
489 }
490
491 int reduceNFA( int stateCountV, int *typeIndexV, int typeNumV )    //stateCountV为NFA的起始状态
492 {
493     int i,j,k;
494     struct setNode *head,*setPtr,*end; //用于构造链表的指针，新状态用链表存储
495     struct setNode *currentPtr;        //用于遍历链表中元素的指针
496
497     set<int>::iterator ite_int;        //每个新的状态为一个set，封装到setNode中
498     int countNode = 1;
499     setPtr = new setNode();
500     setPtr->num = 1;
501     setPtr->Dset = new set<int>();
502     setPtr->next = NULL;
503     BFS(stateCountV, stateCountV);
504     for( i = 0;arrForBFS[i] >= 0;i++  )//先把起始状态的ε闭包求出作为新起始状态
505         setPtr->Dset->insert(arrForBFS[i]);
506
507     head = setPtr;
508     end = setPtr;
509     currentPtr = head;
510
511     string letter1[80];                //这里定义了两个数组，一个存放读入的字符
512     set<int> letter2[80];              //一个存放读入相应字符后所到达的状态集合
513     int up = 0;
514
515
516     while( currentPtr != NULL )            //遍历新状态链表
517     {
518          //cout<<"currentPtr遍历到"<<currentPtr->num<<endl;
519          //下面一个双重循环，第一个循环表示遍历新状态集合的每一个状态
520          //第二个循环用于从原NFA找从这个状态出发到达的所有状态
521          for( ite_int = currentPtr->Dset->begin(); ite_int != currentPtr->Dset->end(); ++ite_int )
522              for( i = 0;i <= stateCountV;i++ )
523                  if( T[*ite_int][i] != "`" && T[*ite_int][i] != "" )
524                  {
525                      bool flag2 = 0;
526                      for( j = 0;j < up;j++ )
527                          if( letter1[j] == T[*ite_int][i] )
528                          {
529                              flag2 = 1;
530                              break;
531                          }
532                      if( !flag2 )
533                      {
534                          up++;
535                          letter1[up-1] = T[*ite_int][i];  //读入的字符都记录下
536                      }
537                      BFS( i, stateCountV );
538                      for( k = 0;arrForBFS[k] >= 0;k++ )
539                              letter2[j].insert(arrForBFS[k]);//所到的状态也记下
540                  }
541
542          /*cout<<"从新的状态"<<currentPtr->num<<"生成的矩阵是"<<endl;
543          for( i = 0;i < up;i++ )
544          {
545              cout<<letter1[i]<<"  ";
546              for( ite_int = letter2[i].begin(); ite_int != letter2[i].end(); ++ite_int )
547                  cout<<*ite_int<<" ";
548              cout<<endl;
549          }//最后letter1,letter2记录了这个新状态读入不同的字符后会到达的状态合集*/
550
551
552          for( i = 0;i < up;i++ ) //下面开始比较这些状态是不是以前有过的
553          {
554               struct setNode *p = head;
555               while( p != NULL )
556               {
557                   if( letter2[i] == *(p->Dset) )
558                       break;
559                   p = p->next;
560               }
561               if( p != NULL )   //有这个状态，只要加一条路径即可
562                   recT[currentPtr->num][p->num] += letter1[i];
563               else       //没有这个状态，要在链表末加一个节点，并在recT中加路径
564               {
565                   countNode++;
566                   recT[currentPtr->num][countNode] += letter1[i];
567                   setPtr = new setNode();
568                   setPtr->num = countNode;
569                   setPtr->Dset = new set<int>();
570                   for( ite_int = letter2[i].begin(); ite_int != letter2[i].end(); ++ite_int )
571                       setPtr->Dset->insert( *ite_int );
572                   setPtr->next = NULL;
573                   end->next = setPtr;
574                   end = setPtr;
575
576                   for( k = 0;k < typeNumV;k++ )
577                   {
578                       ite_int = setPtr->Dset->find(typeIndexV[k]);
579                       if( ite_int != setPtr->Dset->end() || *(setPtr->Dset->end()) == stateCountV )
580                       {
581                           char temp[10];
582                           string str = itoa(k,temp,10);
583                           recT[setPtr->num][0] = "-"+str; //如果是终止状态，做记号
584                           recT[0][setPtr->num] = "-"+str;
585                       }
586                   }
587               }
588          }
589          for( i = 0;i < up;i++ )          //清空letter1,letter2，用于下一轮遍历
590          {
591               letter1[i] = "";
592               letter2[i].clear();
593          }
594          up = 0;
595          currentPtr = currentPtr->next;
596
597          /*cout<<"此时的链表为"<<endl;
598          struct setNode *p2 = head;
599          while( p2 != NULL )
600               {
601                   cout<<p2->num<<" ";
602                   p2 = p2->next;
603               }
604          cout<<endl;*/
605     }
606
607     cout<<"DFA生成完毕！"<<endl;
608     /*cout<<"最后DFA为："<<endl;
609     for( i = 0;i <= countNode;i++,cout<<endl )
610         for( j = 0; j <= countNode;j++ )
611         {
612            if( recT[i][j] == "" )cout<<"  |";
613            else
614                cout<<setw(2)<<recT[i][j]<<"|";
615         }*/
616     return countNode;
617
618 }
619
620 void BFS( int row, int stateCountV )  //广度搜索法求一个状态的ε闭包
621 {
622     for( int i = 0;i < 100;i++ )
623         arrForBFS[i] = -1;
624     int ite = -1;
625     queue<int> que;
626     que.push(row);
627     ite++;
628     arrForBFS[ite] = row;
629     while( !que.empty() )
630     {
631         int t = que.front();
632         que.pop();
633         for( int i = 0;i <= stateCountV;i++ )
634             if( T[t][i] == "`" )
635             {
636                 ite++;
637                 arrForBFS[ite] = i;
638                 que.push(i);
639             }
640     }
641 }
642
643 void reduceDFA()   //化简DFA算法
644 {
645     /*
646     算法思想:
647     构造二维数组reduceState[2][stateCountDFA]
648     矩阵第一行存储各状态当前所属集合标记
649     矩阵第二行存储个状态读入某个字符后的目标状态所属集合标记
650     每读入某个字符后,矩阵两行相对应位置都相等的状态为同一集合
651     第一行相等而第二行不相等的两状态分到不同集合,即将后一个状态的所属集合标记+1
652     全部字符读完后,扫描矩阵的第一行,即当前所属集合标记,相等的状态合并
653     简化完成
654     */
655     int pos = 1;   //非终结状态所属集合标记
656     int poe;       //终结状态所属集合标记
657     for (int i=1;i<=stateCountDFA;i++) //该循环用于给所有状态的所属集合赋初值,终结状态为stateCountDFA,非终结状态为1.
658     {
659         if(recT[0][i].find('-')!=string::npos)
660         {
661             poe=stateCountDFA+atoi(&recT[0][i].at(1));//将不同正则表达式的终结状态标注为不同集合：stateCountDFA+正则表达式序号
662             reduceState[0][i]=poe;
663             reduceState[1][i]=poe;
664         }
665         else
666         {
667             reduceState[0][i]=pos;
668             reduceState[1][i]=pos;
669         }
670     }
671     poe=stateCountDFA+regCount+1;
672     bool flag;  //bool标记,读每个字符,调整集合状态后,重新读入该字符,直到不发生集合改变为止
673     for(Vt=tv.begin();Vt!=tv.end();Vt++)  //读入每个字符的循环
674     {
675         //cout<<*Vt;
676         flag=true;
677         while(flag)  //该循环即为重新读入当前字符的循环
678         {
679             flag = false;
680             bool item = true;  //用于标记某状态读入当前字符是否产生状态变化
681             for(int j=1;j<=stateCountDFA;j++)
682             {
683                 item=false;
684                 for(int k=1;k<=stateCountDFA;k++)//循环遍历DFA
685                 {
686                     if(recT[j][k].find(*Vt)!=string::npos)//若当前状态读入该字符产生状态变化
687                     {
688                         item=true;
689                         reduceState[1][j]=reduceState[0][k];//目标集合赋值为目标状态所属集合
690                     }
691                     if(!item)  //不产生状态变化
692                         reduceState[1][j]=0;  //目标集合标记为0
693                 }
694
695             }
696             bool b1=false;//标记用于判断pos是否加1
697             bool b2=false;//标记用于判断poe是否加1
698             for(int p=1;p<=stateCountDFA;p++)//循环扫描reduceState矩阵.第一个标记位置
699             {
700                 b1=false;b2=false;
701                 for(int q=2;q<=stateCountDFA;q++)//循环扫描reduceState矩阵.第二个标记位置
702                 {
703                     if(reduceState[0][p]==reduceState[0][q]&&reduceState[1][p]!=reduceState[1][q])
704                     {
705                         //如果存在所属集合相等,目标集合不等的情况,将所属集合分开
706                         if(reduceState[0][q]<stateCountDFA)  //当前状态为非终结状态
707                         {
708                             b1=true;
709                             reduceState[0][q]=pos;
710                         }
711                         else  //当前状态为终结状态
712                         {
713                             b2=true;
714                             reduceState[0][q]=poe;
715                         }
716                         flag=true;  //发生集合调整,标记为true
717                     }
718                 }
719                 if(b1)
720                     pos++;
721                 if(b2)
722                     poe++;
723             }
724         }
725     }
726
727 //    for(int i1=0;i1<stateCountDFA;i1++)
728 //        cout<<reduceState[0][i1]<<" ";
729 //    cout<<endl;
730 //    for(i1=0;i1<stateCountDFA;i1++)
731 //        cout<<reduceState[1][i1]<<" ";
732
733     for(int j=1;j<stateCountDFA;j++)  //循环扫描reduceState矩阵第一行.第一个标记位置
734     {
735         if(reduceState[0][j]!=0)
736         {
737             for(int k=j+1;k<=stateCountDFA;k++)  //循环扫描reduceState矩阵第一行.第二个标记位置
738             {
739                 if(reduceState[0][j]==reduceState[0][k])//两个标记位置值相等,即所属集合相同
740                     combine(j,k);  //调用合并函数
741             }
742         }
743     }
744     /*cout<<"化简后的DFA为："<<endl;
745     for(int m = 0;m <= stateCountDFA;m++,cout<<endl )
746         for(int n = 0; n <= stateCountDFA;n++ )
747         {
748            if( recT[m][n] == "" )cout<<" |";
749            else
750                cout<<recT[m][n]<<"|";
751         }*/
752
753     cout<<"DFA简化完毕！"<<endl;
754 }
755
756 void combine(int j,int k)
757 {
758     /*
759     函数用于合并两个状态
760     把第K状态的相应的行和列的值加到第j状态上
761     并把k状态所有的值置空
762     */
763     for(int i=1;i<=stateCountDFA;i++)
764     {
765         recT[j][i]+=recT[k][i];
766         recT[i][j]+=recT[i][k];
767     }
768     for(i=0;i<=stateCountDFA;i++)
769     {
770         recT[k][i]="";
771         recT[i][k]="";
772     }
773     reduceState[0][k]=0;
774 }
775
776 void produceC( int nodeCountV, string *typeNameV )       //产生.cpp文件
777 {
778     int i, j ,k;
779     char buf[1024];                //临时保存读取出来的文件内容
780     string message = "";
781     string remainCode = "";
782     ifstream in;
783     in.open("Remain.txt");
784     if(in.is_open())          //文件打开成功,说明曾经写入过东西
785     {
786         while(in.good() && !in.eof())
787         {
788             memset(buf,0,1024);
789             in.getline(buf,1204);
790             message = buf;
791             message += "\n";
792             remainCode += message;
793         }
794         in.close();
795     }
796
797     ofstream fout("scanner.cpp",ios::out);
798     fout<<remainCode;
799     fout<<"while(flag)\n"<<"{\n";
800     fout<<"switch(state)\n"<<"{\n";
801     for( i = 1;i <= nodeCountV;i++ )   //遍历每个新状态
802     {
803          fout<<"case "<<i<<":\n";
804          fout<<"last++;\n";
805          fout<<"c = code[last];\n";
806          if( i == 1 )
807          {
808              fout<<"if( c == '%' )\n";  //读到文件末尾，终结
809              fout<<"{\n";
810              fout<<"flag = 0;\n";
811              fout<<"break;\n}\n";
812          }
813          bool flag3 = 0;bool flag2 = 0;
814          for( j = 1;j <= nodeCountV;j++ )
815               if( recT[i][j] != "'" && recT[i][j] != "" )
816               {
817                   flag3 = 1;
818                   if( flag2 )
819                       fout<<"else ";
820
821                   fout<<"if( contain( \""<<recT[i][j]<<"\",c ) ) ";
822                   fout<<"state = "<<j<<";\n";
823                   flag2 = 1;
824               }
825          if( flag3 )
826              fout<<"else\n{\n";
827          //fout<<"{\n"<<"endState.find("<<i<<");\n";
828          //fout<<"if( ite_int == endState.find && *(endState.end()) != "<<i<<")\n";
829          if( recT[i][0][0] == '-' )
830          {
831              string tempIndexStr = recT[i][0].substr(1,recT[i][0].length()-1);
832              int tempIndex = atoi(tempIndexStr.c_str());
833              string type = typeNameV[tempIndex];
834              fout<<"flag2 = true;\n";
835              fout<<"for( k = 0;k < count;k++ )\n";;
836              fout<<"{\n"<<"if( SymbolTable[k][1] == code.substr(first,last-first) )\n";
837              fout<<"{\nflag2 = false;\nbreak;\n}\n}\n";
838              fout<<"if(flag2)\n{\n";
839              fout<<"count++;\n";
840              fout<<"SymbolTable[count-1][0] = \""<<type<<"\";\n";
841              fout<<"SymbolTable[count-1][1] = code.substr(first,last-first);\n";
842              fout<<"}\n";
843
844              fout<<"cout<<code.substr(first,last-first)<<endl;\n";
845              fout<<"last--;\n";
846              fout<<"first = last+1;\n";
847              fout<<"state = 1;\n";
848          }
849          else
850          {
851              fout<<"cout<<\"编译错误!\\n\";\n";
852              fout<<"flag = 0;\n";
853          }
854          if( flag3 )
855              fout<<"}\n";
856          fout<<"break;\n";
857     }
858     fout<<"default:\n"<<"break;\n"<<"}\n}\n";
859
860     fout<<"cout<<\"Symbol 表如下：\"<<endl;\n";
861     fout<<"cout<<setw(3)<<\"序号\"<<setw(10)<<\"token类型\"<<\"     \"<<\"token名字\"<<endl;\n";
862     fout<<"for( k2 = 0;k2 < count;k2++ )\n";
863     fout<<"{\n"<<"cout<<setw(3)<<k2<<setw(10)<<SymbolTable[k2][0]<<\"     \"<<SymbolTable[k2][1]<<endl;\n";
864     fout<<"}\n";
865     fout<<"system(\"pause\");\n";
866     fout<<"return 0;\n}\n";
867
868     cout<<"代码生成完毕！见scanner.cpp文件。"<<endl;
869 }

4
Identity
a(bc)*|bc*|d
Number
(1|2|3|4|5|6|7|8|9|0)*
key
if|else|then
operator
>(`|=)|<(`|=)|=(=|`)

remain.txt

#include<iostream>
using namespace std;
#include<string>
#include<fstream>
#include<set>
#include<iomanip>
string SymbolTable[1000][2] = {""};
bool contain( string str, char v )
{
     for( int i = 0;i < str.length();i++ )
         if( str[i] == v )
             return true;
     return false;
}
int main()
{
    string tempCode = "";
    string code = "";

    int i,j,k,k2=0;
    char buf[1024];                //临时保存读取出来的文件内容
    string message = "";
    ifstream infile;
    infile.open("code.txt");
    if(infile.is_open())          //文件打开成功,说明曾经写入过东西
    {
        while(infile.good() && !infile.eof())
        {
            memset(buf,0,1024);
            infile.getline(buf,1204);
            message = buf;
            tempCode += message;
        }
        infile.close();
    }
    for( i = 0;i < tempCode.length();i++ )
    {
         if( tempCode[i] != ' ' && tempCode[i] != '\t' && tempCode[i] != '\n' && tempCode[i] != ';' )
         code += tempCode[i];
    }
    code += "%";
    cout<<code<<endl;

    int first = 0, last = -1;
    char c;
    string tempStr = "";
    int state = 1;
    bool flag = 1;
    bool flag2 = true;
    int count = 0;
    //set<int> endState;
    //set<int>::iterator ite_int;
///////////////////////////////////////////保留代码///////////////////////////////////////////////////

scanner.txt

#include<iostream>
using namespace std;
#include<string>
#include<fstream>
#include<set>
#include<iomanip>
string SymbolTable[1000][2] = {""};
bool contain( string str, char v )
{
     for( int i = 0;i < str.length();i++ )
         if( str[i] == v )
             return true;
     return false;
}
int main()
{
    string tempCode = "";
    string code = "";

    int i,j,k,k2=0;
    char buf[1024];                //临时保存读取出来的文件内容
    string message = "";
    ifstream infile;
    infile.open("code.txt");
    if(infile.is_open())          //文件打开成功,说明曾经写入过东西
    {
        while(infile.good() && !infile.eof())
        {
            memset(buf,0,1024);
            infile.getline(buf,1204);
            message = buf;
            tempCode += message;
        }
        infile.close();
    }
    for( i = 0;i < tempCode.length();i++ )
    {
         if( tempCode[i] != ' ' && tempCode[i] != '\t' && tempCode[i] != '\n' && tempCode[i] != ';' )
         code += tempCode[i];
    }
    code += "%";
    cout<<code<<endl;

    int first = 0, last = -1;
    char c;
    string tempStr = "";
    int state = 1;
    bool flag = 1;
    bool flag2 = true;
    int count = 0;
    //set<int> endState;
    //set<int>::iterator ite_int;
///////////////////////////////////////////保留代码///////////////////////////////////////////////////
while(flag)
{
switch(state)
{
case 1:
last++;
c = code[last];
if( c == '%' )
{
flag = 0;
break;
}
if( contain( "a",c ) ) state = 2;
else if( contain( "b",c ) ) state = 3;
else if( contain( "d",c ) ) state = 4;
else if( contain( "1234567890",c ) ) state = 5;
else if( contain( "i",c ) ) state = 15;
else if( contain( "e",c ) ) state = 16;
else if( contain( "t",c ) ) state = 17;
else if( contain( "><=",c ) ) state = 18;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 2:
last++;
c = code[last];
if( contain( "bb",c ) ) state = 21;
else
{
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "Identity";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
}
break;
case 3:
last++;
c = code[last];
if( contain( "c",c ) ) state = 3;
else
{
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "Identity";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
}
break;
case 4:
last++;
c = code[last];
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "Identity";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
break;
case 5:
last++;
c = code[last];
if( contain( "1121233123444123455551234566666123456777777123456788888881234567899999999123456789000000000",c ) ) state = 5;
else
{
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "Number";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
}
break;
case 6:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 7:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 8:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 9:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 10:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 11:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 12:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 13:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 14:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 15:
last++;
c = code[last];
if( contain( "f",c ) ) state = 23;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 16:
last++;
c = code[last];
if( contain( "l",c ) ) state = 24;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 17:
last++;
c = code[last];
if( contain( "h",c ) ) state = 25;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 18:
last++;
c = code[last];
if( contain( "===",c ) ) state = 26;
else
{
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "operator";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
}
break;
case 19:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 20:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 21:
last++;
c = code[last];
if( contain( "c",c ) ) state = 2;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 22:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 23:
last++;
c = code[last];
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "key";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
break;
case 24:
last++;
c = code[last];
if( contain( "s",c ) ) state = 30;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 25:
last++;
c = code[last];
if( contain( "e",c ) ) state = 31;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 26:
last++;
c = code[last];
flag2 = true;
for( k = 0;k < count;k++ )
{
if( SymbolTable[k][1] == code.substr(first,last-first) )
{
flag2 = false;
break;
}
}
if(flag2)
{
count++;
SymbolTable[count-1][0] = "operator";
SymbolTable[count-1][1] = code.substr(first,last-first);
}
cout<<code.substr(first,last-first)<<endl;
last--;
first = last+1;
state = 1;
break;
case 27:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 28:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 29:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 30:
last++;
c = code[last];
if( contain( "e",c ) ) state = 23;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 31:
last++;
c = code[last];
if( contain( "n",c ) ) state = 23;
else
{
cout<<"编译错误!\n";
flag = 0;
}
break;
case 32:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
case 33:
last++;
c = code[last];
cout<<"编译错误!\n";
flag = 0;
break;
default:
break;
}
}
cout<<"Symbol 表如下："<<endl;
cout<<setw(3)<<"序号"<<setw(10)<<"token类型"<<"     "<<"token名字"<<endl;
for( k2 = 0;k2 < count;k2++ )
{
cout<<setw(3)<<k2<<setw(10)<<SymbolTable[k2][0]<<"     "<<SymbolTable[k2][1]<<endl;
}
system("pause");
return 0;
}

if a >= 3
bcc = 123;
else if d < 9
abc = 12;