使用NFA(非确定有限状态自动机)来实现正则表达式匹配,详细解释可以参见《Algorithms》第4版(Sedgewick、Wayne 著)。书中代码为Java版本。
书中内容前后耦合度较大,遂用C++简单地实现了一下(采用基于vector的存储方式,主要是因为我发现std::stack不能访问中间的元素;性能上的优化我就忽略掉了)。
代码接受“(A*C|AB)V.*”等类似的正则表达式,不检测中间匹配的情况(主要是为了缩减代码量,我对200这个数很纠结)
由于判断检测成功的方式特别简单(这样好写),正则式需要去掉最外面的括号“(A.*)”是不允许的,“A.*”可以。
// Regular-expression recognizer built on a nondeterministic finite automaton
// (NFA). Supported syntax: literal characters, '.', '*', '|' and grouping
// with '(' / ')'. There is no escape syntax, so the four metacharacters
// '(' ')' '|' '*' can never be matched literally.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <memory>
#include <stack>
#include <stdexcept>
#include <string>
#include <vector>

// One NFA state: the pattern character it was created from plus the indices
// of the states reachable from it through epsilon transitions.
struct Node
{
    char charValue;
    std::vector<int> nextIndices;
};

// Directed graph holding one Node per character of the string it is built
// from. Edges are stored as adjacency lists in insertion order.
class LinkGraph
{
public:
    // Creates one node per character of `str`; no edges are added.
    LinkGraph(std::string str)
    {
        nodes.reserve(str.size());
        for (const char ch : str)
        {
            Node node;
            node.charValue = ch;
            nodes.push_back(node);
        }
    }

    // Adds the edge i -> j.
    // Returns false (and changes nothing) when either index is outside
    // [0, node count) or when the edge already exists; true otherwise.
    bool AddEdge(int i, int j)
    {
        if (!IsValidIndex(i) || !IsValidIndex(j))
            return false;

        std::vector<int>& successors = nodes[i].nextIndices;
        if (std::find(successors.begin(), successors.end(), j) != successors.end())
            return false; // duplicate edge

        successors.push_back(j);
        return true;
    }

    // Writes every node as "index:->a->b..." to std::cout, one per line.
    void Print() const
    {
        for (std::size_t i = 0; i < nodes.size(); ++i)
        {
            std::cout << i << ":";
            for (const int next : nodes[i].nextIndices)
                std::cout << "->" << next;
            std::cout << std::endl;
        }
    }

    // Returns the node at `index`.
    // Throws std::out_of_range when `index` is negative or past the end.
    const Node& GetNode(int index) const
    {
        if (!IsValidIndex(index))
            throw std::out_of_range("Index Out Of Range");
        return nodes[index];
    }

    // Number of nodes in the graph.
    int GetNodesSize() const
    {
        return static_cast<int>(nodes.size());
    }

private:
    bool IsValidIndex(int index) const
    {
        return index >= 0 && static_cast<std::size_t>(index) < nodes.size();
    }

    std::vector<Node> nodes;
};

// NFA for a regular expression. The expression is wrapped in one extra pair
// of parentheses; the state of the final ')' acts as the accept state.
class NFA
{
public:
    // Builds the NFA for `str`.
    // Throws std::invalid_argument when the parentheses are unbalanced.
    NFA(std::string str)
        : TestString('(' + str + ')'), graph(TestString)
    {
        ComposeGraph(TestString);
    }

    // Adds the epsilon transitions described by `pattern` to the graph.
    // A group may contain any number of '|' alternatives.
    // Throws std::invalid_argument on an unmatched ')' or on a '(' / '|'
    // that is never closed.
    void ComposeGraph(std::string pattern)
    {
        const int length = static_cast<int>(pattern.size());
        std::stack<int> ops; // positions of pending '(' and '|'

        for (int i = 0; i < length; ++i)
        {
            const char current = pattern[i];
            int lp = i; // start of the sub-expression that ends at i

            if (current == '(' || current == '|')
            {
                ops.push(i);
            }
            else if (current == ')')
            {
                // Collect every '|' belonging to this group.
                std::vector<int> alternatives;
                while (!ops.empty() && pattern[ops.top()] == '|')
                {
                    alternatives.push_back(ops.top());
                    ops.pop();
                }
                if (ops.empty())
                    throw std::invalid_argument("unmatched ')' in regular expression");

                lp = ops.top();
                ops.pop(); // pop '('

                // Left-to-right: each '|' jumps to the closing ')', and the
                // opening '(' may jump to the start of each alternative.
                for (std::vector<int>::const_reverse_iterator it = alternatives.rbegin();
                     it != alternatives.rend(); ++it)
                {
                    graph.AddEdge(*it, i);
                    graph.AddEdge(lp, *it + 1);
                }
            }

            // Closure: a following '*' loops back to the sub-expression start
            // and lets the sub-expression be skipped.
            if (i + 1 < length && pattern[i + 1] == '*')
            {
                graph.AddEdge(i + 1, lp);
                graph.AddEdge(lp, i + 1);
            }

            // Metacharacters consume no input: fall through to the next state.
            // For the final ')' the target does not exist and AddEdge refuses it.
            if (current == '(' || current == ')' || current == '*')
                graph.AddEdge(i, i + 1);
        }

        if (!ops.empty())
            throw std::invalid_argument("unmatched '(' or '|' in regular expression");
    }

    // Appends to `v` every state reachable from `index` through epsilon
    // transitions (depth first, in edge insertion order) that is not already
    // in `v`. Does nothing when `index` itself is already present.
    // Throws std::out_of_range when `index` is not a valid state.
    void fillVector(std::vector<int>& v, int index) const
    {
        if (std::find(v.begin(), v.end(), index) != v.end())
            return;

        const Node& node = graph.GetNode(index); // validates before mutating v
        v.push_back(index);
        for (const int next : node.nextIndices)
            fillVector(v, next);
    }

    // Replaces the contents of `v` with the epsilon closure of `index`.
    void initVector(std::vector<int>& v, int index) const
    {
        v.clear();
        fillVector(v, index);
    }

    // Writes the elements of `v` to std::cout, each followed by a space,
    // then ends the line.
    void PrintVector(const std::vector<int>& v) const
    {
        for (const int value : v)
            std::cout << value << " ";
        std::cout << std::endl;
    }

    // Simulates the NFA on `str` from its first character.
    // Returns true when some prefix of `str` (possibly the empty prefix, or
    // all of `str`) is matched by the pattern. Simulation stops at the first
    // character that no active state can consume.
    // Prints the active state sets and "maxIndex=<n>" to std::cout as a trace.
    bool RecognizeText(std::string str) const
    {
        const int acceptState = static_cast<int>(TestString.size()) - 1;

        std::vector<int> potentialIndex;
        initVector(potentialIndex, 0);
        PrintVector(potentialIndex);

        // Highest state reached so far. The accept state has the highest
        // index, so reaching it is equivalent to NFAMaxIndex >= acceptState.
        // The initial closure counts, so patterns matching "" are recognized.
        int NFAMaxIndex = *std::max_element(potentialIndex.begin(), potentialIndex.end());

        std::vector<int> passedIndex;
        std::size_t strIndex = 0;
        while (!potentialIndex.empty() && strIndex < str.size())
        {
            for (const int state : potentialIndex)
            {
                const char expected = TestString[state];
                if (IsMetaCharacter(expected))
                    continue; // metacharacter states never consume input
                if (expected == str[strIndex] || expected == '.')
                    passedIndex.push_back(state + 1);
            }

            if (passedIndex.empty())
                break; // nothing can consume this character

            ++strIndex;
            potentialIndex.clear();
            PrintVector(passedIndex);
            for (const int state : passedIndex)
                fillVector(potentialIndex, state);
            for (const int state : potentialIndex)
                NFAMaxIndex = std::max(NFAMaxIndex, state);
            passedIndex.clear();
            PrintVector(potentialIndex);
        }

        PrintVector(potentialIndex);
        std::cout << "maxIndex=" << NFAMaxIndex;
        return NFAMaxIndex >= acceptState;
    }

    // Prints the epsilon-transition graph to std::cout.
    void Print() const
    {
        graph.Print();
    }

private:
    static bool IsMetaCharacter(char ch)
    {
        return ch == '(' || ch == ')' || ch == '|' || ch == '*';
    }

    // Declaration order matters: graph is constructed from TestString.
    std::string TestString;
    LinkGraph graph;
};

int main()
{
    NFA nfa(std::string("(A*B|AC)D"));
    nfa.Print();
    const bool result = nfa.RecognizeText(std::string("AABD"));
    std::cout << " Is this string ACCEPTED? " << (result ? "Accepted" : "Didn't Match") << std::endl;
    return 0;
}
200行,恩。200。