• 编译原理之正则表达式转NFA


    本文转载自http://chriszz.sinaapp.com/?p=257

    输入一个正则表达式,输出一个NFA。

    我的做法:输入一个表示正则表达式的字符串,程序把生成的NFA写入一个.dot文件,再调用dot把它编译成pdf。dot命令由Graphviz提供(Fedora下可用sudo yum install graphviz安装),之后用evince XXX.pdf就可以查看生成的NFA了。

    具体算法是按照龙书上的Thompson算法来的。

    废话不多说,放码过来:

    /*
    Author:ChrisZZ(zchrissirhcz@gmail.com)
    Time:2013-12-25 14:13:09
    输入:正则表达式
    输出:自动机
    算法步骤:
    1.把正则表达式转化为后缀表达式
    2.把后缀表达式转化为NFA
    3.用dot语言把NFA输出到PDF
    参考:
    1.Regular Expression Matching Can Be Simple And Fast
    http://swtch.com/~rsc/regexp/regexp1.html
    2.龙书 chap3.7.4 从正则表达式构造NFA
    3.YCC学长的project中dot语言的使用
    其他说明:
    1.需要安装dot,并添加到系统path中
    2.在windows下运行时,控制台因为编码不支持可能导致中文提示无法显示
    */
    #include <ctype.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #include <iostream>
    #include <stack>
    #include <stdexcept>
    #include <string>
    #include <vector>
    
    using namespace std;
    
    // Pseudo-character stored in State::c to mark an accepting (match) state.
    const int Match = 256;
    // Pseudo-character stored in State::c to mark an epsilon-branch (split) state.
    const int Split = 257;
    
    // Counters saved when a '(' opens a nested group, so that the enclosing
    // scope can be resumed at the matching ')'.
    struct Paren{
        int natom;  // operands of the enclosing scope not yet joined by '.'
        int nalt;   // '|' operators of the enclosing scope not yet emitted
    };

    /*
     * Converts an infix regular expression to postfix notation, writing
     * concatenation explicitly as '.'.
     *
     * Accepted syntax: letters (as classified by isalpha), grouping with
     * '(' and ')', alternation '|' and the Kleene star '*'. Whitespace is
     * skipped.
     *
     * Returns the postfix string, e.g. "a(b|c)*d" becomes "abc|*.d.".
     *
     * Throws std::runtime_error when the expression is malformed: unbalanced
     * or empty parentheses, '*' or '|' with no operand in front of it, an
     * unsupported character, or an expression that is empty or ends in '|'.
     */
    std::string re2post(std::string re){
        std::stack<Paren> parenStk;
        std::string postExpr;
        int nalt = 0;   // '|' operators pending in the current scope
        int natom = 0;  // operands pending concatenation in the current scope
        const std::string invalidRegExp = "非法的正则表达式";
        const std::string::size_type len = re.length();

        for(std::string::size_type i = 0; i < len; i++){
            // The <ctype.h> classifiers are undefined for negative values, so
            // classify the byte as unsigned char.
            const unsigned char ch = static_cast<unsigned char>(re[i]);
            if(isspace(ch)) continue;

            if(isalpha(ch)){
                if(natom > 1){
                    natom--;
                    postExpr += '.';
                }
                natom++;
                postExpr += re[i];
            }
            else if(re[i] == '('){
                if(natom > 1){
                    // Two operands are pending: join them now. The counter has
                    // to drop as well, otherwise the saved value is one too
                    // high and an extra '.' is emitted later ("ab(c)" used to
                    // produce "ab.c..").
                    natom--;
                    postExpr += '.';
                }
                Paren saved;
                saved.natom = natom;
                saved.nalt = nalt;
                parenStk.push(saved);
                nalt = 0;
                natom = 0;
            }
            else if(re[i] == ')'){
                if(natom == 0 || parenStk.empty())
                    throw std::runtime_error(invalidRegExp + ":括号不匹配");
                // Flush what is still pending inside the group, e.g. the
                // concatenation of the two inner groups in "((a|b)(c|d))".
                while(--natom > 0){
                    postExpr += '.';
                }
                for(; nalt > 0; nalt--){
                    postExpr += '|';
                }
                const Paren saved = parenStk.top();
                parenStk.pop();
                nalt = saved.nalt;
                // The finished group counts as one operand of the outer scope.
                natom = saved.natom + 1;
            }
            else if(re[i] == '*'){
                if(natom == 0)
                    throw std::runtime_error(invalidRegExp + ":提前出现'*'");
                postExpr += re[i];
            }
            else if(re[i] == '|'){
                if(natom == 0)
                    throw std::runtime_error(invalidRegExp + ":提前出现'|'");
                while(--natom > 0){
                    postExpr += '.';
                }
                nalt++;
            }
            else
                throw std::runtime_error(invalidRegExp);
        }

        if(!parenStk.empty())
            throw std::runtime_error(invalidRegExp + ":括号不匹配");
        // No operand at top level means the input was empty (or only
        // whitespace) or ended in '|'; either would yield invalid postfix.
        if(natom == 0)
            throw std::runtime_error(invalidRegExp + ":表达式为空或以'|'结尾");
        while(--natom > 0){
            postExpr += '.';
        }
        for(; nalt > 0; nalt--){
            postExpr += '|';
        }
        return postExpr;
    }
    
    class NFA;

    /*
     * One state of the automaton. Its kind is encoded in `c`:
     *   c < 256   ordinary state with one outgoing edge labelled with the
     *             character c
     *   c == 256  accepting state (the match succeeded)
     *   c == 257  split state with epsilon edges to `out` and, when set,
     *             `out1`
     */
    class State{
        friend class NFA;
        friend void nfa2graph(State* head, const std::string& re);
    public:
        // Builds a state of kind `c` with up to two successors. The id starts
        // at 0 until setId() assigns a number.
        State(int c=256, State* out=NULL, State* out1=NULL)
            : c(c), id(0), out(out), out1(out1)
        {}

        // Assigns the state's number.
        void setId(int id){
            this->id = id;
        }

    private:
        int c;        // edge label, or one of the special kinds listed above
        int id;       // state number
        State* out;   // first (or only) successor
        State* out1;  // second successor, used when the state branches
    };
    
    class NFA{
    public:
        NFA(){
            head = NULL;
            tail = NULL;
        }
        NFA(const int& c){
            tail = new State(Match, NULL, NULL);
            head = new State(c, tail, NULL);
        }
        void doCat(NFA& nfa){
            tail->out = nfa.head;
            tail->c = Split;
            tail = nfa.tail;
            nfa.head = NULL;
            nfa.tail = NULL;
        }
        void doUnion(NFA& nfa){
            State* newHead = new State(Split, head, nfa.head);
            State* newTail = new State(Match, NULL, NULL);
            tail->c = Split;
            tail->out = newTail;
            nfa.tail->c = Split;
            nfa.tail->out = newTail;
            tail = newTail;
            head = newHead;
            nfa.head = NULL;
            nfa.tail = NULL;
        }
        void doStar(){
            State* newTail = new State(Match, NULL, NULL);
            State* newHead = new State(Split, head, newTail);
            tail->c = Split;
            tail->out = newTail;
            tail->out1 = head;
            tail = newTail;
            head = newHead;
        }
    
        void nfa2graph(const string& re){
            char myfile[100];
            printf("请输入一个文件名,用来保存生成的NFA-graph(不必提供后缀):
    ");
            scanf("%s", myfile);
            printf("已将DOT文件存储在"%s.dot",
    ", myfile);
            printf("PDF文件则存储在"%s.dot.pdf".
    ", myfile);
            int i;
            while(myfile[i]!='')
                i++;
            myfile[i] = '.';
            myfile[i+1] = 'd';
            myfile[i+2] = 'o';
            myfile[i+3] = 't';
            myfile[i+4] = '';
    
            FILE *file = fopen(myfile, "w");
    
            fputs("digraph {
    ", file);
            fputs("	"", file);
            int len=re.length();
            for(i=0; i<len; i++){
                fprintf(file, "%c", re[i]);
            }
    
            fputs("" [shape = plaintext]
    ", file);
            fputs("	rankdir = LR
    ", file);
            fputs("	"" [shape = point]
    ", file);
            fputs("	"" -> 1 [label = Start]
    
    ", file);
    
            int id = 1;
    
            char circle[2000];
            memset(circle, 0, sizeof(circle));
            State* p;
            stack<State*> staStk;
    
            head->setId(id++);
            staStk.push(head);
    
            while(!staStk.empty()){
                p = staStk.top();
                staStk.pop();
                char flag = 1;
                cout << "p->c=" << p->c << endl;
                if(p->c < Match){
                    cout << "p->out->id=" << p->out->id << endl;
                    if(p->out->id==0){
                        p->out->id = id++;
                        cout << "id=" << id << endl;                }
                    else
                        flag = 0;
                    fprintf(file, "	%d -> %d [label = "%c"]
    ", p->id, (p->out)->id, p->c);
                    State *what = p->out;
                    if(flag) //push(*what);
                        staStk.push(what);
                } else if(p->c == Match){
                    circle[p->id] = 1;
                } else{     //对应Split的情形
                    if(p->out->id==0)
                        p->out->id = id++;
                    else
                        flag = 0;
                    fprintf(file, "	%d -> %d [label = <ε>]
    ", p->id, p->out->id);
                    State *what = p->out;
                    if(flag) staStk.push(what);
    
                    if(p->out1!=NULL){
                        flag = 1;
    
                        if(p->out1->id==0)
                            p->out1->id = id++;
                        else
                            flag = 0;
                        fprintf(file, "	%d -> %d [label = <ε>]
    ", p->id, p->out1->id);
                        what = p->out1;
                        if(flag) staStk.push(what);
                    }
                }
            }
    
            for(i=1; i<id; i++){
                fprintf(file, "	%d [shape = circle", i);
                if(circle[i])
                    fputs(", peripheries = 2", file);
                fprintf(file, "]
    ");
            }
    
            fputs("}", file);
            fclose(file);
    
            char cmd[108];
            sprintf(cmd, "dot %s -O -Tpdf", myfile);
            if(system(cmd)==0){
                printf("成功生成pdf图像!
    ");
                //printf("Linux用户可以使用evince file.pdf &命令打开~
    ");
            }
            else
                printf("悲剧!生成pdf图像时出现错误..
    ");
        }
    private:
        State* head;
        State* tail;
    };
    
    /*
     * Builds an NFA from a postfix regular expression by evaluating it on a
     * stack of fragments: '.' concatenates the top two fragments, '|' forms
     * their alternation, '*' applies the Kleene star to the top fragment, and
     * any other character becomes a single-edge fragment.
     *
     * Returns the fragment that remains when the whole expression is consumed.
     *
     * Throws std::runtime_error when an operator does not find enough
     * operands on the stack, or when the expression does not reduce to
     * exactly one fragment (this includes the empty expression).
     */
    NFA post2nfa(const std::string& postExpr){
        const char* const missingOperand = "后缀表达式非法:运算符缺少操作数";
        std::stack<NFA> nfaStk;
        const std::string::size_type len = postExpr.length();

        for(std::string::size_type i = 0; i < len; i++){
            const char op = postExpr[i];
            switch(op){
            case '.':
            case '|': {
                // top()/pop() on an empty stack is undefined, so check first.
                if(nfaStk.size() < 2)
                    throw std::runtime_error(missingOperand);
                NFA rhs = nfaStk.top();
                nfaStk.pop();
                NFA lhs = nfaStk.top();
                nfaStk.pop();
                if(op == '.')
                    lhs.doCat(rhs);
                else
                    lhs.doUnion(rhs);
                nfaStk.push(lhs);
                break;
            }
            case '*': {
                if(nfaStk.empty())
                    throw std::runtime_error(missingOperand);
                NFA operand = nfaStk.top();
                nfaStk.pop();
                operand.doStar();
                nfaStk.push(operand);
                break;
            }
            default:
                // Any other character is a literal.
                nfaStk.push(NFA(op));
                break;
            }
        }

        if(nfaStk.size() != 1)
            throw std::runtime_error("未知错误");
        return nfaStk.top();
    }
    
    int main(){
        string re;
        while(true){
            cout << "请输入一个正则表达式:
    ";
            cin >> re;
            string postExpr = re2post(re);
            cout << "postExpr is : " << postExpr << endl;
            NFA nfa = post2nfa(postExpr);
            nfa.nfa2graph(re);
            cout << "继续吗?(y/n)
    " << endl;
            char c;
            cin >> c;
            while(c!='y' && c!='n'){
                cout << "请输入'y'或'n'.
    ";
                c=getchar();
            }
            if(c=='n')
                break;
        }
        cout << "Bye~
    ";
        return 0;
    }
  • 相关阅读:
    C# 导出 Excel 自定义输出格式
    ONE路由协议模块分析与应用
    2440(ARM9) L3G4200D ADXL345 裸机程序
    C# SerialPort.close() bug解决方法
    C++ Primer 第五章 表达式
    C++ Primer 第一章 快速入门
    C++ Primer 第二章 变量和基本类型
    虚拟机virtualbox中挂载新硬盘
    C++ Primer 第四章 数组与指针
    C++ Primer 第三章 标准库类型
  • 原文地址:https://www.cnblogs.com/zjutzz/p/3538093.html
Copyright © 2020-2023  润新知