在前面的文章《使用MPLex实现语法高亮显示的功能》里面,贴了一段实现语法高亮显示的代码,它采用类似于编译器中有限自动状态机的方法来判断代码里面每个单词的类型。
有限自动状态机是表示有限个状态以及在这些状态之间的转移和动作等行为的数学模型。对于确定的有限自动机,在某个状态下读入一个字符后,只有唯一的一个转移动作。MPLex以及同类软件(例如flex)通过分析用户给定的词法文件,自动生成相应的有限自动机,并将自动机的状态转移保存在一张表里面。
#include <iostream> #include <string>
using namespace std;
// Token classes reported by Match().
enum TokenType
{
    BOOM_ERROR = -1, // the input is not a valid token
    NUMBER     = 1,
    IDENTIFIER = 2,
    IF         = 4
};

// Transition table of the DFA: one row per state, one column per input class.
//
// Columns 0-9 are the digits '0'-'9', columns 10-35 are the letters 'a'-'z',
// and column 36 stands for every other character. An entry of -1 means that
// no transition exists (BOOM_ERROR).
//
// States:
//   s0  start
//   s1  one or more digits                      -> NUMBER
//   s2  exactly the text "i"                    -> IDENTIFIER
//   s3  any other identifier                    -> IDENTIFIER
//   s4  exactly the text "if"                   -> IF
//
// "i" needs a state of its own: only an 'f' that directly follows a leading
// 'i' may lead to s4, otherwise words such as "af" would be reported as IF.
int DFA_Table[][37] = {
    // s0 -- start state
    {  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,               // 0-9
       3,  3,  3,  3,  3,  3,  3,  3,  2,  3,  3,  3,  3,   // a-m ('i' -> s2)
       3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,   // n-z
      -1 },                                                 // anything else

    // s1 -- number: only further digits are allowed
    {  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,               // 0-9
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,   // a-m
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,   // n-z
      -1 },                                                 // anything else

    // s2 -- the single letter "i"
    {  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,               // 0-9
       3,  3,  3,  3,  3,  4,  3,  3,  3,  3,  3,  3,  3,   // a-m ('f' -> s4)
       3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,   // n-z
      -1 },                                                 // anything else

    // s3 -- general identifier
    {  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,               // 0-9
       3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,   // a-m
       3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,   // n-z
      -1 },                                                 // anything else

    // s4 -- the keyword "if"; any further letter or digit makes it an identifier
    {  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,               // 0-9
       3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,   // a-m
       3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,   // n-z
      -1 }                                                  // anything else
};

//
// Match:
//   Classifies the whole string str by running it through DFA_Table.
//
// Returns:
//   IF          for the text "if"
//   NUMBER      for a non-empty run of digits
//   IDENTIFIER  for a lowercase letter followed by lowercase letters/digits
//   BOOM_ERROR  for anything else, including the empty string
//
TokenType Match(const std::string& str)
{
    int state = 0;

    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
    {
        const char c = *it;

        // Map the character onto its column in DFA_Table.
        int column = 36;                 // default: the "anything else" column
        if (c >= '0' && c <= '9')
        {
            column = c - '0';
        }
        else if (c >= 'a' && c <= 'z')
        {
            column = c - 'a' + 10;       // the letter columns start at index 10
        }

        state = DFA_Table[state][column];

        // No transition available: the string cannot be a token.
        if (state == BOOM_ERROR)
        {
            return BOOM_ERROR;
        }
    }

    // Translate the final state into a token class. The start state (empty
    // input) is not an accepting state.
    switch (state)
    {
        case 1:  return NUMBER;
        case 2:  // "i" on its own is an ordinary identifier
        case 3:  return IDENTIFIER;
        case 4:  return IF;
        default: return BOOM_ERROR;
    }
}
// Running number of the line printed by Print(); incremented on every call.
int g_line = 0;

// Prints one numbered line naming the given token type, e.g. "3: NUMBER".
// Values that are not one of the TokenType enumerators are printed as "Error".
void Print(TokenType type)
{
    const char* label = "Error";

    switch (type)
    {
        case BOOM_ERROR: label = "BOOM_ERROR"; break;
        case IF:         label = "IF";         break;
        case NUMBER:     label = "NUMBER";     break;
        case IDENTIFIER: label = "IDENTIFIER"; break;
        default:                               break;
    }

    std::cout << ++g_line << ": " << label << "\n";
}
// Runs Match() over a fixed list of sample words and prints each result.
int main()
{
    const char* const samples[] = {
        "if", "iff", "if0", "0if", "i0f",
        "ia", "01", "123", "1f", "abcd",
        "ab", "a", "0", "i", "_"
    };
    const int sampleCount = sizeof(samples) / sizeof(samples[0]);

    for (int i = 0; i < sampleCount; ++i)
    {
        Print(Match(samples[i]));
    }

    return 0;
}
例子1:一个简单的DFA表驱动匹配程序
上面的例子里,字符串的匹配或者说是分类是通过有限自动机来完成的,有限自动机在代码里面的表示就是那个二维数组 DFA_Table。DFA_Table的每一行(DFA_Table[i])表示有限自动机的一个状态,而每一列表示在当前状态下读入某个字符时要执行的状态转换(Transition)。例如在匹配的时候,程序先从DFA_Table[0],也就是起始状态开始,如果第一个字符是i,则根据DFA_Table[0]中字符i所在那一列指定的转换规则跳转到下一个状态(State)去,这里下一个状态是2,也就是DFA_Table的第三行,再根据str的下一个字符来确定要转换的状态。匹配过程一直循环到字符串被全部处理掉,这时程序判断当前的状态是不是一个可以接受的状态(Acceptable State),也就是说这个状态是否在TokenType中定义,如果状态在TokenType中定义,那好,我们给出的字符串匹配成功,否则……BOOM。
我在Match函数的for循环中用了if判断来根据当前的字符选择正确的索引,其实如果你不嫌麻烦的话,你的Match函数中的for循环可以简化成这样:
for (string::iterator iter = str.begin();
     iter != str.end();
     ++iter )
{
    // The character itself is the column index, so DFA_Table needs one column
    // for every possible character value (assumes 7-bit ASCII input).
    state = DFA_Table[state][*iter];

    // Stop at the error state; otherwise the next lookup would read row -1.
    if (state == BOOM_ERROR)
        break;
}
前提是愿意把DFA_Table扩展成一个5行、128列(每个ASCII字符对应一列)的二维表格。
知道了有限自动机是如何匹配代码里的关键字以后,接下来要做的就是生成保存有限自动机里面的状态的状态表了。生成状态表的工作就由MPLex来完成了,因为手工写实在是太复杂了。下面这个词法定义文件就是告诉MPLex有哪些元素需要进行特殊处理,例如将注释、字符串、数字和关键字与其他普通代码文本区分开来。
%namespace Coder.LexScanner

%x COMMENT

/*
 * Lexical specification for the syntax-highlighting scanner.
 *
 * Every rule returns one of the TokenType constants. COMMENT is an exclusive
 * start state that is entered when a block comment is opened but not closed
 * on the same line; the rule that recognises the closing delimiter is
 * restricted to that state with the <COMMENT> prefix so that it is active
 * inside a comment and cannot swallow ordinary code in the INITIAL state.
 */
White0          [ \t\r\f\v]
White           {White0}|\n
CmntStart       \/\*
CmntEnd         \*\/
ABStar          [^\*\n]*

%%

\'[^\'\n]{0,3}\'                    { return (int)TokenType.LEX_STRINGLITERAL; }
\"[^\"\n]*\"                        { return (int)TokenType.LEX_STRINGLITERAL; }
\/\/{ABStar}\n                      { return (int)TokenType.LEX_COMMENT; }
\'[^\'\n]*\n                        { return (int)TokenType.LEX_COMMENT; }
{CmntStart}{ABStar}\**{CmntEnd}     { return (int)TokenType.LEX_COMMENT; }
{CmntStart}{ABStar}\**              { BEGIN(COMMENT); return (int)TokenType.LEX_MULTILINECOMMENT_BEGIN; }
<COMMENT>[^\n]*\**{CmntEnd}         { BEGIN(INITIAL); return (int)TokenType.LEX_MULTILINECOMMENT_END; }

if                                  { return (int)TokenType.LEX_IF; }
while                               { return (int)TokenType.LEX_WHILE; }
do                                  { return (int)TokenType.LEX_DO; }
abstract                            { return (int)TokenType.LEX_ABSTRACT; }
as                                  { return (int)TokenType.LEX_AS; }
base                                { return (int)TokenType.LEX_BASE; }
bool                                { return (int)TokenType.LEX_BOOL; }
break                               { return (int)TokenType.LEX_BREAK; }
byte                                { return (int)TokenType.LEX_BYTE; }
case                                { return (int)TokenType.LEX_CASE; }
catch                               { return (int)TokenType.LEX_CATCH; }
char                                { return (int)TokenType.LEX_CHAR; }
checked                             { return (int)TokenType.LEX_CHECKED; }
class                               { return (int)TokenType.LEX_CLASS; }
const                               { return (int)TokenType.LEX_CONST; }
continue                            { return (int)TokenType.LEX_CONTINUE; }
decimal                             { return (int)TokenType.LEX_DECIMAL; }
default                             { return (int)TokenType.LEX_DEFAULT; }
delegate                            { return (int)TokenType.LEX_DELEGATE; }
double                              { return (int)TokenType.LEX_DOUBLE; }
else                                { return (int)TokenType.LEX_ELSE; }
enum                                { return (int)TokenType.LEX_ENUM; }
event                               { return (int)TokenType.LEX_EVENT; }
explicit                            { return (int)TokenType.LEX_EXPLICIT; }
extern                              { return (int)TokenType.LEX_EXTERN; }
false                               { return (int)TokenType.LEX_FALSE; }
finally                             { return (int)TokenType.LEX_FINALLY; }
fixed                               { return (int)TokenType.LEX_FIXED; }
float                               { return (int)TokenType.LEX_FLOAT; }
for                                 { return (int)TokenType.LEX_FOR; }
foreach                             { return (int)TokenType.LEX_FOREACH; }
goto                                { return (int)TokenType.LEX_GOTO; }
implicit                            { return (int)TokenType.LEX_IMPLICIT; }
in                                  { return (int)TokenType.LEX_IN; }
int                                 { return (int)TokenType.LEX_INT; }
interface                           { return (int)TokenType.LEX_INTERFACE; }
internal                            { return (int)TokenType.LEX_INTERNAL; }
is                                  { return (int)TokenType.LEX_IS; }
lock                                { return (int)TokenType.LEX_LOCK; }
long                                { return (int)TokenType.LEX_LONG; }
namespace                           { return (int)TokenType.LEX_NAMESPACE; }
new                                 { return (int)TokenType.LEX_NEW; }
null                                { return (int)TokenType.LEX_NULL; }
object                              { return (int)TokenType.LEX_OBJECT; }
operator                            { return (int)TokenType.LEX_OPERATOR; }
out                                 { return (int)TokenType.LEX_OUT; }
override                            { return (int)TokenType.LEX_OVERRIDE; }
params                              { return (int)TokenType.LEX_PARAMS; }
private                             { return (int)TokenType.LEX_PRIVATE; }
protected                           { return (int)TokenType.LEX_PROTECTED; }
public                              { return (int)TokenType.LEX_PUBLIC; }
readonly                            { return (int)TokenType.LEX_READONLY; }
ref                                 { return (int)TokenType.LEX_REF; }
return                              { return (int)TokenType.LEX_RETURN; }
sbyte                               { return (int)TokenType.LEX_SBYTE; }
sealed                              { return (int)TokenType.LEX_SEALED; }
short                               { return (int)TokenType.LEX_SHORT; }
sizeof                              { return (int)TokenType.LEX_SIZEOF; }
stackalloc                          { return (int)TokenType.LEX_STACKALLOC; }
static                              { return (int)TokenType.LEX_STATIC; }
string                              { return (int)TokenType.LEX_STRING; }
struct                              { return (int)TokenType.LEX_STRUCT; }
switch                              { return (int)TokenType.LEX_SWITCH; }
this                                { return (int)TokenType.LEX_THIS; }
throw                               { return (int)TokenType.LEX_THROW; }
true                                { return (int)TokenType.LEX_TRUE; }
try                                 { return (int)TokenType.LEX_TRY; }
typeof                              { return (int)TokenType.LEX_TYPEOF; }
uint                                { return (int)TokenType.LEX_UINT; }
ulong                               { return (int)TokenType.LEX_ULONG; }
unchecked                           { return (int)TokenType.LEX_UNCHECKED; }
unsafe                              { return (int)TokenType.LEX_UNSAFE; }
ushort                              { return (int)TokenType.LEX_USHORT; }
using                               { return (int)TokenType.LEX_USING; }
virtual                             { return (int)TokenType.LEX_VIRTUAL; }
volatile                            { return (int)TokenType.LEX_VOLATILE; }
void                                { return (int)TokenType.LEX_VOID; }

[0-9]+                              { return (int)TokenType.LEX_NUMBER; }
[a-zA-Z_][a-zA-Z0-9_]*              { return (int)TokenType.LEX_INDENTIFIER; }

{White0}+                           { return (int)TokenType.LEX_WHITE; }
\n                                  { return (int)TokenType.LEX_WHITE; }
.                                   { return (int)TokenType.LEX_ERROR; }

%%
Scanner.lex文件
上面代码里面的TokenType枚举需要在另外的C#文件里面定义:
using System;
namespace Coder.LexScanner
{
    /// <summary>
    /// Integer token codes returned by the actions in Scanner.lex.
    /// Keyword codes are a sequence number combined with the
    /// <see cref="LEX_KEYWORD"/> flag bit, so a token is a keyword exactly
    /// when <c>(code &amp; LEX_KEYWORD) != 0</c>.
    /// </summary>
    public class TokenType
    {
        public const int LEX_NUMBER      = 1;
        public const int LEX_INDENTIFIER = 2;

        // Flag bit that is set in every keyword code below.
        public const int LEX_KEYWORD     = 1 << 30;

        public const int LEX_IF          = 3 | LEX_KEYWORD;
        public const int LEX_WHILE       = 4 | LEX_KEYWORD;
        public const int LEX_DO          = 5 | LEX_KEYWORD;
        public const int LEX_ABSTRACT    = 6 | LEX_KEYWORD;
        public const int LEX_AS          = 7 | LEX_KEYWORD;
        public const int LEX_BASE        = 8 | LEX_KEYWORD;
        public const int LEX_BOOL        = 9 | LEX_KEYWORD;
        public const int LEX_BREAK       = 10 | LEX_KEYWORD;
        public const int LEX_BYTE        = 11 | LEX_KEYWORD;
        public const int LEX_CASE        = 12 | LEX_KEYWORD;
        public const int LEX_CATCH       = 13 | LEX_KEYWORD;
        public const int LEX_CHAR        = 14 | LEX_KEYWORD;
        public const int LEX_CHECKED     = 15 | LEX_KEYWORD;
        public const int LEX_CLASS       = 16 | LEX_KEYWORD;
        public const int LEX_CONST       = 17 | LEX_KEYWORD;
        public const int LEX_CONTINUE    = 18 | LEX_KEYWORD;
        public const int LEX_DECIMAL     = 19 | LEX_KEYWORD;
        public const int LEX_DEFAULT     = 20 | LEX_KEYWORD;
        public const int LEX_DELEGATE    = 21 | LEX_KEYWORD;
        public const int LEX_DOUBLE      = 22 | LEX_KEYWORD;
        public const int LEX_ELSE        = 23 | LEX_KEYWORD;
        public const int LEX_ENUM        = 24 | LEX_KEYWORD;
        public const int LEX_EVENT       = 25 | LEX_KEYWORD;
        public const int LEX_EXPLICIT    = 26 | LEX_KEYWORD;
        public const int LEX_EXTERN      = 27 | LEX_KEYWORD;
        public const int LEX_FALSE       = 28 | LEX_KEYWORD;
        public const int LEX_FINALLY     = 29 | LEX_KEYWORD;
        public const int LEX_FIXED       = 30 | LEX_KEYWORD;
        public const int LEX_FLOAT       = 31 | LEX_KEYWORD;
        public const int LEX_FOR         = 32 | LEX_KEYWORD;
        public const int LEX_FOREACH     = 33 | LEX_KEYWORD;
        public const int LEX_GOTO        = 34 | LEX_KEYWORD;
        public const int LEX_IMPLICIT    = 35 | LEX_KEYWORD;
        public const int LEX_IN          = 36 | LEX_KEYWORD;
        public const int LEX_INT         = 37 | LEX_KEYWORD;
        public const int LEX_INTERFACE   = 38 | LEX_KEYWORD;
        public const int LEX_INTERNAL    = 39 | LEX_KEYWORD;
        public const int LEX_IS          = 40 | LEX_KEYWORD;
        public const int LEX_LOCK        = 41 | LEX_KEYWORD;
        public const int LEX_LONG        = 42 | LEX_KEYWORD;
        public const int LEX_NAMESPACE   = 43 | LEX_KEYWORD;
        public const int LEX_NEW         = 44 | LEX_KEYWORD;
        public const int LEX_NULL        = 45 | LEX_KEYWORD;
        public const int LEX_OBJECT      = 46 | LEX_KEYWORD;
        public const int LEX_OPERATOR    = 47 | LEX_KEYWORD;
        public const int LEX_OUT         = 48 | LEX_KEYWORD;
        public const int LEX_OVERRIDE    = 49 | LEX_KEYWORD;
        public const int LEX_PARAMS      = 50 | LEX_KEYWORD;
        public const int LEX_PRIVATE     = 51 | LEX_KEYWORD;
        public const int LEX_PROTECTED   = 52 | LEX_KEYWORD;
        public const int LEX_PUBLIC      = 53 | LEX_KEYWORD;
        public const int LEX_READONLY    = 54 | LEX_KEYWORD;
        public const int LEX_REF         = 55 | LEX_KEYWORD;
        public const int LEX_RETURN      = 56 | LEX_KEYWORD;
        public const int LEX_SBYTE       = 57 | LEX_KEYWORD;
        public const int LEX_SEALED      = 58 | LEX_KEYWORD;
        public const int LEX_SHORT       = 59 | LEX_KEYWORD;
        public const int LEX_SIZEOF      = 60 | LEX_KEYWORD;
        public const int LEX_STACKALLOC  = 61 | LEX_KEYWORD;
        public const int LEX_STATIC      = 62 | LEX_KEYWORD;
        public const int LEX_STRING      = 63 | LEX_KEYWORD;
        public const int LEX_STRUCT      = 64 | LEX_KEYWORD;
        public const int LEX_SWITCH      = 65 | LEX_KEYWORD;
        public const int LEX_THIS        = 66 | LEX_KEYWORD;
        public const int LEX_THROW       = 67 | LEX_KEYWORD;
        public const int LEX_TRUE        = 68 | LEX_KEYWORD;
        public const int LEX_TRY         = 69 | LEX_KEYWORD;
        public const int LEX_TYPEOF      = 70 | LEX_KEYWORD;
        public const int LEX_UINT        = 71 | LEX_KEYWORD;
        public const int LEX_ULONG       = 72 | LEX_KEYWORD;
        public const int LEX_UNCHECKED   = 73 | LEX_KEYWORD;
        public const int LEX_UNSAFE      = 74 | LEX_KEYWORD;
        public const int LEX_USHORT      = 75 | LEX_KEYWORD;
        public const int LEX_USING       = 76 | LEX_KEYWORD;
        public const int LEX_VIRTUAL     = 77 | LEX_KEYWORD;
        public const int LEX_VOLATILE    = 78 | LEX_KEYWORD;
        public const int LEX_VOID        = 79 | LEX_KEYWORD;

        // Non-keyword codes. They never carry the LEX_KEYWORD bit, so the
        // values 25-28 do not collide with the keyword codes above.
        public const int LEX_MULTILINECOMMENT       = 80;
        public const int LEX_MULTILINECOMMENT_BEGIN = 81;
        public const int LEX_MULTILINECOMMENT_END   = 82;
        public const int LEX_COMMENT                = 25;
        public const int LEX_WHITE                  = 26;
        public const int LEX_ERROR                  = 27;
        public const int LEX_STRINGLITERAL          = 28;
    }

    /// <summary>
    /// Error-reporting interface declared alongside the token codes.
    /// </summary>
    public interface IErrorHandler
    {
        int ErrNum { get; }

        int WrnNum { get; }

        void AddError(string msg, int lin, int col, int len, int severity);
    }
}
然后使用命令根据词法文件生成词法匹配的C#代码:
Mplex.exe scanner.lex
最后为了判断生成的C#代码是否有用,我写了一个小程序调用词法匹配函数测试了一下:
using System; using Coder.LexScanner;
/// <summary>
/// Small console driver: reads one line from standard input, feeds it to the
/// generated Scanner and prints every token it reports.
/// </summary>
public class TestClass
{
    public static void Main()
    {
        string text = Console.ReadLine();
        Scanner scnr = new Scanner();
        int state = 0;
        int result = 0;
        int start, end;

        // Console.ReadLine() returns null at end of input, so test for null
        // before calling Trim().
        if (text != null && text.Trim().Length > 0)
        {
            scnr.SetSource(text, 0);

            // Fetch tokens until the scanner reports end of input.
            while ((result = scnr.GetNext(ref state, out start, out end)) != (int)Tokens.EOF)
            {
                Console.WriteLine(
                    "result: {0}, state: {1}, start: {2}, end: {3}",
                    result, state, start, end);
            }
        }
    }
}
实际上,Visual Studio的代码高亮显示功能也是通过Mplex和Mppg实现,这样做的好处是,新的编程语言可以以插件的形式加入到Visual Studio里面来,而Visual Studio照样能够在编辑新编程语言的程序时,实现高亮显示以及其他,例如变量和函数定义查找、智能提示框之类的功能。