确定有限自动机（DFA）——一个简单的C++词法分析器

现在的位置: 首页 > 综合 > 正文

确定有限自动机（DFA）——一个简单的C++词法分析器

2014年02月01日 ⁄ 综合 ⁄ 共 14050字 ⁄ 字号小中大 ⁄ 评论关闭

开始想运用确定有限自动机去实现一个简单的C++词法分析器时，我感到很困难，不知从何处下手，因为C++词法太多太复杂，并且为了体现c++的特性又不得不去对它的语法作一点引入。幸运的是，在搜索相关资料时，一位网友的关于用c++构造词法自动机文章帮我解决了心中的难题，在此万分感谢！。（作者原文地址：http://hi.baidu.com/peiwenlin/blog/item/8c768d31137024af5edf0e0e.html）我根据作者的代码以及自己的思路对程序重新进行了改造，具体改造如下：

（1）比较深入地阐述了DFA理论，给出了DFA的模拟程序；

（2）使程序能识别所有的C++关键字；

（3）增加了对特殊符号的识别，比如. ' "；

（4）增加了对c++注释符号//的识别；

（5）增加了对标识符的识别力度，如ID = (_ | letter)(letter | digit | _)* ；

（6）修正了专有符号命名与编译器系统命名的冲突，比如“<”映射成字符串LT，LT与宏IDAStatics_LT(This,a_0,b_1,ret_2)相冲突，尽管在win32控制台程序中未报错，但在MFC应用程序中却会报错，解决办法是在把LT改为LTEN（-EN即enum)。

（7）最大的创新是用MFC开发了一个图形化的C++词法分析器。

（8）最后是简化了原代码，比如删掉了无用的函数isReservedWords（）。

前言

词法分析是编译的第一个阶段，它的主要任务是从左至右逐个字符地对源程序进行扫描，产生一个个单词序列，用以语法分析。实际上，词法也是语法的一部分，词法描述完全可以归并到语法描述中去，只不过词法规则更简单些。为什么将词法分析做为一个独立的阶段？为什么把编译过程的分析工作划分成词法分析和语法分析两个阶段？主要的考虑因素为：　① 使整个编译程序的结构更简洁、清晰和条理化。　② 编译程序的效率会改进。　③ 增强编译程序的可移植性。
词法分析程序的任务是：读源程序，产生单词符号；滤掉空格，跳过注释、换行符；追踪换行标志，复制出错源程序等等。

描述程序设计语言的词法的机制是正则表达式，识别机制是有穷状态自动机。

设计思想

      DFA（确定的有穷自动机）定义：一个确定的有穷自动机（DFA）M是一个五元组：M=（K，Σ，f，S，Z）其中
　　      ① K是一个有穷集，它的每个元素称为一个状态；
　　      ② Σ是一个有穷字母表，它的每个元素称为一个输入符号，所以也称Σ为输入符号字母表；
　　      ③ f是转换函数，是K×Σ→K上的映射，即如 f（ki，a）=kj，（ki∈K，kj∈K）就意味着，
                 当前状态为ki，输入符为a时，将转换为下一个状态kj，我们把kj称作ki的一个后继状态；
　　      ④ S ∈ K是唯一的一个初态；
　　      ⑤ Z K是一个终态集，终态也称可接受状态或结束状态。
       模拟DFA的程序思想如下伪代码示例：
       K=S；
　　　 c=getchar();
　　　 while(c!=EOF)
　　　 {
          K=f(K,c);
　　　　　c=getchar();
　　　 }
　　　 if( K== Z )return TRUE
　　　 else return FALSE

C++的单词识别基本思想

标识符ID = (_ | letter)(letter | digit | _)*

数字NUM =digit*

字母letter = a|..|z|A|..|Z|

数字digit = 0|..|9

C++专用符号+ - * / < <= > >= == != = ; , . ' " ( ) [ ] { } /* */

C++语言的关键字(所有的关键字都是保留字，并且必须是小写)

auto const dynamic_cast for mutable register static_cast try virtual

bool const_cast else friend namespace reinterpret_cast struct typedef void

break continue enum goto new return switch typeid volatile

case default explicit if operator short template typename while

catch delete extern inline private signed this union

char do false int protected sizeof throw unsigned

class double float long public static true using

小写和大写字母是有区别的。

空格由空白、换行符和制表符组成。空格通常被忽略，除了它必须分开ID、NUM关键字。

注释用通常的C语言符号/ * . . . * /围起来。注释可以放在任何空白出现的(即注释不能放在标记内)上，且可以超过一行。注释不能嵌套。

DFA控制台应用程序代码

#include <iostream>
#include <fstream>
#include <string>
using namespace std;

#define MAXRESERVED 60 //define the ReservedWords total num.
#define MAXSPECIALSYMBOLS 24 //define the specialSymbols total num
// define StateType,5 states.
typedef enum
{
START,ISCOMMENT,ISNUM,ISID,DONE
}StateType;

//define TokenType
typedef enum
{
    AUTO,BOOL,BREAK,CASE,CATCH,CHAR,CLASS, /* reserved words */
CONST,CONST_CAST,CONTINUE,DEFAULT,DELETE,DO,DOUBLE,
DYNAMIC_CAST,ELSE,ENUM,EXPLICIT,EXTERN,FALSE1,FLOAT,
FOR,FRIEND,GOTO,IF,INLINE,INT,LONG,
MUTABLE,NAMESPACE,NEW,OPERATOR,PRIVATE,PROTECTED,PUBLIC,
REGISTER,REINTERPRET_CAST,RETURN,SHORT,SIGNED,SIZEOF,STATIC,
STATIC_CAST,STRUCT,SWITCH,TEMPLATE,THIS,THROW,TRUE1,
TRY,TYPEDEF,TYPEID,TYPENAME,UNION,UNSIGNED,USING,
VIRTUAL,VOID,VOLATILE,WHILE,
    ID,NUM, /* multicharacter tokens */
    /* special symbols */
    PLUS,MINUS,MTPLUS,DEVIDE,EQ,ASSIGN,LT,MT,TIMES,DOT,SQ,DQ,LXKH,RXKH,LZKH,RZKH,LDKH,RDKH,SEMI,LE,ME,NOTEQ,FH,LZS,RZS,ZS,
    ENDFILE,UNKNOWN,ERROR /*stateToken */
}TokenType;

//define the specialSymbols' map table between Tokenstr and TokenType
string strsq=""+'/'',strdq=""+'/"';
struct{
        char* str;
        TokenType tt;
}specialSymbols[MAXSPECIALSYMBOLS] = { {"+",PLUS},{"-",MINUS},{"*",MTPLUS},{"/",DEVIDE},{"<",LT},{"<=",LE},{">",MT},
{">=",ME},{"==",EQ},{"!=",NOTEQ},{"=",ASSIGN},{",",SEMI},{";",FH},{".",DOT},{(char*)strsq.c_str(),SQ},{( char*)strdq.c_str(),DQ},{"(",LXKH},
           {")",RXKH},{"[",LZKH},{"]",RZKH},{"{",LDKH},{"}",RDKH},{"/*",LZS},{"*/",RZS} };

//define the reservedWords' map table between Tokenstr and TokenType
struct{
char* str;
TokenType tt;
}reservedWords[MAXRESERVED] = {{"auto",AUTO},{"bool",BOOL},{"break",BREAK},{"case",CASE},{"catch",CATCH},{"char",CHAR},{"class",CLASS},
{"const",CONST},{"const_cast",CONST_CAST},{"continue",CONTINUE},{"default",DEFAULT},{"delete",DELETE},{"do",DO},{"double",DOUBLE},
{"dynamic_cast",DYNAMIC_CAST},{"else",ELSE},{"enum",ENUM},{"explicit",EXPLICIT},{"extern",EXTERN},{"false",FALSE1},{"float",FLOAT},
{"for",FOR},{"friend",FRIEND},{"goto",GOTO},{"if",IF},{"inline",INLINE},{"int",INT},{"long",LONG},
{"mutable",MUTABLE},{"namespace",NAMESPACE},{"new",NEW},{"operator",OPERATOR},{"private",PRIVATE},{"protected",PROTECTED},{"public",PUBLIC},
{"register",REGISTER},{"reinterpret_cast",REINTERPRET_CAST},{"return",RETURN},{"short",SHORT},{"signed",SIGNED},{"sizeof",SIZEOF},{"static",STATIC},
{"static_cast",STATIC_CAST},{"struct",STRUCT},{"switch",SWITCH},{"template",TEMPLATE},{"this",THIS},{"throw",THROW},{"true",TRUE1},
{"try",TRY},{"typedef",TYPEDEF},{"typeid",TYPEID},{"typename",TYPENAME},{"union",UNION},{"unsigned",UNSIGNED},{"using",USING},
{"virtual",VIRTUAL},{"void",VOID},{"volatile",VOLATILE},{"while",WHILE}};

//output to the ofstream associated with output.txt.
void outputFile(ofstream& outf,TokenType tt,string str)
{
    for(int i = 0;i < MAXRESERVED;i++)
{
       if(reservedWords[i].str == str)
    {
          tt = reservedWords[i].tt;
          outf << "ReservedWords: " << str<<endl;
    }
}
    for(int j=0;j<MAXSPECIALSYMBOLS;j++)
{
    if(specialSymbols[j].tt == tt)
          outf << "Special Symbol: " << str <<endl;
}
    if(tt == ZS)
       outf << "ZS: " << str << endl;
    if(tt == ID)
       outf<<"ID: "<< str <<endl;
    if(tt == NUM)
       outf<<"NUM: "<< str.c_str()<<endl;
    if(tt == ERROR)
       outf << "ERROR: " << str << endl;
    if(tt == UNKNOWN)
       outf << "UNKNOWN Symbol: " << str << endl;
}//outputFile()

// get the token,确定有穷自动机（DFA）的实现
void GetToken(ifstream& inf,ofstream& outf,int& lineno)
{
    TokenType tt;
    StateType state = START;
    string str = "";
    static int flag = 1;
bool isdq=true;
bool isdbline=false;
    while(state != DONE)
{
       char c = inf.get();
       if(flag == 1)
    {
          outf << "-------------------------------------------" << endl;
          outf << "In the line " << lineno << " : " <<endl;
          flag = 0;
    }
       switch(state)
    {
          case START:
     if(isalpha(c)||c=='_')
     {
                state = ISID;
                str += c;
     }
     else if(isdigit(c))
     {
                state = ISNUM;
                str += c;
              }
     else if(c == '/t' || c == ' ') {}
           else if(c == '/n')
     {
                lineno = lineno + 1;
                outf << "-------------------------------------------" << endl;
                outf << "In the line " << lineno << " : " <<endl;
     }
     else if(c == '/' && inf.peek() == '*')
     {
                str = str + c;
                c = inf.get();
                str = str + c;
                state = ISCOMMENT;
     }
     else if(c=='/'&&inf.peek()=='/')
     {
        str = str + c;
                 c = inf.get();
                 str = str + c;
                 state = ISCOMMENT;
     isdbline=true;
     }
     else
     {
    state = DONE;
                str += c;
                //judge if c is special symbols,if FALSE ,output UNKNOWN
                switch(c) {
                   case EOF: tt = ENDFILE;break;
                   case '+': tt = PLUS;break;
                   case '-': tt = MINUS;break;
                   case '*': tt = MTPLUS;break;
                   case '/': tt = DEVIDE;break;
                   case ';': tt = FH;break;
                   case '.': tt = DOT;break;
       case '/'':tt = SQ;break;
       case '/"':tt = DQ;break;
       case '(': tt = LXKH;break;
                   case ')': tt = RXKH;break;
                   case '=':
         //judge if is '=='
                        if(inf.peek() == '=')
      {
                          c = inf.get();
                          tt = EQ;
                          str+=c;
                        }
      else //judge if is '='
         tt = ASSIGN;
      break;
                   case '<': //judge if is '<='
                        if(inf.peek() == '=')
      {
                          c = inf.get();
                          tt = LE;str+=c;
                        }
      else //judge if is '<'
                            tt = LT;
         break;
                   case '>':
         //judge if is '>='
                        if(inf.peek() == '=')
      {
                          c = inf.get();
                          tt = ME;
                          str+=c;
      }
      else //judge if is '>'
                            tt = MT;
               break;
                   case ',': tt = SEMI;break;
                   case '[': tt = LZKH;break;
                   case ']': tt = RXKH;break;
                   case '{': tt = LDKH;break;
                   case '}': tt = RDKH;break;
                   case '!':
        c = inf.get();
                       if(c == '=')
        {
                         tt = NOTEQ;
                         str+=c;
        }
        else
        {
                          inf.putback(c);
                          tt=UNKNOWN;
        }
        break;
                   default: tt = UNKNOWN;break;
    }//switch(c)
       } //else
          //case START
           break;
       case ISCOMMENT:
     if(c=='/n')
     {
      if(isdbline==true)
      {
          state = DONE;tt=ZS;
       inf.putback(c);
      }
      else
       lineno=lineno+1;
     }
           if(c == '*' && inf.peek() == '/')
     {
             str = str + c;
             c = inf.get();
             state = DONE;tt=ZS;
     }
     else if(inf.eof())
     {
             tt = ERROR;
             state = DONE;
     }
     str = str + c;
     break;

       case ISNUM:
     if(!isdigit(c))
     {
             if(c=='.'&&isdq==true)
    {
       if(isdigit(inf.peek()))
    {
        str += c;isdq=false;
              break;
    }
                else
    {
        str = str + c;
                    if(inf.peek() == ' ' || inf.peek() == '/t' || inf.peek() == '/n')
     {
                      tt = ERROR;state = DONE;
                      break;
     }
     break;
    }
    }

             if(isalpha(c)||c=='.'||c=='_')
    {
               str = str + c;
               if(inf.peek() == ' ' || inf.peek() == '/t' || inf.peek() == '/n')
      {
                 tt = ERROR;state = DONE;
                 break;
      }
               tt=ERROR;
             }
    else
    {
      if(tt!=ERROR)
         tt=NUM;
      inf.putback(c);
               state = DONE;
    }
             break;
     }
     str += c;
     break;

       case ISID:
     if(!isalpha(c))
     {
             if(c=='.')
    {
       str = str + c;
                if(inf.peek() == ' ' || inf.peek() == '/t' || inf.peek() == '/n')
    {
                  tt = ERROR;state = DONE;break;
    }
    tt=ERROR;break;
    }
             if(isdigit(c)|| c=='_')
    {
               str = str + c;
               if(inf.peek() == ' ' || inf.peek() == '/t' || inf.peek() == '/n')
      {
                   if(tt!=ERROR)
                   {
         tt = ID;state = DONE;break;
       }
                   state = DONE;break;
      }

      break;

}

       else
    {
      if(tt!=ERROR)
                    tt=ID;
               inf.putback(c);
               state = DONE;
    }
             break;
     }
        str += c;
     break;

       case DONE:break;
    default:
          state = DONE;
          tt = ERROR;
          break;
    } //switch(state)
}//while(state != DONE)
   // if is done output to the file the message.
   if(state == DONE)
   {
       outputFile(outf,tt,str);
   }
}//GetToken()

int main()
{
   ofstream outf("output.txt");
   ifstream inf("input.txt");
   //if inf is error,output the error message.
   if ( !inf )
   {
      cerr << "cannot open output.txt file!" << endl;
      return EXIT_FAILURE;
   }
   int lineno = 1;
   while(true)
   {
      GetToken(inf,outf,lineno);
      if(inf == NULL)
   {
         outf << "END OF FILE!" << endl;
         break;
   }
   }
   cout<<"☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆"<<endl;
   cout<<"词法自动机识别程序中的单词已完成！请查看output.txt文件，谢谢使用！(=^ ^=)"<<endl;
   // file stream close
   inf.close();
   outf.close();
   return EXIT_SUCCESS;
}

DFA图形化应用程序

程序使用指南：为了对一段C++程序进行词法分析，首先单击“导入源文件”按钮，在
弹出的文件打开对话框中选择*.txt、*.dat或*.cpp类型的文件，确定后C++程序将导入到左
边的编辑框中显示；单击“开始词法分析”按钮，程序将对左边编辑框中的C++代码进行词
法分析，分析完后再把分析结果显示在右边的编辑框中；单击“退出程序”按钮将退出程序。

程序测试

测试数据如下：

//input.txt
int 6_a=1,b8_=2,a.9;
char c='c';
char *people[3]={"lisi","王五"};
float pi1=3.14;
double pi2=3.14.02;
if(a>=b) /* >= */
   a+=8;
else
{
   a++;
   return ;
}

词法分析器执行结果如下：

-------------------------------------------
In the line 1 :
ZSEN: //input.txt

-------------------------------------------
In the line 2 :
ReservedWords: int
ERRORA: 6_a
Special Symbol: =
NUM: 1
Special Symbol: ,
ID: b8_
Special Symbol: =
NUM: 2
Special Symbol: ,
ERRORA: a.9
Special Symbol: ;
-------------------------------------------
In the line 3 :
ReservedWords: char
ID: c
Special Symbol: =
Special Symbol: '
ID: c
Special Symbol: '
Special Symbol: ;
-------------------------------------------
In the line 4 :
ReservedWords: char
Special Symbol: *
ID: people
Special Symbol: [
NUM: 3
Special Symbol: ]
Special Symbol: =
Special Symbol: {
Special Symbol: "
ID: lisi
Special Symbol: "
Special Symbol: ,
Special Symbol: "
UNKNOWNA Symbol: ?
UNKNOWNA Symbol: ?
UNKNOWNA Symbol: ?
UNKNOWNA Symbol: ?
Special Symbol: "
Special Symbol: }
Special Symbol: ;
-------------------------------------------
In the line 5 :
ReservedWords: float
ID: pi1
Special Symbol: =
NUM: 3.14
Special Symbol: ;
-------------------------------------------
In the line 6 :
ReservedWords: double
ID: pi2
Special Symbol: =
ERRORA: 3.14.02
Special Symbol: ;
-------------------------------------------
In the line 7 :
ReservedWords: if
Special Symbol: (
ID: a
Special Symbol: >=
ID: b
Special Symbol: )
ZSEN: /* >= */
-------------------------------------------
In the line 8 :
ID: a
Special Symbol: +
Special Symbol: =
NUM: 8
Special Symbol: ;
-------------------------------------------
In the line 9 :
ReservedWords: else
-------------------------------------------
In the line 10 :
Special Symbol: {
-------------------------------------------
In the line 11 :
ID: a
Special Symbol: +
Special Symbol: +
Special Symbol: ;
-------------------------------------------
In the line 12 :
ReservedWords: return
Special Symbol: ;
-------------------------------------------
In the line 13 :
Special Symbol: }
END OF FILE!

源代码分享

文中所涉及的所有程序源码和文档都已上传到了csdn下载栏里。具体地址为http://download.csdn.net/source/2513309。最后祝大家分享快乐！

【上篇】SRM 584div2
【下篇】windbg符号表问题，曾经纠结的东东

作者: 92899

该日志由 92899 于10年前发表在综合分类下，最后更新于 2014年02月01日.
转载请注明: 确定有限自动机（DFA）——一个简单的C++词法分析器 | 学步园 +复制链接

抱歉!评论已关闭.

学步园

确定有限自动机（DFA）——一个简单的C++词法分析器

作者: 92899

书签

最新文章New

本站推荐

返回首页