计算机相关专业的差不多都有学过编译原理吧?今天我班门弄刀,也谈谈我自己对编译原理的认识和理解。当然啦,我主要要谈的是编译器的前端的实现,后端的代码生成我目前还没有研究过。
实现一个编译器有两大步骤:一是词法分析,二是语法分析。应对这两块有很多的工具是可以帮助我们进行这样的工作的(例如:flex、yacc等),但我要说的是怎么完全手工去实现它。语法分析的主要目的是把一个个的字符和字母之类的东西给识别为token,而语法分析的主要作用是去构建分析后的对象或是构建抽象语法树。下面我结合早一段时间开发的JSON的反序列化来聊聊整个的过程。
词法分析
private IList<Token> ReadTokens(char[] chars)
{
IList<Token> tokens = new List<Token>();
StringBuilder sb = new StringBuilder();
CharReader src = new CharReader(chars);
char c = src.Read();
{
switch (c)
{
#region 空白字符
case '\t':
{
c = src.Read();
while (c == '\t') { c = src.Read(); }
break;
}
case ' ':
{
c = src.Read();
while (c == ' ') { c = src.Read(); }
break;
}
case '\r':
{
c = src.Read();
while (c == '\r') { c = src.Read(); }
break;
}
case '\n':
{
c = src.Read();
while (c == '\n') { c = src.Read(); }
break;
}
#endregion
case '[':
{
tokens.Add(new Token(TokenId.LBracket));
c = src.Read();
break;
}
case ']':
{
tokens.Add(new Token(TokenId.RBracket));
c = src.Read();
break;
}
case '{':
{
tokens.Add(new Token(TokenId.LCurly));
c = src.Read();
break;
}
case '}':
{
tokens.Add(new Token(TokenId.RCurly));
c = src.Read();
break;
}
case ':':
{
tokens.Add(new Token(TokenId.Colon));
c = src.Read();
break;
}
case ',':
{
tokens.Add(new Token(TokenId.Comma));
c = src.Read();
break;
}
#endregion
case '@':
case '\'':
case '"':
{
bool isVerbatim = false;
if (c == '@')
{
isVerbatim = true;
c = src.Read(); // skip to follow quote
}
sb.Length = 0;
char quote = c;
bool isSingleQuote = (c == '\'');
c = src.Read();
while (!src.Eof)
{
if (c == '\\' && !isVerbatim) // normal escaped chars
{
c = src.Read();
switch (c)
{
case '\0':
{
if (src.Eof)
{
goto readLoop;
}
else
{
sb.Append('\0');
c = src.Read();
break;
}
}
case 'a':
{
sb.Append('\a');
c = src.Read();
break;
}
case 'b':
{
sb.Append('\b');
c = src.Read();
break;
}
case 'f':
{
sb.Append('\f');
c = src.Read();
break;
}
case 'n':
{
sb.Append('\n');
c = src.Read();
break;
}
case 'r':
{
sb.Append('\r');
c = src.Read();
break;
}
case 't':
{
sb.Append('\t');
c = src.Read();
break;
}
case 'v':
{
sb.Append('\v');
c = src.Read();
break;
}
case '\\':
{
sb.Append('\\');
c = src.Read();
break;
}
case '\'':
{
sb.Append('\'');
c = src.Read();
break;
}
case '\"':
{
// strings are always stored as verbatim for now, so the double quote is needed
sb.Append("\"\"");
c = src.Read();
break;
}
default:
{
sb.Append(c);
break;
}
}
}
else if (c == '\"')
{
c = src.Read();
// two double quotes are escapes for quotes in verbatim mode
if (c == '\"' && isVerbatim)// verbatim escape
{
sb.Append("\"\"");
c = src.Read();
}
else if (isSingleQuote)
{
sb.Append('\"');
}
else
{
break;
}
}
else // non escaped
{
if (c == quote)
{
break;
}
sb.Append(c);
c = src.Read();
}
{
if (c == quote)
{
c = src.Read(); // skip last quote
}
}
break;
}
#endregion
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '.':
case '+':
case '-':
{
sb.Length = 0;
TokenId numKind = TokenId.Int;
{
c = src.Read();
if (c < '0' || c > '9')
{
break;
}
else
{
sb.Append('.');
numKind = TokenId.Double;
isDouble = true;
}
}
else if (c == '+')
{
c = src.Read();
}
else if (c == '-')
{
sb.Append('-');
c = src.Read();
}
{
switch (c)
{
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
{
sb.Append(c);
break;
}
case '.':
{
if (isDouble)
{
numKind = TokenId.Double;
tokens.Add(new Token(numKind, sb.ToString()));
goto readLoop;
}
if (c < '0' || c > '9')
{
tokens.Add(new Token(numKind, sb.ToString()));
goto readLoop;
}
private IList<Token> ReadTokens(char[] chars)
{
IList<Token> tokens = new List<Token>();
StringBuilder sb = new StringBuilder();
CharReader src = new CharReader(chars);
char c = src.Read();
readLoop:
{
switch (c)
{
#region 空白字符
case '\t':
{
c = src.Read();
while (c == '\t') { c = src.Read(); }
break;
}
case ' ':
{
c = src.Read();
while (c == ' ') { c = src.Read(); }
break;
}
case '\r':
{
c = src.Read();
while (c == '\r') { c = src.Read(); }
break;
}
case '\n':
{
c = src.Read();
while (c == '\n') { c = src.Read(); }
break;
}
#endregion
#region 运算符
case '[':
{
tokens.Add(new Token(TokenId.LBracket));
c = src.Read();
break;
}
case ']':
{
tokens.Add(new Token(TokenId.RBracket));
c = src.Read();
break;
}
case '{':
{
tokens.Add(new Token(TokenId.LCurly));
c = src.Read();
break;
}
case '}':
{
tokens.Add(new Token(TokenId.RCurly));
c = src.Read();
break;
}
case ':':
{
tokens.Add(new Token(TokenId.Colon));
c = src.Read();
break;
}
case ',':
{
tokens.Add(new Token(TokenId.Comma));
c = src.Read();
break;
}
#endregion
#region 字符串
case '@':
case '\'':
case '"':
{
bool isVerbatim = false;
if (c == '@')
{
isVerbatim = true;
c = src.Read(); // skip to follow quote
}
sb.Length = 0;
char quote = c;
bool isSingleQuote = (c == '\'');
c = src.Read();
while (!src.Eof)
{
if (c == '\\' && !isVerbatim) // normal escaped chars
{
c = src.Read();
switch (c)
{
case '\0':
{
if (src.Eof)
{
goto readLoop;
}
else
{
sb.Append('\0');
c = src.Read();
break;
}
}
case 'a':
{
sb.Append('\a');
c = src.Read();
break;
}
case 'b':
{
sb.Append('\b');
c = src.Read();
break;
}
case 'f':
{
sb.Append('\f');
c = src.Read();
break;
}
case 'n':
{
sb.Append('\n');
c = src.Read();
break;
}
case 'r':
{
sb.Append('\r');
c = src.Read();
break;
}
case 't':
{
sb.Append('\t');
c = src.Read();
break;
}
case 'v':
{
sb.Append('\v');
c = src.Read();
break;
}
case '\\':
{
sb.Append('\\');
c = src.Read();
break;
}
case '\'':
{
sb.Append('\'');
c = src.Read();
break;
}
case '\"':
{
// strings are always stored as verbatim for now, so the double quote is needed
sb.Append("\"\"");
c = src.Read();
break;
}
default:
{
sb.Append(c);
break;
}
}
}
else if (c == '\"')
{
c = src.Read();
// two double quotes are escapes for quotes in verbatim mode
if (c == '\"' && isVerbatim)// verbatim escape
{
sb.Append("\"\"");
c = src.Read();
}
else if (isSingleQuote)
{
sb.Append('\"');
}
else
{
break;
}
}
else // non escaped
{
if (c == quote)
{
break;
}
sb.Append(c);
c = src.Read();
}
}
if (c != '\0')
{
if (c == quote)
{
c = src.Read(); // skip last quote
}
tokens.Add(
new Token(TokenId.String, sb.ToString()));}
break;
}
#endregion
#region 数字
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '.':
case '+':
case '-':
{
sb.Length = 0;
TokenId numKind = TokenId.Int;
bool isDouble = false;
if (c == '.')
{
c = src.Read();
if (c < '0' || c > '9')
{
break;
}
else
{
sb.Append('.');
numKind = TokenId.Double;
isDouble = true;
}
}
else if (c == '+')
{
c = src.Read();
}
else if (c == '-')
{
sb.Append('-');
c = src.Read();
}
bool isNum = true;
while (isNum)
{
switch (c)
{
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
{
sb.Append(c);
break;
}
case '.':
{
if (isDouble)
{
numKind = TokenId.Double;
tokens.Add(new Token(numKind, sb.ToString()));
goto readLoop;
}
c
= src.Read();if (c < '0' || c > '9')
{
tokens.Add(new Token(numKind, sb.ToString()));
goto readLoop;
}
作者: borrower
- 该日志由 borrower 于11年前发表在综合分类下,最后更新于 2012年11月21日.
- 转载请注明: 转:谈谈编译原理和其在WEB开发中的应用1 http://www.cnblogs.com/afxcn | 学步园 +复制链接
抱歉!评论已关闭.