现在的位置: 首页 > 综合 > 正文

LCC编译器的源程序分析(7)词法分析

2013年03月13日 ⁄ 综合 ⁄ 共 14462字 ⁄ 字号 评论关闭
下面开始关键字、ID等识别,采用这种词法分析,是最高效的,由于在识别的过程里,就已经区分它是什么关键字,而不像其它的词法分析程序,需要查找才能决定是否是关键字。
#074         case 'i':
#075               if (rcp[0] == 'f'
#076               && !(map[rcp[1]]&(DIGIT|LETTER))) {
#077                    cp = rcp + 1;
#078                    return IF;
#079               }
#080               if (rcp[0] == 'n'
#081               && rcp[1] == 't'
#082               && !(map[rcp[2]]&(DIGIT|LETTER))) {
#083                    cp = rcp + 2;
#084                    tsym = inttype->u.sym;
#085                    return INT;
#086               }
#087               goto id;
#088         case 'h': case 'j': case 'k': case 'm': case 'n': case 'o':
#089         case 'p': case 'q': case 'x': case 'y': case 'z':
#090         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
#091         case 'G': case 'H': case 'I': case 'J': case 'K':
#092         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
#093         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
#094         case 'Y': case 'Z':
#095         id:
#096               if (limit - rcp < MAXLINE) {
#097                    cp = rcp - 1;
#098                    fillbuf();
#099                    rcp = ++cp;
#100               }
#101               assert(cp == rcp);
#102               token = (char *)rcp - 1;
#103               while (map[*rcp]&(DIGIT|LETTER))
#104                    rcp++;
#105               token = stringn(token, (char *)rcp - token);
#106               tsym = lookup(token, identifiers);
#107               cp = rcp;
#108               return ID;
74行是识别以i开头的关键字和ID字符串。第75行到第79行是识别if关键字,它是通过缓冲区里前三个字符if、分隔符来区分的,如果第三个字符是分隔符,说明它就是if关键字了,如果还有其它合法的字符,就是变量ID了。比如像ifStmt就是变量ID了。
80行到第86行是识别int关键字。它返回一个记号INT的值,同时还取得INT类型符号信息。
87行是所有的字符不是关键字时,就跳到id标号那里处理为变量ID
第88行到第108行都是把字符串识别为变量ID,因为以这些字母开头的字符串不可能是关键字。
在第96行到第99行是重新填充字符缓冲区,以便识别完整的变量ID出来。
在第102行到第105行里,就获取ID的字符串,并保存到token里。
在第106行里查找这个ID是否已经声明,如果没有声明,返回的是空指针,并赋给tsym。
在第108行里返回ID这个记号来标识当前已经识别出一个ID了。
 
下面是进行数字串的识别:
#109         case '0': case '1': case '2': case '3': case '4':
#110         case '5': case '6': case '7': case '8': case '9': {
#111               unsigned long n = 0;
#112               if (limit - rcp < MAXLINE) {
#113                    cp = rcp - 1;
#114                    fillbuf();
#115                    rcp = ++cp;
#116               }
#117               assert(cp == rcp);
#118               token = (char *)rcp - 1;
#119               if (*token == '0' && (*rcp == 'x' || *rcp == 'X')) {
#120                    int d, overflow = 0;
#121                    while (*++rcp) {
#122                          if (map[*rcp]&DIGIT)
#123                               d = *rcp - '0';
#124                          else if (*rcp >= 'a' && *rcp <= 'f')
#125                               d = *rcp - 'a' + 10;
#126                          else if (*rcp >= 'A' && *rcp <= 'F')
#127                               d = *rcp - 'A' + 10;
#128                          else
#129                               break;
#130                          if (n&~(~0UL >> 4))
#131                               overflow = 1;
#132                          else
#133                               n = (n<<4) + d;
#134                    }
#135                    if ((char *)rcp - token <= 2)
#136                          error("invalid hexadecimal constant `%S'/n", token, (char *)rcp-token);
#137                    cp = rcp;
#138                    tsym = icon(n, overflow, 16);
#139               } else if (*token == '0') {
#140                    int err = 0, overflow = 0;
#141                    for ( ; map[*rcp]&DIGIT; rcp++) {
#142                          if (*rcp == '8' || *rcp == '9')
#143                               err = 1;
#144                          if (n&~(~0UL >> 3))
#145                               overflow = 1;
#146                          else
#147                               n = (n<<3) + (*rcp - '0');
#148                    }
#149                    if (*rcp == '.' || *rcp == 'e' || *rcp == 'E') {
#150                          cp = rcp;
#151                          tsym = fcon();
#152                          return FCON;
#153                    }
#154                    cp = rcp;
#155                    tsym = icon(n, overflow, 8);
#156                    if (err)
#157                          error("invalid octal constant `%S'/n", token, (char*)cp-token);
#158               } else {
#159                    int overflow = 0;
#160                    for (n = *token - '0'; map[*rcp]&DIGIT; ) {
#161                          int d = *rcp++ - '0';
#162                          if (n > (ULONG_MAX - d)/10)
#163                               overflow = 1;
#164                          else
#165                               n = 10*n + d;
#166                    }
#167                    if (*rcp == '.' || *rcp == 'e' || *rcp == 'E') {
#168                          cp = rcp;
#169                          tsym = fcon();
#170                          return FCON;
#171                    }
#172                    cp = rcp;
#173                    tsym = icon(n, overflow, 10);
#174               }
#175               return ICON;
#176         }
109行到第110行里都是以数字开头的字符,这是C标准里规定数字常量的格式。
112行到第116行也是继续填充缓冲区。
119行到第138行是处理16进制的字符串,像0x12AB这样的字符串。通过n = (n<<4) + d来计算值有多大,最后调用函数icon来识别这个数字串是什么结尾的标识,比如0x12ABL这样的字符串。并且把它保存到符号tsym里,最后返回ICON常量记号。
第139行到第157行是识别8进制数或浮点数字符串。在第141行到第148行里计算8进制值的大小。在第149行到第153行是识别以0开头的浮点数。
158行到第175行是处理10进制的字符串和不是0开头的浮点数。
在数字串处理里,还需要判断值的大小,如果超出表示值,就需要给出错误提示。
 
 
#177         case '.':
#178               if (rcp[0] == '.' && rcp[1] == '.') {
#179                    cp += 2;
#180                    return ELLIPSIS;
#181               }
#182               if ((map[*rcp]&DIGIT) == 0)
#183                    return '.';
#184               if (limit - rcp < MAXLINE) {
#185                    cp = rcp - 1;
#186                    fillbuf();
#187                    rcp = ++cp;
#188               }
#189               assert(cp == rcp);
#190               cp = rcp - 1;
#191               token = (char *)cp;
#192               tsym = fcon();
#193               return FCON;
第178行到第181行是识别C里的省略号“...”。
第182行到第193行是识别浮点数的处理。
 
 
#194         case 'L':
#195               if (*rcp == '/'') {
#196                    unsigned int *s = scon(*cp, wcput, wcbuf);
#197                    if (s - wcbuf > 2)
#198                          warning("excess characters in wide-character literal ignored/n");
#199                    tval.type = widechar;
#200                    tval.u.c.v.u = wcbuf[0];
#201                    tsym = &tval;
#202                    return ICON;
#203               } else if (*rcp == '"') {
#204                    unsigned int *s = scon(*cp, wcput, wcbuf);
#205                    tval.type = array(widechar, s - wcbuf, 0);
#206                    tval.u.c.v.p = wcbuf;
#207                    tsym = &tval;
#208                    return SCON;
#209               } else
#210                    goto id;
第195行到第210行都是识别宽字符或宽字符串,如果L后面既不是单引号也不是双引号,就跳到id标号按变量ID处理。
 
#211         case '/'': {
#212               char *s = scon(*--cp, cput, cbuf);
#213               if (s - cbuf > 2)
#214                    warning("excess characters in multibyte character literal ignored/n");
#215               tval.type = inttype;
#216               if (chartype->op == INT)
#217                    tval.u.c.v.i = extend(cbuf[0], chartype);
#218               else
#219                    tval.u.c.v.i = cbuf[0]&0xFF;
#220               tsym = &tval;
#221               return ICON;
#222               }
上面是单引号的处理。
 
#223         case '"': {
#224               char *s = scon(*--cp, cput, cbuf);
#225               tval.type = array(chartype, s - cbuf, 0);
#226               tval.u.c.v.p = cbuf;
#227               tsym = &tval;
#228               return SCON;
#229               }
上面是双引号字符串的处理。
 
#230         case 'a':
#231               if (rcp[0] == 'u'
#232               && rcp[1] == 't'
#233               && rcp[2] == 'o'
#234               && !(map[rcp[3]]&(DIGIT|LETTER))) {
#235                    cp = rcp + 3;
#236                    return AUTO;
#237               }
#238               goto id;
#239         case 'b':
#240               if (rcp[0] == 'r'
#241               && rcp[1] == 'e'
#242               && rcp[2] == 'a'
#243               && rcp[3] == 'k'
#244               && !(map[rcp[4]]&(DIGIT|LETTER))) {
#245                    cp = rcp + 4;
#246                    return BREAK;
#247               }
#248               goto id;
#249         case 'c':
#250               if (rcp[0] == 'a'
#251               && rcp[1] == 's'
#252               && rcp[2] == 'e'
#253               && !(map[rcp[3]]&(DIGIT|LETTER))) {
#254                    cp = rcp + 3;
#255                    return CASE;
#256               }
#257               if (rcp[0] == 'h'
#258               && rcp[1] == 'a'
#259               && rcp[2] == 'r'
#260               && !(map[rcp[3]]&(DIGIT|LETTER))) {
#261                    cp = rcp + 3;
#262                    tsym = chartype->u.sym;
#263                    return CHAR;
#264               }
#265               if (rcp[0] == 'o'
#266               && rcp[1] == 'n'
#267               && rcp[2] == 's'
#268               && rcp[3] == 't'
#269               && !(map[rcp[4]]&(DIGIT|LETTER))) {
#270                    cp = rcp + 4;
#271                    return CONST;
#272               }
#273               if (rcp[0] == 'o'
#274               && rcp[1] == 'n'
#275               && rcp[2] == 't'
#276               && rcp[3] == 'i'
#277               && rcp[4] == 'n'
#278               && rcp[5] == 'u'
#279               && rcp[6] == 'e'
#280               && !(map[rcp[7]]&(DIGIT|LETTER))) {
#281                    cp = rcp + 7;
#282                    return CONTINUE;
#283               }
#284               goto id;
#285         case 'd':
#286               if (rcp[0] == 'e'
#287               && rcp[1] == 'f'
#288               && rcp[2] == 'a'
#289               && rcp[3] == 'u'
#290               && rcp[4] == 'l'
#291               && rcp[5] == 't'
#292               && !(map[rcp[6]]&(DIGIT|LETTER))) {
#293                    cp = rcp + 6;
#294                    return DEFAULT;
#295               }
#296               if (rcp[0] == 'o'
#297               && rcp[1] == 'u'
#298               && rcp[2] == 'b'
#299               && rcp[3] == 'l'
#300               && rcp[4] == 'e'
#301               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#302                    cp = rcp + 5;
#303                    tsym = doubletype->u.sym;
#304                    return DOUBLE;
#305               }
#306               if (rcp[0] == 'o'
#307               && !(map[rcp[1]]&(DIGIT|LETTER))) {
#308                    cp = rcp + 1;
#309                    return DO;
#310               }
#311               goto id;
#312         case 'e':
#313               if (rcp[0] == 'l'
#314               && rcp[1] == 's'
#315               && rcp[2] == 'e'
#316               && !(map[rcp[3]]&(DIGIT|LETTER))) {
#317                    cp = rcp + 3;
#318                    return ELSE;
#319               }
#320               if (rcp[0] == 'n'
#321               && rcp[1] == 'u'
#322               && rcp[2] == 'm'
#323               && !(map[rcp[3]]&(DIGIT|LETTER))) {
#324                    cp = rcp + 3;
#325                    return ENUM;
#326               }
#327               if (rcp[0] == 'x'
#328               && rcp[1] == 't'
#329               && rcp[2] == 'e'
#330               && rcp[3] == 'r'
#331               && rcp[4] == 'n'
#332               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#333                    cp = rcp + 5;
#334                    return EXTERN;
#335               }
#336               goto id;
#337         case 'f':
#338               if (rcp[0] == 'l'
#339               && rcp[1] == 'o'
#340               && rcp[2] == 'a'
#341               && rcp[3] == 't'
#342               && !(map[rcp[4]]&(DIGIT|LETTER))) {
#343                    cp = rcp + 4;
#344                    tsym = floattype->u.sym;
#345                    return FLOAT;
#346               }
#347               if (rcp[0] == 'o'
#348               && rcp[1] == 'r'
#349               && !(map[rcp[2]]&(DIGIT|LETTER))) {
#350                    cp = rcp + 2;
#351                    return FOR;
#352               }
#353             goto id;
#354         case 'g':
#355               if (rcp[0] == 'o'
#356               && rcp[1] == 't'
#357               && rcp[2] == 'o'
#358               && !(map[rcp[3]]&(DIGIT|LETTER))) {
#359                    cp = rcp + 3;
#360                    return GOTO;
#361               }
#362               goto id;
#363         case 'l':
#364             if (rcp[0] == 'o'
#365               && rcp[1] == 'n'
#366               && rcp[2] == 'g'
#367               && !(map[rcp[3]]&(DIGIT|LETTER))) {
#368                    cp = rcp + 3;
#369                    return LONG;
#370               }
#371               goto id;
#372         case 'r':
#373               if (rcp[0] == 'e'
#374               && rcp[1] == 'g'
#375               && rcp[2] == 'i'
#376               && rcp[3] == 's'
#377               && rcp[4] == 't'
#378               && rcp[5] == 'e'
#379               && rcp[6] == 'r'
#380               && !(map[rcp[7]]&(DIGIT|LETTER))) {
#381                    cp = rcp + 7;
#382                    return REGISTER;
#383               }
#384               if (rcp[0] == 'e'
#385               && rcp[1] == 't'
#386               && rcp[2] == 'u'
#387               && rcp[3] == 'r'
#388               && rcp[4] == 'n'
#389               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#390                    cp = rcp + 5;
#391                    return RETURN;
#392               }
#393               goto id;
#394         case 's':
#395               if (rcp[0] == 'h'
#396               && rcp[1] == 'o'
#397               && rcp[2] == 'r'
#398               && rcp[3] == 't'
#399               && !(map[rcp[4]]&(DIGIT|LETTER))) {
#400                    cp = rcp + 4;
#401                    return SHORT;
#402               }
#403               if (rcp[0] == 'i'
#404               && rcp[1] == 'g'
#405               && rcp[2] == 'n'
#406               && rcp[3] == 'e'
#407               && rcp[4] == 'd'
#408               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#409                    cp = rcp + 5;
#410                    return SIGNED;
#411               }
#412               if (rcp[0] == 'i'
#413               && rcp[1] == 'z'
#414               && rcp[2] == 'e'
#415               && rcp[3] == 'o'
#416               && rcp[4] == 'f'
#417               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#418                    cp = rcp + 5;
#419                    return SIZEOF;
#420               }
#421               if (rcp[0] == 't'
#422               && rcp[1] == 'a'
#423                && rcp[2] == 't'
#424               && rcp[3] == 'i'
#425               && rcp[4] == 'c'
#426               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#427                    cp = rcp + 5;
#428                    return STATIC;
#429               }
#430               if (rcp[0] == 't'
#431               && rcp[1] == 'r'
#432               && rcp[2] == 'u'
#433               && rcp[3] == 'c'
#434               && rcp[4] == 't'
#435               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#436                    cp = rcp + 5;
#437                    return STRUCT;
#438               }
#439               if (rcp[0] == 'w'
#440               && rcp[1] == 'i'
#441               && rcp[2] == 't'
#442               && rcp[3] == 'c'
#443               && rcp[4] == 'h'
#444               && !(map[rcp[5]]&(DIGIT|LETTER))) {
#445                    cp = rcp + 5;
#446                    return SWITCH;
#447               }
#448               goto id;
#449         case 't':
#450               if (rcp[0] == 'y'
#451               && rcp[1] == 'p'
#452               && rcp[2] == 'e'
#453               && rcp[3] == 'd'
#454               && rcp[4] == 'e'
#455               && rcp[5] == 'f'
#456               && !(map[rcp[6]]&(DIGIT|LETTER))) {
#457                    cp = rcp + 6;
#458                    return TYPEDEF;
#459               }
#460               goto id;
#461         case 'u':
#462               if (rcp[

抱歉!评论已关闭.