編譯器DIY——詞法分析
阿新 • • 發佈:2018-12-30
在上一篇文章中已經介紹了讀檔案的操作,那麼這一篇文章中將會仔細解釋詞法分析。
在原始檔中解析出的單詞流必須識別為保留字,識別符號,常量,操作符和界符五大類
1.顯然我們需要列舉出所有的保留字,而與保留字形式最相似的就是識別符號。在C語言中,保留字都是以小寫字母開頭,而且其中的字母只能是小寫字母;而識別符號的第一個字元必須為字母(大小寫皆可),後面可以接大小寫字母、數字和字元 ‘_’。在我寫的這個編譯器中,識別符號的長度不能超過 99 個字元(緩衝區大小為 100,含結尾的 '\0'),而C語言標準允許的識別符號長度遠遠大於此。
2.對於常量,這裡需要注意的是整型和浮點型常量。
3.運算子按照的是下面的表:
C語言運算子表
運算子按照優先順序大小由上向下排列,在同一行的運算子具有相同優先順序。第二行是所有的一元運算子。
| `() [] -> .` | 括號(函式等),陣列,兩種結構成員訪問 |
| `! ~ ++ -- + - * &` | 否定,按位否定,增量,減量,正負號,間接,取地址 |
| `* / %` | 乘,除,取模 |
| `+ -` | 加,減 |
| `<< >>` | 左移,右移 |
| `< <= >= >` | 小於,小於等於,大於等於,大於 |
| `== !=` | 等於,不等於 |
| `&` | 按位與 |
| `^` | 按位異或 |
| `|` | 按位或 |
| `&&` | 邏輯與 |
| `||` | 邏輯或 |
| `? :` | 條件 |
| `= += -= *= /= %= &= ^= |= <<= >>=` | 各種賦值 |
| `,` | 逗號(順序) |
4.界符:“;”“{}”,單引號,雙引號
接下來我介紹的是對保留字的歸類:為了查詢方便,將保留字按照首字母 a-z 的順序分組排好,查詢時由單詞的首字母直接算出陣列的下標(下標 0 留給以 '_' 開頭的保留字,'a' 對應下標 1,依此類推),只需在對應的那一組中比較,從而減少尋找的時間。
/*
 * keyword.h
 *
 *  Created on: Jun 12, 2014
 *
 * Reserved-word tables for the lexer, grouped by the first character of the
 * word so that a lookup only has to scan one short group.
 *
 * Layout:
 *   - keywords[0]      holds the words that start with '_'
 *   - keywords[1..26]  hold the words that start with 'a'..'z'
 *     (index = first_letter - 'a' + 1)
 *   - every group is terminated by the sentinel entry {"end"}; a scan must
 *     stop when it reaches that entry.
 */
#ifndef KEYWORD_H_
#define KEYWORD_H_

/* One reserved word. keyName points at a string literal and is read-only. */
struct keyword {
    const char *keyName;
};

static struct keyword key__[] = {
    {"__int64"},
    {"end"}
};

static struct keyword key_A[] = {
    {"auto"},
    {"end"}
};

static struct keyword key_B[] = {
    {"break"},
    {"end"}
};

static struct keyword key_C[] = {
    {"case"},
    {"char"},
    {"const"},
    {"continue"},
    {"end"}
};

static struct keyword key_D[] = {
    {"default"},
    {"do"},
    {"double"},
    {"end"}
};

static struct keyword key_E[] = {
    {"else"},
    {"enum"},
    {"extern"},
    {"end"}
};

static struct keyword key_F[] = {
    {"float"},
    {"for"},
    {"end"}
};

static struct keyword key_G[] = {
    {"goto"},
    {"end"}
};

static struct keyword key_H[] = {
    {"end"}
};

static struct keyword key_I[] = {
    {"if"},
    {"int"},
    {"end"}
};

static struct keyword key_J[] = {
    {"end"}
};

static struct keyword key_K[] = {
    {"end"}
};

static struct keyword key_L[] = {
    {"long"},
    {"end"}
};

static struct keyword key_M[] = {
    {"end"}
};

static struct keyword key_N[] = {
    {"end"}
};

static struct keyword key_O[] = {
    {"end"}
};

static struct keyword key_P[] = {
    {"end"}
};

static struct keyword key_Q[] = {
    {"end"}
};

static struct keyword key_R[] = {
    {"register"},
    {"return"},
    {"end"}
};

static struct keyword key_S[] = {
    {"short"},
    {"signed"},
    {"sizeof"},
    {"static"},
    {"struct"},
    {"switch"},
    {"end"}
};

static struct keyword key_T[] = {
    {"typedef"},
    {"end"}
};

static struct keyword key_U[] = {
    {"union"},
    {"unsigned"},
    {"end"}
};

static struct keyword key_V[] = {
    {"void"},
    {"volatile"},
    {"end"}
};

static struct keyword key_W[] = {
    {"while"},
    {"end"}
};

static struct keyword key_X[] = {
    {"end"}
};

static struct keyword key_Y[] = {
    {"end"}
};

static struct keyword key_Z[] = {
    {"end"}
};

/* 27 groups: one for '_' followed by one per letter 'a'..'z'. */
static struct keyword *keywords[] = {
    key__, key_A, key_B, key_C, key_D, key_E,
    key_F, key_G, key_H, key_I, key_J, key_K,
    key_L, key_M, key_N, key_O, key_P, key_Q,
    key_R, key_S, key_T, key_U, key_V, key_W,
    key_X, key_Y, key_Z
};

#endif /* KEYWORD_H_ */
下面是詞法分析的原始碼;
/*
* lex.h
*
* Created on: Jun 13, 2014
*
*/
#include "input.h"
#include "keyword.h"
/*
 * ASCII character-class predicates used by the lexer.
 *
 * Each macro yields non-zero when its argument falls in the class and zero
 * otherwise. The argument is evaluated more than once, so do not pass an
 * expression with side effects (e.g. isDigit(*p++)).
 */
#define isDigit(c) ((c) >= '0' && (c) <= '9')
#define isUpperLetter(c) ((c) >= 'A' && (c) <= 'Z')
#define isLowerLetter(c) ((c) >= 'a' && (c) <= 'z')
/* Forwards the argument to both letter tests; the bare macro names alone do not expand. */
#define isLetter(c) (isUpperLetter(c) || isLowerLetter(c))
/*
* lex.c
*
* Created on: Jun 13, 2014
*
*/
#include "zcc.h"
#include "lex.h"
/*
 * Shorthand for the scan position: `curr` expands to source.cursor, which
 * getToken() dereferences to inspect the current character and increments to
 * consume it.
 * NOTE(review): `source` is not declared here; presumably it comes from
 * input.h / zcc.h -- confirm.
 */
#define curr source.cursor
/* Size of the lexeme buffer, including the terminating '\0'. */
enum { LEXEME_CAPACITY = 100 };

/*
 * Stores the character under the cursor in lexeme and advances the cursor.
 * Once the buffer is full the character is still consumed but no longer
 * stored, so an over-long token is truncated instead of overflowing.
 */
static void takeChar(char *lexeme, int *length)
{
    if (*length < LEXEME_CAPACITY - 1) {
        lexeme[*length] = *curr;
        (*length)++;
    }
    curr++;
}

/*
 * Consumes the character under the cursor only if it equals expected.
 * Returns 1 when the character was consumed, 0 otherwise.
 */
static int acceptChar(char *lexeme, int *length, char expected)
{
    if (*curr != expected) {
        return 0;
    }
    takeChar(lexeme, length);
    return 1;
}

/*
 * Skips blanks (space, tab, CR, LF), "//" line comments and block comments.
 * Returns 1 when the cursor rests on the first character of a token,
 * 0 when END_OF_FILE was reached (including inside an unterminated comment).
 */
static int skipBlanksAndComments(void)
{
    for (;;) {
        while (*curr == ' ' || *curr == '\n' || *curr == '\t' || *curr == '\r') {
            curr++;
        }
        if (*curr == END_OF_FILE) {
            return 0;
        }
        if (*curr != '/') {
            return 1;
        }
        if (*(curr + 1) == '/') {
            /* Line comment: stop on the newline; the blank loop consumes it. */
            while (*curr != '\n') {
                if (*curr == END_OF_FILE) {
                    return 0;
                }
                curr++;
            }
        } else if (*(curr + 1) == '*') {
            /* Block comment: start searching after the opener so "/*\/" does not close itself. */
            curr += 2;
            for (;;) {
                if (*curr == END_OF_FILE) {
                    return 0;
                }
                if (*curr == '*' && *(curr + 1) == '/') {
                    curr += 2;
                    break;
                }
                curr++;
            }
        } else {
            /* A lone '/' (or "/=") is an operator, not a comment. */
            return 1;
        }
    }
}

/*
 * Returns 1 when word is listed in the keyword group selected by its first
 * character, 0 otherwise. Group 0 serves '_', groups 1..26 serve 'a'..'z';
 * words starting with any other character are never keywords.
 */
static int isKeyword(const char *word)
{
    int group;
    int i;

    if (*word == '_') {
        group = 0;
    } else if (isLowerLetter(*word)) {
        group = *word - 'a' + 1;
    } else {
        return 0;
    }
    /* Each group ends with the sentinel entry "end". */
    for (i = 0; strcmp(keywords[group][i].keyName, "end") != 0; i++) {
        if (strcmp(keywords[group][i].keyName, word) == 0) {
            return 1;
        }
    }
    return 0;
}

/*
 * Consumes the longest operator that starts at the cursor and stores it in
 * lexeme. The caller guarantees the cursor is on an operator start character.
 *
 * NOTE: string literals are not lexed as a unit, so "%d", "%c", "%f", "%ld",
 * "%lf" and "\n" are reported as single operators. A side effect is that
 * source such as `a%d` (modulo by a variable named d) is read as `a` `%d`.
 */
static void scanOperator(char *lexeme, int *length)
{
    char first = *curr;

    takeChar(lexeme, length);
    switch (first) {
    case '<':
    case '>':
        acceptChar(lexeme, length, first);  /* << or >> */
        acceptChar(lexeme, length, '=');    /* <= >= <<= >>= */
        break;
    case '=':
    case '!':
    case '*':
    case '/':
    case '^':
        acceptChar(lexeme, length, '=');    /* == != *= /= ^= */
        break;
    case '+':
    case '&':
    case '|':
        if (!acceptChar(lexeme, length, first)) {   /* ++ && || */
            acceptChar(lexeme, length, '=');        /* += &= |= */
        }
        break;
    case '-':
        if (!acceptChar(lexeme, length, '>')        /* -> */
            && !acceptChar(lexeme, length, '-')) {  /* -- */
            acceptChar(lexeme, length, '=');        /* -= */
        }
        break;
    case '%':
        if (acceptChar(lexeme, length, '=')         /* %= */
            || acceptChar(lexeme, length, 'd')
            || acceptChar(lexeme, length, 'c')
            || acceptChar(lexeme, length, 'f')) {
            break;
        }
        if (acceptChar(lexeme, length, 'l')) {      /* %ld or %lf */
            if (!acceptChar(lexeme, length, 'd')) {
                acceptChar(lexeme, length, 'f');
            }
        }
        break;
    case '\\':
        acceptChar(lexeme, length, 'n');    /* \n inside a string literal */
        break;
    default:
        /* ( ) [ ] . ~ ? : , are always single-character operators. */
        break;
    }
}

/*
 * Scans the next token at source.cursor, prints its class and text to
 * stdout, and leaves the cursor just past the token.
 *
 * Token classes printed: "keyword", "Identify" (identifier), "number",
 * "float number", "Operator" and "Delimiter". Blanks and comments are
 * skipped and never reported. A character that starts no known token is
 * consumed and reported as "Unknown character" so the caller always makes
 * progress.
 *
 * Tokens longer than LEXEME_CAPACITY - 1 characters are consumed in full but
 * printed truncated.
 *
 * Returns 1 after a token (or unknown character) has been printed, and -1
 * when END_OF_FILE is reached before any token.
 */
int getToken() {
    char a[LEXEME_CAPACITY];
    int a_length = 0;

    if (!skipBlanksAndComments()) {
        return -1;
    }

    /* Identifier or keyword: a letter or '_' followed by letters, digits, '_'. */
    if (isUpperLetter(*curr) || isLowerLetter(*curr) || *curr == '_') {
        do {
            takeChar(a, &a_length);
        } while (isDigit(*curr) || isUpperLetter(*curr) || isLowerLetter(*curr)
                || *curr == '_');
        a[a_length] = '\0';
        if (isKeyword(a)) {
            printf("keyword is %s\n", a);
        } else {
            printf("Identify is %s\n", a);
        }
        return 1;
    }

    /* Integer constant, or floating constant when a '.' follows the digits. */
    if (isDigit(*curr)) {
        do {
            takeChar(a, &a_length);
        } while (isDigit(*curr));
        if (*curr == '.') {
            do {
                takeChar(a, &a_length);
            } while (isDigit(*curr));
            a[a_length] = '\0';
            printf("float number is %s\n", a);
            return 1;
        }
        a[a_length] = '\0';
        printf("number is %s\n", a);
        return 1;
    }

    switch (*curr) {
    case '{':
    case '}':
    case ';':
    case '\'':
    case '\"':
        takeChar(a, &a_length);
        a[a_length] = '\0';
        printf("Delimiter is %s\n", a);
        return 1;
    case '<':
    case '>':
    case '=':
    case '(':
    case ')':
    case '[':
    case ']':
    case '-':
    case '.':
    case '!':
    case '~':
    case '+':
    case '*':
    case '&':
    case '/':
    case '%':
    case '^':
    case '|':
    case '?':
    case ':':
    case ',':
    case '\\':
        scanOperator(a, &a_length);
        a[a_length] = '\0';
        printf("Operator is %s\n", a);
        return 1;
    default:
        /* Consume the character so repeated calls cannot loop on it forever. */
        takeChar(a, &a_length);
        a[a_length] = '\0';
        printf("Unknown character is %s\n", a);
        return 1;
    }
}
這裡實現了將單詞流分成五類,並將每個單詞打印出來;後面的語法分析將會使用這裡得到的單詞流。
忘了說了,我將自己寫的編譯器命名為 ZCC,標頭檔案都包含在 zcc.h 中(*^__^*) 嘻嘻……,想寫個類似於 gcc 一樣神奇的玩意。
最後看測試文件:
struct Student{
int a;
char* name;
}
int main()
{
int a=123;
float a2=1.2345677;
int b=1+3;
for(int i=0; i < 100; i++)
a+=i;
printf("%d\n", a);
return 0;
}
測試結果:
keyword is struct
Identify is Student
Delimiter is {
keyword is int
Identify is a
Delimiter is ;
keyword is char
Operator is *
Identify is name
Delimiter is ;
Delimiter is }
keyword is int
Identify is main
Operator is (
Operator is )
Delimiter is {
keyword is int
Identify is a
Operator is =
number is 123
Delimiter is ;
keyword is float
Identify is a2
Operator is =
float number is 1.2345677
Delimiter is ;
keyword is int
Identify is b
Operator is =
number is 1
Operator is +
number is 3
Delimiter is ;
keyword is for
Operator is (
keyword is int
Identify is i
Operator is =
number is 0
Delimiter is ;
Identify is i
Operator is <
number is 100
Delimiter is ;
Identify is i
Operator is ++
Operator is )
Identify is a
Operator is +=
Identify is i
Delimiter is ;
Identify is printf
Operator is (
Delimiter is "
Operator is %d
Operator is \n
Delimiter is "
Operator is ,
Identify is a
Operator is )
Delimiter is ;
keyword is return
number is 0
Delimiter is ;
Delimiter is }
做到這裡,可以告一小段落了,接下來做的事情就是語法分析。