1. 程式人生 > >編譯器DIY——詞法分析

編譯器DIY——詞法分析

在上一篇文章中已經介紹了讀檔案的操作,那麼這一篇文章中將會仔細解釋詞法分析。

在原始檔中解析出的單詞流必須識別為保留字,識別符號,常量,操作符和界符五大類

1.顯然我們需要列舉出所有的保留字,而與保留字最相似的就是識別符號。在C語言中,保留字都以小寫字母開頭,而且其中的字母只能是小寫字母;而識別符號的第一個字元必須為字母(大小寫皆可),後面可以接大小寫字母、數字和字元 ‘_’。在我寫的這個編譯器中,識別符號的長度不能超過99個字元(緩衝區為100個位元組,含結尾的 '\0');C語言實際允許的識別符號長度遠大於此。

2.對於常量,這裡需要注意的是整型和浮點型常量。

3.運算子按照的是下面的表:

C語言運算子

運算子按照優先順序大小由上向下排列,在同一行的運算子具有相同優先順序。第二行是所有的一元運算子。


 

運算子 ｜ 解釋 ｜ 結合方式
() [] -> . ｜ 括號(函式等),陣列,兩種結構成員訪問 ｜ 由左向右
! ~ ++ -- + - * & ｜ 否定,按位否定,增量,減量,正負號,間接,取地址 ｜ 由右向左
* / % ｜ 乘,除,取模 ｜ 由左向右
+ - ｜ 加,減 ｜ 由左向右
<< >> ｜ 左移,右移 ｜ 由左向右
< <= >= > ｜ 小於,小於等於,大於等於,大於 ｜ 由左向右
== != ｜ 等於,不等於 ｜ 由左向右
& ｜ 按位與 ｜ 由左向右
^ ｜ 按位異或 ｜ 由左向右
| ｜ 按位或 ｜ 由左向右
&& ｜ 邏輯與 ｜ 由左向右
|| ｜ 邏輯或 ｜ 由左向右
? : ｜ 條件 ｜ 由右向左
= += -= *= /= %= &= ^= |= <<= >>= ｜ 各種賦值 ｜ 由右向左
, ｜ 逗號(順序) ｜ 由左向右

4.界符:“;”“{}”,單引號,雙引號

接下來我介紹的是對保留字的歸類,為了查詢方便,將保留字按照a-z的順序排好,依據陣列的下標定位,減少尋找的時間

/*
 * keyword.h
 *
 *  Created on: Jun 12, 2014
 *    
 */

#ifndef KEYWORD_H_
#define KEYWORD_H_

/*
 * One entry of a keyword table.
 *
 * keyName points at a string literal, so it is declared const: writing
 * through it would be undefined behavior.
 */
struct keyword{
	const char *keyName;
};

/*
 * Keyword tables, one per possible first character.
 *
 * Every table is terminated by the sentinel entry {"end"}; a lookup walks a
 * table until it reaches that entry.  "end" is not a C keyword, so the
 * sentinel never collides with a real entry.  Letters that start no keyword
 * have a table holding only the sentinel.
 *
 * NOTE(review): these tables are defined (not just declared) in a header, so
 * every translation unit that includes it gets its own private copy.
 */

/* keywords starting with '_' */
static struct keyword key__[]={ {"__int64"}, {"end"} };

static struct keyword key_A[]={ {"auto"}, {"end"} };
static struct keyword key_B[]={ {"break"}, {"end"} };
static struct keyword key_C[]={
		{"case"}, {"char"}, {"const"}, {"continue"},
		{"end"}
};
static struct keyword key_D[]={ {"default"}, {"do"}, {"double"}, {"end"} };
static struct keyword key_E[]={ {"else"}, {"enum"}, {"extern"}, {"end"} };
static struct keyword key_F[]={ {"float"}, {"for"}, {"end"} };
static struct keyword key_G[]={ {"goto"}, {"end"} };
static struct keyword key_H[]={ {"end"} };
static struct keyword key_I[]={ {"if"}, {"int"}, {"end"} };
static struct keyword key_J[]={ {"end"} };
static struct keyword key_K[]={ {"end"} };
static struct keyword key_L[]={ {"long"}, {"end"} };
static struct keyword key_M[]={ {"end"} };
static struct keyword key_N[]={ {"end"} };
static struct keyword key_O[]={ {"end"} };
static struct keyword key_P[]={ {"end"} };
static struct keyword key_Q[]={ {"end"} };
static struct keyword key_R[]={ {"register"}, {"return"}, {"end"} };
static struct keyword key_S[]={
		{"short"}, {"signed"}, {"sizeof"}, {"static"}, {"struct"}, {"switch"},
		{"end"}
};
static struct keyword key_T[]={ {"typedef"}, {"end"} };
static struct keyword key_U[]={ {"union"}, {"unsigned"}, {"end"} };
static struct keyword key_V[]={ {"void"}, {"volatile"}, {"end"} };
static struct keyword key_W[]={ {"while"}, {"end"} };
static struct keyword key_X[]={ {"end"} };
static struct keyword key_Y[]={ {"end"} };
static struct keyword key_Z[]={ {"end"} };

/*
 * Index of the tables above; it has 27 rows.
 * Row 0 is the '_' table and row (c - 'a' + 1) is the table for the
 * lowercase letter c.
 */
static struct keyword *keywords[]={
		key__,key_A,key_B,key_C,key_D,key_E,
		key_F,key_G,key_H,key_I,key_J,key_K,
		key_L,key_M,key_N,key_O,key_P,key_Q,
		key_R,key_S,key_T,key_U,key_V,key_W,
		key_X,key_Y,key_Z
};

#endif /* KEYWORD_H_ */

下面是詞法分析的原始碼;

/*
 * lex.h
 *
 *  Created on: Jun 13, 2014
 *     
 */
#include "input.h"
#include "keyword.h"

/*
 * ASCII character-class predicates used by the lexer.
 *
 * Each macro yields non-zero when c belongs to the class and 0 otherwise.
 * The argument is evaluated more than once, so do not pass an expression
 * with side effects (for example isDigit(*p++)).
 */
#define isDigit(c)			((c) >= '0' && (c) <= '9')
#define isUpperLetter(c)	((c) >= 'A' && (c) <= 'Z')
#define isLowerLetter(c)	((c) >= 'a' && (c) <= 'z')
#define isLetter(c)			(isUpperLetter(c) || isLowerLetter(c))


/*
 * lex.c
 *
 *  Created on: Jun 13, 2014
 *      
 */
#include "zcc.h"
#include "lex.h"

#define curr source.cursor

/* Capacity of the lexeme buffer, including the terminating '\0'. */
enum { LEX_BUF_SIZE = 100 };

/*
 * Operators made of two or more characters.  Longer spellings come first so
 * that the first entry matching at the cursor is also the longest match.
 *
 * String literals are not lexed as units (their quotes are reported as
 * delimiters and their contents are tokenized like ordinary code), so the
 * printf conversions %d %c %f %ld %lf and the two-character escape \n are
 * listed here and reported as operators as well.
 */
static const char *const lexLongOperators[] = {
	"<<=", ">>=", "%ld", "%lf",
	"<<", "<=", ">>", ">=", "==", "!=",
	"->", "--", "-=", "++", "+=",
	"*=", "/=", "%=", "^=",
	"&&", "&=", "||", "|=",
	"%d", "%c", "%f", "\\n"
};

/*
 * Append the character at the cursor to buf and advance the cursor.
 * When buf is full the character is still consumed but not stored, so an
 * over-long lexeme is truncated instead of overflowing the buffer.
 */
static void lexTake(char *buf, int *len)
{
	if (*len < LEX_BUF_SIZE - 1) {
		buf[(*len)++] = *curr;
	}
	curr++;
}

/*
 * Return the length of op when the input at the cursor starts with op,
 * otherwise 0.  Input characters are read one at a time and only after all
 * previous ones matched, so the comparison never looks past a mismatch.
 */
static int lexStartsWith(const char *op)
{
	int k;

	for (k = 0; op[k] != '\0'; k++) {
		if (curr[k] != op[k]) {
			return 0;
		}
	}
	return k;
}

/*
 * Return 1 when word is listed in the keyword tables, otherwise 0.
 * Row 0 of keywords holds the words starting with '_' and row
 * (c - 'a' + 1) those starting with the lowercase letter c; every row ends
 * with the sentinel entry "end".
 */
static int lexIsKeyword(const char *word)
{
	int row;
	int i;

	if (*word == '_') {
		row = 0;
	} else if (*word >= 'a' && *word <= 'z') {
		row = *word - 'a' + 1;
	} else {
		/* keywords never start with an uppercase letter */
		return 0;
	}
	for (i = 0; strcmp(keywords[row][i].keyName, "end") != 0; i++) {
		if (strcmp(keywords[row][i].keyName, word) == 0) {
			return 1;
		}
	}
	return 0;
}

/*
 * Scan the next token starting at source.cursor, print one line that
 * classifies it, and leave the cursor just behind it.
 *
 * Whitespace (space, tab, CR, LF) and both comment forms are skipped.
 * Lexemes longer than LEX_BUF_SIZE - 1 characters are truncated in the
 * printed output.  A character that starts no known token is reported and
 * consumed, so that repeated calls always make progress.
 *
 * Returns 1 after printing a token (or an unknown character), and -1 when
 * END_OF_FILE is reached.
 *
 * NOTE(review): assumes the input buffer is terminated by END_OF_FILE;
 * confirm against input.h.
 */
int getToken() {
	char a[LEX_BUF_SIZE];
	int a_length;
	int n;
	int k;

	for (;;) {
		/* skip blanks between tokens */
		while (*curr == ' ' || *curr == '\n' || *curr == '\t' || *curr == '\r') {
			curr++;
		}
		if (*curr == END_OF_FILE) {
			return -1;
		}
		a_length = 0;

		/* identifier or keyword */
		if (isLowerLetter(*curr) || isUpperLetter(*curr) || *curr == '_') {
			do {
				lexTake(a, &a_length);
			} while (isDigit(*curr) || isUpperLetter(*curr)
					|| isLowerLetter(*curr) || *curr == '_');
			a[a_length] = '\0';
			if (lexIsKeyword(a)) {
				printf("keyword is %s\n", a);
			} else {
				printf("Identify is %s\n", a);
			}
			return 1;
		}

		/* integer or floating-point constant */
		if (isDigit(*curr)) {
			do {
				lexTake(a, &a_length);
			} while (isDigit(*curr));
			if (*curr == '.') {
				/* fraction: the '.' followed by any further digits */
				do {
					lexTake(a, &a_length);
				} while (isDigit(*curr));
				a[a_length] = '\0';
				printf("float number is %s\n", a);
			} else {
				a[a_length] = '\0';
				printf("number is %s\n", a);
			}
			return 1;
		}

		/* line comment: drop everything up to the end of the line */
		if (*curr == '/' && *(curr + 1) == '/') {
			while (*curr != '\n' && *curr != END_OF_FILE) {
				curr++;
			}
			continue;	/* rescan; END_OF_FILE is handled at the loop top */
		}

		/* block comment: drop everything up to the closing marker */
		if (*curr == '/' && *(curr + 1) == '*') {
			curr += 2;
			while (*curr != END_OF_FILE) {
				if (*curr == '*' && *(curr + 1) == '/') {
					curr += 2;
					break;
				}
				curr++;
			}
			continue;
		}

		/* operators of two or more characters, longest match first */
		for (k = 0; k < (int)(sizeof lexLongOperators / sizeof lexLongOperators[0]); k++) {
			n = lexStartsWith(lexLongOperators[k]);
			if (n > 0) {
				while (n-- > 0) {
					lexTake(a, &a_length);
				}
				a[a_length] = '\0';
				printf("Operator is %s\n", a);
				return 1;
			}
		}

		switch (*curr) {
		/* single-character operators */
		case '(': case ')': case '[': case ']':
		case '<': case '>': case '=': case '!': case '~':
		case '+': case '-': case '*': case '/': case '%':
		case '&': case '|': case '^':
		case '.': case '?': case ':': case ',': case '\\':
			lexTake(a, &a_length);
			a[a_length] = '\0';
			printf("Operator is %s\n", a);
			return 1;

		/* delimiters */
		case '{': case '}': case ';': case '\'': case '\"':
			lexTake(a, &a_length);
			a[a_length] = '\0';
			printf("Delimiter is %s\n", a);
			return 1;

		default:
			/* not part of any token: report it and step over it */
			printf("Unknown character is %c\n", *curr);
			curr++;
			return 1;
		}
	}
}

這裡實現了將單詞流分成五類,並把每個單詞打印出來;後面的語法分析將會使用這裡得到的單詞流。

忘了說了,我將自己寫的編譯器命名為:ZCC,標頭檔案都包含在zcc.h中(*^__^*) 嘻嘻……,想寫個和 gcc 一樣神奇的玩意。

最後看測試文件:

struct  Student{
   int a;
   char* name;
}

int main()
{
    int a=123;
    float a2=1.2345677;
    int b=1+3;
    for(int i=0; i < 100; i++)
    		a+=i;
    printf("%d\n", a);
    return 0;
}


測試結果:

keyword is struct
Identify is Student
Delimiter is {
keyword is int
Identify is a
Delimiter is ;
keyword is char
Operator is *
Identify is name
Delimiter is ;
Delimiter is }
keyword is int
Identify is main
Operator is (
Operator is )
Delimiter is {
keyword is int
Identify is a
Operator is =
number is 123
Delimiter is ;
keyword is float
Identify is a2
Operator is =
float number is 1.2345677
Delimiter is ;
keyword is int
Identify is b
Operator is =
number is 1
Operator is +
number is 3
Delimiter is ;
keyword is for
Operator is (
keyword is int
Identify is i
Operator is =
number is 0
Delimiter is ;
Identify is i
Operator is <
number is 100
Delimiter is ;
Identify is i
Operator is ++
Operator is )
Identify is a
Operator is +=
Identify is i
Delimiter is ;
Identify is printf
Operator is (
Delimiter is "
Operator is %d
Operator is \n
Delimiter is "
Operator is ,
Identify is a
Operator is )
Delimiter is ;
keyword is return
number is 0
Delimiter is ;
Delimiter is }

做到這裡,可以告一小段落了,接下來做的事情就是語法分析。