編譯器DIY——詞法分析

阿新 • • 發佈：2018-12-30

在上一篇文章中已經介紹了讀檔案的操作，那麼這一篇文章中將會仔細解釋詞法分析。

在原始檔中解析出的單詞流必須識別為保留字，識別符號，常量，操作符和界符五大類

1.顯然我們需要列舉出所有的保留字，而與保留字最相似的就是識別符號。在C語言中，保留字全部由小寫字母組成；而識別符號的第一個字元必須為字母（大小寫皆可）或字元 '_'，後面可以接大小寫字母、數字和字元 '_'。在我寫的這個編譯器中，識別符號的長度不能超過100（緩衝區為 char a[100]，含結尾的 '\0'），而C語言本身允許的識別符號長度遠遠大於此。

2.對於常量，這裡需要注意的是整型和浮點型常量。

3.運算子按照的是下面的表：

C語言運算子表

運算子按照優先順序大小由上向下排列，在同一行的運算子具有相同優先順序。第二行是所有的一元運算子。

運算子	解釋	結合方式
() [] -> .	括號（函式等），陣列，兩種結構成員訪問	由左向右
! ~ ++ -- + - * &	否定，按位否定，增量，減量，正負號，間接，取地址	由右向左
* / %	乘，除，取模	由左向右
+ -	加，減	由左向右
<< >>	左移，右移	由左向右
< <= >= >	小於，小於等於，大於等於，大於	由左向右
== !=	等於，不等於	由左向右
&	按位與	由左向右
^	按位異或	由左向右
\|	按位或	由左向右
&&	邏輯與	由左向右
\|\|	邏輯或	由左向右
? :	條件	由右向左
= += -= *= /= %= &= ^= \|= <<= >>=	各種賦值	由右向左
,	逗號（順序）	由左向右

4.界符：分號 ";"、大括號 "{" "}"、單引號、雙引號

接下來我介紹的是對保留字的歸類，為了查詢方便，將保留字按照a-z的順序排好，依據陣列的下標定位，減少尋找的時間

/*
 * keyword.h
 *
 *  Created on: Jun 12, 2014
 *    
 */

#ifndef KEYWORD_H_
#define KEYWORD_H_

/*
 * One entry of a keyword table. keyName points at a string literal, so it
 * is declared const and must never be written through.
 */
struct keyword{
	const char *keyName;
};

/*
 * Keyword tables, one per possible first character of a keyword.
 * Every table is terminated by the sentinel entry {"end"}; a lookup walks
 * the table until it meets that sentinel. Tables for letters that start no
 * keyword contain only the sentinel.
 */

/* Keywords starting with '_'. */
static const struct keyword key__[]={
		{"__int64"},
		{"end"}
};

static const struct keyword key_A[]={
		{"auto"},
		{"end"}
};
static const struct keyword key_B[]={
		{"break"},
		{"end"}
};
static const struct keyword key_C[]={
		{"case"},
		{"char"},
		{"const"},
		{"continue"},
		{"end"}
};
static const struct keyword key_D[]={
		{"default"},
		{"do"},
		{"double"},
		{"end"}
};
static const struct keyword key_E[]={
		{"else"},
		{"enum"},
		{"extern"},
		{"end"}
};
static const struct keyword key_F[]={
		{"float"},
		{"for"},
		{"end"}
};
static const struct keyword key_G[]={
		{"goto"},
		{"end"}
};
static const struct keyword key_H[]={
		{"end"}
};
static const struct keyword key_I[]={
		{"if"},
		{"int"},
		{"end"}
};
static const struct keyword key_J[]={
		{"end"}
};
static const struct keyword key_K[]={
		{"end"}
};
static const struct keyword key_L[]={
		{"long"},
		{"end"}
};
static const struct keyword key_M[]={
		{"end"}
};
static const struct keyword key_N[]={
		{"end"}
};
static const struct keyword key_O[]={
		{"end"}
};
static const struct keyword key_P[]={
		{"end"}
};
static const struct keyword key_Q[]={
		{"end"}
};
static const struct keyword key_R[]={
		{"register"},
		{"return"},
		{"end"}
};
static const struct keyword key_S[]={
		{"short"},
		{"signed"},
		{"sizeof"},
		{"static"},
		{"struct"},
		{"switch"},
		{"end"}
};
static const struct keyword key_T[]={
		{"typedef"},
		{"end"}
};
static const struct keyword key_U[]={
		{"union"},
		{"unsigned"},
		{"end"}
};
static const struct keyword key_V[]={
		{"void"},
		{"volatile"},
		{"end"}
};
static const struct keyword key_W[]={
		{"while"},
		{"end"}
};
static const struct keyword key_X[]={
		{"end"}
};
static const struct keyword key_Y[]={
		{"end"}
};
static const struct keyword key_Z[]={
		{"end"}
};

/*
 * Index of all keyword tables (27 entries): slot 0 holds the '_' table and
 * slots 1..26 hold the tables for 'a'..'z', i.e. the table for a lower-case
 * first letter c is keywords[c - 'a' + 1].
 */
static const struct keyword *const keywords[]={
		key__,key_A,key_B,key_C,key_D,key_E,
		key_F,key_G,key_H,key_I,key_J,key_K,
		key_L,key_M,key_N,key_O,key_P,key_Q,
		key_R,key_S,key_T,key_U,key_V,key_W,
		key_X,key_Y,key_Z
};

#endif /* KEYWORD_H_ */

下面是詞法分析的原始碼;

/*
 * lex.h
 *
 *  Created on: Jun 13, 2014
 *     
 */
#include "input.h"
#include "keyword.h"

/*
 * ASCII character-class predicates used by the lexer.
 * Each macro evaluates its argument more than once, so do not pass an
 * expression with side effects (e.g. isDigit(*p++)).
 */
#define isDigit(c)			((c) >= '0' && (c) <= '9')
#define isUpperLetter(c)	((c) >= 'A' && (c) <= 'Z')
#define isLowerLetter(c)	((c) >= 'a' && (c) <= 'z')
/* True for any ASCII letter; forwards the argument to both case checks. */
#define isLetter(c)			(isUpperLetter(c) || isLowerLetter(c))

/*
 * lex.c
 *
 *  Created on: Jun 13, 2014
 *      
 */
#include "zcc.h"
#include "lex.h"

/*
 * Shorthand for the lexer's read position. `source` is not declared in this
 * file; presumably it comes from input.h (via lex.h) -- confirm there.
 */
#define curr source.cursor

/* Size of the lexeme buffer, including the terminating '\0'. */
enum { LEX_BUF_SIZE = 100 };

/*
 * Consume the character under the cursor and append it to the lexeme in
 * buf, which is kept NUL-terminated. Once the buffer is full the character
 * is still consumed but dropped, so an over-long lexeme is truncated
 * instead of overflowing the buffer.
 */
static void lexTake(char *buf, int *len)
{
	if (*len < LEX_BUF_SIZE - 1) {
		buf[*len] = *curr;
		*len += 1;
		buf[*len] = '\0';
	}
	curr++;
}

/*
 * Consume the character under the cursor only when it equals `wanted`.
 * Returns 1 if a character was consumed, 0 otherwise.
 */
static int lexTakeIf(char *buf, int *len, int wanted)
{
	if (*curr != wanted) {
		return 0;
	}
	lexTake(buf, len);
	return 1;
}

/*
 * Return 1 if `word` is listed in the keyword tables, 0 otherwise.
 * Bucket 0 of `keywords` is used for words starting with '_', buckets
 * 1..26 for words starting with 'a'..'z'; every bucket ends with the
 * sentinel entry "end". Words starting with any other character cannot be
 * keywords.
 */
static int lexIsKeyword(const char *word)
{
	int bucket;
	int i;

	if (word[0] == '_') {
		bucket = 0;
	} else if (isLowerLetter(word[0])) {
		bucket = word[0] - 'a' + 1;
	} else {
		return 0;
	}
	for (i = 0; strcmp(keywords[bucket][i].keyName, "end") != 0; i++) {
		if (strcmp(keywords[bucket][i].keyName, word) == 0) {
			return 1;
		}
	}
	return 0;
}

/* Print one classified token as "<kind> is <text>" and return 1. */
static int lexReport(const char *kind, const char *text)
{
	printf("%s is %s\n", kind, text);
	return 1;
}

/*
 * Scan the next token starting at the cursor, print its class and text,
 * and leave the cursor just past it.
 *
 * Whitespace and both comment forms are skipped without producing a
 * token. Token classes printed: "keyword", "Identify", "number",
 * "float number", "Operator", "Delimiter", and "Unknown" for a character
 * that starts no known token (it is consumed so the caller always makes
 * progress).
 *
 * Returns 1 after printing a token, -1 when the end-of-input marker is
 * reached.
 */
int getToken(void)
{
	char a[LEX_BUF_SIZE];
	int a_length;
	int first;

	for (;;) {
		/* Skip blanks, tabs and line ends. */
		while (*curr == ' ' || *curr == '\n' || *curr == '\t' || *curr == '\r') {
			curr++;
		}
		/* Checked after skipping so input that ends without trailing
		 * whitespace is detected as well. */
		if (*curr == END_OF_FILE) {
			return -1;
		}

		a_length = 0;
		a[0] = '\0';
		first = *curr;

		/* Identifier or keyword: letter or '_' followed by letters, digits, '_'. */
		if (isLowerLetter(first) || isUpperLetter(first) || first == '_') {
			do {
				lexTake(a, &a_length);
			} while (isDigit(*curr) || isUpperLetter(*curr) || isLowerLetter(*curr)
					|| *curr == '_');
			return lexReport(lexIsKeyword(a) ? "keyword" : "Identify", a);
		}

		/* Integer, or float when a '.' follows the integer part. */
		if (isDigit(first)) {
			const char *kind = "number";

			do {
				lexTake(a, &a_length);
			} while (isDigit(*curr));
			if (*curr == '.') {
				kind = "float number";
				do {
					lexTake(a, &a_length);
				} while (isDigit(*curr));
			}
			return lexReport(kind, a);
		}

		/* Everything else is decided by the first character, now consumed. */
		lexTake(a, &a_length);
		switch (first) {
		case '<':
		case '>':
			/* <  <<  <=  <<=  and the '>' counterparts */
			lexTakeIf(a, &a_length, first);
			lexTakeIf(a, &a_length, '=');
			return lexReport("Operator", a);

		case '=':
		case '!':
		case '*':
		case '^':
			/* the operator alone, or followed by '=' */
			lexTakeIf(a, &a_length, '=');
			return lexReport("Operator", a);

		case '+':
		case '&':
		case '|':
			/* doubled (++ && ||), compound assignment, or alone */
			if (!lexTakeIf(a, &a_length, first)) {
				lexTakeIf(a, &a_length, '=');
			}
			return lexReport("Operator", a);

		case '-':
			/* ->  --  -=  - */
			if (!lexTakeIf(a, &a_length, '>') && !lexTakeIf(a, &a_length, '-')) {
				lexTakeIf(a, &a_length, '=');
			}
			return lexReport("Operator", a);

		case '(':
		case ')':
		case '[':
		case ']':
		case '.':
		case '~':
		case '?':
		case ':':
		case ',':
			return lexReport("Operator", a);

		case '/':
			if (lexTakeIf(a, &a_length, '=')) {
				return lexReport("Operator", a);
			}
			if (*curr == '/') {
				/* Line comment: discard up to the newline, then rescan. */
				while (*curr != '\n') {
					if (*curr == END_OF_FILE) {
						return -1;
					}
					curr++;
				}
				continue;
			}
			if (*curr == '*') {
				/* Block comment: discard through the closing marker (or
				 * up to end of input if unterminated), then rescan. */
				curr++;
				while (*curr != END_OF_FILE) {
					if (*curr == '*' && *(curr + 1) == '/') {
						curr += 2;
						break;
					}
					curr++;
				}
				continue;
			}
			return lexReport("Operator", a);

		case '%':
			if (lexTakeIf(a, &a_length, '=')) {
				return lexReport("Operator", a);
			}
			/*
			 * String literals are not lexed as a unit, so printf
			 * conversions (%d %c %f %ld %lf) are reported as operators.
			 * NOTE(review): this also glues a modulo to an operand that
			 * starts with one of these letters when no space separates
			 * them (e.g. "a%d").
			 */
			if (*curr == 'd' || *curr == 'c' || *curr == 'f') {
				lexTake(a, &a_length);
			} else if (*curr == 'l') {
				lexTake(a, &a_length);
				if (*curr == 'd' || *curr == 'f') {
					lexTake(a, &a_length);
				}
			}
			return lexReport("Operator", a);

		case '\\':
			/* Same string-literal workaround: "\n" is reported as one operator. */
			lexTakeIf(a, &a_length, 'n');
			return lexReport("Operator", a);

		case '{':
		case '}':
		case ';':
		case '\'':
		case '\"':
			return lexReport("Delimiter", a);

		default:
			/* Already consumed above, so repeated calls cannot get stuck here. */
			return lexReport("Unknown", a);
		}
	}
}

這裡實現了將單詞流分成五大類，並將每個單詞連同其類別打印出來，在後面的語法分析中將會使用到這裡的單詞流結果。

忘了說了，我將自己寫的編譯器命名為：ZCC，標頭檔案都包含在zcc.h中(*^__^*) 嘻嘻……，想寫個類似於gcc 一樣神奇的玩意。

最後看測試文件：

struct  Student{
   int a;
   char* name;
}

int main()
{
    int a=123;
    float a2=1.2345677;
    int b=1+3;
    for(int i=0; i < 100; i++)
    		a+=i;
    printf("%d\n", a);
    return 0;
}

測試結果：

keyword is struct
Identify is Student
Delimiter is {
keyword is int
Identify is a
Delimiter is ;
keyword is char
Operator is *
Identify is name
Delimiter is ;
Delimiter is }
keyword is int
Identify is main
Operator is (
Operator is )
Delimiter is {
keyword is int
Identify is a
Operator is =
number is 123
Delimiter is ;
keyword is float
Identify is a2
Operator is =
float number is 1.2345677
Delimiter is ;
keyword is int
Identify is b
Operator is =
number is 1
Operator is +
number is 3
Delimiter is ;
keyword is for
Operator is (
keyword is int
Identify is i
Operator is =
number is 0
Delimiter is ;
Identify is i
Operator is <
number is 100
Delimiter is ;
Identify is i
Operator is ++
Operator is )
Identify is a
Operator is +=
Identify is i
Delimiter is ;
Identify is printf
Operator is (
Delimiter is "
Operator is %d
Operator is \n
Delimiter is "
Operator is ,
Identify is a
Operator is )
Delimiter is ;
keyword is return
number is 0
Delimiter is ;
Delimiter is }

做到這裡，可以告一小段落了，接下來做的事情就是語法分析。

編譯器DIY——詞法分析

在上一篇文章中已經介紹了讀檔案的操作，那麼這一篇文章中將會仔細解釋詞法分析。在原始檔中解析出的單詞流必須識別為保留字，識別符號，常量，操作符和界符五大類 1.顯然我們需要列舉出所有的保留字，而這裡與保留字相似的那麼就是識別符號，在C語言中，保留字都是以小寫字母開頭，而且其

編譯器之詞法分析c

我學編譯原理沒有多久，剛剛學到語義分析，所以我也算是菜鳥。。。無意間看到了《自己動手寫編譯器、連結器》這本書，覺得實現一個編譯器才算是真正的入門。本來我是沒有決心寫好一個c語言的編譯器的，因為c語言的詞法、語法等等有太多的內容，但是無奈曾經也就學習過c語言、

JAVA實現一個簡單的代數運算語言編譯器（二）--詞法分析準備

上一篇文章主要介紹了這個代數運算編譯器的起因，這一篇我們就來開始寫這個專案。首先我們需要先設定一些系統的基礎類如系統符號類，保留字類、錯誤提示資訊類、自定義異常、輸入讀取類等，下面簡單地說一下這幾個類。系統符號類： package com.liu.system; /

編譯原理實驗報告一：PL0語言編譯器分析（PL0，詞法分析，語法分析，中間程式碼生成）

實驗報告一：PL0語言編譯器分析一、實驗目的通過閱讀與解析一個實際編譯器（PL/0語言編譯器）的原始碼，加深對編譯階段（包括詞法分析、語法分析、語義分析、中間程式碼生成等）和編譯系統軟體結構的理解，並達到提高學習興趣的目的。二、實驗要求(1) 要求掌握基本

Dom 事件和JavaScript的詞法分析過程

javascript dom event dom自帶了很多事件，常見的如下所示當觸發這些事件的時候，我們可以執行自定義的各種函數。一般說來，綁定事件有3種方法。第一種方法，直接在標簽上面綁定，比如<input id=‘i1‘ type=‘button‘ onclick=‘ClickOn(th

THULAC：一個高效的中文詞法分析工具包（z'z）

bsp 準確率 ext 效果 python3 nlp org 集成的人網址：http://thulac.thunlp.org/ THULAC（THU Lexical Analyzer for Chinese）由清華大學自然語言處理與社會人文計算實驗室研制推出的一套中文詞

用java實現一個簡易編譯器1-詞法解析入門

new 概念自加我們 sta 數字獲得 () 操作系統本文對應代碼下載地址為： http://download.csdn.net/detail/tyler_download/9435103 視頻地址： http://v.youku.com/v_show/id_XMT

javascript-詞法分析解析

utf -c head 詞法分析 img style utf-8 doc rip <!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8">

PMD 編譯語法分析詞法分析抽象語法樹

edit get 編譯 test if語句 final 代碼掃描 pic blog 編譯原理 163 課堂 http://mooc.study.163.com/learn/-1000002001?tid=1000003000#/learn/content?type=deta

結對編程--C語言子程序詞法分析

字符串之前 info default 管理問題 min div == 一、問題描述 C語言小子集表的定義 2.設計單詞屬性值，各類表格（表示標識符表、常量表），單詞符號及機內表示,采用標準輸入和輸出的方式。程序從鍵盤接收代碼，遇到代碼結束符“#”時結束，並將

編譯原理實驗：實驗一簡單詞法分析程序設計（必修）(Python實現)

it is 括號 ali 鍵盤輸入優化沒有 mce constant 是否一、實驗目的了解詞法分析程序的基本構造原理，掌握詞法分析程序的手工構造方法。二、實驗內容 1、了解編譯程序的詞法分析過程。 2、根據PASCAL語言的說明語句形式，用手工方法構造一個對說明語

python學習之路 jJavaScript詞法分析

優先覆蓋 bject AR 又是調用函數運行 () class 詞法分析步驟 JavaScript在運行前會有一個類似預編譯的過程這個過程就是我們所說的詞法分析。這個詞法分析的步驟分析參數再分析變量的聲明分析函數說明列子： function func(ag

C# 詞法分析器（一）詞法分析介紹

art 優化不一定 clr gen 多個 scan 原理輸入緩沖系列導航（一）詞法分析介紹（二）輸入緩沖和代碼定位（三）正則表達式（四）構造 NFA （五）轉換 DFA （六）構造詞法分析器（七）總結雖然文章的標題是詞法分析，但

JavaScript詞法分析(盡力理解)

中間 fun 使用 fine 例子參數 code class 過程 JavaScript中在調用函數的那一瞬間之前，會先進行詞法分析詞法分析的過程：當函數調用的前一瞬間，會先形成一個激活對象：Avtive Object（AO），並會分析以下3個方面： 1:函數參數

編譯原理----詞法分析

0.PL/0文法〈程式〉→〈分程式〉. 〈分程式〉→ [<常量說明部分>][<變數說明部分>][<過程說明部分>]〈語句〉 <常量說明部分> → CONST<常量定義>{

PL/0詞法分析程式

　用C語言編寫一個PL/0詞法分析器，為語法語義分析提供單詞，使之能把輸入的字串形式的源程式分割成一個個單詞符號傳遞給語法語義分析，並把分析結果（基本字，運算子，識別符號，常數以及界符）輸出。　　PL/0的詞法分析程式GETSYM是一個獨立的過程，其功能是為語

javascript中的詞法分析

詞法分析 JavaScript中在呼叫函式的那一瞬間，會先進行詞法分析。詞法分析的過程：當函式呼叫的前一瞬間，會先形成一個啟用物件：Avtive Object（AO），並會分析以下3個方面： 1:函式引數，如果有，則將此引數賦值給AO，且值為undefined。如果沒有，則不做任何操作。2:函式區

編譯原理第三章詞法分析（上）

3.1.1 為什麼編譯器要把詞法分析和語法分析分開 3.1.2 詞法單元、模式和詞素（重要）例： 3.1.3 詞法單元的屬性（重要）詞法單元的屬性是用來記錄相對應的詞素的一些相關屬性資訊。例： int x = 10 + 20

編譯原理第三章詞法分析（下）

3.6 有窮自動機（非常重要） 3.6.1 不確定的有窮自動機(重要) 例：狀態0是開始狀態, 在狀態0上輸入符號b會進入狀態0，輸入a可能進去狀態0也有可能進入狀態1。所以對於狀態0來說一個確定的輸入符號a他有兩種離開狀態，這就是一種不確定的狀態。 &nbs

js詞法分析

javascript詞法分析函式在執行的瞬間，生成一個活動物件（Active Object），簡稱AO；具體分為兩個階段：一.分析階段 JavaScript程式碼執行前有一個類似編譯的過程即詞法分析，詞法分析主要有三個步驟： 1.分析引數

編譯器DIY——詞法分析

相關推薦