如何寫一個簡單的直譯器-1
阿新 • • 發佈:2018-11-02
Lan的原始碼由一些基本元素構成,我們稱之為Token,在詞法分析階段我們需要將輸入的字元流轉化成Token流(簡單說就是Token列表)。
下面是Token的型別定義,為了節省資源採用整數表示而不用列舉型別。
/**
 * Integer codes identifying each kind of token in the Lan language.
 *
 * <p>Plain {@code int} constants are used instead of an enum; the values are
 * consecutive from 0 to 38. The trailing comment on each constant shows the
 * source text (or the kind of text) the token stands for.
 *
 * <p>This class only holds constants, so it is final and cannot be instantiated.
 */
public final class TokenType {
    // Arithmetic operators
    public static final int PLUS = 0;          // "+"
    public static final int PLUSPLUS = 1;      // "++"
    public static final int MINUS = 2;         // "-"
    public static final int MINUSMINUS = 3;    // "--"
    public static final int ASTERISK = 4;      // "*"
    public static final int SLASH = 5;         // "/"
    public static final int PERCENT = 6;       // "%"

    // Comparison operators
    public static final int EQUAL = 7;         // "=="
    public static final int NOT_EQUAL = 8;     // "!="
    public static final int GT = 9;            // ">"
    public static final int GE = 10;           // ">="
    public static final int LT = 11;           // "<"
    public static final int LE = 12;           // "<="

    // Logical operators
    public static final int AND = 13;          // "&&"
    public static final int OR = 14;           // "||"
    public static final int BANG = 15;         // "!"

    // Punctuation
    public static final int LEFT_PAREN = 16;   // "("
    public static final int RIGHT_PAREN = 17;  // ")"
    public static final int LEFT_BRACE = 18;   // "{"
    public static final int RIGHT_BRACE = 19;  // "}"
    public static final int COMMA = 20;        // ","
    public static final int QUESTION = 21;     // "?"
    public static final int COLON = 22;        // ":"

    // Literals, assignment and names
    public static final int NUMBER = 23;       // numeric literal
    public static final int STRING = 24;       // string literal
    public static final int ASSIGN = 25;       // "="
    public static final int TRUE = 26;         // "true"
    public static final int FALSE = 27;        // "false"
    public static final int NULL = 28;         // "null"
    public static final int IDENTIFIER = 29;   // variable name

    // Keywords
    public static final int IF = 30;           // "if"
    public static final int ELSE = 31;         // "else"
    public static final int WHILE = 32;        // "while"
    public static final int BREAK = 33;        // "break"
    public static final int CONTINUE = 34;     // "continue"
    public static final int PRINT = 35;        // "print"
    public static final int FUNC = 36;         // "func"
    public static final int RETURN = 37;       // "return"

    // End-of-input marker
    public static final int EOF = 38;          // end of input

    /** Not instantiable: this class is only a namespace for constants. */
    private TokenType() {
    }
}
每種型別代表的內容看後面的註釋即可,沒有值得解釋的內容。然後定義Token的結構。
/**
 * One lexical unit of source code: what kind of token it is, its text,
 * and the line it was found on.
 */
public class Token {
    /** Kind of token, given as an integer type code. */
    public int type;

    /** Text content of the token. */
    public String symbol;

    /** Line number in the source code where the token appears. */
    public int line;

    /**
     * Creates a token from its three components.
     *
     * @param type   integer type code of the token
     * @param symbol text content of the token
     * @param line   line number of the token in the source code
     */
    public Token(int type, String symbol, int line) {
        this.line = line;
        this.symbol = symbol;
        this.type = type;
    }
}
接下來是詞法分析器,我們稱之為Lexer。它逐個字元掃描原始碼,跳過空白與註釋,並把識別出的Token依序加入列表。程式碼中的註釋已經解釋得很清楚了,沒有什麼難度。
/**
 * Lexical analyser: turns a string of source code into a list of {@link Token}s.
 *
 * <p>Recognised input: the operators and punctuation listed in {@link TokenType},
 * runs of digits (NUMBER), runs of alphabetic characters (keywords or IDENTIFIER),
 * double-quoted single-line strings without escape sequences (STRING), and
 * {@code //} line comments, which are discarded.
 */
public class Lexer {
    /** Keyword table: every scanned word is looked up here to decide keyword vs. identifier. */
    private final Map<String, Integer> keywordsFilter;

    /** Creates a lexer and fills its keyword table. */
    public Lexer() {
        keywordsFilter = new HashMap<>();
        keywordsFilter.put("true", TokenType.TRUE);
        keywordsFilter.put("false", TokenType.FALSE);
        keywordsFilter.put("null", TokenType.NULL);
        keywordsFilter.put("if", TokenType.IF);
        keywordsFilter.put("else", TokenType.ELSE);
        keywordsFilter.put("while", TokenType.WHILE);
        keywordsFilter.put("break", TokenType.BREAK);
        keywordsFilter.put("continue", TokenType.CONTINUE);
        keywordsFilter.put("print", TokenType.PRINT);
        keywordsFilter.put("func", TokenType.FUNC);
        keywordsFilter.put("return", TokenType.RETURN);
    }

    /**
     * Splits {@code code} into tokens.
     *
     * @param code the source text to scan
     * @return the tokens in source order, always terminated by one EOF token
     *         whose symbol is the empty string
     * @throws RuntimeException if a lone {@code |} or {@code &} is found, if a
     *         string literal is not closed before the end of its line or the end
     *         of the input, or if a character is not part of any token
     */
    public List<Token> lex(String code) {
        // Collects every token to be returned.
        List<Token> tokens = new ArrayList<>();
        // Position of the next unread character.
        int index = 0;
        // Line number recorded on each token; starts at 1.
        int currentLine = 1;
        int codeLength = code.length();

        while (index < codeLength) {
            // Take the next character and advance past it.
            char c = code.charAt(index++);
            switch (c) {
                case ' ':
                case '\r':
                case '\t':
                    // Insignificant whitespace.
                    break;
                case '\n':
                    currentLine++;
                    break;
                case '+':
                    if (nextIs(code, index, '+')) {
                        index++;
                        tokens.add(new Token(TokenType.PLUSPLUS, "++", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.PLUS, "+", currentLine));
                    }
                    break;
                case '-':
                    if (nextIs(code, index, '-')) {
                        index++;
                        tokens.add(new Token(TokenType.MINUSMINUS, "--", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.MINUS, "-", currentLine));
                    }
                    break;
                case '*':
                    tokens.add(new Token(TokenType.ASTERISK, "*", currentLine));
                    break;
                case '/':
                    if (nextIs(code, index, '/')) {
                        // Line comment: skip up to (not including) the newline so the
                        // next iteration still sees it and counts the line.
                        while (index < codeLength && code.charAt(index) != '\n') {
                            index++;
                        }
                    } else {
                        tokens.add(new Token(TokenType.SLASH, "/", currentLine));
                    }
                    break;
                case '%':
                    tokens.add(new Token(TokenType.PERCENT, "%", currentLine));
                    break;
                case '(':
                    tokens.add(new Token(TokenType.LEFT_PAREN, "(", currentLine));
                    break;
                case ')':
                    tokens.add(new Token(TokenType.RIGHT_PAREN, ")", currentLine));
                    break;
                case '{':
                    tokens.add(new Token(TokenType.LEFT_BRACE, "{", currentLine));
                    break;
                case '}':
                    tokens.add(new Token(TokenType.RIGHT_BRACE, "}", currentLine));
                    break;
                case ',':
                    tokens.add(new Token(TokenType.COMMA, ",", currentLine));
                    break;
                case '?':
                    tokens.add(new Token(TokenType.QUESTION, "?", currentLine));
                    break;
                case ':':
                    tokens.add(new Token(TokenType.COLON, ":", currentLine));
                    break;
                case '>':
                    if (nextIs(code, index, '=')) {
                        index++;
                        tokens.add(new Token(TokenType.GE, ">=", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.GT, ">", currentLine));
                    }
                    break;
                case '<':
                    if (nextIs(code, index, '=')) {
                        index++;
                        tokens.add(new Token(TokenType.LE, "<=", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.LT, "<", currentLine));
                    }
                    break;
                case '!':
                    if (nextIs(code, index, '=')) {
                        index++;
                        tokens.add(new Token(TokenType.NOT_EQUAL, "!=", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.BANG, "!", currentLine));
                    }
                    break;
                case '|':
                    // There is no single-'|' token; only "||" is valid.
                    if (!nextIs(code, index, '|')) {
                        throw new RuntimeException("Lexer Error: expect '|'");
                    }
                    index++;
                    tokens.add(new Token(TokenType.OR, "||", currentLine));
                    break;
                case '&':
                    // There is no single-'&' token; only "&&" is valid.
                    if (!nextIs(code, index, '&')) {
                        throw new RuntimeException("Lexer Error: expect '&'");
                    }
                    index++;
                    tokens.add(new Token(TokenType.AND, "&&", currentLine));
                    break;
                case '=':
                    if (nextIs(code, index, '=')) {
                        index++;
                        tokens.add(new Token(TokenType.EQUAL, "==", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.ASSIGN, "=", currentLine));
                    }
                    break;
                case '"': {
                    // String literal: index is just past the opening quote. Look for
                    // the closing quote; a newline or the end of input before it is
                    // an error. Checking the position (rather than the last character
                    // read) also covers an opening quote that is the final character
                    // of the input.
                    int closing = index;
                    while (closing < codeLength
                            && code.charAt(closing) != '"'
                            && code.charAt(closing) != '\n') {
                        closing++;
                    }
                    if (closing >= codeLength || code.charAt(closing) != '"') {
                        throw new RuntimeException("Lexer Error: expect \"");
                    }
                    // The symbol is the text between the quotes, quotes excluded.
                    tokens.add(new Token(TokenType.STRING, code.substring(index, closing), currentLine));
                    index = closing + 1;
                    break;
                }
                default:
                    if (Character.isDigit(c)) {
                        // Number: the maximal run of digits starting at c.
                        int start = index - 1;
                        while (index < codeLength && Character.isDigit(code.charAt(index))) {
                            index++;
                        }
                        tokens.add(new Token(TokenType.NUMBER, code.substring(start, index), currentLine));
                    } else if (Character.isAlphabetic(c)) {
                        // Word: the maximal run of alphabetic characters starting at c,
                        // classified as a keyword if it is in the table.
                        int start = index - 1;
                        while (index < codeLength && Character.isAlphabetic(code.charAt(index))) {
                            index++;
                        }
                        String word = code.substring(start, index);
                        Integer type = keywordsFilter.get(word);
                        tokens.add(new Token(type == null ? TokenType.IDENTIFIER : type, word, currentLine));
                    } else {
                        throw new RuntimeException(String.format("Lexer Error: unknown character \"%c\"", c));
                    }
                    break;
            }
        }
        tokens.add(new Token(TokenType.EOF, "", currentLine));
        return tokens;
    }

    /** Returns true if {@code code} has a character at {@code index} and it equals {@code expected}. */
    private static boolean nextIs(String code, int index, char expected) {
        return index < code.length() && code.charAt(index) == expected;
    }
}
最後手動測試一下:程式會逐行讀取輸入的程式碼,並將每個Token的內容各印一行,輸入 .q 即可結束。
/**
 * Interactive test driver for the lexer: reads one line at a time from standard
 * input, lexes it, and prints the symbol of every resulting token on its own line.
 * Entering {@code .q} or reaching the end of input stops the program.
 */
public class Main {
    public static void main(String[] args) {
        // Not closed on purpose: closing the Scanner would also close System.in.
        Scanner scanner = new Scanner(System.in);
        Lexer lexer = new Lexer();
        while (true) {
            System.out.print(">>> ");
            // Stop cleanly at end of input (Ctrl-D / piped stdin) instead of letting
            // nextLine() throw NoSuchElementException.
            if (!scanner.hasNextLine()) {
                break;
            }
            String code = scanner.nextLine();
            if (code.equals(".q")) {
                break;
            }
            try {
                List<Token> tokens = lexer.lex(code);
                for (Token token : tokens) {
                    System.out.println(token.symbol);
                }
            } catch (RuntimeException e) {
                // The lexer reports bad input with a plain RuntimeException; show the
                // message and keep the prompt alive rather than ending the session.
                System.out.println(e.getMessage());
            }
        }
    }
}