1. 程式人生 > 如何寫一個簡單的直譯器-1

如何寫一個簡單的直譯器-1

Lan的原始碼由一些基本元素構成,我們稱之為Token,在詞法分析階段我們需要將輸入的字元流轉化成Token流(簡單說就是Token列表)。

下面是Token的型別定義,為了節省資源採用整數表示而不用列舉型別。

/**
 * Integer codes identifying each kind of token.
 *
 * <p>The codes are plain {@code int} constants rather than an enum. Each constant must sit on
 * its own line: the trailing {@code //} comment runs to the end of the line, so anything that
 * follows it on the same line is not compiled.
 */
public class TokenType {
    // Arithmetic operators
    public static final int PLUS        = 0;  // "+"
    public static final int PLUSPLUS    = 1;  // "++"
    public static final int MINUS       = 2;  // "-"
    public static final int MINUSMINUS  = 3;  // "--"
    public static final int ASTERISK    = 4;  // "*"
    public static final int SLASH       = 5;  // "/"
    public static final int PERCENT     = 6;  // "%"

    // Comparison operators
    public static final int EQUAL       = 7;  // "=="
    public static final int NOT_EQUAL   = 8;  // "!="
    public static final int GT          = 9;  // ">"
    public static final int GE          = 10; // ">="
    public static final int LT          = 11; // "<"
    public static final int LE          = 12; // "<="

    // Logical operators
    public static final int AND         = 13; // "&&"
    public static final int OR          = 14; // "||"
    public static final int BANG        = 15; // "!"

    // Punctuation
    public static final int LEFT_PAREN  = 16; // "("
    public static final int RIGHT_PAREN = 17; // ")"
    public static final int LEFT_BRACE  = 18; // "{"
    public static final int RIGHT_BRACE = 19; // "}"
    public static final int COMMA       = 20; // ","
    public static final int QUESTION    = 21; // "?"
    public static final int COLON       = 22; // ":"

    // Literals, assignment and names
    public static final int NUMBER      = 23; // numeric literal
    public static final int STRING      = 24; // string literal
    public static final int ASSIGN      = 25; // "="
    public static final int TRUE        = 26; // "true"
    public static final int FALSE       = 27; // "false"
    public static final int NULL        = 28; // "null"
    public static final int IDENTIFIER  = 29; // variable name

    // Keywords
    public static final int IF          = 30; // "if"
    public static final int ELSE        = 31; // "else"
    public static final int WHILE       = 32; // "while"
    public static final int BREAK       = 33; // "break"
    public static final int CONTINUE    = 34; // "continue"
    public static final int PRINT       = 35; // "print"
    public static final int FUNC        = 36; // "func"
    public static final int RETURN      = 37; // "return"

    // End of input
    public static final int EOF         = 38; // end of the token stream
}

每種型別代表的內容看後面的註釋即可,沒有值得解釋的內容。然後定義Token的結構。

/**
 * One lexical unit: a type code, the token's text, and the source line it was found on.
 *
 * <p>The fields are public and mutable; the constructor stores its arguments without validation.
 */
public class Token {
    /** Type code of this token (the lexer in this file passes {@code TokenType} constants). */
    public int type;

    /** Text of this token. */
    public String symbol;

    /** Line number in the source code where this token appears. */
    public int line;

    /**
     * Creates a token from the three given values.
     *
     * @param type   type code of the token
     * @param symbol text of the token
     * @param line   source line number of the token
     */
    public Token(int type, String symbol, int line) {
        this.line = line;
        this.symbol = symbol;
        this.type = type;
    }
}

最後就是詞法分析器,我們稱之為Lexer。註釋部分已經解釋得很清楚了,沒有什麼難度。

/**
 * Lexer: turns source text into a flat list of {@link Token}s.
 *
 * <p>Spaces, carriage returns and tabs are skipped, {@code //} comments are skipped up to the end
 * of the line, and newlines only advance the line counter. Numbers are runs of digits, words are
 * runs of alphabetic characters, and string literals are delimited by double quotes on a single
 * line with no escape processing.
 */
public class Lexer {
    /** Reserved words mapped to their token type; every scanned word is looked up here. */
    private final Map<String, Integer> keywordsFilter;

    /** Creates a lexer and fills the keyword table. */
    public Lexer() {
        keywordsFilter = new HashMap<>();
        keywordsFilter.put("true", TokenType.TRUE);
        keywordsFilter.put("false", TokenType.FALSE);
        keywordsFilter.put("null", TokenType.NULL);
        keywordsFilter.put("if", TokenType.IF);
        keywordsFilter.put("else", TokenType.ELSE);
        keywordsFilter.put("while", TokenType.WHILE);
        keywordsFilter.put("break", TokenType.BREAK);
        keywordsFilter.put("continue", TokenType.CONTINUE);
        keywordsFilter.put("print", TokenType.PRINT);
        keywordsFilter.put("func", TokenType.FUNC);
        keywordsFilter.put("return", TokenType.RETURN);
    }

    /**
     * Tokenizes {@code code}.
     *
     * @param code the source text to scan
     * @return the tokens in source order, followed by one EOF token carrying the last line number
     * @throws RuntimeException if a lone {@code |} or {@code &} is found, a string literal is not
     *     closed before the end of its line or the end of the input, or a character is not
     *     recognized
     */
    public List<Token> lex(String code) {
        List<Token> tokens = new ArrayList<>();
        // Index of the next character to read.
        int index = 0;
        // Line number recorded on each token.
        int currentLine = 1;
        int codeLength = code.length();

        while (index < codeLength) {
            // Read the current character and advance past it.
            char c = code.charAt(index++);
            switch (c) {
                case ' ':
                case '\r':
                case '\t':
                    break;
                case '\n':
                    currentLine++;
                    break;
                case '+':
                    if (nextIs(code, index, '+')) {
                        index++;
                        tokens.add(new Token(TokenType.PLUSPLUS, "++", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.PLUS, "+", currentLine));
                    }
                    break;
                case '-':
                    if (nextIs(code, index, '-')) {
                        index++;
                        tokens.add(new Token(TokenType.MINUSMINUS, "--", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.MINUS, "-", currentLine));
                    }
                    break;
                case '*':
                    tokens.add(new Token(TokenType.ASTERISK, "*", currentLine));
                    break;
                case '/':
                    if (nextIs(code, index, '/')) {
                        // Line comment: stop on the newline (not past it) so the '\n' case
                        // above still counts the line.
                        while (index < codeLength && code.charAt(index) != '\n') {
                            index++;
                        }
                    } else {
                        tokens.add(new Token(TokenType.SLASH, "/", currentLine));
                    }
                    break;
                case '%':
                    tokens.add(new Token(TokenType.PERCENT, "%", currentLine));
                    break;
                case '(':
                    tokens.add(new Token(TokenType.LEFT_PAREN, "(", currentLine));
                    break;
                case ')':
                    tokens.add(new Token(TokenType.RIGHT_PAREN, ")", currentLine));
                    break;
                case '{':
                    tokens.add(new Token(TokenType.LEFT_BRACE, "{", currentLine));
                    break;
                case '}':
                    tokens.add(new Token(TokenType.RIGHT_BRACE, "}", currentLine));
                    break;
                case ',':
                    tokens.add(new Token(TokenType.COMMA, ",", currentLine));
                    break;
                case '?':
                    tokens.add(new Token(TokenType.QUESTION, "?", currentLine));
                    break;
                case ':':
                    tokens.add(new Token(TokenType.COLON, ":", currentLine));
                    break;
                case '>':
                    if (nextIs(code, index, '=')) {
                        index++;
                        tokens.add(new Token(TokenType.GE, ">=", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.GT, ">", currentLine));
                    }
                    break;
                case '<':
                    if (nextIs(code, index, '=')) {
                        index++;
                        tokens.add(new Token(TokenType.LE, "<=", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.LT, "<", currentLine));
                    }
                    break;
                case '!':
                    if (nextIs(code, index, '=')) {
                        index++;
                        tokens.add(new Token(TokenType.NOT_EQUAL, "!=", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.BANG, "!", currentLine));
                    }
                    break;
                case '|':
                    // A single '|' is not an operator; only "||" is accepted.
                    if (!nextIs(code, index, '|')) {
                        throw new RuntimeException("Lexer Error: expect '|'");
                    }
                    index++;
                    tokens.add(new Token(TokenType.OR, "||", currentLine));
                    break;
                case '&':
                    // A single '&' is not an operator; only "&&" is accepted.
                    if (!nextIs(code, index, '&')) {
                        throw new RuntimeException("Lexer Error: expect '&'");
                    }
                    index++;
                    tokens.add(new Token(TokenType.AND, "&&", currentLine));
                    break;
                case '=':
                    if (nextIs(code, index, '=')) {
                        index++;
                        tokens.add(new Token(TokenType.EQUAL, "==", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.ASSIGN, "=", currentLine));
                    }
                    break;
                case '"': {
                    // index already points at the first character after the opening quote.
                    int close = findClosingQuote(code, index);
                    tokens.add(new Token(TokenType.STRING, code.substring(index, close), currentLine));
                    index = close + 1;
                    break;
                }
                default:
                    if (Character.isDigit(c)) {
                        int start = index - 1;
                        index = endOfDigits(code, index);
                        tokens.add(new Token(TokenType.NUMBER, code.substring(start, index), currentLine));
                    } else if (Character.isAlphabetic(c)) {
                        int start = index - 1;
                        index = endOfLetters(code, index);
                        String word = code.substring(start, index);
                        // A word is a keyword if the table knows it, otherwise an identifier.
                        Integer type = keywordsFilter.get(word);
                        tokens.add(new Token(type == null ? TokenType.IDENTIFIER : type, word, currentLine));
                    } else {
                        throw new RuntimeException(String.format("Lexer Error: unknown character \"%c\"", c));
                    }
                    break;
            }
        }
        tokens.add(new Token(TokenType.EOF, "", currentLine));
        return tokens;
    }

    /** Reports whether the character at {@code index} exists and equals {@code expected}. */
    private static boolean nextIs(String code, int index, char expected) {
        return index < code.length() && code.charAt(index) == expected;
    }

    /** Returns the first index at or after {@code from} that does not hold a digit. */
    private static int endOfDigits(String code, int from) {
        int end = from;
        while (end < code.length() && Character.isDigit(code.charAt(end))) {
            end++;
        }
        return end;
    }

    /** Returns the first index at or after {@code from} that does not hold an alphabetic character. */
    private static int endOfLetters(String code, int from) {
        int end = from;
        while (end < code.length() && Character.isAlphabetic(code.charAt(end))) {
            end++;
        }
        return end;
    }

    /**
     * Returns the index of the double quote closing a string whose content starts at {@code from}.
     * Throws the lexer error if a newline or the end of the input is reached first; this includes
     * the case where the opening quote is the very last character of the input.
     */
    private static int findClosingQuote(String code, int from) {
        for (int i = from; i < code.length(); i++) {
            char c = code.charAt(i);
            if (c == '"') {
                return i;
            }
            if (c == '\n') {
                break;
            }
        }
        throw new RuntimeException("Lexer Error: expect \"");
    }
}

最後手動測試一下

/**
 * Interactive driver for manually testing the lexer: reads one line at a time from standard
 * input, lexes it, and prints the text of every resulting token on its own line.
 */
public class Main {
    /**
     * Runs the prompt loop until the user enters {@code .q} or standard input is exhausted.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Scanner scanner = new Scanner(System.in);
        Lexer lexer = new Lexer();
        while (true) {
            System.out.print(">>> ");
            // nextLine() throws NoSuchElementException once input has ended (Ctrl-D, or a
            // piped file running out), so stop cleanly instead.
            if (!scanner.hasNextLine()) {
                break;
            }
            String code = scanner.nextLine();
            if (code.equals(".q")) {
                break;
            }
            List<Token> tokens = lexer.lex(code);
            for (Token token : tokens) {
                System.out.println(token.symbol);
            }
        }
    }
}