輸入法詞庫解析（二）搜狗拼音細胞詞庫.scel

阿新 • • 發佈：2022-05-27

前面很多空位元組的地方不用管，是一些描述資訊，詞庫名、示例詞等。

0x124 跟的 4 個位元組是詞條數，新的 scel 在檔案最後面可能有違禁詞（黑名單詞）。

拼音表

直接從 0x1540 開始。

前兩個位元組是拼音表的長度，這裡面都是按小端算的。

這裡 9D 01 就表示有 0x100 * 0x01 + 0x9D = 413 組。

後兩個位元組意義不明，一般是 0。

從 0x1544 開始就是拼音表正文部分。

佔用位元組數	描述
2	索引，從 00 00 到 9C 01
2	拼音位元組的長度
由上一項決定	拼音，utf-16le 編碼，一個字母佔 2 位元組。

詞條

偏移量 0x2628（詞條區緊接在拼音表之後；下面的程式碼是讀完拼音表後直接順序往下讀，並沒有寫死這個偏移量）

佔用位元組數	描述
2	同一個音有多少詞
2	拼音索引的位元組長度
由上一項決定	拼音索引陣列
2	詞佔用位元組數
由上一項決定	詞，utf-16le 編碼
2	描述資訊位元組長度
由上一項決定	描述

帶英文詞庫的索引

從拼音表的長度往後，依次是 abcd。比如表長 413，最大索引是 9C 01（412），則下一個索引 9D 01（413）表示字母 a，9E 01 表示字母 b，依次類推。

`golang` 程式碼實現


// ParseSougouScel parses a Sogou Pinyin cell dictionary (.scel) read from rd
// and returns one Pinyin entry per word it contains.
//
// Layout as consumed here (every integer is decoded with bytesToInt):
//
//	0x124   4-byte number of words in the dictionary
//	0x1540  2-byte pinyin table length, then 2 bytes that are skipped
//	0x1544  pinyin table records: 2-byte index, 2-byte byte length,
//	        UTF-16LE pinyin text
//	(next)  word groups: 2-byte word count, 2-byte index-array byte length,
//	        2-byte pinyin indexes, then per word: 2-byte byte length,
//	        UTF-16LE word, 2-byte extra-info byte length, extra info
//
// An index greater than or equal to the pinyin table length denotes a single
// letter: tableLen maps to "a", tableLen+1 to "b", and so on.
//
// The signature has no error result, so parsing is best effort: as soon as the
// input turns out to be truncated or structurally invalid, the entries decoded
// so far are returned. The result is never nil.
func ParseSougouScel(rd io.Reader) []Pinyin {
	ret := []Pinyin{}

	// A read failure cannot be reported to the caller; it is treated like a
	// truncated file and whatever bytes did arrive are parsed as far as they go.
	data, _ := ioutil.ReadAll(rd)
	r := bytes.NewReader(data)

	// UTF-16LE decoder used for both pinyin strings and words.
	decoder := unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewDecoder()

	// readN returns exactly n bytes from the current position. ok is false
	// when n is negative or more than what is left, which also prevents a
	// corrupt length field from triggering a huge allocation.
	readN := func(n int) ([]byte, bool) {
		if n < 0 || n > r.Len() {
			return nil, false
		}
		buf := make([]byte, n)
		if _, err := io.ReadFull(r, buf); err != nil {
			return nil, false
		}
		return buf, true
	}

	// readInt reads a size-byte integer field.
	readInt := func(size int) (int, bool) {
		buf, ok := readN(size)
		if !ok {
			return 0, false
		}
		return bytesToInt(buf), true
	}

	// Total number of words.
	if _, err := r.Seek(0x124, io.SeekStart); err != nil {
		return ret
	}
	dictLen, ok := readInt(4)
	if !ok {
		return ret
	}

	// Pre-size the result from the declared word count, but never beyond what
	// the input could hold: each word needs at least 4 bytes (two length fields).
	capHint := dictLen
	if most := len(data) / 4; capHint > most {
		capHint = most
	}
	if capHint > 0 {
		ret = make([]Pinyin, 0, capHint)
	}

	// Pinyin table header: 2-byte length followed by 2 bytes that are skipped.
	if _, err := r.Seek(0x1540, io.SeekStart); err != nil {
		return ret
	}
	pyTableLen, ok := readInt(2)
	if !ok || pyTableLen < 0 {
		return ret
	}
	if _, ok := readN(2); !ok {
		return ret
	}

	// Pinyin table body.
	pyTable := make([]string, pyTableLen)
	for i := 0; i < pyTableLen; i++ {
		idx, ok := readInt(2)
		if !ok {
			return ret
		}
		pyLen, ok := readInt(2)
		if !ok {
			return ret
		}
		raw, ok := readN(pyLen)
		if !ok {
			return ret
		}
		// The index comes from the file; storing through an out-of-range
		// value would panic, so such a record is skipped. The stream stays
		// in sync because the record's bytes were already consumed.
		if idx < 0 || idx >= pyTableLen {
			continue
		}
		py, err := decoder.Bytes(raw)
		if err != nil {
			continue
		}
		pyTable[idx] = string(py)
	}

	// Word groups. Every iteration either consumes input or returns, so the
	// loop always terminates even when the declared word count is wrong.
	for count := 0; count < dictLen; {
		// Number of words sharing this pinyin sequence.
		repeat, ok := readInt(2)
		if !ok {
			return ret
		}
		// Byte length of the pinyin index array (2 bytes per index).
		codeLen, ok := readInt(2)
		if !ok {
			return ret
		}

		var code []string
		for i := 0; 2*i < codeLen; i++ {
			theIdx, ok := readInt(2)
			if !ok {
				return ret
			}
			switch {
			case theIdx < 0:
				return ret
			case theIdx >= pyTableLen:
				// Indexes past the table encode the letters a, b, c, ...
				code = append(code, string(byte(theIdx-pyTableLen+'a')))
			default:
				code = append(code, pyTable[theIdx])
			}
		}

		count += repeat
		for i := 0; i < repeat; i++ {
			wordLen, ok := readInt(2)
			if !ok {
				return ret
			}
			raw, ok := readN(wordLen)
			if !ok {
				return ret
			}
			// A word that fails to decode is dropped; its trailing info is
			// still consumed below so the following records stay aligned.
			if word, err := decoder.Bytes(raw); err == nil {
				ret = append(ret, Pinyin{string(word), code, 1})
			}

			// Trailing extra info is read only to advance past it.
			infoLen, ok := readInt(2)
			if !ok {
				return ret
			}
			if _, ok := readN(infoLen); !ok {
				return ret
			}
		}
	}
	return ret
}