輸入法詞庫解析（一）百度自定義方案.def

阿新 • • 發佈：2022-05-26

參考了 asd1fque1 的詞庫處理工具 js 實現

解析

碼錶偏移量 0x6D

佔用位元組數	描述
1	編碼長度（紅色框）
1	詞長 * 2 + 2（即詞的 UTF-16 位元組數 + 2）
由編碼長度決定	編碼（黃色框），可以是純編碼，也可以是 `編碼=位置`
由詞長決定	詞（綠色框），utf16-le 編碼
6	6 個空位元組代表詞條結束

golang 實現：

// ParseBaiduDef parses a Baidu IME custom-scheme ".def" dictionary read from rd
// and returns the collected entries as a Dict.
//
// The entry table starts at offset 0x6D. Each entry is laid out as:
//
//	1 byte   code length
//	1 byte   word byte length + 2
//	N bytes  code, either a plain code or "code=position"
//	M bytes  word, UTF-16LE
//	6 bytes  zero bytes terminating the entry
//
// Only the part of the code before the first "=" is used as the key.
//
// Parsing is best-effort: the function has no error result, so it stops at the
// first truncated or malformed entry and returns what was collected so far.
func ParseBaiduDef(rd io.Reader) Dict {
    ret := make(Dict, 1e5)

    // A read error cannot be reported to the caller; whatever bytes were
    // obtained before the failure are still parsed.
    data, _ := ioutil.ReadAll(rd)
    r := bytes.NewReader(data)

    // Skip the header. If the input is shorter than the header, the first
    // ReadByte below fails and the loop ends immediately.
    r.Seek(0x6D, io.SeekStart)

    decoder := unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewDecoder()
    for {
        codeLen, err := r.ReadByte()
        if err != nil {
            break
        }
        wordLen, err := r.ReadByte() // word byte length + 2
        if err != nil {
            break
        }
        if wordLen < 2 {
            // The stored value always includes the +2, so anything smaller is
            // malformed (e.g. trailing zero padding). Stop instead of asking
            // make for a negative length.
            break
        }

        rawCode := make([]byte, int(codeLen))
        rawWord := make([]byte, int(wordLen)-2)
        // ReadFull rejects short reads, so a truncated final entry is dropped
        // rather than inserted with zero-filled bytes.
        if _, err := io.ReadFull(r, rawCode); err != nil {
            break
        }
        if _, err := io.ReadFull(r, rawWord); err != nil {
            break
        }

        // Strip an optional "=position" suffix from the code.
        code := string(rawCode)
        if i := strings.IndexByte(code, '='); i >= 0 {
            code = code[:i]
        }

        // An undecodable word is skipped; the stream is still in sync, so the
        // following entries can be read normally.
        if word, err := decoder.Bytes(rawWord); err == nil {
            ret.insert(code, string(word))
        }

        // Skip the 6 zero bytes that terminate the entry.
        r.Seek(6, io.SeekCurrent)
    }
    return ret
}

生成

碼錶部分和解析一樣的，沒什麼好說的。

主要考慮前 0x6D（109）個位元組，即偏移 0x00–0x6C 的檔案頭。

第一個位元組意義不明，可能是最大碼長（一般是 0，有的碼錶裡是 4）

後面每 4 位元組一組（小端序 32 位元整數），共 27 組：第 1 組對應 0x60（字母 a 之前的字元），其餘 26 組依序對應首字母 a–z。

每組表示以對應首字母開頭的詞條位元組長度的累加值。每個詞條計入「編碼長度 + 詞長欄位值（詞位元組數 + 2）+ 結尾 6 個 0」：不另外計入前 2 個表示長度的位元組本身，但詞長欄位值自帶的 +2 使結果在數值上等於整個詞條佔用的位元組數。

計算時，先統計每個首字母下所有詞條的長度總和；寫入檔案頭時，再依序逐組累加（字首和）後寫入。

golang 實現：


// GenBaiduDef serializes dl into the Baidu IME custom-scheme ".def" binary
// format and returns the complete file contents.
//
// The output is a 0x6D-byte header followed by the entry table. Each entry is:
//
//	1 byte   code length
//	1 byte   word byte length + 2
//	N bytes  code; for every word after the first of a code, "code=position"
//	M bytes  word, UTF-16LE
//	6 bytes  zero bytes terminating the entry
//
// The header is one leading byte (written as 0) followed by 27 little-endian
// 4-byte running totals of entry sizes, keyed by the first byte of the code
// for the byte values 0x60 through 0x7A. Entries whose code starts with any
// other byte are written to the table but do not contribute to the header.
//
// Entries that the format cannot represent are skipped instead of corrupting
// the output: an empty code, a code longer than 255 bytes, a word whose
// UTF-16 length plus 2 exceeds 255, or a word the encoder rejects.
func GenBaiduDef(dl []codeAndWords) []byte {
    const headerSize = 0x6D // 1 leading byte + 27 groups of 4 bytes
    var terminator [6]byte  // six zero bytes closing every entry

    var buf bytes.Buffer
    buf.Write(make([]byte, headerSize)) // header placeholder, filled in below

    // Accumulated entry sizes keyed by the first byte of the written code.
    lengthMap := make(map[byte]int)

    encoder := unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewEncoder()
    for _, v := range dl {
        if v.code == "" {
            // No first byte to index the header with, and nothing meaningful
            // to store as a code.
            continue
        }
        for i, word := range v.words {
            code := v.code
            if i != 0 {
                // Non-first candidates carry their 1-based position.
                code = v.code + "=" + strconv.Itoa(i+1)
            }
            sliWord, err := encoder.Bytes([]byte(word))
            if err != nil {
                continue
            }
            // Both lengths are stored in a single byte; a larger value would
            // wrap around and desynchronize any reader of the file.
            if len(code) > 0xFF || len(sliWord)+2 > 0xFF {
                continue
            }

            buf.WriteByte(byte(len(code)))
            buf.WriteByte(byte(len(sliWord) + 2))
            buf.WriteString(code)
            buf.Write(sliWord)
            buf.Write(terminator[:])

            // code length + word length field (bytes + 2) + terminator. This
            // equals the number of bytes just written for the entry.
            lengthMap[code[0]] += len(code) + len(sliWord) + 2 + len(terminator)
        }
    }

    // Fill in the header directly in the output buffer.
    ret := buf.Bytes()
    ret[0] = 0 // meaning unclear; possibly the maximum code length
    var total int
    for i := 0; i <= 26; i++ {
        total += lengthMap[byte(i+0x60)]
        off := 1 + 4*i
        // Little-endian 32-bit running total.
        ret[off] = byte(total)
        ret[off+1] = byte(total >> 8)
        ret[off+2] = byte(total >> 16)
        ret[off+3] = byte(total >> 24)
    }
    return ret
}