1. 程式人生 > >製作文字識別訓練樣本的語料庫

製作文字識別訓練樣本的語料庫

從網上下載的小說 txt 檔案往往含有較多的換行符、空格等字元。這些字元在生成訓練樣本時不僅不需要,甚至會導致出錯,因此需要先對文字進行預處理:將所有行合併成一行,並刪除無用的字元。程式碼如下:

import random
# Load the raw novel text: one list entry per physical line, with the
# trailing newline characters still attached (they are stripped later).
with open('novels.txt', 'r', encoding='utf-8') as f:
    data = list(f)
    
# Clean every raw line and collect the ones worth keeping.
lines = []
for line in data:
    line_striped = line.strip()
    # Remove, in order: ideographic spaces, HTML non-breaking-space entities
    # (the full '&nbsp;' form first, so no orphan ';' is left behind, then
    # the bare '&nbsp' form), NUL characters, and ordinary spaces.
    for junk in ('\u3000', '&nbsp;', '&nbsp', "\00", " "):
        line_striped = line_striped.replace(junk, '')

    # Keep a line only if something is left after cleaning and the raw
    # stripped line was longer than a single character.
    if line_striped != '' and len(line.strip()) > 1:
        lines.append(line_striped)

# Merge all kept lines into one line. The separator is drawn once and the
# join is done once, after the loop: this avoids rebuilding the whole corpus
# for every kept line and guarantees `whole_line` exists (as an empty string)
# even when no line survives the filter.
split_chars = [',', ',', ':', '-', ' ', ';', '。']
splitchar = random.choice(split_chars)
whole_line = splitchar.join(lines)

print(len(set(whole_line)))  # number of distinct characters in the corpus
print(len(whole_line))       # total number of characters in the corpus

# Write the merged corpus to disk, printing its length before and after the
# final space-removal pass.
with open('novels_corpus.txt', 'w', encoding='utf-8') as r:
    chars = str(whole_line)
    print(len(chars))
    # Make sure no spaces remain. str.replace returns a new string, so the
    # result has to be assigned back; otherwise the call has no effect.
    # Note this also removes the line separator when it happens to be ' '.
    chars = chars.replace(' ', '')
    print(len(chars))
    r.write(chars)