英文token預處理,用於將英文句子處理成單詞
阿新 • 發佈:2018-11-23
參考
https://github.com/google-research/bert/blob/master/tokenization.py
使用
"""Build a vocabulary file from an English text corpus.

Tokenizes each line of ``sample_text.txt`` with BERT's BasicTokenizer and
writes the standard special tokens followed by the unique words to
``vocab.txt`` (one token per line).

Reference: https://github.com/google-research/bert/blob/master/tokenization.py
"""
import tokenization

# Special tokens BERT expects at the top of every vocab file, in this order.
SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

tokenizer = tokenization.BasicTokenizer(do_lower_case=True)

# Collect the unique tokens from the corpus. Stream the file line by line
# instead of readlines() so large corpora are not loaded fully into memory;
# the context manager guarantees the file is closed even on error.
word_set = set()
with open("sample_text.txt", mode="r", encoding="utf-8") as f:
    for line in f:
        word_set.update(tokenizer.tokenize(line))

# Drop empty/whitespace-only tokens; sort so vocab.txt is deterministic
# (iterating a raw set gives an arbitrary order that changes between runs).
words = sorted(w for w in word_set if w not in ("", " "))

with open("vocab.txt", mode="w", encoding="utf-8") as out:
    # Batch the writes: one generator per group instead of many tiny write() calls.
    out.writelines(token + "\n" for token in SPECIAL_TOKENS)
    out.writelines(word + "\n" for word in words)