Segmenting the Chinese Wikipedia corpus with Python
阿新 • Published 2018-12-16
In the previous post the Chinese Wikipedia corpus was downloaded and converted to txt format (see "wiki中文文字語料下載並處理 ubuntu + python2.7", i.e. downloading and processing the Chinese Wikipedia text corpus on Ubuntu with Python 2.7). This post segments that txt file, since the corpus must be word-segmented before it can be used to train word vectors with word2vec. The segmentation script (using jieba) is:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')  # Python 2: allow implicit UTF-8 conversions

import io
import jieba

# Load the stop-word list (one word per line, GB18030-encoded)
stop_words_file = "stop_words.txt"
stop_words = list()
with io.open(stop_words_file, 'r', encoding="gb18030") as stop_words_file_object:
    for line in stop_words_file_object:
        stop_words.append(line.strip())

# Segment the wiki corpus line by line with jieba, drop stop words and
# empty tokens, and append the result to wiki_seg.txt
data_file = 'wiki.txt'
i = 1
with io.open(data_file, 'r', encoding='utf-8') as content:
    for line in content:
        seg_list = jieba.cut(line)
        out_str = ''
        for word in seg_list:
            if word not in stop_words and word.strip() != "":
                out_str += word + ' '
        print 'fenci: ' + str(i)
        i += 1
        with io.open('wiki_seg.txt', 'a', encoding='utf-8') as output:
            output.write(unicode(out_str))
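As a quick check of the segmenter itself, jieba.cut returns a generator of tokens; the sentence and the output shown in the comment below are only illustrative, since the exact split depends on jieba's dictionary version:

# -*- coding: utf-8 -*-
import jieba

sample = u'自然語言處理是人工智慧的一個分支'
print ' '.join(jieba.cut(sample))  # e.g. 自然語言 處理 是 人工智慧 的 一個 分支

One practical note: stop_words is checked once per token over the whole corpus, so converting it to a set (stop_words = set(stop_words)) makes the membership test constant-time and noticeably speeds up the run on a file this size.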
The script finally writes out wiki_seg.txt. Because the source file is large (1.8 GB), the program takes a long time to run; I no longer remember exactly how long. To see what the segmented file looks like, the script below prints its first 200 lines:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')

import io

# Print the first 200 lines of the segmented corpus as a quick check;
# iterate the file directly instead of readlines() so the 1.8 GB file
# is not loaded into memory all at once
words_file = "wiki_seg.txt"
i = 0
with io.open(words_file, 'r', encoding="utf-8") as words_file_object:
    for line in words_file_object:
        print line
        i += 1
        if i == 200:
            break
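With wiki_seg.txt in hand, the next step is training the word vectors. The following is a minimal sketch using gensim (assumed installed); the input file name is the one produced above, but the output name wiki_word2vec.model and the hyperparameters are placeholders to be tuned, and with gensim 4+ the size parameter is named vector_size:

# -*- coding: utf-8 -*-
import logging
from gensim.models import word2vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# LineSentence streams the space-separated corpus line by line, so the
# 1.8 GB file never has to fit in memory at once
sentences = word2vec.LineSentence('wiki_seg.txt')

# Illustrative hyperparameters: 250-dimensional vectors, ignore words
# that appear fewer than 5 times
model = word2vec.Word2Vec(sentences, size=250, min_count=5)

model.save('wiki_word2vec.model')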