
Python: segmenting the Chinese Wikipedia corpus

In the previous post (wiki中文文字語料下載並處理 ubuntu + python2.7), the Chinese Wikipedia corpus was downloaded and converted to txt format. This post segments that txt file into words, which is a prerequisite for training word vectors with word2vec. The segmentation script (using jieba) is:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')

import io
import jieba

# Load the stop-word list, one word per line.
stop_words_file = "stop_words.txt"
stop_words = list()
with io.open(stop_words_file, 'r', encoding="gb18030") as stop_words_file_object:
  for line in stop_words_file_object:
    stop_words.append(line.strip())

# Segment wiki.txt line by line and write space-separated tokens to wiki_seg.txt.
data_file = 'wiki.txt'
i = 1
with io.open(data_file, 'r', encoding='utf-8') as content, \
     io.open('wiki_seg.txt', 'w', encoding='utf-8') as output:
  for line in content:
    # Cut the line with jieba, dropping stop words and empty tokens.
    seg_list = jieba.cut(line)
    out_str = ''
    for word in seg_list:
      if word not in stop_words and word.strip() != "":
        out_str += word + ' '
    print 'fenci:' + str(i)
    i += 1
    # Write one segmented line per input line.
    output.write(unicode(out_str) + u'\n')
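
To get a feel for what jieba.cut returns for a single line, here is a quick interpreter-level check (the sample sentence is my own, purely for illustration):

# -*- coding: utf-8 -*-
import jieba

# jieba.cut returns a generator of unicode tokens; joining them with spaces
# gives the same one-line format the script above writes to wiki_seg.txt.
sentence = u'數學是研究數量、結構以及空間等概念的一門學科'
print u' '.join(jieba.cut(sentence))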

The script finally produces wiki_seg.txt. Because the file is large (about 1.8 GB), the program takes a long time to run; I no longer remember exactly how long. The segmented file, printed out (the short script below shows its first 200 lines), looks like this:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')

import io

# Print the first 200 lines of the segmented file as a quick sanity check,
# reading it line by line rather than loading the whole file into memory.
words_file = "wiki_seg.txt"
i = 0
with io.open(words_file, 'r', encoding="utf-8") as words_file_object:
  for line in words_file_object:
    print line
    i += 1
    if i == 200:
      break

(Screenshot of the segmented output.)
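
With wiki_seg.txt in hand, the next step is training the word vectors. As a rough sketch only (the size/window/min_count values below are illustrative, not the author's settings), gensim's word2vec can read the space-separated file directly through LineSentence:

# -*- coding: utf-8 -*-
# Minimal word2vec training sketch on the segmented corpus.
# Assumes gensim is installed; size/window/min_count are illustrative values.
import logging
from gensim.models import word2vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# LineSentence treats each line as one sentence of space-separated tokens,
# which matches the format produced by the segmentation script above.
sentences = word2vec.LineSentence('wiki_seg.txt')
model = word2vec.Word2Vec(sentences, size=250, window=5, min_count=5, workers=4)
# (In gensim >= 4.0 the parameter is vector_size instead of size.)
model.save('wiki_word2vec.model')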