1. 程式人生 > >python使用gensim訓練搜狗語料的LDA

python使用gensim訓練搜狗語料的LDA

# -*- coding: utf-8 -*-
import jieba, os
import codecs
from gensim import corpora, models, similarities
from pprint import pprint
from collections import defaultdict
import sys
# Python 2 only: reload(sys) is called first so that sys.setdefaultencoding
# can be invoked, then UTF-8 is made the process-wide default codec for
# implicit str <-> unicode conversions.
# NOTE(review): this hack does not exist on Python 3 (no builtin reload, no
# sys.setdefaultencoding) — confirm the target interpreter before porting.
reload(sys)
sys.setdefaultencoding('utf-8')

def load_data(data_dir='D:/dev_data/sogou'):
    """Read every file below ``data_dir`` (recursively) into memory.

    Args:
        data_dir: Root directory to walk. Defaults to the corpus location
            that used to be hard-coded, so existing ``load_data()`` calls
            behave as before.

    Returns:
        A list with one unicode string per file, in ``os.walk`` order.
        Files are decoded as UTF-8; undecodable bytes are dropped
        (``errors='ignore'``).
    """
    documents = []
    for root, _dirs, files in os.walk(data_dir):
        for name in files:
            path = os.path.join(root, name)
            # Use a context manager so each handle is closed right away;
            # a large corpus would otherwise leave many files open.
            with codecs.open(path, 'r', 'utf-8', 'ignore') as handle:
                documents.append(handle.read())
    return documents

def preprocess(documents):
    """Tokenize the documents, filter tokens, and persist dictionary + corpus.

    Each document has tabs and newlines removed and is segmented with jieba
    in full mode (``cut_all=True``). Tokens listed in ``tmp/stopword.txt``
    are dropped, then tokens occurring at most twice across all documents
    are dropped. A gensim ``Dictionary`` and a bag-of-words corpus are built
    from what remains.

    Side effects:
        Writes ``tmp/sogou.dict`` and ``tmp/sogou.mm`` and prints the
        dictionary summary.

    Args:
        documents: Iterable of document strings.

    Returns:
        Tuple ``(corpus, dictionary)`` where ``corpus`` is a list of
        bag-of-words vectors and ``dictionary`` is the gensim Dictionary.
    """
    # Close the stop-word file deterministically instead of leaking the handle.
    with codecs.open('tmp/stopword.txt', 'r', encoding='utf8') as stop_file:
        stoplist = set(line.strip() for line in stop_file)

    # Segment each document and remove stop words.
    texts = []
    for document in documents:
        flattened = document.replace('\t', '').replace('\n', '')
        texts.append([word for word in jieba.cut(flattened, cut_all=True)
                      if word not in stoplist])

    # Count corpus-wide token frequencies so rare tokens can be removed.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # Keep only tokens seen more than twice in the whole corpus.
    texts = [[token for token in text if frequency[token] > 2]
             for text in texts]

    dictionary = corpora.Dictionary(texts)
    dictionary.save('tmp/sogou.dict')
    print(dictionary)

    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('tmp/sogou.mm', corpus)
    return corpus, dictionary


def train_lda(corpus, dictionary, num_topics=9):
    """Fit an LDA model on the TF-IDF-weighted corpus and save it to disk.

    Args:
        corpus: Bag-of-words corpus; it is re-weighted with a TF-IDF model
            fitted on the same corpus before training.
        dictionary: Passed to ``LdaModel`` as ``id2word``.
        num_topics: Number of topics to learn. Defaults to 9, the value
            that was previously hard-coded.

    Side effects:
        Writes the trained model to ``tmp/sogou_lda.model``.
    """
    tfidf = models.TfidfModel(corpus)
    # NOTE(review): training uses TF-IDF weights, while the inference helpers
    # in this file feed raw bag-of-words vectors to the model — confirm that
    # this mismatch is intended.
    lda = models.LdaModel(tfidf[corpus], id2word=dictionary,
                          num_topics=num_topics)
    # Persist the model so it can be reloaded with LdaModel.load().
    lda.save('tmp/sogou_lda.model')


def load_lda():
    """Load the saved LDA model and print its first four topics."""
    model = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')
    for topic_id in range(4):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(model.print_topic(topic_id))

def test_lda():
    """Infer and print the topic mixture of a hard-coded unseen document.

    Loads the model and dictionary saved under ``tmp/``, segments the sample
    text with jieba in full mode, removes stop words, converts the tokens to
    a bag-of-words vector, then prints every topic, the raw topic
    distribution, and the document's topics ordered by descending score.
    """
    lda_model = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')
    dictionary = corpora.Dictionary.load('tmp/sogou.dict')
    with codecs.open('tmp/stopword.txt', 'r', encoding='utf8') as stop_file:
        # Strip line endings: raw readlines() entries end in a newline and
        # would never equal a token, so no stop word was ever filtered out.
        stoplist = set(line.strip() for line in stop_file)
    unseen_document = """
      在本賽季的這三場比賽中,騎士三戰皆勝。值得一提的是,全場比賽騎士三分線外46投25中,打破NBA常規賽單場比賽單支球隊三分球命中數紀錄。
        """
    # Collapse all whitespace before segmentation.
    d = "".join(unseen_document.split())
    print("The unseen document is composed by the following text: %s"
          % unseen_document)
    print("")
    text = [word for word in jieba.cut(d, cut_all=True)
            if word not in stoplist]

    bow_vector = dictionary.doc2bow(text)
    # Ask the model for its topic count instead of assuming nine topics.
    for i in range(lda_model.num_topics):
        print(lda_model.print_topic(i))
    # Run inference once so the raw and the sorted output describe the same
    # result.
    doc_topics = lda_model[bow_vector]
    print(doc_topics)
    for index, score in sorted(doc_topics, key=lambda tup: -1 * tup[1]):
        print("Score: {}\t Topic: {}".format(
            score, lda_model.print_topic(index, 3)))

def print_lda():
    """Print the top ten words of every topic in the saved LDA model.

    Each line is ``<topic id> <topic description>``. Topic 0 is printed a
    second time after the loop.
    """
    lda_model = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')
    # Use the model's own topic count rather than a hard-coded 9, so a model
    # saved with a different number of topics is still listed in full.
    for topic_id in range(lda_model.num_topics):
        print("%s %s" % (topic_id, lda_model.print_topic(topic_id, 10)))

    # Repeats topic 0 after the loop; kept so the script's output is unchanged.
    print("%s %s" % (0, lda_model.print_topic(0, 10)))

def train():
    """Run the whole pipeline: load raw files, preprocess them, train LDA."""
    bow_corpus, vocab = preprocess(load_data())
    train_lda(bow_corpus, vocab)

def test():
    """Load the saved dictionary and print entry 10, its size, and its summary."""
    # load_lda()
    # test_lda()
    vocab = corpora.Dictionary.load('tmp/sogou.dict')
    print(vocab[10])
    print(len(vocab))
    print(vocab)

def test1():
    """Print the topic distribution of a hard-coded sample document.

    Loads the saved model and dictionary from ``tmp/``, segments the sample
    text with jieba, converts it to a bag-of-words vector, and prints the
    inferred distribution followed by one ``<topic>\\t<score>`` line per
    topic.
    """
    lda = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')
    # The dictionary was never defined in this function or at module level,
    # so doc2bow() raised NameError; load the one saved during preprocessing.
    dictionary = corpora.Dictionary.load('tmp/sogou.dict')
    test_doc = """
          中華網總經理陳曉薇表示,該公司將在今年首季推出生活頻道及重建英語頻道,並著手發展與其他國家及知名企業合作的資訊網頁,此外在5月份,中華網推出針對內地專業人士的娛樂內容,作為將來3G手機內容的供應來源。(英寧)
            """
    test_doc = list(jieba.cut(test_doc))  # segment the new document
    doc_bow = dictionary.doc2bow(test_doc)  # convert tokens to bag-of-words
    doc_lda = lda[doc_bow]  # topic distribution of the new document
    # Print the distribution, then each topic with its score.
    print(doc_lda)
    for topic in doc_lda:
        print("%s\t%f\n" % (lda.print_topic(topic[0]), topic[1]))



# Script entry point, executed whenever this module is run or imported.
# Uncomment train() to rebuild the dictionary, corpus and LDA model under
# tmp/; as written, only the topics of the already-saved model are printed.
#train()
print_lda()