python使用gensim訓練搜狗語料的LDA
阿新 • 發佈:2019-02-16
# -*- coding: utf-8 -*-
"""Train and inspect an LDA topic model over the Sogou news corpus with gensim.

Pipeline: read raw documents -> jieba segmentation + stopword/low-frequency
filtering -> gensim dictionary + bag-of-words corpus -> TF-IDF -> 9-topic LDA.
Artifacts are persisted under tmp/ (dictionary, MM corpus, LDA model).

Fixed relative to the original: Python 3 print() calls, removal of the
Python 2 ``reload(sys)``/``setdefaultencoding`` hack, files opened with
``with``, the undefined ``dictionary`` NameError in ``test1()``, and the
stopword list in ``test_lda()`` not being stripped of line endings.
"""
import codecs
import os
from collections import defaultdict
from pprint import pprint

import jieba
from gensim import corpora, models, similarities


def load_data():
    """Read every file under the corpus directory into a list of raw strings.

    Returns:
        list[str]: one decoded document per file found under the corpus root.
    """
    documents = []
    for root, dirs, files in os.walk('D:/dev_data/sogou'):
        for name in files:
            # errors='ignore' drops undecodable bytes instead of raising.
            with codecs.open(os.path.join(root, name), 'r', 'utf-8', 'ignore') as f:
                documents.append(f.read())
    return documents


def _load_stoplist():
    """Load the stopword file into a set of stripped tokens (shared helper)."""
    with codecs.open('tmp/stopword.txt', 'r', encoding='utf8') as f:
        return set(w.strip() for w in f)


def preprocess(documents):
    """Tokenize documents, drop stopwords and low-frequency tokens, then build
    and persist the gensim dictionary and bag-of-words corpus.

    Args:
        documents: list of raw document strings.

    Returns:
        tuple: (corpus, dictionary) where corpus is a list of BoW vectors.
    """
    stoplist = _load_stoplist()
    # Segment with jieba in full mode; tabs/newlines are removed first so
    # they cannot appear inside tokens.
    texts = [[word
              for word in jieba.cut(document.replace('\t', '').replace('\n', ''),
                                    cut_all=True)
              if word not in stoplist]
             for document in documents]
    # Drop tokens occurring 2 times or fewer across the whole corpus.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 2] for text in texts]
    dictionary = corpora.Dictionary(texts)
    dictionary.save('tmp/sogou.dict')
    print(dictionary)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('tmp/sogou.mm', corpus)
    return corpus, dictionary


def train_lda(corpus, dictionary):
    """Train a 9-topic LDA model on the TF-IDF-weighted corpus and save it."""
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=9)
    lda.save('tmp/sogou_lda.model')


def load_lda():
    """Load the saved LDA model and print its first four topics."""
    lda = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')
    for i in range(4):
        print(lda.print_topic(i))


def test_lda():
    """Infer and print the topic distribution of an unseen document."""
    lda_model = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')
    dictionary = corpora.Dictionary.load('tmp/sogou.dict')
    # FIX: strip line endings so stopwords actually match tokens (the
    # original kept the trailing '\n', making the filter a no-op).
    stoplist = _load_stoplist()
    unseen_document = """
    在本賽季的這三場比賽中,騎士三戰皆勝。值得一提的是,全場比賽騎士三分線外46投25中,打破NBA常規賽單場比賽單支球隊三分球命中數紀錄。
    """
    # Collapse all whitespace before segmentation.
    d = "".join(unseen_document.split())
    print("The unseen document is composed by the following text:", unseen_document)
    print()
    text = [word for word in jieba.cut(d, cut_all=True) if word not in stoplist]
    bow_vector = dictionary.doc2bow(text)
    for i in range(0, 9):
        print(lda_model.print_topic(i))
    print(lda_model[bow_vector])
    # Topics sorted by descending score.
    for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1 * tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 3)))


def print_lda():
    """Print the top-10 words of each of the 9 trained topics."""
    lda_model = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')
    for i in range(0, 9):
        print(i, lda_model.print_topic(i, 10))


def train():
    """Full pipeline: load corpus, preprocess, train and persist the model."""
    documents = load_data()
    corpus, dictionary = preprocess(documents)
    train_lda(corpus, dictionary)


def test():
    """Sanity-check the persisted dictionary."""
    # load_lda()
    # test_lda()
    dictionary = corpora.Dictionary.load('tmp/sogou.dict')
    print(dictionary[10])
    print(len(dictionary))
    print(dictionary)


def test1():
    """Infer the topic distribution of a second sample document."""
    lda = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')
    # FIX: the original referenced `dictionary` without ever defining it
    # (NameError); load the persisted dictionary explicitly.
    dictionary = corpora.Dictionary.load('tmp/sogou.dict')
    test_doc = """
    中華網總經理陳曉薇表示,該公司將在今年首季推出生活頻道及重建英語頻道,並著手發展與其他國家及知名企業合作的資訊網頁,此外在5月份,中華網推出針對內地專業人士的娛樂內容,作為將來3G手機內容的供應來源。(英寧)
    """
    test_doc = list(jieba.cut(test_doc))       # segment the new document
    doc_bow = dictionary.doc2bow(test_doc)     # convert to bag-of-words
    doc_lda = lda[doc_bow]                     # topic distribution of the doc
    print(doc_lda)
    for topic in doc_lda:
        print("%s\t%f\n" % (lda.print_topic(topic[0]), topic[1]))


if __name__ == '__main__':
    # train()
    print_lda()