python使用gensim訓練搜狗語料的LDA
阿新 • 發佈:2019-02-16
# -*- coding: utf-8 -*-
"""Train and inspect an LDA topic model over the Sogou news corpus with gensim.

Pipeline: read raw documents -> jieba segmentation + stopword/low-frequency
filtering -> gensim dictionary + bag-of-words corpus -> TF-IDF -> 9-topic LDA.
Artifacts are persisted under tmp/ (dictionary, MM corpus, LDA model).

Fixed relative to the original: Python 3 print() calls, removal of the
Python 2 ``reload(sys)``/``setdefaultencoding`` hack, files opened with
``with``, the undefined ``dictionary`` NameError in ``test1()``, and the
stopword list in ``test_lda()`` not being stripped of line endings.
"""
import codecs
import os
from collections import defaultdict
from pprint import pprint

import jieba
from gensim import corpora, models, similarities


def load_data():
    """Read every file under the corpus directory into a list of raw strings.

    Returns:
        list[str]: one decoded document per file found under the corpus root.
    """
    documents = []
    for root, dirs, files in os.walk('D:/dev_data/sogou'):
        for name in files:
            # errors='ignore' drops undecodable bytes instead of raising.
            with codecs.open(os.path.join(root, name), 'r', 'utf-8', 'ignore') as f:
                documents.append(f.read())
    return documents


def _load_stoplist():
    """Load the stopword file into a set of stripped tokens (shared helper)."""
    with codecs.open('tmp/stopword.txt', 'r', encoding='utf8') as f:
        return set(w.strip() for w in f)


def preprocess(documents):
    """Tokenize documents, drop stopwords and low-frequency tokens, then build
    and persist the gensim dictionary and bag-of-words corpus.

    Args:
        documents: list of raw document strings.

    Returns:
        tuple: (corpus, dictionary) where corpus is a list of BoW vectors.
    """
    stoplist = _load_stoplist()
    # Segment with jieba in full mode; tabs/newlines are removed first so
    # they cannot appear inside tokens.
    texts = [[word
              for word in jieba.cut(document.replace('\t', '').replace('\n', ''),
                                    cut_all=True)
              if word not in stoplist]
             for document in documents]
    # Drop tokens occurring 2 times or fewer across the whole corpus.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 2] for text in texts]
    dictionary = corpora.Dictionary(texts)
    dictionary.save('tmp/sogou.dict')
    print(dictionary)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('tmp/sogou.mm', corpus)
    return corpus, dictionary


def train_lda(corpus, dictionary):
    """Train a 9-topic LDA model on the TF-IDF-weighted corpus and save it."""
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=9)
    lda.save('tmp/sogou_lda.model')


def load_lda():
    """Load the saved LDA model and print its first four topics."""
    lda = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')
    for i in range(4):
        print(lda.print_topic(i))


def test_lda():
    """Infer and print the topic distribution of an unseen document."""
    lda_model = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')
    dictionary = corpora.Dictionary.load('tmp/sogou.dict')
    # FIX: strip line endings so stopwords actually match tokens (the
    # original kept the trailing '\n', making the filter a no-op).
    stoplist = _load_stoplist()
    unseen_document = """
    在本賽季的這三場比賽中,騎士三戰皆勝。值得一提的是,全場比賽騎士三分線外46投25中,打破NBA常規賽單場比賽單支球隊三分球命中數紀錄。
    """
    # Collapse all whitespace before segmentation.
    d = "".join(unseen_document.split())
    print("The unseen document is composed by the following text:", unseen_document)
    print()
    text = [word for word in jieba.cut(d, cut_all=True) if word not in stoplist]
    bow_vector = dictionary.doc2bow(text)
    for i in range(0, 9):
        print(lda_model.print_topic(i))
    print(lda_model[bow_vector])
    # Topics sorted by descending score.
    for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1 * tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 3)))


def print_lda():
    """Print the top-10 words of each of the 9 trained topics."""
    lda_model = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')
    for i in range(0, 9):
        print(i, lda_model.print_topic(i, 10))


def train():
    """Full pipeline: load corpus, preprocess, train and persist the model."""
    documents = load_data()
    corpus, dictionary = preprocess(documents)
    train_lda(corpus, dictionary)


def test():
    """Sanity-check the persisted dictionary."""
    # load_lda()
    # test_lda()
    dictionary = corpora.Dictionary.load('tmp/sogou.dict')
    print(dictionary[10])
    print(len(dictionary))
    print(dictionary)


def test1():
    """Infer the topic distribution of a second sample document."""
    lda = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')
    # FIX: the original referenced `dictionary` without ever defining it
    # (NameError); load the persisted dictionary explicitly.
    dictionary = corpora.Dictionary.load('tmp/sogou.dict')
    test_doc = """
    中華網總經理陳曉薇表示,該公司將在今年首季推出生活頻道及重建英語頻道,並著手發展與其他國家及知名企業合作的資訊網頁,此外在5月份,中華網推出針對內地專業人士的娛樂內容,作為將來3G手機內容的供應來源。(英寧)
    """
    test_doc = list(jieba.cut(test_doc))       # segment the new document
    doc_bow = dictionary.doc2bow(test_doc)     # convert to bag-of-words
    doc_lda = lda[doc_bow]                     # topic distribution of the doc
    print(doc_lda)
    for topic in doc_lda:
        print("%s\t%f\n" % (lda.print_topic(topic[0]), topic[1]))


if __name__ == '__main__':
    # train()
    print_lda()