1. 程式人生 > >pythonNLP-文字相似度計算-Demo

pythonNLP-文字相似度計算-Demo

參照部落格[我愛自然語言處理]的「如何計算兩個文字的相似度」系列,自己把程式碼實現了一遍,對整個流程有了瞭解。本文純屬個人記錄;新手想學習,可直接去上述部落格閱讀,講得非常好。

程式碼

#-*- coding:utf-8
import gensim
from gensim import corpora, models, similarities
import traceback


# Toy corpus used by the demo: three short English sentences.
documents = [
    "Shipment of gold damaged in a fire",
    "Delivery of silver arrived in a silver truck",
    "Shipment of gold arrived in a truck",
]


def pre_process(documents):
    """Turn raw documents into token lists.

    Each document is lower-cased and split on whitespace; no stop-word
    removal or stemming is done here.

    @documents: iterable of document strings.
    @return: documents_token_list, one list of tokens per document, in input
             order.  Any other tokenizer can be used instead, as long as it
             produces the same list-of-token-lists shape.
    @raise: whatever the tokenization raises (for example AttributeError for
            a non-string document).  The traceback is printed first and the
            exception is then re-raised, so the caller never receives None.
    """
    try:
        documents_token_list = [document.lower().split() for document in documents]
    except Exception:
        # Keep the original diagnostic output, but do not swallow the error:
        # returning None here only moved the crash to the next pipeline step.
        traceback.print_exc()
        raise

    print("[INFO]: pre_process is finished!")
    return documents_token_list


# Author's notes on tf_idf_trainning (defined next):
#
# The function is fairly generic and can be combined with a hand-written
# pipeline.  It trains the tf-idf model from documents given as token lists.
#
# @texts: documents = [document1, document2, ...], document1 = token_list1
# @return: dictionary
#     The vector space (VSM) built from the texts.  It records the position
#     of every word, just like the author's own implementation: each word of
#     the space needs a recorded position, otherwise a document's words cannot
#     find their slot when the document is mapped into the space.
# @return: corpus_idf
#     The tf-idf representation of every document in the VSM.  The output is
#     not a plain vector of tf-idf values: position information is included,
#     and no zero-valued entries appear in the printed output even though the
#     space is the same, so presumably only the non-zero entries are emitted.
#
# Both return values differ from the author's own implementation because the
# dictionary and the tf-idf corpus carry position information, but they can
# be used directly to build LDA and LSI models quickly.
def tf_idf_trainning(documents_token_list):
    """Build the dictionary, the term-frequency corpus and the tf-idf corpus.

    @documents_token_list: list of documents, each a list of tokens.
    @return: (dictionary, corpus_tf, corpus_tfidf)
             dictionary   - gensim Dictionary built from all token lists.
             corpus_tf    - dictionary.doc2bow() result for every document.
             corpus_tfidf - corpus_tf transformed by the trained TfidfModel.
    @raise: any exception from gensim; the traceback is printed, then the
            exception is re-raised.
    """
    try:
        # Map the token lists of all documents onto one shared vector space.
        dictionary = corpora.Dictionary(documents_token_list)
        # Term-frequency (bag-of-words) representation of every document.
        corpus_tf = [dictionary.doc2bow(token_list) for token_list in documents_token_list]
        # Train the tf-idf model with corpus_tf as features.
        tf_idf_model = models.TfidfModel(corpus_tf)
        # tf-idf representation of every document.
        corpus_tfidf = tf_idf_model[corpus_tf]
    except Exception:
        traceback.print_exc()
        raise

    print("[INFO]: tf_idf_trainning is finished!")
    return dictionary, corpus_tf, corpus_tfidf


def lsi_trainning(dictionary, corpus_tfidf, K):
    """Train an LSI model with K topics.

    @dictionary: gensim Dictionary, passed to the model as id2word.
    @corpus_tfidf: corpus used as training features.
    @K: number of topics.
    @return: (lsi_model, corpus_lsi); corpus_lsi is corpus_tfidf transformed
             by the trained model.
    @raise: any exception from gensim; the traceback is printed, then the
            exception is re-raised.
    """
    try:
        # Train the LSI model with the tf-idf vectors as features.
        lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=K)
        # Representation of every document in the K-dimensional topic space.
        corpus_lsi = lsi_model[corpus_tfidf]
    except Exception:
        traceback.print_exc()
        raise

    print("[INFO]: lsi_trainning is finished!")
    return lsi_model, corpus_lsi


def lda_trainning(dictionary, corpus_tfidf, K):
    """Train an LDA model with K topics and print every document's topics.

    @dictionary: gensim Dictionary, passed to the model as id2word.
    @corpus_tfidf: corpus used as training features.  Despite the parameter
                   name, pass the same kind of vectors that will later be fed
                   to the model (the driver below passes corpus_tf).
    @K: number of topics.
    @return: (lda_model, corpus_lda); corpus_lda is the training corpus
             transformed by the trained model.
    @raise: any exception from gensim; the traceback is printed, then the
            exception is re-raised.
    """
    try:
        lda_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=K)
        # Representation of every document in the K-dimensional topic space.
        corpus_lda = lda_model[corpus_tfidf]
        for document_topics in corpus_lda:
            print(document_topics)
    except Exception:
        traceback.print_exc()
        raise

    print("[INFO]: lda_trainning is finished!")
    return lda_model, corpus_lda


def similarity(query, dictionary, corpus_tf, lda_model):
    """Print and return the similarity of the query to every corpus document.

    @query: raw query string; it is lower-cased and split on whitespace.
    @dictionary: gensim Dictionary used to turn the query into a bag of words.
    @corpus_tf: term-frequency corpus that is indexed in topic space.
    @lda_model: trained topic model used to project corpus and query.
    @return: list with one similarity score per document, in corpus order.
    @raise: any exception from gensim; the traceback is printed, then the
            exception is re-raised.
    """
    try:
        # Build the index.  num_features is given explicitly because topic
        # vectors are sparse: if the highest topic id never shows up in the
        # corpus, the inferred width would be too small for some queries.
        index = similarities.MatrixSimilarity(
            lda_model[corpus_tf], num_features=lda_model.num_topics
        )
        # Term-frequency representation of the query in the dictionary space.
        query_bow = dictionary.doc2bow(query.lower().split())
        # Representation of the query in the K-dimensional topic space.
        query_lda = lda_model[query_bow]
        # Similarity of the query against every indexed document.
        simi = index[query_lda]
        query_simi_list = list(simi)
    except Exception:
        traceback.print_exc()
        raise

    print(query_simi_list)
    return query_simi_list


# Demo driver.  `documents` and `pre_process` are defined earlier in the script.
documents_token_list = pre_process(documents)
dictionary, corpus_tf, corpus_tfidf = tf_idf_trainning(documents_token_list)
# lsi_model, corpus_lsi = lsi_trainning(dictionary, corpus_tfidf, 2)
# LDA is trained on the term-frequency corpus so that training, indexing and
# querying in similarity() all use the same kind of vectors.
lda_model, corpus_lda = lda_trainning(dictionary, corpus_tf, 2)
similarity("Shipment of gold arrived in a truck", dictionary, corpus_tf, lda_model)

程式碼

#-*- coding:utf-8
import traceback
from collections import Counter

from gensim import corpora, models, similarities
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import word_tokenize

'''
------------------------------------------------------------
函式宣告
'''

# Pre-processing
def pre_process(PATH):
    """Load the course corpus and turn every course into a list of stems.

    The file at PATH is read as UTF-8, one course per line; the course name
    is the text before the first tab.  Each line is tokenized with nltk's
    word_tokenize, lower-cased, stripped of English stop words and of the
    punctuation tokens listed below, stemmed with LancasterStemmer, and
    finally stripped of stems that occur only once in the whole corpus.

    @PATH: path of the corpus file.
    @return: (texts, courses_copy, courses_name)
             texts        - one list of stems per course.
             courses_copy - copy of the stripped, decoded corpus lines.
             courses_name - first tab-separated field of every line.
    @raise: any exception from file access, decoding or nltk; the traceback
            is printed, then the exception is re-raised so the caller never
            receives None.
    """
    try:
        # Course information.  The file is closed deterministically and the
        # bytes are decoded exactly once, at the I/O boundary.
        with open(PATH, "rb") as corpus_file:
            courses = [raw_line.strip().decode("utf-8") for raw_line in corpus_file]
        courses_copy = list(courses)
        courses_name = [course.split('\t')[0] for course in courses]

        # Tokenize and lower-case.
        texts_tokenized = [
            [word.lower() for word in word_tokenize(document)]
            for document in courses
        ]

        # Remove stop words (a set gives constant-time membership tests).
        english_stopwords = frozenset(stopwords.words('english'))
        texts_filtered_stopwords = [
            [word for word in document if word not in english_stopwords]
            for document in texts_tokenized
        ]

        # Remove punctuation tokens.
        english_punctuations = frozenset([
            ',', '.', ':', ';', '?', '(', ')', '[', ']',
            '&', '!', '*', '@', '#', '$', '%',
        ])
        texts_filterd = [
            [word for word in document if word not in english_punctuations]
            for document in texts_filtered_stopwords
        ]

        # Stemming.
        st = LancasterStemmer()
        texts_stemmed = [[st.stem(word) for word in document] for document in texts_filterd]

        # Remove low-frequency stems: drop every stem that occurs exactly once
        # in the whole corpus.  Counting in a single pass avoids the quadratic
        # list concatenation and repeated list.count() scans.
        stem_counts = Counter(stem for text in texts_stemmed for stem in text)
        texts = [
            [word for word in text if stem_counts[word] > 1]
            for text in texts_stemmed
        ]
    except Exception:
        traceback.print_exc()
        raise

    print("[INFO]: pre_process is finished!")
    return texts, courses_copy, courses_name

# Train the tf-idf model
def tf_idf_trainning(documents_token_list):
    """Build the dictionary, the term-frequency corpus and the tf-idf corpus.

    @documents_token_list: list of documents, each a list of tokens.
    @return: (dictionary, corpus_tf, corpus_tfidf)
             dictionary   - gensim Dictionary built from all token lists.
             corpus_tf    - dictionary.doc2bow() result for every document.
             corpus_tfidf - corpus_tf transformed by the trained TfidfModel.
    @raise: any exception from gensim; the traceback is printed, then the
            exception is re-raised so the caller never receives None.
    """
    try:
        # Map the token lists of all documents onto one shared vector space.
        dictionary = corpora.Dictionary(documents_token_list)

        # Term-frequency (bag-of-words) representation of every document.
        corpus_tf = [dictionary.doc2bow(token_list) for token_list in documents_token_list]

        # Train the tf-idf model with corpus_tf as features.
        tf_idf_model = models.TfidfModel(corpus_tf)

        # tf-idf representation of every document.
        corpus_tfidf = tf_idf_model[corpus_tf]
    except Exception:
        traceback.print_exc()
        raise

    print("[INFO]: tf_idf_trainning is finished!")
    return dictionary, corpus_tf, corpus_tfidf

# Train the LDA model
def lda_trainning( dictionary, corpus_tfidf, K ):
    """Train an LDA model with K topics.

    @dictionary: gensim Dictionary, passed to the model as id2word.
    @corpus_tfidf: corpus used as training features.  Despite the parameter
                   name, the script in this file passes the term-frequency
                   corpus (corpus_tf) here.
    @K: number of topics.
    @return: (lda_model, corpus_lda); corpus_lda is the training corpus
             transformed by the trained model.
    @raise: any exception from gensim; the traceback is printed, then the
            exception is re-raised so the caller never receives None.
    """
    try:
        # Train the LDA model on the given corpus.
        lda_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=K)

        # Representation of every document in the K-dimensional topic space.
        corpus_lda = lda_model[corpus_tfidf]
    except Exception:
        traceback.print_exc()
        raise

    print("[INFO]: lda_trainning is finished!")
    return lda_model, corpus_lda

# Similarity computation based on the LDA model
def similarity( query, dictionary, corpus_tf, lda_model, top_n=10 ):
    """Print and return the corpus documents most similar to the query.

    @query: raw query string; it is lower-cased and split on whitespace.
            NOTE: no stop-word removal or stemming is applied to the query,
            so only tokens that already match dictionary entries contribute.
    @dictionary: gensim Dictionary used to turn the query into a bag of words.
    @corpus_tf: term-frequency corpus that is indexed in topic space.
    @lda_model: trained topic model used to project corpus and query.
    @top_n: how many of the best matches to report (default 10, the value
            that used to be hard-coded).
    @return: list of (document_index, similarity) pairs, best match first,
             at most top_n entries.
    @raise: any exception from gensim; the traceback is printed, then the
            exception is re-raised.
    """
    try:
        # Build the index.  num_features is given explicitly because topic
        # vectors are sparse: if the highest topic id never shows up in the
        # corpus, the inferred width would be too small for some queries.
        index = similarities.MatrixSimilarity(
            lda_model[corpus_tf], num_features=lda_model.num_topics
        )

        # Term-frequency representation of the query in the dictionary space.
        query_bow = dictionary.doc2bow(query.lower().split())

        # Representation of the query in the K-dimensional topic space.
        query_lda = lda_model[query_bow]

        # Similarity against every document, sorted by descending score.
        simi = index[query_lda]
        sort_simi = sorted(enumerate(simi), key=lambda item: -item[1])
    except Exception:
        traceback.print_exc()
        raise

    top_matches = sort_simi[0:top_n]
    print(top_matches)
    return top_matches


# ------------------------------------------------------------
# Constants
# ------------------------------------------------------------

# Location of the course corpus, relative to the working directory.
PATH = "../../data/coursera/coursera_corpus"
# Number of LDA topics to train.
number_of_topics = 10

# ------------------------------------------------------------
# Pipeline: pre-process -> dictionary / tf / tf-idf -> LDA -> similarity query
# ------------------------------------------------------------

texts, courses, courses_name = pre_process(PATH)
# Named `dictionary` rather than `dict` so the builtin type is not shadowed.
dictionary, corpus_tf, corpus_tfidf = tf_idf_trainning(texts)
lda_model, corpus_lda = lda_trainning(dictionary, corpus_tf, number_of_topics)

# Use the course at index 210 as the demo query; this raises IndexError when
# the corpus has fewer than 211 lines.
similarity(courses[210], dictionary, corpus_tf, lda_model)