
Computing the Cosine Similarity of Two Documents (TF-IDF)
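The script below turns each document into a TF-IDF weight vector and then measures the angle between two such vectors. For reference, the cosine similarity of vectors $A$ and $B$ is the standard

$$\cos(A, B) = \frac{\sum_i a_i b_i}{\sqrt{\sum_i a_i^2}\,\sqrt{\sum_i b_i^2}}$$

which is exactly what the coutcos function at the end computes by hand.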

# -*- coding:utf-8 -*-
"""
@author: Linlifang
"""
import os
import jieba
import re
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

'''
First, read the documents in a folder and segment them with jieba, saving the
segmentation results to files. Next, use sklearn to compute the TF-IDF values
of each document and store them, one file per document. Finally, pick any two
of those txt files and compute their cosine similarity.
'''


def getFileList(path):
    # List the files in path, skipping hidden files
    filelist = []
    for f in os.listdir(path):
        if not f.startswith('.'):
            filelist.append(f)
    return filelist, path


def segment(filename, path, segPath):
    # Segment one document with jieba and save the tokens to segPath
    with open(path + "/" + filename, encoding='utf-8') as f:
        text = f.read()
    if not os.path.exists(segPath):
        os.mkdir(segPath)
    # Precise-mode segmentation, then clean up whitespace and punctuation
    seg_list = jieba.cut(text, cut_all=False)
    result = []
    for seg in seg_list:
        seg = ''.join(seg.split())
        # re.ASCII keeps the Python 2 behaviour of \w (ASCII letters/digits),
        # so tokens containing ASCII word characters are dropped
        r = re.search(r'\w+', seg, re.ASCII)
        if seg not in ('', ' = ', '[', ']', '(', ')') and not r:
            result.append(seg)
    # Remove stop words and keep only Chinese tokens (U+4E00..U+9FA5)
    finalresult = []
    stopword = open('stopworda.txt', encoding='utf-8').read()
    for word in result:
        if word in stopword:
            continue
        if u'\u4e00' <= word <= u'\u9fa5':
            finalresult.append(word)
    # Save the tokens to disk, separated by spaces
    with open(segPath + "/" + filename + "-seg.txt", "w", encoding='utf-8') as f:
        f.write(' '.join(finalresult))
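The segment function relies on jieba.cut, which returns a generator of tokens; cut_all=False selects precise mode. A minimal sketch of what it produces (the sample sentence is jieba's own README example):

import jieba

# Precise mode: the sentence is split into the fewest, most natural words
tokens = list(jieba.cut("我来到北京清华大学", cut_all=False))
print(tokens)  # ['我', '来到', '北京', '清华大学']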
# Read the segmented documents and compute TF-IDF for each one
def Tfidf(filelist, sFilePath, path):
    corpus = []
    for ff in filelist:
        with open(path + "/" + ff + "-seg.txt", encoding='utf-8') as f:
            corpus.append(f.read())
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    word = vectorizer.get_feature_names()  # keywords of the whole corpus
    weight = tfidf.toarray()               # one row of TF-IDF weights per document
    if not os.path.exists(sFilePath):
        os.mkdir(sFilePath)
    for i in range(len(weight)):
        outname = sFilePath + "/" + str(i).zfill(2) + ".txt"
        print('writing the tf-idf of document', i, 'into', outname)
        with open(outname, 'w', encoding='utf-8') as f:
            for j in range(len(word)):
                f.write(word[j] + " " + str(weight[i][j]) + "\n")
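Tfidf chains CountVectorizer (raw term counts) and TfidfTransformer (TF-IDF reweighting); sklearn also ships TfidfVectorizer, which fuses the two steps. A minimal sketch with a made-up, pre-segmented two-document corpus:

from sklearn.feature_extraction.text import TfidfVectorizer

# Equivalent to CountVectorizer + TfidfTransformer in one step
corpus = ["小貓 喜歡 吃魚", "小狗 喜歡 骨頭"]  # space-separated tokens, as written by segment()
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())  # renamed get_feature_names_out() in sklearn >= 1.2
print(tfidf.toarray())                 # one row of weights per document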
def coutcos(file1, file2):
    # Read the weight column of two TF-IDF files and compute their cosine similarity
    cipin1 = open(file1, encoding='utf-8').readlines()
    cipin2 = open(file2, encoding='utf-8').readlines()
    list1 = [line.split(' ')[1] for line in cipin1]
    list2 = [line.split(' ')[1] for line in cipin2]
    dot_product = 0.0
    normA = 0.0
    normB = 0.0
    for a, b in zip(list1, list2):
        a = float(a)
        b = float(b)
        dot_product += a * b
        normA += a ** 2
        normB += b ** 2
    if normA == 0.0 or normB == 0.0:
        return None
    return dot_product / ((normA * normB) ** 0.5)


if __name__ == "__main__":
    # Folder for the TF-IDF results
    sFilePath = "C:/Users/llfang1/PycharmProjects/untitled2/corpus/tfidffile"
    # Folder for the segmented documents
    segPath = 'C:/Users/llfang1/PycharmProjects/untitled2/corpus/segfile'
    (allfile, path) = getFileList('C:/Users/llfang1/PycharmProjects/untitled2/corpus/allkeyword')
    for ff in allfile:
        print("Using jieba on " + ff)
        segment(ff, path, segPath)
    Tfidf(allfile, sFilePath, segPath)
    file1 = sFilePath + "/" + "04.txt"
    file2 = sFilePath + "/" + "05.txt"
    similar = coutcos(file1, file2)
    print(similar)
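As a sanity check, coutcos should agree with sklearn's built-in cosine_similarity. A minimal sketch with two hypothetical weight vectors standing in for rows of the TF-IDF matrix:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical TF-IDF rows; in the script these come from 04.txt and 05.txt
a = np.array([[0.0, 0.5, 0.5, 0.7]])
b = np.array([[0.4, 0.0, 0.5, 0.7]])
print(cosine_similarity(a, b)[0][0])  # matches coutcos applied to the same weights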
Note: this program was adapted from a fellow developer's code, with some changes and additions.