Python: split an article into clauses on commas, then find the clauses in the article that are similar to a given sentence (i.e. contain the same words)
阿新 · Published 2019-02-03
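The script below splits the article in testk.txt on the full-width comma, segments each clause with jieba, removes stop words, and scores each clause against a fixed reference sentence using cosine similarity over term-frequency vectors. As a quick illustration of the core measure before the full script (a minimal standalone sketch; the toy vectors a and b are made up for this example):

import numpy as np

# Two toy term-frequency vectors over a shared four-word vocabulary
a = np.array([1, 2, 0, 1])
b = np.array([1, 1, 1, 0])

# cos(a, b) = a.b / (|a| |b|)
cos = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(cos)  # ~0.707: the two vectors share most of their word weight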
# -*- coding: utf-8 -*-
# code: [email protected]
# 12-4.py
import copy

import jieba
import numpy as np


def get_cossimi(x, y):
    """Cosine similarity between two term-frequency vectors."""
    myx = np.array(x)
    myy = np.array(y)
    cos1 = np.sum(myx * myy)
    cos21 = np.sqrt(np.sum(myx * myx))
    cos22 = np.sqrt(np.sum(myy * myy))
    if cos21 == 0 or cos22 == 0:
        # One vector is all zeros: no shared words, similarity is 0
        return 0.0
    return cos1 / float(cos21 * cos22)


# The reference sentence every clause is compared against
f1_text = '瑞典稅務局改稱臺灣為中國一省:按國際慣例修正'

if __name__ == '__main__':
    # Read the article and split it into clauses on the full-width comma.
    # To split into full sentences instead, split on '。'.
    with open('testk.txt', 'r', encoding='utf-8') as f1:
        lines = f1.read().split(',')

    # Load the stop-word list (one word per line) once, up front
    with open('stopwords.txt', 'r', encoding='utf-8') as f_stop:
        f_stop_seg_list = f_stop.read().splitlines()

    for i in lines:
        if len(i) <= 1:
            continue
        # Segment the reference sentence and the current clause with jieba
        f1_seg_list = jieba.cut(f1_text)
        ftest1_seg_list = jieba.cut(i)

        # Drop stop words and count the reference sentence's words;
        # test_words keeps the same vocabulary, zero-initialised
        test_words = {}
        all_words = {}
        for myword in f1_seg_list:
            if myword.strip() not in f_stop_seg_list:
                test_words.setdefault(myword, 0)
                all_words.setdefault(myword, 0)
                all_words[myword] += 1

        # Count clause words, but only those in the reference vocabulary
        mytest1_words = copy.deepcopy(test_words)
        for myword in ftest1_seg_list:
            if myword.strip() not in f_stop_seg_list:
                if myword in mytest1_words:
                    mytest1_words[myword] += 1

        # Build aligned frequency vectors and compute cosine similarity
        sampdata = []
        test1data = []
        for key in all_words:
            sampdata.append(all_words[key])
            test1data.append(mytest1_words[key])
        test1simi = get_cossimi(sampdata, test1data)
        print('%s %f %s' % ('\n' + i + '。' + '\n', test1simi, '\n'))
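To try the script, place two UTF-8 files next to it: testk.txt with the article text, and stopwords.txt with one stop word per line (both filenames come from the code above). Clauses that print a similarity near 1.0 share most of their non-stop words with f1_text; a clause with no words in common prints 0.0. Note that because the vocabulary is built from the reference sentence only, clause words outside that vocabulary are ignored rather than penalised.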