Class-based Implementation of Deduplication
阿新 · Published: 2019-01-03
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import jieba.posseg as pseg
import codecs
from gensim import corpora, models, similarities
from database import Database
from demo import Demo
import datetime


class CacuSimil(object):
    insert = 'INSERT INTO finance_new_news(content, date, id, tags, time,' \
             ' update_time, url, website) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'

    #def __init__(self):
    #    self.db = Database()
    #    self.db.connect('crawl_data')

    # Compute the comparison date (yesterday)
    def date(self):
        today = datetime.date.today()
        print(today)
        preone_time = today + datetime.timedelta(days=-1)
        self.preone_time_nyr = preone_time.strftime('%Y-%m-%d')  # formatted as YYYY-MM-DD
        return self.preone_time_nyr

    # Connect to the database; fetch the reference texts into the tuple saved_files
    # and the freshly crawled records into the tuple craw_file
    def conn_db(self, day):
        db = Database()
        db.connect('crawl_data')
        sql_saved = "select content from finance_new_news where date >= '%s'" % (day)
        saved_files = db.query(sql_saved)
        print(saved_files)
        sql_craw = "select * from finance_old_news where date >= '%s'" % (day)
        #craw_file = list(db.query(sql_craw))
        craw_file = db.query(sql_craw)
        print(len(craw_file))
        db.close()
        return saved_files, craw_file

    # Tokenize one document with jieba and remove stop words
    def tokenization(self, content):
        # Build the stop-word list
        with codecs.open('stop_words_ch.txt', 'r', encoding='utf-8') as f:
            stop_words = [w.strip() for w in f.readlines()]
        # POS tags to drop after jieba segmentation: punctuation, conjunctions, auxiliaries,
        # adverbs, prepositions, time morphemes, 的 (uj), numerals, localizers, pronouns
        stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r']
        result = []
        words = pseg.cut(content)
        for word, flag in words:
            if flag not in stop_flag and word not in stop_words:
                result.append(word)
        return result

    # Build the corpus from the saved documents
    def gen_corpus(self, saved_files):
        corpus = []
        for content_tuple in saved_files:
            content = str(content_tuple)
            corpus.append(self.tokenization(content))
        #print(corpus[0], corpus[1])
        print(len(corpus))
        return corpus

    def gen_Model(self, corpus):
        # Build the bag-of-words dictionary covering every word in the corpus
        dictionary = corpora.Dictionary(corpus)
        print(dictionary)
        # Branch 1: BOW vectors via doc2bow
        doc_vectors = [dictionary.doc2bow(text) for text in corpus]
        print(len(doc_vectors))
        print(doc_vectors)
        # Branch 2: TF-IDF model on top of the BOW vectors
        tfidf = models.TfidfModel(doc_vectors)
        tfidf_vectors = tfidf[doc_vectors]
        print(len(tfidf_vectors))
        #print(len(tfidf_vectors[0]))
        return dictionary, tfidf_vectors

    # Map the candidate document (a tuple) into the vector space via the dictionary
    # and compute its similarity against every saved document
    def get_similar(self, file, dictionary, tfidf_vectors):
        content = file[0]
        token = self.tokenization(content)
        token_bow = dictionary.doc2bow(token)
        print(token_bow)
        index = similarities.MatrixSimilarity(tfidf_vectors)
        sims = index[token_bow]
        result = list(enumerate(sims))
        print(result)
        #print(len(result))
        similar = []
        for i in range(len(result)):
            similar.append(result[i][1])
        return max(similar)

    # Insert the record only if it is not a near-duplicate
    def insert_or_not(self, score, craw_file):
        db = Database()
        db.connect('crawl_data')
        if score < 0.9:
            db.execute(self.insert, craw_file)
        db.close()
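The first version stops at the class definition and never wires the methods together. A minimal driver might look like the sketch below; it is an assumption rather than part of the original, and it presumes that content is the first column returned by select * from finance_old_news and that each row carries exactly the eight fields the INSERT statement expects.

# Hypothetical driver for the first version (not in the original post).
# Assumes content is the first column of finance_old_news and each row has
# the eight fields expected by the INSERT statement above.
if __name__ == '__main__':
    cs = CacuSimil()
    day = cs.date()                                # yesterday, as 'YYYY-MM-DD'
    saved_files, craw_files = cs.conn_db(day)      # reference texts + new crawls
    corpus = cs.gen_corpus(saved_files)
    dictionary, tfidf_vectors = cs.gen_Model(corpus)
    for craw_file in craw_files:                   # each row is a tuple
        score = cs.get_similar(craw_file, dictionary, tfidf_vectors)
        cs.insert_or_not(score, craw_file)         # inserts only when score < 0.9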
Revised version:
# -*- coding: utf-8 -*-
import json
import pandas as pd
import numpy as np
import jieba.posseg as pseg
import codecs
from gensim import corpora, models, similarities
from database import Database
import grpc
import data_pb2
import data_pb2_grpc
import datetime


class CacuSimil(object):
    def __init__(self):
        self.db = Database()
        self.db.connect('crawl_data')

    # Fetch the record to be matched over gRPC; the response body is JSON
    def get_craw(self):
        conn = grpc.insecure_channel('192.168.1.100' + ':' + '2333')
        client = data_pb2_grpc.FormatDataStub(channel=conn)
        response = client.DoFormat(data_pb2.Data(text='getData'))
        return json.loads(response.text)

    # Connect to the database and fetch the reference texts (articles from yesterday
    # onwards); saved_files is returned as a tuple of rows
    def conn_db(self):
        db = Database()
        db.connect('crawl_data')
        today = datetime.date.today()
        preone_time = today + datetime.timedelta(days=-1)
        day = preone_time.strftime('%Y-%m-%d')  # formatted as YYYY-MM-DD
        sql_saved = "select content from finance_news where date >= '%s'" % (day)
        saved_files = db.query(sql_saved)
        db.close()
        return saved_files

    # Tokenize one document with jieba and remove stop words
    def tokenization(self, content):
        # Build the stop-word list
        with codecs.open('stop_words_ch.txt', 'r', encoding='utf-8') as f:
            stop_words = [w.strip() for w in f.readlines()]
        # POS tags to drop after jieba segmentation: punctuation, conjunctions, auxiliaries,
        # adverbs, prepositions, time morphemes, 的 (uj), numerals, localizers, pronouns
        stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r']
        result = []
        words = pseg.cut(content)
        for word, flag in words:
            if flag not in stop_flag and word not in stop_words:
                result.append(word)
        return result

    # Build the corpus and derive the dictionary and TF-IDF vectors
    def get_Model(self, saved_files):
        corpus = []
        for content_tuple in saved_files:
            content = str(content_tuple)
            corpus.append(self.tokenization(content))
        # Build the bag-of-words dictionary covering every word in the corpus
        dictionary = corpora.Dictionary(corpus)
        # Branch 1: BOW vectors via doc2bow
        doc_vectors = [dictionary.doc2bow(text) for text in corpus]
        # Branch 2: TF-IDF model on top of the BOW vectors
        tfidf = models.TfidfModel(doc_vectors)
        tfidf_vectors = tfidf[doc_vectors]
        return dictionary, tfidf_vectors

    # Compute the highest similarity between the candidate record and the saved corpus
    def get_similar(self, craw_file, dictionary, tfidf_vectors):
        print(craw_file)
        content = craw_file['content']
        token = self.tokenization(content)
        token_bow = dictionary.doc2bow(token)
        index = similarities.MatrixSimilarity(tfidf_vectors)
        sims = index[token_bow]
        result = list(enumerate(sims))
        similar = []
        for i in range(len(result)):
            similar.append(result[i][1])
        return max(similar)

    # Insert the record only if it is not a near-duplicate (similarity below 0.9)
    def insert_or_not(self, score, craw_file):
        db = Database()
        db.connect('crawl_data')
        insert = 'INSERT INTO finance_news(content, date, id, tags, time,' \
                 ' update_time, url, website) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
        if score < 0.9:
            db.execute(insert, [craw_file['content'], craw_file['date'], craw_file['id'],
                                craw_file['tags'], craw_file['time'], craw_file['update_time'],
                                craw_file['url'], craw_file['website']])
            print(craw_file)
        db.close()


def run():
    CS = CacuSimil()
    #while True:
    craw_file = CS.get_craw()
    print(craw_file)
    if craw_file:
        saved_files = CS.conn_db()
        dictionary, tfidf_vectors = CS.get_Model(saved_files)
        similar = CS.get_similar(craw_file, dictionary, tfidf_vectors)
        CS.insert_or_not(similar, craw_file)


if __name__ == '__main__':
    run()
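To see the deduplication criterion in isolation, the following self-contained sketch reproduces the same gensim pipeline on a small in-memory corpus, without the database or gRPC plumbing. The toy sentences are made up, and, unlike get_similar above, it also passes the query through the TF-IDF model so that both sides of the cosine similarity use the same weighting.

# Standalone sketch of the TF-IDF similarity check (toy data, no database/gRPC).
import jieba
from gensim import corpora, models, similarities

saved_docs = [
    '央行今日開展逆回購操作',
    '股市收盤上漲',
]
candidate = '央行今日開展逆回購操作,投放流動性'

def tokenize(text):
    # plain jieba cut; the class above additionally filters by POS tag and stop words
    return [w for w in jieba.lcut(text) if w.strip()]

corpus = [tokenize(doc) for doc in saved_docs]
dictionary = corpora.Dictionary(corpus)
bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

tfidf = models.TfidfModel(bow_corpus)
index = similarities.MatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))

# The query is also TF-IDF weighted before scoring against the saved documents.
query = tfidf[dictionary.doc2bow(tokenize(candidate))]
score = max(index[query])
print('max similarity: %.3f' % score)
print('insert' if score < 0.9 else 'skip (near-duplicate)')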