
Deduplication implemented as a class

# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import jieba.posseg as pseg
import codecs
from gensim import corpora, models, similarities
from database import Database
from demo import Demo
import datetime

class CacuSimil(object):
    insert = 'INSERT INTO finance_new_news(content,date, id, tags, time,' \
             ' update_time, url, website) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'

    #def __init__(self):
        # self.db = Database()
        # self.db.connect('crawl_data')

    # Compute the comparison date: yesterday, as a formatted string
    def date(self):
        today = datetime.date.today()
        print(today)
        preone_time = today + datetime.timedelta(days=-1)
        self.preone_time_nyr = preone_time.strftime('%Y-%m-%d')  # format as YYYY-MM-DD
        return self.preone_time_nyr


    # Connect to the database; fetch the reference texts saved since `day` (saved_files) and the freshly crawled records (craw_file)
    def conn_db(self, day):
        db = Database()
        db.connect('crawl_data')
        sql_saved = "select content from finance_new_news where date >= '%s'" % (day)
        saved_files = db.query(sql_saved)
        print(saved_files)
        sql_craw = "select * from finance_old_news where date >= '%s'" % (day)
        #craw_file = list(db.query(sql_craw))
        craw_file = db.query(sql_craw)
        print(len(craw_file))
        db.close()
        return saved_files, craw_file


    # Segment one text with jieba and remove stop words and stop POS tags
    def tokenization(self, content):
        # Build the stop-word list (one word per line in stop_words_ch.txt)
        stop_words = 'stop_words_ch.txt'
        stopwords = codecs.open(stop_words, 'r', encoding='utf-8').readlines()
        stopwords = [w.strip() for w in stopwords]  # strip newlines so the membership test below works
        # POS tags to drop after jieba tagging: punctuation, conjunctions, auxiliaries,
        # adverbs, prepositions, time morphemes, 的, numerals, localizers, pronouns
        stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r']

        result = []
        words = pseg.cut(content)
        for word, flag in words:
            if flag not in stop_flag and word not in stopwords:
                result.append(word)
        return result
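As a quick aside, here is a tiny standalone snippet showing what this filter keeps (the sentence is invented for the example): pseg.cut yields (word, POS-flag) pairs, and anything tagged as punctuation, a numeral, a particle, and so on is dropped.

import jieba.posseg as pseg

stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r']
# keep only content-bearing words; punctuation ('x'), numerals ('m'), particles ('u'/'uj') are filtered out
kept = [word for word, flag in pseg.cut('今天股市上漲了百分之三。') if flag not in stop_flag]
print(kept)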

    # Build the corpus: one token list per saved article
    def gen_corpus(self, saved_files):
        corpus = []
        for content_tuple in saved_files:
            content = str(content_tuple[0])  # each row from db.query is a 1-tuple (content,)
            corpus.append(self.tokenization(content))
        #print(corpus[0],corpus[1])
        print(len(corpus))
        return corpus

    def gen_Model(self, corpus):
        # Build the bag-of-words dictionary over the whole corpus (token -> integer id)
        dictionary = corpora.Dictionary(corpus)
        print(dictionary)

        # Branch 1: bag-of-words vectors via doc2bow
        doc_vectors = [dictionary.doc2bow(text) for text in corpus]
        print(len(doc_vectors))
        print(doc_vectors)

        # Branch 2: TF-IDF model trained on the bag-of-words vectors
        tfidf = models.TfidfModel(doc_vectors)
        tfidf_vectors = tfidf[doc_vectors]
        print(len(tfidf_vectors))
        #print(len(tfidf_vectors[0]))
        return dictionary, tfidf_vectors
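A tiny self-contained example of what these gensim calls produce (toy token lists; the printed values are only illustrative):

from gensim import corpora, models

docs = [['股市', '上漲'], ['央行', '降息'], ['股市', '下跌', '下跌']]
dictionary = corpora.Dictionary(docs)           # maps each token to an integer id
bows = [dictionary.doc2bow(d) for d in docs]    # each doc as a sparse [(token_id, count), ...] vector
tfidf = models.TfidfModel(bows)                 # learns IDF weights from the whole corpus
print(bows[2])                                  # raw term counts for the third doc
print(tfidf[bows[2]])                           # the same vector, TF-IDF weighted and normalized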


    # Take the candidate article (a row tuple), map it into the vector space via the dictionary, then compute its similarity to every saved article and return the highest score
    def get_similar(self, file, dictionary, tfidf_vectors):
         content = file[0]
         token = self.tokenization(content)
         token_bow = dictionary.doc2bow(token)
         print(token_bow)

         index = similarities.MatrixSimilarity(tfidf_vectors)
         sims = index[token_bow]
         result = list(enumerate(sims))
         print(result)
         #print(len(result))
         similar = []
         for i in range(len(result)):
             similar.append(result[i][1])
         return max(similar)
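One detail worth flagging: gensim's own similarity examples usually pass the query through the same TF-IDF transform before looking it up in an index built from TF-IDF vectors, whereas the method above queries with the raw bag-of-words counts. A self-contained sketch of the usual query flow (toy data, not the author's pipeline):

from gensim import corpora, models, similarities

docs = [['股市', '上漲'], ['央行', '降息']]
dictionary = corpora.Dictionary(docs)
bows = [dictionary.doc2bow(d) for d in docs]
tfidf = models.TfidfModel(bows)
index = similarities.MatrixSimilarity(tfidf[bows], num_features=len(dictionary))

query = dictionary.doc2bow(['股市', '上漲'])
sims = index[tfidf[query]]           # 1-D array of cosine similarities, one per saved doc
print(float(sims.max()))             # highest similarity; values near 1.0 indicate a likely duplicate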

    def insert_or_not(self, score, craw_file):
        db = Database()
        db.connect('crawl_data')
        if score<0.9:
            db.execute(self.insert, craw_file)
        db.close()
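This first version ships without driver code; a minimal sketch of how its methods could be wired together (assuming, as the INSERT statement implies, that finance_old_news rows carry the eight columns in the same order as the placeholders, with content in column 0):

if __name__ == '__main__':
    cs = CacuSimil()
    day = cs.date()
    saved_files, craw_files = cs.conn_db(day)
    corpus = cs.gen_corpus(saved_files)
    dictionary, tfidf_vectors = cs.gen_Model(corpus)
    for row in craw_files:           # one crawled article per row
        score = cs.get_similar(row, dictionary, tfidf_vectors)
        cs.insert_or_not(score, row)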

Modified version (the candidate article now arrives over gRPC as JSON instead of being read from the database):

# -*- coding: utf-8 -*-
import json
import pandas as pd
import numpy as np
import jieba.posseg as pseg
import codecs
from gensim import corpora, models, similarities
from database import Database
import grpc
import data_pb2
import data_pb2_grpc
import datetime

class CacuSimil(object):


    def __init__(self):
         self.db = Database()
         self.db.connect('crawl_data')

    # Fetch the article to be matched, as JSON, from the crawler's gRPC service
    def get_craw(self):
        conn = grpc.insecure_channel('192.168.1.100' + ':' + '2333')
        client = data_pb2_grpc.FormatDataStub(channel=conn)
        response = client.DoFormat(data_pb2.Data(text='getData'))
        return json.loads(response.text)
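The gRPC service itself is not shown here. Judging from how craw_file is indexed in get_similar and insert_or_not below, the decoded JSON is expected to be a single record shaped roughly like this (the field values are placeholders):

craw_file = {
    'content': '...',        # article body, used for the similarity check
    'date': '...',           # e.g. '2019-06-11'
    'id': '...',
    'tags': '...',
    'time': '...',
    'update_time': '...',
    'url': '...',
    'website': '...',
}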


    # Connect to the database and fetch the articles saved since yesterday to compare against; returns a tuple of rows
    def conn_db(self):
        db = Database()
        db.connect('crawl_data')

        today = datetime.date.today()
        preone_time = today + datetime.timedelta(days=-1)
        day = preone_time.strftime('%Y-%m-%d')  # format as YYYY-MM-DD

        sql_saved = "select content from finance_news where date >= '%s'" % (day)
        saved_files = db.query(sql_saved)

        db.close()
        return saved_files


    # Segment one text with jieba and remove stop words and stop POS tags
    def tokenization(self, content):
        # Build the stop-word list (one word per line in stop_words_ch.txt)
        stop_words = 'stop_words_ch.txt'
        stopwords = codecs.open(stop_words, 'r', encoding='utf-8').readlines()
        stopwords = [w.strip() for w in stopwords]  # strip newlines so the membership test below works
        # POS tags to drop after jieba tagging: punctuation, conjunctions, auxiliaries,
        # adverbs, prepositions, time morphemes, 的, numerals, localizers, pronouns
        stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r']
        result = []
        words = pseg.cut(content)
        for word, flag in words:
            if flag not in stop_flag and word not in stopwords:
                result.append(word)
        return result

    # Build the corpus from the saved articles and derive the dictionary and TF-IDF model
    def get_Model(self, saved_files):
        corpus = []
        for content_tuple in saved_files:
            content = str(content_tuple[0])  # each row from db.query is a 1-tuple (content,)
            corpus.append(self.tokenization(content))
        # Build the bag-of-words dictionary over the whole corpus (token -> integer id)
        dictionary = corpora.Dictionary(corpus)

        # Branch 1: bag-of-words vectors via doc2bow
        doc_vectors = [dictionary.doc2bow(text) for text in corpus]
        # Branch 2: TF-IDF model trained on the bag-of-words vectors
        tfidf = models.TfidfModel(doc_vectors)
        tfidf_vectors = tfidf[doc_vectors]

        return dictionary, tfidf_vectors


    # Take the candidate article and compute its similarity to every saved article; return the highest score
    def get_similar(self, craw_file, dictionary, tfidf_vectors):
         print(craw_file)
         content = craw_file['content']
         token = self.tokenization(content)
         token_bow = dictionary.doc2bow(token)

         index = similarities.MatrixSimilarity(tfidf_vectors)
         sims = index[token_bow]
         result = list(enumerate(sims))

         similar = []
         for i in range(len(result)):
             similar.append(result[i][1])
         return max(similar)

    # Insert the crawled article only if its best similarity score is below the threshold (i.e. it is not a duplicate)
    def insert_or_not(self, score, craw_file):
        db = Database()
        db.connect('crawl_data')

        insert = 'INSERT INTO finance_news(content,date, id, tags, time,' \
                 ' update_time, url, website) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'

        if score<0.9:
            db.execute(insert, [craw_file['content'], craw_file['date'], craw_file['id'], \
                                craw_file['tags'], craw_file['time'], craw_file['update_time'],\
                                craw_file['url'], craw_file['website']])
            print(craw_file)
        db.close()



def run():
    CS = CacuSimil()
    #while True:
    craw_file = CS.get_craw()
    print(craw_file)
    if craw_file:
        saved_files = CS.conn_db()
        dictionary, tfidf_vectors = CS.get_Model(saved_files)
        similar = CS.get_similar(craw_file, dictionary, tfidf_vectors)
        CS.insert_or_not(similar, craw_file)


if __name__ == '__main__':
    run()
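The commented-out while True in run() suggests the script was meant to poll continuously. If so, adding a sleep keeps the loop from hammering the gRPC service and the database; the interval below is an arbitrary choice:

import time

def run_forever(poll_seconds=60):
    CS = CacuSimil()
    while True:
        craw_file = CS.get_craw()
        if craw_file:
            saved_files = CS.conn_db()
            dictionary, tfidf_vectors = CS.get_Model(saved_files)
            similar = CS.get_similar(craw_file, dictionary, tfidf_vectors)
            CS.insert_or_not(similar, craw_file)
        time.sleep(poll_seconds)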