一段比較好的生成自動摘要程式碼

阿新 • • 發佈：2019-01-28

#!/usr/bin/python
# coding:utf-8

import nltk
import numpy
import jieba
import codecs
import os

class SummaryTxt:
    """Extractive text summarizer based on keyword clusters.

    The text is split into sentences, the most frequent non-stop words are
    taken as keywords, and each sentence is scored by how densely those
    keywords cluster inside it. Tokenization is done with ``jieba.cut`` and
    word counting with ``nltk.FreqDist``.
    """

    # Characters that terminate a sentence (ASCII and full-width forms).
    _SENTENCE_ENDINGS = u'.!?。！？'

    def __init__(self, stopwordspath):
        """Set the tuning parameters and load the stop-word list.

        :param stopwordspath: path of a UTF-8 text file holding one stop word
            per line. When the path does not exist no stop words are used.
        """
        # Number of most frequent words used as keywords.
        self.N = 100
        # Two keyword positions closer than this many tokens share a cluster.
        self.CLUSTER_THRESHOLD = 5
        # Number of sentences selected by summaryTopNtxt.
        self.TOP_SENTENCES = 5
        # Stop words stored as dict keys for fast membership tests. The
        # misspelled attribute name is kept so existing callers keep working.
        self.stopwrods = {}
        if os.path.exists(stopwordspath):
            # The context manager guarantees the file handle is closed.
            with codecs.open(stopwordspath, 'r', encoding='utf8') as handle:
                stoplist = [line.strip() for line in handle]
            self.stopwrods = dict.fromkeys(stoplist)

    def _split_sentences(self, texts):
        """Split ``texts`` into sentences at ``.!?。！？``.

        Each sentence keeps its terminating punctuation. A run of consecutive
        terminators (``...`` or ``?!``) stays attached to the sentence it
        ends instead of producing one-character sentences.

        :param texts: the text to split; UTF-8 encoded bytes are decoded first
        :return: list of sentences in document order (empty for empty input)
        """
        if not texts:
            return []
        if isinstance(texts, bytes):
            texts = texts.decode('utf8')

        endings = self._SENTENCE_ENDINGS
        last = len(texts) - 1
        sentences = []
        start = 0  # index of the first character of the current sentence
        for index, char in enumerate(texts):
            if char not in endings:
                continue
            if index < last and texts[index + 1] in endings:
                # Still inside a run of terminators: split after the last one.
                continue
            sentences.append(texts[start:index + 1])
            start = index + 1
        if start < len(texts):
            # Trailing text without closing punctuation is a sentence too.
            sentences.append(texts[start:])
        return sentences

    def _score_tokens(self, tokens, topn_words):
        """Score one tokenized sentence against the keyword list.

        Keyword positions (first occurrence of each keyword) are grouped into
        clusters whose neighbouring positions are less than
        ``CLUSTER_THRESHOLD`` apart. A cluster scores
        ``keywords ** 2 / span``; the sentence gets its best cluster's score.

        :param tokens: list of tokens of one sentence
        :param topn_words: iterable of keywords
        :return: the best cluster score as a float, or ``None`` when the
            sentence contains no keyword
        """
        # Map each token to its first position, matching list.index().
        first_pos = {}
        for position, token in enumerate(tokens):
            first_pos.setdefault(token, position)
        word_idx = sorted(first_pos[w] for w in topn_words if w in first_pos)
        if not word_idx:
            return None

        clusters = [[word_idx[0]]]
        for position in word_idx[1:]:
            if position - clusters[-1][-1] < self.CLUSTER_THRESHOLD:
                clusters[-1].append(position)
            else:
                clusters.append([position])
        return max(1.0 * len(c) * len(c) / (c[-1] - c[0] + 1)
                   for c in clusters)

    def _score_sentences(self, sentences, topn_words):
        """Score every sentence using the top keywords.

        :param sentences: list of sentences
        :param topn_words: list of keywords
        :return: list of ``(sentence_index, score)`` tuples; sentences that
            contain no keyword are omitted
        """
        scores = []
        for sentence_idx, sentence in enumerate(sentences):
            score = self._score_tokens(list(jieba.cut(sentence)), topn_words)
            if score is not None:
                scores.append((sentence_idx, score))
        return scores

    def _top_words(self, sentences):
        """Return up to ``N`` most frequent words, most frequent first.

        Stop words and single-character tokens are ignored.
        """
        words = [w for sentence in sentences for w in jieba.cut(sentence)
                 if len(w) > 1 and w not in self.stopwrods]
        wordfre = nltk.FreqDist(words)
        ranked = sorted(wordfre.items(), key=lambda item: item[1],
                        reverse=True)
        return [word for word, _ in ranked[:self.N]]

    def summaryScoredtxt(self, text):
        """Summarize ``text`` with the sentences scoring well above average.

        A sentence is selected when its score exceeds the mean score plus
        half a standard deviation. Each selected sentence is also printed.

        :param text: the text to summarize
        :return: list of selected sentences in document order; may be empty,
            for instance when no sentence contains a keyword
        """
        sentences = self._split_sentences(text)
        topn_words = self._top_words(sentences)
        scored_sentences = self._score_sentences(sentences, topn_words)
        if not scored_sentences:
            # Avoid numpy's nan / RuntimeWarning on an empty score list.
            return []

        values = [score for _, score in scored_sentences]
        threshold = numpy.mean(values) + 0.5 * numpy.std(values)
        summarySentences = []
        for sent_idx, score in scored_sentences:
            if score > threshold:
                summarySentences.append(sentences[sent_idx])
                print(sentences[sent_idx])
        return summarySentences

    def summaryTopNtxt(self, text):
        """Summarize ``text`` with the ``TOP_SENTENCES`` best sentences.

        Each selected sentence is also printed.

        :param text: the text to summarize
        :return: list of the selected sentences in document order
        """
        sentences = self._split_sentences(text)
        topn_words = self._top_words(sentences)
        scored_sentences = self._score_sentences(sentences, topn_words)
        if not scored_sentences or self.TOP_SENTENCES <= 0:
            # A slice of [-0:] would return every sentence, so guard it.
            return []

        # Keep the highest scores, then restore document order.
        top_n_scored = sorted(scored_sentences,
                              key=lambda pair: pair[1])[-self.TOP_SENTENCES:]
        top_n_scored.sort(key=lambda pair: pair[0])
        summarySentences = []
        for idx, _ in top_n_scored:
            print(sentences[idx])
            summarySentences.append(sentences[idx])
        return summarySentences



if __name__ == '__main__':
    # Demo: summarize a sample text with both strategies.
    # The stop-word path is machine-specific; when the file does not exist
    # the summarizer simply runs without stop words. A raw string keeps the
    # backslashes literal instead of relying on invalid escape sequences.
    obj = SummaryTxt(r'D:\work\Solr\solr-python\CNstopwords.txt')

    txt = (
        u'十八大以來的五年，是黨和國家發展程序中極不平凡的五年。面對世界經濟復甦乏力、區域性衝突和動盪頻發、全球性問題加劇的外部環境，面對我國經濟發展進入新常態等一系列深刻變化，我們堅持穩中求進工作總基調，迎難而上，開拓進取，取得了改革開放和社會主義現代化建設的歷史性成就。'
        u'為貫徹十八大精神，黨中央召開七次全會，分別就政府機構改革和職能轉變、全面深化改革、全面推進依法治國、制定“十三五”規劃、全面從嚴治黨等重大問題作出決定和部署。五年來，我們統籌推進“五位一體”總體佈局、協調推進“四個全面”戰略佈局，“十二五”規劃勝利完成，“十三五”規劃順利實施，黨和國家事業全面開創新局面。'
        u'經濟建設取得重大成就。堅定不移貫徹新發展理念，堅決端正發展觀念、轉變發展方式，發展質量和效益不斷提升。經濟保持中高速增長，在世界主要國家中名列前茅，國內生產總值從五十四萬億元增長到八十萬億元，穩居世界第二，對世界經濟增長貢獻率超過百分之三十。供給側結構性改革深入推進，經濟結構不斷優化，數字經濟等新興產業蓬勃發展，高鐵、公路、橋樑、港口、機場等基礎設施建設快速推進。農業現代化穩步推進，糧食生產能力達到一萬二千億斤。城鎮化率年均提高一點二個百分點，八千多萬農業轉移人口成為城鎮居民。區域發展協調性增強，“一帶一路”建設、京津冀協同發展、長江經濟帶發展成效顯著。創新驅動發展戰略大力實施，創新型國家建設成果豐碩，天宮、蛟龍、天眼、悟空、墨子、大飛機等重大科技成果相繼問世。南海島礁建設積極推進。開放型經濟新體制逐步健全，對外貿易、對外投資、外匯儲備穩居世界前列。'
        u'全面深化改革取得重大突破。蹄疾步穩推進全面深化改革，堅決破除各方面體制機制弊端。改革全面發力、多點突破、縱深推進，著力增強改革系統性、整體性、協同性，壓茬拓展改革廣度和深度，推出一千五百多項改革舉措，重要領域和關鍵環節改革取得突破性進展，主要領域改革主體框架基本確立。中國特色社會主義制度更加完善，國家治理體系和治理能力現代化水平明顯提高，全社會發展活力和創新活力明顯增強。'
    )

    # Alternative English sample text:
    # txt ='The information disclosed by the Film Funds Office of the State Administration of Press, Publication, Radio, Film and Television shows that, the total box office in China amounted to nearly 3 billion yuan during the first six days of the lunar year (February 8 - 13), an increase of 67% compared to the 1.797 billion yuan in the Chinese Spring Festival period in 2015, becoming the "Best Chinese Spring Festival Period in History".' \
    #      'During the Chinese Spring Festival period, "The Mermaid" contributed to a box office of 1.46 billion yuan. "The Man From Macau III" reached a box office of 680 million yuan. "The Journey to the West: The Monkey King 2" had a box office of 650 million yuan. "Kung Fu Panda 3" also had a box office of exceeding 130 million. These four blockbusters together contributed more than 95% of the total box office during the Chinese Spring Festival period.' \
    #      'There were many factors contributing to the popularity during the Chinese Spring Festival period. Apparently, the overall popular film market with good box office was driven by the emergence of a few blockbusters. In fact, apart from the appeal of the films, other factors like film ticket subsidy of online seat-selection companies, cinema channel sinking and the film-viewing heat in the middle and small cities driven by the home-returning wave were all main factors contributing to this blowout. A management of Shanghai Film Group told the 21st Century Business Herald.'

    # print(x) with a single argument behaves the same on Python 2 and 3.
    print(txt)
    print("--")
    # Both summary methods print the sentences they select.
    obj.summaryScoredtxt(txt)

    print("----")
    obj.summaryTopNtxt(txt)

一段比較好的按鍵實現程式碼

之前的一個專案按鍵比較多，面板上面有按鍵、遙控器，處理得稍微複雜一點，MCU使用的是STM8S005K6。關於按鍵部分的處理，現在拿出來和大家分享一下，說的不對的地方還請各位大俠指教，大家共同進步。按鍵通常分有IO口按鍵（BUTTON）、AD按鍵（通過AD取樣電壓）、IR（遙控器）。按按鍵功能分：有

一段比較好的生成自動摘要程式碼

#!/user/bin/python # coding:utf-8 import nltk import numpy import jieba import codecs import os class SummaryTxt: def __init__(self,

Python一段用於保密的自動銷燬程式碼

有的程式碼檔案，可能我們放到伺服器上，執行規定的次數如1次後，就不再需要了，或者為了對程式碼進行保密，在伺服器上臨時執行一次，程式執行還未結束或伺服器突然斷電，程式檔案內容即消失。有兩種方式，可以在執行程式開始，隨著程式碼載入記憶體開始，對該程式碼檔案實行檔案銷燬，或者對程式碼檔案裡內容進行銷

一段封裝好的ajax傳送請求的js程式碼

var $ = { getpa:function(data){ if(data && typeof data == "object"){ var str = '?'; for(var key in data){ str = str

一段封裝好的移動端點選事件的程式碼

/*封裝移動端的tap點選事件*/ var demo={ /*dom:傳入的dom元素讓我們可以為任意的元素新增tap事件*/ tap:function(dom,callback){ /*判斷是否傳入物件同時物件應該是一個dom元素*/ if(!dom

CentOS實現一段時間後Shell自動登出非活動用戶

默認情況下Shell是不會超時自動登出的，但有時需要讓它一段時間後在沒有操作的情況下自動退出終端。1、創建文件/etc/profile.d/autologout.sh： #

帶你開發一款給Apk中自動注入程式碼工具icodetools 完善篇

一、前言在前面已經介紹完了自動給apk中注入日誌程式碼工具icodetools原理了，在那裡我們曾經說過其實離真正的可使用價值有點距離，本篇就對這個工具進行一些優化，讓其真正意義上開始能工作量產。當時在前面一篇文章中說到遺留的三個主要問題：第一個問題：對每個類中都新增一個靜態

Qt中實現滑鼠一段時間不用，自動隱藏

Widget::Widget(QWidget *parent) : QWidget(parent), ui(new Ui::Widget) { ui->setupUi(this); this->setMouseTracking(true);

一種比較好的JNI Java和C++相互傳遞引數和返回值的方法

序言以前在Android上做移動多媒體開發的時候，有很多需要在Java和C++相互傳遞引數的Case，以前因為時間卡的緊，一直沒有去修復這一類的問題，因為能用，沒有出什麼問題，也就沒有想什麼優化方案。最近自己有一些閒暇時間折騰點小玩意兒，也需要從Java和C++之間相互傳遞引數。想

一款比較好用的JS時間控制元件-laydate

一款比較好用的JS時間控制元件-laydate 　　官方講解：http://laydate.layui.com/ 　　具體的內容請看官方講解，此處僅說明應用：　　1、在jsp或html或其他中引入laydate.js 　　　　<script src="……/laydate/l

【Cocos2d-x】圖片描邊的一種比較好的shader實現方法

轉載： http://blog.csdn.net/u011281572/article/details/44999609 圖片描邊需求如下： 1. 可指定描邊寬度2. 可指定描邊顏色3. 可用於字型圖片描邊我所知道的方式有以下幾種： 1. Cocos2d-x 3.x中，

INNO setup打包的一段介面美化的code端程式碼

[Code] var DetailList: TNewListBox; newFileNameLabel: TNewStaticText; LastDir: string; procedure InitializeWizard(); begin //介面修改

【web】一段建立本地資料夾的程式碼

一段建立本地資料夾的程式碼 document.addEventListener("deviceready", onDeviceReady, false); function onDeviceReady() { // alert('ondeviceR

Unity 延遲執行一段程式碼的實現比較好的方式

在Unity中，延時執行一段程式碼或者一個方法或者幾個方法的情況非常普遍。一般會用到Invoke和InvokeRepeating方法。顧名思義，第一個是執行一次，第二個是重複執行。看下定義： void Invoke(string methodName, float time); 第一個引數是方法名(

一段 VB 程式碼自動執行圖形介面程式傳送按鍵指令

用於ETL流程中, 有不支援命令列, 只能GUI互動的程式要實現自動化呼叫可以參考下面這段VB. 啟動了程式, 還在5秒後自動按了一下回車鍵. Dim Shell Set Shell=WScript.CreateObject("WScript.Shell"

用一段程式碼表示程式設計師網友：程式設計師你還好？

在分享這篇文章之前呢，我先介紹下自己，我自己是一名從事了5年前端開發的全棧工程師—————————————— 話不多說，直接上程式碼。請輸入圖片描述相信只要學了java的都能看懂。其實我幹了程式設計師這麼久，程式設計師這一行也看透了。我21畢業，

記錄一段生成素數python程式碼的調優過程 • cenalulu's Tech Blog

簡介：本文主要記錄了博主對一段使用python實現的素數生成程式碼的不斷優化過程。背景：最近在刷Project Euler的題目，刷到第十題（計算2百萬以下素數的和）的時候發現之前的素數生成程式碼效率太低導致幾分鐘都出不來。於是通過不斷的調優，終於得到一個能在秒級算出2百萬以內的素

apk安裝法之二----一段Android實現應用下載並自動安裝apk包的程式碼

protected File downLoadFile(String httpUrl) { // TODO Auto-generated method stub final String fileName = "updata.apk";

int 和 Integer 有什麼區別。請簡單的寫一段有關自動拆裝箱的程式碼。

//1 Integer a =10; Integer a1=10; System.out.println(a==a1);//true //2 Integer b=128; Integer

解決mac下ssh空閑一段時間自動斷開的問題

之前在公司就遇到過這種問題，使用ssh登錄linux服務器後，在後臺放置一段時間，會發現連線自動斷開，解決的方法很簡單： vim /etc/ssh/ssh_config 添加這2句即可 Serv

一段比較好的生成自動摘要程式碼

相關推薦