一段比較好的生成自動摘要程式碼
阿新 • • 發佈:2019-01-28
#!/usr/bin/env python
# coding: utf-8
"""Extractive text summarisation based on keyword-cluster sentence scoring.

The text is split into sentences, the most frequent non-stop-words are taken
as keywords, and every sentence is scored by its densest cluster of keywords
(``keywords_in_cluster ** 2 / cluster_span``).  Two selection strategies are
offered: everything scoring above ``mean + 0.5 * std``, or the top-N sentences.

The code runs on both Python 2.7 and Python 3 (unicode literals are explicit
and ``print`` is always called with a single argument).
"""
import codecs
import collections
import os

import numpy

# jieba is only needed when no custom tokenizer is passed to SummaryTxt.
try:
    import jieba
except ImportError:
    jieba = None

# nltk is no longer used here (word counting relies on collections.Counter);
# the import is kept, guarded, so the module namespace is unchanged when nltk
# is installed.
try:
    import nltk  # noqa: F401
except ImportError:
    nltk = None

# Characters that terminate a sentence (ASCII and full-width forms).
_SENTENCE_TERMINATORS = u'.!?。!?'


class SummaryTxt(object):
    """Summarise a text by picking its highest-scoring sentences.

    Attributes set by the constructor (all may be overridden afterwards):
        N: number of most frequent words used as keywords.
        CLUSTER_THRESHOLD: two keywords belong to the same cluster when their
            token positions differ by less than this value.
        TOP_SENTENCES: number of sentences returned by ``summaryTopNtxt``.
        stopwrods: dict whose keys are the stop words (the misspelt name is
            kept for backward compatibility).
    """

    def __init__(self, stopwordspath, tokenizer=None):
        """Create a summariser.

        :param stopwordspath: path of a UTF-8 file with one stop word per
            line; when the file does not exist no stop words are used.
        :param tokenizer: optional callable ``text -> iterable of tokens``;
            defaults to ``jieba.cut``.
        """
        self.N = 100
        self.CLUSTER_THRESHOLD = 5
        self.TOP_SENTENCES = 5
        self.stopwrods = {}
        self._tokenizer = tokenizer
        if os.path.isfile(stopwordspath):
            # The context manager guarantees the file handle is closed.
            with codecs.open(stopwordspath, 'r', encoding='utf8') as handle:
                self.stopwrods = dict.fromkeys(
                    line.strip() for line in handle)

    def _tokenize(self, sentence):
        """Return the tokens of *sentence* as a list.

        :raises ImportError: if no tokenizer was supplied and jieba is not
            installed.
        """
        tokenizer = getattr(self, '_tokenizer', None)
        if tokenizer is None:
            if jieba is None:
                raise ImportError(
                    'jieba is required unless a tokenizer is passed '
                    'to SummaryTxt')
            tokenizer = jieba.cut
        return list(tokenizer(sentence))

    def _split_sentences(self, texts):
        """Split *texts* into sentences at the terminators ``.!?。!?``.

        Each sentence keeps its terminator.  A run of consecutive terminators
        (``"!!"``, ``"..."``) stays attached to the sentence it closes rather
        than producing one-character sentences.  Trailing text without a
        terminator is returned as a final sentence.

        :param texts: the text to split (a unicode string)
        :return: list of sentence strings, in order of appearance
        """
        sentences = []
        start = 0
        length = len(texts)
        for index, char in enumerate(texts):
            if char not in _SENTENCE_TERMINATORS:
                continue
            following = index + 1
            if following < length and texts[following] in _SENTENCE_TERMINATORS:
                # Still inside a run of terminators: close the sentence at
                # the last one.
                continue
            sentences.append(texts[start:following])
            start = following
        if start < length:
            # Text after the last terminator.
            sentences.append(texts[start:])
        return sentences

    def _score_sentences(self, sentences, topn_words):
        """Score every sentence by its densest cluster of keywords.

        :param sentences: list of sentence strings
        :param topn_words: list of keywords
        :return: list of ``(sentence_index, score)`` tuples; sentences that
            contain no keyword are omitted
        """
        scores = []
        for sentence_idx, sentence in enumerate(sentences):
            # Position of the first occurrence of every token, so each
            # keyword lookup is O(1) instead of a list scan.
            first_pos = {}
            for pos, token in enumerate(self._tokenize(sentence)):
                first_pos.setdefault(token, pos)
            word_idx = sorted(
                first_pos[w] for w in topn_words if w in first_pos)
            if not word_idx:
                continue

            # Group keyword positions into clusters: neighbours closer than
            # CLUSTER_THRESHOLD tokens share a cluster.
            clusters = [[word_idx[0]]]
            for previous, current in zip(word_idx, word_idx[1:]):
                if current - previous < self.CLUSTER_THRESHOLD:
                    clusters[-1].append(current)
                else:
                    clusters.append([current])

            # The sentence score is the best cluster score:
            # (keywords in cluster)^2 / (tokens spanned by the cluster).
            best = max(
                len(c) * len(c) / float(c[-1] - c[0] + 1) for c in clusters)
            scores.append((sentence_idx, best))
        return scores

    def _top_words(self, sentences):
        """Return the ``self.N`` most frequent keywords of *sentences*.

        Tokens of length 1 and stop words are ignored.
        """
        counts = collections.Counter(
            word
            for sentence in sentences
            for word in self._tokenize(sentence)
            if len(word) > 1 and word not in self.stopwrods
        )
        return [word for word, _ in counts.most_common(self.N)]

    def summaryScoredtxt(self, text):
        """Return (and print) the sentences scoring above mean + 0.5 * std.

        :param text: the text to summarise
        :return: list of selected sentences in their original order
        """
        sentences = self._split_sentences(text)
        scored_sentences = self._score_sentences(
            sentences, self._top_words(sentences))
        if not scored_sentences:
            # Avoids numpy's NaN / RuntimeWarning on an empty score list.
            return []

        values = [score for _, score in scored_sentences]
        cutoff = numpy.mean(values) + 0.5 * numpy.std(values)
        summarySentences = []
        for sent_idx, score in scored_sentences:
            if score > cutoff:
                summarySentences.append(sentences[sent_idx])
                print(sentences[sent_idx])
        return summarySentences

    def summaryTopNtxt(self, text):
        """Return (and print) the ``TOP_SENTENCES`` best-scoring sentences.

        :param text: the text to summarise
        :return: list of selected sentences in their original order
        """
        sentences = self._split_sentences(text)
        scored_sentences = self._score_sentences(
            sentences, self._top_words(sentences))
        if self.TOP_SENTENCES <= 0:
            # A slice of [-0:] would select every sentence.
            return []

        top_n_scored = sorted(
            scored_sentences, key=lambda item: item[1])[-self.TOP_SENTENCES:]
        # Restore document order for readability of the summary.
        top_n_scored.sort(key=lambda item: item[0])
        summarySentences = []
        for idx, _ in top_n_scored:
            print(sentences[idx])
            summarySentences.append(sentences[idx])
        return summarySentences


def main():
    """Run both summarisers on a sample text and print the results."""
    # Raw string: the backslashes are path separators, not escapes.
    obj = SummaryTxt(r'D:\work\Solr\solr-python\CNstopwords.txt')

    txt = (
        u'十八大以來的五年,是黨和國家發展程序中極不平凡的五年。面對世界經濟復甦乏力、區域性衝突和動盪頻發、全球性問題加劇的外部環境,面對我國經濟發展進入新常態等一系列深刻變化,我們堅持穩中求進工作總基調,迎難而上,開拓進取,取得了改革開放和社會主義現代化建設的歷史性成就。'
        u'為貫徹十八大精神,黨中央召開七次全會,分別就政府機構改革和職能轉變、全面深化改革、全面推進依法治國、制定“十三五”規劃、全面從嚴治黨等重大問題作出決定和部署。五年來,我們統籌推進“五位一體”總體佈局、協調推進“四個全面”戰略佈局,“十二五”規劃勝利完成,“十三五”規劃順利實施,黨和國家事業全面開創新局面。'
        u'經濟建設取得重大成就。堅定不移貫徹新發展理念,堅決端正發展觀念、轉變發展方式,發展質量和效益不斷提升。經濟保持中高速增長,在世界主要國家中名列前茅,國內生產總值從五十四萬億元增長到八十萬億元,穩居世界第二,對世界經濟增長貢獻率超過百分之三十。供給側結構性改革深入推進,經濟結構不斷優化,數字經濟等新興產業蓬勃發展,高鐵、公路、橋樑、港口、機場等基礎設施建設快速推進。農業現代化穩步推進,糧食生產能力達到一萬二千億斤。城鎮化率年均提高一點二個百分點,八千多萬農業轉移人口成為城鎮居民。區域發展協調性增強,“一帶一路”建設、京津冀協同發展、長江經濟帶發展成效顯著。創新驅動發展戰略大力實施,創新型國家建設成果豐碩,天宮、蛟龍、天眼、悟空、墨子、大飛機等重大科技成果相繼問世。南海島礁建設積極推進。開放型經濟新體制逐步健全,對外貿易、對外投資、外匯儲備穩居世界前列。'
        u'全面深化改革取得重大突破。蹄疾步穩推進全面深化改革,堅決破除各方面體制機制弊端。改革全面發力、多點突破、縱深推進,著力增強改革系統性、整體性、協同性,壓茬拓展改革廣度和深度,推出一千五百多項改革舉措,重要領域和關鍵環節改革取得突破性進展,主要領域改革主體框架基本確立。中國特色社會主義制度更加完善,國家治理體系和治理能力現代化水平明顯提高,全社會發展活力和創新活力明顯增強。'
    )
    # Alternative English sample; uncomment to summarise it instead.
    # txt ='The information disclosed by the Film Funds Office of the State Administration of Press, Publication, Radio, Film and Television shows that, the total box office in China amounted to nearly 3 billion yuan during the first six days of the lunar year (February 8 - 13), an increase of 67% compared to the 1.797 billion yuan in the Chinese Spring Festival period in 2015, becoming the "Best Chinese Spring Festival Period in History".' \
    #      'During the Chinese Spring Festival period, "The Mermaid" contributed to a box office of 1.46 billion yuan. "The Man From Macau III" reached a box office of 680 million yuan. "The Journey to the West: The Monkey King 2" had a box office of 650 million yuan. "Kung Fu Panda 3" also had a box office of exceeding 130 million. These four blockbusters together contributed more than 95% of the total box office during the Chinese Spring Festival period.' \
    #      'There were many factors contributing to the popularity during the Chinese Spring Festival period. Apparently, the overall popular film market with good box office was driven by the emergence of a few blockbusters. In fact, apart from the appeal of the films, other factors like film ticket subsidy of online seat-selection companies, cinema channel sinking and the film-viewing heat in the middle and small cities driven by the home-returning wave were all main factors contributing to this blowout. A management of Shanghai Film Group told the 21st Century Business Herald.'

    print(txt)
    print("--")
    obj.summaryScoredtxt(txt)
    print("----")
    obj.summaryTopNtxt(txt)


if __name__ == '__main__':
    main()