無監督中文抽取式摘要
阿新 • • 發佈:2021-06-20
GitHub: https://github.com/dmmiller612/bert-extractive-summarizer
該 GitHub 專案提供了一個中文無監督抽取關鍵句的方法，主要思想就是用 BERT 做向量表示，然後利用聚類計算距離。本文提供了中文的實現方法。
# Install the summarizer and its dependencies (one command per line).
pip install bert-extractive-summarizer
pip install spacy==2.3.1
pip install transformers
pip install neuralcoref
# Download the Chinese spaCy model.
python -m spacy download zh_core_web_lg
"""Unsupervised extractive summarization of Chinese text.

Loads the ``bert-base-chinese`` model through Transformers, hands it to
``summarizer.Summarizer`` together with a Chinese sentence handler, and
prints the extracted summary of ``body``.
"""
import neuralcoref
import spacy
import zh_core_web_lg
from spacy.lang.zh import Chinese
from summarizer import Summarizer
from summarizer.sentence_handler import SentenceHandler
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Load the Chinese spaCy pipeline and attach neuralcoref to it.
# NOTE(review): `nlp` is not passed to the Summarizer below -- confirm
# whether this pipeline is still needed.
nlp = zh_core_web_lg.load()
neuralcoref.add_to_pipe(nlp)

# Chinese model for the summarizer:
# load model, model config and tokenizer via Transformers.
modelName = "bert-base-chinese"
custom_config = AutoConfig.from_pretrained(modelName)
# Ask the model to return its hidden states.
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained(modelName)
custom_model = AutoModel.from_pretrained(modelName, config=custom_config)

model = Summarizer(
    custom_model=custom_model,
    custom_tokenizer=custom_tokenizer,
    sentence_handler=SentenceHandler(language=Chinese),
)

body = "要摘要的文章"
result = model(body)
full = ''.join(result)
print(full)  # the sentences extracted as the summary

# Function parameters accepted by model(...), as listed in the article:
#   body: str        - The string body that you want to summarize
#   ratio: float     - The ratio of sentences that you want for the final summary
#   min_length: int  - Parameter to specify to remove sentences that are
#                      less than 40 characters
#   max_length: int  - Parameter to specify to remove sentences greater
#                      than the max length
#   num_sentences    - Number of sentences to use. Overrides ratio if supplied.