
HIT LTP Deployment and Test Demo
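The demo below exercises the full pyltp pipeline against the ltp_data_v3.4.0 models: sentence splitting, word segmentation, part-of-speech tagging, named entity recognition, dependency parsing, and semantic role labelling. pyltp itself installs from PyPI (`pip install pyltp`); the model files ship separately and are unpacked to a local directory (the `E:\` paths below are this post's example locations).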

#coding: utf-8
import os
from pyltp import SentenceSplitter
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import NamedEntityRecognizer
from pyltp import Parser
from pyltp import SementicRoleLabeller  # note: "Sementic" is pyltp's own spelling
import re
# import processHandler
import pyltpT  # the author's own wrapper module, used in testss() below

# pyltp official documentation: http://pyltp.readthedocs.io/zh_CN/develop/api.html#id15
# References:
# http://blog.csdn.net/MebiuW/article/details/52496920
# http://blog.csdn.net/lalalawxt/article/details/55804384

LTP_DATA_DIR = r'E:\BaiduNetdiskDownload\ltp_data_v3.4.0'    # path to the LTP model directory
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')     # word segmentation model, `cws.model`
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')     # POS tagging model, `pos.model`
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')     # named entity recognition model, `ner.model`
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model, `parser.model`
srl_model_path = os.path.join(LTP_DATA_DIR, 'pisrl.model')   # semantic role labelling model, `pisrl.model`
print("======================>>>>" + srl_model_path)
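
# Optional sanity check (an added sketch, not required by pyltp): make sure the
# model files configured above actually exist before any load() call is made.
for _model_path in (cws_model_path, pos_model_path, ner_model_path,
                    par_model_path, srl_model_path):
    if not os.path.exists(_model_path):
        print('missing model file: ' + _model_path)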

def main():
    # sentence_splitter()
    words = segmentor('我家在中科院,我現在在北京上學。中秋節你是否會想到李白?')
    # print(words)
    tags = posttagger(words)
    netags = ner(words, tags)
    arcs = parse(words, tags)
    roles = role_label(words, tags, netags, arcs)
    print(roles)


# Sentence splitting: cut a piece of text into individual sentences
def sentence_splitter(sentence='你好,你覺得這個例子從哪裡來的?當然還是直接複製官方文件,然後改了下這裡得到的。我的微博是MebiuW,轉載請註明來自MebiuW'):
    sents = SentenceSplitter.split(sentence)  # split into sentences
    print('\n'.join(sents))


"""Word segmentation"""
def segmentor(sentence=None):
    segmentor = Segmentor()              # initialize an instance
    segmentor.load(cws_model_path)       # load the model
    words = segmentor.segment(sentence)  # segment
    print('\t'.join(words))              # the raw result can be printed directly
    words_list = list(words)             # or converted to a list
    segmentor.release()                  # release the model
    return words_list


"""POS tagging"""
def posttagger(words):
    postagger = Postagger()              # initialize an instance
    postagger.load(pos_model_path)       # load the model
    postags = postagger.postag(words)    # POS tagging
    for word, tag in zip(words, postags):
        print(word + '/' + tag)
    postagger.release()                  # release the model
    return postags


"""Named entity recognition"""
def ner(words, postags):
    print('NER starts')
    recognizer = NamedEntityRecognizer()
    recognizer.load(ner_model_path)      # load the model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    for word, ntag in zip(words, netags):
        print(word + '/' + ntag)
    recognizer.release()                 # release the model
    nerttags = list(netags)
    return nerttags


"""Dependency parsing"""
def parse(words, postags):
    parser = Parser()                    # initialize an instance
    parser.load(par_model_path)          # load the model
    arcs = parser.parse(words, postags)  # dependency parsing
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()                     # release the model
    return arcs


"""Semantic role labelling"""
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()    # initialize an instance
    labeller.load(srl_model_path)        # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labelling
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    labeller.release()                   # release the model
    return roles
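
# A small added sketch (not part of the original demo) showing how to read the
# parser output: arc.head is 1-based and 0 denotes the virtual root, so the
# head word of words[i] is words[arcs[i].head - 1].
def print_dependencies(words, arcs):
    for word, arc in zip(words, arcs):
        head = 'ROOT' if arc.head == 0 else words[arc.head - 1]
        print('%s --%s--> %s' % (word, arc.relation, head))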

def testss():
    sentence = "600251後市怎樣操作?謝謝。"
    hagongdaLTP = pyltpT.PyltpT()
    hagongdalist = hagongdaLTP.ltpmain(sentence)
    sentence1 = "$精華製藥(sz003買入,短線還有機會嗎?壓力位多少,大概什麼價位賣掉合適?謝謝。"
    hagongdaLTP1 = pyltpT.PyltpT()
    hagongdalist2 = hagongdaLTP1.ltpmain(sentence1)
    print(hagongdalist)
    # caution: the list is mutated while being iterated over
    for item in hagongdalist2:
        if 'sh' in item or 'sz' in item:  # fixed: `if 'sh' or 'sz' in item` was always true
            hagongdalist2[hagongdalist2.index(item)] = item[2:8]
        item = re.sub(r"[\s+\.\!\/_,\[\]$\-:);%=^*(+\"\']+|[+——“”?。?<《》>、~@#%……&*()]+", '', item)
        if len(item) == 1:
            hagongdalist2.remove(item)
    print(hagongdalist2)


def hagongda2(sentence):
    LTP_DATA_DIR = r'E:\BaiduNetdiskDownload\ltp_data_v3.4.0'  # path to the LTP model directory
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')   # word segmentation model, `cws.model`
    cidian_path = os.path.join(LTP_DATA_DIR, 'cidian.txt')     # user lexicon
    print(cidian_path)
    segmentor = Segmentor()                                    # initialize an instance
    segmentor.load_with_lexicon(cws_model_path, cidian_path)   # load the model plus the user lexicon
    sentence = ''.join(sentence.split())
    sentence = re.sub(r"[\s+\.\!\/_,\[\]$\-:);%=^*(+\"\']+|[+——“”?。?<《》>、~@#%……&*()]+", '', sentence)
    words = segmentor.segment(sentence)
    # print(' '.join(words))
    words_list = list(words)
    segmentor.release()
    return words_list
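
# Usage sketch for hagongda2() (assumes the user lexicon `cidian.txt` exists in
# LTP_DATA_DIR; for the segmentor, pyltp expects the lexicon as plain text with
# one word per line):
#   print(hagongda2('亞硝酸鹽是一種化學物質'))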

def test2():
    LTP_DATA_DIR = r'E:\BaiduNetdiskDownload\ltp_data_v3.4.0'    # path to the LTP model directory
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')     # word segmentation model, `cws.model`
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')     # POS tagging model, `pos.model`
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')     # named entity recognition model, `ner.model`
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model, `parser.model`
    srl_model_path = os.path.join(LTP_DATA_DIR, 'srl')           # SRL model directory (pre-3.4.0 layout)
    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    words = segmentor.segment('亞硝酸鹽是一種化學物質')
    print(' '.join(words))
    segmentor.release()


def test():
    project_path = "E:\\BaiduNetdiskDownload\\ltp_data_v3.4.0"  # project directory
    # can be ltp_test or one of the (cws|pos|par|ner)_cmdline tools; note that each
    # accepts a different set of options, so leave unsupported ones as ""
    model_exe = "cws_cmdline"
    threads_num = " --threads " + str(3)  # number of threads
    # last_stage = " --last-stage " + "all"  # final stage, one of ws|pos|ner|dp|srl|all
    input_path = " --input " + "E:\\BaiduNetdiskDownload\\ltp_data_v3.4.0\\file\\test.txt"  # input file
    seg_lexicon = ""   # user lexicon for segmentation
    pos_lexicon = ""   # user lexicon for POS tagging
    output_path = "E:\\LTP-project\\file\\out.txt"  # output file
    command = "cd " + project_path + " & " + model_exe + threads_num + input_path + " > " + output_path
    os.system(command)


if __name__ == '__main__':
    main()
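
Two version notes: LTP 3.4.0 packages the semantic role labelling model as the single file pisrl.model (the pyltp documentation ships a separate pisrl_win.model for Windows), while older releases used an srl model directory, which is what test2() still points at. Also, SementicRoleLabeller really is spelled that way in pyltp's API. Running the script directly executes main(), which chains segmentation, POS tagging, NER, dependency parsing, and semantic role labelling over the sample sentence.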