A CRF-Based Chinese Dependency Parsing Model
阿新 · Published: 2018-11-30
A dependency structure is itself a tree: each word is treated as a node, and each dependency relation is a directed edge from a head to its dependent. This article builds the parser on the syntactically annotated corpus from Tsinghua University.
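To make that representation concrete, here is a minimal sketch (not part of the original post; the sentence, POS tags and head indices are invented for illustration) that stores a dependency tree as word nodes plus directed head-to-dependent edges:

# Minimal sketch: a dependency tree as nodes (words) and directed edges (head -> dependent).
# The sentence, POS tags and head indices below are illustrative values only.
sentence = [
    # (token index, word, POS tag, index of its head; 0 stands for the artificial root)
    (1, u"我",   "r", 2),   # "I"      -> depends on "like"
    (2, u"喜歡", "v", 0),   # "like"   -> root of the tree
    (3, u"蘋果", "n", 2),   # "apples" -> depends on "like"
]
# Every (head, dependent) pair is one directed edge of the tree.
edges = [(head, idx) for idx, word, pos, head in sentence]
print(edges)   # [(2, 1), (0, 2), (2, 3)]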
Implementation code:
import sys
reload(sys)                        # Python 2 idiom: set the default string encoding
sys.setdefaultencoding('utf8')
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.externals import joblib  # on newer scikit-learn, use "import joblib" instead


class CorpusProcess(object):

    def __init__(self):
        """Initialize paths to the pre-processed corpus files."""
        self.train_process_path = "D://input_py//day17//train.data"  # pre-processed training set
        self.test_process_path = "D://input_py//day17//dev.data"     # pre-processed test set

    def read_corpus_from_file(self, file_path):
        """Read the corpus file and return its lines."""
        f = open(file_path, 'r')  # , encoding='utf-8'
        lines = f.readlines()
        f.close()
        return lines

    def write_corpus_to_file(self, data, file_path):
        """Write data back to a corpus file."""
        f = open(file_path, 'w')
        f.write(str(data))
        f.close()

    def process_sentence(self, lines):
        """Group tab-separated token lines into sentences; a blank line ends a sentence."""
        sentence = []
        for line in lines:
            if not line.strip():
                yield sentence
                sentence = []
            else:
                fields = line.strip().split(u'\t')
                sentence.append(fields)

    def initialize(self):
        """Load the corpora and split them into sentences."""
        train_lines = self.read_corpus_from_file(self.train_process_path)
        test_lines = self.read_corpus_from_file(self.test_process_path)
        self.train_sentences = [sentence for sentence in self.process_sentence(train_lines)]
        self.test_sentences = [sentence for sentence in self.process_sentence(test_lines)]

    def generator(self, train=True):
        """Feature generator for the training or test set."""
        if train:
            sentences = self.train_sentences
        else:
            sentences = self.test_sentences
        return self.extract_feature(sentences)

    def extract_feature(self, sentences):
        """Extract features: current, previous and next word/POS tag, plus their combinations."""
        features, tags = [], []
        for index in range(len(sentences)):
            feature_list, tag_list = [], []
            for i in range(len(sentences[index])):
                feature = {"w0": sentences[index][i][0],
                           "p0": sentences[index][i][1],
                           "w-1": sentences[index][i-1][0] if i != 0 else "BOS",
                           "w+1": sentences[index][i+1][0] if i != len(sentences[index])-1 else "EOS",
                           "p-1": sentences[index][i-1][1] if i != 0 else "un",
                           "p+1": sentences[index][i+1][1] if i != len(sentences[index])-1 else "un"}
                feature["w-1:w0"] = feature["w-1"] + feature["w0"]
                feature["w0:w+1"] = feature["w0"] + feature["w+1"]
                feature["p-1:p0"] = feature["p-1"] + feature["p0"]
                feature["p0:p+1"] = feature["p0"] + feature["p+1"]
                feature["p-1:w0"] = feature["p-1"] + feature["w0"]
                feature["w0:p+1"] = feature["w0"] + feature["p+1"]
                feature_list.append(feature)
                tag_list.append(sentences[index][i][-1])  # the last column is the dependency tag
            features.append(feature_list)
            tags.append(tag_list)
        return features, tags


class ModelParser(object):

    def __init__(self):
        """Initialize hyperparameters and pre-process the corpus."""
        self.algorithm = "lbfgs"
        self.c1 = 0.1
        self.c2 = 0.1
        self.max_iterations = 100
        self.model_path = "model.pkl"
        self.corpus = CorpusProcess()  # instantiate the CorpusProcess class
        self.corpus.initialize()       # corpus pre-processing
        self.model = None

    def initialize_model(self):
        """Model initialization."""
        algorithm = self.algorithm
        c1 = float(self.c1)
        c2 = float(self.c2)
        max_iterations = int(self.max_iterations)
        self.model = sklearn_crfsuite.CRF(algorithm=algorithm, c1=c1, c2=c2,
                                          max_iterations=max_iterations,
                                          all_possible_transitions=True)

    def train(self):
        """Train the CRF, print the test-set classification report, and save the model."""
        self.initialize_model()
        x_train, y_train = self.corpus.generator()
        self.model.fit(x_train, y_train)
        labels = list(self.model.classes_)
        x_test, y_test = self.corpus.generator(train=False)
        y_predict = self.model.predict(x_test)
        metrics.flat_f1_score(y_test, y_predict, average='weighted', labels=labels)
        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
        print(metrics.flat_classification_report(y_test, y_predict, labels=sorted_labels, digits=3))
        self.save_model()

    def predict(self, sentences):
        """Predict tags for new sentences."""
        self.load_model()
        features, _ = self.corpus.extract_feature(sentences)
        return self.model.predict(features)

    def load_model(self, name='model'):
        """Load the model."""
        self.model = joblib.load(self.model_path)

    def save_model(self, name='model'):
        """Save the model."""
        joblib.dump(self.model, self.model_path)


model = ModelParser()
model.train()
Run results:
label     precision  recall  f1-score  support

-1_a      0.811      0.796   0.804     221
-1_b      0.783      0.770   0.777     61
-1_c      0.000      0.000   0.000     5
-1_d      0.711      0.409   0.519     66
-1_f      0.867      0.565   0.684     23
-1_h      0.000      0.000   0.000     0
-1_k      0.667      1.000   0.800     2
-1_m      0.905      0.895   0.900     256
-1_n      0.720      0.754   0.737     967
11_n      0.000      0.000   0.000     0
-1_ng     0.000      0.000   0.000     23
-1_nl     0.750      0.143   0.240     21
-1_nr     1.000      0.059   0.111     17
-1_nr1    0.000      0.000   0.000     0
-1_nr2    0.000      0.000   0.000     0
-1_nrf    0.000      0.000   0.000     25
-1_nrj    0.000      0.000   0.000     2
-1_ns     0.870      0.400   0.548     50
-1_nsf    0.822      0.402   0.540     92
-1_nt     0.667      0.286   0.400     14
-1_nz     0.000      0.000   0.000     7
-1_o      0.000      0.000   0.000     1
-1_p      0.524      0.214   0.303     103
-1_q      0.706      0.649   0.676     37
-1_r      0.946      0.841   0.891     63
-1_s      0.737      0.933   0.824     15
-1_t      0.952      0.894   0.922     66
-1_u      0.000      0.000   0.000     3
-1_v      0.628      0.669   0.648     2396
-1_x      0.000      0.000   0.000     0
-1_z      0.875      0.636   0.737     11
-2_a      0.800      0.364   0.500     11
-2_b      0.000      0.000   0.000     1
-2_c      0.000      0.000   0.000     0
-2_d      0.000      0.000   0.000     2
-2_f      0.000      0.000   0.000     1
-2_m      0.897      0.876   0.886     89
-2_n      0.095      0.021   0.034     95
-2_ng     0.000      0.000   0.000     0
-2_nl     0.000      0.000   0.000     1
-2_nr     0.000      0.000   0.000     1
-2_nr2    0.000      0.000   0.000     0
-2_nrf    0.000      0.000   0.000     3
-2_ns     0.000      0.000   0.000     3
-2_nsf    0.000      0.000   0.000     3
-2_nz     0.000      0.000   0.000     1
-2_p      0.000      0.000   0.000     9
-2_q      0.000      0.000   0.000     2
-2_r      0.000      0.000   0.000     3
-2_s      0.000      0.000   0.000     0
-2_t      1.000      0.920   0.958     25
-2_u      0.000      0.000   0.000     0
-2_v      0.326      0.211   0.256     445
-2_z      0.000      0.000   0.000     0
-3_a      1.000      0.500   0.667     2
-3_b      0.000      0.000   0.000     0
-3_d      0.000      0.000   0.000     0
-3_m      0.625      0.750   0.682     20
-3_n      0.000      0.000   0.000     20
-3_nl     0.000      0.000   0.000     0
-3_nsf    0.000      0.000   0.000     0
-3_p      0.000      0.000   0.000     1
-3_q      0.000      0.000   0.000     0
-3_t      1.000      0.571   0.727     14
-3_v      0.125      0.045   0.066     112
-4_b      0.000      0.000   0.000     0
-4_d      0.000      0.000   0.000     0
-4_m      0.500      0.200   0.286     5
-4_n      0.000      0.000   0.000     8
-4_nsf    0.000      0.000   0.000     0
-4_p      0.000      0.000   0.000     1
-4_t      0.000      0.000   0.000     0
-4_v      0.000      0.000   0.000     33
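Once train() has written model.pkl, the saved CRF can be reloaded and applied to new sentences. The snippet below is only a hypothetical usage sketch: the example words and POS tags are invented, ModelParser's constructor still expects the corpus paths above to exist, and extract_feature only reads the word (column 0) and POS tag (column 1) of each row, so a sentence can be passed as a list of [word, POS] pairs. The predicted labels follow the corpus tag scheme, which, judging from the report above, appears to combine a relative head position with a POS tag (e.g. -1_v).

# Hypothetical usage sketch: load the saved model.pkl and tag a new sentence.
# Each row is [word, POS]; only these two columns are used to build the features.
parser = ModelParser()                                     # note: still reads the corpus files
sentence = [[u"我", "r"], [u"喜歡", "v"], [u"蘋果", "n"]]  # invented example input
print(parser.predict([sentence]))                          # one list of predicted tags per sentence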