1. 程式人生 > 其它 >vivo工具開發線上計算語言模型打分

vivo工具開發線上計算語言模型打分

import json
import time
import uuid
from tqdm import tqdm
from json import JSONDecodeError
from flask import Flask, jsonify, request
import logging

from call_tokenize import get_tokenize_and_query

app = Flask(__name__)

kv_dict = {}


def arpa_read(files):
    with open(files, mode='r', encoding='
UTF-8') as f1: arpa_list = f1.readlines() for line in tqdm(arpa_list): try: splitList = line.strip("\n").split("\t") if len(splitList) == 2: key = splitList[1] v1 = float(splitList[0]) v2
= 0 elif len(splitList) == 3: key = splitList[1] v1 = float(splitList[0]) v2 = float(splitList[2]) else: continue kv_dict[key] = (v1, v2) except Exception:
continue return kv_dict #arpa_read("D:\\Users\\72152411\\Documents\\vchat\\ChatFiles\\trainfile.lm") # 呼叫函式arpa檔案轉換成字典 arpa_read("./1_9_arpa") # 呼叫函式arpa檔案轉換成字典 def _score(sentence): def calculate_sentence_start(word_0, word_1): key = word_0 + " " + word_1 if key in kv_dict: return kv_dict[key][0] else: return kv_dict[word_1][0] + kv_dict[word_0][1] def score_bigram_prob(w1,w2): s2 = 0 if w1 == "<s>": calculate_sentence_start(w1,w2) else: key = w1 + " " + w2 if key in kv_dict: s2 += kv_dict[key][0] return s2 else: s2 += kv_dict[w2][0] + kv_dict[w1][1] return s2 def score_trigram_prob(trigram_list): first, second, third = trigram_list tri_key = " ".join(trigram_list) if tri_key in kv_dict: return kv_dict[tri_key][0] # 需要回退 else: bi_key = second + " " + third bi_bow_key = first + " " + second # 回退到bigram if bi_key in kv_dict: bi_prob = kv_dict[bi_key][0] bi_bow = 0 # 後面的二元有, 前面上文的backoff可以查到 if bi_bow_key in kv_dict: bi_bow = kv_dict[bi_bow_key][1] bi_prob += bi_bow return bi_prob # 回退到unigram else: if third not in kv_dict: raise ValueError bi_bow = 0 uni_bow = 0 # 前面上文的backoff可以查到 if bi_bow_key in kv_dict: bi_bow = kv_dict[bi_bow_key][1] if second in kv_dict: uni_bow = kv_dict[second][1] uni_prob = kv_dict[third][0] + uni_bow + bi_bow return uni_prob try: sentence = ("<s> " + sentence).strip() wordArr = sentence.strip().split(" ") # 對輸入語料進行切分 wordArrK = tuple(wordArr) # 轉換成元組 因為字典的k不能是list 將切分好的語料和字典的k匹配 total_score = 0 if len(wordArrK) == 1: # <s> return kv_dict.get(wordArrK[0])[0] elif len(wordArrK) == 2: # <s> 今天 return calculate_sentence_start(wordArrK[0], wordArrK[1]) else: total_score += calculate_sentence_start(wordArrK[0], wordArrK[1]) for i in range(0, len(wordArrK) - 2): total_score += score_trigram_prob(wordArrK[i:i + 3]) return total_score except Exception as e: app.logger.exception(e) return -1 @app.route('/ngram_score', methods=['POST']) def score(): try: parameters = request.form print(parameters) sentence = parameters['sentence'] print(sentence) score_result = _score(sentence) response = {"score": score_result} response = jsonify(response) except Exception: response = 'Internal error', 500 return response @app.route('/input_score', methods=['POST']) def input_score(): try: parameters = request.form user_input = parameters['user_input'] keyboard_type = parameters['keyboard_type'] tokenize_result = get_tokenize_and_query(user_input, keyboard_type) result = [] for words, score in tokenize_result: sent = " ".join(words) score_result = _score(sent) result.append((sent, score_result, score)) response = result response = jsonify(response) except Exception as e: app.logger.exception(e) response = 'Internal error', 500 return response if __name__ == '__main__': app.run(host='0.0.0.0', port=9494)