
Mining product review data for review tags (product attribute + opinion) and user-group information

Tags: Python, natural language processing, opinion extraction, review data, LTP, dependency parsing

This post mines product reviews to extract review tags (a product attribute paired with an opinion word) and user-group information, in three steps:

Step 1: Preprocess the text, segment it into words, and perform semantic role labeling

```python
# -*- coding:utf-8 -*-
import os
import heapq
import re
import emoji
import pandas as pd
import numpy as np
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
from gensim.models import Word2Vec


class Sentence_Parser:
    def __init__(self):
        LTP_DIR = './ltp_data_v3.4.0'
        # Word segmentation
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, 'cws.model'))
        # Part-of-speech tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, 'pos.model'))
        # Dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, 'parser.model'))
        # Named entity recognition (person, place, organization, etc.)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, 'ner.model'))
        # Semantic role labeling (agent, patient, time, place)
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))

    def format_labelrole(self, words, postags):
        """Semantic role labeling."""
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {arg.name: [arg.name, arg.range.start, arg.range.end]
                                      for arg in role.arguments}
        return roles_dict

    def build_parser_child_dict(self, words, postags, arcs):
        """Dependency parsing: keep, for every word, a dict of its dependent child nodes."""
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                # arc heads are 1-based; 0 is the virtual root
                if arcs[arc_index].head == index + 1:
                    if arcs[arc_index].relation not in child_dict:
                        child_dict[arcs[arc_index].relation] = []
                    child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]
        relation = [arc.relation for arc in arcs]
        heads = ['Root' if rid == 0 else words[rid - 1] for rid in rely_id]
        for i in range(len(words)):
            format_parse_list.append([relation[i], words[i], i, postags[i],
                                      heads[i], rely_id[i] - 1, postags[rely_id[i] - 1]])
        return child_dict_list, format_parse_list

    def parser_main(self, sentence):
        """Main entry point for parsing one sentence."""
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parser_child_dict(words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list

    def select(self, words, postags):
        """Filter out the candidate nouns and adjectives."""
        co_model = Word2Vec.load('coseg_text.model')
        n_list0, a_list = [], []
        for i in range(len(postags)):
            if postags[i] == 'n' and len(words[i]) >= 2:
                n_list0.append(words[i])
            if postags[i] == 'a':
                a_list.append(words[i])
        n_list0 = list(set(n_list0))
        a_list = list(set(a_list))
        si_p = []
        for n in n_list0:
            try:
                si_p.append(co_model.similarity(n, '手機'))
            except Exception:
                si_p.append(0)
        # Keep the 80% of nouns most related to '手機' (mobile phone)
        index_list = list(map(si_p.index, heapq.nlargest(int(0.8 * len(si_p)), si_p)))
        n_list = [n_list0[index] for index in index_list]
        return n_list, a_list

    def simlarity(self, n_list0, a_list):
        """Forward and backward matching by similarity to find the best noun-adjective pairs."""
        co_model = Word2Vec.load('coseg_text.model')
        si_p = []
        for n in n_list0:
            try:
                si_p.append(co_model.similarity(n, '手機'))
            except Exception:
                si_p.append(0)
        index_list = list(map(si_p.index, heapq.nlargest(int(0.8 * len(si_p)), si_p)))
        n_list = [n_list0[index] for index in index_list]

        # Forward matching: for each noun, the adjective most similar to it
        comment1_df = pd.DataFrame(columns=['comment_tag', 'similarity'], index=np.arange(100))
        for i in range(len(n_list)):
            f_si, comment_tag = 0, None
            for j in range(len(a_list)):
                try:
                    si = co_model.similarity(n_list[i], a_list[j])
                    if si >= f_si:
                        f_si = si
                        comment_tag = n_list[i] + a_list[j]
                except Exception as e:
                    print('word not in corpus', e)
            comment1_df.loc[i] = [comment_tag, f_si]
        comment1_df = comment1_df.sort_values(by='similarity', ascending=False, ignore_index=True)
        comment1_df.dropna(subset=['comment_tag'], inplace=True)

        # Backward matching: for each adjective, the noun most similar to it
        comment2_df = pd.DataFrame(columns=['comment_tag', 'similarity'], index=np.arange(100))
        for i in range(len(a_list)):
            f_si, comment_tag = 0, None
            for j in range(len(n_list)):
                try:
                    si = co_model.similarity(n_list[j], a_list[i])
                    if si >= f_si:
                        f_si = si
                        comment_tag = n_list[j] + a_list[i]
                except Exception as e:
                    print('word not in corpus', e)
            comment2_df.loc[i] = [comment_tag, f_si]
        comment2_df = comment2_df.sort_values(by='similarity', ascending=False, ignore_index=True)
        comment2_df.dropna(subset=['comment_tag'], inplace=True)

        # Keep only the pairs agreed on by both directions
        comment_df = pd.merge(comment1_df, comment2_df, on='comment_tag', how='inner')
        comment_df.dropna(subset=['comment_tag'], inplace=True)
        return comment_df

    def cleandata(self, x):
        """Clean a comment: normalize irregular punctuation and strip emoji."""
        pat = re.compile('[^\u4e00-\u9fa5.a-zA-Z0-9]')  # keep only Chinese, letters, digits and dots
        x = x.replace(' ', ',')
        x = emoji.demojize(x)  # turn emoji into :name: codes so the regex can drop them
        x = re.sub(pat, ',', x)
        return x
```
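
A minimal usage sketch for this class (it assumes the LTP models under `./ltp_data_v3.4.0` and the Word2Vec file `coseg_text.model` are in place; the sample review is made up):

```python
# Minimal usage sketch; assumes the LTP models and coseg_text.model exist.
parser = Sentence_Parser()
sentence = parser.cleandata('手機螢幕很清晰,電池也耐用')
words, postags, child_dict_list, roles_dict, format_parse_list = parser.parser_main(sentence)
n_list, a_list = parser.select(words, postags)  # candidate nouns and adjectives
print(parser.simlarity(n_list, a_list))         # best noun-adjective pairings
```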

Step 2: Extract entities and related entity information


```python
# -*- coding:utf-8 -*-
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
from gensim.models import Word2Vec
from cixing import Sentence_Parser  # the parser class from Step 1
import pandas as pd
import numpy as np
import heapq
import re
import emoji

class Extractor:
    def __init__(self):
        self.co_model = Word2Vec.load('coseg_text.model')
        self.parser = Sentence_Parser()

    def get_seginfo(self, comment_list):
        for c in range(len(comment_list)):
            # Truncate overly long comments to 200 characters
            sentence = comment_list[c][0:200]
            if sentence != '':
                sentence = self.parser.cleandata(sentence)
                words, postags, child_dict_list, roles_dict, format_parse_list = self.parser.parser_main(sentence)
                n_list, a_list = self.parser.select(words, postags)

                tags = []
                for j in range(len(a_list)):
                    p = words.index(a_list[j])
                    if child_dict_list[p]:
                        # The adjective is the predicate of a subject-verb (SBV) relation
                        if 'SBV' in child_dict_list[p]:
                            si_p = []
                            for po in child_dict_list[p]['SBV']:
                                try:
                                    si_p.append(self.co_model.similarity(words[po], '手機'))
                                except Exception:
                                    si_p.append(0)
                            # Index of the SBV child noun most related to '手機'
                            id = list(map(si_p.index, heapq.nlargest(1, si_p)))

                            s = child_dict_list[p]['SBV'][id[0]]
                            w1 = words[s] + a_list[j]
                            # Prefix the subject's attributive (ATT) noun, if any
                            if child_dict_list[s]:
                                if 'ATT' in child_dict_list[s]:
                                    if postags[child_dict_list[s]['ATT'][0]] == 'n':
                                        w2 = words[child_dict_list[s]['ATT'][0]] + w1
                                        tags.append(w2)
                                    else:
                                        tags.append(w1)
                            else:
                                tags.append(w1)

                        # The adjective modifies a noun through an attribute (ATT) relation
                        if 'ATT' in child_dict_list[p]:
                            s = child_dict_list[p]['ATT'][0]
                            if 'SBV' in child_dict_list[s]:
                                w3 = words[child_dict_list[s]['SBV'][0]]
                                w4 = w3 + a_list[j]
                                id1 = words.index(w3)
                                if child_dict_list[id1]:
                                    if 'ATT' in child_dict_list[id1]:
                                        if postags[child_dict_list[id1]['ATT'][0]] == 'n':
                                            w5 = words[child_dict_list[id1]['ATT'][0]] + w4
                                            tags.append(w5)
                                else:
                                    tags.append(w4)

                # 'with' closes the file automatically; one space-separated tag list per comment
                with open(r'F:\pycharm project data\taobao\phone\tags.txt', 'a') as t:
                    t.write(' '.join(tags) + '\n')
                print(tags)


                # Collect the related nouns and infer the user group
                n_list = list(set(n_list))
                if n_list:
                    with open(r'F:\pycharm project data\taobao\phone\noun.txt', 'a') as f:
                        f.write(' '.join(n_list) + '\n')
                si_p = []
                u_list = ['小孩子', '作業', '高中', '初中', '兒童', '學校', '小孩', '老師', '網癮', '中學生', '小學', '女兒', '小學生', '孩子', '閨女', '兒子', '學生', '網課', '小朋友',
                            '同事', '表弟', '親戚', '姐妹', '表哥', '鄰居', '同學', '朋友', '盆友', '連結',
                            '姥姥', '老太太', '老人', '岳母', '父親', '老孃', '小姨', '老丈人', '舅舅', '岳父', '親人', '老媽子', '老頭兒', '婆婆', '老太', '老頭子', '父母', '家婆', '老父親', '老爹', '長輩', '大人', '外爺', '爺爺', '我爸', '老頭', '老媽', '老爺子', '爸媽', '奶奶', '老伴', '老爸', '母親', '老人家', '媽媽', '公公', '爸爸', '丈母孃', '姥爺', '家裡人', '家人',
                            '老奶奶', '小夥子', '阿姨', '娘娘', '小姑子', '姐姐', '老妹', '嬸嬸', '大姐', '外孫', '小屁孩', '孫子', '姨媽', '棉襖', '伯母', '孝心',
                            '媳婦', '妹妹', '男朋友', '物件', '生日', '女朋友', '男票', '老婆', '弟弟', '情人節', '爹媽', '麻麻', '老公', '外甥', '老弟'
                ]
                for n in range(len(n_list)):
                    for u in range(len(u_list)):
                        try:
                            si_p.append(self.co_model.similarity(n_list[n], u_list[u]))
                        except Exception:
                            si_p.append(0)
                # Pick the noun most similar to any of the user-group words
                index_list = list(map(si_p.index, heapq.nlargest(1, si_p)))
                user_list = []
                for index in index_list:
                    index = int(index / len(u_list))  # map the flat index back to n_list
                    user_list.append(n_list[index])
                with open(r'F:\pycharm project data\taobao\phone\user.txt', 'a') as u:
                    u.write(' '.join(user_list) + '\n')
```
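
As an illustration of the SBV branch above: in a review like 「螢幕很清晰」, the adjective 清晰 has an SBV child pointing at the noun 螢幕, so the two are concatenated into the tag 螢幕清晰. A minimal, hypothetical run (the sample reviews are made up; the output paths hard-coded in `get_seginfo` are the author's and may need adjusting):

```python
# Hypothetical two-review run; writes tags.txt, noun.txt and user.txt
# to the paths hard-coded in get_seginfo.
extractor = Extractor()
extractor.get_seginfo(['手機螢幕很清晰,給媽媽買的她很喜歡',
                       '電池不耐用,發熱也嚴重'])
```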

Step 3: Test the data and the model

```python
# -*- coding:utf-8 -*-
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import heapq
import re
import emoji
from extractor import Extractor

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 5000)
pd.set_option('max_colwidth', 30)
pd.set_option('display.width', 1000)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)

# 1. Data preparation
# Load the data
df = pd.read_csv(r'F:\pycharm project data\taobao\phone\comment1.csv', encoding='utf-8-sig')
# Extract the review text and drop Taobao's default placeholder reviews
co_df = df[['content']]
co_df = co_df.loc[co_df['content'] != '15天內買家未作出評價', ['content']]
co_df = co_df.loc[co_df['content'] != '評價方未及時做出評價,系統預設好評!', ['content']]
comment_list = co_df['content'].tolist()




if __name__ == '__main__':
    myextractor = Extractor()
    myextractor.get_seginfo(comment_list)
```
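
After the extractor has processed the whole comment list, the per-comment tag lists in tags.txt can be aggregated into the most frequent review tags. A minimal sketch, assuming the file layout written in Step 2:

```python
# Minimal sketch, assuming tags.txt was written by Step 2
# (one space-separated list of tags per comment).
from collections import Counter

with open(r'F:\pycharm project data\taobao\phone\tags.txt') as f:
    tags = f.read().split()
print(Counter(tags).most_common(20))  # top-20 review tags
```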