1. 程式人生 > 基於共現提取《人民的名義》人物關係

基於共現提取《人民的名義》人物關係

```python

# -*- coding: utf-8 -*-

import os,sys
import re
import jieba,codecs,math
import jieba.posseg as pseg
import string
from zhon.hanzi import punctuation

# Name dictionary: maps each person's name to the number of times that name
# occurs in the whole text.
names = {}
# Relationship dictionary (directed edges): the key is the edge's start
# person and the value is a dict `edge` whose keys are the end persons and
# whose values are the edge weights, i.e. how closely the two people are
# connected.
relationships = {}
# Per-line name lists: lineNames[i] is the list of person names found when
# segmenting the i-th line of the input (the original author treats each
# line as one episode).
lineNames = []

# Load the user dictionary so jieba segments the words listed there as whole
# tokens (presumably the character names; dict.txt is not shown here).
jieba.load_userdict("dict.txt")

# Build the punctuation stripper once instead of once per line.  A plain u""
# literal is used rather than the Python-2-only ur"" prefix; the pattern has
# no backslashes, so a raw string is not needed.
punct_re = re.compile(u"[%s]+" % punctuation)

# Read raw bytes and decode each line from GB2312 exactly once.  This gives
# the same unicode text as the former decode -> encode(utf-8) -> decode(utf-8)
# round trip and behaves the same on Python 2 and Python 3.
with open("introduction.txt", "rb") as f:
    for raw_line in f:
        line = punct_re.sub(u"", raw_line.decode("GB2312"))  # strip punctuation
        lineNames.append([])  # names seen in this line, duplicates included
        for w in pseg.cut(line):
            # A token counts as a person name only when it is tagged "nr"
            # and is at least two characters long.
            if w.flag != "nr" or len(w.word) < 2:
                continue
            lineNames[-1].append(w.word)
            if names.get(w.word) is None:
                # First sighting: create the counter and the edge dict.
                names[w.word] = 0
                relationships[w.word] = {}
            names[w.word] += 1

# Report every detected name with its total occurrence count.
for name, times in names.items():
    print("%s %s" % (name, times))

# Accumulate co-occurrence weights: for every line, every ordered pair of
# distinct names appearing in that line adds 1 to the edge name1 -> name2.
# A name that is listed several times in one line contributes once per
# listing, because lineNames keeps duplicates.
for line in lineNames:
    for name1 in line:
        edges = relationships[name1]
        for name2 in line:
            if name1 != name2:
                # A missing edge starts at 0, so the first co-occurrence
                # stores 1.
                edges[name2] = edges.get(name2, 0) + 1

# Write the node table: one header row, then one row per person holding
# its id, its label (both are the name) and its occurrence count.
# Fields are space-separated, rows end with CRLF, and the file is encoded
# as GBK.
with codecs.open("node.txt", "w", "gbk") as f:
    f.write("Id Label Weight\r\n")
    for name, times in names.items():
        f.write(name + " " + name + " " + str(times) + "\r\n")

# Write the edge table: one header row, then one row per directed edge
# (source, target, weight).  Fields are space-separated, rows end with CRLF,
# and the file is encoded as GBK.
with codecs.open("edge.txt", "w", "gbk") as f:
    f.write("Source Target Weight\r\n")
    for name, edges in relationships.items():
        for v, w in edges.items():
            # Only edges with a weight above 3 are written out.
            if w > 3:
                f.write(name + " " + v + " " + str(w) + "\r\n")