1. 程式人生 > 其它 > jieba 分詞-26

jieba 分詞-26

from collections import Counter

import jieba
# Extra words registered with jieba (via jieba.add_word) before the text is
# segmented. Every entry except '顰兒' also appears in the alias table
# `bieming` defined further down.
# NOTE(review): presumably these are registered so that jieba keeps each
# nickname as a single token -- confirm against the jieba documentation.
mydict = [
    '璉二奶奶', '鳳哥兒', '鳳丫頭', '寶姑娘', '顰兒',
    '二姑娘', '三姑娘', '四姑娘', '雲妹妹', '蓉大奶奶',
]
for nickname in mydict:
    jieba.add_word(nickname)
# Read the entire UTF-8 text file into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open for the
# rest of the script.
with open('紅樓夢.txt', "r", encoding='utf-8') as text_file:
    txt = text_file.read()
# Alias table: one inner list per character, holding every name under which
# that character is counted. The groups are positional -- the script later
# pairs group k with the k-th display name in `twelvechai`.
bieming = [
    ["林妹妹", "黛玉", '林姑娘', '林黛玉'],
    ["寶釵", '寶姑娘', '寶丫頭', '寶姐姐', '薛寶釵'],
    ['元春', '大姑娘', '娘娘', '貴妃', '元妃', '賈元春'],
    ['迎春', '二姑娘', '賈迎春'],
    ['探春', '三姑娘', '賈探春'],
    ['惜春', '四姑娘', '賈惜春'],
    ["王熙鳳", "鳳丫頭", '璉二奶奶', '鳳姐', '鳳哥兒', '鳳辣子'],
    ['巧姐'],
    ['湘雲', '雲妹妹', '史湘雲'],
    ['妙玉'],
    ['李紈', '大嫂子'],
    ['秦可卿', '可卿', '蓉大奶奶'],
]
# Segment the full text into a list of tokens.
words = jieba.lcut(txt)
# Frequency of every token, skipping tokens that are exactly one character
# long. Counter is a dict subclass, so the counts.get(name, 0) lookups used
# later in the script keep working unchanged.
counts = Counter(word for word in words if len(word) != 1)
# lst[k] is the combined count for the k-th alias group in `bieming`: the sum
# of the token counts of every alias in that group (an alias missing from
# `counts` contributes 0). Iterating over the table itself keeps lst the same
# length as bieming instead of relying on a hard-coded 12.
lst = [sum(counts.get(alias, 0) for alias in aliases) for aliases in bieming]
    
# Display name for each alias group, in the same positional order as the
# totals held in `lst`.
twelvechai = [
    '林黛玉', '薛寶釵', '賈元春', '賈迎春', '賈探春', '賈惜春',
    '王熙鳳', '巧姐', '史湘雲', '妙玉', '李紈', '秦可卿',
]
# Pair every display name with its total by position; zip replaces the
# hard-coded range(12) index loop.
items = [[name, total] for name, total in zip(twelvechai, lst)]
# Highest total first. list.sort is stable, so names with equal totals keep
# their order from twelvechai.
items.sort(key=lambda pair: pair[1], reverse=True)
# One row per character: name left-aligned in 10 columns, total right-aligned
# in 5 columns.
for word, count in items:
    print ("{0:<10}{1:>5}".format(word, count))