Pandas結合Jieba提取xlsx表格檔案高頻詞彙
阿新 • • 發佈:2019-01-25
import pandas as pd
import jieba.analyse
def sort_key(keylist, ratio=None):
    """Extract the highest-weighted keywords from one column of text.

    The number of keywords requested is ``int(n * ratio)``, where ``n`` is
    the number of non-missing cells in ``keylist``.

    :param keylist: pandas Series holding the cells of one column.
    :param ratio: fraction of the non-missing cell count to use as the
        keyword quota; when omitted, the module-level ``percentage`` is used.
    :return: list of keywords produced by ``jieba.analyse.extract_tags``,
        or an empty list when the quota rounds down to zero.
    """
    if ratio is None:
        ratio = percentage
    cells = keylist.dropna()
    percentage_number = int(cells.count() * ratio)
    if percentage_number <= 0:
        # NOTE(review): jieba appears to treat a falsy topK as "no limit",
        # which would return every keyword for a small column instead of
        # none; short-circuit so a zero quota really yields zero keywords.
        print(percentage_number, [])
        return []
    # Join only the cell values: Series.to_string() would also render the
    # index labels and "NaN" placeholders into the text being analysed.
    keyword_string = "\n".join(cells.astype(str))
    res = jieba.analyse.extract_tags(keyword_string, topK=percentage_number)
    print(percentage_number, res)
    return res
def write_excel(res_ls, sheet_name, writer):
    """Write the keyword table to one sheet of an Excel writer.

    :param res_ls: data accepted by the ``pd.DataFrame`` constructor (the
        caller in this file passes a dict of column name -> keyword list).
    :param sheet_name: name of the sheet to write.
    :param writer: ``pd.ExcelWriter`` (or target path) handed to
        ``DataFrame.to_excel``.
    """
    frame = pd.DataFrame(res_ls)
    # Pass sheet_name by keyword: positional use beyond the writer argument
    # is deprecated in pandas 2.x.
    frame.to_excel(writer, sheet_name=sheet_name)
def run(sheet_name, writer, source_path="D:/title.xls"):
    """Extract keywords for every column of one sheet and write them out.

    Each column of the input sheet is passed to ``sort_key``; the resulting
    keyword lists are padded with empty strings to a common length and
    written to a sheet of the same name via ``write_excel``.

    :param sheet_name: sheet to read from the input workbook, also used as
        the name of the output sheet.
    :param writer: Excel writer forwarded to ``write_excel``.
    :param source_path: path of the input workbook; defaults to the
        location the script originally hard-coded.
    """
    tb = pd.read_excel(source_path, sheet_name=sheet_name)
    # Iterating a DataFrame yields its column labels.
    res_ls = {column: sort_key(tb[column]) for column in tb}
    # Pad every keyword list to the longest one so the dict can be turned
    # into a rectangular DataFrame.
    length = max((len(tags) for tags in res_ls.values()), default=0)
    for tags in res_ls.values():
        tags.extend([""] * (length - len(tags)))
    write_excel(res_ls, sheet_name, writer)
# Fraction of each column's non-missing cell count used as the keyword quota
# (read by sort_key).
percentage = 0.15
ls = [u"表一", u"表二", u"表三", u"表四", u"表五"]
# Use the writer as a context manager so the workbook is saved and closed on
# exit; ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter("d:/foo.xlsx") as writer:
    for x in ls:
        run(x, writer)