pynlpir + pandas 文本分析
阿新 • • 發佈:2018-04-30
set del panda 創建 prop imp english nes get
導入包:
import pynlpir import pandas as pd import matplotlib.pyplot as plt from matplotlib.font_manager import FontProperties import multiprocessing,threading,time
讀入初始文本、停用詞文件,創建保存初始分詞數據的 DataFrame:
# Load the stop-word list: one stop word per line of the file.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm it matches how both text files were saved.
with open(r"C:\Users\lenovo\Desktop\肖老師爬蟲項目\停用詞.txt", "r") as f_1:
    stopwords = f_1.read().splitlines()

# The corpus file is intentionally left open here: the script iterates over
# it line by line further down and closes it once reading has finished.
f = open(r"C:\Users\lenovo\Desktop\肖老師爬蟲項目\data_3.txt", "r")

# Accumulator for the raw segmentation result: one row per token, holding the
# word and its part-of-speech label.
pd_root = pd.DataFrame(columns=['詞匯', '詞性'])
一些參數:
# Start timestamp, kept so the run can be timed.
time_start = time.time()

# Initialise the NLPIR engine; must happen before any segmentation call.
pynlpir.open()

# Font used for all plot text (SimHei, so that Chinese labels can be drawn).
font = FontProperties(fname=r'c:\windows\fonts\simhei.ttf', size=13)
過濾停用詞函數:
def stopword_delete(df, stop_words=None):
    """Remove every row of *df* whose ``詞匯`` value is a stop word.

    The frame is modified in place (rows are dropped, the surviving index
    labels are left untouched) and the same object is returned, so both
    ``stopword_delete(df)`` and ``df = stopword_delete(df)`` work.

    Args:
        df: DataFrame with a ``詞匯`` column holding one token per row.
        stop_words: Optional iterable of words to remove. When omitted, the
            module-level ``stopwords`` list is used.

    Returns:
        The same DataFrame object, with the stop-word rows removed.
    """
    if stop_words is None:
        stop_words = stopwords  # module-level list loaded at start-up

    # One vectorised membership test instead of a per-row list scan and a
    # per-row drop; it also works for any index, not only the default 0..n-1.
    is_stop = df['詞匯'].isin(set(stop_words))
    df.drop(index=df.index[is_stop], inplace=True)
    return df
由於文件裏文本內容比較多,直接讀取、分詞、過濾會比較慢,因此採用多線程按行讀取並處理。
單行處理函數:
# Serialises updates to the shared ``pd_root`` frame: ``line_deal`` runs in
# many threads and rebinding the global is a read-modify-write operation.
_pd_root_lock = threading.Lock()


def line_deal(line):
    """Segment one line of text and append its tokens to ``pd_root``.

    Spaces are stripped from the line, the line is segmented with pynlpir
    (parent-level, non-English part-of-speech names), stop words are removed
    via ``stopword_delete`` and the remaining rows are appended to the global
    ``pd_root`` DataFrame.

    Args:
        line: One line of raw text.
    """
    global pd_root
    line = line.replace(" ", "")
    # Each segment is a (word, part-of-speech) pair.
    segment = pynlpir.segment(line, pos_names='parent', pos_english=False)
    pd_line = pd.DataFrame(segment, columns=['詞匯', '詞性'])
    pd_line = stopword_delete(pd_line)
    # ``DataFrame.append`` no longer exists in pandas 2.x; ``pd.concat`` is the
    # equivalent. The lock keeps concurrent threads from overwriting each
    # other's rows.
    with _pd_root_lock:
        pd_root = pd.concat([pd_root, pd_line], ignore_index=True)
使用多線程讀取:
# Fan the corpus out to worker threads, one thread per line, keeping at most
# ``thread_max`` of them alive at any time.
threads_list = []  # threads that may still be running
thread_max = 30    # upper bound on concurrently running threads
n = 0              # number of lines read so far

for line in f:
    # Wait for a free slot. The list is rebuilt rather than mutated while it
    # is being iterated, so no finished thread is skipped.
    while len(threads_list) >= thread_max:
        threads_list = [t for t in threads_list if t.is_alive()]
        if len(threads_list) >= thread_max:
            time.sleep(0.1)

    p = threading.Thread(target=line_deal, args=(line,))
    threads_list.append(p)
    p.start()
    n = n + 1
    print(len(threads_list), n)  # tracked thread count and lines read so far

# Every worker must finish before the accumulated results are used below.
for p in threads_list:
    p.join()

f.close()  # done reading the corpus
打印最初分詞後的數據:
# Preview the first ten rows of the raw segmentation result.
print(pd_root.iloc[:10])
創建詞匯-頻數庫:
# Word -> frequency table, indexed by word and sorted by descending count.
# The counts Series is named explicitly before it becomes a column, so the
# column is always '頻數' regardless of what name this pandas version gives
# the result of ``value_counts``.
pd_word_num = pd_root['詞匯'].value_counts().rename('頻數').to_frame()

# Share of each word among all counted tokens.
pd_word_num['百分比'] = pd_word_num['頻數'] / pd_word_num['頻數'].sum()
print(pd_word_num.head(10))
創建詞性-頻數庫:
# Part-of-speech -> frequency table, indexed by part-of-speech label. The
# counts are named '頻數' explicitly instead of renaming a column whose name
# depends on the pandas version.
pd_qua_num = pd_root['詞性'].value_counts().rename('頻數').to_frame()

# Percentage column: part-of-speech / frequency / share of all tokens.
pd_qua_num['百分比'] = pd_qua_num['頻數'] / pd_qua_num['頻數'].sum()
print(pd_qua_num.head(10))
統計幾種重要詞性的詞匯分布:
# Per-part-of-speech ranking table for six parts of speech. Columns come in
# pairs: the words of that part of speech (most frequent first) followed by
# their counts.
columns_selected = ['動詞', '動詞計數', '名詞', '名詞計數', '代詞', '代詞計數',
                    '時間詞', '時間詞計數', '副詞', '副詞計數', '形容詞', '形容詞計數']

top6_parts = {}
for word_col, count_col in zip(columns_selected[::2], columns_selected[1::2]):
    # The word column's name is also the part-of-speech label to filter on.
    ranked = (
        pd_root.loc[pd_root['詞性'] == word_col, '詞匯']
        .value_counts()
        .reset_index()
    )
    # Select by position: the names of these two columns differ between
    # pandas versions, their order (word, count) does not.
    top6_parts[word_col] = ranked.iloc[:, 0]
    top6_parts[count_col] = ranked.iloc[:, 1]

# Building the frame in one go pads shorter rankings with NaN instead of
# cutting every column down to the length of the first one.
pd_Top6 = pd.DataFrame(top6_parts, columns=columns_selected)
print(pd_Top6.head(10))
提取文本中關鍵詞:
# Extract weighted key words from the corpus.
# NOTE(review): the original passed the builtin ``str`` type as the text; no
# text variable is defined in this script and the corpus file has already been
# consumed and closed, so the corpus is re-read here. Confirm this is the text
# the key words should be drawn from.
with open(r"C:\Users\lenovo\Desktop\肖老師爬蟲項目\data_3.txt", "r") as corpus_file:
    corpus_text = corpus_file.read()

key_words = pynlpir.get_key_words(corpus_text, weighted=True)
print(key_words)
繪圖:
def paint(df, x, y, title):
    """Show a horizontal bar chart of the first ten ``頻數`` values of *df*.

    Args:
        df: DataFrame with a ``頻數`` column; its index supplies the bar labels.
        x: Label for the horizontal axis.
        y: Label for the vertical axis.
        title: Chart title.
    """
    _, ax = plt.subplots(figsize=(7, 5))
    df.iloc[:10]['頻數'].plot(kind='barh', ax=ax)

    # Labels and fonts are applied after plotting so that nothing pandas sets
    # while drawing (tick labels, axis label from the index name) replaces them.
    plt.yticks(fontproperties=font, size=10)
    ax.set_xlabel(x, fontproperties=font, size=10)
    ax.set_ylabel(y, fontproperties=font, size=10)
    ax.set_title(title, fontproperties=font)
    plt.show()


paint(pd_word_num, "頻數", "詞匯", "詞匯分布")
paint(pd_qua_num, "頻數", "詞性", "詞性分布")
# One figure with a 2x3 grid: a bar chart of the ten most frequent words for
# each of the six parts of speech held in ``pd_Top6``.
fig = plt.figure(figsize=(10, 5))
fig.subplots_adjust(hspace=0.3, wspace=0.2)

for i in range(1, 7):
    # Columns (2i-2, 2i-1) are the word / count pair of the i-th part of speech.
    pd_qua = pd_Top6.iloc[:, [2 * i - 2, 2 * i - 1]].copy()
    pd_qua.columns = [pd_qua.columns[0], '頻數']
    pd_qua = pd_qua.set_index(pd_qua.columns[0])
    print(pd_qua)

    ax = fig.add_subplot(2, 3, i)
    # Rows that exist only as NaN padding carry no word and must not be drawn.
    top10 = pd_qua.dropna().head(10)
    if not top10.empty:
        top10['頻數'].plot(kind='bar', ax=ax)
        ax.set_xticklabels(top10.index, fontproperties=font, size=10, rotation=30)
    ax.set_title(pd_qua.index.name, fontproperties=font)

fig.tight_layout()
# ``plt.show()`` blocks until the window is closed; ``fig.show()`` does not
# run an event loop, so in a plain script the figure may vanish immediately.
plt.show()
pynlpir + pandas 文本分析