
pynlpir + pandas text analysis


Import the packages:

import pynlpir
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import multiprocessing,threading,time

Read the source text and the stop-word file, and create the DataFrame that will hold the initial segmentation results:

f_1 = open(r"C:\Users\lenovo\Desktop\肖老師爬蟲項目\停用詞.txt", "r")
stopwords = f_1.read().splitlines()
f_1.close()
f = open(r"C:\Users\lenovo\Desktop\肖老師爬蟲項目\data_3.txt", "r")
pd_root = pd.DataFrame(columns=['詞匯', '詞性'])

Some parameters:

time_start = time.time()  # record the start time so the total run time can be reported
pynlpir.open()
font = FontProperties(fname=r'c:\windows\fonts\simhei.ttf', size=13)  # SimHei font for Chinese labels in the plots
Stop-word filtering function:
def stopword_delete(df):
    global stopwords
    # drop every row whose word appears in the stop-word list
    for i in range(df.shape[0]):
        if df.詞匯[i] in stopwords:
            df.drop(i, inplace=True)
    return df
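
The row-by-row drop above works, but for larger DataFrames a vectorized filter with Series.isin is usually faster. A minimal sketch, reusing the stopwords list and column names defined earlier (stopword_delete_fast is just an illustrative name, not part of the original script):

def stopword_delete_fast(df):
    # keep only the rows whose word is not in the stop-word list
    return df[~df['詞匯'].isin(stopwords)].reset_index(drop=True)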

Because the file contains a lot of text, reading, segmenting and filtering it in one pass is slow, so the file is read line by line and each line is processed in its own thread.

Per-line processing function:

def line_deal(line):
    global pd_root
    line = line.replace(" ", "")
    segment = pynlpir.segment(line, pos_names='parent', pos_english=False)  # segment a single line
    pd_line = pd.DataFrame(segment, columns=['詞匯', '詞性'])  # per-line DataFrame
    pd_line = stopword_delete(pd_line)  # filter out stop words
    pd_root = pd_root.append(pd_line, ignore_index=True)  # append to the global DataFrame
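
Because several threads reassign the global pd_root at the same time, the append can race and occasionally lose lines. One way to make it safe, sketched below under the assumption that the rest of the script stays unchanged, is to guard the update with a threading.Lock (pd_lock and line_deal_safe are names introduced here, not in the original; pd.concat also replaces DataFrame.append, which was removed in pandas 2.0):

pd_lock = threading.Lock()  # assumed helper, not defined in the original script

def line_deal_safe(line):
    global pd_root
    line = line.replace(" ", "")
    segment = pynlpir.segment(line, pos_names='parent', pos_english=False)
    pd_line = stopword_delete(pd.DataFrame(segment, columns=['詞匯', '詞性']))
    with pd_lock:  # only one thread may update the shared DataFrame at a time
        pd_root = pd.concat([pd_root, pd_line], ignore_index=True)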

Read the file with multiple threads:

threads_list = []   # list of live worker threads
thread_max = 30     # maximum number of concurrent threads
n = 0
for line in f:
    p = threading.Thread(target=line_deal, args=(line,))
    threads_list.append(p)
    p.start()
    n = n + 1
    print(len(threads_list), n)  # current thread count and number of lines read so far
    # keep only the threads that are still running; rebuilding the list avoids
    # skipping elements, which happens when removing items while iterating
    threads_list = [t for t in threads_list if t.is_alive()]
    if len(threads_list) >= thread_max:
        time.sleep(0.1)  # throttle before spawning more threads
for t in threads_list:
    t.join()  # wait for the remaining threads so pd_root is complete
f.close()  # close the file when reading is finished
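
The manual thread bookkeeping above can also be replaced by a thread pool from the standard library. A minimal sketch, assuming the lock-protected line_deal_safe from the earlier sketch (concurrent.futures is not imported in the original script):

from concurrent.futures import ThreadPoolExecutor

with open(r"C:\Users\lenovo\Desktop\肖老師爬蟲項目\data_3.txt", "r") as f:
    with ThreadPoolExecutor(max_workers=30) as pool:  # same cap as thread_max above
        pool.map(line_deal_safe, f)  # one task per line; the pool handles throttling and joining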

Print the initial segmented data:

print(pd_root.head(10))


Build the word-frequency table:

pd_word_num = pd.DataFrame(pd_root['詞匯'].value_counts())
# rename the column: 詞匯 -> 頻數 (count)
pd_word_num.rename(columns={'詞匯': '頻數'}, inplace=True)
# add a percentage column
pd_word_num['百分比'] = pd_word_num['頻數'] / pd_word_num['頻數'].sum()
print(pd_word_num.head(10))


Build the part-of-speech frequency table:

pd_qua_num = pd.DataFrame(pd_root['詞性'].value_counts())
# rename the column: 詞性 -> 頻數 (count)
pd_qua_num.rename(columns={'詞性': '頻數'}, inplace=True)
# add a percentage column: part of speech - count - percentage
pd_qua_num['百分比'] = pd_qua_num['頻數'] / pd_qua_num['頻數'].sum()
print(pd_qua_num.head(10))


Tally the word distribution for several important parts of speech:

# DataFrame holding the top words for six selected parts of speech
columns_selected = ['動詞', '動詞計數', '名詞', '名詞計數', '代詞', '代詞計數',
                    '時間詞', '時間詞計數', '副詞', '副詞計數', '形容詞', '形容詞計數']
pd_Top6 = pd.DataFrame(columns=columns_selected)
for i in range(0, 12, 2):
    # words of this part of speech, ordered by frequency
    pd_Top6[columns_selected[i]] = pd_root.loc[pd_root['詞性'] == columns_selected[i]]['詞匯'].value_counts().reset_index()['index']
    # the matching frequency counts
    pd_Top6[columns_selected[i+1]] = pd_root.loc[pd_root['詞性'] == columns_selected[i]]['詞匯'].value_counts().reset_index()['詞匯']
print(pd_Top6.head(10))
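
One caveat with the loop above: the column names produced by value_counts().reset_index() differ across pandas versions ('index'/'詞匯' in older releases, '詞匯'/'count' in pandas 2.x). A version-independent sketch that builds the same pd_Top6 from the counts' index and values directly, reusing pd_root and columns_selected from above:

pd_Top6 = pd.DataFrame(columns=columns_selected)
for i in range(0, 12, 2):
    counts = pd_root.loc[pd_root['詞性'] == columns_selected[i], '詞匯'].value_counts()
    pd_Top6[columns_selected[i]] = pd.Series(counts.index)       # the words of this part of speech
    pd_Top6[columns_selected[i + 1]] = pd.Series(counts.values)  # their frequencies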


Extract the key words from the text:

text = open(r"C:\Users\lenovo\Desktop\肖老師爬蟲項目\data_3.txt", "r").read()  # get_key_words needs the raw text string, not the built-in str
key_words = pynlpir.get_key_words(text, weighted=True)
print(key_words)
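
With weighted=True, pynlpir.get_key_words returns (key word, weight) pairs; if a tabular view like the frequency tables above is preferred, a small sketch (the column names here are only illustrative):

pd_key = pd.DataFrame(key_words, columns=['關鍵詞', '權重'])
print(pd_key.head(10))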

Plotting:

def paint(df, x, y, title):
    plt.subplots(figsize=(7, 5))
    plt.yticks(fontproperties=font, size=10)
    plt.xlabel(x, fontproperties=font, size=10)
    plt.ylabel(y, fontproperties=font, size=10)
    plt.title(title, fontproperties=font)
    df.iloc[:10]['頻數'].plot(kind='barh')  # horizontal bars for the ten most frequent entries
    plt.show()

paint(pd_word_num,"頻數","詞匯","詞匯分布")
paint(pd_qua_num,"頻數","詞性","詞性分布")


fig = plt.figure(figsize=(10, 5))
fig.subplots_adjust(hspace=0.3, wspace=0.2)
for i in range(1, 7):
    pd_qua = pd_Top6.iloc[:, [(2*i - 2), 2*i - 1]]  # word column + count column for one part of speech
    pd_qua.columns = [pd_qua.columns[0], '頻數']
    pd_qua = pd_qua.set_index(pd_qua.columns[0])
    print(pd_qua)
    ax = fig.add_subplot(2, 3, i)
    pd_qua.head(10)['頻數'].plot(kind='bar')
    ax.set_xticklabels(pd_qua.head(10).index, fontproperties=font, size=10, rotation=30)
    ax.set_title(pd_qua.index.name, fontproperties=font)
fig.tight_layout()
plt.show()
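
The script calls pynlpir.open() and records time_start at the top but never releases NLPIR or reports the elapsed time. A possible closing block, reusing the variables defined earlier:

pynlpir.close()  # release the resources acquired by pynlpir.open()
print('total time: %.2f s' % (time.time() - time_start))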

  

