Notes on Chinese text processing with jieba segmentation and HanLP (natural language processing)
阿新 • Published: 2018-12-31
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import jieba
import os
import re
import time
from jpype import *

'''
title: batch preprocessing of a text corpus with jieba segmentation
1 Walk the corpus directory tree and enumerate the text files
2 Recreate the original directory structure for the output
3 Apply jieba segmentation and stopword removal to the raw text
4 Normalize the preprocessed result and save it under the original file layout
author: 白寧超
myblog: http://www.cnblogs.com/baiboy/
time: 2017-04-28 10:03:09
'''

'''
Create a directory
path: subdirectory to create under the root
'''
def mkdir(path):
    # Check whether the path already exists
    isExists = os.path.exists(path)
    if not isExists:
        os.makedirs(path)
        print(path + ' created successfully')
        return True
    else:
        pass
    print('--> Please wait, the corpus is being preprocessed...')

'''
Chinese word segmentation with the jieba tokenizer:
read_folder_path: root path of the raw corpus to process
write_folder_path: output path for the segmented, cleaned corpus
'''
def CHSegment(read_folder_path, write_folder_path):
    # Stopword list
    stopwords = {}.fromkeys([line.strip() for line in open('../Database/stopwords/CH_stopWords.txt', 'r', encoding='utf-8')])
    # All category folders under the root to be processed
    folder_list = os.listdir(read_folder_path)
    # Loop over categories
    for folder in folder_list:
        # Path of this category
        new_folder_path = os.path.join(read_folder_path, folder)
        # Create a matching output directory
        mkdir(write_folder_path + folder)
        # Output path of this category
        save_folder_path = os.path.join(write_folder_path, folder)
        # All files of this category; loop within the category
        files = os.listdir(new_folder_path)
        j = 1
        for file in files:
            if j > len(files):
                break
            # Read the raw corpus file
            raw = open(os.path.join(new_folder_path, file), 'r', encoding='utf-8').read()
            # Keep Chinese characters only
            # raw1 = re.sub("[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\!\@\#\\\&\*\%]", "", raw)
            # jieba segmentation (accurate mode)
            wordslist = jieba.cut(raw, cut_all=False)
            # Stopword removal
            cutwordlist = ''
            for word in wordslist:
                if word not in stopwords:
                    if word == "\n":
                        cutwordlist += "\n"          # keep the original line breaks
                    elif len(word) > 1:
                        cutwordlist += word + "/"    # single-character tokens (including spaces) are dropped
            # Save the cleaned data
            with open(os.path.join(save_folder_path, file), 'w', encoding='utf-8') as f:
                f.write(cutwordlist)
            j += 1

'''
Chinese word segmentation with HanLP:
read_folder_path: root path of the raw corpus to process
write_folder_path: output path for the segmented, cleaned corpus
'''
def HanLPSeg(read_folder_path, write_folder_path):
    # Start the JVM; on Linux replace the ';' classpath separator with ':'
    startJVM(getDefaultJVMPath(), r"-Djava.class.path=C:\hanlp\hanlp-1.3.2.jar;C:\hanlp", "-Xms1g", "-Xmx1g")
    # Stopword list
    stopwords = {}.fromkeys([line.strip() for line in open('../Database/stopwords/CH_stopWords.txt', 'r', encoding='utf-8')])
    # All category folders under the root to be processed
    folder_list = os.listdir(read_folder_path)
    # Loop over categories
    for folder in folder_list:
        # Path of this category
        new_folder_path = os.path.join(read_folder_path, folder)
        # Create a matching output directory
        mkdir(write_folder_path + folder)
        # Output path of this category
        save_folder_path = os.path.join(write_folder_path, folder)
        # All files of this category; loop within the category
        files = os.listdir(new_folder_path)
        j = 1
        for file in files:
            if j > len(files):
                break
            # Read the raw corpus file
            raw = open(os.path.join(new_folder_path, file), 'r', encoding='utf-8').read()
            # HanLP segmentation
            HanLP = JClass('com.hankcs.hanlp.HanLP')
            wordslist = HanLP.segment(raw)
            # Post-process: split the stringified term list and strip the part-of-speech tags
            wordslist1 = str(wordslist).split(",")
            flagresult = ""
            for v in wordslist1[1:len(wordslist1) - 1]:
                if "/" in v:
                    slope = v.index("/")
                    letter = v[1:slope]
                    if len(letter) > 0 and '\n\u3000\u3000' in letter:
                        flagresult += "\n"           # keep the original line breaks
                    else:
                        flagresult += letter + "/"
            # Save the cleaned data (dropping stray spaces)
            with open(os.path.join(save_folder_path, file), 'w', encoding='utf-8') as f:
                f.write(flagresult.replace(' /', ''))
            j += 1
    shutdownJVM()

if __name__ == '__main__':
    print('Starting text segmentation:\n')
    t1 = time.time()
    dealpath = "../Database/SogouC/FileTest/"
    savepath = "../Database/SogouCCut/FileTest/"
    # Root directory of the corpus categories to segment
    read_folder_path = '../Database/SogouC/FileNews/'
    write_folder_path = '../Database/SogouCCut/'
    # jieba segmentation: about 3.31 s for 300 txt files
    CHSegment(read_folder_path, write_folder_path)
    # HanLP segmentation: about 1.83 s for the same 300 txt files
    HanLPSeg(read_folder_path, write_folder_path)
    t2 = time.time()
    print('Finished Chinese text segmentation in ' + str(t2 - t1) + ' seconds.')
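In HanLPSeg above, the tokens are recovered by splitting the stringified Java list on commas, which is fragile (it breaks whenever a token itself contains a comma). The snippet below is a minimal standalone sketch, not part of the original script, that instead reads each Term object's word field directly through JPype; it assumes the same hanlp-1.3.2.jar and data directory under C:\hanlp, and the sample sentence is purely illustrative.

from jpype import startJVM, getDefaultJVMPath, JClass, shutdownJVM

# Start the JVM exactly as in the script above; on Linux replace ';' with ':' in the classpath
startJVM(getDefaultJVMPath(), r"-Djava.class.path=C:\hanlp\hanlp-1.3.2.jar;C:\hanlp", "-Xms1g", "-Xmx1g")

HanLP = JClass('com.hankcs.hanlp.HanLP')
terms = HanLP.segment('商品和服务')                # returns a Java List<Term>
words = [str(term.word) for term in terms]         # Term.word is the token text, Term.nature its part-of-speech tag
print('/'.join(words))                             # prints the tokens joined with '/'
shutdownJVM()

The point of this variant is that the Java Term objects already carry the word and its tag as separate fields, so no string parsing or index slicing is needed.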