English Text Preprocessing in Python
阿新 • Published: 2019-01-31
1. Get the raw text content
def FileRead(self, filePath):
    f = open(filePath)
    raw = f.read()
    return raw
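As a quick sanity check, a minimal usage sketch (the path corpus/sample.txt is a placeholder, and the method is assumed to sit on the EnPreprocess class shown in section 9):
enPre = EnPreprocess()
raw = enPre.FileRead('corpus/sample.txt')  # placeholder path
print(raw[:200])  # peek at the first 200 characters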
2. Split into sentences
def SenToken(self, raw):  # split into sentences
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(raw)
    return sents
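A quick look at what the Punkt tokenizer returns (the sample text is made up; the exact splits depend on the pickled model):
import nltk
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
print(sent_tokenizer.tokenize("Dr. Smith went home. He was very tired."))
# expected roughly: ['Dr. Smith went home.', 'He was very tired.']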
3. Clean the sentence content: remove digits, punctuation, and non-letter characters
def CleanLines(self, line):
    identify = string.maketrans('', '')
    delEStr = string.punctuation + string.digits  # ASCII punctuation and digits
    # cleanLine = line.translate(identify, delEStr)  # remove ASCII punctuation and whitespace
    cleanLine = line.translate(identify, delEStr)  # remove ASCII punctuation
    return cleanLine
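Note that the two-argument form of translate() only works for Python 2 byte strings. If you are on Python 3, a rough equivalent of the same cleanup (my own sketch, not part of the original post) would be:
import string

def clean_line_py3(line):
    # map every ASCII punctuation mark and digit to None, i.e. delete it
    table = str.maketrans('', '', string.punctuation + string.digits)
    return line.translate(table)

print(clean_line_py3("Hello, world! It is 2014."))  # expected: 'Hello world It is '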
4. POS tagging with nltk.pos_tag
def POSTagger(self, sents):
    taggedLine = [nltk.pos_tag(sent) for sent in sents]
    return taggedLine
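nltk.pos_tag takes a list of tokens for one sentence and returns (word, tag) pairs; a small example (the tags shown are what the default tagger usually produces):
import nltk
print(nltk.pos_tag(['The', 'cat', 'sat', 'on', 'the', 'mat']))
# expected roughly: [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD'),
#                    ('on', 'IN'), ('the', 'DT'), ('mat', 'NN')]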
5. Tokenization with nltk.word_tokenize
def WordTokener(self, sent):  # split a single sentence string into words
    wordsInStr = nltk.word_tokenize(sent)
    return wordsInStr
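nltk.word_tokenize splits punctuation and contractions into separate tokens, e.g.:
import nltk
print(nltk.word_tokenize("This is a test, isn't it?"))
# expected: ['This', 'is', 'a', 'test', ',', 'is', "n't", 'it', '?']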
6. Spell checking with enchant
def WordCheck(self, words):  # spell checking
    d = enchant.Dict("en_US")
    checkedWords = []
    for word in words:
        if not d.check(word):
            print d.suggest(word)  # show suggestions for the misspelled word
            word = raw_input()     # let a human type the correction
        checkedWords.append(word)
    return checkedWords
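The enchant calls themselves look like this (the misspelled word is made up; the suggestion list depends on the installed dictionary):
import enchant
d = enchant.Dict("en_US")
print(d.check("hello"))   # True
print(d.check("helo"))    # False
print(d.suggest("helo"))  # e.g. ['hole', 'help', 'hello', ...]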
7. Remove stopwords, lowercase, and drop short words
def CleanWords(self, wordsInStr):  # drop punctuation, words shorter than 3 characters and non-alpha words; lowercase
    cleanWords = []
    stopwords = {}.fromkeys([line.rstrip() for line in open(conf.PreConfig.ENSTOPWORDS)])
    for words in wordsInStr:
        cleanWords += [[w.lower() for w in words if w.lower() not in stopwords and 3 <= len(w)]]
    return cleanWords
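To see what the filter keeps, here is the inner comprehension with a hard-coded stopword set instead of the file from conf.PreConfig.ENSTOPWORDS (the sample words are made up):
stopwords = {}.fromkeys(['the', 'is', 'on', 'it'])
words = ['The', 'quick', 'fox', 'is', 'on', 'it', 'by']
print([w.lower() for w in words if w.lower() not in stopwords and 3 <= len(w)])
# expected: ['quick', 'fox']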
8. Stemming with WordNet
def StemWords(self, cleanWordsList):
    stemWords = []
    # porter = nltk.PorterStemmer()  # a PhD told me this stemmer does not work well and is not very professional
    # result = [porter.stem(t) for t in cleanTokens]
    for words in cleanWordsList:
        stemWords += [[wn.morphy(w) for w in words]]
    return stemWords
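wn.morphy maps an inflected form to its base form, but it returns None for words WordNet does not know, so the lists built by StemWords can contain None entries:
from nltk.corpus import wordnet as wn
print(wn.morphy('dogs'))    # expected: 'dog'
print(wn.morphy('asdfgh'))  # expected: None -- callers may want to filter these out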
9. Complete code
#coding=utf-8
'''
Created on 2014-3-20
English stemming and stopword removal
@author: liTC
'''
import nltk
# import enchant
import string
import re
import os
from config import Config as conf
from nltk.corpus import wordnet as wn
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

class EnPreprocess:
    '''Overall pipeline:
    Read the file:              FileRead()   filepath to raw
    Split into sentences:       SenToken()   raw to sents
    (POS tagging):              POSTagger()  sents to tagged words[]
    Split sentences into words: WordTokener()  sent to words[]
    (Spell checking):           WordCheck()  drop wrong words or wait for manual correction
    Remove punctuation and non-letter content: CleanLines()  line to cleanLine
    Drop words shorter than 3 characters, lowercase, remove stopwords: CleanWords()  words[] to cleanWords[]
    Stemming:                   StemWords()  stem the words and return, words to stemWords
    Second cleanup:             run CleanWords() once more to make the sentences cleaner
    '''
    def __init__(self):
        print 'English token and stopwords remove...'
    def FileRead(self, filePath):  # read the raw content
        f = open(filePath)
        raw = f.read()
        return raw
    def WriteResult(self, result, resultPath):
        self.mkdir(str(resultPath).replace(str(resultPath).split('/')[-1], ''))
        f = open(resultPath, "w")  # save the result to another file
        f.write(str(result))
        f.close()
    def SenToken(self, raw):  # split into sentences
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_tokenizer.tokenize(raw)
        return sents
    def POSTagger(self, sents):
        taggedLine = [nltk.pos_tag(sent) for sent in sents]
        return taggedLine
    def WordTokener(self, sent):  # split a single sentence string into words
        wordsInStr = nltk.word_tokenize(sent)
        return wordsInStr
    def WordCheck(self, words):  # spell checking
        d = enchant.Dict("en_US")
        checkedWords = []
        for word in words:
            if not d.check(word):
                print d.suggest(word)  # show suggestions for the misspelled word
                word = raw_input()     # let a human type the correction
            checkedWords.append(word)
        return checkedWords
    def CleanLines(self, line):
        identify = string.maketrans('', '')
        delEStr = string.punctuation + string.digits  # ASCII punctuation and digits
        # cleanLine = line.translate(identify, delEStr)  # remove ASCII punctuation and whitespace
        cleanLine = line.translate(identify, delEStr)  # remove ASCII punctuation
        return cleanLine
    def CleanWords(self, wordsInStr):  # drop punctuation, words shorter than 3 characters and non-alpha words; lowercase
        cleanWords = []
        stopwords = {}.fromkeys([line.rstrip() for line in open(conf.PreConfig.ENSTOPWORDS)])
        for words in wordsInStr:
            cleanWords += [[w.lower() for w in words if w.lower() not in stopwords and 3 <= len(w)]]
        return cleanWords
    def StemWords(self, cleanWordsList):
        stemWords = []
        # porter = nltk.PorterStemmer()  # a PhD told me this stemmer does not work well and is not very professional
        # result = [porter.stem(t) for t in cleanTokens]
        for words in cleanWordsList:
            stemWords += [[wn.morphy(w) for w in words]]
        return stemWords
    def WordsToStr(self, stemWords):
        strLine = []
        for words in stemWords:
            strLine += [w for w in words]
        return strLine
    def mkdir(self, path):
        # strip leading and trailing whitespace
        path = path.strip()
        # strip a trailing \ character
        path = path.rstrip("\\")
        # check whether the path already exists
        isExists = os.path.exists(path)
        if not isExists:
            # create the directory if it does not exist yet
            print path + ' created'
            os.makedirs(path)
            return True
        else:
            # the directory already exists, so do not create it again
            print path + ' already exists'
            return False
    def EnPreMain(self, dir):
        for root, dirs, files in os.walk(dir):
            for eachfiles in files:
                croupPath = os.path.join(root, eachfiles)
                print croupPath
                resultPath = conf.PreConfig.NLTKRESULTPATH + croupPath.split('/')[-2] + '/' + croupPath.split('/')[-1]
                raw = self.FileRead(croupPath).strip()
                sents = self.SenToken(raw)
                # taggedLine = self.POSTagger(sents)  # POS tagging disabled for now
                cleanLines = [self.CleanLines(line) for line in sents]
                words = [self.WordTokener(cl) for cl in cleanLines]
                # checkedWords = self.WordCheck(words)  # spell checking disabled for now
                cleanWords = self.CleanWords(words)
                stemWords = self.StemWords(cleanWords)
                # cleanWords = self.CleanWords(stemWords)  # the second cleanup has issues, disabled for now
                strLine = self.WordsToStr(stemWords)
                self.WriteResult(strLine, resultPath)  # each file is saved as a single line for now
    def StandardTokener(self, raw):
        result = ''
        # not finished yet
        return result

enPre = EnPreprocess()
enPre.EnPreMain(conf.PreConfig.ENCORUPPATH)
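The script assumes a config module with a nested PreConfig class; a minimal sketch of what it might look like (the paths are placeholders, not from the original project):
# config.py -- minimal sketch, paths are placeholders
class Config:
    class PreConfig:
        ENSTOPWORDS = 'stopwords/english.txt'  # one stopword per line
        ENCORUPPATH = 'corpus/en/'             # input corpus directory
        NLTKRESULTPATH = 'result/nltk/'        # directory where results are written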
PS: I still haven't managed to get the Stanford toolkit working; if anyone has used it, please show me how.