
English Text Preprocessing in Python

1. Getting the raw text content

    def FileRead(self, filePath):  # read the raw file contents
        f = open(filePath)
        raw = f.read()
        f.close()
        return raw
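
A minimal usage sketch (the file path is hypothetical; EnPreprocess is the class assembled in section 9):

    reader = EnPreprocess()
    raw = reader.FileRead('corpus/sample.txt')  # hypothetical path
    print raw[:100]  # peek at the first 100 characters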

2. Splitting into sentences

    def SenToken(self, raw):  # split raw text into sentences
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_tokenizer.tokenize(raw)
        return sents
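
The punkt model is an unsupervised sentence splitter trained with knowledge of common English abbreviations, so it should not break on "Dr.". A quick sketch (the sample text is made up):

    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    print sent_tokenizer.tokenize("Dr. Smith went to Washington. He arrived on Monday.")
    # expected: ['Dr. Smith went to Washington.', 'He arrived on Monday.']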


3. Cleaning sentence content: removing digits, punctuation, and non-letter characters

    def CleanLines(self, line):
        identify = string.maketrans('', '')
        delEStr = string.punctuation + string.digits  # ASCII punctuation and digits
        cleanLine = line.translate(identify, delEStr)  # strip ASCII punctuation and digits
        return cleanLine
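
This relies on the Python 2 form of str.translate, where the second argument lists characters to delete. For example (sample string made up):

    import string
    identify = string.maketrans('', '')
    print 'Hello, world! 123'.translate(identify, string.punctuation + string.digits)
    # expected: 'Hello world ' (punctuation and digits removed, spaces kept)

Note that Python 3 dropped the two-argument form; there the same effect comes from line.translate(str.maketrans('', '', delEStr)).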

4. POS tagging with nltk.pos_tag

    def POSTagger(self, sents):  # POS-tag each tokenized sentence
        taggedLine = [nltk.pos_tag(sent) for sent in sents]
        return taggedLine
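
nltk.pos_tag expects one tokenized sentence (a list of tokens) and returns (word, tag) pairs, which is why the method takes the whole list of tokenized sentences and tags each one. A small sketch (exact tags depend on the tagger model shipped with your NLTK version):

    tokens = nltk.word_tokenize("The quick brown fox jumps")
    print nltk.pos_tag(tokens)
    # roughly: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN'), ('jumps', 'VBZ')]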

5. Tokenization with nltk.word_tokenize

    def WordTokener(self, sent):  # split a single sentence string into words
        wordsInStr = nltk.word_tokenize(sent)
        return wordsInStr
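
nltk.word_tokenize follows Treebank conventions, so clitics like 's are split off as their own tokens:

    print nltk.word_tokenize("It's a test sentence.")
    # expected: ['It', "'s", 'a', 'test', 'sentence', '.']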

6. Spell checking with enchant

    def WordCheck(self, words):  # spell check; prompt for a manual correction on misses
        d = enchant.Dict("en_US")
        checkedWords = []
        for word in words:
            if not d.check(word):
                print d.suggest(word)  # show suggestions, then read a correction
                word = raw_input()
            checkedWords.append(word)
        return checkedWords
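
A quick non-interactive sketch of the enchant calls involved (suggestion lists vary with the installed dictionary backend):

    import enchant
    d = enchant.Dict("en_US")
    print d.check("Hello")   # True
    print d.check("Helo")    # False
    print d.suggest("Helo")  # something like ['He lo', 'Hell', 'Hello', ...]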

7. Removing stopwords, lowercasing, and dropping short words

    def CleanWords(self, wordsInStr):  # lowercase; drop stopwords, non-alpha words, and words shorter than 3 characters
        cleanWords = []
        stopwords = {}.fromkeys([line.rstrip() for line in open(conf.PreConfig.ENSTOPWORDS)])
        for words in wordsInStr:
            cleanWords += [[w.lower() for w in words if w.lower() not in stopwords and 3 <= len(w)]]
        return cleanWords
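
The filter in isolation, with a tiny stand-in stopword list (the real list is loaded from conf.PreConfig.ENSTOPWORDS):

    stopwords = {}.fromkeys(['the', 'is', 'of'])  # stand-in for the file-based list
    words = ['The', 'Tower', 'is', 'of', 'great', 'age']
    print [w.lower() for w in words if w.lower() not in stopwords and 3 <= len(w)]
    # expected: ['tower', 'great', 'age']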

8. Stemming with WordNet

    def StemWords(self, cleanWordsList):
        stemWords = []
        # porter = nltk.PorterStemmer()  # a PhD told me this stemmer does not work well; not very professional
        # result = [porter.stem(t) for t in cleanTokens]
        for words in cleanWordsList:
            stemWords += [[wn.morphy(w) for w in words]]
        return stemWords
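
wn.morphy maps a word to its WordNet base form, but it returns None for anything WordNet does not know, so the output lists can contain None entries (likely why the second CleanWords pass mentioned in section 9 runs into trouble):

    from nltk.corpus import wordnet as wn
    print wn.morphy('dogs')      # 'dog'
    print wn.morphy('churches')  # 'church'
    print wn.morphy('xyzzy')     # None: not in WordNet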

9. Complete code

#coding=utf-8
'''
Created on 2014-3-20
English stemming and stopword removal
@author: liTC
'''
import nltk
# import enchant
import string
import re
import os
from config import Config as conf
from nltk.corpus import wordnet as wn
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

class EnPreprocess:
    '''Overall pipeline:
    Read the file: FileRead(), filepath to raw
    Split into sentences: SenToken(), raw to sents
    (POS tagging): POSTagger(), sents to tagged words[]
    Split each sentence into words: WordTokener(), sent to word[]
    (Spell check): WordCheck(), drop misspelled words or wait for manual correction
    Remove punctuation and non-letter content: CleanLines(), line to cleanLine
    Drop words shorter than 3 characters, lowercase, remove stopwords: CleanWords(), words[] to cleanWords[]
    Stemming: StemWords(), words to stemWords
    Second cleanup: run CleanWords() again to make the sentences even cleaner
    '''
    def __init__(self):
        print 'English token and stopwords remove...'
    def FileRead(self, filePath):  # read the file contents
        f = open(filePath)
        raw = f.read()
        f.close()
        return raw
    def WriteResult(self, result, resultPath):
        self.mkdir(str(resultPath).replace(str(resultPath).split('/')[-1], ''))
        f = open(resultPath, "w")  # save the result to another file
        f.write(str(result))
        f.close()
    def SenToken(self, raw):  # split into sentences
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_tokenizer.tokenize(raw)
        return sents
    def POSTagger(self, sents):  # POS-tag each tokenized sentence
        taggedLine = [nltk.pos_tag(sent) for sent in sents]
        return taggedLine
    def WordTokener(self, sent):  # split a single sentence string into words
        wordsInStr = nltk.word_tokenize(sent)
        return wordsInStr
    def WordCheck(self, words):  # spell check
        d = enchant.Dict("en_US")
        checkedWords = []
        for word in words:
            if not d.check(word):
                print d.suggest(word)  # show suggestions, then read a correction
                word = raw_input()
            checkedWords.append(word)
        return checkedWords
    def CleanLines(self, line):
        identify = string.maketrans('', '')
        delEStr = string.punctuation + string.digits  # ASCII punctuation and digits
        cleanLine = line.translate(identify, delEStr)  # strip ASCII punctuation and digits
        return cleanLine
    def CleanWords(self, wordsInStr):  # lowercase; drop stopwords, non-alpha words, and words shorter than 3 characters
        cleanWords = []
        stopwords = {}.fromkeys([line.rstrip() for line in open(conf.PreConfig.ENSTOPWORDS)])
        for words in wordsInStr:
            cleanWords += [[w.lower() for w in words if w.lower() not in stopwords and 3 <= len(w)]]
        return cleanWords
    def StemWords(self, cleanWordsList):
        stemWords = []
        # porter = nltk.PorterStemmer()  # a PhD told me this stemmer does not work well; not very professional
        # result = [porter.stem(t) for t in cleanTokens]
        for words in cleanWordsList:
            stemWords += [[wn.morphy(w) for w in words]]
        return stemWords
    def WordsToStr(self, stemWords):
        strLine = []
        for words in stemWords:
            strLine += [w for w in words]
        return strLine
    def mkdir(self, path):
        path = path.strip()  # strip leading/trailing whitespace
        path = path.rstrip("\\")  # strip a trailing backslash
        isExists = os.path.exists(path)  # check whether the path already exists
        if not isExists:
            print path + ' created successfully'
            os.makedirs(path)  # create the directory if it does not exist
            return True
        else:
            print path + ' already exists'  # do not create an existing directory
            return False
    def EnPreMain(self, dir):
        for root, dirs, files in os.walk(dir):
            for eachfiles in files:
                croupPath = os.path.join(root, eachfiles)
                print croupPath
                resultPath = conf.PreConfig.NLTKRESULTPATH + croupPath.split('/')[-2] + '/' + croupPath.split('/')[-1]
                raw = self.FileRead(croupPath).strip()
                sents = self.SenToken(raw)
                # taggedLine = self.POSTagger(sents)  # POS tagging disabled for now
                cleanLines = [self.CleanLines(line) for line in sents]
                words = [self.WordTokener(cl) for cl in cleanLines]
                # checkedWords = self.WordCheck(words)  # spell check disabled for now
                cleanWords = self.CleanWords(words)
                stemWords = self.StemWords(cleanWords)
                # cleanWords = self.CleanWords(stemWords)  # the second cleaning pass has problems, disabled for now
                strLine = self.WordsToStr(stemWords)
                self.WriteResult(strLine, resultPath)  # store each file as a single line for now
    def StandardTokener(self, raw):
        result = ''
        # not working yet
        return result


enPre = EnPreprocess()
enPre.EnPreMain(conf.PreConfig.ENCORUPPATH)

PS: I still haven't managed to get the Stanford toolkit working well; if anyone has used it, please teach me.