English Text Preprocessing in Python
阿新 • Published: 2019-01-31
1. Get the raw text content
def FileRead(self, filePath):
    f = open(filePath)
    raw = f.read()
    return raw
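As a quick sanity check, a minimal usage sketch (the path corpus/sample.txt is a placeholder, and the method is assumed to sit on the EnPreprocess class shown in section 9):
enPre = EnPreprocess()
raw = enPre.FileRead('corpus/sample.txt')  # placeholder path
print(raw[:200])  # peek at the first 200 characters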
2. Split into sentences
def SenToken(self, raw):  # split into sentences
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(raw)
    return sents
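A quick look at what the Punkt tokenizer returns (the sample text is made up; the exact splits depend on the pickled model):
import nltk
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
print(sent_tokenizer.tokenize("Dr. Smith went home. He was very tired."))
# expected roughly: ['Dr. Smith went home.', 'He was very tired.']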
3. Clean the sentence content: remove digits, punctuation, and non-letter characters
def CleanLines(self, line):
    identify = string.maketrans('', '')
    delEStr = string.punctuation + string.digits  # ASCII punctuation and digits
    # cleanLine = line.translate(identify, delEStr)  # remove ASCII punctuation and whitespace
    cleanLine = line.translate(identify, delEStr)  # remove ASCII punctuation
    return cleanLine
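Note that the two-argument form of translate() only works for Python 2 byte strings. If you are on Python 3, a rough equivalent of the same cleanup (my own sketch, not part of the original post) would be:
import string

def clean_line_py3(line):
    # map every ASCII punctuation mark and digit to None, i.e. delete it
    table = str.maketrans('', '', string.punctuation + string.digits)
    return line.translate(table)

print(clean_line_py3("Hello, world! It is 2014."))  # expected: 'Hello world It is '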
4. POS tagging with nltk.pos_tag
def POSTagger(self, sents):
    taggedLine = [nltk.pos_tag(sent) for sent in sents]
    return taggedLine
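nltk.pos_tag takes a list of tokens for one sentence and returns (word, tag) pairs; a small example (the tags shown are what the default tagger usually produces):
import nltk
print(nltk.pos_tag(['The', 'cat', 'sat', 'on', 'the', 'mat']))
# expected roughly: [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD'),
#                    ('on', 'IN'), ('the', 'DT'), ('mat', 'NN')]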
5. Tokenization with nltk.word_tokenize
def WordTokener(self, sent):  # split a single sentence string into words
    wordsInStr = nltk.word_tokenize(sent)
    return wordsInStr
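nltk.word_tokenize splits punctuation and contractions into separate tokens, e.g.:
import nltk
print(nltk.word_tokenize("This is a test, isn't it?"))
# expected: ['This', 'is', 'a', 'test', ',', 'is', "n't", 'it', '?']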
6. Spell checking with enchant
def WordCheck(self, words):  # spell checking
    d = enchant.Dict("en_US")
    checkedWords = []
    for word in words:
        if not d.check(word):
            print d.suggest(word)  # show suggestions for the misspelled word
            word = raw_input()     # let a human type the correction
        checkedWords.append(word)
    return checkedWords
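The enchant calls themselves look like this (the misspelled word is made up; the suggestion list depends on the installed dictionary):
import enchant
d = enchant.Dict("en_US")
print(d.check("hello"))   # True
print(d.check("helo"))    # False
print(d.suggest("helo"))  # e.g. ['hole', 'help', 'hello', ...]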
7. Remove stopwords, lowercase, and drop short words
def CleanWords(self, wordsInStr):  # drop punctuation, words shorter than 3 characters and non-alpha words; lowercase
    cleanWords = []
    stopwords = {}.fromkeys([line.rstrip() for line in open(conf.PreConfig.ENSTOPWORDS)])
    for words in wordsInStr:
        cleanWords += [[w.lower() for w in words if w.lower() not in stopwords and 3 <= len(w)]]
    return cleanWords
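To see what the filter keeps, here is the inner comprehension with a hard-coded stopword set instead of the file from conf.PreConfig.ENSTOPWORDS (the sample words are made up):
stopwords = {}.fromkeys(['the', 'is', 'on', 'it'])
words = ['The', 'quick', 'fox', 'is', 'on', 'it', 'by']
print([w.lower() for w in words if w.lower() not in stopwords and 3 <= len(w)])
# expected: ['quick', 'fox']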
8. Stemming with WordNet
def StemWords(self, cleanWordsList):
    stemWords = []
    # porter = nltk.PorterStemmer()  # a PhD told me this stemmer does not work well and is not very professional
    # result = [porter.stem(t) for t in cleanTokens]
    for words in cleanWordsList:
        stemWords += [[wn.morphy(w) for w in words]]
    return stemWords
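wn.morphy maps an inflected form to its base form, but it returns None for words WordNet does not know, so the lists built by StemWords can contain None entries:
from nltk.corpus import wordnet as wn
print(wn.morphy('dogs'))    # expected: 'dog'
print(wn.morphy('asdfgh'))  # expected: None -- callers may want to filter these out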
9. Complete code
#coding=utf-8
'''
Created on 2014-3-20
English stemming and stopword removal
@author: liTC
'''
import nltk
# import enchant
import string
import re
import os
from config import Config as conf
from nltk.corpus import wordnet as wn
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

class EnPreprocess:
    '''Overall pipeline:
    Read the file:              FileRead()   filepath to raw
    Split into sentences:       SenToken()   raw to sents
    (POS tagging):              POSTagger()  sents to tagged words[]
    Split sentences into words: WordTokener()  sent to words[]
    (Spell checking):           WordCheck()  drop wrong words or wait for manual correction
    Remove punctuation and non-letter content: CleanLines()  line to cleanLine
    Drop words shorter than 3 characters, lowercase, remove stopwords: CleanWords()  words[] to cleanWords[]
    Stemming:                   StemWords()  stem the words and return, words to stemWords
    Second cleanup:             run CleanWords() once more to make the sentences cleaner
    '''
    def __init__(self):
        print 'English token and stopwords remove...'
    def FileRead(self, filePath):  # read the raw content
        f = open(filePath)
        raw = f.read()
        return raw
    def WriteResult(self, result, resultPath):
        self.mkdir(str(resultPath).replace(str(resultPath).split('/')[-1], ''))
        f = open(resultPath, "w")  # save the result to another file
        f.write(str(result))
        f.close()
    def SenToken(self, raw):  # split into sentences
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_tokenizer.tokenize(raw)
        return sents
    def POSTagger(self, sents):
        taggedLine = [nltk.pos_tag(sent) for sent in sents]
        return taggedLine
    def WordTokener(self, sent):  # split a single sentence string into words
        wordsInStr = nltk.word_tokenize(sent)
        return wordsInStr
    def WordCheck(self, words):  # spell checking
        d = enchant.Dict("en_US")
        checkedWords = []
        for word in words:
            if not d.check(word):
                print d.suggest(word)  # show suggestions for the misspelled word
                word = raw_input()     # let a human type the correction
            checkedWords.append(word)
        return checkedWords
    def CleanLines(self, line):
        identify = string.maketrans('', '')
        delEStr = string.punctuation + string.digits  # ASCII punctuation and digits
        # cleanLine = line.translate(identify, delEStr)  # remove ASCII punctuation and whitespace
        cleanLine = line.translate(identify, delEStr)  # remove ASCII punctuation
        return cleanLine
    def CleanWords(self, wordsInStr):  # drop punctuation, words shorter than 3 characters and non-alpha words; lowercase
        cleanWords = []
        stopwords = {}.fromkeys([line.rstrip() for line in open(conf.PreConfig.ENSTOPWORDS)])
        for words in wordsInStr:
            cleanWords += [[w.lower() for w in words if w.lower() not in stopwords and 3 <= len(w)]]
        return cleanWords
    def StemWords(self, cleanWordsList):
        stemWords = []
        # porter = nltk.PorterStemmer()  # a PhD told me this stemmer does not work well and is not very professional
        # result = [porter.stem(t) for t in cleanTokens]
        for words in cleanWordsList:
            stemWords += [[wn.morphy(w) for w in words]]
        return stemWords
    def WordsToStr(self, stemWords):
        strLine = []
        for words in stemWords:
            strLine += [w for w in words]
        return strLine
    def mkdir(self, path):
        # strip leading and trailing whitespace
        path = path.strip()
        # strip a trailing \ character
        path = path.rstrip("\\")
        # check whether the path already exists
        isExists = os.path.exists(path)
        if not isExists:
            # create the directory if it does not exist yet
            print path + ' created'
            os.makedirs(path)
            return True
        else:
            # the directory already exists, so do not create it again
            print path + ' already exists'
            return False
    def EnPreMain(self, dir):
        for root, dirs, files in os.walk(dir):
            for eachfiles in files:
                croupPath = os.path.join(root, eachfiles)
                print croupPath
                resultPath = conf.PreConfig.NLTKRESULTPATH + croupPath.split('/')[-2] + '/' + croupPath.split('/')[-1]
                raw = self.FileRead(croupPath).strip()
                sents = self.SenToken(raw)
                # taggedLine = self.POSTagger(sents)  # POS tagging disabled for now
                cleanLines = [self.CleanLines(line) for line in sents]
                words = [self.WordTokener(cl) for cl in cleanLines]
                # checkedWords = self.WordCheck(words)  # spell checking disabled for now
                cleanWords = self.CleanWords(words)
                stemWords = self.StemWords(cleanWords)
                # cleanWords = self.CleanWords(stemWords)  # the second cleanup has issues, disabled for now
                strLine = self.WordsToStr(stemWords)
                self.WriteResult(strLine, resultPath)  # each file is saved as a single line for now
    def StandardTokener(self, raw):
        result = ''
        # not finished yet
        return result

enPre = EnPreprocess()
enPre.EnPreMain(conf.PreConfig.ENCORUPPATH)
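The script assumes a config module with a nested PreConfig class; a minimal sketch of what it might look like (the paths are placeholders, not from the original project):
# config.py -- minimal sketch, paths are placeholders
class Config:
    class PreConfig:
        ENSTOPWORDS = 'stopwords/english.txt'  # one stopword per line
        ENCORUPPATH = 'corpus/en/'             # input corpus directory
        NLTKRESULTPATH = 'result/nltk/'        # directory where results are written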
PS: I still haven't managed to get the Stanford toolkit working; if anyone has used it, please show me how.