1. 程式人生 > >[NLP]預處理--使用re正則化進行文字清理

[NLP]預處理--使用re正則化進行文字清理

文字清理:在自然語言處理中,儘管文字清理的具體做法受所做的任務影響比較大,但仍有一些通用的清理流程,比如是否有必要替換單位、貨幣、數學符號、數字。可以使用正則表達式工具(Python 的 re 模組)將相應內容替換為標準內容。

工具:re(Python 標準函式庫的正則表達式模組,簡介見官方文件)

輸入:原始文字

輸出:乾淨文字

單位替換

將文字中的單位替換為統一格式,如:將4kgs、4kg統一替換為4 kg,將4k替換為4000,將$100或100$替換為100 dollar。

import random
import re
text = "I want to lose 4kgs in a month. What does 4k mean in a salary?What is the best way to make money with $100?"

# Units: normalise weight, "k" (thousand) and dollar notations.
# The first three patterns require a trailing space after the unit, so a unit
# directly followed by punctuation (e.g. "4kg.") is left untouched.
text = re.sub(r"(\d+)kgs ", lambda m: m.group(1) + ' kg ', text)    # e.g. 4kgs => 4 kg
text = re.sub(r"(\d+)kg ", lambda m: m.group(1) + ' kg ', text)     # e.g. 4kg => 4 kg
text = re.sub(r"(\d+)k ", lambda m: m.group(1) + '000 ', text)      # e.g. 4k => 4000
text = re.sub(r"\$(\d+)", lambda m: m.group(1) + ' dollar ', text)  # e.g. $100 => 100 dollar
text = re.sub(r"(\d+)\$", lambda m: m.group(1) + ' dollar ', text)  # e.g. 100$ => 100 dollar

text
'I want to lose 4 kg in a month. What does 4000 mean in a salary?What is the best way to make money with 100 dollar ?'

略縮詞替換

將文字中首字母略縮詞替換為完整單詞,如can’t、cannot替換為can not,'ve替換為have,c#替換為csharp等。

text = "Why India can't compete with China in manufacturing. What is the biggest scam you've ever seen? Why Should I Learn c#? "

# Contractions and abbreviations, expressed as (pattern, replacement) pairs.
# The rules are applied strictly in the order listed: specific forms such as
# "can't" must be rewritten before the generic "n't" rule, and the catch-all
# "'s" rule runs last so that "what's" has already been expanded.
abbreviation_rules = [
    (r"can\'t", "can not"),
    (r"cannot", "can not "),
    (r"what\'s", "what is"),
    (r"What\'s", "what is"),
    (r"\'ve ", " have "),
    (r"n\'t", " not "),
    (r"i\'m", "i am "),
    (r"I\'m", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    # Programming-language names containing symbols.
    (r"c\+\+", "cplusplus"),
    (r"c \+\+", "cplusplus"),
    (r"c \+ \+", "cplusplus"),
    (r"c#", "csharp"),
    (r"f#", "fsharp"),
    (r"g#", "gsharp"),
    # Spelling variants of "email".
    (r" e mail ", " email "),
    (r" e \- mail ", " email "),
    (r" e\-mail ", " email "),
    # Drop the thousands separator, e.g. "10,000" => "10000".
    (r",000", '000'),
    # Any remaining "'s" is replaced by a space.
    (r"\'s", " "),
]
for rule_pattern, rule_replacement in abbreviation_rules:
    text = re.sub(rule_pattern, rule_replacement, text)

text
'Why India can not compete with China in manufacturing. What is the biggest scam you have ever seen? Why Should I Learn csharp? '

拼寫校對

如將ph.d、PhD替換為phd,去掉多餘空格,將縮寫替換為全拼、將阿拉伯數字替換為英文數字、將美元複數替換為單數等。

text = "ph.d PhD pokemons e g fb usa 1 2 3 googling rs1 dollars"

# Spelling normalisation, expressed as (pattern, replacement) pairs that are
# applied one after another in the order listed. A replacement is either a
# plain string or a callable receiving the match object.
spelling_rules = [
    (r"ph\.d", "phd"),
    (r"PhD", "phd"),
    (r"pokemons", "pokemon"),
    (r"pokémon", "pokemon"),
    (r"pokemon go ", "pokemon-go "),
    # Letter/digit sequences that were split by spaces.
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" 9 11 ", " 911 "),
    (r" j k ", " jk "),
    (r" fb ", " facebook "),
    (r"facebooks", " facebook "),
    (r"facebooking", " facebook "),
    (r"insidefacebook", "inside facebook"),
    (r"donald trump", "trump"),
    (r"the big bang", "big-bang"),
    (r"the european union", "eu"),
    # Variants of "USA" / "America" (space-delimited tokens only).
    (r" usa ", " america "),
    (r" us ", " america "),
    (r" u s ", " america "),
    (r" U\.S\. ", " america "),
    (r" US ", " america "),
    (r" American ", " america "),
    (r" America ", " america "),
    (r" quaro ", " quora "),
    (r" mbp ", " macbook-pro "),
    (r" mac ", " macbook "),
    (r"macbook pro", "macbook-pro"),
    (r"macbook-pros", "macbook-pro"),
    # Stand-alone single digits become English words.
    (r" 1 ", " one "),
    (r" 2 ", " two "),
    (r" 3 ", " three "),
    (r" 4 ", " four "),
    (r" 5 ", " five "),
    (r" 6 ", " six "),
    (r" 7 ", " seven "),
    (r" 8 ", " eight "),
    (r" 9 ", " nine "),
    (r"googling", " google "),
    (r"googled", " google "),
    (r"googleable", " google "),
    (r"googles", " google "),
    # Rupee amounts: put "rs" in front of the number, separated by a space.
    (r" rs(\d+)", lambda m: ' rs ' + m.group(1)),
    (r"(\d+)rs", lambda m: ' rs ' + m.group(1)),
    # Same pattern as the earlier "the european union" rule; by this point
    # that rule has already rewritten every occurrence.
    (r"the european union", " eu "),
    (r"dollars", " dollar "),
]
for rule_pattern, rule_replacement in spelling_rules:
    text = re.sub(rule_pattern, rule_replacement, text)

text
'phd phd pokemon eg facebook america one two three  google  rs 1  dollar '

標點處理

在標點兩旁加上空格,並將撇號(')替換為空格。

text = "1+1=2 What is the biggest scam you have ever seen?I am learning csharp?"

# Punctuation handling: surround each punctuation mark with spaces so that it
# becomes a separate token; the apostrophe is replaced by a single space.
text = re.sub(r"\+", " + ", text)
text = re.sub(r"'", " ", text)
text = re.sub(r"-", " - ", text)
text = re.sub(r"/", " / ", text)
# In a replacement template a literal backslash must be written as "\\";
# the raw string avoids an invalid "\ " escape in the Python source.
text = re.sub(r"\\", r" \\ ", text)
text = re.sub(r"=", " = ", text)
text = re.sub(r"\^", " ^ ", text)
text = re.sub(r":", " : ", text)
text = re.sub(r"\.", " . ", text)
text = re.sub(r",", " , ", text)
text = re.sub(r"\?", " ? ", text)
text = re.sub(r"!", " ! ", text)
text = re.sub(r"\"", " \" ", text)
text = re.sub(r"&", " & ", text)
text = re.sub(r"\|", " | ", text)
text = re.sub(r";", " ; ", text)
text = re.sub(r"\(", " ( ", text)
# A closing parenthesis stays a closing parenthesis.
text = re.sub(r"\)", " ) ", text)

text
'1 + 1 = 2 What is the biggest scam you have ever seen ? I am learning csharp ? '

符號替換

將邏輯符號替換為單詞。

text = "1   + 1 =   2   ₹    "

# Symbol replacement: logical, arithmetic and currency symbols are spelled out
# as words, each padded with a space on both sides. Rules run in listed order.
symbol_rules = [
    (r"&", " and "),
    (r"\|", " or "),
    (r"=", " equal "),
    (r"\+", " plus "),
    (r"₹", " rs "),  # the rupee sign is the case exercised by the sample text
    (r"\$", " dollar "),
]
for rule_pattern, rule_replacement in symbol_rules:
    text = re.sub(rule_pattern, rule_replacement, text)

text
'1    plus  1  equal    2    rs     '

移除多餘空格

# Remove extra whitespace: str.split() with no separator splits on runs of
# whitespace and discards leading/trailing blanks, so re-joining the pieces
# with a single space collapses every run to exactly one space.
text = ' '.join(text.split())

text
'1 plus 1 equal 2 rs'

所有程式碼

def clean_text(text):
    """
    Clean text
    :param text: the string of text
    :return: text string after cleaning
    """
    # unit
    text = re.sub(r"(\d+)kgs ", lambda m: m.group(1) + ' kg ', text)        # e.g. 4kgs => 4 kg
    text = re.sub(r"(\d+)kg ", lambda m: m.group(1) + ' kg ', text)         # e.g. 4kg => 4 kg
    text = re.sub(r"(\d+)k ", lambda m: m.group(1) + '000 ', text)          # e.g. 4k => 4000
    text = re.sub(r"\$(\d+)", lambda m: m.group(1) + ' dollar ', text)
    text = re.sub(r"(\d+)\$", lambda m: m.group(1) + ' dollar ', text)

    # acronym
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"cannot", "can not ", text)
    text = re.sub(r"what\'s", "what is", text)
    text = re.sub(r"What\'s", "what is", text)
    text = re.sub(r"\'ve ", " have ", text)
    text = re.sub(r"n\'t", " not ", text)
    text = re.sub(r"i\'m", "i am ", text)
    text = re.sub(r"I\'m", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"c\+\+", "cplusplus", text)
    text = re.sub(r"c \+\+", "cplusplus", text)
    text = re.sub(r"c \+ \+", "cplusplus", text)
    text = re.sub(r"c#", "csharp", text)
    text = re.sub(r"f#", "fsharp", text)
    text = re.sub(r"g#", "gsharp", text)
    text = re.sub(r" e mail ", " email ", text)
    text = re.sub(r" e \- mail ", " email ", text)
    text = re.sub(r" e\-mail ", " email ", text)
    text = re.sub(r",000", '000', text)
    text = re.sub(r"\'s", " ", text)

    # spelling correction
    text = re.sub(r"ph\.d", "phd", text)
    text = re.sub(r"PhD", "phd", text)
    text = re.sub(r"pokemons", "pokemon", text)
    text = re.sub(r"pokémon", "pokemon", text)
    text = re.sub(r"pokemon go ", "pokemon-go ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" 9 11 ", " 911 ", text)
    text = re.sub(r" j k ", " jk ", text)
    text = re.sub(r" fb ", " facebook ", text)
    text = re.sub(r"facebooks", " facebook ", text)
    text = re.sub(r"facebooking", " facebook ", text)
    text = re.sub(r"insidefacebook", "inside facebook", text)
    text = re.sub(r"donald trump", "trump", text)
    text = re.sub(r"the big bang", "big-bang", text)
    text = re.sub(r"the european union", "eu", text)
    text = re.sub(r" usa ", " america ", text)
    text = re.sub(r" us ", " america ", text)
    text = re.sub(r" u s ", " america ", text)
    text = re.sub(r" U\.S\. ", " america ", text)
    text = re.sub(r" US ", " america ", text)
    text = re.sub(r" American ", " america ", text)
    text = re.sub(r" America ", " america ", text)
    text = re.sub(r" quaro ", " quora ", text)
    text = re.sub(r" mbp ", " macbook-pro ", text)
    text = re.sub(r" mac ", " macbook ", text)
    text = re.sub(r"macbook pro", "macbook-pro", text)
    text = re.sub(r"macbook-pros", "macbook-pro", text)
    text = re.sub(r" 1 ", " one ", text)
    text = re.sub(r" 2 ", " two ", text)
    text = re.sub(r" 3 ", " three ", text)
    text = re.sub(r" 4 ", " four ", text)
    text = re.sub(r" 5 ", " five ", text)
    text = re.sub(r" 6 ", " six ", text)
    text = re.sub(r" 7 ", " seven ", text)
    text = re.sub(r" 8 ", " eight ", text)
    text = re.sub(r" 9 ", " nine ", text)
    text = re.sub(r"googling", " google ", text)
    text = re.sub(r"googled", " google ", text)
    text = re.sub(r"googleable", " google ", text)
    text = re.sub(r"googles", " google ", text)
    text = re.sub(r" rs(\d+)", lambda m: ' rs ' + m