
Reading and Processing Raw English Text

Note: the code in this post runs under Python 3. Python 3 differs from Python 2 in a number of details, so please bear that in mind. The post consists mainly of code, with detailed comments inside the code. Related articles will be published in my personal blog column "Python Natural Language Processing"; you are welcome to follow it.

 

1. Downloading a txt file from the web

import nltk, re, pprint
from nltk import word_tokenize
from urllib.request import urlopen

# Download a txt file from the web
url = "http://www.gutenberg.org/files/2553/2553.txt"  # URL of the file
response = urlopen(url)  # fetch the file at the URL
raw = response.read().decode('utf8')  # read the response body and decode it as UTF-8
print(type(raw))  # type of the text
print(len(raw))  # length of the text
print(raw[:10])  # first 10 characters of the text

<class 'str'>
653642
The Projec
# Tokenization
tokens = word_tokenize(raw)
print(type(tokens))
print(len(tokens))
print(tokens[:10])

<class 'list'>
131869
['The', 'Project', 'Gutenberg', 'EBook', 'of', 'Jeanne', "d'Arc", ',', 'by', 'Mrs.']
# Build an nltk.Text object
text = nltk.Text(tokens)
print(type(text))
print(text[1024:1062])
print(text.collocations())  # frequent collocations; collocations() prints its result and returns None, hence the None below

<class 'nltk.text.Text'>
['century', '.', 'A', 'strong', 'and', 'splendid', 'kingdom', ',', 'to', 'which', 'in', 'early', 'ages', 'one', 'great', 'man', 'had', 'given', 'the', 'force', 'and', 'supremacy', 'of', 'a', 'united', 'nation', ',', 'had', 'fallen', 'into', 'a', 'disintegration', 'which', 'seems', 'almost', 'incredible', 'when', 'regarded']
Project Gutenberg-tm; St. Catherine; would seem; Project Gutenberg;
St. Margaret; Frere Isambard; St. Michael; Literary Archive; St.
Denis; Gutenberg-tm electronic; Archive Foundation; electronic works;
Gutenberg Literary; fifteenth century; United States; Church militant;
Holy Father; set forth; fifteen days; Jacques d'Arc
None
print(raw.find("Frere Isambard"))  # index of the first occurrence

415623


print(raw.rfind("fifteen days"))  # index of the last occurrence (rfind searches from the right)

502248


raw = raw[415623:502248]  # keep only the slice between the two offsets found above
raw

Out[14]: 'Frere Isambard, who was\r\nthe person in question, speaks at a later period he tells us that "the\r\nquestions put to Jeanne were too difficult, subtle, and dangerous, so\r\nthat the great clerks and learned men who were present scarcely would\r\nhave known 
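Hard-coding the offsets 415623 and 502248 is brittle: if Project Gutenberg re-issues the file, the numbers shift. A minimal sketch that computes the slice boundaries at run time instead, assuming the same two marker phrases still occur in the text:

# Minimal sketch: compute the slice boundaries at run time instead of
# hard-coding the offsets printed above. The marker phrases come from this
# particular e-text and are assumptions for any other file.
start = raw.find("Frere Isambard")   # index of the first occurrence
end = raw.rfind("fifteen days")      # index of the last occurrence
if start != -1 and end != -1:        # find()/rfind() return -1 when the marker is missing
    raw = raw[start:end]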

 

2. Downloading text in HTML format

# Download an HTML page
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read().decode('utf8')

# Parse the HTML
from bs4 import BeautifulSoup
raw = BeautifulSoup(html, 'lxml').get_text()  # use the lxml parser; get_text() extracts all the text
tokens = word_tokenize(raw)  # tokenize

bs = BeautifulSoup(html, 'lxml')
print(bs.find("div", class_='bodytext').get_text())  # first <div> whose class is 'bodytext'; bs.find_all() returns all of them

# Filter out irrelevant content (navigation, sidebars, etc.)
tokens = tokens[110:390]
text = nltk.Text(tokens)
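Slicing tokens[110:390] relies on counting by hand where the navigation text ends. An alternative sketch is to tokenize only the article body extracted above; note that the 'bodytext' class name is specific to this old BBC page layout and is an assumption for other sites.

# Sketch: tokenize only the article body instead of slicing the full token list.
# The 'bodytext' class name is specific to this BBC page.
body = bs.find("div", class_='bodytext')
if body is not None:                       # the tag may not exist on other pages
    body_tokens = word_tokenize(body.get_text())
    text = nltk.Text(body_tokens)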

 

3. Reading a local file

raw = open('F:/document.txt').read()  # read the whole file into one string (no encoding given, so the platform default is used)
print(type(raw))
tokens = word_tokenize(raw)  # tokenize
print(type(tokens))
words = [w.lower() for w in tokens]  # normalize to lower case
print(type(words))
vocab = sorted(set(words))  # sorted vocabulary
print(type(vocab))

<class 'str'>
<class 'list'>
<class 'list'>
<class 'list'>
vocab.append('blog')
# raw.append('blog')  # watch the types when appending: raw is a str and has no append() method


# query = 'Who knows?'
# beatles = ['john', 'paul', 'george', 'ringo']
# query + beatles  # types must also match when concatenating: str + list raises a TypeError
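To combine a string with a list, convert one side so that the two types match; a small example:

# Make the types match before combining:
query = 'Who knows?'
beatles = ['john', 'paul', 'george', 'ringo']
print(query.split() + beatles)          # list + list
print(query + ' ' + ' '.join(beatles))  # str + str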

Commonly used string methods:

s.find(t)        index of the first occurrence of t in s (-1 if not found)
s.rfind(t)       index of the last occurrence of t in s
s.index(t)       like s.find(t), but raises ValueError if t is not found
s.rindex(t)      like s.rfind(t), but raises ValueError if t is not found
s.join(text)     combine the strings in text into one string, using s as the glue
s.split(t)       split s into a list wherever t occurs (whitespace by default)
s.splitlines()   split s into a list of lines
s.lower()        a lowercased version of s
s.upper()        an uppercased version of s
s.title()        a titlecased version of s
s.strip()        a copy of s without leading or trailing whitespace
s.replace(t, u)  replace occurrences of t in s by u

 

4. Unicode characters

path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')  # a Polish-language sample file
f = path.open(encoding='latin2')  # open it with the Latin-2 (ISO-8859-2) encoding
for line in f:
    line = line.strip()
    print(line)
    
Pruska Biblioteka Państwowa. Jej dawne zbiory znane pod nazwą
"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez
Niemców pod koniec II wojny światowej na Dolny Śląsk, zostały
odnalezione po 1945 r. na terytorium Polski. Trafiły do Biblioteki
Jagiellońskiej w Krakowie, obejmują ponad 500 tys. zabytkowych
archiwaliów, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.
ord('a')  # look up the integer code point of a character

Out[21]: 97
a = u'\u0061'  # the Unicode escape sequence for the letter 'a'
print(a)

a
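ord() and chr() convert between characters and code points, while str.encode() and bytes.decode() convert between text and bytes. A short illustration using a Polish character from the file above (the character is chosen only for illustration):

# Characters, code points, and bytes.
nacute = '\u0144'                   # the Polish letter n with an acute accent
print(ord(nacute))                  # 324, its Unicode code point
print(chr(324))                     # back to the character
print(nacute.encode('utf8'))        # b'\xc5\x84', the UTF-8 byte sequence
print(b'\xc5\x84'.decode('utf8'))   # decode the bytes back into a str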

 

5. Regular expressions

Commonly used regular expression symbols:

.          wildcard, matches any single character
^abc       matches abc at the start of a string
abc$       matches abc at the end of a string
[abc]      matches any one of the characters in the set
[A-Z0-9]   matches any one character in the given range(s)
ed|ing|s   matches one of the listed alternatives
*          zero or more of the previous item, e.g. a*, [a-z]*
+          one or more of the previous item
?          zero or one of the previous item (optional)
{n}        exactly n repeats of the previous item
{n,m}      between n and m repeats of the previous item
(...)      groups a pattern and limits the scope of |

# Regular expressions
import re  # the regular-expression module
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
# Find words ending in "ed"
print([w for w in wordlist if re.search('ed$', w)])

['abaissed', 'abandoned', 'abased', 'abashed', 'abatised', 'abed', 'aborted', 'abridged', 'abscessed', 'absconded', 'absorbed', 'abstracted', 'abstricted', 'accelerated', 'accepted', 'accidented', ...]
# Word puzzle: an 8-letter word whose 3rd letter is j and 6th letter is t
print([w for w in wordlist if re.search('^..j..t..$', w)])
['abjectly', 'adjuster', 'dejected', 'dejectly', 'injector', 'majestic', 'objectee', 'objector', 'rejecter', 'rejector', 'unjilted', 'unjolted', 'unjustly']
# T9 phone-keypad input: words that can be typed with the keys 4, 6, 5, 3
print([w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)])
['gold', 'golf', 'hold', 'hole']
# The + operator (one or more repeats) in a regular expression
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
print([w for w in chat_words if re.search('^m+i+n+e+$', w)])
['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee', 'miiiiiinnnnnnnnnneeeeeeee', 'mine', 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']
# Extracting character blocks
word = 'supercalifragilisticexpialidocious'
print(re.findall(r'[aeiou]', word))  # regex patterns are usually written as raw strings (the r prefix)
print(len(re.findall(r'[aeiou]', word)))

['u', 'e', 'a', 'i', 'a', 'i', 'i', 'i', 'e', 'i', 'a', 'i', 'o', 'i', 'o', 'u']
16
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
def compress(word):  # keep initial and final vowel sequences plus all consonants; drop internal vowels
    pieces = re.findall(regexp, word)
    return ''.join(pieces)
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and
of the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn
of frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn
rghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,
and the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()  # consonant-vowel cross-tabulation built with a regular expression

    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 
re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')  # findall returns only the contents of the group (the parentheses)

Out[41]: ['ing']


re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')  # (?:...) is a non-capturing group, so the whole match is returned

Out[42]: ['processing']


re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')  # two groups: split into stem and suffix

Out[43]: [('process', 'ing')]


re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')  # greedy .* grabs too much when the word ends in es or s

Out[44]: [('processe', 's')]


re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')  # the non-greedy .*? yields the full suffix and the correct stem

Out[45]: [('process', 'es')]
def stems(word):  # strip a suffix from a word
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
    is no basis for a system of government.  Supreme executive power derives from
    a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)
[stems(t) for t in tokens]

Out[47]: 
['DENNIS',
 ':',
 'Listen',
 ',',
 'strange',
 'women',
 'ly',
 'in',
 'pond',
 'distribut',
 'sword',
 'i',
 'no',
 'basi',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'Supreme',
 'execut',
 'power',
 'deriv',
 'from',
 'a',
 'mandate',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcical',
 'aquatic',
 'ceremony',
 '.']

 

6. Normalizing text

# Stemmers (two different algorithms)
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
[porter.stem(t) for t in tokens]

Out[54]: 
['denni',
 ':',
 'listen',
 ',',
 'strang',
 'women',
 'lie',
 'in',
 'pond',
 'distribut',
 'sword',
 'is',
 'no',
 'basi',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'suprem',
 'execut',
 'power',
 'deriv',
 'from',
 'a',
 'mandat',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcic',
 'aquat',
 'ceremoni',
 '.']
[lancaster.stem(t) for t in tokens]
Out[55]: 
['den',
 ':',
 'list',
 ',',
 'strange',
 'wom',
 'lying',
 'in',
 'pond',
 'distribut',
 'sword',
 'is',
 'no',
 'bas',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'suprem',
 'execut',
 'pow',
 'der',
 'from',
 'a',
 'mand',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'som',
 'farc',
 'aqu',
 'ceremony',
 '.']
wnl = nltk.WordNetLemmatizer()  # lemmatization: map each word to its dictionary form
[wnl.lemmatize(t) for t in tokens]
Out[56]: 
['DENNIS',
 ':',
 'Listen',
 ',',
 'strange',
 'woman',
 'lying',
 'in',
 'pond',
 'distributing',
 'sword',
 'is',
 'no',
 'basis',
 'for',
 'a',
 'system',
 'of',
 'government',
 '.',
 'Supreme',
 'executive',
 'power',
 'derives',
 'from',
 'a',
 'mandate',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcical',
 'aquatic',
 'ceremony',
 '.']
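By default WordNetLemmatizer treats every token as a noun, which is why 'lying' and 'derives' are left unchanged above. Passing a part-of-speech tag changes the result; a short example:

# The lemmatizer defaults to nouns; pass pos='v' to lemmatize verbs.
print(wnl.lemmatize('lying'))             # 'lying' (treated as a noun)
print(wnl.lemmatize('lying', pos='v'))    # 'lie'
print(wnl.lemmatize('derives', pos='v'))  # 'derive'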

 

7. Segmentation

# Example: sentence segmentation
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text)
pprint.pprint(sents[79:89])

['"Nonsense!"',
 'said Gregory, who was very rational when anyone else\nattempted paradox.',
 '"Why do all the clerks and navvies in the\n'
 'railway trains look so sad and tired, so very sad and tired?',
 'I will\ntell you.',
 'It is because they know that the train is going right.',
 'It\n'
 'is because they know that whatever place they have taken a ticket\n'
 'for that place they will reach.',
 'It is because after they have\n'
 'passed Sloane Square they know that the next station must be\n'
 'Victoria, and nothing but Victoria.',
 'Oh, their wild rapture!',
 'oh,\n'
 'their eyes like stars and their souls again in Eden, if the next\n'
 'station were unaccountably Baker Street!"',
 '"It is you who are unpoetical," replied the poet Syme.']
def segment(text, segs):  # word segmentation example: segs is a bit string marking word boundaries
    words = []
    last = 0
    for i in range(len(segs)):
        if segs[i] == '1':
            words.append(text[last:i+1])
            last = i+1
    words.append(text[last:])
    return words
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
segment(text, seg1)

Out[63]: ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
segment(text, seg2)

Out[64]: 
['do',
 'you',
 'see',
 'the',
 'kitty',
 'see',
 'the',
 'doggy',
 'do',
 'you',
 'like',
 'the',
 'kitty',
 'like',
 'the',
 'doggy']
def evaluate(text, segs):  # score a segmentation: word count plus lexicon size (smaller is better)
    words = segment(text, segs)
    text_size = len(words)
    lexicon_size = len(' '.join(list(set(words))))
    return text_size + lexicon_size
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
seg3 = "0000100100000011001000000110000100010000001100010000001"
print(evaluate(text, seg3))

46


print(evaluate(text, seg2))

47


print(evaluate(text, seg1))

63
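For seg1 the score of 63 breaks down as follows: segment() produces 4 word tokens, and the lexicon (the 4 distinct words joined by spaces) has 16 + 11 + 17 + 12 characters plus 3 spaces = 59, giving 4 + 59 = 63. seg1 scores worst because its four long words are all different, so the lexicon is large; segmentations that reuse shorter chunks get lower (better) scores.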
# Simulated annealing: a non-deterministic search for a low-scoring segmentation
from random import randint
def flip(segs, pos):  # toggle the boundary bit at position pos
    return segs[:pos] + str(1-int(segs[pos])) + segs[pos+1:]
def flip_n(segs, n):  # toggle n randomly chosen boundary bits
    for i in range(n):
        segs = flip(segs, randint(0, len(segs)-1))
    return segs
def anneal(text, segs, iterations, cooling_rate):
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            guess = flip_n(segs, int(round(temperature,0)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        temperature = temperature / cooling_rate
        print(evaluate(text, segs), segment(text, segs))
    print()
    return segs
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
anneal(text, seg1, 5000, 1.2)

63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
61 ['doyousee', 'thekitty', 'seeth', 'edoggy', 'doyou', 'l', 'i', 'ke', 'thekitty', 'liketh', 'edoggy']
57 ['doyou', 'see', 'thekitty', 'se', 'eth', 'edoggy', 'doyou', 'l', 'ike', 'thekitty', 'liketh', 'edoggy']
54 ['doyou', 'see', 'thekitty', 'se', 'eth', 'edoggy', 'doyou', 'l', 'ik', 'e', 'thekitty', 'l', 'ik', 'eth', 'edoggy']
52 ['doyou', 'se', 'e', 'thekitty', 'se', 'eth', 'edoggy', 'doyou', 'like', 'thekitty', 'lik', 'eth', 'edoggy']
52 ['doyou', 'se', 'e', 'thekitty', 'se', 'eth', 'edoggy', 'doyou', 'like', 'thekitty', 'lik', 'eth', 'edoggy']
51 ['doyou', 'se', 'e', 'thekitty', 'se', 'eth', 'edoggy', 'doyou', 'like', 'thekitty', 'like', 'th', 'edoggy']
51 ['doyou', 'se', 'e', 'thekitty', 'se', 'eth', 'edoggy', 'doyou', 'like', 'thekitty', 'like', 'th', 'edoggy']
48 ['doyou', 'se', 'e', 'thekitty', 'se', 'e', 'th', 'edoggy', 'doyou', 'like', 'thekitty', 'like', 'th', 'edoggy']
45 ['doyou', 'see', 'thekitty', 'see', 'th', 'edoggy', 'doyou', 'like', 'thekitty', 'like', 'th', 'edoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
Out[68]: '0000100100000001001000000010000100010000000100010000000'

 

8. Lists and strings

# From a list to a string
silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']
print(' '.join(silly))

We called him Tortoise because he taught us .
fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])
for word in sorted(fdist):
    print(word, '->', fdist[word], '; ')
    
cat -> 3 ; 
dog -> 4 ; 
snake -> 1 ; 
for word in sorted(fdist):
    print('%s->%d;' % (word, fdist[word]))
    
cat->3;
dog->4;
snake->1;
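The %-style formatting above still works in Python 3; the same output can be produced with str.format or with an f-string (Python 3.6+), as a sketch:

# Equivalent formatting with str.format and with an f-string:
for word in sorted(fdist):
    print('{}->{};'.format(word, fdist[word]))
for word in sorted(fdist):
    print(f'{word}->{fdist[word]};')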
# Example: define and use a cross-tabulation function
def tabulate(cfdist, words, categories):
    print('%-16s' % 'Category', end=' ')
    for word in words:                                      # column headings
        print('%6s' % word, end=' ')
    print()
    for category in categories:
        print('%-16s' % category, end=' ')                  # row heading
        for word in words:                                  # for each word
            print('%6d' % cfdist[category][word], end=' ')  # print the table cell
        print()                                             # end the row
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
tabulate(cfd, modals, genres)
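NLTK's ConditionalFreqDist also provides a built-in tabulate method that produces the same kind of table without a hand-written loop:

# Built-in equivalent of the hand-written tabulate() above:
cfd.tabulate(conditions=genres, samples=modals)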