Natural Language Processing with Python - Notes
阿新 · Published: 2019-02-17
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from nltk.book import *

# Show the contexts in which a particular word occurs
text1.concordance("monstrous")

# Find words used in similar contexts
text1.similar("monstrous")

# Find contexts shared by two or more words
text2.common_contexts(["monstrous", "very"])

# Draw a dispersion plot of where the words occur in the text
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])

# Generate random text -- this raised an error with the NLTK version used here:
# text3.generate()
# Traceback (most recent call last):
#   File "E:/nlp/eg1.py", line 25, in <module>
#     text3.generate()
# TypeError: generate() missing 1 required positional argument: 'words'

# Number of tokens (total identifiers) in the text
print(len(text3))

# Vocabulary: the distinct word types, shown with set(), and how many there are
print(sorted(set(text3)))
print(len(set(text3)))

# Measure how many times each word type is used on average
from __future__ import division  # this statement must appear at the very top of the file (not needed under Python 3)
print(len(text3) / len(set(text3)))

# Count how often a particular word occurs, and what percentage of the text it makes up
print(text3.count("smote"))
print(100 * text4.count('a') / len(text4))

# Frequency distribution of the words
fdist1 = FreqDist(text1)
# Print a summary of the distribution (number of samples and outcomes)
print(fdist1)
# In Python 3, dict.keys() returns an iterable but not an indexable object,
# so wrap it in list() before slicing
vac1 = list(fdist1.keys())
# Print the fifty most frequent words
# (in NLTK 3 the keys are no longer sorted by frequency; see the sketch after this block)
print(vac1[:50])
# Print the count of "whale"
print(fdist1["whale"])
# Frequency plot of the fifty most frequent words (pass cumulative=True for the book's cumulative plot)
fdist1.plot(50)

# Find words longer than 15 characters
V = set(text1)
long_words = [w for w in V if len(w) > 15]
print(sorted(long_words))

# Find words longer than 7 characters that occur more than 7 times
fdist5 = FreqDist(text5)
print(sorted([w for w in set(text5) if len(w) > 7 and fdist5[w] > 7]))

# Bigrams (pairs of adjacent words)
from nltk import bigrams
# According to the function description on the NLTK website, you have to wrap the
# result in list() for the output to match what the book shows
print(list(bigrams(['more', 'is', 'said', 'than', 'done'])))

# Common collocations in the text (collocations() prints its result itself and returns None)
text4.collocations()

# Distribution of word lengths in text1
print([len(w) for w in text1])
fdist = FreqDist([len(w) for w in text1])
print(fdist)
print(fdist.keys())
print(fdist.items())
print(fdist.max())
print(fdist[3])
print(fdist.freq(3))

# Words ending in "ableness"
print(sorted([w for w in set(text1) if w.endswith('ableness')]))

# babelize_shell() has been removed from recent NLTK releases
# (the online Babel Fish translation service it relied on no longer exists)
# print(babelize_shell())
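
In NLTK 3, FreqDist.keys() is no longer sorted by frequency, so slicing vac1[:50] above does not actually give the fifty most frequent words. A minimal sketch of the NLTK 3 way, assuming nltk and matplotlib are installed and the book corpora have been downloaded (for example via nltk.download('book')):

# Top-50 words with NLTK 3's FreqDist API
from nltk import FreqDist
from nltk.book import text1

fdist1 = FreqDist(text1)

# most_common(n) returns (word, count) pairs sorted by descending frequency,
# which is what fdist1.keys()[:50] used to give in the Python 2 version of the book
top50 = fdist1.most_common(50)
print(top50[:10])   # peek at the ten most frequent tokens and their counts

# the book's cumulative frequency plot of the fifty most frequent words
fdist1.plot(50, cumulative=True)

most_common(50) replaces the old keys()-slicing idiom, and cumulative=True reproduces the cumulative plot shown in the book.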
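
The two ratios computed earlier (tokens per word type, and a word's share of the text) can also be wrapped in small helper functions, in the spirit of chapter 1 of the book. A minimal sketch, reusing text3 and text4 from nltk.book; the function names follow the book, while the definitions simply mirror the calculations above:

from nltk.book import text3, text4

def lexical_diversity(text):
    # average number of times each word type is used (same ratio as above)
    return len(text) / len(set(text))

def percentage(count, total):
    # share of the text a given count represents, in percent
    return 100 * count / total

print(lexical_diversity(text3))
print(percentage(text4.count('a'), len(text4)))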