Python Data Analysis Study Notes (9)
Chapter 9: Analyzing Textual Data and Social Media
1 Installing NLTK: omitted
2 Filtering Out Stopwords, Names, and Numbers
Example code:
import nltk

# Load the English stopword corpus
sw = set(nltk.corpus.stopwords.words('english'))
print('Stop words', list(sw)[:7])

# List some of the files in the Gutenberg corpus
gb = nltk.corpus.gutenberg
print('Gutenberg files', gb.fileids()[-5:])

# Take the first two sentences of milton-paradise.txt as the text to filter
text_sent = gb.sents("milton-paradise.txt")[:2]
print('Unfiltered', text_sent)

for sent in text_sent:
    # Filter out stopwords
    filtered = [w for w in sent if w.lower() not in sw]
    print('Filtered', filtered)
    # Tag the remaining words with their parts of speech
    tagged = nltk.pos_tag(filtered)
    print("Tagged", tagged)
    # Drop proper nouns (NNP) and cardinal numbers (CD)
    words = []
    for word in tagged:
        if word[1] != 'NNP' and word[1] != 'CD':
            words.append(word[0])
    print(words)

# Part-of-speech tagset mapping
# print(nltk.tag.tagset_mapping('ru-rnc', 'universal'))
Output:
Stop words ['his', 'only', 'because', 'with', 'each', 'myself', 'both']
Gutenberg files ['milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
Unfiltered [['[', 'Paradise', 'Lost', 'by', 'John', 'Milton', '1667', ']'], ['Book', 'I']]
Filtered ['[', 'Paradise', 'Lost', 'John', 'Milton', '1667', ']']
Tagged [('[', 'JJ'), ('Paradise', 'NNP'), ('Lost', 'NNP'), ('John', 'NNP'), ('Milton', 'NNP'), ('1667', 'CD'), (']', 'NN')]
['[', ']']
Filtered ['Book']
Tagged [('Book', 'NN')]
['Book']
The tag set used in this example:
{'PRP$',
'PDT',
'CD',
'EX',
'.',
'NNS',
'MD',
'PRP',
'RP',
'(',
'VBD',
'``',
"''",
'NN',   (noun)
'LS',
'VBN',
'WRB',
'IN',   (preposition)
'FW',
'POS',
'CC',   (coordinating conjunction)
':',
'DT',
'VBZ',
'RBS',
'RBR',
'WP$',
'RB',
'SYM',
'JJS',
'JJR',
'UH',
'WDT',
'#',
',',
')',
'VB',
'NNPS',
'VBP',  (verb)
'NNP',
'JJ',   (adjective)
'WP',
'VBG',
'$',
'TO'}   (the word "to")
These tags can be roughly grouped into the following 12 universal categories (see the mapping sketch after this list):
'VERB',
'NOUN',
'PRON',
'ADJ',
'ADV',
'ADP',
'CONJ',
'DET',
'NUM',
'PRT',
'X',
'.'
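To see how the fine-grained Penn Treebank tags collapse into these 12 universal categories, here is a minimal sketch; it assumes NLTK's default tagger and the universal tagset mapping have already been downloaded with nltk.download():

import nltk
from nltk.tag import map_tag

# Tag a short sentence with the default (Penn Treebank) tagger
tokens = nltk.word_tokenize("John read two good books")
tagged = nltk.pos_tag(tokens)
print(tagged)

# Map each fine-grained tag to one of the 12 universal categories
print([(word, map_tag('en-ptb', 'universal', tag)) for word, tag in tagged])

# Equivalently, ask pos_tag for the universal tagset directly
print(nltk.pos_tag(tokens, tagset='universal'))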
3 The Bag-of-Words Model
Installing scikit-learn: omitted
Example code:
import nltk
from sklearn.feature_extraction.text import CountVectorizer

# Load two plays from the Gutenberg corpus
gb = nltk.corpus.gutenberg
hamlet = gb.raw('shakespeare-hamlet.txt')
macbeth = gb.raw("shakespeare-macbeth.txt")

# Build a bag-of-words model, dropping English stopwords
cv = CountVectorizer(stop_words='english')

# Print the count matrix (one row per document)
print("Feature vector", cv.fit_transform([hamlet, macbeth]).toarray())

# Feature names are sorted alphabetically
print('Features', cv.get_feature_names()[:5])
Output:
Feature vector [[ 1 0 1..., 14 0 1]
[0 1 0 ..., 1 1 0]]
Features ['1599', '1603', 'abhominably', 'abhorred', 'abide']
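Each row of the matrix is one document and each column is one vocabulary word, so a row can be turned back into readable word counts. A minimal sketch, continuing from the cv, hamlet, and macbeth objects above ('king' is just an illustrative lookup):

# Recompute the count matrix and pair each feature name with its count in Hamlet
counts = cv.fit_transform([hamlet, macbeth]).toarray()
hamlet_counts = dict(zip(cv.get_feature_names(), counts[0]))

# How often 'king' appears in Hamlet after stopword removal
print(hamlet_counts['king'])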
4 Word Frequency Analysis
Example code:
import nltk
import string

def printLine(values, num, keyOrValue, tag):
    """
    Print the keys or values of the num most frequent entries in values, labeled with tag.
    :param values: a frequency distribution (dict-like)
    :param num: number of entries to print
    :param keyOrValue: 0 to print keys, 1 to print values
    :param tag: label for the output line
    """
    tmpValue = []
    for key in sorted(values.items(), key=lambda d: d[1], reverse=True)[:num]:
        tmpValue.append(key[keyOrValue])
    print(tag, ":", tmpValue)

# Load the document
gb = nltk.corpus.gutenberg
words = gb.words("shakespeare-caesar.txt")

# Remove stopwords and punctuation
sw = set(nltk.corpus.stopwords.words('english'))
punctuation = set(string.punctuation)
filtered = [w.lower() for w in words if w.lower() not in sw and w.lower() not in punctuation]

# Build a FreqDist and print the most frequent words and their counts
fd = nltk.FreqDist(filtered)
printLine(fd, 5, 0, "Words")
printLine(fd, 5, 1, "Counts")
print('Max', fd.max())
print('Count', fd['caesar'])

# The most frequent bigrams and their counts
fd = nltk.FreqDist(nltk.bigrams(filtered))
printLine(fd, 5, 0, "Bigrams")
printLine(fd, 5, 1, "Counts")
print('Bigram Max', fd.max())
print('Bigram count', fd[('let', 'vs')])

# The most frequent trigrams and their counts
fd = nltk.FreqDist(nltk.trigrams(filtered))
printLine(fd, 5, 0, "Trigrams")
printLine(fd, 5, 1, "Counts")
print('Trigram Max', fd.max())
print('Trigram count', fd[('enter', 'lucius', 'luc')])
Output:
Words : ['caesar', 'brutus', 'bru', 'haue', 'shall']
Counts : [190, 161, 153, 148, 125]
Max caesar
Count 190
Bigrams : [('let', 'vs'), ('wee', 'l'), ('mark', 'antony'), ('marke', 'antony'), ('st', 'thou')]
Counts : [16, 15, 13, 12, 12]
Bigram Max ('let', 'vs')
Bigram count 16
Trigrams : [('enter', 'lucius', 'luc'), ('wee', 'l', 'heare'), ('thee', 'thou', 'st'), ('beware', 'ides', 'march'), ('let', 'vs', 'heare')]
Counts : [4, 4, 3, 3, 3]
Trigram Max ('enter', 'lucius', 'luc')
Trigram count 4
5 Naive Bayes Classification
Naive Bayes is a probabilistic algorithm based on Bayes' theorem from probability theory and mathematical statistics.
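For reference, Bayes' theorem relates the probability of a class given the observed features to the reverse conditional probability:

    P(class | features) = P(features | class) * P(class) / P(features)

The "naive" part is the assumption that the features are conditionally independent given the class, so P(features | class) becomes a simple product of per-feature probabilities.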
Example code:
import nltk
import string
import random

# Stopword and punctuation sets
sw = set(nltk.corpus.stopwords.words('english'))
punctuation = set(string.punctuation)

# Use the word length as the single feature
def word_features(word):
    return {'len': len(word)}

# Is the word a stopword or punctuation?
def isStopword(word):
    return word in sw or word in punctuation

# Load the document
gb = nltk.corpus.gutenberg
words = gb.words("shakespeare-caesar.txt")

# Label each word: True if it is a stopword, False otherwise
labeled_words = ([(word.lower(), isStopword(word.lower())) for word in words])
random.seed(42)
random.shuffle(labeled_words)
print(labeled_words[:5])

# Build (features, label) pairs; here n is the word and word is its stopword label
featuresets = [(word_features(n), word) for (n, word) in labeled_words]

# Split into training (90%) and test (10%) sets
cutoff = int(.9 * len(featuresets))
train_set, test_set = featuresets[:cutoff], featuresets[cutoff:]

# Train a naive Bayes classifier and try it on two words
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("'behold' class", classifier.classify(word_features('behold')))
print("'the' class", classifier.classify(word_features('the')))

# Compute the classifier's accuracy on the test set
print("Accuracy", nltk.classify.accuracy(classifier, test_set))

# The most informative features
print(classifier.show_most_informative_features(5))
Output:
[('i', True), ('is', True), ('in', True), ('he', True), ('ambitious', False)]
'behold' class False
'the' class True
Accuracy 0.8521671826625387
Most Informative Features
len = 7 False : True = 77.8 : 1.0
len = 6 False : True = 52.2 : 1.0
len = 1 True : False = 51.8 : 1.0
len = 2 True : False = 10.9 : 1.0
len = 5 False : True = 10.9 : 1.0
None
6 Sentiment Analysis
Example code:
import random
import string
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy

def getElementsByNum(values, num, keyOrValue):
    """
    Return the keys or values of the num most frequent entries in values.
    :param values: a frequency distribution (dict-like)
    :param num: number of entries to return
    :param keyOrValue: 0 to return keys, 1 to return values
    """
    tmpValue = []
    for key in sorted(values.items(), key=lambda d: d[1], reverse=True)[:num]:
        tmpValue.append(key[keyOrValue])
    return tmpValue

# Load the movie review corpus, labeled by category (pos/neg)
labeled_docs = [(list(movie_reviews.words(fid)), cat)
                for cat in movie_reviews.categories()
                for fid in movie_reviews.fileids(cat)]
random.seed(42)
random.shuffle(labeled_docs)
review_words = movie_reviews.words()
print("#Review Words", len(review_words))

# Stopword and punctuation sets
sw = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# Is the word a stopword or punctuation?
def isStopWord(word):
    return word in sw or word in punctuation

# Filter out stopwords and punctuation
filtered = [w.lower() for w in review_words if not isStopWord(w.lower())]
print("# After filter", len(filtered))

# Use the top 5% most frequent words as features
words = FreqDist(filtered)
N = int(.05 * len(words.keys()))
word_features = getElementsByNum(words, N, 0)
# print('word_features', word_features)

# Use raw word counts as the feature values
def doc_features(doc):
    doc_words = FreqDist(w for w in doc if not isStopWord(w))
    features = {}
    for word in word_features:
        features['count (%s)' % word] = doc_words.get(word, 0)
    return features

featuresets = [(doc_features(d), c) for (d, c) in labeled_docs]

# Split into training and test sets
train_set, test_set = featuresets[200:], featuresets[:200]

# Train the classifier and compute its accuracy on the test set
classifier = NaiveBayesClassifier.train(train_set)
print("Accuracy", accuracy(classifier, test_set))

# The most informative features
print(classifier.show_most_informative_features())
Output:
#Review Words 1583820
# After filter 710579
Accuracy 0.765
Most Informative Features
count (wonderful) = 2 pos : neg = 14.8 : 1.0
count (outstanding) = 1 pos : neg = 12.0 : 1.0
count (apparently) = 2 neg : pos = 12.0 : 1.0
count (stupid) = 2 neg : pos = 11.1 : 1.0
count (boring) = 2 neg : pos = 10.7 : 1.0
count (bad) = 5 neg : pos = 10.0 : 1.0
count (best) = 4 pos : neg = 9.9 : 1.0
count (anyway) = 2 neg : pos = 8.1 : 1.0
count (minute) = 2 neg : pos = 8.1 : 1.0
count (matt) = 2 pos : neg = 7.9 : 1.0
None
7 Creating a Word Cloud
Example code:
import string
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import FreqDist

def getElementsByNum(values, num, keyOrValue):
    """
    Return the keys or values of the num most frequent entries in values.
    :param values: a frequency distribution (dict-like)
    :param num: number of entries to return
    :param keyOrValue: 0 to return keys, 1 to return values
    """
    tmpValue = []
    for key in sorted(values.items(), key=lambda d: d[1], reverse=True)[:num]:
        tmpValue.append(key[keyOrValue])
    return tmpValue

# Stopword and punctuation sets
sw = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# Is the word a stopword or punctuation?
def isStopWord(word):
    return word in sw or word in punctuation

# Get the raw corpus
review_words = movie_reviews.words()

# Filter out stopwords and punctuation
filtered = [w.lower() for w in review_words if not isStopWord(w.lower())]

# Use the top 1% most frequent words as tags
words = FreqDist(filtered)
N = int(.01 * len(words.keys()))
tags = getElementsByNum(words, N, 0)
for tag in tags:
    print(tag, ":", words[tag])

# Paste the output into the Wordle page to get a word cloud
Output: omitted
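Instead of pasting the frequencies into the Wordle page, the cloud can also be rendered locally. A minimal sketch, assuming the third-party wordcloud package (pip install wordcloud) and matplotlib are installed, and reusing the words and tags computed above:

import matplotlib.pyplot as plt
from wordcloud import WordCloud   # third-party package, not part of NLTK

# Build a {word: frequency} mapping for the top tags computed above
freqs = {tag: words[tag] for tag in tags}

# Render and display the cloud
wc = WordCloud(width=800, height=400, background_color='white')
wc.generate_from_frequencies(freqs)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()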
Further filtering
Term frequency and inverse document frequency (Term Frequency-Inverse Document Frequency, TF-IDF)
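As a rough guide, TF-IDF weights a term t in document d by how often it occurs there, discounted by how many of the N documents contain it:

    tfidf(t, d) = tf(t, d) * idf(t),  where idf(t) = log(N / df(t))

scikit-learn's TfidfVectorizer uses a smoothed variant of idf and normalizes each document vector, so its absolute values differ from this textbook formula, but the idea of down-weighting terms that appear everywhere is the same.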
Example code:
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.corpus import names
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import string

# Stopwords, punctuation, and personal names
sw = set(stopwords.words('english'))
punctuation = set(string.punctuation)
all_names = set([name.lower() for name in names.words()])

# Filter out stopwords, punctuation, names, and numbers
def isStopWord(word):
    return (word in sw or word in punctuation) or not word.isalpha() or word in all_names

# Get the movie review corpus
review_words = movie_reviews.words()

# Filter the corpus and build a frequency distribution
filtered = [w.lower() for w in review_words if not isStopWord(w.lower())]
words = FreqDist(filtered)

# Build the list of strings TfidfVectorizer needs
# (drop filtered words and words that occur only once)
texts = []
for fid in movie_reviews.fileids():
    texts.append(" ".join([w.lower() for w in movie_reviews.words(fid)
                           if not isStopWord(w.lower()) and words[w.lower()] > 1]))

# Create the vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
matrix = vectorizer.fit_transform(texts)

# Sum each word's TF-IDF weight over all documents
sums = np.array(matrix.sum(axis=0)).ravel()

# Pair each word with its summed weight
# (itertools.izip is Python 2 only; zip works in Python 3)
ranks = []
for word, val in zip(vectorizer.get_feature_names(), sums):
    ranks.append((word, val))

# Build a DataFrame and sort by weight
# (DataFrame.sort was removed; use sort_values instead)
df = pd.DataFrame(ranks, columns=['term', 'tfidf'])
df = df.sort_values(by='tfidf')

# The lowest-ranked terms
print(df.head())

# The top 1% of terms
N = int(.01 * len(df))
df = df.tail(N)
for term, tfidf in zip(df['term'].values, df['tfidf'].values):
    print(term, ":", tfidf)
Output:
term tfidf
19963 superintendent 0.03035
8736 greys 0.03035
14010 ology 0.03035
2406 briefer 0.03035
2791 cannibalize 0.03035
matter : 10.1601563202
review : 10.1621092081
...
jokes : 10.1950553877
8 Social Network Analysis
Installing NetworkX: omitted
Social network analysis applies network theory to the study of social relationships.
Example code:
import matplotlib.pyplot as plt
import networkx as nx

# The sample graphs that NetworkX provides
print([s for s in dir(nx) if s.endswith("graph")])

# Load the Davis Southern Women social network
G = nx.davis_southern_women_graph()

# Histogram of the node degrees
# (with NetworkX 2.x, use dict(nx.degree(G)).values() instead)
plt.figure(1)
plt.hist(list(nx.degree(G).values()))

# Draw the graph with node labels
plt.figure(2)
pos = nx.spring_layout(G)
nx.draw(G, pos, node_size=9)
nx.draw_networkx_labels(G, pos)
plt.show()
Output:
['LCF_graph', 'barabasi_albert_graph', 'barbell_graph', 'binomial_graph', 'bull_graph', 'caveman_graph', 'chordal_cycle_graph', 'chvatal_graph', 'circulant_graph', 'circular_ladder_graph', 'complete_bipartite_graph', 'complete_graph', 'complete_multipartite_graph', 'connected_caveman_graph', 'connected_watts_strogatz_graph', 'cubical_graph', 'cycle_graph', 'davis_southern_women_graph', 'dense_gnm_random_graph', 'desargues_graph', 'diamond_graph', 'digraph', 'directed_havel_hakimi_graph', 'dodecahedral_graph', 'dorogovtsev_goltsev_mendes_graph', 'duplication_divergence_graph', 'ego_graph', 'empty_graph', 'erdos_renyi_graph', 'expected_degree_graph', 'fast_gnp_random_graph', 'florentine_families_graph', 'frucht_graph', 'gaussian_random_partition_graph', 'general_random_intersection_graph', 'geographical_threshold_graph', 'gn_graph', 'gnc_graph', 'gnm_random_graph', 'gnp_random_graph', 'gnr_graph', 'graph', 'grid_2d_graph', 'grid_graph', 'havel_hakimi_graph', 'heawood_graph', 'house_graph', 'house_x_graph', 'hypercube_graph', 'icosahedral_graph', 'is_directed_acyclic_graph', 'k_random_intersection_graph', 'karate_club_graph', 'kl_connected_subgraph', 'krackhardt_kite_graph', 'ladder_graph', 'line_graph', 'lollipop_graph', 'make_max_clique_graph', 'make_small_graph', 'margulis_gabber_galil_graph', 'moebius_kantor_graph', 'multidigraph', 'multigraph', 'navigable_small_world_graph', 'newman_watts_strogatz_graph', 'null_graph', 'nx_agraph', 'octahedral_graph', 'pappus_graph', 'path_graph', 'petersen_graph', 'planted_partition_graph', 'powerlaw_cluster_graph', 'projected_graph', 'quotient_graph', 'random_clustered_graph', 'random_degree_sequence_graph', 'random_geometric_graph', 'random_partition_graph', 'random_regular_graph', 'random_shell_graph', 'relabel_gexf_graph', 'relaxed_caveman_graph', 'scale_free_graph', 'sedgewick_maze_graph', 'star_graph', 'stochastic_graph', 'subgraph', 'tetrahedral_graph', 'to_networkx_graph', 'trivial_graph', 'truncated_cube_graph', 'truncated_tetrahedron_graph', 'tutte_graph', 'uniform_random_intersection_graph', 'watts_strogatz_graph', 'waxman_graph', 'wheel_graph']
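Beyond plotting, the same graph can be summarized with standard network measures such as degree and betweenness centrality, which pick out the most connected and the most "bridging" members. A minimal sketch, assuming the same Davis Southern Women graph as above:

import networkx as nx

G = nx.davis_southern_women_graph()

# Degree centrality: the fraction of other nodes each node is connected to
deg = nx.degree_centrality(G)

# Betweenness centrality: how often a node lies on shortest paths between others
btw = nx.betweenness_centrality(G)

# Print the five most central members by each measure
print(sorted(deg, key=deg.get, reverse=True)[:5])
print(sorted(btw, key=btw.get, reverse=True)[:5])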