
Python Data Analysis Study Notes (9)

Chapter 9: Analyzing Textual Data and Social Media

1 Installing NLTK (omitted)

2 Filtering out stopwords, names, and numbers

Sample code:

import nltk

# Load the English stopword corpus
sw = set(nltk.corpus.stopwords.words('english'))
print('Stop words', list(sw)[:7])

# List some of the files in the Gutenberg corpus
gb = nltk.corpus.gutenberg
print('Gutenberg files', gb.fileids()[-5:])

# Take the first two sentences of milton-paradise.txt as the text to filter
text_sent = gb.sents("milton-paradise.txt")[:2]
print('Unfiltered', text_sent)

# Filter out stopwords
for sent in text_sent:
    filtered = [w for w in sent if w.lower() not in sw]
    print('Filtered', filtered)
    # Tag the remaining words with their parts of speech
    tagged = nltk.pos_tag(filtered)
    print("Tagged", tagged)
    # Drop proper nouns (NNP) and cardinal numbers (CD)
    words = []
    for word in tagged:
        if word[1] != 'NNP' and word[1] != 'CD':
            words.append(word[0])
    print(words)

# Part-of-speech tag set mapping
# print(nltk.tag.tagset_mapping('ru-rnc', 'universal'))

Output:

Stop words ['his', 'only', 'because', 'with', 'each', 'myself', 'both']

Gutenberg files ['milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

Unfiltered [['[', 'Paradise', 'Lost', 'by', 'John', 'Milton', '1667', ']'], ['Book', 'I']]

Filtered ['[', 'Paradise', 'Lost', 'John', 'Milton', '1667', ']']

Tagged [('[', 'JJ'), ('Paradise', 'NNP'), ('Lost', 'NNP'), ('John', 'NNP'), ('Milton', 'NNP'), ('1667', 'CD'), (']', 'NN')]

['[', ']']

Filtered ['Book']

Tagged [('Book', 'NN')]

['Book']

The tag set used in this example:

{'PRP$', 'PDT', 'CD', 'EX', '.', 'NNS', 'MD', 'PRP', 'RP', '(', 'VBD', '``', "''",
 'NN' (noun), 'LS', 'VBN', 'WRB', 'IN' (preposition), 'FW', 'POS', 'CC' (coordinating conjunction),
 ':', 'DT', 'VBZ', 'RBS', 'RBR', 'WP$', 'RB', 'SYM', 'JJS', 'JJR', 'UH', 'WDT',
 '#', ',', ')', 'VB', 'NNPS', 'VBP' (verb), 'NNP', 'JJ' (adjective), 'WP', 'VBG', '$', 'TO' (the word "to")}

These tags can be roughly grouped into the following 12 universal types (a mapping sketch follows this list):

'VERB',
'NOUN',
'PRON',
'ADJ',
'ADV',
'ADP',
'CONJ',
'DET',
'NUM',
'PRT',
'X',
'.'
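
A quick sketch of how NLTK maps Penn Treebank tags onto these universal categories; map_tag and the tagset='universal' option are standard NLTK features, and the sample words here are only for illustration:

import nltk
from nltk.tag import map_tag

# Map individual Penn Treebank tags onto the 12-tag universal tag set
print(map_tag('en-ptb', 'universal', 'NNP'))   # NOUN
print(map_tag('en-ptb', 'universal', 'VBZ'))   # VERB
print(map_tag('en-ptb', 'universal', 'JJ'))    # ADJ

# Or ask pos_tag for universal tags directly
print(nltk.pos_tag(['Paradise', 'was', 'lost'], tagset='universal'))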

3 The bag-of-words model

Installing scikit-learn: omitted

Sample code:

import nltk
from sklearn.feature_extraction.text import CountVectorizer

# Load these two files from the Gutenberg corpus
gb = nltk.corpus.gutenberg
hamlet = gb.raw('shakespeare-hamlet.txt')
macbeth = gb.raw("shakespeare-macbeth.txt")

# Remove English stopwords
cv = CountVectorizer(stop_words='english')
# Print the feature vectors
print("Feature vector", cv.fit_transform([hamlet, macbeth]).toarray())
# Feature names are sorted alphabetically
# (newer scikit-learn versions use get_feature_names_out() instead)
print('Features', cv.get_feature_names()[:5])

Output:

Feature vector [[ 1  0  1..., 14  0  1]

 [0  1 0 ...,  1  1  0]]

Features ['1599', '1603', 'abhominably', 'abhorred', 'abide']

4 Word frequency analysis

Sample code:

import nltk
import string


def printLine(values, num, keyOrValue, tag):
    """
    Print the keys or values of the num highest-count items in the given
    frequency distribution, labelled with tag.
    :param values: frequency distribution (dict-like)
    :param num: number of items to print
    :param keyOrValue: 0 to print the keys, 1 to print the values
    :param tag: label for the output
    :return:
    """

    tmpValue = []
    for key in sorted(values.items(), key=lambda d: d[1], reverse=True)[:num]:
        tmpValue.append(key[keyOrValue])
    print(tag, ":", tmpValue)


# Load the document
gb = nltk.corpus.gutenberg
words = gb.words("shakespeare-caesar.txt")

# Remove stopwords and punctuation
sw = set(nltk.corpus.stopwords.words('english'))
punctuation = set(string.punctuation)
filtered = [w.lower() for w in words if w.lower() not in sw and w.lower() not in punctuation]

# Build a FreqDist object and print the highest-frequency keys and values
fd = nltk.FreqDist(filtered)
printLine(fd, 5, 0, "Words")
printLine(fd, 5, 1, "Counts")

# The most common word and its count
print('Max', fd.max())
print('Count', fd['caesar'])

# The most common bigrams and their counts
fd = nltk.FreqDist(nltk.bigrams(filtered))
printLine(fd, 5, 0, "Bigrams")
printLine(fd, 5, 1, "Counts")
print('Bigram Max', fd.max())
print('Bigram count', fd[('let', 'vs')])

# The most common trigrams and their counts
fd = nltk.FreqDist(nltk.trigrams(filtered))
printLine(fd, 5, 0, "Trigrams")
printLine(fd, 5, 1, "Counts")
print('Trigram Max', fd.max())
print('Trigram count', fd[('enter', 'lucius', 'luc')])

Output:

Words : ['caesar', 'brutus', 'bru', 'haue', 'shall']

Counts : [190, 161, 153, 148, 125]

Max caesar

Count 190

Bigrams : [('let', 'vs'), ('wee', 'l'), ('mark', 'antony'), ('marke', 'antony'), ('st', 'thou')]

Counts : [16, 15, 13, 12, 12]

Bigram Max ('let', 'vs')

Bigram count 16

Trigrams : [('enter', 'lucius', 'luc'), ('wee', 'l', 'heare'), ('thee', 'thou', 'st'), ('beware', 'ides', 'march'), ('let', 'vs', 'heare')]

Counts : [4, 4, 3, 3, 3]

Trigram Max ('enter', 'lucius', 'luc')

Trigram count 4

5 Naive Bayes classification

Naive Bayes is a probabilistic algorithm based on Bayes' theorem from probability theory and mathematical statistics.
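
Before the NLTK example, a minimal sketch of the idea behind the classifier: for a feature value x and class c, it picks the class that maximizes P(c) * P(x | c). The word-length counts below are invented purely for illustration.

# Toy illustration of Bayes' theorem: P(class | feature) is proportional to P(feature | class) * P(class)
# The word-length counts below are invented purely for illustration.
counts = {
    True:  {1: 30, 2: 25, 3: 10},   # stopwords tend to be short
    False: {3: 20, 5: 40, 7: 35},   # content words tend to be longer
}

def classify(length):
    total = sum(sum(c.values()) for c in counts.values())
    best_class, best_score = None, -1.0
    for cls, feature_counts in counts.items():
        prior = sum(feature_counts.values()) / total                                # P(class)
        likelihood = feature_counts.get(length, 0) / sum(feature_counts.values())   # P(len | class)
        score = prior * likelihood                                                  # proportional to P(class | len)
        if score > best_score:
            best_class, best_score = cls, score
    return best_class

print(classify(2))  # True  -> short words look like stopwords
print(classify(7))  # False -> long words look like content words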

Sample code:

import nltk
import string
import random

# Stopword and punctuation sets
sw = set(nltk.corpus.stopwords.words('english'))
punctuation = set(string.punctuation)


# Use the word length as the single feature
def word_features(word):
    return {'len': len(word)}


# Is the word a stopword or a punctuation mark?
def isStopword(word):
    return word in sw or word in punctuation


# Load the document
gb = nltk.corpus.gutenberg
words = gb.words("shakespeare-caesar.txt")

# Label each word according to whether it is a stopword
labeled_words = ([(word.lower(), isStopword(word.lower())) for word in words])
random.seed(42)
random.shuffle(labeled_words)
print(labeled_words[:5])

# Use each word's length as its feature value
featuresets = [(word_features(word), label) for (word, label) in labeled_words]

# Train a naive Bayes classifier on 90% of the data
cutoff = int(.9 * len(featuresets))

# Split into training and test sets
train_set, test_set = featuresets[:cutoff], featuresets[cutoff:]

# Check the classifier on a couple of words
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("'behold' class", classifier.classify(word_features('behold')))
print("'the' class", classifier.classify(word_features('the')))

# Compute the classifier's accuracy on the test set
print("Accuracy", nltk.classify.accuracy(classifier, test_set))
# Most informative features
print(classifier.show_most_informative_features(5))

Output:

[('i', True), ('is', True), ('in', True), ('he', True), ('ambitious', False)]

'behold' class False

'the' class True

Accuracy 0.8521671826625387

Most Informative Features

                     len = 7               False : True   =    77.8 : 1.0

                     len = 6               False : True   =    52.2 : 1.0

                     len = 1                True : False  =    51.8 : 1.0

                     len = 2                True : False  =    10.9 : 1.0

                     len = 5               False : True   =    10.9 : 1.0

None

6 Sentiment analysis

Sample code:

import random
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
import string


def getElementsByNum(values, num, keyOrValue):
    """
    Return the keys or values of the num highest-count items in the given
    frequency distribution.
    :param values: frequency distribution (dict-like)
    :param num: number of items to return
    :param keyOrValue: 0 to return the keys, 1 to return the values
    :return:
    """

    tmpValue = []
    for key in sorted(values.items(), key=lambda d: d[1], reverse=True)[:num]:
        tmpValue.append(key[keyOrValue])
    return tmpValue


# Load the data
labeled_docs = [(list(movie_reviews.words(fid)), cat)
                for cat in movie_reviews.categories()
                for fid in movie_reviews.fileids(cat)]

random.seed(42)
random.shuffle(labeled_docs)

review_words = movie_reviews.words()
print("#Review Words", len(review_words))

# Stopword and punctuation sets
sw = set(stopwords.words('english'))
punctuation = set(string.punctuation)


# Is the word a stopword or a punctuation mark?
def isStopWord(word):
    return word in sw or word in punctuation


# Filter out stopwords and punctuation
filtered = [w.lower() for w in review_words if not isStopWord(w.lower())]
print("# After filter", len(filtered))

# Use the top 5% of words by frequency as features
words = FreqDist(filtered)
N = int(.05 * len(words.keys()))

# word_features = words.keys()[:N]
word_features = getElementsByNum(words, N, 0)
print('word_features', word_features)


# Use raw word counts as the metric
def doc_features(doc):
    doc_words = FreqDist(w for w in doc if not isStopWord(w))
    features = {}
    for word in word_features:
        features['count (%s)' % word] = (doc_words.get(word, 0))
    return features


# Use the raw word counts as feature values
featuresets = [(doc_features(d), c) for (d, c) in labeled_docs]

# Split into training and test sets
train_set, test_set = featuresets[200:], featuresets[:200]
# Train the classifier
classifier = NaiveBayesClassifier.train(train_set)
# Compute the classifier's accuracy on the test set
print("Accuracy", accuracy(classifier, test_set))

# Most informative features
print(classifier.show_most_informative_features())

Output:

#Review Words 1583820

# After filter 710579

Accuracy 0.765

Most Informative Features

      count (wonderful) = 2                pos : neg    =     14.8 : 1.0

    count (outstanding) = 1                pos : neg    =     12.0 : 1.0

     count (apparently) = 2                neg : pos    =     12.0 : 1.0

         count (stupid) = 2                neg : pos    =     11.1 : 1.0

         count (boring) = 2                 neg : pos    =    10.7 : 1.0

            count (bad) = 5                neg : pos    =     10.0 : 1.0

           count (best) = 4                pos : neg    =      9.9 : 1.0

         count (anyway) = 2                neg : pos    =      8.1 : 1.0

         count (minute) = 2                neg : pos    =      8.1 : 1.0

           count (matt) = 2                pos : neg    =      7.9 : 1.0

None

7 Creating a word cloud

Sample code:

from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import FreqDist
import string


def getElementsByNum(values, num, keyOrValue):
    """
    Return the keys or values of the num highest-count items in the given
    frequency distribution.
    :param values: frequency distribution (dict-like)
    :param num: number of items to return
    :param keyOrValue: 0 to return the keys, 1 to return the values
    :return:
    """

    tmpValue = []
    for key in sorted(values.items(), key=lambda d: d[1], reverse=True)[:num]:
        tmpValue.append(key[keyOrValue])
    return tmpValue


# Stopword and punctuation sets
sw = set(stopwords.words('english'))
punctuation = set(string.punctuation)


# Is the word a stopword or a punctuation mark?
def isStopWord(word):
    return word in sw or word in punctuation


# Load the raw documents
review_words = movie_reviews.words()
# Filter out stopwords and punctuation
filtered = [w.lower() for w in review_words if not isStopWord(w.lower())]

# Use the top 1% of words by frequency as tags
words = FreqDist(filtered)
N = int(.01 * len(words.keys()))
tags = getElementsByNum(words, N, 0)
# tags = words.keys()[:N]

for tag in tags:
    print(tag, ":", words[tag])

# Paste the output into the Wordle page to generate a word cloud

Output: omitted
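
As an alternative to pasting into Wordle, the cloud can be rendered locally. A minimal sketch, assuming the third-party wordcloud package is installed (pip install wordcloud; it is not used in the book) and reusing the words FreqDist and tags list built above:

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Frequencies for the top 1% of words selected above
freqs = {tag: words[tag] for tag in tags}
wc = WordCloud(width=800, height=400).generate_from_frequencies(freqs)

plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()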

Further filtering

Term Frequency-Inverse Document Frequency (TF-IDF)
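
The intuition: a term's weight in a document grows with how often it occurs there (term frequency) and shrinks with how many documents contain it (inverse document frequency). A minimal sketch of the basic formula; scikit-learn's TfidfVectorizer, used below, applies a smoothed and normalized variant, and the toy documents here are made up for illustration.

import math

def tf_idf(term, doc, docs):
    # Term frequency: relative frequency of the term in this document
    tf = doc.count(term) / len(doc)
    # Inverse document frequency: rare terms across the corpus get a higher weight
    df = sum(1 for d in docs if term in d)
    idf = math.log(len(docs) / df) if df else 0.0
    return tf * idf

docs = [["brutus", "caesar", "caesar"], ["caesar", "rome"], ["brutus", "rome", "senate"]]
print(tf_idf("caesar", docs[0], docs))  # common term -> lower weight
print(tf_idf("senate", docs[2], docs))  # rare term   -> higher weight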

Sample code:

from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.corpus import names
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools
import pandas as pd
import numpy as np
import string

# Stopword, punctuation, and name sets
sw = set(stopwords.words('english'))
punctuation = set(string.punctuation)
all_names = set([name.lower() for name in names.words()])


# Filter out stopwords, punctuation, names, and numbers
def isStopWord(word):
    return (word in sw or word in punctuation) or not word.isalpha() or word in all_names


# Load the movie review documents
review_words = movie_reviews.words()
# Filter out stopwords
filtered = [w.lower() for w in review_words if not isStopWord(w.lower())]
words = FreqDist(filtered)

# Build the list of strings that TfidfVectorizer needs
# (filtering out stopwords and words that occur only once)
texts = []
for fid in movie_reviews.fileids():
    texts.append(" ".join([w.lower()
                           for w in movie_reviews.words(fid)
                           if not isStopWord(w.lower()) and words[w.lower()] > 1]))

# Create the vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
matrix = vectorizer.fit_transform(texts)
# Sum each term's TF-IDF over all documents
sums = np.array(matrix.sum(axis=0)).ravel()

# Rank the terms by their summed weights
ranks = []

# Not usable in Python 3 (itertools.izip was removed)
# for word, val in itertools.izip(vectorizer.get_feature_names(), sums):
for word, val in zip(vectorizer.get_feature_names(), sums):
    ranks.append((word, val))

# Build a DataFrame
df = pd.DataFrame(ranks, columns=['term', 'tfidf'])

# and sort it
# df = df.sort(columns='tfidf')  # removed in newer pandas
df = df.sort_values(by='tfidf')
# Print the lowest-ranked terms
print(df.head())

N = int(.01 * len(df))
df = df.tail(N)

# Not usable in Python 3 (itertools.izip was removed)
# for term, tfidf in itertools.izip(df['term'].values, df['tfidf'].values):
for term, tfidf in zip(df['term'].values, df['tfidf'].values):
    print(term, ":", tfidf)

Output:

                 term    tfidf

19963 superintendent  0.03035

8736            greys  0.03035

14010           ology  0.03035

2406          briefer  0.03035

2791     cannibalize  0.03035

matter : 10.1601563202

review : 10.1621092081

...

jokes : 10.1950553877

8 Social network analysis

Installing NetworkX: omitted

Social network analysis applies network (graph) theory to the study of social relationships; a small centrality sketch follows the example output below.

Sample code:

import matplotlib.pyplot as plt
import networkx as nx

# Sample graphs provided by NetworkX
print([s for s in dir(nx) if s.endswith("graph")])

G = nx.davis_southern_women_graph()
plt.figure(1)
# In NetworkX 2.x+, nx.degree(G) returns a view; use dict(G.degree()).values() there
plt.hist(list(nx.degree(G).values()))
plt.figure(2)
pos = nx.spring_layout(G)
# Pass pos to nx.draw as well so the labels line up with the nodes
nx.draw(G, pos, node_size=9)
nx.draw_networkx_labels(G, pos)
plt.show()

Output:

['LCF_graph', 'barabasi_albert_graph','barbell_graph', 'binomial_graph', 'bull_graph', 'caveman_graph','chordal_cycle_graph', 'chvatal_graph', 'circulant_graph','circular_ladder_graph', 'complete_bipartite_graph', 'complete_graph','complete_multipartite_graph', 'connected_caveman_graph','connected_watts_strogatz_graph', 'cubical_graph', 'cycle_graph','davis_southern_women_graph', 'dense_gnm_random_graph', 'desargues_graph','diamond_graph', 'digraph', 'directed_havel_hakimi_graph','dodecahedral_graph', 'dorogovtsev_goltsev_mendes_graph','duplication_divergence_graph', 'ego_graph', 'empty_graph','erdos_renyi_graph', 'expected_degree_graph', 'fast_gnp_random_graph','florentine_families_graph', 'frucht_graph', 'gaussian_random_partition_graph','general_random_intersection_graph', 'geographical_threshold_graph','gn_graph', 'gnc_graph', 'gnm_random_graph', 'gnp_random_graph', 'gnr_graph','graph', 'grid_2d_graph', 'grid_graph', 'havel_hakimi_graph', 'heawood_graph','house_graph', 'house_x_graph', 'hypercube_graph', 'icosahedral_graph','is_directed_acyclic_graph', 'k_random_intersection_graph','karate_club_graph', 'kl_connected_subgraph', 'krackhardt_kite_graph','ladder_graph', 'line_graph', 'lollipop_graph', 'make_max_clique_graph','make_small_graph', 'margulis_gabber_galil_graph', 'moebius_kantor_graph','multidigraph', 'multigraph', 'navigable_small_world_graph','newman_watts_strogatz_graph', 'null_graph', 'nx_agraph', 'octahedral_graph','pappus_graph', 'path_graph', 'petersen_graph', 'planted_partition_graph','powerlaw_cluster_graph', 'projected_graph', 'quotient_graph','random_clustered_graph', 'random_degree_sequence_graph','random_geometric_graph', 'random_partition_graph', 'random_regular_graph','random_shell_graph', 'relabel_gexf_graph', 'relaxed_caveman_graph','scale_free_graph', 'sedgewick_maze_graph', 'star_graph', 'stochastic_graph','subgraph', 'tetrahedral_graph', 'to_networkx_graph', 'trivial_graph','truncated_cube_graph', 'truncated_tetrahedron_graph', 'tutte_graph','uniform_random_intersection_graph', 'watts_strogatz_graph', 'waxman_graph','wheel_graph']
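
A small follow-up sketch, not from the book: degree centrality is the simplest network-theory measure of who is most connected in this graph, and NetworkX computes it directly.

import networkx as nx

G = nx.davis_southern_women_graph()

# Degree centrality: the fraction of other nodes each node is directly linked to
centrality = nx.degree_centrality(G)
top = sorted(centrality.items(), key=lambda item: item[1], reverse=True)[:3]
for node, value in top:
    print(node, ":", round(value, 3))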