【機器學習】文字資料的向量化(TF-IDF)---樣本集例項講解+python實現
阿新 • • 發佈:2019-02-11
1.文字資料的向量化
1.1名詞解釋
CF:文件集的頻率,是指詞在文件集中出現的次數
DF:文件頻率,是指出現詞的文件數
IDF:逆文件頻率,idf = log(N/(1+df))(本文實現取以 10 為底的對數),N為所有文件的數目;分母取 1+df 是為了避免 df=0 時出現除以零的情況。
TF:詞在文件中的頻率
TF-IDF:TF-IDF= TF*IDF
1.2文字資料樣本集
為了講解文字資料的向量化,假設我們有4個文字,所有文字一共有6個不同的詞,如下所示。
doc1 | iphone | guuci | huawei | watch | huawei | |
doc2 | huawei | watch | iphone | watch | iphone | guuci |
doc3 | skirt | skirt | skirt | flower | ||
doc4 | watch | watch | huawei |
1.3計算彙總
iphone | watch | guuci | huawei | skirt | flower | |
doc1 TF | 1/5 | 1/5 | 1/5 | 2/5 | 0 | 0 |
doc2 TF | 2/6 | 2/6 | 1/6 | 1/6 | 0 | 0 |
doc3 TF | 0 | 0 | 0 | 0 | 3/4 | 1/4 |
doc4 TF | 0 | 2/3 | 0 | 1/3 | 0 | 0 |
DF 含詞的文件數 | 2 | 3 | 2 | 3 | 1 | 1 |
IDF 逆文件頻率 =log(N/(1+DF)) | log(4/(1+2)) =log(4/3) | log(4/(1+3)) =log(4/4) | log(4/(1+2)) =log(4/3) | log(4/(1+3)) =log(4/4) | log(4/(1+1)) =log(4/2) | log(4/(1+1)) =log(4/2) |
doc1 TFIDF | 1/5*log(4/3) | 1/5*log(4/4) | 1/5*log(4/3) | 2/5*log(4/4) | 0 | 0 |
doc2 TFIDF | 2/6*log(4/3) | 2/6*log(4/4) | 1/6*log(4/3) | 1/6*log(4/4) | 0 | 0 |
doc3 TFIDF | 0 | 0 | 0 | 0 | 3/4*log(4/2) | 1/4*log(4/2) |
doc4TFIDF | 0 | 2/3*log(4/4) | 0 | 1/3*log(4/4) | 0 | 0 |
1.4實現tf-idf
下面手動實現一遍;相對來說,TF-IDF 的實現還比較簡單。
# -*- coding: utf-8 -*-
"""
Author: tom
Talk is cheap, show me the code
Aim: TF-IDF vectorization of text data (hand-rolled vs. scikit-learn)
"""
import numpy as np


def sklearn_tfidf():
    """Print the TF-IDF matrix computed by scikit-learn for the demo corpus.

    NOTE: sklearn's TfidfTransformer uses a smoothed idf
    (ln((1+N)/(1+df)) + 1) and L2-normalizes each row, so its numbers
    intentionally differ from tfidf_alg()'s log10-based values.
    """
    # Imported locally so the module stays importable without scikit-learn.
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    tag_list = ['iphone guuci huawei watch huawei',
                'huawei watch iphone watch iphone guuci',
                'skirt skirt skirt flower',
                'watch watch huawei']
    vectorizer = CountVectorizer()            # documents -> term-count matrix
    X = vectorizer.fit_transform(tag_list)    # raw counts per document
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)      # counts -> TF-IDF weights
    print(tfidf.toarray())


def tfidf_alg():
    """Compute CF, TF, DF, IDF and TF-IDF by hand for a tiny demo corpus.

    Definitions (matching the article text):
        CF    : count of the word inside one document
        TF    : CF / (number of tokens in that document)
        DF    : number of documents that contain the word
        IDF   : log10(N / (1 + DF)), N = number of documents
        TFIDF : TF * IDF

    Returns:
        np.ndarray of shape (n_docs, n_words) holding the TF-IDF weights.
    """
    docs = np.array(['iphone guuci huawei watch huawei',
                     'huawei watch iphone watch iphone guuci',
                     'skirt skirt skirt flower',
                     'watch watch huawei'])
    words = np.array(['iphone', 'guuci', 'huawei', 'watch', 'skirt', 'flower'])

    # Tokenize once.  The original used str.count()/str.find(), which match
    # *substrings* ('watch' would also be found inside 'watchman'); counting
    # whole tokens avoids that latent bug.
    token_lists = [doc.split() for doc in docs]

    # CF: raw counts, one row per document, one column per word.
    cfs = np.array([[tokens.count(word) for word in words]
                    for tokens in token_lists])
    print('CF:\n', cfs)

    # TF: normalize each row by that document's token count.
    tfs = cfs / cfs.sum(axis=1, keepdims=True)
    print('TF:\n', tfs)

    # One-hot presence matrix (one row per word) and DF per word.
    oneHots = np.array([[int(word in tokens) for tokens in token_lists]
                        for word in words])
    dfs = oneHots.sum(axis=1)
    print('oneHots:\n', oneHots)
    print('DF:', dfs)

    # IDF = log10(N / (1 + DF)); the 1+DF keeps the denominator nonzero
    # even for a word that appears in no document.
    N = docs.shape[0]
    idfs = np.log10(N / (1.0 + dfs))
    print('IDF:', idfs)

    # TF-IDF: broadcast the per-word IDF vector over every document row.
    tfidfs = tfs * idfs

    print('==================result============================')
    print('\ndocs:\n', docs)
    print('\nwords:\n', words)
    print('\noneHots:\n', oneHots)
    print('\nCF:\n', cfs)
    print('\nTF:\n', tfs)
    print('\nDF:\n', dfs)
    print('\nIDF:\n', idfs)
    print('\nTF-IDF:\n', tfidfs)
    print('==============================================')
    return tfidfs


if __name__ == '__main__':
    tfidf_alg()
    # sklearn_tfidf()
1.5執行結果
==================result============================
docs:
['iphone guuci huawei watch huawei'
'huawei watch iphone watch iphone guuci' 'skirt skirt skirt flower'
'watch watch huawei']
words:
['iphone' 'guuci' 'huawei' 'watch' 'skirt' 'flower']
oneHots:
[[1 1 0 0]
[1 1 0 0]
[1 1 0 1]
[1 1 0 1]
[0 0 1 0]
[0 0 1 0]]
CF:
[[1 1 2 1 0 0]
[2 1 1 2 0 0]
[0 0 0 0 3 1]
[0 0 1 2 0 0]]
TF:
[[ 0.2 0.2 0.4 0.2 0. 0. ]
[ 0.33333333 0.16666667 0.16666667 0.33333333 0. 0. ]
[ 0. 0. 0. 0. 0.75 0.25 ]
[ 0. 0. 0.33333333 0.66666667 0. 0. ]]
DF:
[2 2 3 3 1 1]
IDF:
[ 0.12493874 0.12493874 0. 0. 0.30103 0.30103 ]
TF-IDF:
[[ 0.02498775 0.02498775 0. 0. 0. 0. ]
[ 0.04164625 0.02082312 0. 0. 0. 0. ]
[ 0. 0. 0. 0. 0.2257725 0.0752575 ]
[ 0. 0. 0. 0. 0. 0. ]]
==============================================
(end)