字典統計詞頻
阿新 • 發佈:2018-11-01
import pandas as pd
import numpy as np
# Column B packs several values into one comma-separated string per row
# (i.e. ['', '', '']), which is what lets each cell be split later with
# str.split. Nested lists ([[], [], []]) would not work here, because a list
# has no split method.
temp = pd.DataFrame(
    data={
        'A': [1, 2, 3],
        'B': ['4,2,1', '5,3,2', '6,4,3'],
    },
    index=['a', 'b', 'c'],
)
print(temp)
# Expected output:
#    A      B
# a  1  4,2,1
# b  2  5,3,2
# c  3  6,4,3
# Walk the frame one row at a time. iterrows() yields (index label, row)
# pairs, and each row is a Series keyed by column name.
for index, row in temp[['A', 'B']].iterrows():
    # Index label of the current row; over the three iterations: a, b, c.
    print(index)

    # The whole row as a Series. Over the three iterations this prints:
    #   A        1
    #   B    4,2,1
    #   Name: a, dtype: object
    #   A        2
    #   B    5,3,2
    #   Name: b, dtype: object
    #   A        3
    #   B    6,4,3
    #   Name: c, dtype: object
    print(row)

    # A single cell, looked up by column name; prints 1, 2, 3.
    print(row['A'])

    # The comma-separated string cell; prints 4,2,1 then 5,3,2 then 6,4,3.
    print(row['B'])
# Word-frequency count, approach 1 (the simpler one): a defaultdict whose
# missing keys start at 0, so no existence check is needed before incrementing.
from collections import defaultdict

back = defaultdict(int)
# Only column B is needed, so iterate its cells directly.
for cell in temp['B']:
    # The cells are comma-separated, hence split(','). For a
    # whitespace-separated column, call split() with no argument instead.
    word_list = cell.split(',')
    for word in word_list:
        # Reading back[word] before this increment would give 0 (and insert
        # the key), because the default factory supplies 0 for unseen words.
        back[word] += 1
        # Running count per word; over the whole frame: 1 1 1 1 1 2 1 2 2.
        print(back[word])
print(back)
# defaultdict(<class 'int'>, {'4': 2, '2': 2, '1': 1, '5': 1, '3': 2, '6': 1})
# Word-frequency count, approach 2: a plain dict. A word must already be a key
# before its count can be read, so the first occurrence is handled explicitly.
back = {}
# Only column B is needed, so iterate its cells directly.
for cell in temp['B']:
    word_list = cell.split(',')
    for word in word_list:
        # Reading back[word] here for an unseen word raises KeyError.
        try:
            back[word] = back[word] + 1
        except KeyError:
            # First time this word is seen. Only KeyError is caught, so any
            # unrelated error still surfaces instead of being swallowed.
            back[word] = 1
        # Running count per word; over the whole frame: 1 1 1 1 1 2 1 2 2.
        print(back[word])
print(back)
# {'4': 2, '2': 2, '1': 1, '5': 1, '3': 2, '6': 1}