python進行時間資料處理
阿新 • • 發佈:2019-01-02
用頁面解析的方式從twitter爬下來的帖子時間有時候是中文的,例如(下面程式碼所假設的格式):上午12:30 - 2017年4月27日
時間處理的細節很多,所以在這裡做一個小結。首先要明確:處理後的目標資料應該是24小時制,形式如下:
format = "%Y-%m-%d %H:%M:%S"
也就是要將字串轉換為datetime.datetime型別
程式碼如下:
from datetime import datetime
# Target 24-hour timestamp layout (year-month-day hour:minute:second).
# NOTE(review): this name shadows the builtin `format`, and none of the code
# visible in this file reads it -- confirm whether it is still needed.
format = "%Y-%m-%d %H:%M:%S"
def chineseTime2National(time):
    """Convert a Chinese 12-hour timestamp string into a ``datetime``.

    The input is expected to consist of three single-space-separated
    fields: an AM/PM marker (``上午`` or ``下午``) glued to ``H:MM``, a
    separator token, and a ``Y年M月D日`` date, for example
    ``"上午12:30 - 2017年4月27日"``.

    Args:
        time: The timestamp string to convert.

    Returns:
        A naive ``datetime.datetime`` in 24-hour time. Seconds are always 0
        because the input carries no seconds.

    Raises:
        ValueError: If the string does not start with ``上午``/``下午`` or
            its hour/date parts cannot be parsed.
        IndexError: If the string has fewer than three fields.
    """
    if time.startswith("上午"):
        marker, is_pm = "上午", False
    elif time.startswith("下午"):
        marker, is_pm = "下午", True
    else:
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError("timestamp must start with 上午 or 下午: %r" % (time,))

    fields = time.replace(marker, "").split(" ")
    clock = fields[0].split(":")

    hour = int(clock[0])
    if hour == 12:
        # 12 AM is 00:xx and 12 PM is 12:xx, so normalise 12 to 0 first.
        hour = 0
    if is_pm:
        hour += 12

    # "2017年4月27日" -> "2017-4-27"; strptime accepts unpadded numbers.
    date_text = fields[2].replace("年", "-").replace("月", "-").replace("日", "")
    return datetime.strptime(
        "{} {}:{}".format(date_text, hour, clock[1]), "%Y-%m-%d %H:%M"
    )
得到datetime型別時間以後,由於需要統計發帖的小時、星期,我們需要藉助幾個簡單的函式,程式碼如下
# Write one JSON object per user (JSON Lines) holding the fraction of that
# user's posts falling in each hour of the day and on each day of the week.
# NOTE(review): relies on `np`, `json` and `an_traces_df` being defined
# earlier in the file -- they are not visible in this section.
with open('time_feature_of_user.json', 'w') as f:
    # Group by the bare column name (not a one-element list) so that `name`
    # is the scalar screen name rather than a 1-tuple on newer pandas.
    for name, group in an_traces_df.groupby('screen_name'):
        created = group["created_at"].values
        hours = np.zeros(24)    # posts per hour, index 0-23
        weekdays = np.zeros(7)  # posts per weekday, 0 = Monday
        for raw in created:
            posted = chineseTime2National(raw)
            # `hour` is already 0-23; subtracting 1 shifted every bucket and
            # wrapped midnight posts into the 23:00 slot.
            hours[posted.hour] += 1
            weekdays[posted.weekday()] += 1
        total = len(created)
        dic = {
            "screen_name": name,
            "hour_feature": (hours / total).tolist(),
            "weekday_feature": (weekdays / total).tolist(),
        }
        f.write(json.dumps(dic) + '\n')