1. 程式人生 > >爬取貓眼資料

爬取貓眼資料

//原始碼

 

#
# 導包
#
import pyximport
import requests
from fake_useragent import UserAgent
import json
import os
import pandas as pd
import csv
import datetime

#
#
#
# 程式碼
# http://maoyan.com/films/42964
#
#

# Spoofed request headers.
# NOTE(review): nothing visible in this script imports a .pyx module, so the
# pyximport hook looks unnecessary - confirm before removing.
pyximport.install()
ua = UserAgent()

# One random User-Agent string is drawn here and reused for every request.
# Host and Referer name the mobile site's comment page for movie 42964.
headers = {}
headers["User-agent"] = ua.random
headers["Host"] = "m.maoyan.com"
headers["Referer"] = "http://m.maoyan.com/movie/42964/comments?_v_=yes"

# Request parameters.
# Page offsets 0, 15, ..., 180: thirteen pages in steps of 15.
offsets = list(range(0, 181, 15))
# startTime and randomTime are initialised here but not read by any live code
# visible in this script.
startTime = "0"
randomTime = ""
# Accumulates one row per scraped comment.
list_info = []

# Download every comment page and flatten each comment into one row of
# list_info: [nickName, gender, cityName, userLevel, score, content].
comment_url = ('http://m.maoyan.com/mmdb/comments/movie/42964.json'
               '?_v_=yes&offset={0}&startTime={1}')

for offset in offsets:
    # startTime is regenerated for every request as the current local time.
    comment_api = comment_url.format(
        offset, datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    # Bound the wait and fail loudly on an HTTP error instead of hanging or
    # trying to parse an error page as JSON.
    response_comment = requests.get(comment_api, headers=headers, timeout=10)
    response_comment.raise_for_status()
    json_comment = json.loads(response_comment.text)

    # If the payload has no 'cmts' entry (or it is null), treat the page as
    # empty rather than raising KeyError / TypeError.
    for data in json_comment.get('cmts') or []:
        # gender keeps its original default of 0; any other absent field
        # becomes None (an empty CSV cell) instead of aborting the scrape.
        list_info.append([
            data.get('nickName'),
            data.get('gender', 0),
            data.get('cityName'),
            data.get('userLevel'),
            data.get('score'),
            data.get('content'),
        ])

# Persist the collected rows to CSV, but only when the target file is empty
# or does not exist yet; an existing non-empty file is left untouched.
csv_path = r'D:\B_Hakkelujah\python\maoyan.csv'

print("正在儲存資料:")
try:
    file_size = os.path.getsize(csv_path)
except FileNotFoundError:
    # No file yet (e.g. first run): treat it like an empty file so it gets
    # created below instead of crashing here.
    file_size = 0
prStr = "檔案大小:{0}".format(file_size)
print(prStr)

if file_size == 0:
    print("空檔案新增資料")
    # Column headers, in the same order as the rows stored in list_info.
    name = ['評論者暱稱', '性別', '所在城市', '貓眼等級', '評分', '評論內容']
    file_test = pd.DataFrame(columns=name, data=list_info)
    # utf_8_sig prefixes the file with a BOM - presumably so spreadsheet
    # tools detect the encoding of the Chinese text; confirm.
    file_test.to_csv(csv_path, encoding='utf_8_sig', index=False)
    # Report success only when something was actually written.
    print("資料新增完畢")
else:
    # Make the skip explicit rather than claiming the data was added.
    print("檔案已有資料,未寫入")

 

原文:

https://mp.weixin.qq.com/s?__biz=MjM5MjAwODM4MA==&mid=2650706418&idx=1&sn=20e57b7b1c8caa4c0b06d6dbd2b94aaa&chksm=bea6e02189d16937c8c3d934264f24b599576b14b76361018b55cca76fb73a127d4f6681af98&mpshare=1&scene=1&srcid=101045ENCgxgoTId8LKXrIaE&pass_ticket=Cgz9TOK3J64evSI%2B9Ev7kLigZCJHUOKf8eJe9%2FagJaUdYdhyn53lL%2FeRC4NnDrUq#rd

 

注:

資料爬取記錄

1.分析介面(包括介面引數的變化)

2.分析JSON資料(資料解析)

3.資料儲存(檔案、資料庫)