Fetching your own Zhihu followees list with Python and batch-inserting it into a local MySQL database
First simulate a login and save the cookies locally. The followees-list URL used in the code is the one from the pre-2016 desktop version of Zhihu: it returns a batch of JSON that the page uses to fill itself in as you scroll down, and each request must carry an _xsrf token and a hash_id. After Zhihu's redesign around November 2016 there is a new API; it needs neither _xsrf nor hash_id, only the account's internal name (url token), but its responses no longer include upvote counts or the number of questions asked.
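
For comparison, here is a minimal sketch of paging through that post-redesign API. The /api/v4/members/{url_token}/followees endpoint path, the data/paging response fields, and the fetch_followees helper are my assumptions about how that API looked at the time, not something taken from the original post:

import requests

# Hypothetical sketch of the post-2016 API: only the public url_token is
# needed, no _xsrf or hash_id. Endpoint path and field names are assumptions.
API = 'https://www.zhihu.com/api/v4/members/{token}/followees'

def fetch_followees(token, headers):
    offset = 0
    while True:
        res = requests.get(API.format(token=token),
                           params={'offset': offset, 'limit': 20},
                           headers=headers)
        page = res.json()
        for member in page['data']:
            # per the note above, upvote and question counts are
            # no longer part of this response
            yield member['url_token'], member['name']
        if page['paging']['is_end']:
            break
        offset += 20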
The MySQL batch insert uses pymysql's executemany method.
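
The insert statement in the script below assumes a seven-column zhihu_author table already exists. Here is a minimal sketch of creating it with pymysql; the column names and types are guesses inferred from the order of the Author fields, not the original schema:

import pymysql

# Hypothetical DDL matching the 7-value insert used below; names and types
# are assumptions, not taken from the original post.
connection = pymysql.connect(host='localhost', user='root', password='159366',
                             db='zhihu', charset='utf8')
with connection.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS zhihu_author (
            id VARCHAR(64) PRIMARY KEY,   -- url token from the profile URL
            name VARCHAR(64),
            homepage VARCHAR(255),
            follower_num INT,
            question_num INT,
            answer_num INT,
            agree_num INT
        )
    """)
connection.commit()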
import http.cookiejar
import json
import math
import re
import time
from collections import deque

import pymysql.cursors
import requests
from bs4 import BeautifulSoup

from zhihu.author import Author

authors = deque()  # collected Author objects

agent = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) " \
        "Chrome/46.0.2490.76 Mobile Safari/537.36"
headers = {'User-Agent': agent}


def getsession():
    # Reuse the cookies saved by the earlier simulated login
    session = requests.Session()
    session.cookies = http.cookiejar.LWPCookieJar(filename="cookies")
    session.cookies.load(ignore_discard=True)
    return session


def followees_num(session, id):
    # Read the followees count from the profile sidebar
    res = session.get('https://www.zhihu.com/people/' + id + '/followees', headers=headers)
    bs = BeautifulSoup(res.text, 'html.parser')
    fonum = bs.find("div", {'class': 'zu-main-sidebar'}).find('a', {'class': 'item'}).find('strong').text
    return int(fonum)


def get_xsrf(session):
    """_xsrf is a dynamically changing parameter."""
    index_url = 'http://www.zhihu.com'
    # Fetch the _xsrf token needed for the POST requests
    index_page = session.get(index_url, headers=headers)
    html = index_page.text
    pattern = r'name="_xsrf" value="(.*?)"'
    # re.findall returns a list; take the first match
    _xsrf = re.findall(pattern, html)
    return str(_xsrf[0])


def get_followees(fonum, session, xsrf):
    end = math.ceil(fonum / 20)  # the old API returns 20 followees per page
    num = 1
    for x in range(end):
        beginnum = str(x * 20)
        postdata = {'method': 'next',
                    'params': '{"offset":' + beginnum + ',"order_by":"created","hash_id":"29d75b4013b4631aaf7fe5848f3f6113"}',
                    '_xsrf': xsrf}
        ress = session.post('https://www.zhihu.com/node/ProfileFolloweesListV2',
                            data=postdata, headers=headers)
        jsons = json.loads(ress.content.decode('utf-8'))
        time.sleep(5)  # throttle requests
        for item in jsons['msg']:  # each item is an HTML snippet for one followee
            card = BeautifulSoup(item, 'html.parser')
            print(num)
            name = card.find('a', {'class': 'zg-link author-link'}).text
            print("User: " + name)
            homepage = card.find('a', {'class': 'zg-link author-link'})['href']
            id = homepage[29:]  # strip the leading "https://www.zhihu.com/people/"
            print(id)
            # [:-4] / [:-3] strip the trailing Chinese labels ("關注者", "提問", ...)
            follower_num = int(card.find('a', {'href': '/people/' + id + '/followers'}).text[:-4])
            question_num = int(card.find('a', {'href': '/people/' + id + '/asks'}).text[:-3])
            answer_num = int(card.find('a', {'href': '/people/' + id + '/answers'}).text[:-3])
            agree_num = int(card.find('a', {'href': '/people/' + id, 'class': 'zg-link-gray-normal'}).text[:-3])
            author = Author(id, name, homepage, follower_num, question_num, answer_num, agree_num)
            authors.append(author)
            num += 1
        print("=" * 96)
    return authors


def insertzhihu(authors):
    connection = pymysql.connect(host='localhost', user='root', password='159366', db='zhihu',
                                 charset='utf8', cursorclass=pymysql.cursors.DictCursor)
    values = []
    for author in authors:
        values.append((author.id, author.name, author.homepage, author.follower_num,
                       author.question_num, author.answer_num, author.agree_num))
    cursor = connection.cursor()
    # executemany batches all rows into a single INSERT round trip
    cursor.executemany("insert into zhihu_author values(%s,%s,%s,%s,%s,%s,%s)", values)
    connection.commit()


if __name__ == "__main__":
    session = getsession()
    xsrf = get_xsrf(session)
    fnum = followees_num(session, "zui-jiu-qing-niu-4")  # target account's url token
    authors = get_followees(fnum, session, xsrf)
    insertzhihu(authors)
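
The script imports Author from a local zhihu.author module that the post never shows. A minimal stand-in to make the script self-contained, assuming Author is a plain data holder whose attribute names match those used above:

# zhihu/author.py -- hypothetical stand-in for the module the script imports;
# the original post does not show it, so this is only a guess at its shape.
class Author:
    def __init__(self, id, name, homepage, follower_num,
                 question_num, answer_num, agree_num):
        self.id = id
        self.name = name
        self.homepage = homepage
        self.follower_num = follower_num
        self.question_num = question_num
        self.answer_num = answer_num
        self.agree_num = agree_num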