1. 程式人生 > >python獲取本人關注列表並批量存入本地mysql資料庫

python獲取本人關注列表並批量存入本地mysql資料庫

先模擬登入,將cookies儲存到本地。程式碼中取得知乎關注列表所用的連結是2016年知乎電腦網頁版改版以前的介面:它返回一組json資料(網頁下拉時用來自動填充列表),請求時需要傳 _xsrf 和 hash_id。2016年11月左右知乎改版後有了新的api,新的api不需要獲取 _xsrf 和 hash_id,只要有使用者內部的name就可以,不過返回的資料中沒有了贊同數和提問問題數。

mysql批量插入用的是pymysql的executemany方法。

import http.cookiejar
import requests
import re
import json
import math
import time
import pymysql.cursors
from zhihu.author import Author
from bs4 import BeautifulSoup
from collections import deque

# Module-wide accumulator that getfollwer() appends scraped authors to.
# NOTE: this rebinds the name `deque` from the imported collections.deque
# class to an instance, so the class is no longer reachable under that name.
deque = deque()

# Mobile Chrome (Android, Nexus 5) User-Agent string; the request functions
# below pass it through the `headers` dict.
agent = (
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/46.0.2490.76 Mobile Safari/537.36"
)
headers = {'User-Agent': agent}

# Profile slug of the account processed by the __main__ entry point:
# zui-jiu-qing-niu-4
def getsession():
    """Create a requests session that reuses cookies saved on disk.

    The cookies are read from an LWP-format cookie file named ``cookies``
    (a relative path, so it is resolved against the working directory).

    Returns:
        The requests session with the loaded cookie jar attached.
    """
    cookie_jar = http.cookiejar.LWPCookieJar(filename="cookies")
    http_session = requests.session()
    http_session.cookies = cookie_jar
    # ignore_discard=True also loads cookies that are marked to be discarded.
    cookie_jar.load(ignore_discard=True)
    return http_session

def followers_num(session, id):
    """Read a count from the sidebar of a user's followees page.

    Args:
        session: Session object used to issue the GET request.
        id: Profile slug inserted into the ``/people/<id>/followees`` URL.

    Returns:
        The text of the ``<strong>`` inside the first ``a.item`` of the
        ``div.zu-main-sidebar`` element, converted to ``int``.
    """
    page_url = 'https://www.zhihu.com/people/' + id + '/followees'
    response = session.get(page_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    sidebar = soup.find("div", {'class': 'zu-main-sidebar'})
    # NOTE(review): presumably the first sidebar item holds the number of
    # followees — confirm against the page markup.
    first_item = sidebar.find('a', {'class': 'item'})
    return int(first_item.find('strong').text)

def get_xsrf(session):
    """Fetch the Zhihu home page and return its ``_xsrf`` token.

    ``_xsrf`` is a parameter that changes dynamically, so it is read from
    the page each time instead of being hard-coded.

    Args:
        session: Session object used to issue the GET request.

    Returns:
        The first ``_xsrf`` value found in the page HTML, as ``str``.

    Raises:
        IndexError: If the page contains no ``_xsrf`` field.
    """
    home_page = session.get('http://www.zhihu.com', headers=headers)
    # findall yields a list of captured values; only the first one is used.
    tokens = re.findall(r'name="_xsrf" value="(.*?)"', home_page.text)
    first_token = tokens[0]
    return str(first_token)

def _parse_followee(card_html):
    """Extract the author fields from one followee-list HTML fragment.

    Args:
        card_html: One HTML snippet taken from the ``msg`` list of the
            followee-list response.

    Returns:
        Tuple ``(user_id, name, homepage, follower_num, question_num,
        answer_num, agree_num)``.
    """
    card = BeautifulSoup(card_html, 'html.parser')
    author_link = card.find('a', {'class': 'zg-link author-link'})
    name = author_link.text
    homepage = author_link['href']
    # Drop the first 29 characters of the link, which is the length of
    # "https://www.zhihu.com/people/", leaving the profile slug.
    user_id = homepage[29:]
    profile_path = '/people/' + user_id

    # Each counter link's text ends with a label that is sliced off before
    # the int conversion (4 characters for followers, 3 for the others).
    # NOTE(review): the exact label text is not visible here — confirm.
    follower_num = int(
        card.find('a', {'href': profile_path + '/followers'}).text[:-4])
    question_num = int(
        card.find('a', {'href': profile_path + '/asks'}).text[:-3])
    answer_num = int(
        card.find('a', {'href': profile_path + '/answers'}).text[:-3])
    agree_num = int(
        card.find('a', {'href': profile_path,
                        'class': 'zg-link-gray-normal'}).text[:-3])
    return (user_id, name, homepage, follower_num, question_num,
            answer_num, agree_num)


def getfollwer(fonum, session, xsrf, *,
               hash_id="29d75b4013b4631aaf7fe5848f3f6113", delay=5):
    """Download the followee list page by page and collect ``Author`` objects.

    Pages are requested from the ``ProfileFolloweesListV2`` endpoint in
    offset steps of 20 until ``fonum`` entries have been covered. Every
    parsed entry is printed and appended to the module-level ``deque``.

    Args:
        fonum: Total number of entries to fetch; determines the page count.
        session: Session object used to issue the POST requests.
        xsrf: Value sent as the ``_xsrf`` form field.
        hash_id: Value sent as ``hash_id`` inside the ``params`` JSON.
            Defaults to the id that was previously hard-coded.
        delay: Seconds to sleep after each page request (default 5).

    Returns:
        The module-level ``deque`` holding every ``Author`` appended so far.
    """
    page_count = math.ceil(fonum / 20)
    num = 1  # running 1-based counter, used only for the progress output
    for page in range(page_count):
        # Compact separators keep the payload identical to the previous
        # hand-concatenated string while letting json handle the quoting.
        params = json.dumps(
            {"offset": page * 20, "order_by": "created", "hash_id": hash_id},
            separators=(',', ':'))
        postdata = {'method': 'next', 'params': params, '_xsrf': xsrf}
        ress = session.post('https://www.zhihu.com/node/ProfileFolloweesListV2',
                            data=postdata, headers=headers)
        jsons = json.loads(ress.content.decode('utf-8'))
        print(jsons['msg'])
        time.sleep(delay)
        for fragment in jsons['msg']:
            fields = _parse_followee(fragment)
            (user_id, name, homepage, follower_num, question_num,
             answer_num, agree_num) = fields
            print(num)
            print("使用者:" + name)
            print(user_id)
            print(follower_num)
            print(question_num)
            print(answer_num)
            print(agree_num)
            author = Author(*fields)
            deque.append(author)
            print(author.name)
            print(author.homepage)
            num += 1
            print("================================================================================================")
    return deque

def insertzhihu(deque):
    """Bulk-insert the collected authors into the ``zhihu_author`` table.

    One row is built per author from its ``id``, ``name``, ``homepage``,
    ``follower_num``, ``question_num``, ``answer_num`` and ``agree_num``
    attributes, and all rows are sent with a single ``executemany`` call.

    Args:
        deque: Iterable of author objects exposing the attributes above.

    Returns:
        None. Nothing is inserted (and no connection is opened) when the
        iterable is empty.
    """
    values = [
        (author.id, author.name, author.homepage, author.follower_num,
         author.question_num, author.answer_num, author.agree_num)
        for author in deque
    ]
    if not values:
        return

    # NOTE(review): the database credentials are hard-coded here; consider
    # moving them to configuration or environment variables.
    connetion = pymysql.connect(host='localhost',
                                user='root',
                                password='159366',
                                db='zhihu',
                                charset='utf8',
                                cursorclass=pymysql.cursors.DictCursor)
    try:
        with connetion.cursor() as cursor:
            cursor.executemany(
                "insert into zhihu_author values(%s,%s,%s,%s,%s,%s,%s)",
                values)
        connetion.commit()
    finally:
        # Always release the connection; if the insert failed, commit() was
        # never reached.
        connetion.close()


if __name__ == "__main__":
    # Script entry point: load the saved login session, fetch the token and
    # the entry count, scrape the followee list and store it in MySQL.
    target_user = "zui-jiu-qing-niu-4"
    http_session = getsession()
    xsrf_token = get_xsrf(http_session)
    followee_total = followers_num(http_session, target_user)
    authors = getfollwer(followee_total, http_session, xsrf_token)
    insertzhihu(authors)