python爬取知乎話題的精華問題下的使用者資訊
阿新 • 發佈:2019-02-07
今天試著用自己的爬蟲程式碼爬取了知乎【同性戀】話題下的所有精華問題的使用者位置資訊
程式碼:
__author__ = 'yang'
# -*- coding: utf-8 -*-
import configparser
import requests
import time
import re
import string
def curTime():
    """Return the current local time wrapped as an HTML comment marker.

    Format: '\n<!--YYYY-MM-DD HH:MM:SS-->', used to timestamp the
    scraped output files.
    """
    stamp = time.strftime('%Y-%m-%d %H:%M:%S')
    return '\n<!--' + stamp + '-->'
def loginInfo():
    """Read the Zhihu credentials and browser cookies from test.ini.

    Returns:
        (username, password, cookies): two strings and a dict built from
        the [COOKIES] section.
    """
    parser = configparser.ConfigParser()
    # test.ini holds the Zhihu account, password and browser cookies.
    parser.read('test.ini')
    name = parser.get("USER", "username")
    secret = parser.get("USER", "password")
    return name, secret, dict(parser.items('COOKIES'))
def create_session():
    """Log in to Zhihu and return a ready-to-use session.

    First attempts an email/password POST; when the server rejects it
    (``r.json()['r'] == 1``) falls back to the browser cookies read from
    test.ini.

    Returns:
        (requests.Session, dict): the session and the cookie dict to pass
        to later requests.

    Raises:
        ValueError: if the password login fails and no cookies are filled in.
    """
    username, password, cookies = loginInfo()
    session = requests.session()
    login_data = {'email': username, 'password': password}
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36' ,
        'Host': 'www.zhihu.com',
        'Referer': 'http://www.zhihu.com/'
    }
    r = session.post('http://www.zhihu.com/login/email', data=login_data, headers=header)
    if r.json()['r'] == 1:
        # Password login rejected: print the server's reasons, then retry
        # with the configured cookies.
        print('Login Failed, reason is:', end=' ')
        for m in r.json()['data']:
            print(r.json()['data'][m])
        print('Use cookies to login...')
        # '__name__' is the artifact key configparser adds to a section dict.
        has_cookies = any(
            key != '__name__' and cookies[key] != '' for key in cookies
        )
        if has_cookies is False:
            raise ValueError('請填寫config.ini檔案中的cookies項')
        r = session.get('http://www.zhihu.com/login/email', cookies=cookies)
        # r.content is bytes -- open in binary mode so the write works on Python 3.
        with open('login.html', 'wb') as fp:
            fp.write(r.content)
    return session, cookies
def writeFile(name, content):
    """Replace the contents of the file *name* with *content*."""
    with open(name, 'w') as handle:
        handle.write(content)
if __name__ == '__main__':
    requests_session, requests_cookies = create_session()

    # Dump every page of the topic's top answers into tong.html.
    with open('tong.html', 'w') as fp:
        fp.write(curTime())
        for page in range(0, 49):
            url = 'https://www.zhihu.com/topic/19552984/top-answers?' + str(page)
            # .text decodes the response; writing bytes (.content) to a
            # text-mode file fails on Python 3.
            fp.write(requests_session.get(url, cookies=requests_cookies).text)

    # Extract the question links.
    # NOTE(review): url.html is only produced by the commented-out code in the
    # original -- it must already exist from a previous run; verify before use.
    # The pattern variable must NOT be named 'str'/'file' as in the original:
    # that shadowed the builtins and made the later str(...)/file(...) calls crash.
    question_link_re = re.compile(r'<a class="question_link.*?href="(.*?)">')
    with open('url.html') as fp:
        questionLinks = question_link_re.findall(fp.read())
    print(questionLinks)
    with open('resultLink.html', 'w') as fp:
        fp.write('\n'.join(questionLinks))

    # Fetch every question page and collect the answer-author links.
    usrRegex = re.compile(r'<a class="author-link.*?href="(.*?)">')
    with open('usrLinks.html', 'a+') as out:
        for link in questionLinks:
            url = 'https://www.zhihu.com' + link.strip()
            page_text = requests_session.get(url, cookies=requests_cookies).text
            out.write('\n'.join(usrRegex.findall(page_text)))

    # Re-read the accumulated author links and de-duplicate them.
    with open('usrLinks.html') as fp:
        links = list({line.strip() for line in fp})

    # Visit each user profile and record the location field.
    locationRegex = re.compile(r'<span class="location item.*?title="(.*?)"')
    with open('locations.html', 'a+') as out:
        for link in links:
            url = 'https://www.zhihu.com' + link
            page_text = requests_session.get(url, cookies=requests_cookies).text
            location = locationRegex.findall(page_text)
            # findall returns a list; the original appended '\n' to the list
            # itself (TypeError). Join the matches and terminate the line.
            if location:
                out.write('\n'.join(location) + '\n')