1. 程式人生 > 其它 >python爬取今日頭條街拍

python爬取今日頭條街拍

相信各位學習爬蟲的老鐵們一定看過崔大佬的爬蟲教學。在第六章利用Ajax爬取今日頭條街拍圖片這部分,由於網站已變更,會發現書中具體程式碼無法執行。本人作為爬蟲新手,用了2小時時間自行摸索該部分,並對相應內容進行調整,最終【成功爬取】,在這裡跟大家分享一下我踏過的各種大坑。

首先模組匯入

import requests
import re
import os
from time import sleep
from urllib.parse import urlencode
from urllib import parse
from hashlib import md5

爬蟲三步走,獲取頁面--分析頁面--儲存資訊

首先,獲取頁面的函式設定。這裡值得注意的是headers部分要新增cookies,內容不做贅述。

# Request headers for the search endpoint (so.toutiao.com).
# Per the article above, the Cookie (ttwid) must be supplied or the site
# serves an anti-bot page instead of real search results.  The values here
# are session-specific and will need refreshing.
headers = {
    'Host': 'so.toutiao.com',
    'Referer': 'https://so.toutiao.com/search?keyword=%20%E8%A1%97%E6%8B%8D&pd=synthesis&source=input&dvpf=pc&aid=4916&page_num=0',
    'Cookie': 'ttwid=1|KviVmcSjms80bH3CAgjoWLkug459q7mO4n8oe79jffQ|1634094110|72c4e7c5de9eddb603ee7144203a64762a6e383f21d66b619e50cb9a4740e7c6',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:93.0) Gecko/20100101 Firefox/93.0'
}

# Request headers for article pages (www.toutiao.com) — a different Host and
# a separate ttwid cookie from the search endpoint's headers.
headers2 = {
    'Host': 'www.toutiao.com',
    'Cookie': 'ttwid=1|KviVmcSjms80bH3CAgjoWLkug459q7mO4n8oe79jffQ|1634096506|eaa9c570e34a6c383c184a4b0855b9d13833c4414ded5a4f82227b3f8bc3f8ea',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:93.0) Gecko/20100101 Firefox/93.0'
}

接下來,搜尋框輸入:街拍搜尋,加時間等待2秒是防止被關小黑屋
def get_page(page):
    """Fetch one search-result page for the keyword '街拍' (street snap).

    Args:
        page: zero-based result page number, sent as ``page_num``.

    Returns:
        The response body as text on HTTP 200, otherwise ``None``.
    """
    params = {
        'keyword': '街拍',
        'pd': 'synthesis',
        'source': 'input',
        'dvpf': 'pc',
        'aid': '4916',
        'page_num': page
    }
    url = 'https://so.toutiao.com/search?' + urlencode(params)

    try:
        # timeout keeps the crawler from hanging forever on a stalled socket
        response = requests.get(url, headers=headers, timeout=10)
        # brief pause between requests to reduce the chance of being blocked
        sleep(2)
        if response.status_code == 200:
            return response.text
    except requests.RequestException as e:
        # RequestException also covers timeouts, which the original
        # narrower ConnectionError handler let propagate
        print('Error', e.args)

然後,獲取一個頁面的所有文章連結:
def parse_one_page(html):
    """Pull every article link out of one search-result page.

    Args:
        html: raw text of a search-result page.

    Returns:
        A list of the ``article_url`` values found, in document order.
    """
    link_pattern = re.compile('"title":.*?"article_url":"(.*?)"', re.S)
    return link_pattern.findall(html)

接下來,開啟其中一個文章連結,獲取響應頁面:
def in_article(url):
    """Fetch a single article page.

    Args:
        url: direct article URL (``https://www.toutiao.com/a<group_id>``).

    Returns:
        The article HTML on HTTP 200, otherwise ``None``.
    """
    try:
        # timeout prevents an indefinite hang on a dead connection
        response = requests.get(url, headers=headers2, timeout=10)
        # polite delay between article fetches
        sleep(2)
        if response.status_code == 200:
            return response.text
    except requests.RequestException as e:
        # broader than ConnectionError so timeouts are also reported
        print('Error', e.args)

再然後,下載文章裡面圖片:
def get_image_url(html):
    """Extract the article title and its image URLs from an article page.

    Args:
        html: article page HTML, or ``None`` when the fetch failed upstream.

    Returns:
        A one-element list ``[{'title': [...], 'image_url': [...]}]``,
        or ``None`` when there was nothing to parse.
    """
    # Explicit guard replaces the original bare ``except`` that silently
    # swallowed every error (including real bugs) when html was None.
    if html is None:
        print('無法匹配成功!!!!!!')
        return None
    # Titles sit in <img alt="..."> attributes just before the image container.
    title_pattern = re.compile(
        'alt="(.*?)" inline="0".*?<div class="pgc-img">.*?src=',
        re.S)
    titles = re.findall(title_pattern, html)
    # Image URLs are the src of the <img> inside each pgc-img container.
    url_pattern = re.compile(
        '<div class="pgc-img">.*?src="(.*?)"',
        re.S)
    image_urls = re.findall(url_pattern, html)
    return [{'title': titles, 'image_url': image_urls}]

最後,儲存圖片:
def save_image(item):
    """Download every image in *item* into a folder named after the article title.

    Args:
        item: the list produced by ``get_image_url`` —
              ``[{'title': [...], 'image_url': [...]}]`` — or ``None``.

    Files are named by the MD5 of their content so duplicate images are
    skipped automatically.  All failures are reported but never raised.
    """
    try:
        title = item[0].get('title')[0]
        if title:
            # exist_ok avoids the original check-then-mkdir race
            os.makedirs(title, exist_ok=True)
        try:
            # renamed from ``list`` — the original shadowed the builtin
            for url in item[0].get('image_url'):
                response = requests.get(url, timeout=10)
                sleep(2)
                if response.status_code == 200:
                    # content hash as filename deduplicates identical images
                    file_path = '{0}/{1}.{2}'.format(
                        title, md5(response.content).hexdigest(), 'jpg')
                    if not os.path.exists(file_path):
                        with open(file_path, 'wb') as f:
                            f.write(response.content)
                        print('{0}.........下載成功!!!'.format(title))
                    else:
                        print('Already Downloaded', file_path)
        except requests.RequestException:
            print('Failed to Save Image')
    except (TypeError, IndexError):
        # item is None or malformed — narrowed from the original bare except,
        # keeping its best-effort "log and continue" behavior
        print('No Data!!!')


if __name__ == '__main__':
    # Crawl result pages 0..17 (range(0, 18) is 18 pages; the original
    # comment said 17).
    for i in range(0, 18):
        page_html = get_page(i)
        if page_html is None:
            # fetch failed — skip this page instead of crashing in the parser
            continue
        links = parse_one_page(page_html)
        for link in links:
            # Search results link to so.toutiao.com redirect URLs; rebuild
            # each as a direct www.toutiao.com/a<group_id> article URL.
            fail_url = parse.urlparse(link)
            path = fail_url.path.split('/group/')
            new_path = 'a' + path[-1]
            true_url = parse.urljoin('https://www.toutiao.com', '{0}'.format(new_path))
            article_html = in_article(true_url)
            image_info = get_image_url(article_html)
            print(image_info)
            # Save images into a folder named after the article title.
            save_image(image_info)