
Crawling Zhihu images with Python
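The script below pages through the Zhihu answers API for a single question, extracts the data-original image URLs from each page of results with a regular expression, and downloads every image into a local folder, picking a random User-Agent for each request.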

import re
import os
import time
import random
import requests

class Zhihu_Photo():
    def __init__(self):
        self.path = r'C:\desk\download'  # directory where the images are saved
        self.question_id = 425705916     # id of the Zhihu question
        self.offset = 3
        self.my_headers = [
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
'Opera/9.25 (Windows NT 5.1; U; en)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
"Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
]

    def header(self):
        # build a list of header dicts, one per User-Agent string
        headers = []
        for ua in self.my_headers:
            headers.append({'User-Agent': ua})
        return headers

    def save_image(self, image_url):  # download and save one image
        header = random.choice(self.header())
        resp = requests.get(image_url, headers=header)
        time.sleep(0.5)
        print("code:", resp.status_code)
        if resp.status_code == 403:  # a 403 status means the image may not be downloaded
            print("download forbidden")
        else:
            page = resp.content
            filename1 = image_url.split('zhimg.com/')[-1]
            filename = filename1.split('?')[0]  # name the file after the URL path segment
            fpath = os.path.join(self.path, filename)
            try:
                with open(fpath, 'wb') as f:
                    f.write(page)
                print('saved:', fpath)
            except Exception as e:
                print(e)

    def crawl(self, url):  # fetch one page of answers and extract the image URLs
        header = random.choice(self.header())
        resp = requests.get(url, headers=header)
        if resp.status_code == 200:
            page = resp.text
            # in the JSON body the quotes are escaped, e.g.
            # data-original=\"https://pica.zhimg.com/v2-8bf2cab1bc32d8a45fb86a039cc97d2a_r.jpg?source=1940ef5c\"
            image_urls1 = re.findall(r'data-original=\\"(.*?)\\"', page)  # regex match for the image addresses
            image_urls = list(set(image_urls1))  # remove duplicate addresses
            n = len(image_urls)
            print(f"{n} images on this page")
            for image_url in image_urls:
                self.save_image(image_url)
        else:
            print("invalid URL or request failed")

    def run(self):
        for i in range(500):
            print(f"++++++ page offset {self.offset} ++++++++\n")
            url = f"https://www.zhihu.com/api/v4/questions/{self.question_id}/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2" \
                  f"Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2" \
                  f"Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2" \
                  f"Cpaid_info%2Cpaid_info_content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3" \
                  f"Bdata%5B*%5D.author.follower_count%2Cvip_info%2Cbadge%5B*%5D.topics%3Bdata%5B*%5D.settings.table_of_content.enabled&offset={self.offset}&limit=5&sort_by=default&platform=desktop"
            self.offset += 5
            self.crawl(url)


if __name__ == '__main__':
    t = Zhihu_Photo()
    t.run()
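As a quick sanity check of the pattern used in crawl(), the minimal sketch below runs the same regex against a made-up fragment (the page string is a hypothetical stand-in for resp.text, reusing the sample URL from the comment in crawl()); the JSON body escapes its quotes, so the pattern matches \" rather than a bare quote:

import re

# hypothetical stand-in for resp.text: in the JSON body the quotes around the URL are escaped
page = 'data-original=\\"https://pica.zhimg.com/v2-8bf2cab1bc32d8a45fb86a039cc97d2a_r.jpg?source=1940ef5c\\"'

# same pattern as in crawl(); \\" in the raw string matches a literal backslash followed by a quote
urls = re.findall(r'data-original=\\"(.*?)\\"', page)
print(urls)
# ['https://pica.zhimg.com/v2-8bf2cab1bc32d8a45fb86a039cc97d2a_r.jpg?source=1940ef5c']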