爬取煎蛋隨手拍圖
阿新 • 發佈:2018-12-24
爬取煎蛋隨手拍圖
使用 selenium 載入頁面並提取圖片的 url,再使用 requests 下載圖片
爬取結果
爬取思路
- 使用 selenium 發起請求
- 對頁面進行資料的提取
- 取到頁面上每個圖片的 url
- 使用 requests 發起請求
- 將圖片進行儲存
實現程式碼
import os
import time
from urllib.parse import urlparse

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
class JandanPic:
    """Crawl the jandan.net "ooxx" board and save each listed picture to disk.

    A Selenium-driven Chrome browser loads every comment page and is used to
    read the picture id and picture URL of each comment; the image bytes are
    then downloaded with ``requests`` and written under ``./image/jandan/``.
    """

    # Seconds to wait after a navigation before reading the page.
    PAGE_LOAD_WAIT = 6
    # Seconds before an image download is abandoned instead of hanging.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Start a Chrome browser session and prepare the request headers."""
        self.start_url = "http://jandan.net/ooxx"
        self.driver = webdriver.Chrome()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3608.4 Safari/537.36"
        }

    def parse_get_url(self, url):
        """Download *url* with the crawler's headers and return the raw bytes.

        Raises:
            requests.RequestException: on connection errors, timeouts, or a
                non-2xx HTTP status.
        """
        resp = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        # Fail loudly rather than handing an HTTP error page back as image data.
        resp.raise_for_status()
        return resp.content

    def get_content_list(self):
        """Extract and download the pictures on the page currently loaded.

        Returns:
            The first element matching ``a.previous-comment-page`` (the link
            the crawl follows to reach the next page), or ``None`` when the
            page has no such link.
        """
        li_list = self.driver.find_elements(
            By.XPATH, "//ol[@class='commentlist']/li"
        )
        print(li_list)
        content_list = []
        for li in li_list:
            # Query each locator once; an empty result means the field is absent.
            id_links = li.find_elements(
                By.XPATH, ".//span[@class='righttext']/a"
            )
            img_links = li.find_elements(
                By.XPATH, ".//a[@class='view_img_link']"
            )
            items = {
                "img_id": id_links[0].text if id_links else None,
                "img_url": (
                    img_links[0].get_attribute("href") if img_links else None
                ),
            }
            print(items)
            content_list.append(items)
        # Download the pictures of this page before moving on.
        self.save_pic(content_list)
        next_links = self.driver.find_elements(
            By.XPATH, "//a[@class='previous-comment-page']"
        )
        return next_links[0] if next_links else None

    @staticmethod
    def _build_pic_name(img_id, img_url):
        """Return the file name for a picture: its id plus the URL's extension."""
        # Use the URL path only, so a query string or a 4-letter extension
        # (".jpeg", ".webp") does not corrupt the file name.
        extension = os.path.splitext(urlparse(img_url).path)[1]
        return img_id + extension

    def save_pic(self, content_list):
        """Download every complete entry of *content_list* into the image folder.

        Args:
            content_list: dicts with the keys ``"img_id"`` and ``"img_url"``;
                entries where either value is ``None`` are skipped.
        """
        folder_path = "./image/jandan/"
        # The path is nested, so create missing parent directories as well.
        os.makedirs(folder_path, exist_ok=True)
        for content in content_list:
            pic_url = content["img_url"]
            img_id = content["img_id"]
            if pic_url is None or img_id is None:
                continue
            pic_name = self._build_pic_name(img_id, pic_url)
            print("開始下載:{}".format(pic_url))
            try:
                img_bytes = self.parse_get_url(pic_url)
            except requests.RequestException as exc:
                # One broken image must not abort the whole crawl.
                print("下載失敗:{} ({})".format(pic_url, exc))
                continue
            with open(os.path.join(folder_path, pic_name), "wb") as f:
                f.write(img_bytes)

    def run(self):
        """Crawl from the start URL, following page links until none is left."""
        try:
            # Open the first page in the browser.
            self.driver.get(self.start_url)
            time.sleep(self.PAGE_LOAD_WAIT)
            next_url = self.get_content_list()
            while next_url is not None:
                next_url.click()
                print("下一頁")
                time.sleep(self.PAGE_LOAD_WAIT)
                next_url = self.get_content_list()
        finally:
            # Always close the browser, even when the crawl fails midway.
            self.driver.quit()
if __name__ == "__main__":
    # Script entry point: build the crawler and start crawling immediately.
    JandanPic().run()