1. 程式人生 > >python3 爬取 飛G圖girl13.com 圖片

python3 爬取 飛G圖girl13.com 圖片

python3 爬取 飛G圖girl13.com 圖片

簡介:爬取 http://www.girl13.com 圖片
self.time = 2 # 設定間隔時間,預設時間為2s,以防止封IP

import os
import time
import requests
import threading
from bs4 import BeautifulSoup


class Girl13(object):
    """Crawler that downloads the images listed on http://www.girl13.com.

    The crawler walks every listing page (``/page/1`` .. ``/page/N``),
    collects the image URL of each post and saves the files into an
    ``image`` directory below the current working directory.
    """

    # Seconds to wait for the server before a request is treated as failed.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.session = requests.session()
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.10 Safari/537.36"
        }
        # Pause between two listing pages, in seconds (see main_).
        self.time = 2

    def get_status(self, url):
        """Fetch *url* and return the response, or ``False`` on any failure.

        A failure is either a transport-level error (connection problem,
        timeout, ...) or an HTTP status other than 200.
        """
        try:
            response = self.session.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException:
            # Report transport errors the same way as a bad status code so
            # callers only ever have to check for a falsy result.
            print("ERROR: 網路連線失敗!")
            return False
        if response.status_code == 200:
            return response
        print("ERROR: 網路連線失敗!")
        return False

    def get_index(self, url):
        """Request the home page once; return ``True`` if it is reachable."""
        if self.get_status(url):
            print("首頁,建立連線...")
            return True
        print("ERROR: 首頁訪問失敗!")
        return False

    @staticmethod
    def _index_by_filename(entries):
        """Map image file name -> image URL for every usable entry.

        *entries* is an iterable of ``(title, img_url)`` pairs. Pairs that
        lack a title, lack an image URL, or whose URL has no file name
        component are skipped.
        """
        title_url = {}
        for title, img_url in entries:
            if not title or not img_url:
                continue
            filename = os.path.basename(img_url)
            if filename:
                title_url[filename] = img_url
        return title_url

    def parse_html(self, url):
        """Return ``{file name: image URL}`` for one listing page.

        Returns ``None`` when the page could not be fetched.
        """
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        entries = []
        for column in html.select("#loop-square .column-post"):
            titles = column.select(".entry-title")
            images = column.select(".entry-content.cf img")
            entries.append((
                titles[0].text if titles else None,
                images[0].get("src") if images else None,
            ))
        return self._index_by_filename(entries)

    @staticmethod
    def _max_page_number(labels):
        """Return the largest numeric label in *labels*, or ``None``.

        Non-numeric labels (such as a "next page" link) are ignored, so the
        result does not depend on where such links sit in the navigator.
        """
        numbers = [
            int(label.strip())
            for label in labels
            if label and label.strip().isdecimal()
        ]
        return max(numbers) if numbers else None

    def get_last_page(self, url):
        """Return the number of the last listing page, or ``None``.

        ``None`` is returned when the page cannot be fetched or when the
        page navigator contains no numeric link.
        """
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        labels = [link.text for link in html.select(".page-navigator li > a")]
        return self._max_page_number(labels)

    @staticmethod
    def next_page(last_page):
        """Yield the URL of every listing page from 1 to *last_page*."""
        for i in range(1, last_page + 1):
            yield "http://www.girl13.com/page/{}".format(i)

    def download(self, path, url):
        """Download *url* and write its body to the file at *path*.

        Nothing is written when the request fails, so a failed download
        does not leave an empty file behind.
        """
        print(url)
        # Fetch first: opening the file up front would truncate/create it
        # even when the request later fails.
        response = self.get_status(url)
        if not response:
            return
        with open(path, "wb") as f:
            f.write(response.content)

    def main_(self):
        """Crawl every listing page and download its images."""
        # Home page: make sure the site is reachable before doing anything.
        if not self.get_index("http://www.girl13.com"):
            return None

        last_page = self.get_last_page("http://www.girl13.com/page/1")
        if not last_page:
            return None

        image_dir = os.path.abspath(os.path.join(os.getcwd(), "image"))
        os.makedirs(image_dir, exist_ok=True)

        for page_url in self.next_page(last_page):
            title_url = self.parse_html(page_url)
            if title_url:
                # One thread per image; wait for the whole page to finish
                # before moving on.
                thread_list = [
                    threading.Thread(
                        target=self.download,
                        args=(os.path.join(image_dir, filename), img_url),
                    )
                    for filename, img_url in title_url.items()
                ]
                for t in thread_list:
                    t.start()
                for t in thread_list:
                    t.join()
            # Throttle between pages, also after a page that failed to load.
            time.sleep(self.time)

    def main(self):
        """Run the crawl in a daemon worker thread and wait for it."""
        t = threading.Thread(target=self.main_)
        t.daemon = True
        t.start()
        t.join()


if __name__ == '__main__':
    girl = Girl13()
    girl.main()