python3 爬取 飛G圖girl13.com 圖片
阿新 • • 發佈:2019-01-08
python3 爬取 飛G圖girl13.com 圖片
簡介:爬取 http://www.girl13.com 圖片
注意:程式中的 self.time = 2 用來設定請求的間隔時間(預設為 2 秒),以防止 IP 被封鎖。
import os
import time
import requests
import threading
from bs4 import BeautifulSoup
class Girl13(object):
    """Crawler that downloads the images listed on http://www.girl13.com.

    Typical use is ``Girl13().main()``: it checks that the home page is
    reachable, works out how many listing pages exist, then walks every
    listing page and saves each image it finds into an ``image`` directory
    under the current working directory.
    """

    # Seconds to wait for the server before a request is treated as failed.
    # Without a timeout a stalled connection would block the crawler forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Create the shared HTTP session and the default request headers."""
        self.session = requests.session()
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.10 Safari/537.36"
        }
        # Pause, in seconds, inserted after each listing page has been
        # processed (see main_), to keep the request rate low.
        self.time = 2

    def get_status(self, url):
        """Fetch *url* with the shared session.

        Returns:
            The response object when the server answers with HTTP 200,
            otherwise ``False`` (non-200 status, timeout or connection error).
        """
        try:
            response = self.session.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException:
            # Connection problems are reported the same way as a bad status
            # so that every caller only has to handle the ``False`` result.
            print("ERROR: 網路連線失敗!")
            return False
        if response.status_code == 200:
            return response
        print("ERROR: 網路連線失敗!")
        return False

    def get_index(self, url):
        """Request the home page and report whether it could be reached.

        Returns:
            ``True`` when the page was fetched successfully, else ``False``.
        """
        if self.get_status(url):
            print("首頁,建立連線...")
            return True
        print("ERROR: 首頁訪問失敗!")
        return False

    def parse_html(self, url):
        """Collect the images referenced by the listing page at *url*.

        Returns:
            A dict mapping each image file name (the basename of its URL) to
            the image URL, or ``None`` when the page could not be fetched.
        """
        response = self.get_status(url)
        if not response:
            return None
        title_url = {}
        html = BeautifulSoup(response.text, "html5lib")
        for column in html.select("#loop-square .column-post"):
            heading = column.select_one(".entry-title")
            image = column.select_one(".entry-content.cf img")
            img_url = image.get("src") if image is not None else None
            # Posts without a title are skipped, as before.  Posts without
            # an image URL are skipped too: there is nothing to download and
            # os.path.basename(None) would raise TypeError.
            if heading is None or not heading.text or not img_url:
                continue
            filename = os.path.basename(img_url)
            if not filename:
                # A URL ending in "/" yields no usable file name.
                continue
            title_url[filename] = img_url
        return title_url

    def get_last_page(self, url):
        """Read the number of the last listing page from the pagination bar.

        Returns:
            The last page number as an ``int``, or ``None`` when the page
            could not be fetched or shows no numeric pagination link.
        """
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        pages = html.select(".page-navigator li > a")
        # Walk backwards and take the first numeric label.  This skips a
        # trailing "next page" link whatever its exact wording is, and copes
        # with an empty or very short pagination bar without raising.
        for link in reversed(pages):
            label = link.text.strip()
            if label.isdecimal():
                return int(label)
        return None

    @staticmethod
    def next_page(last_page):
        """Yield the URL of every listing page from 1 to *last_page*."""
        for i in range(1, last_page + 1):
            yield "http://www.girl13.com/page/{}".format(i)

    def download(self, path, url):
        """Download *url* and store its bytes in the file *path*.

        The request is made before the file is opened, so a failed download
        neither raises nor leaves an empty file behind.
        """
        print(url)
        response = self.get_status(url)
        if not response:
            return
        with open(path, "wb") as f:
            f.write(response.content)

    def main_(self):
        """Run the whole crawl: connect, find the page count, download all."""
        # Home page: make sure the site is reachable before doing anything.
        if not self.get_index("http://www.girl13.com"):
            return None
        # The first listing page carries the pagination bar.
        last_page = self.get_last_page("http://www.girl13.com/page/1")
        if not last_page:
            return None
        image_dir = os.path.abspath(os.path.join(os.getcwd(), "image"))
        os.makedirs(image_dir, exist_ok=True)
        for page_url in self.next_page(last_page):
            title_url = self.parse_html(page_url)
            # parse_html returns None when the page could not be fetched;
            # such a page is skipped instead of aborting the whole crawl.
            if title_url:
                thread_list = [
                    threading.Thread(
                        target=self.download,
                        args=(os.path.join(image_dir, title), img_url),
                    )
                    for title, img_url in title_url.items()
                ]
                # One thread per image of the page; wait for all of them
                # before moving on to the next page.
                for t in thread_list:
                    t.start()
                for t in thread_list:
                    t.join()
            time.sleep(self.time)

    def main(self):
        """Run :meth:`main_` in a daemon thread and wait for it to finish."""
        t = threading.Thread(target=self.main_)
        t.daemon = True
        t.start()
        t.join()
# Script entry point: build the crawler and start the crawl right away.
if __name__ == '__main__':
    Girl13().main()