# Requests、Bs4、多執行緒爬取全站圖片
#!/usr/bin/env python
# coding=utf-8
# author:Charles
# datetime:2021/03/23/0004 11:26
# software: meizitu
import requests, os, shutil
from bs4 import BeautifulSoup
from multiprocessing import Pool
# 封裝get方法
# Wrapped HTTP GET helper.
def geta(url, params=None, header=None, timeout=30):
    """Fetch *url* and report the outcome as a dict.

    Returns {'success': False} on any failure, or
    {'success': True, 'content': <raw response body bytes>} on success.

    params  -- optional query parameters applied to the session.
    header  -- optional HTTP headers applied to the session.
    timeout -- seconds before the request is abandoned (new parameter,
               defaulted so existing callers are unaffected; previously
               a stalled server could hang the worker forever).
    """
    session = requests.session()
    ret = {'success': False}
    try:
        if params:
            session.params = params
        if header:
            session.headers = header
        # Timeout prevents an unresponsive host from blocking a pool worker.
        msg = session.get(url, timeout=timeout)
        # Explicit form of the original truthiness test: requests Response
        # objects are truthy exactly when status_code < 400.
        if msg.status_code < 400:
            ret['success'] = True
            ret['content'] = msg.content
    except Exception as e:
        # BUG FIX: `except Exception, e` and `e.message` are Python-2-only
        # and deprecated even there; this form works on Python 2 and 3.
        print(e)
    finally:
        session.close()
    return ret
# 主頁面
# Listing-page crawler: walks the category index pages and fans out
# one pool task per gallery found.
def meizitu(kind, page):
    """Crawl *page* listing screens of category *kind*.

    kind -- URL path fragment selecting the category (see `category` map).
    page -- number of listing screens to walk (string or int).
    Returns False if a listing page cannot be fetched, otherwise None.
    Side effect: dispatches detail() jobs to a process pool.
    """
    # 10 worker processes download galleries concurrently.
    pool = Pool(10)
    try:
        for p in xrange(1, int(page) + 1):
            pg = '/page/%s/' % p
            url = 'mzitu%s%s' % (kind, pg)
            header = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
            }
            ret = geta(url=url, header=header)
            if not ret['success']:
                # BUG FIX: the original returned here without closing the
                # pool, leaking worker processes; finally below handles it.
                return False
            soup = BeautifulSoup(ret['content'], 'lxml')
            # <ul id="pins"> holds the gallery grid; find_all never yields
            # None, the guard is kept only for parity with the original.
            for pins in soup.find_all('ul', {"id": "pins"}):
                if pins is None:
                    continue
                spans = BeautifulSoup(str(pins), 'lxml').find_all('span')
                for g in BeautifulSoup(str(spans), 'lxml').find_all('a'):
                    href = g['href']                          # gallery link
                    title = g.text.decode('unicode_escape')   # gallery title
                    # Asynchronous (non-blocking) dispatch; the synchronous
                    # alternative would be: detail(href, title)
                    pool.apply_async(detail, args=(href, title))
            print('*********************啦啦啦,已爬取%s螢幕啦*********************' % p)
        print('需要爬取的全站圖片寫入完成!!')
    finally:
        # Shut the pool down and wait for the workers so the main process
        # does not exit before they finish.
        pool.close()
        pool.join()
# 詳細頁面
# Gallery downloader: fetches every image page of one gallery.
def detail(url, titles):
    """Download all images of the gallery at *url* into D:/meizitu/<title>.

    url    -- gallery base URL; image pages are url/1 .. url/<max_page>.
    titles -- raw gallery title; characters illegal in Windows folder
              names are stripped before use.
    Returns False when the page count or a listing page cannot be fetched.
    """
    pages = max_page(url)
    if pages is False:
        # BUG FIX: max_page returns False on failure and int(False) == 0
        # used to make the loop silently empty; fail explicitly instead.
        return False
    num = int(pages)
    # Strip characters Windows does not allow in directory names.
    title = titles.strip().replace('?', '').replace(':', '').replace(',', '').replace('@', '')
    path = 'D:/meizitu/'
    print(u'檔案存放地址: ' + path + title)
    if os.path.exists(path + title):
        raw_input('資料夾已經存在,按任意鍵刪除此資料夾!!!')
        shutil.rmtree(path + title)
        raw_input('資料夾已經刪除,按任意鍵執行爬取!!!')
    os.makedirs(path + title)
    os.chdir(path + title)
    # BUG FIX: xrange(1, num) skipped the final page; pages run
    # 1..num inclusive, so the upper bound must be num + 1.
    for i in xrange(1, num + 1):
        urls = url + '/' + str(i)
        ret = geta(url=urls)
        if not ret['success']:
            return False
        soup = BeautifulSoup(ret['content'], 'lxml')
        main_image = soup.find('div', {'class': 'main-image'})
        img = BeautifulSoup(str(main_image), 'lxml').find('img')
        detail_href = img['src']  # direct image URL
        header = {
            # Referer presumably required by the image host to serve the
            # file — TODO confirm against the site.
            'Referer': 'mzitu/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
        ret = geta(url=detail_href, header=header)
        if ret['success']:
            # url[21:] slices the gallery id off a fixed-length URL prefix.
            with open('%s-%s.jpg' % (url[21:], i), 'wb') as f:
                f.write(ret['content'])
            print('已爬取完成編號:%s----第%s張' % (url[21:], i))
    print('編號為:%s===================>已經爬取完成!!!' % url[21:])
# 詳細頁面最大張數
# Maximum number of image pages in one gallery.
def max_page(url):
    """Return the gallery's highest page number (as scraped text).

    The pagination bar's <span> texts end with the last page number in
    second-to-last position. Returns False when the page cannot be
    fetched or the markup lacks enough spans (previously an IndexError).
    """
    ret = geta(url=url)
    if not ret['success']:
        return False
    soup = BeautifulSoup(ret['content'], 'lxml')
    nav = soup.find('div', {'class': 'pagenavi'})
    spans = BeautifulSoup(str(nav), 'lxml').find_all('span')
    # Renamed from `list`, which shadowed the builtin.
    pages = [s.text for s in spans]
    if len(pages) < 2:
        # ROBUSTNESS: unexpected markup used to raise IndexError here.
        return False
    return pages[-2]
if __name__ == '__main__':
    # Report the host platform (the download path D:/meizitu/ is
    # Windows-style, so this is a hint to the user).
    if os.name == 'nt':
        print(u'你正在使用win平臺')
    else:
        print(u'你正在使用linux平臺')
    # Menu key -> category URL path fragment on the site.
    category = {'1': '', '2': '/xinggan/', '3': '/japan/', '4': '/taiwan/', '5': '/mm/'}
    # BUG FIX: the prompt literal contained a raw line break (a syntax
    # error); the trailing newline is now properly escaped.
    num = raw_input('請選擇您要爬取的妹子圖種類: 1.Index 2.Sex 3.Japan 4.TaiWan 5.Pure\n')
    if num in ('1', '2', '3', '4', '5'):
        page = raw_input('輸入爬取幾螢幕:')
        if page.isdigit():
            meizitu(category[num], page)
        else:
            raw_input('輸入錯誤!按任意鍵退出!!!')
    else:
        raw_input('輸入錯誤!按任意鍵退出!!!')