
Requests, Bs4, and a Multiprocessing Pool: Crawling an Entire Site's Images
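The script below crawls a chosen category of the site end to end: it fetches each listing page with requests, pulls the gallery links out with BeautifulSoup, and hands every gallery to a multiprocessing.Pool worker that downloads its images into a per-gallery folder under D:/meizitu/. Note that the code targets Python 2 (raw_input, xrange, print statements).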

    #!/usr/bin/env python
    # coding=utf-8
    # author:Charles
    # datetime:2021/03/23/0004 11:26
    # software: meizitu
    import os
    import shutil
    import requests
    from bs4 import BeautifulSoup
    from multiprocessing import Pool

    # Wrapped GET helper: returns a dict with 'success' and, on success,
    # the raw response body under 'content'
    def geta(url, params=None, header=None):
        session = requests.session()
        ret = {}
        ret['success'] = False
        try:
            if params:
                session.params = params
            if header:
                session.headers = header
            msg = session.get(url)
            if msg:  # a Response is truthy for status codes below 400
                ret['success'] = True
                ret['content'] = msg.content
        except Exception, e:
            print e
        finally:
            if session:
                session.close()
        return ret
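The helper above catches every exception and sets no timeout, so one stalled connection can hang a worker indefinitely. A hardened variant is straightforward; this is a minimal sketch (the geta_safe name and the 10-second default are my own, not from the original):

    import requests

    def geta_safe(url, header=None, timeout=10):
        ret = {'success': False}
        try:
            resp = requests.get(url, headers=header or {}, timeout=timeout)
            if resp.ok:  # status code below 400
                ret['success'] = True
                ret['content'] = resp.content
        except requests.RequestException as e:  # network errors only, not everything
            print(e)
        return ret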

    # Listing pages: walk the chosen category page by page and queue
    # every gallery it links to for download
    def meizitu(kind, page):
        # number of worker processes in the pool
        pool = Pool(10)
        for p in xrange(1, int(page) + 1):
            pg = '/page/%s/' % p
            url = 'mzitu%s%s' % (kind, pg)  # the full site domain is omitted here
            header = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
            }
            ret = geta(url=url, header=header)
            if ret['success'] == False:
                return False
            soup = BeautifulSoup(ret['content'], 'lxml')
            listsoup = soup.find_all('ul', {"id": "pins"})
            for i in listsoup:
                if i is not None:
                    soup1 = BeautifulSoup(str(i), 'lxml')
                    listsoup1 = soup1.find_all('span')
                    soup2 = BeautifulSoup(str(listsoup1), 'lxml')
                    listsoup2 = soup2.find_all('a')
                    for g in listsoup2:
                        href = g['href']  # gallery link
                        title = g.text    # gallery title
                        # print href
                        # synchronous crawl:
                        # detail(href, title)
                        # asynchronous (non-blocking) crawl through the pool:
                        pool.apply_async(detail, args=(href, title))
            print '********************* listing page %s queued *********************' % p
        print 'All requested listing pages have been queued!!'
        # close the pool to new tasks
        pool.close()
        # wait for the worker processes to finish, so the main process
        # does not exit before they do
        pool.join()
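apply_async submits a task and returns immediately, which is why the galleries download in parallel; close() then forbids further submissions and join() blocks until every queued task has finished, so the two must be called in that order. A minimal, self-contained illustration of the pattern (square is a hypothetical worker):

    from multiprocessing import Pool

    def square(n):
        return n * n

    if __name__ == '__main__':
        pool = Pool(4)
        results = [pool.apply_async(square, args=(n,)) for n in range(8)]
        pool.close()  # no further tasks may be submitted
        pool.join()   # wait for every worker to finish
        print([r.get() for r in results])  # [0, 1, 4, 9, 16, 25, 36, 49]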

    # Gallery page: download every image in a single gallery
    def detail(url, titles):
        num = int(max_page(url))
        # strip characters that are awkward in directory names
        title = titles.strip().replace('?', '').replace(':', '').replace(',', '').replace('@', '')
        path = 'D:/meizitu/'
        print u'Save directory: ' + path + title
        if os.path.exists(path + title):
            raw_input('The directory already exists; press any key to delete it!!!')
            shutil.rmtree(path + title)
            raw_input('Directory deleted; press any key to start crawling!!!')
        os.makedirs(path + title)
        os.chdir(path + title)
        for i in xrange(1, num + 1):  # image pages run from 1 to num inclusive
            urls = url + '/' + str(i)
            ret = geta(url=urls)
            if ret['success'] == False:
                return False
            soup = BeautifulSoup(ret['content'], 'lxml')
            listsoup = soup.find('div', {'class': 'main-image'})
            soup1 = BeautifulSoup(str(listsoup), 'lxml')
            listsoup1 = soup1.find('img')
            detail_href = listsoup1['src']  # direct image URL
            # print detail_href
            # the Referer header matters: the image host checks it against hotlinking
            header = {
                'Referer': 'mzitu/',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
            }
            ret = geta(url=detail_href, header=header)
            if ret['success'] == True:
                tupian = ret['content']
                # url[21:] slices the gallery id off the end of the gallery URL
                with open('%s-%s.jpg' % (url[21:], i), 'wb') as f:
                    f.write(tupian)
                print 'Saved gallery %s, image %s' % (url[21:], i)
        print 'Gallery %s ===================> finished!!!' % url[21:]
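The chained replace() calls cover only four characters, but Windows forbids several more in directory names. A more general sanitizer could use a single regular expression; a sketch (safe_name is a hypothetical helper, not in the original):

    import re

    def safe_name(name):
        # drop every character Windows forbids in file and directory names
        return re.sub(r'[\\/:*?"<>|]', '', name).strip()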

    # Read a gallery's image count from its pagination bar
    def max_page(url):
        ret = geta(url=url)
        if ret['success'] == False:
            return False
        soup = BeautifulSoup(ret['content'], 'lxml')
        listsoup = soup.find('div', {'class': 'pagenavi'})
        soup1 = BeautifulSoup(str(listsoup), 'lxml')
        listsoup1 = soup1.find_all('span')
        pages = []
        for i in listsoup1:
            pages.append(i.text)
        # the last <span> is the "next page" button, so the page count
        # is the second-to-last entry
        maxpage = pages[-2]
        return maxpage
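Re-parsing str(tag) through a fresh BeautifulSoup, as max_page and meizitu both do, works but is redundant: Tag objects already expose find, find_all, and CSS selectors. The same page count can be read directly (max_page_css is a hypothetical name; it assumes the same div.pagenavi markup):

    from bs4 import BeautifulSoup

    def max_page_css(html):
        soup = BeautifulSoup(html, 'lxml')
        spans = soup.select('div.pagenavi span')  # every <span> inside the pager
        return spans[-2].text if len(spans) >= 2 else '1'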

    if __name__ == '__main__':
        if os.name == 'nt':
            print(u'You are running on Windows')
        else:
            print(u'You are running on Linux')
        category = {'1': '', '2': '/xinggan/', '3': '/japan/', '4': '/taiwan/', '5': '/mm/'}
        num = raw_input('Choose the category to crawl: 1.Index 2.Sex 3.Japan 4.TaiWan 5.Pure\n')
        if num in category:
            page = raw_input('How many listing pages to crawl: ')
            if page.isdigit():
                meizitu(category[num], page)
            else:
                raw_input('Invalid input! Press any key to exit!!!')
        else:
            raw_input('Invalid input! Press any key to exit!!!')
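Since the script is Python 2 only, a Python 3 port mostly means swapping built-ins: raw_input becomes input, xrange becomes range, print statements become the print() function, and except Exception, e becomes except Exception as e. The closing menu, for instance, would read:

    category = {'1': '', '2': '/xinggan/', '3': '/japan/', '4': '/taiwan/', '5': '/mm/'}
    num = input('Choose the category to crawl: 1.Index 2.Sex 3.Japan 4.TaiWan 5.Pure\n')
    if num in category:
        page = input('How many listing pages to crawl: ')  # then validate with page.isdigit()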