程式人生 > 爬取妹子圖片

爬取妹子圖片

[標籤] 爬取 user pict html open chdir start download

學習練習爬蟲的時候寫了一個爬取妹子的小程序玩玩。

from bs4 import BeautifulSoup
import requests
import os
from threading import Thread

‘‘‘
    soup.find( name , attrs , recursive , string , **kwargs )函數
        name:查找哪一個標簽
        attrs:標簽裏的指定那個參數,比如class
    註意:
        BeautifulSoup()返回的類型是<class ‘bs4.BeautifulSoup‘>
        find()返回的類型是<class ‘bs4.element.Tag‘>
        find_all()返回的類型是<class ‘bs4.element.ResultSet‘>
        <class ‘bs4.element.ResultSet‘>不能再進行find/find_all操作
‘‘‘ def first_page(url): ‘‘‘ 從主頁的圖片鏈接進去匹配 http://www.mzitu.com/155036 http://www.mzitu.com/155036/2 找到最大頁數,循環訪問 ‘‘‘ headers = { UserAgent: Mozilla/5.0 (Windows NT 6.1 rv: 2.0.1) Gecko/20100101 Firefox/4.0.1, } html = requests.get(url,headers=headers) soup
= BeautifulSoup(html.text,html.parser) girl_url_list = soup.find(div,class_=main-image).find_all(a)[href] #最大頁數 pic_max = soup.find_all(span)[10].text #圖片標題,soup對象 title = soup.find(h2,class_=main-title).text pic_urls = [] for i in range(1,int(pic_max)+1): pic_url
= url +/+ str(i) pic_urls.append(pic_url) return pic_urls,title def get_link(url): ‘‘‘ 從上層的http://www.mzitu.com/155036/2鏈接中匹配到圖片鏈接 http://i.meizitu.net/2018/10/18b01.jpg ‘‘‘ headers = { UserAgent: Mozilla/5.0 (Windows NT 6.1 rv: 2.0.1) Gecko/20100101 Firefox/4.0.1, } link_dict = {} res = first_page(url) print(res) for pic_url in res[0]: html = requests.get(pic_url, headers=headers) mess = BeautifulSoup(html.text,html.parser) link = mess.find(img, alt=res[-1])[src] # link為<class ‘bs4.element.Tag‘>對象 pic_name = link.split(/)[-1] link_dict[link] = pic_name return link_dict def download(url): ‘‘‘ 從圖片地址下載圖片 ‘‘‘ link_dict = get_link(url) for link in link_dict: headers = { UserAgent: Mozilla/5.0 (Windows NT 6.1 rv: 2.0.1) Gecko/20100101 Firefox/4.0.1, Referer:link #由於網站有防盜鏈,所以要告訴服務器我是從哪個頁面鏈接過來的 } html = requests.get(link,headers=headers) os.chdir(C:/Users/asus/Desktop/code/9.爬蟲/簡單web爬蟲/picture) #選擇保存文件夾 with open(link_dict[link],‘wb‘) as fp: fp.write(html.content) if __name__ == __main__:
  #這裏的圖片主頁鏈接由於沒爬取,先手動添加,有點low。。。 urls
= [http://www.mzitu.com/47580,http://www.mzitu.com/108003,http://www.mzitu.com/48342‘]

  for url in urls: t = Thread(target=download,args=(url,)) #開啟線程爬取 t_list.append(t)   for t in t_list: t.start()   for t in t_list: t.join()

爬取妹子圖片