爬取妹子圖片
阿新 • • 發佈:2018-11-08
[] 爬取 user pict html pen chdir star download
學習練習爬蟲的時候寫了一個爬取妹子的小程序玩玩。
from bs4 import BeautifulSoup
import requests
import os
from threading import Thread

'''
soup.find(name, attrs, recursive, string, **kwargs)
    name:  which tag to search for
    attrs: attributes of the tag, e.g. class
Note:
    BeautifulSoup() returns <class 'bs4.BeautifulSoup'>
    find()           returns <class 'bs4.element.Tag'>
    find_all()       returns <class 'bs4.element.ResultSet'>
    A ResultSet cannot be searched again with find/find_all.
'''

# Spoofed browser User-Agent shared by every request.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1 rv: 2.0.1) Gecko/20100101 Firefox/4.0.1'

# Destination folder for the downloaded images.
SAVE_DIR = 'C:/Users/asus/Desktop/code/9.爬蟲/簡單web爬蟲/picture'


def first_page(url):
    '''
    Enter a gallery from its front-page link, e.g.
        http://www.mzitu.com/155036
        http://www.mzitu.com/155036/2
    find the largest page number and build one URL per page.

    Returns:
        (pic_urls, title): list of per-page URLs and the gallery title.
    '''
    headers = {'User-Agent': USER_AGENT}
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'html.parser')
    # Max page count; index 10 is specific to the site's current layout
    # and may break if the page structure changes -- TODO confirm.
    pic_max = soup.find_all('span')[10].text
    # Gallery title, used later to locate the <img> tag by its alt text.
    title = soup.find('h2', class_='main-title').text
    pic_urls = [url + '/' + str(i) for i in range(1, int(pic_max) + 1)]
    return pic_urls, title


def get_link(url):
    '''
    From each per-page link (http://www.mzitu.com/155036/2) extract the
    actual image URL (http://i.meizitu.net/2018/10/18b01.jpg).

    Returns:
        dict mapping image URL -> file name (last path component).
    '''
    headers = {'User-Agent': USER_AGENT}
    link_dict = {}
    pic_urls, title = first_page(url)
    for pic_url in pic_urls:
        html = requests.get(pic_url, headers=headers)
        mess = BeautifulSoup(html.text, 'html.parser')
        # The gallery image carries the gallery title as its alt text.
        link = mess.find('img', alt=title)['src']
        pic_name = link.split('/')[-1]
        link_dict[link] = pic_name
    return link_dict


def download(url):
    '''Download every image of the gallery at *url* into SAVE_DIR.'''
    link_dict = get_link(url)
    for link, pic_name in link_dict.items():
        headers = {
            'User-Agent': USER_AGENT,
            # The site has hotlink protection, so tell the server which
            # page we are coming from.
            'Referer': link,
        }
        html = requests.get(link, headers=headers)
        # Build the full path instead of os.chdir(): chdir mutates
        # process-wide state, which is unsafe with multiple threads.
        with open(os.path.join(SAVE_DIR, pic_name), 'wb') as fp:
            fp.write(html.content)


if __name__ == '__main__':
    # Gallery links are hard-coded for now instead of being crawled.
    urls = [
        'http://www.mzitu.com/47580',
        'http://www.mzitu.com/108003',
        'http://www.mzitu.com/48342',
    ]
    t_list = []  # was missing in the original -> NameError on append
    for url in urls:
        t = Thread(target=download, args=(url,))  # one crawler thread per gallery
        t_list.append(t)
    for t in t_list:
        t.start()
    for t in t_list:
        t.join()
爬取妹子圖片