python爬蟲練手之鬥圖啦
阿新 • • 發佈:2019-01-11
網際網路時代,難免會和別人在線上聊天,而現在的年輕人吶!一言不合就開始鬥圖!我難道就默默看著別人裝逼嗎?NO!拒絕! 所以呢藉此機會我們找個表情網站,爬一波圖片啦
由於網站結構比較簡單,沒有非同步載入,直接從 HTML 原始碼就能找到所需的資訊,所以就不做詳細分析~
# coding:utf-8
import os
from multiprocessing import Pool  # not used in the code shown here; kept as-is

import requests
from lxml import html


class doutula():
    """Scraper that downloads the image sets listed on doutula.com.

    Workflow: ``run`` fetches one listing page, collects the link of every
    image set on it, then for each set reads the title and image URLs and
    saves the images into a directory named after the title (created under
    the current working directory).
    """

    base_url = 'https://www.doutula.com/'
    headers = {
        # Only advertise encodings that requests decodes out of the box;
        # offering "sdch"/"br" can yield a body that cannot be decoded.
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'zh-CN,zh;q=0.8',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36'
    }
    # Seconds to wait on the server; without a timeout a stalled connection
    # would block the script forever.
    timeout = 10

    def get_selector(self, url):
        """Fetch *url* and return the response body parsed as an lxml tree."""
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        return html.fromstring(response.text)

    def get_page_link(self, num):
        """Return the entry links of every image set on listing page *num*."""
        # base_url ends with "/", so strip it to avoid "//article/..." URLs.
        now_url = "{}/article/list/?page={}".format(self.base_url.rstrip('/'), num)
        selector = self.get_selector(now_url)
        return list(selector.xpath('//ul[@class="list-group"]/a/@href'))

    def get_page_detail(self, url):
        """Read the detail page at *url*.

        Returns a ``(title, image_links)`` tuple. ``title`` is ``None`` when
        the page has no title node, so callers can skip the page instead of
        crashing with an ``IndexError``.
        """
        selector = self.get_selector(url)
        # Title and image sources of the detail page.
        titles = selector.xpath('//li[@class="list-group-item"]/h3/blockquote/a/text()')
        pic_link = selector.xpath('//div[@class="artile_des"]/table/tbody/tr/td/a/img/@src')
        title = titles[0] if titles else None
        return title, list(pic_link)

    def Make_dir(self, title):
        """Create a directory named *title* under the current directory.

        Returns ``True`` when the directory was created and ``False`` when
        a path with that name already exists (nothing is created then).
        """
        future_dir = os.path.join(os.path.abspath('.'), title)
        if os.path.exists(future_dir):
            print(u'資料夾已存在,跳過')
            return False
        os.mkdir(future_dir)
        print(title, u'資料夾建立完成')
        return True

    @staticmethod
    def _safe_name(title):
        """Return *title* trimmed and free of path separators."""
        cleaned = title.strip()
        for separator in ('/', '\\'):
            cleaned = cleaned.replace(separator, '_')
        return cleaned

    def _absolute_url(self, src):
        """Turn an image ``src`` attribute into a full URL."""
        if src.startswith('//'):
            # Protocol-relative URL: only the scheme is missing.
            return "https:{}".format(src)
        if src.startswith('/'):
            # Site-relative path: resolve against the site root.
            return "{}{}".format(self.base_url.rstrip('/'), src)
        return src

    def down_load(self, page_info):
        """Download every image of one set.

        *page_info* is the ``(title, image_links)`` pair produced by
        ``get_page_detail``. Images are saved as ``1.jpg``, ``2.jpg``, ...
        inside a directory named after the title. The set is skipped when
        it has no usable title or when its directory already exists.
        """
        title, pic_link = page_info[0], page_info[1]
        title = self._safe_name(title) if title else ''
        if not title:
            print('No title found, skipping this page')
            return
        if not self.Make_dir(title):
            return

        target_dir = os.path.join(os.path.abspath('.'), title)
        for count, src in enumerate(pic_link, 1):
            now_path = os.path.join(target_dir, "{}.jpg".format(count))
            print(now_path)
            # Fetch before opening the file so a failed request neither
            # leaves an empty file nor stores an error page as an image.
            try:
                response = requests.get(self._absolute_url(src),
                                        headers=self.headers,
                                        timeout=self.timeout)
                response.raise_for_status()
            except requests.RequestException as err:
                print('Failed to download {}: {}'.format(src, err))
                continue
            with open(now_path, 'wb') as f:
                f.write(response.content)

    def run(self, num=1):
        """Download every image set found on listing page *num*."""
        for link in self.get_page_link(num):
            self.down_load(self.get_page_detail(link))


if __name__ == '__main__':
    dt = doutula()
    dt.run(1)