程式人生 >> python爬蟲練手之鬥圖啦

python爬蟲練手之鬥圖啦

網際網路時代,難免會和別人在線上聊天,而現在的年輕人吶!一言不合就開始鬥圖!我難道就默默看著別人裝逼嗎?NO!拒絕! 所以呢藉此機會我們找個表情網站,爬一波圖片啦

2

由於網站結構比較簡單,沒有非同步載入,直接從html就能查詢資訊啦,所以就不做詳細分析~

#coding:utf-8
import requests
import os
from lxml import html
from multiprocessing import Pool
class doutula():
    """Crawler for doutula.com meme packs.

    Walks a listing page, visits each pack's detail page, and downloads
    every image of a pack into a folder named after the pack's title.
    """

    base_url = 'https://www.doutula.com/'
    # Minimal browser-like headers; the site serves plain HTML (no async loading),
    # so a simple GET with a UA string is enough.
    headers={
            'accept-encoding':'gzip, deflate, sdch, br',
            'accept-language':'zh-CN,zh;q=0.8',
            'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36'
    }

    def get_selector(self, url):
        """Fetch *url* and return an lxml HTML element for XPath queries."""
        # timeout guards against the request hanging forever on a dead connection
        return html.fromstring(requests.get(url, headers=self.headers, timeout=10).text)

    def get_page_link(self, num):
        """Return the list of pack detail-page URLs found on listing page *num*."""
        # base_url already ends with '/', so the format string must not add another
        # (the original produced 'doutula.com//article/...').
        now_url = "{}article/list/?page={}".format(self.base_url, num)
        selector = self.get_selector(now_url)
        return list(selector.xpath('//ul[@class="list-group"]/a/@href'))

    def get_page_detail(self, url):
        """Return a (title, image_links) tuple scraped from a detail page.

        Raises IndexError if the page has no title node.
        """
        selector = self.get_selector(url)
        title = selector.xpath('//li[@class="list-group-item"]/h3/blockquote/a/text()')[0]
        # The original also built a pic_list copy of this result but returned
        # pic_link anyway; the copy was dead code and has been removed.
        pic_link = selector.xpath('//div[@class="artile_des"]/table/tbody/tr/td/a/img/@src')
        return title, pic_link

    def Make_dir(self, title):
        """Create ./<title> if missing.

        Returns True when the folder was created, False when it already
        existed (caller uses this to skip already-downloaded packs).
        """
        future_dir = "{}/{}".format(os.path.abspath('.'), title)
        if os.path.exists(future_dir):
            print(u'資料夾已存在,跳過')
            return False
        os.mkdir(future_dir)
        print(title, u'資料夾建立完成')
        return True

    def down_load(self, page_info):
        """Download every image of one pack into a folder named after its title.

        *page_info* is the (title, links) tuple produced by get_page_detail.
        Skips the whole pack when the target folder already exists.
        """
        title, pic_link = page_info
        if not self.Make_dir(title):
            return
        for count, link in enumerate(pic_link, start=1):
            now_path = "{}/{}/{}.jpg".format(os.path.abspath('.'), title, count)
            print(now_path)
            # img srcs are protocol-relative ("//ws1...."), so prepend the scheme
            page_link = "https:{}".format(link)
            with open(now_path, 'wb') as f:
                f.write(requests.get(page_link, headers=self.headers, timeout=10).content)

    def run(self, num=1):
        """Crawl listing page *num* and download every pack found on it.

        In the original source this method was accidentally dedented to
        module level, so dt.run(1) raised AttributeError; it belongs here.
        """
        for link in self.get_page_link(num):
            self.down_load(self.get_page_detail(link))

# NOTE(review): this function takes `self` yet sits at module level — the
# indentation was almost certainly lost in a copy/paste and it was meant to
# be a method of `doutula`. As written, `dt.run(1)` in the __main__ block
# raises AttributeError because the class itself defines no `run`.
def run(self,num=1):
    # Crawl listing page *num*: fetch each pack's detail info and download it.
    for i in self.get_page_link(num):
        self.down_load(self.get_page_detail(i))


if __name__ == '__main__':
    # Script entry point: scrape listing page 1 and download its packs.
    spider = doutula()
    spider.run(1)

1