1. 程式人生 > 其它 >web爬蟲02-多執行緒爬蟲

web爬蟲02-多執行緒爬蟲

多執行緒爬蟲

原理

利用CPU和IO可以同時執行的原理,讓CPU不會乾巴巴地等待IO的完成

#網站:還是豆瓣250 https://movie.douban.com/top250
import requests
from lxml import etree
import time
from threading import Thread
import codecs


# 時間裝飾器
def timer(func):
    """Decorator that times each call to *func* and reports the duration.

    The wrapper forwards all positional and keyword arguments to *func*,
    prints the elapsed time, and returns the elapsed time in seconds as a
    float. The return value of *func* itself is discarded, so the decorated
    callable always yields the measured duration.

    Args:
        func: The callable to measure.

    Returns:
        A wrapper with the same name and docstring as *func*.
    """
    # Local import so the block stays self-contained without touching the
    # file-level import list.
    import functools

    @functools.wraps(func)  # keep func.__name__ / __doc__ on the wrapper
    def inner(*args, **kw):
        # perf_counter is monotonic, so the measurement cannot go negative
        # or jump if the system clock is adjusted mid-run.
        start = time.perf_counter()
        func(*args, **kw)
        elapsed = time.perf_counter() - start
        print("-------一共花費時間:{}秒".format(elapsed))
        return elapsed
    return inner


def get_content(url, timeout=10):
    """Download *url* and hand the response text to ``deal_content``.

    The request headers come from the module-level ``headers`` name, which
    this file only assigns inside its ``__main__`` block.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would
            block this call (and any thread running it) indefinitely.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    deal_content(res.text)


def deal_content(content):
    """Parse one HTML page and write the extracted fields to the output file.

    For every ``div.info`` node matched under the ``#wrapper`` list, three
    text values are taken in order: the first title span, the first text
    node of the paragraph (stripped), and the fourth span inside the
    ``star`` div (presumably the rating count -- confirm against the page).
    All values for the page are joined with newlines and written with a
    single call to the module-level file object ``f``.

    Args:
        content: HTML source of the page as text.

    Raises:
        IndexError: If any of the three lookups finds no node for an item.
    """
    tree = etree.HTML(content)
    lines = []
    for info in tree.xpath('//div[@id="wrapper"]//ol//li/div[@class="item"]/div[@class="info"]'):
        title = info.xpath('./div/a/span[1]/text()')[0]
        summary = info.xpath('./div/p/text()')[0].strip()
        votes = info.xpath('./div[2]/div[@class="star"]/span[4]/text()')[0]
        lines.extend((title, summary, votes))
    # One write per page keeps each page's records together in the file.
    page_text = '\n'.join(lines)
    f.write(u'{movies}\n'.format(movies=page_text))


@timer
def multi_thread(urls):
    """Fetch every URL in *urls* concurrently, one thread per URL.

    All threads are created first, then started, then joined, so the call
    returns only after every ``get_content`` worker has finished.

    Args:
        urls: Iterable of page addresses; each is passed to ``get_content``.
    """
    workers = [Thread(target=get_content, args=(url,)) for url in urls]
    for worker in workers:
        worker.start()
    # Join in a separate pass so every thread is already running before
    # the caller starts waiting.
    for worker in workers:
        worker.join()

if __name__ == '__main__':
    # Request headers read by get_content through this module-level name.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
    }
    # Ten list pages, 25 entries each: start offsets 0, 25, ..., 225.
    urls = [
        'https://movie.douban.com/top250?start={}&filter='.format(offset)
        for offset in range(0, 250, 25)
    ]
    # deal_content writes to the module-level name f, so the file must stay
    # open for the whole multi_thread call.
    with codecs.open('moviess', 'wb', encoding='utf-8') as f:
        multi_thread(urls)
花費時間


單執行緒為:3.102s
多執行緒為:0.348s
多執行緒的速度大致為單執行緒的9倍(3.102 ÷ 0.348 ≈ 8.9)