
Implementing a high-concurrency crawler with Tornado

The example below crawls the Tornado documentation site using a producer-consumer pattern: a tornado.queues.Queue holds the URLs still to visit, and a small pool of coroutine workers pulls from it, fetching pages concurrently with AsyncHTTPClient.

from urllib.parse import urljoin

from bs4 import BeautifulSoup
from tornado import gen, httpclient, ioloop, queues

base_url = "http://www.tornadoweb.org/en/stable/"
concurrency = 3
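# Aside: `concurrency` caps the number of worker coroutines started below;
# Tornado's default AsyncHTTPClient separately caps in-flight requests at
# max_clients=10, tunable via AsyncHTTPClient.configure(None, max_clients=...).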

async def get_url_links(url):
    response = await httpclient.AsyncHTTPClient().fetch(url)
    html = response.body.decode("utf8")
    soup = BeautifulSoup(html, "html.parser")
    # resolve relative hrefs against the site root
    links = [urljoin(base_url, a.get("href")) for a in soup.find_all("a", href=True)]
    return links


async def main():
    seen_set = set()
    q = queues.Queue()

    async def fetch_url(current_url):
        # producer: parse one page and enqueue every in-site link it finds
        if current_url in seen_set:
            return
        print("fetching: {}".format(current_url))
        seen_set.add(current_url)
        next_urls = await get_url_links(current_url)
        for new_url in next_urls:
            # only follow links that stay on the target site
            if new_url.startswith(base_url):
                await q.put(new_url)

    async def worker():
        # consumer: pull URLs from the queue until a None sentinel arrives
        async for url in q:
            if url is None:
                return
            try:
                await fetch_url(url)
            except Exception as e:
                print("exception: {}".format(e))
            finally:
                q.task_done()

    # seed the queue with the start URL
    await q.put(base_url)

    # start a fixed pool of worker coroutines
    workers = gen.multi([worker() for _ in range(concurrency)])
    await q.join()  # block until every queued URL has been processed

    # one None sentinel per worker shuts the pool down cleanly
    for _ in range(concurrency):
        await q.put(None)
    await workers


if __name__ == "__main__":
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(main)
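
Two knobs worth adding in practice are back-pressure and timeouts: with default settings, one slow page can occupy a worker indefinitely, and the queue can grow without bound on link-heavy sites. Below is a minimal hardening sketch (the fetch_page helper and the max_queue_size / request_timeout_secs names are illustrative choices, not part of the original code):

from tornado import httpclient, queues

max_queue_size = 100       # q.put() blocks once this many URLs are waiting
request_timeout_secs = 10  # abort a fetch after 10 seconds

q = queues.Queue(maxsize=max_queue_size)

async def fetch_page(url):
    # fetch() forwards extra keyword arguments to HTTPRequest,
    # so the timeout applies to this single request
    response = await httpclient.AsyncHTTPClient().fetch(
        url, request_timeout=request_timeout_secs
    )
    return response.body.decode("utf8")

With a bounded queue, the producer's await q.put(new_url) pauses automatically whenever the workers fall behind, which keeps memory usage flat.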