【Python】asyncio非同步爬蟲
阿新 • • 發佈:2019-01-24
抓取圖片URL
# -*- coding: utf-8 -*-
import asyncio
import aiohttp
from pyquery import PyQuery as pq
urls = ["http://www.mzitu.com/page/{}/".format(i) for i in range(1, 157)]
# 限制併發數為5個
semaphore = asyncio.Semaphore(5)
f = open("img_url.txt", "w")
async def get_html(url):
ck = """Hm_lvt_dbc355aef238b6c32b43eacbbf161c3c=1507966069,1509850072,1509851337,1509851651; Hm_lpvt_dbc355aef238b6c32b43eacbbf161c3c=1509851653"""
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
"Referer": url,
"Cookie": ck,
"Host": "www.mzitu.com"
}
async with semaphore:
async with aiohttp.ClientSession() as session:
async with session.get(url, headers=headers) as html:
# print(resp.status)
response = await html.text(encoding="utf-8")
# print(response)
return response
async def parse(url):
html = await get_html(url)
doc = pq(html)
img_urls = doc(".postlist ul li a img" ).items()
for img_url in img_urls:
url = img_url.attr("data-original")
f.write(url + '\n')
loop = asyncio.get_event_loop()
tasks = [parse(url) for url in urls]
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
f.close()
下載圖片
# -*- coding: utf-8 -*-
import asyncio
import aiohttp
import aiofiles
import time
urls = []
with open("img_url.txt", "r") as f:
for line in f:
urls.append(line.strip())
# print(urls)
# 限制併發數為5個
semaphore = asyncio.Semaphore(5)
async def download(url):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
"Referer": 'http://i.meizitu.net'
}
async with semaphore:
async with aiohttp.ClientSession() as session:
async with session.get(url, headers=headers) as html:
print(html.status)
img = await html.read()
fname = str(time.time()) + '.jpg'
fp = await aiofiles.open(fname, 'wb')
await fp.write(img)
return True
loop = asyncio.get_event_loop()
tasks = [download(url) for url in urls]
loop.run_until_complete(asyncio.wait(tasks))
loop.close()