爬取圖片
阿新 • • 發佈:2018-11-26
fun lis fin tip tro btn apply res %s
import requests # 模塊導入的倆種方法
from multiprocessing import Pool
import re
def get(url):
    """Fetch a list page and return its HTML as text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as GBK, or None when the server answers
        with any status other than 200.
    """
    # Without a timeout requests waits indefinitely on a stalled server,
    # which would tie up the pool worker running this call.
    ret = requests.get(url, timeout=10)
    if ret.status_code != 200:
        return None
    # A single byte outside GBK should not discard the whole page, so
    # undecodable bytes are replaced with U+FFFD instead of raising.
    return ret.content.decode('gbk', errors='replace')
def call_back(arg):
    """Extract picture entries from a page, then download and save each one.

    Args:
        arg: HTML text of a list page as returned by ``get``, or None when
            that fetch did not yield a 200 response.

    Returns:
        A list of dicts with the keys 'png', 'name' and 'place', one per
        match of the module-level pattern ``com``. The list is empty when
        ``arg`` is None or an empty string.
    """
    # get() returns None for a non-200 response; com.finditer(None) would
    # raise TypeError, so bail out before touching the pattern.
    if not arg:
        return []

    dict_lst = [
        {
            'png': match.group('png'),
            'name': match.group('name'),
            'place': match.group('place'),
        }
        for match in com.finditer(arg)
    ]

    for entry in dict_lst:
        # One failed download or unwritable file name must not abort the
        # remaining pictures of this page.
        try:
            res = subget(entry['png'])
            if res is None:
                # subget() yields None when the image request was not a 200.
                continue
            write_func(entry['name'], entry['place'], res)
        except (requests.RequestException, OSError) as exc:
            print('skipped %s: %s' % (entry['png'], exc))
    return dict_lst
def subget(url):
    """Download one picture and return its raw bytes.

    Args:
        url: Picture address scraped from a list page; either an absolute
            URL or a path relative to http://www.xiaohuar.com.

    Returns:
        The response body as bytes, or None when the status is not 200.
    """
    from urllib.parse import urljoin

    # urljoin keeps absolute URLs untouched and resolves relative ones
    # against the site root. The previous substring test for 'https'
    # prepended the host to absolute http:// URLs and requested relative
    # paths that merely contained 'https' without any host at all.
    full_url = urljoin('http://www.xiaohuar.com', url)

    # A timeout keeps a stalled image server from blocking the caller forever.
    ret = requests.get(full_url, timeout=10)
    if ret.status_code == 200:
        return ret.content
    return None
def write_func(path, place, picture, directory=None):
    """Save picture bytes to a file named '<path>-<place>.png'.

    Args:
        path: First component of the file name.
        place: Second component of the file name.
        picture: Raw image bytes to write, or None when there is nothing
            to save (for example after a failed download).
        directory: Folder to write into. When omitted, the original
            hard-coded Windows folder is used unchanged.

    Returns:
        None.
    """
    import os

    # Opening the file before checking would leave an empty .png behind
    # and then fail with TypeError on f.write(None).
    if picture is None:
        return

    if directory is None:
        target = r'E:\text1\爬蟲\text_png\%s-%s.png' % (path, place)
    else:
        target = os.path.join(directory, '%s-%s.png' % (path, place))

    with open(target, 'wb') as f:
        f.write(picture)
# URL pattern of the list pages to crawl:
#   http://www.xiaohuar.com/list-1-0.html
#   http://www.xiaohuar.com/list-1-43.html
if __name__ == '__main__':
    # Pattern for one picture entry on a list page; it captures the named
    # groups 'png', 'name' and 'place'. re.S lets '.' span line breaks.
    # call_back() reads this as a module-level global.
    com = re.compile(
        '<div class="item_t">(?:.*?)src="(?P<png>.*?)"(?:.*?)<span class="price">(?P<name>.*?)</span>(?:.*?)'
        '<a href="http://www.xiaohuar.com/" class="img_album_btn">(?P<place>.*?)</a>', re.S)

    def report_failure(exc):
        """Print an exception raised while a worker was fetching a page."""
        print('page fetch failed: %r' % (exc,))

    pool = Pool(3)
    for i in range(40):
        # Without error_callback an exception raised inside get() is
        # discarded and the page disappears without any trace.
        pool.apply_async(
            get,
            args=('http://www.xiaohuar.com/list-1-%s.html' % i,),
            callback=call_back,
            error_callback=report_failure,
        )
    # No more tasks will be submitted; wait for the queued ones to finish.
    pool.close()
    pool.join()
缺點:爬取的速度慢,最多17個網頁(好無奈)
爬取圖片