
Crawling Images with Python
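The following script is a small Python 2 image crawler. Starting from http://www.163.com/, it follows links that stay on the 163.com root domain up to a fixed depth, pulls image URLs out of each page with regular expressions, and saves the images to D:\grab using a pool of daemon threads.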

# coding:utf-8
import urllib
import urllib2
import re
import time
import threading
import socket
import urlparse
import datetime
root_domain='163.com'
beginurl = 'http://www.163.com/'
# maximum crawl depth
max_depth=2
socket.setdefaulttimeout(10)
SLEEP_TIME = 1
linkpool = [beginurl]
seedlink = {beginurl:0}
imgpool = []
dueimgpool = []
num_retries=0
lock = threading.Lock()  # defined for shared-state protection, but never used below

# Regular expressions for image URLs, page links, and safe file names
pimg1 = re.compile(r'<img[^<>]+(?:src|original|src2)=["\']{1}([^"\']+)["\']{1}', re.IGNORECASE)
pimg2 = re.compile(r'"image":"([^"\']+)"', re.IGNORECASE)
plink = re.compile(r'<a[^<>]+href=["\']{1}([^"\']+)["\']{1}', re.IGNORECASE)
pfilename = re.compile(r'\W|_')

headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0'}


class Throttle:
    """Delay successive requests to the same domain by at least `delay` seconds."""
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()


throttle = Throttle(SLEEP_TIME)


def download(url, headers, proxy=None, num_retries=0, data=None):
    """Fetch a page, retrying on 5xx responses while retries remain."""
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                return download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None
    return html


def same_root_domain(url):
    domain = urlparse.urlparse(url).netloc
    return root_domain in domain


def grab(url):
    """Parse one page: queue same-domain links and any image URLs found."""
    depth = seedlink[url]
    if depth != max_depth:
        throttle.wait(url)
        html = download(url, headers=headers, num_retries=num_retries)
        imglist = pimg1.findall(html)
        imglist.extend(pimg2.findall(html))
        linklist = plink.findall(html)
        for lnk in linklist:
            lnk = urlparse.urljoin(url, lnk)
            if lnk not in seedlink:
                seedlink[lnk] = depth + 1
                if same_root_domain(lnk):
                    linkpool.append(lnk)
        for img in imglist:
            img = urlparse.urljoin(url, img)
            if img in imgpool or img in dueimgpool:
                continue
            imgpool.append(img)


def process_img_queue():
    """Worker: pop image URLs and download each one to disk until the queue is empty."""
    while True:
        try:
            imgurl = imgpool.pop()
            dueimgpool.append(imgurl)
            tail = "jpg"  # every file is saved with a .jpg extension
            filename = pfilename.sub('', imgurl)  # strip non-word characters to build a file name
        except IndexError:
            break
        else:
            try:
                throttle.wait(imgurl)
                urllib.urlretrieve(imgurl, r"D:\grab\%s.%s" % (filename, tail))
            except Exception, e:
                print str(e)


def process_link_queue():
    """Worker: pop page URLs and crawl them until the queue is empty."""
    while True:
        try:
            link = linkpool.pop()
        except IndexError:
            break
        else:
            try:
                grab(link)
            except Exception, e:
                print str(e)


# Main loop: keep up to 20 image-download threads and 10 crawl threads alive
# while there is still work queued.
img_threads = []
link_threads = []
while link_threads or linkpool:
    if imgpool:
        for thread in img_threads:
            if not thread.is_alive():
                img_threads.remove(thread)
        while len(img_threads) < 20 and imgpool:
            thread = threading.Thread(target=process_img_queue)
            thread.setDaemon(True)
            thread.start()
            img_threads.append(thread)
    for thread in link_threads:
        if not thread.is_alive():
            link_threads.remove(thread)
    while len(link_threads) < 10 and linkpool:
        thread = threading.Thread(target=process_link_queue)
        thread.setDaemon(True)
        thread.start()
        link_threads.append(thread)
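The script above runs on Python 2 only (urllib2, urlparse, print statements, and the except Exception, e syntax). On Python 3 the same download helper could look roughly like the sketch below; this is a minimal, untested port for orientation, the name download3 is hypothetical, and the decode('utf-8', errors='ignore') step assumes UTF-8 pages.

import urllib.request
import urllib.error
import urllib.parse

def download3(url, headers, proxy=None, num_retries=0, data=None):
    """Fetch a page, retrying on 5xx responses while retries remain."""
    print('Downloading:', url)
    request = urllib.request.Request(url, data, headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read().decode('utf-8', errors='ignore')
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = ''
        code = getattr(e, 'code', None)  # only HTTPError carries a status code
        if num_retries > 0 and code is not None and 500 <= code < 600:
            return download3(url, headers, proxy, num_retries - 1, data)
    return html

The rest of the script would need the equivalent renames: urlparse becomes urllib.parse, urllib.urlretrieve becomes urllib.request.urlretrieve, and each except Exception, e clause becomes except Exception as e.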