Distributed Crawler
阿新 · Published 2017-11-20
1. Crawl the home page
# -*- coding: utf-8 -*-
from scrapy_redis.spiders import RedisCrawlSpider
import scrapy
import redis

class DemoSpider(RedisCrawlSpider):
    name = 'demo'
    allowed_domains = ['demo.com']
    redis_key = 'demo:start_urls'
    # assumed site root, needed to absolutize the relative hrefs below
    # (the original never defined host_urls)
    host_urls = 'http://www.demo.com'
    redis_info = redis.Redis(host='111.111.111.111', port=6379)

    def parse(self, response):
        sel = scrapy.Selector(response)
        a_list = sel.xpath('//div[@id="search_right_demo"]/div/div[@class="clearfixed"]/a')
        for a_item in a_list:
            url = self.host_urls + a_item.xpath('@href').extract()[0]
            # rewrite the value of the first query-string parameter to '489'
            url = url[:url.find('=') + 1] + '489' + url[url.find('&'):]
            # hand each category URL to the list spider via Redis
            self.redis_info.lpush('demo:list_urls', url)
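Because the spider inherits from RedisCrawlSpider, it has no start URLs of its own: it idles until something appears under its redis_key. To kick off the whole pipeline, push the entry page onto that key from any machine that can reach the Redis server. A minimal seeding sketch, assuming the same Redis instance as above; the URL is a placeholder, not the real entry page:

import redis

# Seed the start-URL queue; every idle 'demo' worker will pick this up.
r = redis.Redis(host='111.111.111.111', port=6379)
r.lpush('demo:start_urls', 'http://www.demo.com/search')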
2. Crawl the URL list pages
# -*- coding: utf-8 -*-
from scrapy_redis.spiders import RedisCrawlSpider
import scrapy
import redis

class DemoListSpider(RedisCrawlSpider):
    name = 'demo_list'
    allowed_domains = ['demo.com']
    redis_key = 'demo:list_urls'
    redis_info = redis.Redis(host='111.111.111.111', port=6379)

    def parse(self, response):
        selector = scrapy.Selector(response)
        # collect the detail-page links from the result table
        table_a_xpath = selector.xpath('//*[@id="newlist_list_content_table"]/table/tr[1]/td[1]/div/a/@href').extract()
        for url in table_a_xpath:
            self.redis_info.lpush('demo:info_urls', url)
        # push the next page back onto this spider's own queue
        next_page = selector.xpath('//a[@class="next-page"]/@href').extract()
        if next_page:
            self.redis_info.lpush('demo:list_urls', next_page[0])
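For the three spiders to share one request queue and one duplicate filter across machines, the project's settings.py must route scheduling through scrapy-redis. A minimal configuration sketch, assuming the same Redis server the spiders connect to:

# settings.py
SCHEDULER = "scrapy_redis.scheduler.Scheduler"              # shared Redis queue
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # shared dedup
SCHEDULER_PERSIST = True   # keep queue and dupefilter when a spider stops
REDIS_HOST = '111.111.111.111'
REDIS_PORT = 6379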
3. Crawl the item detail pages
# -*- coding: utf-8 -*-
from scrapy_redis.spiders import RedisCrawlSpider
from demo.items import demoItem
import scrapy
import redis

class demoInfoSpider(RedisCrawlSpider):
    name = 'demo_info'
    allowed_domains = ['zhaopin.com']
    redis_key = 'demo:info_urls'
    redis_info = redis.Redis(host='111.111.111.111', port=6379)

    def parse(self, response):
        sel = scrapy.Selector(response)
        zwmc = sel.xpath('//div[@class="top-fixed-box"]/div[@class="fixed-inner-box"]/div[1]/h1/text()').extract()
        gsmc = sel.xpath('//div[@class="top-fixed-box"]/div[@class="fixed-inner-box"]/div[1]/h2/a/text()').extract()
        flxx = sel.xpath('//div[@class="top-fixed-box"]/div[@class="fixed-inner-box"]/div[1]/div/span/text()').extract()

        # populate the item before yielding it (the original jumped straight
        # to yield); field names assumed to match the variables above
        item = demoItem()
        item['zwmc'] = zwmc
        item['gsmc'] = gsmc
        item['flxx'] = flxx
        yield item
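The detail spider imports demoItem from demo.items, which the post does not show. A plausible reconstruction sketch; the field names are an assumption inferred from the variables used above (zwmc/gsmc/flxx read as pinyin for job title, company name, and benefits):

# items.py -- hypothetical reconstruction; field names are assumed
import scrapy

class demoItem(scrapy.Item):
    zwmc = scrapy.Field()  # 職位名稱 / job title
    gsmc = scrapy.Field()  # 公司名稱 / company name
    flxx = scrapy.Field()  # 福利信息 / benefits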