爬取西刺代理
阿新 • • 發佈:2018-12-05
spider:
# -*- coding: utf-8 -*-
import scrapy
from collectip.items import CollectipItem
class XiciSpider(scrapy.Spider):
    """Spider that scrapes proxy entries from the xicidaili.com listing pages.

    It requests pages 1 and 2 under ``/nn/`` and converts each data row of
    the ``ip_list`` table into a ``CollectipItem``.
    """

    name = 'xici'
    allowed_domains = ['xicidaili.com']
    # start_requests() below builds the requests explicitly, so this list is
    # not what drives the crawl; it is kept so the attribute stays available.
    start_urls = ['http://www.xicidaili.com']

    def start_requests(self):
        """Return the initial requests: one per listing page (pages 1 and 2)."""
        return [
            scrapy.Request("http://www.xicidaili.com/nn/%s" % page)
            for page in range(1, 3)
        ]

    def parse(self, response):
        """Build a ``CollectipItem`` for each data row of the ``ip_list`` table.

        Args:
            response: The downloaded listing page.

        Returns:
            A list of ``CollectipItem`` objects, each with the fields ``IP``,
            ``PORT``, ``POSITION``, ``TYPE``, ``SPEED`` and
            ``LAST_CHECK_TIME`` populated. The list is empty when the page
            has no ``ip_list`` table. Rows that lack any of the expected
            cells are skipped rather than aborting the whole page.
        """
        tables = response.xpath('//table[@id="ip_list"]')
        if not tables:
            # The page layout differs from what is expected (NOTE(review):
            # possibly an error or anti-bot page); nothing to extract.
            self.logger.warning("No ip_list table found on %s", response.url)
            return []

        items = []
        # The first <tr> is skipped, as in the original code (presumably the
        # table's header row).
        for row in tables[0].xpath('tr')[1:]:
            fields = {
                'IP': row.xpath('td[2]/text()').extract_first(),
                'PORT': row.xpath('td[3]/text()').extract_first(),
                # string() always yields exactly one (possibly empty) value.
                'POSITION': row.xpath('string(td[4])').extract_first(default='').strip(),
                'TYPE': row.xpath('td[6]/text()').extract_first(),
                # Raw string so the backslashes reach the regex engine
                # unchanged and no invalid-escape warning is emitted.
                'SPEED': row.xpath('td[8]/div[@class="bar"]/@title').re_first(
                    r'\d{0,2}\.\d{0,}'
                ),
                'LAST_CHECK_TIME': row.xpath('td[10]/text()').extract_first(),
            }

            missing = [key for key, value in fields.items() if value is None]
            if missing:
                # A malformed row must not discard the rest of the page.
                self.logger.debug(
                    "Skipping row on %s; missing fields: %s",
                    response.url,
                    ", ".join(missing),
                )
                continue

            item = CollectipItem()
            for key, value in fields.items():
                item[key] = value
            items.append(item)
        return items