
Scrapy: Driving a Browser to Fetch NetEase News Data

Spider code:

import scrapy
from selenium import webdriver


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']

    def __init__(self):
        # One shared Chrome instance for the whole spider, used later from the downloader middleware
        self.bro = webdriver.Chrome(r'D:\爬蟲相關\資料\驅動程式\chromedriver_win32\chromedriver.exe')

    def parse(self, response):
        # Parse the li tags on the home page that hold the section links
        li_list = response.xpath('//div[@class="ns_area list"]/ul/li')
        # Indexes of the four target sections: Domestic, International, Military, Aviation
        index_url = [3, 4, 6, 7]
        # Keep only the li tags for those four sections
        news_list = []
        for i in index_url:
            news_list.append(li_list[i])
        # Extract each section's URL and request it
        for li in news_list:
            url = li.xpath('./a/@href').extract_first()
            yield scrapy.Request(url=url, callback=self.parse_news)

    def parse_news(self, response):
        # The section pages are re-rendered by Selenium in the middleware, so this xpath sees the dynamic content
        print('xxx:', response.xpath('/html/body/div[1]/div[3]/div[4]/div[1]/div/div/ul/li/div/div[3]/div[1]/h3/a/text()').extract_first())

    def closed(self, spider):
        # Shut down the browser when the spider finishes
        self.bro.quit()
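The constructor above passes the chromedriver path positionally, which matches Selenium 3. On Selenium 4 the path goes through a Service object instead, and a headless flag keeps the browser window from popping up during the crawl. This is only a sketch of an alternative driver setup, not part of the original code; the path is the same local chromedriver used above, and the headless option is an optional assumption.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Selenium 4 style: driver path via Service, browser flags via Options
options = Options()
options.add_argument('--headless')  # optional: render the section pages without opening a window
service = Service(r'D:\爬蟲相關\資料\驅動程式\chromedriver_win32\chromedriver.exe')
bro = webdriver.Chrome(service=service, options=options)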

middlewares (downloader middleware) code:

from scrapy import signals
from scrapy.http import HtmlResponse


class WangyiproDownloaderMiddleware(object):
    def process_response(self, request, response, spider):
        # Intercept the four section pages and replace their responses with Selenium-rendered ones
        if request.url in ['http://news.163.com/air/', 'http://war.163.com/', 'http://news.163.com/world/', 'http://news.163.com/domestic/']:
            spider.bro.get(request.url)
            page_text = spider.bro.page_source  # page source after the browser has rendered the dynamic content
            return HtmlResponse(url=spider.bro.current_url, body=page_text, encoding='utf-8', request=request)
        return response
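The middleware above only takes effect once it is registered in the project's settings.py. Below is a minimal sketch; the module path assumes the Scrapy project package is named wangyiPro (inferred from the class name) and 543 is just the default scaffold priority, neither of which is shown in the original post.

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    # Assumed module path; adjust to the actual project package name
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}

With the middleware enabled, the spider is started as usual with scrapy crawl wangyi; responses for the four section URLs are then swapped for the Selenium-rendered HtmlResponse before they reach parse_news.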