投訴網站爬蟲
阿新 • • 發佈:2018-12-09
# -*- coding: utf-8 -*-
import scrapy
from yg.items import YgItem


class YgSpiderSpider(scrapy.Spider):
    """Crawl the paginated post listing on wz.sun0769.com and each post's detail page.

    ``parse`` walks one listing page, emits a detail-page request per post
    (carrying a partially filled ``YgItem`` in ``meta``) and then follows the
    ">" pagination link. ``parse_detail`` completes the item with the post
    text and image URLs and yields it.
    """

    name = 'yg_spider'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    def parse(self, response):
        """Handle a listing page.

        Yields one ``scrapy.Request`` per post row (callback
        ``parse_detail``) and, when a ">" link is present, one request for
        the next listing page (callback ``parse``).
        """
        tr_list = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
        for tr in tr_list:
            href = tr.xpath("./td[2]/a[2]/@href").extract_first()
            if href is None:
                # A row without a post link cannot be followed. Passing None
                # to scrapy.Request raises, which would abort this generator
                # and lose the remaining rows and the next-page request.
                continue

            item = YgItem()
            item["title"] = tr.xpath("./td[2]/a[2]/@title").extract_first()
            item["href"] = href
            item["update_time"] = tr.xpath("./td[last()]/text()").extract_first()

            yield scrapy.Request(
                # urljoin is a no-op for absolute links and resolves relative
                # ones, which scrapy.Request would otherwise reject.
                response.urljoin(href),
                callback=self.parse_detail,
                meta={"item": item},
            )

        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Handle a post detail page.

        Reads the item started in ``parse`` from ``response.meta``, adds the
        raw text nodes of the post body as ``content`` and the absolute image
        URLs as ``content_img``, then yields the item.
        """
        item = response.meta["item"]
        item["content"] = response.xpath("//div[@class='c1 text14_2']//text()").extract()
        img_srcs = response.xpath("//div[@class='c1 text14_2']//img/@src").extract()
        # Resolve against the page URL instead of prefixing a fixed host, so
        # absolute and non-root-relative src values are not corrupted.
        item["content_img"] = [response.urljoin(src) for src in img_srcs]
        yield item
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re
import json


class YgPipeline(object):
    """Clean each item's ``content`` and append the item to ``yg.txt``.

    Every item is written as an indented JSON object followed by a newline;
    the file is opened in append mode for each item.
    """

    # Compiled once for the whole class instead of being looked up for every
    # string. The pattern text is kept exactly as it was.
    _WHITESPACE = re.compile(r'\xa0|\s')

    def process_item(self, item, spider):
        """Normalise ``item["content"]``, append the item to ``yg.txt``, return it.

        Args:
            item: Mapping-like item; its ``content`` entry is replaced by the
                cleaned list. A missing or ``None`` entry becomes ``[]``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, with ``content`` cleaned.
        """
        item["content"] = self.process_content(item.get("content"))
        record = json.dumps(dict(item), ensure_ascii=False, indent=4)
        with open("yg.txt", "a", encoding="utf-8") as f:
            f.write(record + "\n")
        return item

    def process_content(self, content):
        """Strip all whitespace from each string and drop the ones left empty.

        Args:
            content: Iterable of strings, or ``None``.

        Returns:
            A new list of the non-empty, whitespace-free strings in their
            original order; ``[]`` when ``content`` is ``None`` or empty.
        """
        stripped = (self._WHITESPACE.sub("", text) for text in content or ())
        return [text for text in stripped if text]
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class YgItem(scrapy.Item):
    """One post scraped by ``yg_spider``: listing-row data plus detail-page data."""

    # Value of the post link's ``title`` attribute on the listing page.
    title = scrapy.Field()
    # Text of the last cell in the post's listing row.
    update_time = scrapy.Field()
    # ``href`` of the post link; the detail page is requested from it.
    href = scrapy.Field()
    # Text nodes of the post body on the detail page (a list of strings).
    content = scrapy.Field()
    # Image URLs found inside the post body (a list of strings).
    content_img = scrapy.Field()