scrapy實戰爬取cl社區評論數超過設定值的鏈接
阿新 • • 發佈:2018-12-31
chrom lee connect ngs charset format lines back nes
1、創建scrapy項目
scrapy startproject cl
2、前戲
a、註釋爬蟲文件中的allowed_domains
b、settings.py第22行,ROBOTSTXT_OBEY = True改為ROBOTSTXT_OBEY = False
c、settings.py第19行，改為USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
d、開啟管道:67-69行,
ITEM_PIPELINES = {
    'mytestscrapy.pipelines.MytestscrapyPipeline': 300,
}
3、cl.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Selector
from mytestscrapy.items import MytestscrapyItem
import time
import random


class TestCLSpider(scrapy.Spider):
    """Crawl the forum listing pages (1-30) and yield one item per thread
    whose comment count is at least 4."""

    name = 'cl'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://cc.yyss.icu/thread0806.php?fid=2&search=&page=1']
    print("第1頁開始")
    # URL template for subsequent listing pages; %d is the page number.
    url = 'https://cc.yyss.icu/thread0806.php?fid=2&search=&page=%d'
    pageNum = 1  # current page counter (class-level, shared by the crawl)

    def parse(self, response):
        rows = Selector(response=response).xpath(
            '//table[@id="ajaxtable"]/tbody[@style="table-layout:fixed;"]'
            '/tr[@class="tr3 t_one tac"]')
        # The first page carries two extra header/sticky rows; skip them.
        if self.pageNum == 1:
            rows = rows[2:]
        for tr in rows:
            count = tr.xpath('./td[4]/text()').extract_first()
            # Filter out threads with fewer than 4 comments; also skip rows
            # whose count cell is missing or non-numeric instead of crashing.
            try:
                if int(count) < 4:
                    continue
            except (TypeError, ValueError):
                continue
            text = tr.xpath('./td[2]//a/text()').extract_first()
            url = 'https://cc.yyss.icu/' + tr.xpath('./td[2]//a/@href').extract_first()
            item = MytestscrapyItem()
            item['urlname'] = text
            item['urladdr'] = url
            item['commentsNum'] = count
            yield item
        # Follow pages 1-30.
        if self.pageNum < 30:
            # Polite random delay of 2-5 seconds between page requests
            # (randint(2, 5) is inclusive of both endpoints).
            time.sleep(random.randint(2, 5))
            self.pageNum += 1
            # The original wrapped this in a redundant format() call.
            new_url = self.url % self.pageNum
            print("第%s頁開始" % self.pageNum)
            yield scrapy.Request(url=new_url, callback=self.parse)
4.items.py
import scrapy


class MytestscrapyItem(scrapy.Item):
    """Container for one scraped forum thread."""

    # Thread title, absolute link, and comment count as scraped.
    urlname = scrapy.Field()
    urladdr = scrapy.Field()
    commentsNum = scrapy.Field()
5、pipelines.py(數據存入mysql數據庫,mysql數據庫cl_table表的字段urlname, urladdr, commentsNum)
import pymysql


class MytestscrapyPipeline(object):
    """Persist scraped items into the MySQL table ``cl_table``
    (columns: urlname, urladdr, commentsNum) in database ``cl``."""

    # Sentinels so close_spider can tell whether open_spider ever ran
    # (the original used '' and crashed on .close() when no item arrived).
    connect = None
    cursor = None

    def open_spider(self, spider):
        # One connection and one cursor for the entire crawl.
        self.connect = pymysql.Connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='123456',
            db='cl',
            charset='utf8'
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # Parameterized query: the driver quotes/escapes the values.
        # The original interpolated values into the SQL string, which is
        # SQL-injectable and breaks on titles containing quotes.
        sql = ("INSERT INTO cl_table (urlname, urladdr, commentsNum) "
               "VALUES (%s, %s, %s)")
        data = (item['urlname'], item['urladdr'], item['commentsNum'])
        try:
            self.cursor.execute(sql, data)
        except Exception as e:
            self.connect.rollback()  # undo the failed insert
            print('事務處理失敗', e)
        else:
            self.connect.commit()
            print('事務處理成功', self.cursor.rowcount)
        return item

    def close_spider(self, spider):
        # Guard against open_spider having failed before setup finished.
        if self.cursor is not None:
            self.cursor.close()
        if self.connect is not None:
            self.connect.close()
scrapy實戰爬取cl社區評論數超過設定值的鏈接