Crawling every page of Maoyan Movies with the Scrapy framework and writing the results to a database
Use the Scrapy framework to crawl Maoyan Movies across every listing page. The spider below parses each list page, follows every movie's detail page, and recursively queues the pagination links it discovers:
import re

import scrapy
from jobmaoyan.items import JobmaoyanItem


class MaoyanSpider(scrapy.Spider):
    name = 'maoyan_spider'
    allowed_domains = ['maoyan.com']
    start_urls = ['http://maoyan.com/films?showType=3']
    page_set = set()  # pagination URLs already queued, to avoid crawling a page twice

    def parse(self, response):
        datalist = response.xpath("//dd")
        for data in datalist:
            item = JobmaoyanItem()
            item['title'] = data.xpath("div[@class='channel-detail movie-item-title']/a/text()").extract()[0]
            item['imgurl'] = data.xpath("div[@class='movie-item']/a[@target='_blank']/div[@class='movie-poster']/img[2]/@data-src").extract()[0]
            item['types'] = response.xpath("//li[@class='tags-line']/ul[@class='tags']/li[@class='active']/a[starts-with(@href,'javascript')]/text()").extract()[0]
            # print('==========================', item['types'])
            detail_url = "http://maoyan.com" + data.xpath("div[@class='movie-item']/a/@href").extract()[0]
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"data": item})

        # Collect the pagination links and recurse into parse() so that every page gets crawled
        pageurls = response.xpath("//a[starts-with(@href,'?showType=3&offset=')]/@href").extract()
        for pageurl in pageurls:
            if pageurl not in self.page_set:
                self.page_set.add(pageurl)
                purl = 'http://maoyan.com/films' + pageurl
                # print('------------------------', purl)
                yield scrapy.Request(url=purl, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta['data']
        item['d_type'] = response.xpath("//div[@class='movie-brief-container']/ul/li[1]/text()").extract()[0]
        item['d_country'] = response.xpath("//div[@class='movie-brief-container']/ul/li[2]/text()").extract()[0]
        item['d_country'] = re.sub(r"\s", "", item['d_country'])  # strip whitespace/newlines around the country text
        item['d_stime'] = response.xpath("//div[@class='movie-brief-container']/ul/li[3]/text()").extract()[0]
        item['d_content'] = response.xpath("//div[@class='mod-content']/span[@class='dra']/text()").extract()[0]
        item['comment1'] = response.xpath("//div[@class='comment-list-container']/ul/li[1]/div[@class='main']/div[@class='comment-content']/text()").extract()[0]
        item['comment2'] = response.xpath("//div[@class='comment-list-container']/ul/li[2]/div[@class='main']/div[@class='comment-content']/text()").extract()[0]
        item['comment3'] = response.xpath("//div[@class='comment-list-container']/ul/li[3]/div[@class='main']/div[@class='comment-content']/text()").extract()[0]
        yield item
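The spider imports JobmaoyanItem from jobmaoyan.items, which the original post does not show. A minimal sketch of jobmaoyan/items.py, assuming one Field per value the spider assigns:

import scrapy


class JobmaoyanItem(scrapy.Item):
    # List-page fields
    title = scrapy.Field()
    imgurl = scrapy.Field()
    types = scrapy.Field()
    # Detail-page fields
    d_type = scrapy.Field()
    d_country = scrapy.Field()
    d_stime = scrapy.Field()
    d_content = scrapy.Field()
    # First three user comments on the detail page
    comment1 = scrapy.Field()
    comment2 = scrapy.Field()
    comment3 = scrapy.Field()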
Writing to the database is done in a separate pipelines_mysql.py file. Register it under ITEM_PIPELINES in settings.py and make sure that entry is the one left uncommented:
ITEM_PIPELINES = {
    # 'jobmaoyan.pipelines.JobmaoyanPipeline': 300,
    # 'jobmaoyan.pipelines_txt.JobmaoyanPipeline': 300,
    'jobmaoyan.pipelines_mysql.JobmaoyanPipeline': 300,
    # 'jobmaoyan.pipelines_json.JobmaoyanPipeline': 300,
    # 'jobmaoyan.pipelines_xls.JobmaoyanPipeline': 300,
    # 'jobmaoyan.pipelines_mongdb.JobmaoyanPipeline': 300,
}
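The original post only shows the ITEM_PIPELINES block, not the rest of settings.py. In practice maoyan.com tends to reject the default Scrapy user agent, so a browser-style USER_AGENT, ROBOTSTXT_OBEY = False, and a small download delay are commonly added. The values below are assumptions, not part of the original project:

# settings.py (assumed additions, not shown in the original post)
ROBOTSTXT_OBEY = False      # crawl regardless of robots.txt
DOWNLOAD_DELAY = 1          # throttle requests to reduce the chance of being blocked
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36')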
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class JobmaoyanPipeline(object):
    def process_item(self, item, spider):
        # Open a connection per item; simple, though a shared connection in open_spider() would be cheaper
        db = pymysql.connect(host='127.0.0.1', user='root', password='123456', database='jobbole')
        cursor = db.cursor()
        create_sql = ("create table if not exists catmovies("
                      "id int primary key auto_increment, types text, title text, imgurl text, "
                      "type text, country text, stime text, content text, "
                      "comment1 text, comment2 text, comment3 text)")
        cursor.execute(create_sql)
        # Parameterized insert so quotes in titles or comments cannot break the SQL;
        # 0 for the id column lets auto_increment assign the real value
        insert_sql = "insert into catmovies values(0, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
        values = (item['types'], item['title'], item['imgurl'], item['d_type'],
                  item['d_country'], item['d_stime'], item['d_content'],
                  item['comment1'], item['comment2'], item['comment3'])
        try:
            cursor.execute(insert_sql, values)
            db.commit()
        except Exception:
            db.rollback()
        cursor.close()
        db.close()
        return item
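With the spider, settings, and pipeline in place, the crawl is started from the project root with scrapy crawl maoyan_spider. A quick sanity check that rows are actually reaching MySQL, reusing the same credentials and table as the pipeline above (a sketch, not part of the original post):

import pymysql

# Same local MySQL credentials as the pipeline above
db = pymysql.connect(host='127.0.0.1', user='root', password='123456', database='jobbole')
cursor = db.cursor()
cursor.execute("select count(*) from catmovies")
print("rows stored:", cursor.fetchone()[0])
cursor.close()
db.close()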