1. 程式人生 > 其它 >Scrapy模組爬取中華英才網招聘資訊(分頁)

Scrapy模組爬取中華英才網招聘資訊(分頁)

技術標籤:爬蟲、scrapy

import scrapy
from fenye.items import FenyeItem
import requests
class ZhfySpider(scrapy.Spider):
    """Spider that scrapes job postings from the chinahr.com Rizhao channel.

    The first listing page comes from ``start_urls``; further pages are
    requested manually from ``parse`` by filling the page number into
    ``urls`` until ``max_page`` has been requested.
    """

    name = 'zhfy'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.chinahr.com/channel/rizhao/pn1/']
    # Pagination: URL template whose %d is replaced by the page number.
    urls = 'http://www.chinahr.com/channel/rizhao/pn%d/'
    # Number of the next page to request (page 1 is covered by start_urls).
    page_num = 2
    # Last page number that will be requested.
    max_page = 2

    def parse(self, response):
        """Yield one item per job entry, then a request for the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            ``FenyeItem`` objects with ``job_name`` and ``job_content`` set,
            followed by a ``scrapy.Request`` for the next listing page while
            ``page_num`` has not passed ``max_page``.
        """
        li_list = response.xpath('//div[@class="assortment_right_tab_content"]/ul/li')
        for li in li_list:
            job_name = li.xpath('./div[1]/h1/text()').extract_first()
            job_content = li.xpath('./div[2]/span[2]/text()').extract_first()

            # Instantiate the item object that carries the data to the pipeline.
            item = FenyeItem()
            item['job_name'] = job_name
            item['job_content'] = job_content
            # Hand the item to the pipeline for persistent storage.
            yield item

        # NOTE(review): placed after the item loop so the follow-up request is
        # issued once per page; the original nesting was lost in the source.
        if self.page_num <= self.max_page:
            new_url = self.urls % self.page_num
            self.page_num += 1
            # Manually issued request; the callback parses the new page the
            # same way as the first one.
            yield scrapy.Request(url=new_url, callback=self.parse)
items.py
import scrapy
class Kjpro3Item(scrapy.Item):
    """Scrapy item describing a single scraped job posting.

    Declares the four fields an item of this type may carry.
    """

    # Field names are also used as the item's dictionary-style keys.
    job_name = scrapy.Field()
    job_data = scrapy.Field()
    job_salary = scrapy.Field()
    job_content = scrapy.Field()

pipelines.py
import pymysql

class Kjpro3Pipeline:
    """Item pipeline that inserts each scraped job item into MySQL table ``job_4``.

    One connection and one cursor are opened when the spider starts, reused
    for every item, and closed when the spider finishes.
    """

    # Item fields in the order the INSERT statement binds them.
    _FIELDS = ('job_name', 'job_data', 'job_salary', 'job_content')

    def open_spider(self, spider):
        """Open the MySQL connection and the cursor shared by all items."""
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='root',
            db='scrapy',
            charset='utf8',
        )
        # Created once here so it is not re-created (and leaked) per item.
        self.cursor = self.conn.cursor()
        print('爬蟲開始!!')

    def process_item(self, item, spider):
        """Insert ``item`` as one row and return it for any later pipelines.

        Fields missing from the item are bound as ``None`` (SQL NULL) instead
        of raising ``KeyError``. A database error rolls the transaction back
        and is reported; it does not stop the crawl.

        Args:
            item: Mapping-like scraped item.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, unchanged.
        """
        values = tuple(item.get(field) for field in self._FIELDS)
        sql = 'insert into job_4(job_name,job_data,job_salary,job_content) value(%s,%s,%s,%s)'
        try:
            self.cursor.execute(sql, values)
            self.conn.commit()
        except pymysql.MySQLError as exc:
            # Discard the failed statement so the connection stays usable.
            self.conn.rollback()
            print('資料插入失敗!!!', exc)
        else:
            print('資料插入成功!!')

        return item

    def close_spider(self, spider):
        """Close the cursor and connection, tolerating ones never opened."""
        print('爬蟲結束!!')
        for resource in (getattr(self, 'cursor', None), getattr(self, 'conn', None)):
            if resource is not None:
                resource.close()

資料庫截圖:
在這裡插入圖片描述

settings.py
# User-Agent header sent with requests: a mobile Chrome 87 on Android string
# instead of Scrapy's default identifier.
USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Mobile Safari/537.36'
# Do not restrict the crawl to what the site's robots.txt allows.
ROBOTSTXT_OBEY = False

# Only emit log messages of level ERROR or higher.
LOG_LEVEL='ERROR'
# Enable the pipeline class by its import path; the integer is its order
# value among enabled pipelines.
ITEM_PIPELINES = {
   'kjPro3.pipelines.Kjpro3Pipeline': 300,
}