Scrapy模組爬取中華英才網招聘資訊(分頁)
阿新 • • 發佈:2020-12-07
import scrapy
from fenye.items import FenyeItem
import requests
class ZhfySpider(scrapy.Spider):
    """Scrape job listings from chinahr.com (Rizhao channel), following pagination manually."""

    name = 'zhfy'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.chinahr.com/channel/rizhao/pn1/']

    # Pagination: URL template with a page-number slot, and the next page to fetch.
    urls = 'http://www.chinahr.com/channel/rizhao/pn%d/'
    page_num = 2

    def parse(self, response):
        """Parse one listing page, yield one item per job, then request the next page.

        Yields FenyeItem objects (picked up by the configured pipeline) and, while
        page_num < 3, a follow-up scrapy.Request back into this same callback.
        """
        li_list = response.xpath('//div[@class="assortment_right_tab_content"]/ul/li')
        for li in li_list:
            job_name = li.xpath('./div[1]/h1/text()').extract_first()
            job_content = li.xpath('./div[2]/span[2]/text()').extract_first()
            # Instantiate an item and hand it to the pipeline for persistence.
            item = FenyeItem()
            item['job_name'] = job_name
            item['job_content'] = job_content
            yield item
        # Manual pagination: fetch pages up to pn2 only (page_num < 3 bounds the crawl).
        if self.page_num < 3:
            # Fix: the original wrapped the already-complete string in a redundant
            # format() call; %-interpolation alone produces the URL.
            new_url = self.urls % self.page_num
            self.page_num += 1
            # Manual request dispatch: callback re-enters parse() for the next page.
            yield scrapy.Request(url=new_url, callback=self.parse)
items.py
import scrapy
class Kjpro3Item(scrapy.Item):
    """Container for one scraped job posting.

    Fields mirror the columns of the MySQL table the pipeline writes to.
    """

    job_name = scrapy.Field()
    job_data = scrapy.Field()
    job_salary = scrapy.Field()
    job_content = scrapy.Field()
pipelines.py
import pymysql
class Kjpro3Pipeline:
    """Persist scraped job items into a local MySQL table (job_4) via pymysql."""

    def open_spider(self, spider):
        # One connection and one cursor for the spider's whole lifetime.
        # Fix: the original created a fresh cursor on every process_item call but
        # closed only the last one; it also left self.cursor unset (AttributeError
        # in close_spider) if the spider yielded no items.
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='root',
            db='scrapy',
            charset='utf8',
        )
        self.cursor = self.conn.cursor()
        print('爬蟲開始!!')

    def process_item(self, item, spider):
        """Insert one item into job_4; on failure, roll back and report the error."""
        # NOTE(review): the spider shown only populates job_name/job_content, so
        # item['job_data'] / item['job_salary'] would raise KeyError here — confirm
        # which item class this pipeline is actually paired with.
        value = (item['job_name'], item['job_data'], item['job_salary'], item['job_content'])
        sql = 'insert into job_4(job_name,job_data,job_salary,job_content) value(%s,%s,%s,%s)'
        try:
            # Parameterized execute: pymysql escapes values, no SQL injection risk.
            self.cursor.execute(sql, value)
            self.conn.commit()
            print('資料插入成功!!')
        except Exception as exc:
            # Fix: was a bare `except:` that silently swallowed every error with no
            # rollback and no diagnostic; now roll back the failed insert and show why.
            self.conn.rollback()
            print('資料插入失敗!!!', exc)
        return item

    def close_spider(self, spider):
        print('爬蟲結束!!')
        self.cursor.close()
        self.conn.close()
資料庫截圖:
settings.py
# Mobile Chrome user agent sent with every request (disguises the default Scrapy UA).
USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Mobile Safari/537.36'
# Do not fetch/honor the site's robots.txt before crawling.
ROBOTSTXT_OBEY = False
# Suppress everything below ERROR so the console output stays readable.
LOG_LEVEL='ERROR'
# Enable the MySQL persistence pipeline; 300 is its priority order.
ITEM_PIPELINES = {
'kjPro3.pipelines.Kjpro3Pipeline': 300,
}