CrawlSpider爬取拉勾網
阿新 • • 發佈:2018-11-04
CrawlSpider繼承Spider,提供了強大的爬取規則(Rule)供使用
填充custom_settings,填入瀏覽器中的請求頭
from datetime import datetime

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ArticleSpider.items import LagouJobItem, LagouJobItemLoader
from ArticleSpider.utils.common import get_md5


class LagouSpider(CrawlSpider):
    """Crawl www.lagou.com and build one ``LagouJobItem`` per job page.

    Link following is driven entirely by ``rules``: listing and company
    pages are only followed, while job-detail pages are both followed and
    handed to :meth:`parse_job`.
    """

    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']

    # Per-spider settings overrides; intentionally left empty here.
    custom_settings = {
    }

    # Raw strings keep ``\d`` a regex token instead of an invalid string
    # escape, and ``\.`` matches a literal dot before ``html``.
    rules = (
        Rule(LinkExtractor(allow=(r"zhaopin/.*",)), follow=True),
        Rule(LinkExtractor(allow=(r"gongsi/j\d+\.html",)), follow=True),
        Rule(
            LinkExtractor(allow=r'jobs/\d+\.html'),
            callback='parse_job',
            follow=True,
        ),
    )

    def parse_job(self, response):
        """Extract the fields of one job posting from a job-detail page.

        Args:
            response: The downloaded job-detail page.

        Returns:
            The item produced by ``LagouJobItemLoader.load_item()``.
        """
        item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        # The MD5 of the URL serves as a fixed-length identifier for the page.
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")

        # City, experience, degree and job type are read from the 2nd to 5th
        # <span> inside the "job_request" paragraph, in that order.
        item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath("work_years", "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()")

        item_loader.add_css("tags", '.position-label li::text')
        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        # No ::text here, so the whole matched elements (markup included)
        # are passed to the loader for these two fields.
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())

        job_item = item_loader.load_item()
        return job_item
class LagouJobItemLoader(ItemLoader):
    """Item loader for Lagou job postings."""

    # Every field uses TakeFirst as its output processor unless the field
    # declares its own.
    default_output_processor = TakeFirst()


class LagouJobItem(scrapy.Item):
    """Job posting scraped from Lagou, stored in the ``lagou_job`` table."""

    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary = scrapy.Field()
    # remove_splash is defined outside this snippet; presumably it strips the
    # "/" separators around these values -- TODO confirm.
    job_city = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    work_years = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    degree_need = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    job_type = scrapy.Field()
    publish_time = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    # remove_tags runs first, then handle_jobaddr (defined outside this
    # snippet) post-processes the result.
    job_addr = scrapy.Field(
        input_processor=MapCompose(remove_tags, handle_jobaddr),
    )
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    # All extracted tags are joined into one comma-separated string.
    tags = scrapy.Field(
        input_processor=Join(",")
    )
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        """Build the parameterized upsert statement for this item.

        On a duplicate key only ``salary`` and ``job_desc`` are refreshed;
        the other columns keep their stored values.

        Returns:
            A ``(insert_sql, params)`` tuple, where ``params`` is ordered to
            match the column list of the statement.

        Raises:
            KeyError: If any referenced field was not populated on the item.
        """
        insert_sql = """
            insert into lagou_job(title, url, url_object_id, salary, job_city,
            work_years, degree_need, job_type, publish_time, job_advantage,
            job_desc, job_addr, company_name, company_url, tags, crawl_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE salary=VALUES(salary), job_desc=VALUES(job_desc)
        """
        # Order must mirror the column list above, one value per placeholder.
        params = (
            self["title"],
            self["url"],
            self["url_object_id"],
            self["salary"],
            self["job_city"],
            self["work_years"],
            self["degree_need"],
            self["job_type"],
            self["publish_time"],
            self["job_advantage"],
            self["job_desc"],
            self["job_addr"],
            self["company_name"],
            self["company_url"],
            # Previously job_addr was passed a second time here, so the
            # ``tags`` column received the address instead of the tags.
            self["tags"],
            self["crawl_time"].strftime(SQL_DATETIME_FORMAT),
        )

        return insert_sql, params
-- Disable foreign-key checks while the table is dropped and recreated.
SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for lagou_job
-- ----------------------------
-- WARNING: drops any existing `lagou_job` table together with its data.
DROP TABLE IF EXISTS `lagou_job`;
CREATE TABLE `lagou_job` (
  `title` varchar(255) NOT NULL,
  `url` varchar(255) NOT NULL,
  `url_object_id` varchar(50) NOT NULL,
  `salary` varchar(20) DEFAULT NULL,
  `job_city` varchar(255) DEFAULT NULL,
  `work_years` varchar(255) DEFAULT NULL,
  `degree_need` varchar(255) DEFAULT NULL,
  `job_type` varchar(255) DEFAULT NULL,
  `tags` varchar(255) DEFAULT NULL,
  `publish_time` varchar(20) NOT NULL,
  `job_advantage` varchar(255) DEFAULT NULL,
  `job_desc` longtext NOT NULL,
  `job_addr` varchar(255) DEFAULT NULL,
  `company_name` varchar(255) DEFAULT NULL,
  `company_url` varchar(255) DEFAULT NULL,
  `crawl_time` datetime NOT NULL,
  -- Not part of the column list of the item's insert statement, so it
  -- stays NULL unless something else sets it.
  `crawl_update_time` datetime DEFAULT NULL,
  -- Only key on the table, so it is the one that triggers the insert's
  -- ON DUPLICATE KEY UPDATE clause.
  PRIMARY KEY (`url_object_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- NOTE(review): MySQL's `utf8` stores at most 3 bytes per character, so
-- 4-byte characters (e.g. emoji) in scraped text cannot be stored as-is;
-- consider utf8mb4 if that matters -- confirm against the connection charset.

-- Restore foreign-key checks so the session is not left with them disabled.
SET FOREIGN_KEY_CHECKS=1;