Python: scraping Jobbole (伯樂在線), the complete version
阿新 · Published 2018-03-29
1. jobbole.py
import re
import datetime
from urllib import parse

import scrapy
from scrapy.http import Request

from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader
from ArticleSpider.utils.common import get_md5


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts']

    def parse(self, response):
        """
        1. Extract the article URLs from the list page and hand them to scrapy to download and parse.
        2. Extract the next-page URL, hand it to scrapy to download, and parse the result with parse again.
        """
        # Parse every article URL on the list page and hand it to scrapy for download and parsing
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            # URL of the cover image
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            # When the request finishes downloading, parse_detail is called to parse the article detail page.
            # urljoin handles hrefs that come without a domain (equivalent to response.url + post_url)
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail)

        # Extract the next page and hand it to scrapy for download
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        # The original hand-written extraction, kept here commented out for reference:
        # front_image_url = response.meta.get("front_image_url", "")  # article cover image
        # title = response.css(".entry-header h1::text").extract_first()
        # create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace("·", "").strip()
        # praise_nums = response.css(".vote-post-up h10::text").extract()[0]
        # fav_nums = response.css(".bookmark-btn::text").extract()[0]
        # match_re = re.match(".*?(\d+).*", fav_nums)
        # if match_re:
        #     fav_nums = int(match_re.group(1))
        # else:
        #     fav_nums = 0
        #
        # comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0]
        # match_re = re.match(".*?(\d+).*", comment_nums)
        # if match_re:
        #     comment_nums = int(match_re.group(1))
        # else:
        #     comment_nums = 0
        #
        # # content = response.css("div.entry::text").extract()
        # content = response.css('div.entry').extract_first()
        #
        # tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("評論")]
        # tags = ",".join(tag_list)
        # try:
        #     create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
        # except Exception as e:
        #     create_date = datetime.datetime.now().date()
        # article_item = JobBoleArticleItem()
        # article_item["title"] = title
        # article_item["url"] = response.url
        # article_item["create_date"] = create_date
        # article_item["front_image_url"] = [front_image_url]
        # article_item["praise_nums"] = praise_nums
        # article_item["comment_nums"] = comment_nums
        # article_item["fav_nums"] = fav_nums
        # article_item["tags"] = tags
        # article_item["content"] = content
        # article_item["url_object_id"] = get_md5(response.url)

        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        # Use the custom ArticleItemLoader from items.py so TakeFirst() is applied by default
        # and each field comes out as a single value instead of a list.
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        # load_item() applies the field rules and builds the item object
        article_item = item_loader.load_item()
        yield article_item
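The spider imports get_md5 from ArticleSpider.utils.common, a helper file the post does not show. A minimal sketch of it, assuming it simply returns the MD5 hex digest of the URL used as url_object_id:

# ArticleSpider/utils/common.py -- a sketch only; the original file is not included in this post
import hashlib


def get_md5(url):
    # MD5 works on bytes, so encode str input first (Python 3)
    if isinstance(url, str):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()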
2. main.py
import sys
import os

from scrapy.cmdline import execute

# Put the project root on sys.path so the ArticleSpider package can be imported,
# then launch the spider exactly as "scrapy crawl jobbole" would.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])
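If you would rather not go through scrapy.cmdline, an equivalent way to start the spider from a script is Scrapy's CrawlerProcess API; a sketch:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project's settings.py, run the "jobbole" spider, and block until the crawl finishes
process = CrawlerProcess(get_project_settings())
process.crawl("jobbole")
process.start()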
3. items.py
import datetime
import re

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join


class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


def date_convert(value):
    try:
        create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
    except Exception as e:
        create_date = datetime.datetime.now().date()
    return create_date


def get_nums(value):
    match_re = re.match(".*?(\d+).*", value)
    if match_re:
        nums = int(match_re.group(1))
    else:
        nums = 0
    return nums


def remove_comment_tags(value):
    # drop the "N 評論" entry that gets extracted along with the tags
    if "評論" in value:
        return ""
    else:
        return value


def return_value(value):
    return value


# The plain item definition before the ItemLoader processors were added:
# class JobBoleArticleItem(scrapy.Item):
#     title = scrapy.Field()
#     create_date = scrapy.Field()
#     url = scrapy.Field()
#     url_object_id = scrapy.Field()
#     front_image_url = scrapy.Field()
#     front_image_path = scrapy.Field()
#     praise_nums = scrapy.Field()
#     comment_nums = scrapy.Field()
#     fav_nums = scrapy.Field()
#     content = scrapy.Field()
#     tags = scrapy.Field()


class ArticleItemLoader(ItemLoader):
    # custom ItemLoader: take the first extracted value for every field by default
    default_output_processor = TakeFirst()


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        # keep the list form so ImagesPipeline receives an iterable of URLs
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    # the tags selector returns a list, so join it instead of taking the first element
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()
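The asynchronous MysqlTwistedPipline below calls item.get_insert_sql(), which JobBoleArticleItem does not define in this post. A sketch of such a method, with the column names taken from the synchronous MysqlPipeline's insert statement (treat the exact layout as an assumption):

# Sketch only: add this method to JobBoleArticleItem if you enable MysqlTwistedPipline
def get_insert_sql(self):
    insert_sql = """
        insert into jobbole_article(title, create_date, url, url_object_id,
            front_image_url, praise_nums, comment_nums, fav_nums, tags, content)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    params = (
        self["title"], self["create_date"], self["url"], self["url_object_id"],
        self["front_image_url"][0],  # front_image_url is stored as a list
        self["praise_nums"], self["comment_nums"], self["fav_nums"],
        self["tags"], self["content"],
    )
    return insert_sql, params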
4. pipelines.py
import codecs
import json

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter


class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item


class ArticleImagePipeline(ImagesPipeline):
    # Override item_completed to pull the local path of the downloaded image out of results
    def item_completed(self, results, item, info):
        for ok, value in results:
            image_file_path = value["path"]
            item["front_image_path"] = image_file_path
        return item


class MysqlTwistedPipline(object):
    # Insert into MySQL asynchronously through twisted's adbapi connection pool
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # **dbparms expands to adbapi.ConnectionPool("MySQLdb", host=settings['MYSQL_HOST'], ...)
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Run the MySQL insert asynchronously via twisted
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions
        return item  # pass the item on to any later pipelines

    def handle_error(self, failure, item, spider):
        # Handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # Run the actual insert; the item builds its own SQL, so different
        # item types can supply different statements
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)


class JsonWithEncodingPipeline(object):
    # Export items to a custom json file
    def __init__(self):
        # open with codecs to avoid encoding problems
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        # convert the item to a dict and dump it as json;
        # ensure_ascii=False keeps Chinese text from being escaped
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def spider_closed(self, spider):
        # called on the spider_closed signal
        self.file.close()


class JsonExporterPipeline(object):
    # Export a json file with the JsonItemExporter that scrapy provides
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class MysqlPipeline(object):
    # Write to MySQL synchronously
    def __init__(self):
        self.conn = MySQLdb.connect('localhost', 'root', '123456', 'article_spider',
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_article(title, create_date, url, url_object_id,
                front_image_url, praise_nums, comment_nums, fav_nums, tags, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["create_date"], item["url"],
                                         item["url_object_id"], item["front_image_url"],
                                         item["praise_nums"], item["comment_nums"],
                                         item["fav_nums"], item["tags"], item["content"]))
        self.conn.commit()
        return item
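Both MySQL pipelines assume a jobbole_article table already exists in the article_spider database. A sketch of creating a matching table from Python; the column names come from the insert statement above, while the column types are assumptions:

# Sketch: one-off script to create the jobbole_article table the pipelines insert into
import MySQLdb

conn = MySQLdb.connect('localhost', 'root', '123456', 'article_spider', charset="utf8")
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jobbole_article (
        url_object_id    VARCHAR(50)  NOT NULL PRIMARY KEY,
        title            VARCHAR(200) NOT NULL,
        create_date      DATE,
        url              VARCHAR(300) NOT NULL,
        front_image_url  VARCHAR(300),
        front_image_path VARCHAR(200),
        praise_nums      INT DEFAULT 0,
        comment_nums     INT DEFAULT 0,
        fav_nums         INT DEFAULT 0,
        tags             VARCHAR(200),
        content          LONGTEXT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()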
5. settings.py
import os

BOT_NAME = 'ArticleSpider'

SPIDER_MODULES = ['ArticleSpider.spiders']
NEWSPIDER_MODULE = 'ArticleSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ArticleSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ArticleSpider.middlewares.ArticlespiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    # 'ArticleSpider.pipelines.ArticleImagePipeline': 1,
    'ArticleSpider.pipelines.JsonExporterPipeline': 2,
    # 'ArticleSpider.pipelines.MysqlPipeline': 4,
}

# Tell ImagesPipeline which item field holds the image URLs and where to store the files
IMAGES_URLS_FIELD = "front_image_url"
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MySQL connection settings read by MysqlTwistedPipline.from_settings
MYSQL_HOST = "localhost"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "123456"
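To actually download cover images and write articles to MySQL, the commented-out pipelines need to be switched on. A sketch of what ITEM_PIPELINES might look like with the image pipeline and the asynchronous MySQL pipeline enabled (the numbers only define ordering; lower runs first):

# Sketch: pipeline configuration with image download and asynchronous MySQL writes enabled
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,    # download covers, fill front_image_path
    'ArticleSpider.pipelines.JsonExporterPipeline': 2,    # keep the json export
    'ArticleSpider.pipelines.MysqlTwistedPipline': 10,    # async insert via twisted adbapi
}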