Scrapy: Crawling Articles from a Well-Known Tech Site and Saving Them to a MySQL Database
阿新 • Published: 2019-01-28
The previous posts covered how to scrape the data itself; this one records how to scrape the data and store it in a MySQL database, using both a synchronous and an asynchronous approach.
The article list page: http://blog.jobbole.com/all-posts/
Extracting the URLs of all the articles
Extracting the URLs on the first page
Open the list page and press F12 to inspect it: each article link sits under the selector #archive .floated-thumb .post-thumb a. Extract it in scrapy shell, then iterate over the result in IPython to get the URL of every article on the first page.
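A minimal sketch of that exploration (run from the command line; the print loop is only illustrative):

scrapy shell http://blog.jobbole.com/all-posts/
# inside the IPython prompt that scrapy shell opens:
post_urls = response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()
for post_url in post_urls:
    print(post_url)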
Each extracted URL is then handed to Scrapy for downloading via the Request class, with a self-defined parse function as the callback.
from scrapy.http import Request
from urllib import parse

post_urls = response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()
for post_url in post_urls:
    # build the Request; parse_detail is the self-defined callback that extracts the article fields
    yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail)
Extracting the next-page URL
This time the "next page" link is located by combining two class names in a single selector (note there is no space between them, so both classes must sit on the same element).
next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
if next_url:
    # join next_url (not post_url) and let parse handle the next list page
    yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
The complete code:
# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy.http import Request
from urllib import parse

from ArticleSpider.items import ArticleSpiderItem


class JobboleSpider(scrapy.Spider):
    name = "jobbole"
    allowed_domains = ["blog.jobbole.com"]
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """
        1. Extract the article URLs from the list page and hand them to Scrapy to download and parse.
        2. Extract the next-page URL and hand it to Scrapy; once downloaded it is parsed by parse again.
        """
        # extract every article URL on the list page and hand it to Scrapy for download and parsing
        post_urls = response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()
        for post_url in post_urls:
            # build the Request; parse_detail is the callback that extracts the article fields
            yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail)

        # extract the next page and hand it to Scrapy for download
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        # extract the article fields with CSS selectors
        # title
        title = response.css(".entry-header h1::text").extract_first()
        # publish date
        create_data = response.css(".entry-meta-hide-on-mobile::text").extract()[0].strip()
        # tags (drop the "N 評論" entry that shares the same selector)
        tag_list = response.css(".entry-meta-hide-on-mobile a::text").extract()
        tag_list = [element for element in tag_list if not element.strip().endswith("評論")]
        tags = ",".join(tag_list)
        # up-votes
        praise_nums = response.css(".vote-post-up h10::text").extract_first()
        # bookmarks
        fav_nums = response.css("span.btn-bluet-bigger:nth-child(2)::text").extract_first("")
        match_re = re.match(r".*?(\d+).*", fav_nums)
        if match_re:
            fav_nums = int(match_re.group(1))
        else:
            fav_nums = 0
        # comments
        comment_nums = response.css("a[href='#article-comment'] span::text").extract_first("")
        match_re = re.match(r".*?(\d+).*", comment_nums)
        if match_re:
            comment_nums = int(match_re.group(1))
        else:
            comment_nums = 0
        # article body
        content = response.css("div .entry").extract()[0]
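The spider can already be run at this stage to verify the parsing; the spider name comes from the name attribute above:

scrapy crawl jobbole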
Good. At this point the content of every article on the site can be crawled. Next, the scraped data is saved to the database.
Downloading the cover images
Configuring items
In items.py, define the item class that wraps the fields we need.
import scrapy


class ArticleSpiderItem(scrapy.Item):
    title = scrapy.Field()
    create_data = scrapy.Field()
    url = scrapy.Field()
    # unique id for the URL; the MySQL pipelines below expect this field
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field()
    front_image_path = scrapy.Field()
    tags = scrapy.Field()
    praise_nums = scrapy.Field()
    fav_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    content = scrapy.Field()
Configuring settings
import os

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
# item field that holds the image URLs to download
IMAGES_URLS_FIELD = "front_image_url"
# save the downloaded images to an images/ directory next to settings.py
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, "images")
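Note that Scrapy's ImagesPipeline relies on Pillow for image processing; if it is not installed yet, install it first (the mirror is optional):

pip install -i https://pypi.douban.com/simple/ pillow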
The spider file
# -*- coding: utf-8 -*-
import hashlib
import re

import scrapy
from scrapy.http import Request
from urllib import parse

from ArticleSpider.items import ArticleSpiderItem


class JobboleSpider(scrapy.Spider):
    name = "jobbole"
    allowed_domains = ["blog.jobbole.com"]
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """
        1. Extract the article URLs from the list page and hand them to Scrapy to download and parse.
        2. Extract the next-page URL and hand it to Scrapy; once downloaded it is parsed by parse again.
        """
        # extract every article node on the list page
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            # cover-image link and article link
            image_url = post_node.css("img::attr(src)").extract_first()
            post_url = post_node.css("::attr(href)").extract_first()
            # build the Request; pass the cover-image URL along via meta
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail)

        # extract the next page and hand it to Scrapy for download
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        article_items = ArticleSpiderItem()
        # extract the article fields with CSS selectors
        # cover image
        front_image_url = response.meta.get("front_image_url", "")
        # title
        title = response.css(".entry-header h1::text").extract_first()
        # publish date
        create_data = response.css(".entry-meta-hide-on-mobile::text").extract()[0].strip()
        # tags (drop the "N 評論" entry that shares the same selector)
        tag_list = response.css(".entry-meta-hide-on-mobile a::text").extract()
        tag_list = [element for element in tag_list if not element.strip().endswith("評論")]
        tags = ",".join(tag_list)
        # up-votes
        praise_nums = response.css(".vote-post-up h10::text").extract_first()
        # bookmarks
        fav_nums = response.css("span.btn-bluet-bigger:nth-child(2)::text").extract_first("")
        match_re = re.match(r".*?(\d+).*", fav_nums)
        if match_re:
            fav_nums = int(match_re.group(1))
        else:
            fav_nums = 0
        # comments
        comment_nums = response.css("a[href='#article-comment'] span::text").extract_first("")
        match_re = re.match(r".*?(\d+).*", comment_nums)
        if match_re:
            comment_nums = int(match_re.group(1))
        else:
            comment_nums = 0
        # article body
        content = response.css("div .entry").extract()[0]

        article_items["title"] = title
        article_items["create_data"] = create_data
        article_items["tags"] = tags
        article_items["url"] = response.url
        # the MySQL pipelines below expect url_object_id; an MD5 of the URL is one simple way to fill it
        article_items["url_object_id"] = hashlib.md5(response.url.encode("utf-8")).hexdigest()
        # ImagesPipeline expects a list of URLs
        article_items["front_image_url"] = [front_image_url]
        article_items["fav_nums"] = fav_nums
        article_items["comment_nums"] = comment_nums
        article_items["praise_nums"] = praise_nums
        article_items["content"] = content

        yield article_items
The images are now downloaded. To also record where each image was stored, replace the stock ImagesPipeline entry in the settings file with a custom pipeline:
'ArticleSpider.pipelines.ArticleImagePipeline': 1,
In pipelines.py, subclass ImagesPipeline as ArticleImagePipeline and override its item_completed method to save the image path onto the item.
from scrapy.pipelines.images import ImagesPipeline


class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item


# save the path of the downloaded cover image onto the item
class ArticleImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, info) tuples produced by ImagesPipeline
        for ok, value in results:
            image_file_path = value["path"]
            item["front_image_path"] = image_file_path
        return item
Saving to the MySQL database
Creating the database
Create a new table for the articles (the original post shows the table structure in a screenshot).
Installing the MySQL driver
pip install -i https://pypi.douban.com/simple/ mysqlclient
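Since the table-structure screenshot is not reproduced here, the sketch below creates a database and table whose columns are inferred from the insert SQL used by the pipelines further down; the exact types and lengths are assumptions:

import MySQLdb

# connection parameters match the ones used by the pipelines below
conn = MySQLdb.connect('127.0.0.1', 'root', 'root', charset="utf8", use_unicode=True)
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS articlespider DEFAULT CHARACTER SET utf8")
cursor.execute("USE articlespider")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jobbole_spider (
        url_object_id   VARCHAR(50)  NOT NULL PRIMARY KEY,
        title           VARCHAR(200) NOT NULL,
        create_data     DATE,
        url             VARCHAR(300) NOT NULL,
        tags            VARCHAR(200),
        front_image_url VARCHAR(300),
        praise_nums     INT DEFAULT 0,
        fav_nums        INT DEFAULT 0,
        comment_nums    INT DEFAULT 0,
        content         LONGTEXT
    )
""")
conn.close()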
Writing to MySQL with a synchronous mechanism
Changing the type of create_data
The scraped value is a str, so create_data has to be converted to a date (for example in parse_detail, before it is stored on the item) to match the DATE column.
import datetime

try:
    create_data = datetime.datetime.strptime(create_data, "%Y/%m/%d").date()
except Exception as e:
    # fall back to the current date if the value cannot be parsed
    create_data = datetime.datetime.now().date()
Creating MysqlPipeline in pipelines.py
import MySQLdb


# write to MySQL synchronously
class MysqlPipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'articlespider', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_spider(title, create_data, url, url_object_id, tags, front_image_url, praise_nums, fav_nums, comment_nums, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        # front_image_url is a list (as required by ImagesPipeline), so store its first element
        self.cursor.execute(insert_sql, (item['title'], item["create_data"], item["url"], item["url_object_id"],
                                         item["tags"], item["front_image_url"][0], item["praise_nums"],
                                         item["fav_nums"], item["comment_nums"], item["content"]))
        self.conn.commit()
        return item
Add the pipeline in settings:
'ArticleSpider.pipelines.MysqlPipeline': 1,
The data is scraped and written to the database.
Writing data to MySQL asynchronously
The synchronous pipeline blocks on every commit, which can become a bottleneck once crawling outpaces the database, so this version hands the inserts to Twisted's adbapi connection pool.
Creating the asynchronous MysqlTwistedPipeline in pipelines.py
import MySQLdb.cursors
from twisted.enterprise import adbapi


# write to MySQL asynchronously
class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # read the connection parameters from settings.py
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # let twisted run the MySQL insert asynchronously
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions
        return item

    def handle_error(self, failure, item, spider):
        # handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # the actual insert
        insert_sql = """
            insert into jobbole_spider(title, create_data, url, url_object_id, tags, front_image_url, praise_nums, fav_nums, comment_nums, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        # front_image_url is a list (as required by ImagesPipeline), so store its first element
        cursor.execute(insert_sql, (item['title'], item["create_data"], item["url"], item["url_object_id"],
                                    item["tags"], item["front_image_url"][0], item["praise_nums"],
                                    item["fav_nums"], item["comment_nums"], item["content"]))
Configuring settings
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.MysqlTwistedPipeline': 1,
}
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "articlespider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"
Running it produces the same result.
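A quick way to confirm that rows are landing in the table (a sketch using the same connection parameters as above):

import MySQLdb

conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'articlespider', charset="utf8", use_unicode=True)
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM jobbole_spider")
print(cursor.fetchone()[0], "rows stored")
conn.close()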
Feel free to follow my personal WeChat official account.