利用Python批量儲存51CTO部落格
阿新 • • 發佈:2018-11-11
一、背景
最近在整理部落格,之前在51CTO官網發表過文章,想將之前寫的全部儲存到本地,發現用markdown寫的可以匯出,富文字的則不行,就想利用Python批量儲存自己的部落格到本地。
二、程式碼
#!/bin/env python
# -*- coding:utf-8 -*-
# _auth:kaliarch
import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver
class BlogSave():
    """Scrape a 51CTO blog user's article list, then drive a Selenium
    browser session to open each article's export url and click the
    markdown-export button.
    """

    # Shared request headers so 51CTO serves normal desktop pages.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.89 Safari/537.36"
    }

    def __init__(self, blog_name, page_number, login_user_name, login_passwd):
        """
        :param blog_name: 51CTO blog user name whose articles are scraped
        :param page_number: number of listing pages to scrape
        :param login_user_name: account name used for the Selenium login
        :param login_passwd: password for that account
        """
        self.login_url = 'http://home.51cto.com/index'
        # Blog user name
        self.blog_name = blog_name
        # How many listing pages of the blog to scrape
        self.page_number = page_number
        # Login user
        self.login_user_name = login_user_name
        # Login password
        self.login_passwd = login_passwd
        # Local chromedriver binary; raw string so backslashes in the
        # Windows path can never be read as escape sequences.
        self.chromedirve = r'D:\chromedriver.exe'
        # Export endpoint prefix; the article id is appended to it.
        self.blog_save_url = 'http://blog.51cto.com/blogger/publish/'

    def get_urldict(self):
        """
        Scrape the user's listing pages and collect article titles and urls.
        :return: dict mapping article title -> article url
        """
        content_dict = {}
        scrapy_urllist = ["http://blog.51cto.com/" + str(self.blog_name) + "/p" + str(page)
                          for page in range(1, int(self.page_number) + 1)]
        for scrapy_url in scrapy_urllist:
            response = requests.get(scrapy_url, headers=BlogSave.headers)
            soup = BeautifulSoup(response.content, 'lxml', from_encoding='utf-8')
            title_list = soup.find_all('a', class_='tit')
            for content in title_list:
                # Article url comes from the listing anchor's href.
                url = content['href']
                title_soup = BeautifulSoup(requests.get(url, headers=BlogSave.headers).content, 'lxml', from_encoding='utf-8')
                title = title_soup.find_all('h1', class_='artical-title')
                # Guard: skip pages whose layout lacks the expected title
                # tag instead of crashing with IndexError.
                if not title:
                    continue
                content_dict[title[0].get_text()] = url
                print(title[0].get_text(), url)
        return content_dict

    def save_blog(self, url_list):
        """
        Log in via a Selenium-driven Chrome and open every export url,
        clicking the markdown-export toolbar button on each page.
        :param url_list: export urls (blog_save_url + article id)
        :return: None; failures are appended to fail.log
        """
        browser = webdriver.Chrome(self.chromedirve)
        try:
            # Open the login page and submit credentials.
            browser.get(self.login_url)
            time.sleep(2)
            browser.find_element_by_id('loginform-username').send_keys(self.login_user_name)
            browser.find_element_by_id('loginform-password').send_keys(self.login_passwd)
            browser.find_element_by_name('login-button').click()
            time.sleep(1)
            for url in url_list:
                browser.get(url)
                time.sleep(1)
                try:
                    # Click the export button in the editor toolbar.
                    browser.find_element_by_xpath('//*[@id="blogEditor-box"]/div[1]/a[14]').click()
                    time.sleep(2)
                except Exception as e:
                    # One line per failure so fail.log stays readable.
                    with open('fail.log', 'a') as f:
                        f.write(url + ' ' + str(e) + '\n')
        finally:
            # Always release the browser, even when login or export fails.
            browser.quit()

    def run(self):
        """Scrape article urls, derive export urls from article ids, export."""
        # Title -> url mapping from the listing pages.
        content_dict = self.get_urldict()
        # The article id is the last path segment of each article url.
        id_list = [str(value).split('/')[-1] for value in content_dict.values()]
        result_list = [self.blog_save_url + article_id for article_id in id_list]
        print("result_list:", result_list)
        self.save_blog(result_list)
if __name__ == '__main__':
    # Export the first 5 listing pages of user 'kaliarch'.
    # NOTE(review): the leading space in login_user_name is preserved from
    # the original — confirm whether it is intentional. The password is a
    # placeholder; replace before running.
    blogOper = BlogSave(blog_name='kaliarch', page_number=5, login_user_name=' [email protected]', login_passwd='xxxxxxxxxxxxx')
    blogOper.run()
三、測試
- 程式由使用者指定部落格的使用者名稱和需要抓取的頁數,之後爬取所有文章的標題和對應的url
- 後期通過selenium模擬登入,直接請求
http://blog.51cto.com/blogger/publish/文章id
可以直接匯出用markdown寫的檔案。這種方式匯出時沒辦法指定檔名,略顯尷尬,但能匯出總歸是好的,後期可以讀取檔案內容來為檔案命名。 - 檢視匯出的markdown檔案