Batch Saving 51CTO Blog Posts with Python
阿新 · Published: 2018-10-17
1. Background
I have recently been organizing my blog posts. My earlier articles live on the 51CTO site, and I wanted to save all of them locally. I found that posts written in Markdown can be exported, but rich-text ones cannot, so I decided to use Python to batch-save my blog posts to my local machine.
2. Code
Git repository address
```python
#!/bin/env python
# -*- coding:utf-8 -*-
# _auth:kaliarch
import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver


class BlogSave():
    # Common request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.89 Safari/537.36"
    }

    def __init__(self, blog_name, page_number, login_user_name, login_passwd):
        self.login_url = 'http://home.51cto.com/index'
        # Blog username
        self.blog_name = blog_name
        # Number of blog pages to save
        self.page_number = page_number
        # Login username
        self.login_user_name = login_user_name
        # Login password
        self.login_passwd = login_passwd
        # Path to the local chromedriver binary
        self.chromedirve = r'D:\chromedriver.exe'
        # Blog export/publish URL prefix
        self.blog_save_url = 'http://blog.51cto.com/blogger/publish/'

    def get_urldict(self):
        """
        Crawl the URLs of the user's articles
        :return: dict of {title: url}
        """
        content_dict = {}
        scrapy_urllist = ["http://blog.51cto.com/" + str(self.blog_name) + "/p" + str(page)
                          for page in range(1, int(self.page_number) + 1)]
        for scrapy_url in scrapy_urllist:
            response = requests.get(scrapy_url, headers=BlogSave.headers)
            soup = BeautifulSoup(response.content, 'lxml', from_encoding='utf-8')
            title_list = soup.find_all('a', class_='tit')
            for content in title_list:
                # Get the article URL
                url = content['href']
                title_soup = BeautifulSoup(requests.get(url, headers=BlogSave.headers).content,
                                           'lxml', from_encoding='utf-8')
                title = title_soup.find_all('h1', class_='artical-title')
                # Get the article title and record it with its URL
                content_dict[title[0].get_text()] = url
                print(title[0].get_text(), url)
        return content_dict

    def save_blog(self, url_list):
        """
        Save blog posts by simulating a login with selenium
        :return:
        """
        browser = webdriver.Chrome(self.chromedirve)
        # Open the login page
        browser.get(self.login_url)
        time.sleep(2)
        # Log in
        browser.find_element_by_id('loginform-username').send_keys(self.login_user_name)
        browser.find_element_by_id('loginform-password').send_keys(self.login_passwd)
        browser.find_element_by_name('login-button').click()
        time.sleep(1)
        for url in url_list:
            browser.get(url)
            time.sleep(1)
            try:
                # Click the export button in the editor toolbar
                browser.find_element_by_xpath('//*[@id="blogEditor-box"]/div[1]/a[14]').click()
                time.sleep(2)
            except Exception as e:
                with open('fail.log', 'a') as f:
                    f.write(url + ' ' + str(e) + '\n')

    def run(self):
        # Get the dict of titles and URLs
        content_dict = self.get_urldict()
        # Build the export URL for each article from its ID
        id_list = []
        for value in content_dict.values():
            id_list.append(str(value).split('/')[-1])
        result_list = [self.blog_save_url + str(id) for id in id_list]
        print("result_list:", result_list)
        self.save_blog(result_list)


if __name__ == '__main__':
    # blogOper = BlogSave('kaliarch',1)
    # dict = blogOper.get_urldict()
    # value_list = [ value for value in dict.values()]
    # print(value_list)
    blogOper = BlogSave(blog_name='kaliarch', page_number=5,
                        login_user_name='[email protected]', login_passwd='xxxxxxxxxxxxx')
    blogOper.run()
```
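Note that the `webdriver.Chrome(path)` constructor and the `find_element_by_*` helpers used above belong to the Selenium 3 API that was current when this was written; they were removed in Selenium 4. If you run the script against a newer selenium, the login step would need the `By` locator form. A minimal sketch of the equivalent calls, reusing the same element IDs and the example chromedriver path from the script:

```python
# Selenium 4+ equivalent of the login step above (sketch; element IDs assumed unchanged)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

browser = webdriver.Chrome(service=Service(r'D:\chromedriver.exe'))
browser.get('http://home.51cto.com/index')
browser.find_element(By.ID, 'loginform-username').send_keys('login_user_name')
browser.find_element(By.ID, 'loginform-password').send_keys('login_passwd')
browser.find_element(By.NAME, 'login-button').click()
```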
3. Testing
- The program takes the blog username and the number of pages to crawl, then scrapes all article titles and their corresponding URLs.
- It then uses selenium to simulate a login and directly requests
http://blog.51cto.com/blogger/publish/<article id>
which exports the posts written in Markdown. The export offers no way to name the files, which is awkward, but having the export at all is better than nothing; the files can be read afterwards to rename them (see the sketch below).
- View the exported Markdown files
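Since the export itself cannot name the files, a small post-processing step can rename the downloaded Markdown files. A minimal sketch, assuming the exports land in a `downloads` folder and each file starts with a `# <title>` heading line; both the folder name and the heading convention are assumptions, not part of the original script:

```python
import os
import re

def rename_exports(download_dir='downloads'):
    """Rename exported .md files to their first-heading title (hypothetical helper)."""
    for name in os.listdir(download_dir):
        if not name.endswith('.md'):
            continue
        path = os.path.join(download_dir, name)
        with open(path, encoding='utf-8') as f:
            first_line = f.readline().strip()
        # Use the first Markdown heading as the new file name, if present
        match = re.match(r'#+\s*(.+)', first_line)
        if not match:
            continue
        # Strip characters that are not allowed in file names
        title = re.sub(r'[\\/:*?"<>|]', '_', match.group(1)).strip()
        new_path = os.path.join(download_dir, title + '.md')
        if not os.path.exists(new_path):
            os.rename(path, new_path)

if __name__ == '__main__':
    rename_exports()
```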