cnblogs backup Python program
阿新 • Published: 2021-01-19
Part 1: Get the cnblogs URL list
1. The SQL table
Create the cbs database.
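The article does not show the database creation itself; here is a minimal sketch using pymysql, assuming MySQL runs on localhost with the root/root credentials the crawler below also uses (adjust to your own setup):

# Hypothetical helper: create the cbs database before importing article_list.sql.
# The connection settings mirror the crawler's defaults (localhost, root/root).
import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='root', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute("CREATE DATABASE IF NOT EXISTS cbs DEFAULT CHARACTER SET utf8")
finally:
    conn.close()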
Create the SQL table by running article_list.sql:
SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for article_list
-- ----------------------------
DROP TABLE IF EXISTS `article_list`;
CREATE TABLE `article_list` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT NULL,
  `countView` varchar(10) DEFAULT NULL,
  `countComment` varchar(10) DEFAULT NULL,
  `url` varchar(100) DEFAULT NULL,
  `datePublished` datetime DEFAULT NULL,
  `dateUpdated` datetime DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
2. Cookie
Log in to cnblogs and save the cookie value from that session into mycookie.txt.
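The cookie value is the raw Cookie request header the browser sends after logging in; it can be copied from the browser's developer tools. A minimal, illustrative check that the saved value still works, using the same list API the crawler calls:

# Quick cookie sanity check (illustrative): a 200 response means the saved cookie is accepted.
import requests

with open("mycookie.txt", encoding="utf-8") as f:
    cookie_str = f.read().strip()

resp = requests.get(
    "https://i.cnblogs.com/api/posts/list?p=1&cid=&t=1&cfg=0",
    headers={
        "cookie": cookie_str,
        "accept": "application/json, text/plain, */*",
        "user-agent": "Mozilla/5.0",
    },
)
print(resp.status_code)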
3. Get the URL list
Python dependencies
requirements.txt
certifi==2020.4.5.1
chardet==3.0.4
et-xmlfile==1.0.1
idna==2.9
jdcal==1.4.1
openpyxl==3.0.3
PyMySQL==0.9.3
requests==2.23.0
urllib3==1.25.9
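The dependencies can be installed with pip install -r requirements.txt before running the script.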
Python program
import time
import json
import math
import os
import sys

import pymysql
import requests
from openpyxl import Workbook

# Start-up banner
version_str = "V1.2"
logo_str = "cnBlogs_List"
logo_pic_str = """
            ____  _                        _     _     _
  ___ _ __ | __ )| | ___   __ _ ___       | |   (_)___| |_
 / __| '_ \|  _ \| |/ _ \ / _` / __|      | |   | / __| __|
| (__| | | | |_) | | (_) | (_| \__ \      | |___| \__ \ |_
 \___|_| |_|____/|_|\___/ \__, |___/      |_____|_|___/\__|
                          |___/   _____
                                 |_____|
"""
print("%s %s" % (logo_str, version_str), end='')
print(logo_pic_str)
print("%s %s 啟動中..." % (logo_str, version_str))
time.sleep(2.5)

# Configuration
# Option 1: assign the cookie value directly to the constant COOKIE_STR
COOKIE_STR = ""
# Option 2: store the cookie value in a dedicated file
COOKIE_PATH = "./mycookie.txt"
if os.path.exists(COOKIE_PATH) and len(COOKIE_STR) == 0:
    # If the cookie file exists and COOKIE_STR has not been set, read the cookie from the file
    with open(COOKIE_PATH, 'r', encoding="utf-8") as f:
        data_str = f.read().strip()  # strip the trailing newline, which would break the header
        if len(data_str) > 0:
            COOKIE_STR = data_str
        else:
            print("There is no cookie value in the file %s" % COOKIE_PATH)
            sys.exit()

CACHE_FILE_NAME = "cache.txt"      # cache file name
EXCEL_FILE_NAME = "result.xls"     # spreadsheet file name
WORKSHEET_NAME = "cnblogs_admin"   # worksheet name
# TABLE_HEAD_TITLE_LIST = ["title", "countView", "countComment", "url", "datePublished", "dateUpdated"]
TABLE_HEAD_TITLE_LIST = ["標題", "閱讀量", "評論數", "連結", "首次釋出時間", "最近更新時間"]  # header row for *.xls / cache.txt
SINGLE_PAGE_COUNT = 10             # posts per list page

# Request parameters
# REQUEST_URL_PART = r'https://i-beta.cnblogs.com/api/posts/list?p=%s&cid=&t=1&cfg=0'
REQUEST_URL_PART = r'https://i.cnblogs.com/api/posts/list?p=%s&cid=&t=1&cfg=0'
REQUEST_HEADERS = {
    'authority': 'i-beta.cnblogs.com',
    'method': 'GET',
    'path': '/api/posts/list?p=1&cid=&t=1&cfg=0',
    'scheme': 'https',
    'accept': 'application/json, text/plain, */*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'cookie': COOKIE_STR,
    'referer': 'https://i-beta.cnblogs.com/posts',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
}

# Global variable: accumulated rows from every list page
info_result_all = []

# MySQL connection parameters
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWD = 'root'
MYSQL_DB = 'cbs'
MYSQL_CHARSET = 'utf8'

# Connect to the database
connect = pymysql.Connect(
    host=MYSQL_HOST,
    port=MYSQL_PORT,
    user=MYSQL_USER,
    passwd=MYSQL_PASSWD,
    db=MYSQL_DB,
    charset=MYSQL_CHARSET
)
# Get a cursor
cursor = connect.cursor()


def get_page_count(page_n=1):
    """Request one list page and return the total number of list pages."""
    page_num = page_n
    url = REQUEST_URL_PART % page_num
    headers = REQUEST_HEADERS
    response = requests.get(url, headers=headers, verify=True)
    html_data = response.content.decode()
    data_dict = json.loads(html_data)
    all_post_count = data_dict["postsCount"]                    # total number of posts
    page_count = math.ceil(all_post_count / SINGLE_PAGE_COUNT)  # total number of pages
    return page_count


def get_per_page_data(page_n=1, save_mysql=True):
    """Request one list page, print its posts and (optionally) store them in MySQL."""
    page_num = page_n
    url = REQUEST_URL_PART % page_num
    headers = REQUEST_HEADERS
    response = requests.get(url, headers=headers, verify=True)
    html_data = response.content.decode()
    data_dict = json.loads(html_data)
    post_list = data_dict["postList"]                           # posts on this page
    all_post_count = data_dict["postsCount"]                    # total number of posts
    page_count = math.ceil(all_post_count / SINGLE_PAGE_COUNT)  # total number of pages
    print("【status %s】第 %s 頁" % (response.status_code, page_n))

    info_result = []  # rows collected from this page
    # Write the header row once, on the first page
    if page_num == 1:
        info_result.append(TABLE_HEAD_TITLE_LIST)

    # Collect the posts on this page
    for index, item in enumerate(post_list):
        title = item['title']                  # title
        viewCount = item['viewCount']          # view count
        comment_count = item['feedBackCount']  # comment count
        url = "https:%s" % item['url']         # post url
        datePublished = item['datePublished']  # first published date
        dateUpdated = item['dateUpdated']      # last updated date
        # Print to the console
        print((index + ((page_n - 1) * SINGLE_PAGE_COUNT) + 1), end=' ')
        print(title)
        info = [title, viewCount, comment_count, url, datePublished, dateUpdated]
        info_result.append(info)
        if save_mysql is True:
            # Insert the row into MySQL; a parameterized query avoids quoting problems in titles
            sql_1 = ("INSERT INTO article_list"
                     "(title, countView, countComment, url, datePublished, dateUpdated) "
                     "VALUES (%s, %s, %s, %s, %s, %s);")
            cursor.execute(sql_1, tuple(info))
            connect.commit()
    return page_count, page_num, info_result


def get_cbs_list_data(page_count, save_cache=True):
    """Walk every list page, storing rows in MySQL and (optionally) in cache.txt."""
    for n in range(1, page_count + 1):
        time.sleep(1)  # sleep one second to keep the request rate low
        one_page_data_list = get_per_page_data(n, save_mysql=True)  # fetch page n and store it in MySQL
        page_num = one_page_data_list[1]
        info_result = one_page_data_list[2]
        info_result_all.extend(info_result)
        if save_cache is True:
            # Append this page's rows to the cache file cache.txt
            save_file_cache = CACHE_FILE_NAME
            with open(save_file_cache, 'a', encoding="utf-8") as f:
                f.write("## 第 %s/%s 頁\n" % (page_num, page_count))  # page marker
                for line in info_result:
                    f.write("%s\t%s\t%s\t%s\t%s\t%s\n" % tuple(line))


def save_excel():
    """Write the accumulated rows to an Excel file."""
    save_file = EXCEL_FILE_NAME   # output file name
    sheet_name = WORKSHEET_NAME   # worksheet name
    wb = Workbook()               # new workbook
    ws1 = wb.active               # the active worksheet (the first one by default)
    ws1.title = sheet_name        # rename the sheet
    for row in info_result_all:
        ws1.append(row)
    wb.save(save_file)


def main():
    """1. Walk every list page. 2. Store rows in MySQL (and optionally cache.txt).
    3. Keep rows in memory. 4. Optionally write them to a spreadsheet."""
    # Request the first list page to find out how many pages there are
    try:
        page_count = int(get_page_count())
    except Exception:
        print("請求無效,請替換為新Cookie值(COOKIE_STR)後重試")
        sys.exit()
    # Walk every page, storing to MySQL (set save_cache=True to also write cache.txt)
    get_cbs_list_data(page_count, save_cache=False)
    # get_cbs_list_data(2, save_cache=True)  # only the first 2 pages
    # Finally: write the rows to Excel
    # save_excel()


if __name__ == '__main__':
    main()
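After a run, it can be worth confirming that the rows actually reached MySQL. A minimal check, reusing the crawler's connection settings:

# Illustrative check: count the rows stored in article_list after a crawl.
import pymysql

connect = pymysql.Connect(host='localhost', port=3306, user='root',
                          passwd='root', db='cbs', charset='utf8')
with connect.cursor() as cursor:
    cursor.execute("SELECT COUNT(*) FROM article_list")
    print("rows in article_list:", cursor.fetchone()[0])
connect.close()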
Part 2: Download the HTML pages from the URL list
1. Create the following directory structure
└─html
    ├─2
    ├─3
    ├─4
    └─others
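A minimal sketch for creating that structure from Python instead of by hand (the same layout the download script below expects):

# Create the html/2, html/3, html/4 and html/others folders the downloader writes into.
import os

for name in ("2", "3", "4", "others"):
    os.makedirs(os.path.join("html", name), exist_ok=True)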
2. Create the URL list file and fill it with the URLs exported from the SQL database (a sketch of the export step follows below)
1.txt
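The article does not show the export itself; here is a minimal sketch that reads the url column from article_list (same connection settings as in Part 1) and writes one URL per line into 1.txt:

# Hypothetical export step: dump every url from article_list into 1.txt, one per line.
import pymysql

connect = pymysql.Connect(host='localhost', port=3306, user='root',
                          passwd='root', db='cbs', charset='utf8')
with connect.cursor() as cursor:
    cursor.execute("SELECT url FROM article_list")
    rows = cursor.fetchall()
connect.close()

with open("1.txt", "w", encoding="utf-8") as f:
    for (url,) in rows:
        f.write(url + "\n")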
3. Download the HTML files
import os
import time

import requests
from contextlib import closing
from urllib.request import urlopen  # for py2: from urllib2 import urlopen


def myurlopen(url):
    # url = 'https://www.baidu.com'
    headers = {
        'authority': 'i-beta.cnblogs.com',
        'method': 'GET',
        # 'path': '/api/posts/list?p=1&cid=&t=1&cfg=0',
        'scheme': 'https',
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        # 'cookie': COOKIE_STR,
        'referer': 'https://i-beta.cnblogs.com/posts',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    }
    # Issue the GET request and return the response object
    response = requests.get(url, headers=headers, verify=True)
    # html_data = response.content.decode()
    return response


def save_html(url):
    """Download one post and save it under html/<status class>/<file name>."""
    # url = "https://www.cnblogs.com/andy9468/p/10005406.html"
    # url = "https://www.cnblogs.com/andy9468/p/8025420.html"
    server_path, html_name = os.path.split(url)
    print(server_path)
    print(html_name)
    # with closing(urlopen('ftp://www.xxxx.com/haha.txt')) as page:
    resp = myurlopen(url)
    html_code = resp.status_code
    html_data = resp.content
    # Sort the result by HTTP status class: 2xx, 3xx, 4xx, everything else
    if str(html_code).startswith('2'):
        with open("html/2/%s" % html_name, 'wb') as f:
            f.write(html_data)
    elif str(html_code).startswith('3'):
        with open("html/3/%s" % html_name, 'wb') as f:
            f.write(html_data)
    elif str(html_code).startswith('4'):
        with open("html/4/%s" % html_name, 'wb') as f:
            f.write(html_data)
    else:
        with open("html/others/%s" % html_name, 'wb') as f:
            f.write(html_data)


def main():
    # url = "https://www.cnblogs.com/andy9468/p/10005406.html"
    # save_html(url)
    urls_file = "1.txt"
    with open(urls_file, 'r') as f:
        urls = f.readlines()
    for url in urls:
        url = url.strip("\n")
        if not url:
            continue  # skip blank lines
        time.sleep(1)
        save_html(url)


if __name__ == '__main__':
    main()