【爬蟲】獲取Github倉庫提交紀錄歷史的指令碼 python
阿新 • • 發佈:2020-12-12
本指令碼為第一版開發;後續會進行擴充套件
#! python3
"""Crawl the commit history of a fixed list of GitHub repositories.

For every configured repository the script fetches the repository page,
then walks the ``/commits/`` pages, printing a per-day commit count and,
optionally, the details of every commit.

NOTE(review): the CSS classes used for scraping match the GitHub markup
the script was written against; confirm they still match the live site.
"""
import datetime
import json
import sys
import time

import requests
from bs4 import BeautifulSoup
from colorama import Back, Fore, Style, init

# Truthy when per-commit details should be printed; set by main().
process = 0
output = 0


def req(type, addr, data='', **args):
    """Send an HTTP request and return the response.

    Args:
        type: HTTP method name, either ``'get'`` or ``'post'``.
        addr: URL to request.
        data: Request body, sent only for ``'post'``.
        **args: Accepted for backward compatibility; currently unused.

    Returns:
        The ``requests.Response``, or ``None`` when the request failed
        (connection error, timeout, invalid URL, ...).

    Raises:
        ValueError: If ``type`` is neither ``'get'`` nor ``'post'``.
    """
    if type not in ('get', 'post'):
        raise ValueError("unsupported request type: %r" % (type,))
    try:
        if type == 'get':
            return requests.get(addr, timeout=50)
        return requests.post(addr, data=data, timeout=50)
    except requests.exceptions.RequestException as e:
        # Report instead of silently swallowing; callers treat None as
        # "request failed".
        print(Fore.BLACK + Back.RED + "[ERROR] %s [request] %s" % (str(addr), e))
        return None


def access(url_addr):
    """Probe every repository and crawl the commits of the reachable ones.

    Args:
        url_addr: List of dicts, each with at least ``'username'`` and
            ``'git_addr'``. On success ``'git_addr'`` is rewritten to the
            commits URL; on failure ``'code'`` is set to the HTTP status
            (``None`` when the request itself failed).
    """
    for repo in url_addr:
        print(repo['git_addr'])
        responses = req('get', repo['git_addr'])
        if responses is None:
            print(Fore.BLACK + Back.RED + "[ERROR] " + "%s [status] %s" % (str(repo['git_addr']), "request failed"))
            repo['code'] = None
            continue
        if responses.status_code == 200:
            print("[SUCCESS]" + " %s [status] %s" % (str(repo['git_addr']), str(responses.status_code)))
            repo['git_addr'] = repo['git_addr'] + '/commits/'
            commits(repo)
        else:
            print(Fore.BLACK + Back.RED + "[ERROR] " + "%s [status] %s" % (str(repo['git_addr']), str(responses.status_code)))
            repo['code'] = responses.status_code


def _parse_commit(item):
    """Extract author, time, link, summary and description from one commit ``<li>``."""
    link = item.div.p.a
    # The aria-label holds summary and description fused together; the
    # visible link text is the summary, so its length marks the split point.
    label = link['aria-label']
    summary_len = len(link.get_text())
    return {
        'commits_auth': item.div.find('div', class_='d-flex').find('div', class_='f6').find(class_='commit-author').get_text(),
        'commits_time': item.find('relative-time')['datetime'],
        'commits_href': "https://github.com" + link['href'],
        'commits_summary': label[:summary_len],
        'commits_description': label[summary_len:].strip(),
    }


def _report_day(addr, day_group):
    """Print the commits of one day group and the day's commit count."""
    # Drop the 11-character heading prefix (presumably "Commits on ") so
    # only the English date such as "Nov 26, 2020" remains.
    date_bar = day_group.find(class_='text-normal').get_text()[11:]
    date = datetime.datetime.strptime(date_bar, '%b %d, %Y').strftime('%Y年%m月%d日')
    if process:
        print("\n=================[%s]=================" % (str(date)))
    commit_count = 0
    for item in day_group.ol.find_all('li'):
        commit = _parse_commit(item)
        commit_count += 1
        if process:
            print("\n-----------------[%s]-----------------" % (commit['commits_auth']))
            print("[提交時間] %s \n[提交程式碼] %s\n[提交主題] %s\n[提交描述] %s" % (
                commit['commits_time'],
                commit['commits_href'],
                commit['commits_summary'],
                commit['commits_description'],
            ))
    print(Fore.BLACK + Back.WHITE + "%s 於 %s 共計提交了 %s 次程式碼" % (addr['username'], date, commit_count))


def _older_page_url(text):
    """Return the href of the trailing 'Older' pagination link, or None."""
    pager = text.find(class_='paginate-container')
    if pager is None:
        return None
    links = pager.find_all('a')
    if links and links[-1].get_text() == 'Older':
        return links[-1].get('href')
    return None


def _scrape_commit_page(addr, url):
    """Fetch and report one commits page; return the next page URL or None."""
    responses = req('get', url)
    if responses is None:
        print("[ERROR] %s [status] %s" % (str(url), "request failed"))
        addr['code'] = None
        return None
    if responses.status_code != 200:
        print("[ERROR] %s [status] %s" % (str(url), str(responses.status_code)))
        addr['code'] = responses.status_code
        return None

    text = BeautifulSoup(responses.text, "html.parser")
    # Detect an empty repository. The check must run against the page text:
    # `in` on the soup object itself only compares against child nodes.
    if "This repository is empty." in text.get_text():
        print(Fore.RED + Back.WHITE + "%s 的倉庫內容爬取過程中發現告警[This repository is empty.]" % (addr['username']))
        return None

    try:
        for day_group in text.find_all(class_='TimelineItem-body'):
            _report_day(addr, day_group)
        return _older_page_url(text)
    except Exception as e:
        # Per-repository boundary: unexpected markup must not abort the
        # remaining repositories, but the cause is reported.
        print(Fore.RED + Back.WHITE + "%s 的倉庫爬取過程中發生錯誤." % (addr['username']))
        print(Fore.RED + Back.WHITE + "%s: %s" % (e.__class__.__name__, e))
        return None


def commits(addr):
    """Crawl all commit pages of one repository, following 'Older' links.

    Args:
        addr: Repository dict; ``'git_addr'`` must hold the URL of the first
            commits page and is updated to each page as it is visited.
            ``'code'`` is set when a page cannot be fetched.
    """
    next_url = addr['git_addr']
    # Iterate rather than recurse so long histories cannot exhaust the
    # recursion limit.
    while next_url:
        addr['git_addr'] = next_url
        next_url = _scrape_commit_page(addr, next_url)
        if next_url:
            print("------next page------")


def main():
    """Prompt for the crawler key and options, then crawl the repositories."""
    global process
    url_addr = [
        {
            'username': 'X1',
            'git_addr': 'https://github.com/litbird0/elevator',  # project URL
            'start': '',
            'commins': [],
        },
        {
            'username': 'X2',
            'git_addr': 'https://github.com/1564820398/cjwc_dianti',
            'start': '',
            'commins': [],
        },
    ]
    Webcrawler_key = "mirror"
    if input("請輸入爬蟲Key:") != Webcrawler_key:
        print(Fore.RED + Back.WHITE + "Key錯誤!")
        time.sleep(10)
        sys.exit()

    process = 1 if input("是否爬取commits細節(Y/N):").upper() == "Y" else 0

    access(url_addr)
    print("[OK] 爬行結束 ...")
    # The script exits whatever the answer; the prompt only keeps the console
    # window open until the user responds.
    input("是否關閉當前視窗(Y/N):")
    sys.exit()


if __name__ == "__main__":
    init(autoreset=True)
    main()