1. 程式人生 > 實用技巧 >【爬蟲】獲取Github倉庫提交紀錄歷史的指令碼 python

【爬蟲】獲取Github倉庫提交紀錄歷史的指令碼 python

本指令碼為第一版,後續會持續進行擴充套件。

#! python3

import requests
import time, datetime
import json
from colorama import Fore,Back,Style,init
from bs4 import BeautifulSoup

# Module-level flags, both starting at 0.
# `process`: main() sets it to 1 when the user asks for per-commit details;
#            commits() reads it to decide whether to print them.
# `output`:  not referenced anywhere else in the visible part of this file.
process = output = 0


def req(type,addr,data='',**args):
    """Send an HTTP request through ``requests`` and return the response.

    Args:
        type: HTTP method selector, exactly ``'get'`` or ``'post'``. The name
            shadows the builtin but is kept so keyword callers keep working.
        addr: URL to request.
        data: Request body for ``'post'``; ignored for ``'get'``. The default
            empty string sends no body.
        **args: Accepted for backward compatibility; currently unused.

    Returns:
        The response object returned by ``requests.get`` / ``requests.post``.

    Raises:
        ValueError: If ``type`` is neither ``'get'`` nor ``'post'``.
        requests.exceptions.RequestException: If the request fails (for
            example a timeout or connection error). The failure is printed
            and then re-raised instead of being silently swallowed.
    """
    # Validate first so an unknown method fails with a clear message rather
    # than with an unbound-variable error at the return statement.
    if type not in ('get', 'post'):
        raise ValueError("unsupported request type: %r" % (type,))
    try:
        if type == 'get':
            responses = requests.get(addr, timeout=50)
        else:
            responses = requests.post(addr, data=data, timeout=50)
    except requests.exceptions.RequestException as e:
        # Report and propagate: callers immediately read .status_code, so
        # there is no meaningful value to return after a failed request.
        print("[ERROR]   %s [request failed] %s" % (addr, e))
        raise
    return responses

def access(url_addr):
    """Probe every repository in ``url_addr`` and crawl the reachable ones.

    For each record the repository URL (``git_addr``) is printed and fetched
    with ``req('get', ...)``. On HTTP 200 a success line is printed,
    ``git_addr`` is rewritten in place to the ``/commits/`` page and the
    record is handed to ``commits``. On any other status an error line is
    printed and the status is stored under the record's ``code`` key.

    Args:
        url_addr: Iterable of dict records, each with a ``git_addr`` key.

    Returns:
        None.
    """
    for repo in url_addr:
        repo_url = repo['git_addr']
        print(repo_url)
        responses = req('get', repo_url)
        status = responses.status_code

        # Guard clause: record the failure and move on to the next record.
        if status != 200:
            print(Fore.BLACK + Back.RED + "[ERROR]   "+"%s [status] %s"%(str(repo_url), str(status)))
            repo['code'] = status
            continue

        print("[SUCCESS]" + " %s [status] %s"%(str(repo_url), str(status)))
        # commits() reads the page URL from the record itself.
        repo['git_addr'] = repo_url + '/commits/'
        commits(repo)
def commits(addr):
    """Crawl one repository's commit history page by page and print a report.

    Each page is fetched with ``req('get', ...)`` and parsed with
    BeautifulSoup. For every ``TimelineItem-body`` group (one per date) the
    commits listed in it are parsed and a per-date summary line is printed.
    Per-commit details are printed only when the module-level ``process``
    flag is truthy. Pagination follows the last link in the
    ``paginate-container`` element while its text is ``'Older'``.

    Args:
        addr: Repository record (dict). ``addr['git_addr']`` is the URL of
            the first commits page and ``addr['username']`` labels the
            output. The dict is updated in place: ``git_addr`` is advanced
            to each following page, and ``code`` is set to the HTTP status
            when a page does not return 200.

    Returns:
        None.
    """
    # Pages are walked in a loop rather than by recursion, so a long history
    # cannot exhaust the interpreter's recursion limit.
    while True:
        url = addr['git_addr']
        responses = req('get', url)
        if responses.status_code != 200:
            # A non-200 status is a failure, so it is reported as an error.
            print(Fore.BLACK + Back.RED + "[ERROR]   "+"%s [status] %s"%(str(url), str(responses.status_code)))
            addr['code'] = responses.status_code
            return
        text = BeautifulSoup(responses.text, "html.parser")

        # Empty-repository check. It must run on the page's text: `in` on the
        # parsed document only compares against its direct child nodes.
        if "This repository is empty." in text.get_text():
            print(Fore.RED + Back.WHITE +"%s 的倉庫內容爬取過程中發現告警[This repository is empty.]"%(addr['username']))
            return

        try:
            for texts in text.find_all(class_='TimelineItem-body'):
                # The heading's first 11 characters are dropped, leaving an
                # English date such as "Nov 26, 2020" (presumably the prefix
                # is "Commits on " -- confirm against the live markup).
                dateBar = texts.find(class_='text-normal').get_text()[11:]
                date = datetime.datetime.strptime(dateBar, '%b %d, %Y').strftime('%Y年%m月%d日')
                commits_second = 0  # number of commits found for this date
                if process:
                    print("\n=================[%s]================="%(str(date)))
                for commits_find in texts.ol.find_all('li'):
                    link = commits_find.div.p.a
                    # aria-label holds summary and description joined
                    # together; the link text's length marks where the
                    # summary ends.
                    label = link['aria-label']
                    summary_len = len(link.get_text())
                    commits_dict = {
                        'commits_auth' : commits_find.div.find('div',class_='d-flex').find('div',class_='f6').find(class_='commit-author').get_text(),
                        'commits_time' : commits_find.find('relative-time')['datetime'],
                        'commits_href' : "https://github.com" + link['href'],
                        'commits_summary' : label[:summary_len],
                        'commits_description' : label[summary_len:].strip()
                    }
                    commits_second += 1
                    # Output of the crawled data.
                    if process :
                        print("\n-----------------[%s]-----------------"%(commits_dict['commits_auth']))
                        print ("[提交時間] %s \n[提交程式碼] %s\n[提交主題] %s\n[提交描述] %s"
                                    %(commits_dict['commits_time'], commits_dict['commits_href'],
                                    commits_dict['commits_summary'], commits_dict['commits_description']))
                # One summary line per date group. Printing inside the loop
                # reports every date and never touches unset variables when
                # a page has no groups.
                print(Fore.BLACK + Back.WHITE +"%s 於 %s 共計提交了 %s 次程式碼"%(addr['username'], date, commits_second))

            # Pagination: a page without a pager, or whose last link is not
            # "Older", is treated as the final page.
            pager = text.find(class_='paginate-container')
            next_a = pager.find_all('a') if pager is not None else []
            if not (next_a and next_a[-1].get_text() == 'Older'):
                return
            print("------next page------")
            addr['git_addr'] = next_a[-1]['href']
        except Exception as e:
            # Scraping boundary: unexpected markup must not abort the whole
            # crawl, but the cause is shown so it can be diagnosed.
            print(Fore.RED + Back.WHITE +"%s 的倉庫爬取過程中發生錯誤."%(addr['username']))
            print("[%s] %s" % (type(e).__name__, e))
            return

def main():
    """Interactive entry point: authenticate, choose verbosity, run the crawl.

    Prompts for the crawler key and exits after a 10-second pause when it is
    wrong. It then asks whether per-commit details should be printed and
    stores the answer in the module-level ``process`` flag. Finally it runs
    ``access`` over the hard-coded repository list and asks one last question
    before exiting. The process exits whatever the answer to that question,
    so the prompt only serves as a pause.

    Raises:
        SystemExit: Always, either after a wrong key or once the crawl ends.
    """
    global process
    # sys.exit() is used instead of the bare exit() helper: exit() is
    # injected by the `site` module for interactive use and is not guaranteed
    # to exist (for example under `python -S`).
    import sys

    url_addr = [
        {
            'username' : 'X1',
            'git_addr' : 'https://github.com/litbird0/elevator', # project URL
            'start'    : '',
            'commins'  : [],
        },
        {
            'username' : 'X2',
            'git_addr' : 'https://github.com/1564820398/cjwc_dianti',
            'start'    : '',
            'commins'  : [],
        },

    ]

    Webcrawler_key = "mirror"
    if input("請輸入爬蟲Key:") != Webcrawler_key:
        print(Fore.RED + Back.WHITE + "Key錯誤!")
        # Pause so the message stays readable before the process ends.
        time.sleep(10)
        sys.exit()

    process = 1 if input("是否爬取commits細節(Y/N):").upper() == "Y" else 0

    access(url_addr)
    print("[OK] 爬行結束 ...")
    # Both answers lead to exit; the prompt only waits for the user.
    input("是否關閉當前視窗(Y/N):")
    sys.exit()

# Script entry point: runs only when the file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    # Initialise colorama before any coloured output is printed.
    # NOTE(review): per colorama's docs, autoreset=True resets colours after
    # each print, which is why no Style.RESET_ALL appears in this file.
    init(autoreset=True)
    main()