
Implementing GitHub user-email scraping with Scrapy

The overall flow is roughly:

1. Visit GitHub's daily trending page: https://github.com/trending?since=daily
2. Log in using either Selenium or requests
3. Open each project's list of stargazers
4. Iterate over all stargazers and visit each user's profile page
5. If the user has a public email, save it; otherwise save nothing

The complete code is shown below.

spiders:

# -*- coding: utf-8 -*-
import scrapy
import re
from github_test.utils.login import login_get_cookies_selenium,login_get_cookies_requests
import math
from scrapy.loader import ItemLoader
from github_test.items import EmailItem

class GithubSpider(scrapy.Spider):
    name = 'github'
    allowed_domains = ['github.com']
    # start_urls = []
    start_urls = ['https://github.com/trending?since=daily']

    cookies = {}

    def parse(self, response):

        # Log in with requests
        # self.cookies = login_get_cookies_requests()

        # Log in with Selenium
        self.cookies = login_get_cookies_selenium(response.url)

        # Grab the HTML block containing the trending repository list
        repo_list = response.xpath('/html/body/div[4]/div[2]/div/div[1]/div[2]/ol').extract()[0]

        # Extract each repository's stargazers URL path, e.g. /owner/repo/stargazers
        mem_list_url = re.findall(r'/.{1,20}/.{1,20}/stargazers', repo_list)

        for path in mem_list_url:
            yield scrapy.Request(url="https://github.com" + path, callback=self.member_list, cookies=self.cookies)

    # Paginate through each repository's stargazer list
    def member_list(self, response):

        # Number of stargazers for this repository, e.g. "1,024"
        count = response.xpath('//*[@id="repos"]/div[1]/nav/a[1]/span/text()').extract()[0]

        # 51 stargazers are listed per page; the page count is capped at 100
        page_size = 51
        pages = min(math.ceil(int(count.replace(',', '')) / page_size), 100)
        for page_number in range(1, pages + 1):
            yield scrapy.Request(url=response.url + '?page=' + str(page_number), callback=self.user_detail, cookies=self.cookies)

    # Request each stargazer's profile page
    def user_detail(self, response):
        user_list_css = response.xpath('//*[@id="repos"]/ol').extract()[0]
        user_list = re.findall('<a href="(.{0,20})">', user_list_css)
        for user in user_list:
            yield scrapy.Request(url='https://github.com' + user, callback=self.get_email, cookies=self.cookies)

    # Extract the user's public email, if any
    def get_email(self, response):
        loader = ItemLoader(item=EmailItem(), response=response)
        # The email's position in the vcard list depends on how many
        # profile details the user has filled in
        if len(response.css('.vcard-details li')) == 4:
            email = response.xpath('//*[@id="js-pjax-container"]/div/div[1]/ul/li[3]/a/text()').extract()[0]
            loader.add_value('email', email)
            yield loader.load_item()
        elif len(response.css('.vcard-details li')) == 3:
            email = response.xpath('//*[@id="js-pjax-container"]/div/div[1]/ul/li[2]/a/text()').extract()[0]
            loader.add_value('email', email)
            yield loader.load_item()
        else:
            print("User has no public email")

login:

# -*- coding: utf-8 -*-
# @File  : login.py
# @Author: Lyn
# @Date  : 2018/10/6
from selenium import webdriver
import scrapy
import requests

# Log in with Selenium and return the browser cookies
def login_get_cookies_selenium(url):
    driver = webdriver.Chrome(executable_path='../chromedriver')
    driver.get(url)
    # Click the "Sign in" link in the page header
    driver.find_element_by_css_selector(
        'body > div.position-relative.js-header-wrapper > header > div > div.HeaderMenu.d-lg-flex.flex-justify-between.flex-auto > div > span > div > a:nth-child(1)').click()
    driver.implicitly_wait(5)
    # Replace with your own GitHub credentials
    driver.find_element_by_id('login_field').send_keys('your_email')
    driver.find_element_by_id('password').send_keys('your_password')
    driver.find_element_by_css_selector('#login > form > div.auth-form-body.mt-3 > input.btn.btn-primary.btn-block').click()

    return driver.get_cookies()

# Log in with requests and return the session cookies
def login_get_cookies_requests():
    session = requests.Session()
    # Fetch the login page first: the authenticity_token in the form is
    # tied to the cookies issued with it
    res = session.get('https://github.com/login')
    authenticity_token = scrapy.Selector(text=res.text).css(
        'input[name="authenticity_token"]::attr(value)').extract_first()
    session.post(
        'https://github.com/session',
        data={
            'commit': 'Sign in',
            'utf8': '✓',
            'authenticity_token': authenticity_token,
            'login': 'your_email',
            'password': 'your_password'
        },
    )
    # The Session keeps cookies across the login redirect
    return session.cookies.get_dict()
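
Popular trending repositories share many stargazers, so the same user can be reached more than once. A small deduplication pipeline (hypothetical; not part of the original post) keeps each address only once:

# pipelines.py: hypothetical dedup pipeline; enable it in settings.py with
# ITEM_PIPELINES = {'github_test.pipelines.DedupEmailPipeline': 300}
from scrapy.exceptions import DropItem

class DedupEmailPipeline:
    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        email = item.get('email')
        if not email or email in self.seen:
            raise DropItem('duplicate or missing email')
        self.seen.add(email)
        return item

For the project to run as written, the Scrapy settings also need a few entries. A hypothetical excerpt, with module names inferred from the imports above; note that GitHub's robots.txt disallows most paths, so the project-template default of obeying it would filter out every request:

# settings.py: hypothetical excerpt; names inferred from the imports above
BOT_NAME = 'github_test'
SPIDER_MODULES = ['github_test.spiders']
NEWSPIDER_MODULE = 'github_test.spiders'

# GitHub's robots.txt disallows most crawling, so the template default
# ROBOTSTXT_OBEY = True would filter out every request
ROBOTSTXT_OBEY = False

# Throttle requests to stay under GitHub's rate limits
DOWNLOAD_DELAY = 1

# Enable the deduplication pipeline sketched above
ITEM_PIPELINES = {'github_test.pipelines.DedupEmailPipeline': 300}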

The full code and some of the scraped results have been uploaded to GitHub; click here to view them. If you have any questions, please open an issue or leave a comment below the post~