Scraping GitHub users' email addresses with Scrapy
阿新 · Published 2018-12-14
The overall flow is roughly as follows:

1. Visit GitHub's daily trending page: https://github.com/trending?since=daily
2. Log in with selenium or with requests
3. Open the stargazer list of each trending repository
4. Walk through all the stargazers and visit each user's profile page
5. If the user has a public email, save it; otherwise save nothing

The code is as follows.

spiders:
# -*- coding: utf-8 -*-
import math
import re

import scrapy
from scrapy.loader import ItemLoader

from github_test.items import EmailItem
from github_test.utils.login import login_get_cookies_selenium, login_get_cookies_requests


class GithubSpider(scrapy.Spider):
    name = 'github'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/trending?since=daily']
    cookies = {}

    def parse(self, response):
        # Log in with requests:
        # self.cookies = login_get_cookies_requests()
        # Log in with selenium:
        self.cookies = login_get_cookies_selenium(response.url)
        # Grab the trending repository list
        repo_list = response.xpath('/html/body/div[4]/div[2]/div/div[1]/div[2]/ol').extract()[0]
        # Extract each repository's stargazers URL
        mem_list_url = re.findall(r'/.{1,20}/.{1,20}/stargazers', repo_list)
        for path in mem_list_url:
            yield scrapy.Request(url='https://github.com' + path,
                                 callback=self.member_list,
                                 cookies=self.cookies)

    # Paginate through each repository's stargazer list
    def member_list(self, response):
        # Total number of stargazers
        count = response.xpath('//*[@id="repos"]/div[1]/nav/a[1]/span/text()').extract()[0]
        # Number of pages: 51 users per page, capped at 100 pages
        page_size = 51
        pages = min(math.ceil(int(count.replace(',', '')) / page_size), 100)
        for page_number in range(1, pages + 1):  # range(1, pages) would skip the last page
            yield scrapy.Request(url=response.url + '?page=' + str(page_number),
                                 callback=self.user_detail,
                                 cookies=self.cookies)

    # Collect the profile URL of every user on a stargazer page
    def user_detail(self, response):
        user_list_css = response.xpath('//*[@id="repos"]/ol').extract()[0]
        user_list = re.findall('<a href="(.{0,20})">', user_list_css)
        for user in user_list:
            yield scrapy.Request(url='https://github.com' + user,
                                 callback=self.get_email,
                                 cookies=self.cookies)

    # Extract the user's email from the profile sidebar
    def get_email(self, response):
        loader = ItemLoader(item=EmailItem(), response=response)
        vcard_count = len(response.css('.vcard-details li'))
        if vcard_count == 4:
            email = response.xpath('//*[@id="js-pjax-container"]/div/div[1]/ul/li[3]/a/text()').extract()[0]
            loader.add_value('email', email)
            yield loader.load_item()
        elif vcard_count == 3:
            email = response.xpath('//*[@id="js-pjax-container"]/div/div[1]/ul/li[2]/a/text()').extract()[0]
            loader.add_value('email', email)
            yield loader.load_item()
        else:
            print('User has no public email')
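The spider imports EmailItem from github_test.items, which this post does not show. Below is a minimal sketch of what that item class might look like; only the email field is actually used by the spider, and the TakeFirst output processor is my assumption, added so the ItemLoader yields a single string rather than a one-element list:

# items.py -- a minimal sketch, not the author's original file
import scrapy
from scrapy.loader.processors import TakeFirst


class EmailItem(scrapy.Item):
    # The only field the spider populates; TakeFirst (an assumption)
    # collapses the ItemLoader's collected list into a single string
    email = scrapy.Field(output_processor=TakeFirst())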
login:
# -*- coding: utf-8 -*-
# @File  : login.py
# @Author: Lyn
# @Date  : 2018/10/6
import re

import requests
from selenium import webdriver


# Log in with selenium
def login_get_cookies_selenium(url):
    driver = webdriver.Chrome(executable_path='../chromedriver')
    driver.get(url)
    # Click the "Sign in" link in the page header
    driver.find_element_by_css_selector(
        'body > div.position-relative.js-header-wrapper > header > div > '
        'div.HeaderMenu.d-lg-flex.flex-justify-between.flex-auto > div > span > div > a:nth-child(1)').click()
    driver.implicitly_wait(5)
    # Fill in the credentials and submit the form
    driver.find_element_by_id('login_field').send_keys('your_email')
    driver.find_element_by_id('password').send_keys('your_password')
    driver.find_element_by_css_selector(
        '#login > form > div.auth-form-body.mt-3 > input.btn.btn-primary.btn-block').click()
    cookies = driver.get_cookies()
    driver.quit()
    return cookies


# Log in with requests
def login_get_cookies_requests():
    # Fetch the login page first: it sets the session cookies and embeds
    # the CSRF token (authenticity_token) that the login form requires
    res1 = requests.get('https://github.com/login')
    authenticity_token = re.search(
        'name="authenticity_token" value="(.+?)"', res1.text).group(1)
    res2 = requests.post(
        'https://github.com/session',
        data={
            'commit': 'Sign in',
            'utf8': '✓',
            'authenticity_token': authenticity_token,
            'login': 'your_email',
            'password': 'your_password'
        },
        cookies=res1.cookies.get_dict(),
    )
    return res2.cookies.get_dict()
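Step 5 of the flow ("save the email if present") implies an item pipeline, which the post also does not include. Here is a hypothetical sketch of one way to do it, assuming a pipeline class EmailPipeline that deduplicates addresses in memory and appends each new one to emails.txt:

# pipelines.py -- a hypothetical sketch, not the author's code
class EmailPipeline(object):

    def open_spider(self, spider):
        # Track already-seen addresses so each email is written only once
        self.seen = set()
        self.file = open('emails.txt', 'a')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        email = item.get('email')
        if email and email not in self.seen:
            self.seen.add(email)
            self.file.write(email + '\n')
        return item

To use it, register it in settings.py, e.g. ITEM_PIPELINES = {'github_test.pipelines.EmailPipeline': 300}; you will probably also want ROBOTSTXT_OBEY = False and a DOWNLOAD_DELAY, since GitHub throttles aggressive crawlers.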
The full code and part of the scraped results have been uploaded to GitHub; click here to view them directly. If you have questions, please open an issue or leave a comment below the post~