1. 程式人生 > >NJU實驗室安全題庫爬蟲

NJU實驗室安全題庫爬蟲

關鍵是抓包分析請求。這裡模擬登陸有點難,需要包含很多資訊;也可以先手動登陸,再使用 cookies。抓取資料時用 BS4 解析。

from bs4 import BeautifulSoup as bfs
import requests
import time

def login(username, password):
    """Log in to the lab-safety exam site and return the session that was used.

    Replays a captured ASP.NET WebForms login POST against
    ``/pc/index.aspx`` with the supplied credentials.

    Args:
        username: Account ID, sent as the ``LoginID`` form field.
        password: Password, sent as the ``UserPwd`` form field.

    Returns:
        The ``requests`` session that performed the login POST. It is
        returned whether or not the server accepted the credentials.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Headers copied from a captured browser request. Content-Length is
    # deliberately not set here: requests computes it from the encoded body,
    # and the body length depends on the credentials.
    headers = {
        'Host': '219.219.115.160',
        'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
        'Connection': 'Keep-Alive',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
        'Content-Type': 'application/x-www-form-urlencoded',
        # NOTE(review): hard-coded session id from the capture; presumably it
        # has to be replaced with a live one - confirm.
        'Cookie': 'ASP.NET_SessionId=nux5h1mhokpcrelfcpqqr2w5',
        'Referer': 'http://219.219.115.160/pc/index.aspx',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    }

    data = {
        '__VIEWSTATE': '/wEPDwUKMTg4ODA4NDE0Ng9kFgICAw9kFgICBw8PFgIeB0VuYWJsZWRnFgIeB29uY2xpY2sFrwJ2Y'
                       'XIgbGVmdD0oc2NyZWVuLndpZHRoLTU1MCkvMjt2YXIgdG9wPShzY3JlZW4uaGVpZ2h0LTYwLTQyMikvMjtOZXdXaW49d2'
                       'luZG93Lm9wZW4oJ1BlcnNvbkluZm8vUmVnaXN0VXNlci5hc3B4JywnUmVnaXN0VXNlcicsJ3RpdGxlYmFyPXllcyxtZW51YmFyPW5vLHRvb'
                       '2xiYXI9bm8sbG9jYXRpb249bm8sZGlyZWN0b3JpZXM9bm8sc3RhdHVzPXllcyxzY3JvbGxiYXJzPW5vLHJlc2l6YWJsZT1ubyxjb3B5aGlzdG9yeT1'
                       '5ZXMsdG9wPScrdG9wKycsbGVmdD0nK2xlZnQrJyx3aWR0aD01NTAsaGVpZ2h0PTUwMCcpO3JldHVybiBmYWxzZTtkZBwwcHXyGLjoZswfc5UGVTqx/I86h'
                       'm7z1UxpKqX/aXuT',
        '__VIEWSTATEGENERATOR': 'BBE0D82B',
        # Raw GBK bytes of the button caption; requests url-encodes them to
        # %B5%C7+%C2%BC, matching the capture. Passing the already
        # percent-encoded text as a str would get it escaped a second time
        # ("%" -> "%25").
        'ButLogin': b'\xb5\xc7 \xc2\xbc',
        'LoginID': username,
        'UserPwd': password,
    }

    url = 'http://219.219.115.160/pc/index.aspx'

    spyder = requests.session()
    # A timeout keeps the script from hanging forever on an unresponsive host.
    response = spyder.post(url, data=data, headers=headers, timeout=10)
    # Response.ok only reflects the HTTP status (< 400); it does not prove
    # that the server accepted the credentials.
    if response.ok:
        print("登陸成功")
    return spyder

def get_problems(spyder, number):
    """Fetch the text of one exercise question.

    Posts the captured exercise form to ``StartExercise_Mobile.aspx`` with
    ``TestNum=number`` and extracts the first ``td[colspan]`` cell from the
    GBK-decoded response.

    Args:
        spyder: Session-like object whose ``post(url, data=..., headers=...,
            timeout=...)`` returns a response exposing ``.content`` bytes.
        number: Question number placed in the ``TestNum`` query parameter.

    Returns:
        The stripped text of the first ``td[colspan]`` element on the page.

    Raises:
        IndexError: If the page contains no ``td[colspan]`` element.
        UnicodeDecodeError: If the response body is not valid GBK.
    """
    numurl = 'http://219.219.115.160/pc/PersonInfo/StartExercise_Mobile.aspx?TestNum=%s' % number

    # Headers copied from a captured browser request.
    headers = {
        'Host': '219.219.115.160',
        'Cache-Control': 'no-cache',
        'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
        'Connection': 'Keep-Alive',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
        'Content-Type': 'application/x-www-form-urlencoded',
        # NOTE(review): hard-coded session id from the capture; presumably it
        # has to match a logged-in session - confirm.
        'Cookie': 'ASP.NET_SessionId=nux5h1mhokpcrelfcpqqr2w5',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    }

    postdata = {
        '__EVENTARGUMENT': '',
        # Field names must not carry the trailing ':' that a packet-capture
        # view appends; with it the server never sees these fields.
        '__EVENTTARGET': '',
        '__LASTFOCUS': '',
        '__VIEWSTATE': '/wEPDwUKMTQ1OTIyNzY3Nw9kFgJmD2QWBGYPEA8WBh4NRGF0YVRleHRGaWVsZAUITG9yZU5hbWUeDkRhdGFWYWx1ZUZpZWxkBQZMb3JlSUQeC18hRGF0YUJvdW5kZ2QQFQkUPT09PeaJgOacieS4k+mimD09PT0V6Ziy54Gr5a6J5YWo5LiO5L+d5a+GG+WMluWtpuWNsemZqeWTgeS9v+eUqOWuieWFqAznlKjnlLXlronlhagJ6YCa6K+G57G7DOW6lOaApeaVkeaPtAnovpDlsITnsbsV6K6h566X5py6572R57uc5a6J5YWoKuS7quWZqOiuvuWkh++8iOeJueenjeiuvuWkh++8ieS9v+eUqOWuieWFqBUJATACNTICNTMCNTkCNjACNjECNjICNjMCNjQUKwMJZ2dnZ2dnZ2dnFgFmZAIBDxBkEBUBFD09PT3lhajpg6jpopjlnos9PT09FQEBMBQrAwFnFgFmZGRML2mFcR03ZNPi08k41KsuIXSn028KtQLqEv8qLV7yCA==',
        '__VIEWSTATEGENERATOR': '08FA5156',
        'AutoJudge': '0',
        'drpQuestionType': '0',
        'drpSubject': '64',
        'FillAutoGrade': '0',
        'irow': '1',
        'PaperID': '0',
        'PassMark': '0',
        'SeeResult': '0',
        'select1': '1',
        'TestTypeTitle1': '判斷題',
        'timeminute': '0',
        'timesecond': '0',
        'UserScoreID': '0',
    }

    # A timeout keeps one stalled request from blocking the whole crawl.
    response = spyder.post(numurl, data=postdata, headers=headers, timeout=10)
    page = response.content.decode('gbk')
    cells = bfs(page, 'html.parser').select('td[colspan]')
    if not cells:
        # Same exception type an empty-list index would raise, but with a
        # message that says what was missing.
        raise IndexError('no td[colspan] cell found for TestNum=%s' % number)
    return cells[0].text.strip()


if __name__ == '__main__':
    # Interactive entry point: log in, then fetch questions 1, 2, 3, ... and
    # append each one as a line to problems.txt until a fetch fails.
    username = input("輸入登陸賬號:")
    password = input("請輸入密碼:")
    spyder = login(username, password)
    i = 1
    # Open the output once; the context manager closes it on every exit path.
    # An explicit encoding keeps the file independent of the platform default.
    with open('problems.txt', 'a', encoding='utf-8') as f:
        while True:
            try:
                problem = get_problems(spyder, i)
            except IndexError:
                # get_problems raises IndexError when the page holds no
                # question cell; taken here to mean there are no more
                # questions, so the crawl ends quietly.
                break
            except Exception as exc:
                # Network or decoding failure: report it instead of stopping
                # without any explanation.
                print("第%d題爬取失敗:%r" % (i, exc))
                break
            f.write(problem + '\n')
            # Report progress only after question i has actually been saved.
            if i % 20 == 0:
                print("已完成%d題爬取" % i)
            i += 1
            # Short pause between requests to avoid hammering the server.
            time.sleep(0.5)