程式人生 > python爬取codeforces比賽題目

python爬取codeforces比賽題目

cf的題目有很多Latex公式,而且是用’$$$’三個符號標記,所以複製題目寫部落格的時候很不方便,寫一個爬蟲儲存一場比賽中的所有題目資訊。

# -*- coding:utf-8 -*-

import os
import requests
from bs4 import BeautifulSoup

# Output Markdown file; problem statements are appended as they are crawled.
f = open('blog.md', 'w', encoding='utf-8')

# How to rewrite the '$$$' LaTeX delimiters found on codeforces pages:
# 0 keeps '$$$' unchanged, 1 shrinks them to '$$', 2 shrinks them to '$'.
Latextag = 0


def GetHtmlText(url):
    """Fetch *url* and return its decoded HTML text, or '' on any error."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx into an exception
        r.encoding = 'utf-8'
        return r.text
    except Exception:
        # Best-effort: network/HTTP failures degrade to an empty page
        # instead of crashing the whole crawl.
        return ""


def Clear(text):
    """Rewrite every '$$$' delimiter in *text* according to Latextag.

    The original one-occurrence-at-a-time loop never shrank the text when
    Latextag == 0 yet still set its continue flag, spinning forever; a single
    str.replace handles all occurrences correctly (1 -> '$$', 2 -> '$').
    """
    if Latextag in (1, 2):
        return text.replace('$$$', '$' * (3 - Latextag))
    return text


def FindInfo(soup, url):
    """Extract one problem's statement from *soup* and write it to f as Markdown.

    NOTE(review): the hard-coded div indices below mirror the codeforces
    problem-page layout at the time of writing — verify if pages change.
    """
    # attrs must be a dict; the original passed a set literal by mistake.
    AllInfo = soup.find('div', {'class': 'problemindexholder'})
    divs = AllInfo.find_all('div')
    title = '# ' + divs[3].get_text()
    f.write('%s\n' % title)
    problem = Clear('## Description:\n' + divs[12].get_text())
    f.write('%s\n' % problem)
    Input = Clear('## Input:\n' + divs[13].get_text()[5:])   # drop leading 'Input'
    f.write('%s\n' % Input)
    Output = Clear('## Output\n' + divs[15].get_text()[6:])  # drop leading 'Output'
    f.write('%s\n' % Output)
    Sample = soup.find('div', {'class': 'sample-test'})
    SampleInputs = Sample.find_all('div', {'class': 'input'})
    SampleOutputs = Sample.find_all('div', {'class': 'output'})
    for SampleInput, SampleOutput in zip(SampleInputs, SampleOutputs):
        f.write('## Sample Input:\n%s\n' % SampleInput.get_text()[5:])
        f.write('## Sample Output:\n%s\n' % SampleOutput.get_text()[6:])
    f.write('### [題目連結](%s)\n\n' % url)
    f.write('## AC程式碼:\n```\n```\n')


def main():
    """Interactive entry point: ask for contest URL and problem ids, crawl each."""
    global Latextag
    print('Welcome to use codeforces contest crawler\n')
    Latextag = int(input("Please enter the Latex tag you need(0:'$$$',1:'$$',2:'$'):\n"))
    Url = input("請輸入比賽連結(eg:'http://codeforces.com/contest/1003'):\n")
    Problem = input('請輸入比賽題目編號(eg:A B C D E F):\n').split(' ')
    Url += '/problem/'
    for i in Problem:
        url = Url + i
        print(url)
        # <br /> and </p> carry the line structure of the samples; keep them
        # as real newlines before parsing so get_text() preserves layout.
        html = GetHtmlText(url).replace('<br />', '\n').replace('</p>', '\n')
        soup = BeautifulSoup(html, "html.parser")
        FindInfo(soup, url)
    f.close()


if __name__ == '__main__':
    main()

執行結果: