Python 爬取 Codeforces 比賽題目
阿新 • • 發佈:2018-12-24
Codeforces(cf)的題目有很多 LaTeX 公式,而且是用 '$$$' 三個符號標記,所以複製題目寫部落格的時候很不方便。因此寫了一個爬蟲,用來儲存一場比賽中的所有題目資訊。
# -*- coding:utf-8 -*-
import os
import requests
from bs4 import BeautifulSoup
# Markdown output file shared by the whole script: FindInfo writes to it and
# main closes it. UTF-8 is requested explicitly because the script writes
# non-ASCII text; the platform default encoding may be unable to represent it.
f = open('blog.md', 'w', encoding='utf-8')
# Delimiter style for LaTeX formulas: 0 keeps '$$$', 1 uses '$$', 2 uses '$'.
# Assigned from user input in main and read by Clear.
Latextag = 0
def GetHtmlText(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or an empty string when the request fails
        (connection problem, 30-second timeout, malformed URL or an HTTP
        error status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort download: a failed request is reported as "".
        # Only request/HTTP failures are swallowed, so Ctrl-C and genuine
        # programming errors still surface instead of being hidden.
        return ""
    # Decode as UTF-8 regardless of the charset guessed from the headers.
    r.encoding = 'utf-8'
    return r.text
def Clear(text):
    """Rewrite the '$$$' formula delimiters in *text* per ``Latextag``.

    The module-level ``Latextag`` selects the target delimiter:
    0 keeps '$$$', 1 turns every '$$$' into '$$', 2 turns it into '$'.
    Any other value leaves the text unchanged.

    Args:
        text: Problem text containing '$$$'-delimited formulas.

    Returns:
        The text with every '$$$' replaced by the selected delimiter.
    """
    replacements = {1: '$$', 2: '$'}
    delimiter = replacements.get(Latextag)
    if delimiter is None:
        # 0 means "keep '$$$'"; an unrecognised value is treated the same
        # way so that it can never make this function loop forever.
        return text
    # One left-to-right pass over non-overlapping matches. Re-searching from
    # the start after each shrink would merge adjacent delimiters (e.g. the
    # '$$$$$$' between two back-to-back formulas) into a single one.
    return text.replace('$$$', delimiter)
def FindInfo(soup, url):
    """Append one problem's statement to the output file as Markdown.

    Extracts the title, description, input/output specification and the
    sample tests from a parsed problem page and writes them to the
    module-level file ``f``, followed by a link back to the problem and an
    empty code fence for the accepted solution.

    Args:
        soup: BeautifulSoup tree of the problem page.
        url: Address of the problem page; used for the link and in the
            diagnostic message.

    If the tree has no ``problemindexholder`` element (for example because
    the page could not be downloaded and the tree is empty), or that element
    holds fewer nested ``div`` elements than the fixed positions below need,
    a message is printed and nothing is written.
    """
    holder = soup.find('div', {'class': 'problemindexholder'})
    divs = holder.find_all('div') if holder is not None else []
    # The statement parts are picked by position; 15 is the highest index used.
    # NOTE(review): the positions presumably mirror the Codeforces problem
    # page layout - confirm against the current markup.
    if len(divs) < 16:
        print('No problem statement found at %s, skipped' % url)
        return

    title = '# ' + divs[3].get_text()
    f.write('%s\n' % title)

    problem = Clear('## Description:\n' + divs[12].get_text())
    f.write('%s\n' % problem)

    # [5:] / [6:] drop the first 5 / 6 characters of the extracted text
    # (the lengths of the words "Input" / "Output").
    input_spec = Clear('## Input:\n' + divs[13].get_text()[5:])
    f.write('%s\n' % input_spec)

    output_spec = Clear('## Output\n' + divs[15].get_text()[6:])
    f.write('%s\n' % output_spec)

    sample = soup.find('div', {'class': 'sample-test'})
    if sample is not None:
        sample_inputs = sample.find_all('div', {'class': 'input'})
        sample_outputs = sample.find_all('div', {'class': 'output'})
        # Inputs and outputs are paired in document order.
        for sample_input, sample_output in zip(sample_inputs, sample_outputs):
            f.write('## Sample Input:\n%s\n' % sample_input.get_text()[5:])
            f.write('## Sample Output:\n%s\n' % sample_output.get_text()[6:])

    f.write('### [題目連結](%s)\n\n' % url)
    f.write('## AC程式碼:\n```\n```\n')
def main():
    """Interactively download the chosen problems of one contest.

    Asks for the LaTeX delimiter style, the contest URL and the problem
    ids, then fetches each problem page with ``GetHtmlText`` and hands the
    parsed tree to ``FindInfo``. The module-level output file ``f`` is
    closed before returning, including when one of the steps raises.
    """
    global Latextag
    print('Welcome to use codeforces contest crawler\n')
    try:
        # Keep asking until the answer is one of the supported styles.
        while True:
            answer = input("Please enter the Latex tag you need(0:'$$$',1:'$$',2:'$'):\n")
            try:
                choice = int(answer)
            except ValueError:
                choice = None
            if choice in (0, 1, 2):
                Latextag = choice
                break
            print('Invalid choice, please enter 0, 1 or 2.')

        Url = input("請輸入比賽連結(eg:'http://codeforces.com/contest/1003'):\n")
        # split() without an argument ignores repeated and trailing spaces,
        # so no empty problem id is produced.
        Problem = input('請輸入比賽題目編號(eg:A B C D E F):\n').split()
        # Tolerate a contest link pasted with surrounding spaces or a
        # trailing slash; otherwise the result would contain '//problem/'.
        Url = Url.strip().rstrip('/') + '/problem/'

        for i in Problem:
            url = Url + i
            print(url)
            # Turn line breaks and paragraph ends into newlines before
            # parsing, presumably so the extracted text keeps them.
            html = GetHtmlText(url).replace('<br />', '\n').replace('</p>', '\n')
            soup = BeautifulSoup(html, "html.parser")
            FindInfo(soup, url)
    finally:
        f.close()
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()