1. 程式人生 > >Python爬codeforces所有的題目資訊

Python爬codeforces所有的題目資訊

直接貼程式碼

from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib import request
import pymysql.cursors
import re


# Crawl the first 10 pages of the Codeforces contest list, open every contest
# found there, and store one row per problem (contest name, problem title,
# problem URL) in the MySQL table `cf`.

BASE_URL = "http://codeforces.com"
# Browser-like User-Agent sent with every request.
USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/69.0.3497.12 Safari/537.36")
INSERT_SQL = "insert into `cf` (`problem`, `problemName`, `problemUrl`) values (%s, %s, %s)"


def _fetch_soup(page_url):
    """Download *page_url* and return it parsed with BeautifulSoup.

    The response is decoded as UTF-8 and is closed as soon as it has been
    read, so no HTTP connection is left open between requests.
    """
    req = request.Request(page_url)
    req.add_header("User-Agent", USER_AGENT)
    with urlopen(req) as resp:
        html = resp.read().decode("UTF-8")
    return BeautifulSoup(html, "html.parser")


num = 0  # running total of inserted problem rows

# One connection for the whole crawl instead of one per inserted row.
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='admin',
                             db='intelligence',
                             charset='utf8mb4'
                             )
try:
    for i in range(10):
        print("當前正處於", (i + 1), "頁")
        soup = _fetch_soup(BASE_URL + "/contests/page/" + str(i + 1))

        # Skip the first four <tr> rows and the last one.
        # NOTE(review): presumably these are header / non-contest rows of the
        # page layout - confirm against the live page.
        contest_rows = soup.find_all("tr")[4:-1]

        for c in contest_rows:
            first_cell = c.find("td")
            url = c.a
            # Rows without a data cell or without a link cannot be followed.
            if first_cell is None or url is None or not url.get("href"):
                continue

            # Contest name: the cell text with all whitespace removed and the
            # last five whitespace-separated tokens dropped.
            # NOTE(review): the "5" looks tied to trailing link text in the
            # cell - verify if the page layout changes.
            cc = "".join(first_cell.get_text().split()[0:-5])

            contest_url = BASE_URL + url["href"]
            soup1 = _fetch_soup(contest_url)

            # The problems are read from the first <select> on the contest
            # page; a page without one has nothing to import.
            select = soup1.find("select")
            if select is None:
                continue

            # The first <option> is discarded (it is not a problem entry).
            options = select.find_all("option")[1:]
            rows = [
                (cc, z.get_text(), contest_url + "/problem/" + z["value"])
                for z in options
            ]

            if rows:
                with connection.cursor() as cursor:
                    cursor.executemany(INSERT_SQL, rows)
                # Commit once per contest so finished contests stay persisted
                # even if a later request fails.
                connection.commit()
                num += len(rows)

            print("當前以新增:", num, "條資料")
finally:
    connection.close()
print("完成")





codeforces有反爬蟲機制

 

資料庫設計: