1. 程式人生 > 其它 >CVPR頂會論文爬取

CVPR頂會論文爬取

main.py

import pymysql
import re
import requests

# 連線資料庫函式
from bs4 import BeautifulSoup


def insertCvpr(value):
    """Insert one paper record into the ``cvpr`` table.

    A new MySQL connection is opened for every call, a parameterized INSERT
    is executed and committed, and the connection is closed afterwards.
    Database errors are reported on stdout and are not re-raised.

    Args:
        value: Sequence of six values bound, in order, to the columns
            ``title, ab, author, hotword, pdf, path``.
    """
    sql = 'INSERT INTO cvpr(title,ab,author,hotword,pdf,path) VALUE (%s,%s,%s,%s,%s,%s)'
    # Bound before the try so the handlers can tell whether connect() succeeded;
    # otherwise a failed connect would raise NameError inside the except block.
    db = None
    try:
        db = pymysql.connect(host="localhost", user="root", password="password", database="article", charset="utf8")
        print("資料庫連線成功!")
        cur = db.cursor()
        cur.execute(sql, value)
        db.commit()
        print("增加資料成功!")
    except pymysql.Error as e:
        print("增加資料失敗: " + str(e))
        if db is not None:
            db.rollback()
    finally:
        # Always release the connection, on success and on failure alike.
        if db is not None:
            db.close()


# Main script: fetch the CVPR 2020 listing page for one day, then visit each
# paper page and store its details in the database.
print("1")
url = "https://openaccess.thecvf.com/CVPR2020.py?day=2020-06-16"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36"}
res = requests.get(url, headers=headers)
res.encoding = "utf-8"
# First collect the relative URL of every paper page from the listing.
web = re.findall("""<dt class="ptitle"><br><a href="(.*?)">.*?</a></dt>""", res.text, re.S)
print("2")
for each in web:
    try:
        each = "http://openaccess.thecvf.com/" + each
        print("3")
        print(each)
        res = requests.get(each, headers=headers, timeout=(3, 7))
        # The encoding must be set before the first use of res.text so that
        # both the parsed document and the regex searches see UTF-8 text.
        res.encoding = "utf-8"
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        paper = BeautifulSoup(res.text, "html.parser")
        # Extract the detailed fields from the individual paper page.
        title = re.findall("""<div id="papertitle">(.*?)</div>""", res.text, re.S)  # title
        ab = re.findall("""<div id="abstract" >(.*?)</div>""", res.text, re.S)  # abstract
        author = paper.find("div", {"id": "authors"}).find("b").find("i").text  # authors
        # Raw string: the backslashes are regex escapes, not string escapes.
        pdf = re.findall(r"""\[<a href="\.\./\.\./(.*?)">pdf</a>\]""", res.text, re.S)  # PDF download link
        path = each  # URL of the paper's summary page
        if title:
            title = title[0].replace("\n", "")
            ab = ab[0].replace("\n", "")
            pdf = "http://openaccess.thecvf.com/" + pdf[0]
            print(title)
            print(author)
            # The hotword column is stored as an empty string.
            value = (title, ab, author, "", pdf, path)
            insertCvpr(value)
    except Exception as e:
        # Best effort: report the failing page and continue with the next one,
        # without swallowing KeyboardInterrupt/SystemExit as a bare except would.
        print("異常: " + str(e))

2.資料庫

遇到的問題:

注意:若欄位定義為 varchar(255),摘要等較長的內容會超出欄位長度而寫入失敗;這類欄位應改用 text 或 longtext 型別儲存。

MySQL中tinytext、text、mediumtext和longtext等各個型別詳解