CVPR頂會論文爬取
阿新 • • 發佈:2021-06-12
main.py
"""Scrape CVPR 2020 paper metadata from openaccess.thecvf.com into MySQL."""
import re

import pymysql
import requests
from bs4 import BeautifulSoup

# Patterns applied to the raw HTML of the index page and of each paper page.
# Raw strings keep the backslash escapes intact for the regex engine.
PAPER_LINK_RE = re.compile(
    r"""<dt class="ptitle"><br><a href="(.*?)">.*?</a></dt>""", re.S
)
TITLE_RE = re.compile(r"""<div id="papertitle">(.*?)</div>""", re.S)
ABSTRACT_RE = re.compile(r"""<div id="abstract" >(.*?)</div>""", re.S)
PDF_RE = re.compile(r"""\[<a href="\.\./\.\./(.*?)">pdf</a>\]""", re.S)

# (connect timeout, read timeout) in seconds, shared by every HTTP request.
REQUEST_TIMEOUT = (3, 7)


def insertCvpr(value):
    """Insert one paper record into the ``cvpr`` table.

    Args:
        value: Sequence of six values bound, in order, to the columns
            ``title, ab, author, hotword, pdf, path``.

    Database errors (``pymysql.Error``) are reported on stdout and not
    re-raised. The connection is always closed before returning.
    """
    # Start unbound so the handlers can tell whether connect() succeeded.
    db = None
    try:
        db = pymysql.connect(
            host="localhost",
            user="root",
            password="password",
            database="article",
            charset="utf8",
        )
        print("資料庫連線成功!")
        cur = db.cursor()
        sql = 'INSERT INTO cvpr(title,ab,author,hotword,pdf,path) VALUE (%s,%s,%s,%s,%s,%s)'
        cur.execute(sql, value)
        db.commit()
        print("增加資料成功!")
    except pymysql.Error as e:
        print("增加資料失敗: " + str(e))
        # Only roll back when a connection actually exists; a failed
        # connect() leaves nothing to roll back.
        if db is not None:
            db.rollback()
    finally:
        if db is not None:
            db.close()


# Main script
print("1")
url = "https://openaccess.thecvf.com/CVPR2020.py?day=2020-06-16"
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36"}
res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error page instead of silently scraping nothing.
res.raise_for_status()
res.encoding = "utf-8"
# First collect the relative URL of every paper listed on the index page.
web = PAPER_LINK_RE.findall(res.text)
print("2")
for href in web:
    try:
        each = "http://openaccess.thecvf.com/" + href
        print("3")
        print(each)
        page = requests.get(each, headers=headers, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
        # Set the encoding BEFORE reading .text so that both the regexes and
        # BeautifulSoup see the same, correctly decoded document.
        page.encoding = "utf-8"
        html = page.text
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        paper = BeautifulSoup(html, "html.parser")
        # Extract the details from the individual paper page.
        title = TITLE_RE.findall(html)  # title
        ab = ABSTRACT_RE.findall(html)  # abstract
        author = paper.find("div", {"id": "authors"}).find("b").find("i").text  # authors
        pdf = PDF_RE.findall(html)  # PDF download address
        path = each  # paper summary page
        if title:
            title = title[0].replace("\n", "")
            ab = ab[0].replace("\n", "")
            pdf = "http://openaccess.thecvf.com/" + pdf[0]
            print(title)
            print(author)
            value = (title, ab, author, "", pdf, path)
            insertCvpr(value)
    except Exception as e:
        # Best effort: report the failing page and continue with the next
        # one. Catching Exception (not a bare except) keeps Ctrl-C working.
        print("異常: " + str(e))
2.資料庫
遇到的問題:
注意:資料表欄位若宣告為 varchar(255),摘要等較長的資料可能超出長度而無法寫入,建議改用 longtext 型別儲存。