1. 程式人生 > 其它 >cvpr頂會熱詞爬取

cvpr頂會熱詞爬取


# Scrape CVPR open-access paper metadata for one or more conference days and
# store each paper as a row in the MySQL table `cvpr`.
#
# Usage:
#     python this_script.py                        # default day (2020-06-18)
#     python this_script.py 2020-06-16 2020-06-17  # any number of days
import re
import sys
from datetime import datetime

import pymysql
import requests
from bs4 import BeautifulSoup  # noqa: F401  (imported by the original script; unused here)

BASE_URL = 'https://openaccess.thecvf.com/CVPR2020'
DEFAULT_DAYS = ('2020-06-18',)
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever

# One match per paper entry on the listing page. re.S lets `.*?` span the
# line breaks between the title, the pdf link and the bibtex fields.
PAPER_PATTERN = re.compile(
    r'<dt class="ptitle"><br>.*?.html">(?P<name>.*?)</a></dt>.*?'
    r'\[<a href="(?P<pdf>.*?)">pdf</a>].*?'
    r'author = {(?P<author>.*?)},<br>.*?'
    r'title = {(?P<title>.*?)},<br>.*?'
    r'booktitle = {(?P<booktitle>.*?)},<br>',
    re.S,
)

INSERT_SQL = ('INSERT INTO cvpr(`name`, pdf, author, title, booktitle, `date`) '
              'values(%s,%s,%s,%s,%s,%s)')


def _day_to_int(day: str) -> int:
    """Convert a 'YYYY-MM-DD' day string to the integer form YYYYMMDD.

    Raises:
        ValueError: if *day* is not a valid 'YYYY-MM-DD' date.
    """
    return int(datetime.strptime(day, '%Y-%m-%d').strftime('%Y%m%d'))


def _fetch_day_html(day: str) -> str:
    """Download the listing page for *day* and return its HTML text.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    with requests.get(BASE_URL, params={'day': day},
                      timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()
        return response.text


def _parse_papers(html: str, date: int) -> list:
    """Extract one row per paper found in *html*.

    Each row is a list ordered like the columns of INSERT_SQL:
    [name, pdf, author, title, booktitle, date].
    """
    return [
        [m.group('name'), m.group('pdf'), m.group('author'),
         m.group('title'), m.group('booktitle'), date]
        for m in PAPER_PATTERN.finditer(html)
    ]


def _save_papers(rows: list) -> int:
    """Insert *rows* into the `cvpr` table and return how many succeeded.

    Each row is committed on its own, so one bad row does not discard the
    others; a failing row is reported with print() and skipped.
    """
    # Connect to the database.
    conn = pymysql.connect(host='localhost', user='root', password='123456',
                           database='exercise', charset='utf8', port=3306)
    inserted = 0
    try:
        # The cursor is closed by the `with` block, the connection by
        # `finally`, even if something unexpected is raised mid-loop.
        with conn.cursor() as cursor:
            for row in rows:
                try:
                    cursor.execute(INSERT_SQL, row)
                    conn.commit()
                except pymysql.MySQLError as e:
                    conn.rollback()
                    print(e)
                else:
                    inserted += 1
    finally:
        conn.close()
    return inserted


def main(days=DEFAULT_DAYS) -> None:
    """Scrape every day in *days* ('YYYY-MM-DD' strings) into the database."""
    rows = []
    for day in days:
        rows.extend(_parse_papers(_fetch_day_html(day), _day_to_int(day)))
    _save_papers(rows)
    print('over!!!')


if __name__ == '__main__':
    # Days given on the command line override the built-in default.
    main(sys.argv[1:] or DEFAULT_DAYS)

  不足的一點:你需要手動更換網址來爬取不同日期的論文資訊。

  相關資料庫結構也貼在這裡: