1. 程式人生 > 其它 >Python selenium 爬取cnvd(國家資訊保安漏洞共享平臺)

Python selenium 爬取cnvd(國家資訊保安漏洞共享平臺)

# -*- coding: utf-8 -*-
#@author :今夕
#@Time :2021.08.06 16:09
#@file :mian.py
#@software :PyCharm
import time
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import pymysql
import random
# CNVD category ids to crawl; each one is sent as the ``typeId`` query
# parameter of the listing URL built in main().
list = [
    27, 28, 29, 30, 31,
    32, 33, 34, 35, 38,
]
# Highest ``offset`` requested for the category at the same index in ``list``
# (main() pages from 0 up to and including this value in steps of 100).
page = [
    13700, 79200, 30800, 2500, 9400,
    2600, 1000, 300, 0, 100,
]
# Script injected into every new document so that ``navigator.webdriver``
# reads as ``undefined`` instead of revealing the automated browser.
_HIDE_WEBDRIVER_JS = """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""


def main():
    """Crawl every configured CNVD category page by page and store the rows.

    Pairs each category id in the module-level ``list`` with its maximum
    offset in ``page``, loads the listing URL for every offset from 0 up to
    and including that maximum in steps of 100, parses the rendered HTML with
    ``parseHtml`` and hands the result to ``save``.

    The browser session is always shut down, even when a page fails.
    """
    driver = webdriver.Chrome()
    try:
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": _HIDE_WEBDRIVER_JS},
        )
        # Iterating the pairs directly replaces the original index loop, which
        # never advanced its counter and therefore re-crawled the first
        # category forever.
        for type_id, last_offset in zip(list, page):
            for offset in range(0, last_offset + 1, 100):
                url = "https://www.cnvd.org.cn/flaw/typeResult?typeId=%d&max=100&offset=%d" % (type_id, offset)
                driver.get(url)
                driver.refresh()  # reload the page before reading its source
                html = driver.page_source
                datalist = parseHtml(html, type_id)
                save(datalist)
                # NOTE(review): second reload kept from the original code;
                # its purpose is not evident from this file - confirm.
                driver.refresh()
                # Reports the offset of the *next* page, as the original did.
                print("當前i=%d,leng=%d" % (type_id, offset + 100))
    finally:
        # quit() ends the whole WebDriver session, not just the open window.
        driver.quit()
# Category label stored with each row, keyed by the CNVD ``typeId``.
_TYPE_NAMES = {
    27: "作業系統",
    28: "應用程式",
    29: "WEB應用",
    30: "資料庫",
    31: "網路裝置(交換機,路由器等網路終端裝置)",
    32: "安全產品",
    33: "智慧裝置(物聯網終端裝置)",
    34: "區塊鏈公鏈",
    35: "區塊鏈聯盟鏈",
    38: "工業控制系統",
}
# Severity level: the text between a closing </span> and the end of its cell.
_FIND_LEVEL = re.compile(r'</span>(.*?)</td>', re.S)
# Date: the content of the cell declared with width="13%".
_FIND_DATE = re.compile(r'<td width="13%">(.*?)</td>', re.S)


def parseHtml(html, type):
    """Extract the vulnerability rows from one CNVD listing page.

    Args:
        html: Page source of a listing page.
        type: Numeric category id of the page. Known ids are replaced by
            their label from ``_TYPE_NAMES``; any other value is stored
            unchanged.

    Returns:
        A list with one ``[title, level, date, link, type]`` list per table
        row. ``level`` and ``date`` are the raw regex captures (whitespace is
        not trimmed here). Rows that lack a link, a title, or a level/date
        cell are skipped instead of raising.
    """
    soup = BeautifulSoup(html, "html.parser")
    type_name = _TYPE_NAMES.get(type, type)

    datlist = []
    # The first <tr> is skipped, as in the original loop that started at
    # index 1 (presumably the table header - confirm against the live page).
    for row in soup.find_all('tr')[1:]:
        anchor = row.a
        if anchor is None:
            continue  # no link in this row, so it is not a vulnerability entry
        href = anchor.get("href")
        title = anchor.get("title")
        if not href or title is None:
            continue

        markup = str(row)
        level_match = _FIND_LEVEL.search(markup)
        date_match = _FIND_DATE.search(markup)
        if level_match is None or date_match is None:
            continue

        datlist.append([
            title,
            level_match.group(1),
            date_match.group(1),
            "https://www.cnvd.org.cn" + href,  # absolute detail-page link
            type_name,
        ])
    return datlist
def save(datlist):
    """Insert one page of scraped rows into the ``cnvd`` table, then pause.

    Args:
        datlist: Iterable of ``[title, level, date, link, type]`` sequences.
            Spaces, newlines and tabs are removed from ``level`` before it is
            stored.

    All rows of the page are committed together; if any insert fails nothing
    from this page is committed and the connection is still closed. After the
    database work the function sleeps for a random 1-6 seconds.
    """
    if datlist:
        strip_ws = str.maketrans("", "", " \n\t")
        # Placeholders let the driver quote the scraped text. Building the
        # statement by concatenation broke on titles containing a quote and
        # allowed SQL injection from page content.
        sql = "insert into cnvd (title,level,date,link,type) values (%s,%s,%s,%s,%s)"
        # NOTE(review): connection credentials are hard-coded here; consider
        # moving them to configuration.
        db = pymysql.connect(user='root', password='123456', host='localhost', database='cnvd')
        try:
            with db.cursor() as cursor:
                for dat in datlist:
                    params = (dat[0], dat[1].translate(strip_ws), dat[2], dat[3], dat[4])
                    # Same progress line the original printed for each row.
                    print(",".join("'%s'" % value for value in params))
                    cursor.execute(sql, params)
            db.commit()  # single transaction per page
        finally:
            db.close()

    t = random.randint(1, 6)  # random delay between pages
    print("等待%d秒" % t)
    time.sleep(t)

#輸入selenium 搜尋
# element = driver.find_element_by_name('a')
# ActionChains(driver).move_to_element(element).perform()
#單擊,彈出的Ajax元素,根據連結節點的文字內容查詢

# Script entry point: run the crawl, then report that it has finished.
if __name__ == "__main__":
    main()
    print("爬取完成")