python 批量下載知網(CNKI)論文
阿新 • • 發佈:2019-02-12
1、目的:
朋友找我去知網幫他下載點相關論文,發現老是要去點選文章、點選下載,顯得很麻煩。百度一下,別人的方法太複雜,所以自己寫了一個python指令碼自動下載知網論文。
2、前期準備
1)安裝python 2.7
2)安裝 selenium
pip install selenium
3)下載一個chromedriver.exe,放到指令碼同一個資料夾內
4)安裝chrome瀏覽器
3、直接擼程式碼
(a)指定關鍵字下載知網論文
downloadCNKI.py
#!/usr/bin/env python
# coding=utf-8
import os
from time import sleep
from selenium import webdriver
def browser_init(isWait, download_dir='E:\\PycharmProjects\\downloadCNKI\\output'):
    """Start a Chrome session whose downloads go to ``download_dir``.

    Args:
        isWait: When truthy, a 50-second implicit wait is set on the session.
        download_dir: Value passed to Chrome as ``download.default_directory``.
            Defaults to the directory that used to be hard-coded here, so
            existing one-argument calls behave exactly as before.

    Returns:
        The ``webdriver.Chrome`` instance that was created.
    """
    options = webdriver.ChromeOptions()
    prefs = {
        'profile.default_content_settings.popups': 0,
        'download.default_directory': download_dir,
    }
    options.add_experimental_option('prefs', prefs)
    # Relative path: chromedriver.exe has to be reachable from the working
    # directory the script is started in.
    browser = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=options)
    browser.set_window_size(500, 500)
    if isWait:
        browser.implicitly_wait(50)
    return browser
def searchKey(keyword, driver=None):
    """Open the CNKI search page and submit a query for ``keyword``.

    Args:
        keyword: Text typed into the ``txt_1_value1`` input.
        driver: WebDriver session to drive. When omitted, the module-level
            ``browser`` is used, which is what this function always relied on
            before; that global only exists once the ``__main__`` section has
            created it.
    """
    if driver is None:
        # Backward-compatible path for callers that pass only the keyword.
        driver = browser
    driver.get("http://kns.cnki.net/kns/brief/default_result.aspx")
    driver.find_element_by_id('txt_1_value1').send_keys(keyword)
    driver.find_element_by_id('btnSearch').click()
def switchToFrame(browser):
    """Switch the driver's context into the frame named ``iframeResult``.

    Args:
        browser: WebDriver session whose context is switched.
    """
    result_frame = 'iframeResult'
    browser.switch_to.frame(result_frame)
def getDownloadLinks(browser, paper_downloadLinks):
    """Append one detail-page URL per matching result link.

    Every anchor whose ``href`` starts with ``/kns/detail`` is read; the 4th
    to 6th ``&``-separated pieces of that href are re-joined and appended to
    the fixed KCMS detail-page prefix.

    Args:
        browser: WebDriver session; links are looked up in its current context.
        paper_downloadLinks: List that receives the URLs (mutated in place).
    """
    detail_prefix = 'http://kns.cnki.net/KCMS/detail/detail.aspx?'
    anchors = browser.find_elements_by_css_selector(r'a[href^=\/kns\/detail]')
    paper_downloadLinks.extend(
        # Positional slice: depends on the order of the query parameters in
        # the result link.
        detail_prefix + '&'.join(anchor.get_attribute('href').split('&')[3:6])
        for anchor in anchors
    )
def switchToPage(browser, n):
    """Click the first pager link whose href contains ``curpage=<n>&``.

    Each candidate href and the wanted marker are printed while scanning.
    Nothing is clicked when no link matches.

    Args:
        browser: WebDriver session; pager links are looked up in its current
            context.
        n: Page number, formatted with ``%d``.
    """
    for anchor in browser.find_elements_by_css_selector(r'a[href^=\?curpage]'):
        href = anchor.get_attribute('href')
        print(href)
        wanted = 'curpage=%d&' % n
        print(wanted)
        if wanted not in href:
            continue
        print("page: " + href)
        anchor.click()
        return
def switchNextPage(browser):
    """Click the "next page" link in the driver's current context.

    NOTE(review): the label literal in this file is Traditional Chinese and
    looks like it was converted when the script was republished;
    kns.cnki.net presumably renders the Simplified form. Both spellings are
    matched so the lookup works either way - confirm against the live page.

    Args:
        browser: WebDriver session positioned on a result page.
    """
    next_page_xpath = (
        u"//a[normalize-space(.)='下一頁'"
        u" or normalize-space(.)='下一页']"
    )
    browser.find_element_by_xpath(next_page_xpath).click()
def do_download(driver, urls, fail_downLoadUrl):
    """Visit each detail page and click its download link.

    For every URL the page is opened and its title printed. Pages whose title
    carries the patent-database marker are skipped. Otherwise the "PDF
    download" link is tried first and the "whole-volume download" link
    second; if neither click succeeds the URL is recorded as failed.
    "download success!!!" only means the click did not raise.

    NOTE(review): the Chinese literals in this file are Traditional Chinese
    and look like they were converted when the script was republished;
    kns.cnki.net presumably serves Simplified text. Both spellings are
    accepted here - confirm against the live page.

    Args:
        driver: WebDriver session used for the downloads.
        urls: Iterable of detail-page URLs.
        fail_downLoadUrl: List that receives every URL whose download could
            not be started (mutated in place).
    """
    patent_markers = (u'中國專利全文資料庫', u'中国专利全文数据库')
    link_xpaths = (
        u"//a[contains(text(),'PDF下載') or contains(text(),'PDF下载')]",
        u"//a[contains(text(),'整本下載') or contains(text(),'整本下载')]",
    )
    for url in urls:
        print(url)
        driver.get(url)
        paper_title = driver.title
        print("paper title" + paper_title)
        if any(marker in paper_title for marker in patent_markers):
            continue
        print("try download :" + paper_title)
        for xpath in link_xpaths:
            try:
                driver.find_element_by_xpath(xpath).click()
            except Exception:
                # Best effort, as before: any failure on this link type just
                # moves on to the next one.
                continue
            print("download success!!!")
            break
        else:
            print("download fail!!!")
            fail_downLoadUrl.append(url)
def usage():
    """Print a one-line example invocation.

    NOTE(review): the ``-k`` / ``-p`` flags shown in the message are not
    parsed anywhere in the visible script, and this function is not called.
    """
    example = "example : python downloadCNKI.py -k keyword -p 1"
    print(example)
if __name__ == "__main__":
    # Script entry point: collect detail-page URLs for a keyword search, then
    # download every paper, retrying failures up to five times.
    keyword = u'三角形'  # search keyword
    pageNum = 1  # number of result pages to collect
    paper_downloadLinks = []  # collected detail-page URLs

    # Phase 1: collect links. ``browser`` must stay a module-level name
    # because searchKey() reads it as a global.
    browser = browser_init(True)
    try:
        searchKey(keyword)
        switchToFrame(browser)
        for curPage in range(1, pageNum + 1):
            getDownloadLinks(browser, paper_downloadLinks)
            # Only page forward when another page is still wanted; clicking
            # "next" after the last page fails when there is no further page.
            if curPage < pageNum:
                switchNextPage(browser)
    finally:
        # Close Chrome even when the search or paging raised.
        browser.quit()
    print("採集了%d條資料" % len(paper_downloadLinks))

    # Phase 2: download, using a second session without implicit wait.
    driver = browser_init(False)
    try:
        fail_downLoadUrl = []  # URLs whose download could not be started
        do_download(driver, paper_downloadLinks, fail_downLoadUrl)
        print(fail_downLoadUrl)
        # Retry whatever failed, at most five rounds.
        tryNum = 0
        while tryNum < 5 and fail_downLoadUrl:
            paper_downloadLinks = fail_downLoadUrl
            fail_downLoadUrl = []
            do_download(driver, paper_downloadLinks, fail_downLoadUrl)
            print(fail_downLoadUrl)
            tryNum += 1
        # NOTE(review): original indentation was lost; this pause is assumed
        # to sit after the retry loop, presumably to let running downloads
        # finish before the browser is closed.
        sleep(60)
    finally:
        driver.quit()
(b)指定論文題目下載知網論文
這個需要在指令碼同目錄下新建一個downfile.txt,按行存放需要下載的論文題目(每行一個題目;檔案需以GBK編碼儲存,因為指令碼用gbk解碼每一行)
指定題目到downfile.txt的知網下載.py
#!/usr/bin/env python
# coding=utf-8
import os
from time import sleep
from selenium import webdriver
def browser_init(isWait, download_dir='E:\\PycharmProjects\\downloadCNKI\\output'):
    """Start a Chrome session whose downloads go to ``download_dir``.

    Args:
        isWait: When truthy, a 50-second implicit wait is set on the session.
        download_dir: Value passed to Chrome as ``download.default_directory``.
            Defaults to the directory that used to be hard-coded here, so
            existing one-argument calls behave exactly as before.

    Returns:
        The ``webdriver.Chrome`` instance that was created.
    """
    options = webdriver.ChromeOptions()
    prefs = {
        'profile.default_content_settings.popups': 0,
        'download.default_directory': download_dir,
    }
    options.add_experimental_option('prefs', prefs)
    # Relative path: chromedriver.exe has to be reachable from the working
    # directory the script is started in.
    browser = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=options)
    browser.set_window_size(500, 500)
    if isWait:
        browser.implicitly_wait(50)
    return browser
def searchKey(keyword, driver=None):
    """Open the CNKI search page and submit a query for ``keyword``.

    Args:
        keyword: Text typed into the ``txt_1_value1`` input.
        driver: WebDriver session to drive. When omitted, the module-level
            ``browser`` is used, which is what this function always relied on
            before; that global only exists once the ``__main__`` section has
            created it.
    """
    if driver is None:
        # Backward-compatible path for callers that pass only the keyword.
        driver = browser
    driver.get("http://kns.cnki.net/kns/brief/default_result.aspx")
    driver.find_element_by_id('txt_1_value1').send_keys(keyword)
    driver.find_element_by_id('btnSearch').click()
def switchToFrame(browser):
    """Switch the driver's context into the frame named ``iframeResult``.

    Args:
        browser: WebDriver session whose context is switched.
    """
    result_frame = 'iframeResult'
    browser.switch_to.frame(result_frame)
def getDownloadLinks(browser, paper_downloadLinks):
    """Append one detail-page URL per matching result link.

    Every anchor whose ``href`` starts with ``/kns/detail`` is read; the 4th
    to 6th ``&``-separated pieces of that href are re-joined and appended to
    the fixed KCMS detail-page prefix.

    Args:
        browser: WebDriver session; links are looked up in its current context.
        paper_downloadLinks: List that receives the URLs (mutated in place).
    """
    detail_prefix = 'http://kns.cnki.net/KCMS/detail/detail.aspx?'
    anchors = browser.find_elements_by_css_selector(r'a[href^=\/kns\/detail]')
    paper_downloadLinks.extend(
        # Positional slice: depends on the order of the query parameters in
        # the result link.
        detail_prefix + '&'.join(anchor.get_attribute('href').split('&')[3:6])
        for anchor in anchors
    )
def getKeywordDownloadLink(browser, keyword, paper_downloadLinks):
    """Append the detail-page URL of the link whose text equals ``keyword``.

    The link's href is split on ``&``; pieces 4 to 6 are re-joined and
    appended to the fixed KCMS detail-page prefix.

    Args:
        browser: WebDriver session; the link is looked up in its current
            context.
        keyword: Exact link text to look for.
        paper_downloadLinks: List that receives the URL (mutated in place).
    """
    href = browser.find_element_by_link_text(keyword).get_attribute('href')
    # Positional slice: depends on the order of the query parameters.
    query = '&'.join(href.split('&')[3:6])
    paper_downloadLinks.append('http://kns.cnki.net/KCMS/detail/detail.aspx?' + query)
def switchToPage(browser, n):
    """Click the first pager link whose href contains ``curpage=<n>&``.

    Each candidate href and the wanted marker are printed while scanning.
    Nothing is clicked when no link matches.

    Args:
        browser: WebDriver session; pager links are looked up in its current
            context.
        n: Page number, formatted with ``%d``.
    """
    for anchor in browser.find_elements_by_css_selector(r'a[href^=\?curpage]'):
        href = anchor.get_attribute('href')
        print(href)
        wanted = 'curpage=%d&' % n
        print(wanted)
        if wanted not in href:
            continue
        print("page: " + href)
        anchor.click()
        return
def switchNextPage(browser):
    """Click the "next page" link in the driver's current context.

    NOTE(review): the label literal in this file is Traditional Chinese and
    looks like it was converted when the script was republished;
    kns.cnki.net presumably renders the Simplified form. Both spellings are
    matched so the lookup works either way - confirm against the live page.

    Args:
        browser: WebDriver session positioned on a result page.
    """
    next_page_xpath = (
        u"//a[normalize-space(.)='下一頁'"
        u" or normalize-space(.)='下一页']"
    )
    browser.find_element_by_xpath(next_page_xpath).click()
def do_download(driver, urls, fail_downLoadUrl):
    """Visit each detail page and click its download link.

    For every URL the page is opened and its title printed. Pages whose title
    contains the "database" marker are skipped. Otherwise the "PDF download"
    link is tried first and the "whole-volume download" link second; if
    neither click succeeds the URL is recorded as failed.
    "download success!!!" only means the click did not raise.

    NOTE(review): the Chinese literals in this file are Traditional Chinese
    and look like they were converted when the script was republished;
    kns.cnki.net presumably serves Simplified text. Both spellings are
    accepted here - confirm against the live page.

    Args:
        driver: WebDriver session used for the downloads.
        urls: Iterable of detail-page URLs.
        fail_downLoadUrl: List that receives every URL whose download could
            not be started (mutated in place).
    """
    skip_markers = (u'資料庫', u'数据库')
    link_xpaths = (
        u"//a[contains(text(),'PDF下載') or contains(text(),'PDF下载')]",
        u"//a[contains(text(),'整本下載') or contains(text(),'整本下载')]",
    )
    for url in urls:
        print(url)
        driver.get(url)
        paper_title = driver.title
        print("paper title" + paper_title)
        if any(marker in paper_title for marker in skip_markers):
            continue
        print("try download :" + paper_title)
        for xpath in link_xpaths:
            try:
                driver.find_element_by_xpath(xpath).click()
            except Exception:
                # Best effort, as before: any failure on this link type just
                # moves on to the next one.
                continue
            print("download success!!!")
            break
        else:
            print("download fail!!!")
            fail_downLoadUrl.append(url)
def usage():
    """Print a one-line example invocation.

    NOTE(review): the ``-k`` / ``-p`` flags shown in the message are not
    parsed anywhere in the visible script, and this function is not called.
    """
    example = "example : python downloadCNKI.py -k keyword -p 1"
    print(example)
if __name__ == "__main__":
    # Script entry point: look up each paper title listed in downfile.txt,
    # collect its detail-page URL, then download every paper, retrying
    # failures up to five times.
    paper_downloadLinks = []  # collected detail-page URLs
    pageNum = 1  # not used by this script

    # Phase 1: collect links. ``browser`` must stay a module-level name
    # because searchKey() reads it as a global.
    browser = browser_init(True)
    try:
        # ``with`` closes the file even if a lookup raises; the handle no
        # longer shadows the ``file`` builtin.
        with open("downfile.txt") as title_file:
            for line in title_file:
                # One title per line, decoded as GBK.
                keyword = line.strip().decode('gbk')
                if not keyword:
                    # A blank line would otherwise trigger an empty search
                    # and a failed link lookup.
                    continue
                print(u"採集: %s" % keyword)
                searchKey(keyword)
                switchToFrame(browser)
                downloadLinks = []
                getKeywordDownloadLink(browser, keyword, downloadLinks)
                paper_downloadLinks.append(''.join(downloadLinks))
    finally:
        # Close Chrome even when a search or lookup raised.
        browser.quit()
    print("採集了%d條資料" % len(paper_downloadLinks))

    # Phase 2: download, using a second session without implicit wait.
    driver = browser_init(False)
    try:
        fail_downLoadUrl = []  # URLs whose download could not be started
        do_download(driver, paper_downloadLinks, fail_downLoadUrl)
        print(fail_downLoadUrl)
        # Retry whatever failed, at most five rounds.
        tryNum = 0
        while tryNum < 5 and fail_downLoadUrl:
            paper_downloadLinks = fail_downLoadUrl
            fail_downLoadUrl = []
            do_download(driver, paper_downloadLinks, fail_downLoadUrl)
            # Same output as the former two-statement print: label, two
            # spaces, then the list.
            print("重新下載  %s" % (fail_downLoadUrl,))
            tryNum += 1
        # NOTE(review): original indentation was lost; this pause is assumed
        # to sit after the retry loop, presumably to let running downloads
        # finish before the browser is closed.
        sleep(60)
    finally:
        driver.quit()
很好用,讓我幫助同學下載知網論文,媽媽再也不要擔心我點錯了。。。