1. 程式人生 > >python 批量下載知網(CNKI)論文

python 批量下載知網(CNKI)論文

1、目的:
朋友找我去知網幫他下載點相關論文,發現老是要去點選文章、點選下載,顯得很麻煩。百度一下,別人的方法太複雜,所以自己寫了一個python指令碼自動下載知網論文。
2、前期準備
1)安裝python 2.7
2)安裝 selenium

pip install selenium

3)下載一個chromedriver.exe,放到指令碼同一個資料夾內
4)安裝chrome瀏覽器
3、直接擼程式碼
(此處原為執行效果示意圖,圖片未能顯示)
(a)指定關鍵字下載知網論文

downloadCNKI.py
#!/usr/bin/env Python
# coding=utf-8
import  os
from
time import sleep from selenium import webdriver def browser_init(isWait): options = webdriver.ChromeOptions() prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': 'E:\\PycharmProjects\\downloadCNKI\\output'} options.add_experimental_option('prefs', prefs) browser = webdriver.Chrome(executable_path='chromedriver.exe'
, chrome_options=options) browser.set_window_size(500,500) if isWait: browser.implicitly_wait(50) return browser def searchKey(keyword): browser.get("http://kns.cnki.net/kns/brief/default_result.aspx") browser.find_element_by_id('txt_1_value1').send_keys(keyword) browser.find_element_by_id('btnSearch'
).click() def switchToFrame(browser): #print 'start switch' browser.switch_to.frame('iframeResult') #print 'end switch' def getDownloadLinks(browser,paper_downloadLinks): for link in browser.find_elements_by_css_selector('a[href^=\/kns\/detail]'): #link.click() url=link.get_attribute('href') url_part = url.split('&')[3:6] url_str= '&'.join(url_part) down_url='http://kns.cnki.net/KCMS/detail/detail.aspx?'+url_str #print down_url paper_downloadLinks.append(down_url) def switchToPage(browser,n): for link in browser.find_elements_by_css_selector('a[href^=\?curpage]'): url=link.get_attribute('href') print url pageInd='curpage=%d&'%n print pageInd if pageInd in url: print "page: "+url link.click() break def switchNextPage(browser): browser.find_element_by_link_text(u'下一頁').click() def do_download(driver,urls,fail_downLoadUrl): for url in urls: print url driver.get(url) paper_title=driver.title print "paper title"+paper_title if u'中國專利全文資料庫' in paper_title: continue print "try download :"+paper_title try: driver.find_element_by_xpath("//a[contains(text(),'PDF下載')]").click() print "download success!!!" except Exception as e: try: driver.find_element_by_xpath("//a[contains(text(),'整本下載')]").click() print "download success!!!" except Exception as e: print "download fail!!!" 
fail_downLoadUrl.append(url) def usage(): print "example : python downloadCNKI.py -k keyword -p 1" if __name__=="__main__": keyword=u'三角形' #論文搜尋的關鍵字 pageNum = 1 # 下載多少頁的論文 browser=browser_init(True) searchKey(keyword) switchToFrame(browser) paper_downloadLinks = [] #論文下載連結 curPage=1 while curPage<=pageNum: getDownloadLinks(browser,paper_downloadLinks) switchNextPage(browser); curPage+=1 browser.quit() print "採集了%d條資料"% len(paper_downloadLinks) driver=browser_init(False) fail_downLoadUrl=[] #記錄下失敗的網站 do_download(driver,paper_downloadLinks,fail_downLoadUrl) print fail_downLoadUrl tryNum=0 #嘗試N次重新下載沒有下載的 while tryNum<5: if len(fail_downLoadUrl) !=0: paper_downloadLinks=fail_downLoadUrl fail_downLoadUrl=[] do_download(driver, paper_downloadLinks, fail_downLoadUrl) print fail_downLoadUrl else: break tryNum+=1 sleep(60) driver.quit()

(b)指定論文題目下載知網論文
這個需要和指令碼同目錄下新建一個downfile.txt,按行存放需要下載題目

指定題目到downfile.txt的知網下載.py
#!/usr/bin/env Python
# coding=utf-8
import  os
from time import sleep
from selenium import webdriver

def browser_init(isWait):
    """Build a Chrome WebDriver whose downloads go straight to disk.

    isWait -- when True, enable a 50-second implicit wait on lookups.
    """
    chrome_opts = webdriver.ChromeOptions()
    # Disable the download popup and pin the download directory.
    chrome_opts.add_experimental_option('prefs', {
        'profile.default_content_settings.popups': 0,
        'download.default_directory': 'E:\\PycharmProjects\\downloadCNKI\\output',
    })
    drv = webdriver.Chrome(executable_path='chromedriver.exe',
                           chrome_options=chrome_opts)
    drv.set_window_size(500, 500)
    if isWait:
        drv.implicitly_wait(50)
    return drv

def searchKey(keyword, driver=None):
    """Open the CNKI search page and submit *keyword*.

    keyword -- unicode search term typed into the query box.
    driver  -- WebDriver to use; defaults to the module-level 'browser'
               created in __main__ (kept for backward compatibility with
               existing one-argument callers).
    """
    drv = driver if driver is not None else browser
    drv.get("http://kns.cnki.net/kns/brief/default_result.aspx")
    drv.find_element_by_id('txt_1_value1').send_keys(keyword)
    drv.find_element_by_id('btnSearch').click()

def switchToFrame(browser):
    """Switch the driver context into the search-result iframe."""
    result_frame = 'iframeResult'
    browser.switch_to.frame(result_frame)

def getDownloadLinks(browser, paper_downloadLinks):
    """Append the detail-page URL of every result row on the current page.

    browser             -- WebDriver already switched into the result iframe.
    paper_downloadLinks -- list mutated in place with the rebuilt URLs.
    """
    # Quoted attribute value replaces the original selector's invalid
    # '\/' Python string escapes; it matches the same links.
    for link in browser.find_elements_by_css_selector('a[href^="/kns/detail"]'):
        url = link.get_attribute('href')
        # Keep query parts 3..5 (the record-identifying parameters) and
        # rebuild the canonical detail-page URL from them.
        url_part = url.split('&')[3:6]
        url_str = '&'.join(url_part)
        down_url = 'http://kns.cnki.net/KCMS/detail/detail.aspx?' + url_str
        paper_downloadLinks.append(down_url)

def getKeywordDownloadLink(browser, keyword, paper_downloadLinks):
    """Locate the result whose link text equals *keyword* and record its
    rebuilt detail-page URL in paper_downloadLinks (mutated in place)."""
    href = browser.find_element_by_link_text(keyword).get_attribute('href')
    # Rebuild the canonical detail URL from query parts 3..5.
    query = '&'.join(href.split('&')[3:6])
    paper_downloadLinks.append('http://kns.cnki.net/KCMS/detail/detail.aspx?' + query)


def switchToPage(browser,n):
    for link in browser.find_elements_by_css_selector('a[href^=\?curpage]'):
        url=link.get_attribute('href')
        print url
        pageInd='curpage=%d&'%n
        print pageInd
        if pageInd in url:
            print "page: "+url
            link.click()
            break
def switchNextPage(browser):
    """Advance the result list to the next page via the pager link."""
    next_link = browser.find_element_by_link_text(u'下一頁')
    next_link.click()

def do_download(driver,urls,fail_downLoadUrl):
    for url in urls:
        print url
        driver.get(url)
        paper_title=driver.title
        print "paper title"+paper_title
        if u'資料庫' in paper_title:
            continue
        print "try  download :"+paper_title
        try:
            driver.find_element_by_xpath("//a[contains(text(),'PDF下載')]").click()
            print "download success!!!"
        except Exception as e:
            try:
                driver.find_element_by_xpath("//a[contains(text(),'整本下載')]").click()
                print "download success!!!"
            except Exception as e:
                print "download fail!!!"
                fail_downLoadUrl.append(url)

def usage():
    print "example : python downloadCNKI.py -k keyword  -p 1"

if __name__=="__main__":

    paper_downloadLinks = []  # 論文下載連結
    pageNum = 1  # 下載多少頁的論文
    browser = browser_init(True)

    file = open("downfile.txt")
    lineDatas = file.readlines();
    for line in lineDatas:
        keyword=line.strip('\n').decode('gbk')
        #keyword=u'三角形'      #論文搜尋的關鍵字
        print u"採集: %s"% keyword
        searchKey(keyword)
        switchToFrame(browser)
        downloadLinks=[]
        getKeywordDownloadLink(browser,keyword,downloadLinks)

        paper_downloadLinks.append(''.join(downloadLinks))
    file.close()
    browser.quit()


    print "採集了%d條資料"% len(paper_downloadLinks)

    driver=browser_init(False)
    fail_downLoadUrl=[]         #記錄下失敗的網站
    do_download(driver,paper_downloadLinks,fail_downLoadUrl)
    print  fail_downLoadUrl
    tryNum=0
    #嘗試N次重新下載沒有下載的
    while tryNum<5:
        if len(fail_downLoadUrl) !=0:
            paper_downloadLinks=fail_downLoadUrl
            fail_downLoadUrl=[]
            do_download(driver, paper_downloadLinks, fail_downLoadUrl)
            print "重新下載 ",
            print  fail_downLoadUrl
        else:
            break
        tryNum+=1
    sleep(60)
    driver.quit()

很好用,讓我幫助同學下載知網論文,媽媽再也不要擔心我點錯了。。。