
[Web Scraping] Scraping the news site of Huaian College of Information Technology (蘇電院) with Python

This script crawls all of the news content on the Huaian College of Information Technology site, across all of its news sections. It prints the URL of every news detail page and every file-download URL it finds, and it also includes a (commented-out) file-download routine that you can adapt yourself; for now only the DOC/XLS/PDF attachments discovered during the crawl are supported (see the download sketch after the script).

The script is a skeleton: it solves the URL-discovery part of the crawl, and you can adjust the code to fit your actual needs. To make it easy to store results in a database, the news_*[] lists are reserved for database writes; a sketch of that step follows this paragraph. Everyone is welcome to extend the script, and feel free to leave a link to your GitHub repository in the comments.
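For instance, here is a minimal sketch of that database step, assuming a local SQLite file named hcit_news.db and a hypothetical news table; neither is part of the original script:

import sqlite3

def save_news(news_link, news_title, news_time, news_text):
    # Hypothetical schema; adjust the columns to your needs
    conn = sqlite3.connect("hcit_news.db")
    conn.execute("""CREATE TABLE IF NOT EXISTS news (
                        link  TEXT PRIMARY KEY,
                        title TEXT,
                        time  TEXT,
                        body  TEXT)""")
    rows = zip(news_link, news_title, news_time,
               (str(t) for t in news_text))  # each body is a bs4 Tag; store it as HTML
    conn.executemany("INSERT OR IGNORE INTO news VALUES (?, ?, ?, ?)", rows)
    conn.commit()
    conn.close()

Calling save_news(*reptiles(responses, addr)) after each list page would persist the four lists that reptiles() returns.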

When iterating over the list-page URLs, the author uses a somewhat contrived method to handle hcit's reverse-numbered news pages (N.htm); if you have a cleaner approach, I would be happy to discuss it. One alternative is sketched below.
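The sketch assumes the section index page links to its numbered siblings as plain "N.htm" hrefs (the pattern is a guess, not verified against the site); it collects the page numbers from the index instead of probing URLs until one fails:

import re
import requests

def list_page_numbers(addr):
    # Assumes numbered list pages are linked as href="N.htm" on the index page;
    # adjust the pattern if the markup differs.
    res = requests.get(addr + ".htm")
    res.encoding = "utf-8"
    return sorted({int(n) for n in re.findall(r'href="(\d+)\.htm"', res.text)})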

#!/usr/bin/env python3

import requests
from bs4 import BeautifulSoup


def reptiles(responses,addr):
    soup = BeautifulSoup(responses.text,"html.parser")
    k = 0
    news_link   = []   # reserved for database writes (see the note above)
    news_title  = []
    news_time   = []
    news_text   = []
    while True:
        # List items on each page carry the ids line_u8_0, line_u8_1, ...
        soup_li = soup.find(id="line_u8_%d" % k)
        if soup_li is None:  # no more items on this page
            break
        # Title; the leading characters are list markup, so strip them off
        title = soup_li.a.get_text()
        title = title[10:]
        news_title.append(title)
        # Publication date
        time = soup_li.span.get_text()
        news_time.append(time)
        # Detail page: resolve the relative link against the section base
        link = soup_li.a['href']
        news_link.append(link)
        newsText_url = addr + link
        res = requests.get(newsText_url)
        res.encoding = "utf-8"
        print("[detail page] %s  [%d]" % (newsText_url, res.status_code))
        soup_newsText = BeautifulSoup(res.text, "html.parser")
        newsText = soup_newsText.find(id="vsb_content")
        news_text.append(newsText)
        # Find an attachment link inside the article body (if any) and download it
        try:
            newsText_a = newsText.a['href']
            newsText_name = newsText.a.string
            newsText_link = "http://www.hcit.edu.cn" + newsText_a
            print("[attachment] " + newsText_link)
            # res_link = requests.get(newsText_link)
            # with open(newsText_name, 'wb') as f:
            #     f.write(res_link.content)
        except (AttributeError, TypeError, KeyError):
            print("[no attachment]")
        
        k += 1
    return news_link, news_title, news_time, news_text


def main():
    url_addr = [
        # "http://www.hcit.edu.cn/sdxw/xyyw",
        # "http://www.hcit.edu.cn/sdxw/ybdt",
        # "http://www.hcit.edu.cn/sdxw/mtjj",
        "http://www.hcit.edu.cn/sdxw/ggtz"
    ]

    for addr in url_addr:
        page = 1
        while True:
            addr_url = addr + "/" + str(page) + ".htm"
            # print(addr_url)
            responses = requests.get(addr_url)
            if responses.status_code != 200:
                # Page N does not exist; the section index (addr.htm) carries the
                # remaining entries, and its links resolve one level up ("/../").
                res_index = requests.get(addr + ".htm")
                res_index.encoding = "utf-8"
                print("[list page] %s [%s]" % (addr + ".htm", res_index.status_code))
                reptiles(res_index, addr + "/../")
                print("+++++++++++++++++++++++++[None]++++++++++++++++++++++++++++")
                break
            responses.encoding = "utf-8"
            # Confirm the request succeeded before parsing
            print("[list page] %s [%s]" % (addr_url, responses.status_code))
            reptiles(responses,addr + "/")
            page += 1
            print("=============================================")

if __name__ == "__main__":
    main()
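The download lines inside reptiles() are left commented out. If you enable them, a slightly more defensive variant might look like the sketch below: it streams the response to disk and enforces the DOC/XLS/PDF restriction mentioned at the top. The extension whitelist and the filename fallback are assumptions, not part of the original script:

import os
import requests

ALLOWED = (".doc", ".docx", ".xls", ".xlsx", ".pdf")  # assumed whitelist

def download_attachment(url, name):
    # Skip anything outside the supported extensions
    if not url.lower().endswith(ALLOWED):
        print("[skipped] " + url)
        return
    # Fall back to the URL's last path segment when no link text is available
    filename = name or os.path.basename(url)
    res = requests.get(url, stream=True, timeout=30)
    res.raise_for_status()
    with open(filename, "wb") as f:
        for chunk in res.iter_content(chunk_size=8192):
            f.write(chunk)
    print("[downloaded] " + filename)

Inside the try block you would then call download_attachment(newsText_link, newsText_name) in place of the commented-out lines.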