記錄python爬取某程式設計網站內容

阿新 • • 發佈：2022-04-09

記錄一下python爬取某程式設計網站

程式碼：

註釋寫得太多反而覺得很混亂。

本人是新手，一時興起，想藉由遞迴下載網頁內容來學習一下 Python 的常用操作。

以下程式碼不包含對圖片之類的處理。

下載完成以後，把檔案放在 nginx 下執行，頁面裡的程式碼卻怎麼也無法像原網站一樣語法上色。用 Fiddler 分析後，才發現少下載了一個 js 檔案。奈何本人對 js 不熟，找不到載入那個 js 的地方，只好自己手動下載了：

http://localhost/templets/new/script/jquery.snippet.js

（上面 URL 中的 localhost 只是本機測試位址；實際下載時，域名要換成你所爬取網站的域名。）

#!/usr/bin/python
# -*- coding:utf-8 -*-

import os
import posixpath
import re
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


# Matches url(...) references in CSS, with or without quotes around the target.
_CSS_URL_RE = re.compile(r"""url\(\s*['"]?([^'")\s]+)['"]?\s*\)""", re.IGNORECASE)
# References that are not files on the mirrored host (inline data, other hosts, fragments).
_CSS_SKIP_PREFIXES = ('data:', 'http://', 'https://', '//', '#')


def _css_asset_paths(css_text, css_path):
    """Return the site-absolute paths referenced through ``url(...)`` in a stylesheet.

    Each reference is resolved relative to the directory of ``css_path``; query
    strings and fragments are dropped. Duplicates and the stylesheet itself are
    omitted so one asset is not requested repeatedly.
    """
    base_dir = posixpath.dirname(css_path)
    assets = []
    for ref in _CSS_URL_RE.findall(css_text):
        if ref.lower().startswith(_CSS_SKIP_PREFIXES):
            continue
        ref = ref.split('#', 1)[0].split('?', 1)[0]
        if not ref:
            continue
        resolved = posixpath.normpath(posixpath.join(base_dir, ref))
        if resolved != css_path and resolved not in assets:
            assets.append(resolved)
    return assets


def createDir(path,host,data=''):
    """Save one site resource under the current working directory.

    The directory part of ``path`` is recreated relative to the current working
    directory and the resource is written there in binary mode. When the saved
    file is a stylesheet, every asset it references through ``url(...)`` is
    downloaded the same way.

    Args:
        path: Site-absolute URL path of the resource (starting with '/'); it may
            carry a query string, which is used for the request but removed
            from the saved file name.
        host: Scheme and host prefix, e.g. ``'http://example.com'``; the
            resource is requested from ``host + path``.
        data: Content to write. The default empty string means "download it
            from ``host + path``". ``bytes`` are written as-is; a non-empty
            ``str`` is encoded as UTF-8.

    Returns:
        None. Paths that cannot be mapped to a file are skipped: no leading
        '/', a single path segment, an empty file name, or a file name
        containing '&'.
    """
    segments = path.split('/')
    segments.pop(0)  # drop whatever precedes the first '/'
    if not segments:
        # No '/' at all (e.g. a relative link): there is no directory layout to mirror.
        print("skip:", path)
        return
    if len(segments) == 1:
        print("only one")   # not seen on the target site; other sites may need handling here
        return
    filename = segments.pop()
    if '&' in filename:  # site-specific: dynamic "xxx.php&..." links are not mirrored
        return
    # A resource such as hello.js?v=1234 must be saved without the query string.
    filename = filename.split('?', 1)[0]
    if not filename:
        # The path ends with '/', so there is no file name to write to.
        return

    # Empty segments and segments containing '.' are left out of the directory chain.
    dir_names = [seg for seg in segments if seg and '.' not in seg]
    target_dir = os.path.join(*dir_names) if dir_names else ''
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    if isinstance(data, str):
        if data == '':
            # Download before opening the file so a failed request leaves no empty file.
            data = requests.get(host + path, timeout=30).content
        else:
            data = data.encode('utf-8')

    # Binary mode keeps the payload byte-for-byte (text mode altered line endings).
    with open(os.path.join(target_dir, filename), 'wb') as out:
        out.write(data)

    if filename.lower().endswith('.css'):
        # Stylesheets pull in fonts/images through url(...); fetch those as well.
        print(path)
        css_text = data.decode('utf-8', errors='replace')
        for asset in _css_asset_paths(css_text, path.split('?', 1)[0]):
            print(asset)
            createDir(asset, host)
def _same_site_path(page_url, ref):
    """Resolve ``ref`` against ``page_url``; return its path (plus query) or None.

    None is returned when the reference is not an http(s) URL on the same
    network location as ``page_url`` or has no path, so that only resources of
    the mirrored site are requested.
    """
    page = urlparse(page_url)
    target = urlparse(urljoin(page_url, ref))
    if target.scheme not in ('http', 'https') or target.netloc != page.netloc:
        return None
    if not target.path:
        return None
    return target.path + ('?' + target.query if target.query else '')


def copyWeb(url):
    """Mirror one page and the resources it references into ``./<hostname>/``.

    The page itself is saved first, followed by every same-site ``<link href>``
    and ``<script src>`` resource, and finally every same-site ``<a href>``
    containing 'html' found inside ``<div id="contents-detail">`` (the element
    id was determined by inspecting the target site's markup).

    Args:
        url: Absolute http(s) URL of the page to start from.

    Returns:
        None. Files are written below a directory named after the URL's host
        name, created in the current working directory. The working directory
        is restored before returning.
    """
    response = requests.get(url, timeout=30)
    page = urlparse(url)
    site_dir = page.hostname
    # Use netloc rather than hostname so a non-default port is kept in requests.
    head = page.scheme + "://" + page.netloc
    html = response.content
    soup = BeautifulSoup(html, "html.parser")

    os.makedirs(site_dir, exist_ok=True)  # site root directory
    start_dir = os.getcwd()
    os.chdir(site_dir)  # createDir writes relative to the current directory
    try:
        # Only this call supplies the content; every other resource is downloaded by createDir.
        createDir(page.path, head, html)

        # Stylesheets and other <link> resources, then external scripts.
        refs = [tag['href'] for tag in soup.find_all('link', href=True)]
        refs += [tag['src'] for tag in soup.find_all('script', attrs={'src': True})]

        # Chapter pages linked from the table-of-contents block.
        for block in soup.find_all('div', attrs={'id': 'contents-detail'}):
            for anchor in block.find_all('a', href=True):
                if 'html' in str(anchor['href']):   # skip entries that are not chapter pages
                    refs.append(anchor['href'])

        for ref in refs:
            site_path = _same_site_path(url, ref)
            if site_path is not None:
                createDir(site_path, head)
    finally:
        os.chdir(start_dir)
    return

# Start page of the tutorial to mirror.
# NOTE(review): the 'xxx' host looks like a placeholder - confirm the real address before running.
url = 'http://xxx.biancheng.net/csharp/index.html'

if __name__ == "__main__":
    # Crawl only when executed as a script, so importing this module triggers no network access.
    copyWeb(url)