
[2022.05.20] Scraping all announcements from a page without a CAPTCHA

What this covers

XPath, plus Python string replacement

Adaptive URL joining, because many sites publish incomplete hrefs

Using Selenium to scrape dynamically rendered pages
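The first two points can be shown with the standard library's `urllib.parse.urljoin` and `str.replace`; the base URL below is the one used in the config file later in the post:

```python
from urllib.parse import urljoin

base = "http://www.lswz.gov.cn/html/ywpd/lstk/tj-sgsj.shtml"

# A relative href is resolved against the current page's URL
print(urljoin(base, "tj-sgsj_2.shtml"))
# -> http://www.lswz.gov.cn/html/ywpd/lstk/tj-sgsj_2.shtml

# An absolute href passes through unchanged
print(urljoin(base, "http://www.lswz.gov.cn/html/other/a.shtml"))

# String replacement: substitute the search keyword into an XPath template
template = "//*[contains(text(),'替換') and @style]/text()"
print(template.replace("替換", "玉米"))
# -> //*[contains(text(),'玉米') and @style]/text()
```

Because `urljoin` handles both relative and absolute hrefs, the scraper does not need to special-case which form a site uses.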

Preface

The goal this time: given a URL and a set of XPath expressions, scrape the contents of every announcement listed on that page.

Code

Source code

import requests
from lxml import etree
import json
import time
from urllib.parse import urljoin

# Global configuration
with open('config.json', 'r', encoding='utf-8') as f:
    JsonFile = json.load(f)

base_url = JsonFile['url']
notice_title_href_xpath = JsonFile['notice_title_href_xpath']
notice_title_xpath = JsonFile['notice_title_xpath']
notice_content_xpath = JsonFile['notice_content_xpath']
search = JsonFile['search']
# Substitute the search keyword into the content XPath template
notice_content_xpath = notice_content_xpath.replace("替換", search)


# Return the link to the next page; this part is still unfinished
def get_next_page_url(current_url):
    current_html = requests.get(current_url, timeout=10)
    current_html.encoding = "utf-8"
    selector = etree.HTML(current_html.text)
    # The Chinese literals below match "next page" link text on Chinese sites
    next_page_url = selector.xpath("""//*[contains(text(),'下一頁') or contains(text(),'下頁') or contains(text(),'next') or contains(text(),'Next')]/@href""")
    print("Next page link:", next_page_url)
    return next_page_url

if __name__ == '__main__':
    current_url = base_url

    current_html = requests.get(current_url, timeout=10)
    current_html.encoding = "utf-8"
    selector = etree.HTML(current_html.text)
    notice = selector.xpath(notice_title_href_xpath)
    print(notice)

    # print(current_html)
    # Earlier experiment with a hard-coded next-page XPath:
    # next_page_url_xpath = """//*[@href = 'tj-sgsj_2.shtml']/@href"""
    # next_page_url = selector.xpath(next_page_url_xpath)
    # print("Next page link:", next_page_url)

    # Scrape every announcement on the current page
    for result in notice:
        # Build an absolute URL; hrefs on many sites are relative
        result_url = urljoin(current_url, result)
        print("URL:", result_url)
        result_html = requests.get(result_url, timeout=10)
        result_html.encoding = "utf-8"
        result_detail = etree.HTML(result_html.text)
        result_title = result_detail.xpath(notice_title_xpath)
        print("Title:", result_title)

        result_content = result_detail.xpath(notice_content_xpath)
        print("Content:")
        for result_print in result_content:
            print(result_print)
        print("\n")
        time.sleep(1)  # be polite: pause between requests
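`get_next_page_url` above is unfinished. One way to finish it is to treat the XPath result (a list of hrefs) as "stop when empty, otherwise resolve the first hit against the current URL". This is a sketch of that decision, not the original post's code; `resolve_next_page` is a name I made up:

```python
from urllib.parse import urljoin

def resolve_next_page(current_url, hrefs):
    """Turn a raw XPath href list into an absolute next-page URL, or None.

    hrefs is the list returned by selector.xpath(...): an empty list means
    no "next page" link was found, so the crawl should stop.
    """
    if not hrefs:
        return None
    href = hrefs[0]
    # Some pagers use javascript pseudo-links; treat those as "no next page"
    if href.startswith("javascript:"):
        return None
    return urljoin(current_url, href)


# The main block could then walk page by page (visited-set omitted):
#   while current_url:
#       ...scrape current_url as above...
#       current_url = resolve_next_page(current_url, get_next_page_url(current_url))
```

Tracking visited URLs would also be needed in practice, since some "next page" links point back to the first page.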

Configuration file

{
    "?url": "[editable] URL of the page to search",
    "url": "http://www.lswz.gov.cn/html/ywpd/lstk/tj-sgsj.shtml",
    "?search": "[editable] keyword to look for in announcement content",
    "search": "玉米",
    "?notice_title_href_xpath": "[do not modify] XPath of each announcement's href",
    "notice_title_href_xpath": "//*[@class='lists diylist']/li/a/@href",
    "?notice_title_xpath": "[do not modify] XPath of each announcement's title",
    "notice_title_xpath": "//div[@class='pub-det-title']/text()",
    "?notice_content_xpath": "[do not modify] XPath of each announcement's content",
    "notice_content_xpath": "//*[contains(text(),'替換') and @style]/text()"
}
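Since the script reads five keys from config.json unconditionally, a small sanity check before scraping gives a clearer error than a raw KeyError. This validator is my addition, not part of the original post:

```python
import json

# The keys the scraper reads from config.json
REQUIRED_KEYS = [
    "url",
    "search",
    "notice_title_href_xpath",
    "notice_title_xpath",
    "notice_content_xpath",
]

def validate_config(config):
    """Return the list of required keys missing from the config dict."""
    return [key for key in REQUIRED_KEYS if key not in config]


# Example: a config missing the three XPath keys
config = json.loads('{"url": "http://example.invalid", "search": "玉米"}')
print("missing keys:", validate_config(config))
```

Calling `validate_config` right after `json.load(f)` and exiting with a message when the result is non-empty keeps the failure close to its cause.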

Reference links

Summary of the text() and string() methods in XPath — Jock2018, CSDN blog

Ultimate fix when a Python crawler using requests cannot get page elements — 咖啡少女不加糖。, cnblogs.com

requests cannot fetch the complete page source — SegmentFault 思否

Python: HTML fetched with requests is incomplete due to JS anti-scraping — Python forum (learnku.com)