[2022.05.24] Adaptive scraping of full-page announcement content on sites without CAPTCHAs (3)
阿新 · Published 2022-05-24
Preface
Today I realized yesterday's idea was flawed. Even after obtaining the HTML dynamically once, if you later fetch the page with requests you only get the static HTML, which may not be the real document. For most sites the two are identical, but a site like the Dalian Commodity Exchange's loads its content dynamically, so selenium is needed to obtain the HTML.
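A minimal sketch of the difference (assuming chromedriver is installed and on the PATH): requests.get() returns the raw HTML before any JavaScript runs, while selenium's page_source returns the DOM after the page has rendered.

import requests
from selenium import webdriver

url = "http://www.dce.com.cn/dalianshangpin/ywfw/jystz/ywtz/13305-2.html"
static_html = requests.get(url).text  # raw HTML, before any JS has run

driver = webdriver.Chrome()           # assumes chromedriver is on the PATH
driver.get(url)
rendered_html = driver.page_source    # DOM after dynamic loading
driver.quit()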
After repeated debugging and comparison, I found the problem was in the HTML comments: as soon as a comment also contains tags, it throws off the detection (tags hidden inside comments can shift the positional indexes in the generated path). So before extracting the XPath, the fetched HTML must have its comments stripped, which is what the following code does:
# remove comments, so getroottree does not produce a wrong xpath
soup = BeautifulSoup(page.text, 'html.parser')
for element in soup(text=lambda text: isinstance(text, Comment)):
    element.extract()
print(str(soup))
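As a quick sanity check, here is a self-contained toy example showing that markup hidden inside a comment really disappears:

from bs4 import BeautifulSoup, Comment

html_doc = '<div>real</div><!-- <div>ghost</div> -->'
soup = BeautifulSoup(html_doc, 'html.parser')
for element in soup(text=lambda text: isinstance(text, Comment)):
    element.extract()
print(str(soup))  # prints: <div>real</div>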
Getting the announcement title is simple: just use the page title of the fetched link.
The exact path is
/html/head/title
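A minimal sketch of fetching that title with lxml (the url below is a hypothetical announcement page; get_notice() in the code section does the same thing):

import requests
from lxml import etree

url = 'https://example.com/notice.html'  # hypothetical announcement URL
page = requests.get(url)
page.encoding = 'utf-8'
selector = etree.HTML(page.text)
print(selector.xpath('/html/head/title/text()')[0])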
What I learned
The most painful part was working out the XPath of the announcement body.
My first idea was to find the div with the most text and return its XPath, but some divs are too big and swallow the text of other divs.
After discussing it with a friend, I settled on a trick: find the p with the most text, then trim its XPath back to the nearest enclosing div, as sketched below.
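A minimal sketch of that trimming step (trim_to_nearest_div is a hypothetical, simplified variant of handle_suffix_of_content_xpath in the code section; it assumes the path contains at least one div):

def trim_to_nearest_div(p_xpath):
    # cut everything after the last 'div' step of the path
    x = p_xpath.rfind('div')       # start of the last 'div' step
    slash = p_xpath.find('/', x)   # first '/' after it, if any
    return p_xpath if slash == -1 else p_xpath[:slash]

print(trim_to_nearest_div('/html/body/div[3]/div[2]/p[5]'))
# prints: /html/body/div[3]/div[2]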
Unsolved problems
selenium has no way to return an element's XPath, which tormented me; and some sites don't follow the usual conventions at all, which makes adaptive scraping very hard.
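The workaround used in get_xpath below is to refetch the page with requests, locate the element by its text with lxml, and ask lxml for the path via getroottree().getpath(). A self-contained sketch:

from lxml import html

page_html = '<html><body><ul><li><a>Notice A</a></li><li><a>Notice B</a></li></ul></body></html>'
root = html.fromstring(page_html)
tree = root.getroottree()
element = root.xpath('//a[contains(text(), "Notice B")]')[0]
print(tree.getpath(element))  # prints: /html/body/ul/li[2]/a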
Some sites also break the pattern: on Fujian Normal University's (福師大) site, for example, the titles sit in a tbody rather than a ul, which caused me real pain.
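One possible fallback, sketched below, would be to scan tbody blocks alongside ul blocks when hunting for the longest list; find_longest_block is hypothetical and not part of the script:

from selenium.webdriver.common.by import By

def find_longest_block(driver):
    # consider <tbody> rows as well as <ul> lists
    blocks = driver.find_elements(By.XPATH, '//ul | //tbody')
    candidates = [b for b in blocks if len(b.text) > 0]
    return max(candidates, key=lambda b: len(b.text), default=None)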
Code
from bs4 import BeautifulSoup, Comment
import requests
from lxml import etree
from lxml import html
import json
import time
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

# Global variables, loaded from config.json
with open('config.json', 'r', encoding='utf-8') as f:
    JsonFile = json.load(f)
base_url = JsonFile['url']
notice_title_href_xpath = JsonFile['notice_title_href_xpath']
notice_title_xpath = JsonFile['notice_title_xpath']
notice_content_xpath = JsonFile['notice_content_xpath']
notice_pages = JsonFile['notice_pages']
search = JsonFile['search']
# substitute the search keyword into the placeholder; the global itself
# is not modified elsewhere, this is for debugging
notice_content_xpath = notice_content_xpath.replace("替換", search)
chrome_driver = r'.\chromedriver.exe'  # location of the chromedriver binary
next_page_keywords = JsonFile['next_page_keywords']
delay = JsonFile['delay']
notice_list_number = JsonFile['notice_list_number']
open_xpath_search_mode = JsonFile['open_xpath_search_mode']
content_page_number = JsonFile['content_page_number']


# Return the links of the following pages; they are loaded dynamically
# with JS, so selenium is needed
def get_next_page_url(current_url, notice_pages):
    s = Service(executable_path=chrome_driver)
    driver = webdriver.Chrome(service=s)
    driver.get(current_url)
    time.sleep(5)
    result = [driver.current_url]
    i = 1
    print(driver.current_url)
    print("Start looking for all announcement pages")
    while True:
        try:
            pre_url = driver.current_url
            driver.find_element(by=By.XPATH, value=next_page_keywords).click()
            if driver.current_url == pre_url:
                break
            i = i + 1
            result.append(driver.current_url)
            if i >= notice_pages:
                break
            time.sleep(delay)
        except Exception:
            print("Reached the page limit")
            break
    print("Found all pages,", i, "pages in total")
    time.sleep(3)
    driver.quit()
    print(result)
    return result


# Adaptively find the announcement list: pick the <ul> with the most text
def get_notice_page_title(base_url):
    s = Service(executable_path=chrome_driver)
    driver = webdriver.Chrome(service=s)
    driver.get(base_url)
    time.sleep(1)
    # dynamic_html = driver.execute_script("return document.documentElement.outerHTML")
    # print(dynamic_html)
    # print(driver.execute_script("return document.documentElement.innerHTML"))
    ul_list = driver.find_elements(By.XPATH, value="//ul")
    result = ""
    temp = 0  # length of the longest <ul> seen so far
    for ul in ul_list:
        # skip list blocks that contain no text
        if len(ul.text) == 0:
            continue
        elif len(ul.text) > temp:
            print("————————————————————————————————————————————")
            print("[Current largest list block]\nlength:", len(ul.text), "\ncontent:\n", ul.text)
            print("————————————————————————————————————————————")
            temp = len(ul.text)
            print(temp)
            # li_list holds the items of the longest block
            li_list = ul.find_elements(By.XPATH, value="li")
            try:
                # sample the second announcement of the list (configurable),
                # to avoid pinned posts
                result = li_list[notice_list_number].find_element(By.XPATH, value="a").text
            except Exception:
                continue
    driver.quit()
    print("\n")
    print("Announcement title obtained:\n", result)
    return result


# Take the title and find its xpath on the page
def get_xpath(title, url, search_element):
    page = requests.get(url)
    page.encoding = 'utf-8'
    # remove comments, so getroottree does not produce a wrong xpath
    soup = BeautifulSoup(page.text, 'html.parser')
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()
    root = html.fromstring(str(soup))
    tree = root.getroottree()
    temp_xpath = "//" + search_element + "[contains(text(),\"" + title + "\")]"
    result = root.xpath(temp_xpath)
    result_xpath = tree.getpath(result[0])
    print("The xpath obtained is:\n", result_xpath)
    # use a for loop if there are several matches:
    # for r in result:
    #     print(tree.getpath(r))
    return result_xpath


# Turn the single-item xpath into a generic one by removing the bracketed
# index after the last li, e.g. li[5] -> li (assumes a single-digit index,
# i.e. the bracket part is exactly 3 characters)
def handle_suffix_of_title_xpath(pre_xpath):
    x = pre_xpath.rfind('[')
    print(x)
    string_list = list(pre_xpath)
    del string_list[x:x + 3]
    after_xpath = ''.join(string_list)
    print("Generic xpath after processing:", after_xpath)
    return after_xpath


# Collect the links of all announcements
def get_all_notice_url_list(notice_pages_list, after_title_xpath):
    notice_list = []
    for notice_page in notice_pages_list:
        current_html = requests.get(notice_page)
        current_html.encoding = "utf-8"
        selector = etree.HTML(current_html.text)
        title_href_xpath = after_title_xpath + "/@href"
        notice = selector.xpath(title_href_xpath)
        # all announcements on the current page
        for result in notice:
            notice_url = urljoin(notice_page, result)
            notice_list.append(notice_url)
            # (debug) fetch each announcement's text title here:
            # notice_html = requests.get(notice_url)
            # notice_html.encoding = "utf-8"
            # notice_selector = etree.HTML(notice_html.text)
            # notice_title_name = notice_selector.xpath("/html/head/title/text()")
            # print("Title: ", notice_title_name[0])
    return notice_list


# Adaptively locate the announcement body: pick the <p> with the most text
def get_notice_content_location(all_notice_url_list):
    i = 0
    result = ''
    for url in all_notice_url_list:
        # only inspect the configured number of pages
        i = i + 1
        if i > content_page_number:
            break
        # start selenium
        s = Service(executable_path=chrome_driver)
        driver = webdriver.Chrome(service=s)
        driver.get(url)
        time.sleep(1)
        p_list = driver.find_elements(By.XPATH, value="//p")
        temp = 0
        for p in p_list:
            # skip paragraphs that contain no text
            if len(p.text) == 0:
                continue
            elif len(p.text) > temp:
                temp = len(p.text)
                print("————————————————————————————————————————————")
                print("[Current largest block]\nlength:", len(p.text), "\ncontent:\n", p.text)
                print("————————————————————————————————————————————")
                result = p.text
        driver.quit()
    print("Content of the <p> finally obtained:", result)
    return result


# Cut the <p> xpath back to its nearest enclosing <div>
def handle_suffix_of_content_xpath(pre_xpath):
    x = pre_xpath.rfind("div")
    string_list = list(pre_xpath)
    for i in range(x, len(pre_xpath)):
        if string_list[i] == '/':
            del string_list[i:len(pre_xpath)]
            break
    after_xpath = ''.join(string_list)
    print("Generic xpath after processing:\n", after_xpath)
    return after_xpath


# Fetch and print every announcement using the generic content xpath
def get_notice(url_list, notice_content_xpath):
    for url in url_list:
        print("URL: ", url)
        notice_html = requests.get(url)
        notice_html.encoding = "utf-8"
        notice_selector = etree.HTML(notice_html.text)
        notice_title_name_xpath = "/html/head/title/text()"
        notice_title_name = notice_selector.xpath(notice_title_name_xpath)
        print("Title: ", notice_title_name[0])
        notice_content = notice_selector.xpath(notice_content_xpath)[0].xpath("string(.)")
        # drop blank lines from the extracted text
        print("Body:\n", "".join([s for s in notice_content.splitlines(True) if s.strip()]))


if __name__ == '__main__':
    # [dynamic] adaptively get a title from the notice list, judged by the
    # <ul> with the most text
    title = get_notice_page_title(base_url)
    # [static] use that title to get the title xpath on the page
    title_xpath = get_xpath(title, base_url, "a")
    # turn the single-item xpath into a generic one
    after_title_xpath = handle_suffix_of_title_xpath(title_xpath)
    # [dynamic] adaptively collect all following pages, in case the site
    # generates next-page links with JS
    notice_pages_list = get_next_page_url(base_url, notice_pages)
    # [static] use the generic xpath to collect all announcement links
    all_notice_list = get_all_notice_url_list(notice_pages_list, after_title_xpath)
    # [dynamic] adaptively locate the announcement body: the longest <p>
    notice_content_location = get_notice_content_location(all_notice_list)
    # [static] use that content to get the content xpath on the page
    notice_content_location_xpath = get_xpath(notice_content_location, all_notice_list[0], "p")
    # cut the single-item xpath back to its parent div
    after_content_xpath = handle_suffix_of_content_xpath(notice_content_location_xpath)
    # [static] turn the generic xpath into the actual content
    get_notice(all_notice_list, after_content_xpath)

    # Legacy static approach, kept for reference:
    # current_html = requests.get(current_url)
    # current_html.encoding = "utf-8"
    # selector = etree.HTML(current_html.text)
    # notice = selector.xpath(notice_title_href_xpath)
    # print(notice)
    # # all announcements on the current page
    # for result in notice:
    #     result_url = urljoin(current_url, result)
    #     print("URL: ", result_url)
    #     result_html = requests.get(result_url)
    #     result_html.encoding = "utf-8"
    #     result_detail = etree.HTML(result_html.text)
    #     result_title = result_detail.xpath(notice_title_xpath)
    #     print("Title: ", result_title)
    #     result_content = result_detail.xpath(notice_content_xpath)
    #     print("Content: ")
    #     for result_print in result_content:
    #         print(result_print)
    #     print("\n")
    #     time.sleep(delay)
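To run the script, config.json and chromedriver.exe have to sit in the same directory as the script itself (the chrome_driver global is hard-coded to r'.\chromedriver.exe'). The main block then walks through the steps in order: sample a title, derive and generalize the title xpath, collect the page list and announcement links, locate the content, and finally print every announcement.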
Configuration
{ "?url": "【可修改】用於查詢的網頁", "url": "http://www.dce.com.cn/dalianshangpin/ywfw/jystz/ywtz/13305-2.html", "?search": "【可修改】要查詢的公告內容", "search": "玉米", "?notice_pages": "【可修改】爬取公告的頁數的上限,預設100頁", "notice_pages": 10, "?1": "——————【分割線,以下請在瞭解Xpath後修改】——————", "?open_xpath_search_mode": "是否開啟xpath搜尋模式,預設關閉0,開啟是1", "open_xpath_search_mode": 0, "?notice_title_href_xpath": "獲取每個公告href的Xpath位置", "notice_title_href_xpath": "//*[@class='lists diylist']/li/a/@href", "?notice_title_xpath": "獲取每個公告title的Xpath位置", "notice_title_xpath": "//div[@class='pub-det-title']/text()", "?notice_content_xpath": "獲取每個公告content的Xpath位置", "notice_content_xpath": "//*[contains(text(),'替換') and @style]/text()", "?2": "——————【分割線,以下只允許程式設計師除錯】——————", "?next_page_keywords": "呼叫selenium時候獲取下一頁href的關鍵詞", "next_page_keywords": "//*[contains(text(),'下一頁') or contains(text(),'下頁')]", "?delay": "延遲,防止反爬蟲", "delay": 1, "?notice_list_number": "防止置頂帖子引起公告列表xpath混亂,預設採用第二個公告作為採集xpath連結", "notice_list_number": 1, "?content_page_number": "公告內容位置監測自適應頁面數量,越多越慢", "content_page_number": 1 }
Reference links
The difference between innerHTML and outerHTML — 高明懿大可愛's blog, CSDN
Python BeautifulSoup in practice: removing comments from HTML — 樂天筆記 (letianbiji.com)