1. 程式人生 > 其它 > Python + Selenium 爬取房天下新房詳情

Python + Selenium 爬取房天下新房詳情

新房詳情

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
import json
from datetime import datetime
import re

# ---------------------------------------------------------------------------
# Crawl configuration
# ---------------------------------------------------------------------------

# Raw string: the original non-raw literal only worked because "\c" is an
# unrecognised escape that Python happens to keep verbatim (newer interpreters
# warn about it).  The runtime value is unchanged.
CHROME_DRIVER_PATH = r"D:\chrome_driver_win32\chromedriver.exe"

# Listing page URL; "{}" is filled with the page number.
LIST_URL_TEMPLATE = "https://nanjing.newhouse.fang.com/house/s/b9{}/"
LIST_PAGE_NUM = 1

# Results are appended to this file, one JSON object per line.
OUTPUT_PATH = '詳情(南京).jsonlines'

# The original loop hit an unconditional `break` after the first listing, so
# only one detail page was ever scraped.  That behaviour is kept as the
# default; set this to None to crawl every listing found on the page.
MAX_HOUSES = 1

LISTING_LINK_XPATH = '//*[@class="nl_con clearfix"]/ul/li/div/div[1]/a'
DISTRICT_XPATH = '//div[@class="br_left"]//ul[@class="tf f12"]//li[3]/a'
CITY_XPATH = '//div[@class="s2"]/div/a'
# NOTE(review): the link text below is in Traditional Chinese; if the target
# site serves Simplified Chinese this will never match -- confirm against the
# live page.
DETAIL_LINK_XPATH = (
    "//*[@class='main_1200 tf']//div[@class='cxfnav']"
    "//a[contains(text(),'樓盤詳情')]"
)
MAIN_ITEM_XPATH = (
    '//div[@class="main_1200 tf"]//div[@class="main_1200"]'
    '//div[@class="main-cont clearfix"]//div[@class="main-left"]'
    '//div[@class="main-item"]'
)
MAP_IFRAME_XPATH = '//div[@class="mapbox_dt"]/iframe'

# Matches the "mapx"/"mapy" pair embedded in the map page source.
COORD_PATTERN = re.compile(r'"mapx":"(.*?)","mapy":"(.*?)"', re.DOTALL)


def extract_house_coord(page_source):
    """Return the coordinate string found in *page_source*, or None.

    The result is the captured ``mapy`` value, a comma, then the captured
    ``mapx`` value (same order as the original script).  None is returned
    when the page source contains no ``"mapx":"...","mapy":"..."`` pair.
    """
    match = COORD_PATTERN.search(page_source)
    if match is None:
        return None
    return match.group(2) + "," + match.group(1)


def create_driver():
    """Create the Chrome driver used for the crawl."""
    option = webdriver.ChromeOptions()
    # Suppress useless log output from the driver.
    option.add_experimental_option(
        "excludeSwitches", ['enable-automation', 'enable-logging'])
    # NOTE(review): `executable_path` / `chrome_options` and the
    # `find_element(s)_by_xpath` helpers used below are the Selenium 3 API;
    # they are kept as-is to match the version this script was written for.
    return webdriver.Chrome(
        executable_path=CHROME_DRIVER_PATH, chrome_options=option)


def collect_listing_urls(driver, page_num=LIST_PAGE_NUM):
    """Open the listing page and return the detail-page URLs found on it.

    All hrefs are read before any further navigation, because the link
    elements belong to the listing page.  Links without an href are skipped.
    """
    driver.get(LIST_URL_TEMPLATE.format(page_num))
    links = driver.find_elements_by_xpath(LISTING_LINK_XPATH)
    hrefs = [link.get_attribute('href') for link in links]
    return [href for href in hrefs if href]


def scrape_detail_extras(driver, data):
    """Print the detail-page attributes and add 'housecoord' to *data*.

    Navigates from the current house page to its detail page, prints the
    text of every ``li`` under each main item, then opens the map iframe
    source and stores the coordinates when they can be parsed.
    """
    detail_link = driver.find_element_by_xpath(DETAIL_LINK_XPATH)
    driver.get(detail_link.get_attribute('href'))

    for item in driver.find_elements_by_xpath(MAIN_ITEM_XPATH):
        # ".//" makes the lookup relative to the current element.
        for node in item.find_elements_by_xpath('.//ul//li'):
            print(node.text)
        print('-' * 50)

    # Location: the coordinates live in the source of the map iframe.
    map_url = driver.find_element_by_xpath(
        MAP_IFRAME_XPATH).get_attribute("src")
    driver.get(map_url)
    coord = extract_house_coord(driver.page_source)
    if coord is not None:
        data['housecoord'] = coord


def scrape_house(driver, url):
    """Scrape one house page and return its data dict.

    'subarea' and 'area' are required (a missing element raises, as in the
    original script).  The detail/coordinate step is best-effort: a failure
    there is reported and the partial record is still returned.
    """
    driver.get(url)
    data = {}
    district = driver.find_element_by_xpath(DISTRICT_XPATH).text
    # Slice off the last two characters of the breadcrumb text.
    data['subarea'] = district[:-2]
    # Current city.
    data['area'] = driver.find_element_by_xpath(CITY_XPATH).text
    try:
        scrape_detail_extras(driver, data)
    except Exception as exc:
        # Best-effort step: keep going, but do not hide the failure.
        print('warning: detail scraping failed for {}: {!r}'.format(url, exc))
    return data


def main():
    """Crawl the listing page and append the scraped records to OUTPUT_PATH."""
    driver = create_driver()
    try:
        data_list = []
        for url in collect_listing_urls(driver)[:MAX_HOUSES]:
            data_list.append(scrape_house(driver, url))
    finally:
        # Always release the browser, even when a lookup raises.
        driver.quit()

    print(data_list)
    with open(OUTPUT_PATH, 'a', encoding='utf8') as f:
        for data in data_list:
            json.dump(data, f, ensure_ascii=False)
            f.write('\n')


if __name__ == "__main__":
    main()