python 爬蟲例項(四)
阿新 • • 發佈:2018-12-19
環境:
OS:Window10
python:3.7
爬取鏈家地產上的資料(跨兩個頁面的資料爬取)
效果:將下面兩個網頁中的資料取出來
程式碼
import datetime import threading import requests from bs4 import BeautifulSoup class LianjiaHouseInfo: ''' 初期化變數的值 ''' def __init__(self): # 定義自己要爬取的URL self.url = "https://dl.lianjia.com/ershoufang/pg{0}" self.path = r"C:\pythonProject\Lianjia_House" self.headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"} ''' 訪問URL ''' def request(self, param): # 如果不加的話可能會出現403的錯誤,所以儘量的都加上header,模仿網頁來訪問 req = requests.get(param, headers=self.headers) # req.raise_for_status() # req.encoding = req.apparent_encoding return req.text ''' page設定 ''' def all_pages(self, pageCn): dataListA = [] for i in range(1, pageCn+1): if pageCn == 1: dataListA = dataListA + self.getData(self.url[0:self.url.find("pg")]) else: url = self.url.format(i) dataListA = dataListA + self.getData(url) # self.dataOrganize(dataListA) ''' 資料取得 ''' def getData(self, url): dataList = [] thread_lock.acquire() req = self.request(url) # driver = webdriver.Chrome() # driver.get(self.url) # iframe_html = driver.page_source # driver.close() # print(iframe_html) soup = BeautifulSoup(req, 'lxml') countHouse = soup.find(class_="total fl").find("span") print("共找到 ", countHouse.string, " 套大連二手房") sell_all = soup.find(class_="sellListContent").find_all("li") for sell in sell_all: title = sell.find(class_="title") if title is not None: print("------------------------概要--------------------------------------------") title = title.find("a") print("title:", title.string) housInfo = sell.find(class_="houseInfo").get_text() print("houseInfo:", housInfo) positionInfo = sell.find(class_="positionInfo").get_text() print("positionInfo:", positionInfo) followInfo = sell.find(class_="followInfo").get_text() print("followInfo:", followInfo) print("------------------------詳細資訊--------------------------------------------") url_detail = title["href"] req_detail = self.request(url_detail) soup_detail = BeautifulSoup(req_detail, "lxml") total = 
soup_detail.find(class_="total") unit = soup_detail.find(class_="unit").get_text() dataList.append(total.string+unit) print("總價:", total.string, unit) unitPriceValue = soup_detail.find(class_="unitPriceValue").get_text() dataList.append(unitPriceValue) print("單價:", unitPriceValue) room_mainInfo = soup_detail.find(class_="room").find(class_="mainInfo").get_text() dataList.append(room_mainInfo) print("戶型:", room_mainInfo) type_mainInfo = soup_detail.find(class_="type").find(class_="mainInfo").get_text() dataList.append(type_mainInfo) print("朝向:", type_mainInfo) area_mainInfo = soup_detail.find(class_="area").find(class_="mainInfo").get_text() dataList.append(area_mainInfo) print("面積:", area_mainInfo) else: print("広告です") thread_lock.release() return dataList # # def dataOrganize(self, dataList): # # data2 = pd.DataFrame(dataList) # data2.to_csv(r'C:\Users\peiqiang\Desktop\lagoujob.csv', header=False, index=False, mode='a+') # data3 = pd.read_csv(r'C:\Users\peiqiang\Desktop\lagoujob.csv', encoding='gbk') thread_lock = threading.BoundedSemaphore(value=100) house_Info = LianjiaHouseInfo() startTime = datetime.datetime.now() house_Info.all_pages(1) endTime = datetime.datetime.now() print("実行時間:", (endTime - startTime).seconds)
執行之後的效果