
Scraping + Data Analysis: Should You Buy a House in Chongqing? Scraping Chongqing Housing Prices


These days, when a couple gets married, the bride's family usually expects an apartment in the city. To understand how housing prices have moved in recent years, the first step is to collect listing data from the web. This post takes new-home listings on Lianjia's Chongqing site as an example and scrapes the data for analysis.

The Scraper

1. URL Analysis
https://cq.fang.lianjia.com/loupan/

Next, let's work out where the information we want sits in the page. Open the browser's developer tools and inspect the elements: each listing's information is stored in one li tag.

Click into one li tag and locate the listing name, address, and price inside it.

Looking at the URL itself: each time I click "next page", the pg parameter changes.
Page 1 is pg1, page 2 is pg2, and so on.
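To illustrate, here is a minimal sketch (my addition) that builds the paginated URLs, since the pg parameter is the only part that changes:

# Build the listing-page URLs; only the pg parameter varies per page.
base = "https://cq.fang.lianjia.com/loupan/pg{}/"
urls = [base.format(i) for i in range(1, 101)]  # pages 1 to 100
print(urls[0])  # https://cq.fang.lianjia.com/loupan/pg1/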

2. Scraping a Single Page
We fetch the page with requests and parse it with BeautifulSoup.

from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException

# Fetch one page and return its HTML text
def craw(url, page):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
        html1 = requests.request("GET", url, headers=headers, timeout=10)
        html1.encoding = 'utf-8'  # set the encoding explicitly: the raw response body is bytes
        html = html1.text
        return html
    except RequestException:  # network problem, timeout, etc.
        print('Failed to fetch page {0}'.format(page))
        return None

for i in range(1, 2):  # just page 1 for now
    url = "https://cq.fang.lianjia.com/loupan/pg" + str(i) + "/"
    html = craw(url, i)
    print(html)
print('Done')

3. Extracting the Listing Information


# Parse the page and extract each listing's fields
def pase_page(url, page):
    html = craw(url, page)
    if html is not None:  # check before parsing; craw returns None on failure
        soup = BeautifulSoup(html, 'lxml')
        # -- First locate the listings: one li tag per house --
        houses = soup.select('.resblock-list-wrapper li')  # list of listings
        # -- Then pull each field out of a listing --
        for house in houses:
            # Name
            recommend_project = house.select('.resblock-name a.name')
            recommend_project = [i.get_text() for i in recommend_project]  # e.g. 英華天元, 斌鑫江南御府...
            # Type
            house_type = house.select('.resblock-name span.resblock-type')
            house_type = [i.get_text() for i in house_type]  # office building, street-front shop...
            # Sale status
            sale_status = house.select('.resblock-name span.sale-status')
            sale_status = [i.get_text() for i in sale_status]  # on sale, sold out...
            # District, e.g. ['南岸', '南坪']
            big_address = house.select('.resblock-location span')
            big_address = [i.get_text() for i in big_address]  # ['南岸', '南坪'], ['巴南', '李家沱']...
            # Street address, e.g. 銅元局輕軌站菜園壩長江大橋南橋頭堡上
            small_address = house.select('.resblock-location a')
            small_address = [i.get_text() for i in small_address]  # 銅元局輕軌站..., 龍洲大道1788號...
            # Selling points, e.g. ['環線房', '近主幹道', '配套齊全', '購物方便']
            advantage = house.select('.resblock-tag span')
            advantage = [i.get_text() for i in advantage]
            # Average price per square metre
            average_price = house.select('.resblock-price .main-price .number')
            average_price = [i.get_text() for i in average_price]  # 16000, 25000, price TBD...
            # Total price, in units of 10,000 yuan
            total_price = house.select('.resblock-price .second')
            total_price = [i.get_text() for i in total_price]  # e.g. 總價400萬/套, 總價100萬/套...

4. Scraping Multiple Pages and Saving the Data to a CSV

from bs4 import BeautifulSoup
import numpy as np
import requests
from requests.exceptions import RequestException
import pandas as pd

# Fetch one page and return its HTML text
def craw(url, page):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
        html1 = requests.request("GET", url, headers=headers, timeout=10)
        html1.encoding = 'utf-8'  # set the encoding explicitly: the raw response body is bytes
        html = html1.text
        return html
    except RequestException:  # network problem, timeout, etc.
        print('Failed to fetch page {0}'.format(page))
        return None
# Parse the page and append each listing to a CSV file
def pase_page(url, page):
    html = craw(url, page)
    if html is not None:  # check before parsing; craw returns None on failure
        soup = BeautifulSoup(html, 'lxml')
        # -- First locate the listings: one li tag per house --
        houses = soup.select('.resblock-list-wrapper li')  # list of listings
        # -- Then pull each field out of a listing --
        for j in range(len(houses)):
            house = houses[j]
            # Name
            recommend_project = house.select('.resblock-name a.name')
            recommend_project = [i.get_text() for i in recommend_project]  # e.g. 英華天元, 斌鑫江南御府...
            recommend_project = ' '.join(recommend_project)
            # Type
            house_type = house.select('.resblock-name span.resblock-type')
            house_type = [i.get_text() for i in house_type]  # office building, street-front shop...
            house_type = ' '.join(house_type)
            # Sale status
            sale_status = house.select('.resblock-name span.sale-status')
            sale_status = [i.get_text() for i in sale_status]  # on sale, sold out...
            sale_status = ' '.join(sale_status)
            # District, e.g. ['南岸', '南坪']
            big_address = house.select('.resblock-location span')
            big_address = [i.get_text() for i in big_address]  # ['南岸', '南坪'], ['巴南', '李家沱']...
            big_address = ''.join(big_address)
            # Street address, e.g. 銅元局輕軌站菜園壩長江大橋南橋頭堡上
            small_address = house.select('.resblock-location a')
            small_address = [i.get_text() for i in small_address]
            small_address = ' '.join(small_address)
            # Selling points, e.g. ['環線房', '近主幹道', '配套齊全', '購物方便']
            advantage = house.select('.resblock-tag span')
            advantage = [i.get_text() for i in advantage]
            advantage = ' '.join(advantage)
            # Average price per square metre
            average_price = house.select('.resblock-price .main-price .number')
            average_price = [i.get_text() for i in average_price]  # 16000, 25000, price TBD...
            average_price = ' '.join(average_price)
            # Total price, in units of 10,000 yuan
            total_price = house.select('.resblock-price .second')
            total_price = [i.get_text() for i in total_price]  # e.g. 總價400萬/套...
            total_price = ' '.join(total_price)

            # -------------- Write one row to the CSV --------------
            information = [recommend_project, house_type, sale_status, big_address, small_address, advantage, average_price, total_price]
            information = np.array(information)
            information = information.reshape(-1, 8)
            information = pd.DataFrame(information, columns=['名稱', '型別', '銷售狀態', '大地址', '具體地址', '優勢', '均價', '總價'])
            if page == 1 and j == 0:
                information.to_csv('鏈家網重慶房子資料.csv', mode='a+', index=False)  # very first row: write the header too
            else:
                information.to_csv('鏈家網重慶房子資料.csv', mode='a+', index=False, header=False)  # append without header
        print('Page {0} saved successfully'.format(page))
    else:
        print('Parsing failed')


for i in range(1, 101):  # pages 1 to 100
    url = "https://cq.fang.lianjia.com/loupan/pg" + str(i) + "/"
    pase_page(url, i)

print('Done')
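After the crawl finishes, a quick sanity check (my addition) is to read the CSV back with pandas and confirm the row count and columns:

import pandas as pd

# Read back the scraped data to verify what was saved.
df = pd.read_csv('鏈家網重慶房子資料.csv')
print(df.shape)   # (rows, 8)
print(df.head())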


5. Multi-threaded Scraping

from bs4 import BeautifulSoup
import numpy as np
import requests
from requests.exceptions import RequestException
import pandas as pd


# Fetch one page and return its HTML text
def craw(url, page):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}
        html1 = requests.request("GET", url, headers=headers, timeout=10)
        html1.encoding = 'utf-8'  # set the encoding explicitly: the raw response body is bytes
        html = html1.text
        return html
    except RequestException:  # network problem, timeout, etc.
        print('Failed to fetch page {0}'.format(page))
        return None
# Parse the page and append each listing to a CSV file
def pase_page(url, page):
    html = craw(url, page)
    if html is not None:  # check before parsing; craw returns None on failure
        soup = BeautifulSoup(html, 'lxml')
        # -- First locate the listings: one li tag per house --
        houses = soup.select('.resblock-list-wrapper li')  # list of listings
        # -- Then pull each field out of a listing --
        for house in houses:
            # Name
            recommend_project = house.select('.resblock-name a.name')
            recommend_project = [i.get_text() for i in recommend_project]  # e.g. 英華天元, 斌鑫江南御府...
            recommend_project = ' '.join(recommend_project)
            # Type
            house_type = house.select('.resblock-name span.resblock-type')
            house_type = [i.get_text() for i in house_type]  # office building, street-front shop...
            house_type = ' '.join(house_type)
            # Sale status
            sale_status = house.select('.resblock-name span.sale-status')
            sale_status = [i.get_text() for i in sale_status]  # on sale, sold out...
            sale_status = ' '.join(sale_status)
            # District, e.g. ['南岸', '南坪']
            big_address = house.select('.resblock-location span')
            big_address = [i.get_text() for i in big_address]  # ['南岸', '南坪'], ['巴南', '李家沱']...
            big_address = ''.join(big_address)
            # Street address, e.g. 銅元局輕軌站菜園壩長江大橋南橋頭堡上
            small_address = house.select('.resblock-location a')
            small_address = [i.get_text() for i in small_address]
            small_address = ' '.join(small_address)
            # Selling points, e.g. ['環線房', '近主幹道', '配套齊全', '購物方便']
            advantage = house.select('.resblock-tag span')
            advantage = [i.get_text() for i in advantage]
            advantage = ' '.join(advantage)
            # Average price per square metre
            average_price = house.select('.resblock-price .main-price .number')
            average_price = [i.get_text() for i in average_price]  # 16000, 25000, price TBD...
            average_price = ' '.join(average_price)
            # Total price, in units of 10,000 yuan
            total_price = house.select('.resblock-price .second')
            total_price = [i.get_text() for i in total_price]  # e.g. 總價400萬/套...
            total_price = ' '.join(total_price)

            # -------------- Write one row to the CSV --------------
            information = [recommend_project, house_type, sale_status, big_address, small_address, advantage, average_price, total_price]
            information = np.array(information)
            information = information.reshape(-1, 8)
            information = pd.DataFrame(information, columns=['名稱', '型別', '銷售狀態', '大地址', '具體地址', '優勢', '均價', '總價'])

            information.to_csv('鏈家網重慶房子資料.csv', mode='a+', index=False, header=False)  # append without header
        print('Page {0} saved successfully'.format(page))
    else:
        print('Parsing failed')


# Two threads per round
import threading
for i in range(1, 99, 2):  # pages 1 to 98, two at a time
    url1 = "https://cq.fang.lianjia.com/loupan/pg" + str(i) + "/"
    url2 = "https://cq.fang.lianjia.com/loupan/pg" + str(i + 1) + "/"

    t1 = threading.Thread(target=pase_page, args=(url1, i))      # thread 1
    t2 = threading.Thread(target=pase_page, args=(url2, i + 1))  # thread 2
    t1.start()
    t2.start()
    t1.join()  # wait for both threads, so only two pages are fetched at once
    t2.join()
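As an alternative sketch (my addition, not the author's code), a ThreadPoolExecutor from the standard library caps the number of in-flight requests without managing Thread objects by hand:

from concurrent.futures import ThreadPoolExecutor

# Sketch: bounded thread pool; at most two pages are fetched concurrently.
with ThreadPoolExecutor(max_workers=2) as pool:
    for page in range(1, 99):
        url = "https://cq.fang.lianjia.com/loupan/pg{}/".format(page)
        pool.submit(pase_page, url, page)
# Leaving the with-block waits for all submitted tasks to finish.

Note that with more than one worker, two pages can still append to the CSV at the same moment; fully serializing the writes would additionally need a threading.Lock around the to_csv call.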

Probably because of network problems, many pages were never downloaded.

Only about 438 records were saved, against roughly 1,838 listings on the site.
You could record the failed page numbers and request them again; I won't bother here and will make do with what I have, but a sketch of the idea follows.
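A minimal sketch of that retry idea, under the assumption (not true of the code above) that pase_page is adjusted to return True on success and False when the fetch fails:

# Assumption: pase_page returns True on success, False when craw returned None.
failed_pages = []
for i in range(1, 101):
    url = "https://cq.fang.lianjia.com/loupan/pg" + str(i) + "/"
    if not pase_page(url, i):
        failed_pages.append(i)

for i in failed_pages:  # one retry round for the pages that failed
    pase_page("https://cq.fang.lianjia.com/loupan/pg" + str(i) + "/", i)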
