程式人生 > python_爬取【安居客房源資訊】

python_爬取【安居客房源資訊】

最近在看房子,試著抓取了安居客上房源資訊,供大家學習參考。

#-*- encoding=UTF-8 -*-
from urllib.request import urlopen
from bs4 import BeautifulSoup
import xlrd
import xlwt

# Crawl configuration.
city = "sz"      # city abbreviation, used as the site sub-domain
html_sheet = 5   # number of result pages to fetch
# URL prefix of the listing pages; the crawl loop appends the 1-based page
# number and a trailing "/" to it.
url = "https://{}.fang.anjuke.com/loupan/all/p".format(city)

### Marker substrings looked up in each line of a listing's markup.
address_key = '<span class="list-map" target="_blank">'
huxing_key = 'huxing'
url_key = 'class="tags-wrap" href="'
panel_key = 'class="tag-panel"'
price_key = 'class="price"'
price_around_key = '"favor-tag around-price"'
tel_key = 'class="tel">'

### Parallel result lists: index k of every list describes the same listing.
### In order: name, address, floor plans (huxing), detail-page URL,
### status tags (panel), price, telephone.
(
    loupan_title,
    loupan_address,
    loupan_huxing,
    loupan_url,
    loupan_panel,
    loupan_price,
    loupan_tel,
) = ([] for _ in range(7))

# Crawl every result page and append exactly one entry per listing to each of
# the parallel loupan_* lists; a placeholder is stored for any field that is
# not found, so the lists always stay the same length.
for inum in range(html_sheet):
    real_url = url + str(inum + 1) + "/"
    print(real_url)
    # Read the body inside a context manager so the HTTP response is closed.
    with urlopen(real_url) as html:
        page_source = html.read()
    # Name the parser explicitly; otherwise BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    anjuke_html = BeautifulSoup(page_source, "html.parser")

    ### Raw split: cut the serialized page at every listing-title marker.
    title_key_start = "<span class=\"items-name\">"
    loupan_arr = []
    for ihtml in anjuke_html:
        data = str(ihtml).split(title_key_start)
        if len(data) > 1:
            loupan_arr.extend(data)

    ### Filter: drop the chunk that precedes the first title and cut the last
    ### chunk at the pagination marker.
    title_key_stop = "\"list-page\""
    dest_loupan_arr = loupan_arr[1:-1]
    if loupan_arr:
        dest_loupan_arr.append(str(loupan_arr[-1]).split(title_key_stop)[0])

    ### Parse each listing line by line.
    for i in dest_loupan_arr:
        data = str(i).split("\n")
        line_total = len(data)
        # None means "not found yet"; only the first match of a field is kept
        # so that a repeated marker cannot add a second entry for one listing.
        address = huxing = link = panel = price = tel = None

        # The chunk starts right after the title marker, so the listing name
        # is the text of line 0 up to the next tag.
        loupan_title.append(data[0].split("<")[0])

        for j in range(1, line_total):
            line = data[j]
            if address_key in line:
                if address is None:
                    address = line.split(address_key)[1].split("<")[0]
            elif huxing_key in line:
                if huxing is None:
                    huxing = ""
                    real_j = j + 1
                    # Bounded scan: stop at the closing </a> line or at the
                    # end of the chunk instead of raising IndexError.
                    while real_j < line_total and data[real_j] != "</a>":
                        if "<span>" in data[real_j]:
                            for it in data[real_j].split("<span>"):
                                if "<" not in it:
                                    continue
                                t = it.split("<")[0]
                                # Floor-area label. The Simplified spelling is
                                # accepted too: presumably the site serves
                                # Simplified Chinese, in which case the
                                # Traditional literal alone never matches
                                # (TODO confirm against the live page).
                                if "建築面積" in t or "建筑面积" in t:
                                    huxing = huxing.rstrip("/") + " " + t
                                else:
                                    huxing = huxing + t + "/"
                        else:
                            huxing = huxing + data[real_j].strip()
                        real_j += 1
            elif url_key in line:
                if link is None:
                    link = line.split(url_key)[1].split("\"")[0]
            elif panel_key in line:
                if panel is None:
                    tags = []
                    real_j = j + 1
                    while real_j < line_total and data[real_j] != "</div>":
                        pieces = data[real_j].split(">")
                        # Lines without any tag carry no label text; skip
                        # them rather than failing on pieces[1].
                        if len(pieces) > 1:
                            tags.append(pieces[1].split("<")[0])
                        real_j += 1
                    panel = " ".join(tags).strip()
            elif price_key in line:
                if price is None:
                    # Concatenate the text that precedes each tag on the line.
                    price = "".join(p.split("<")[0] for p in line.split(">"))
            elif price_around_key in line:
                # The surrounding-area price text sits on the following line.
                if price is None and j + 1 < line_total:
                    price = "".join(
                        p.split("<")[0].strip() for p in data[j + 1].split(">")
                    )
            elif tel_key in line:
                if tel is None:
                    tel = line.split(tel_key)[1].split("<")[0]

        # One entry per list for every listing, with placeholders for gaps.
        loupan_address.append("地址未知" if address is None else address)
        loupan_huxing.append("戶型未知" if huxing is None else huxing)
        loupan_url.append("網址未知" if link is None else link)
        loupan_panel.append("狀態未知" if panel is None else panel)
        loupan_price.append("售價待定" if price is None else price)
        loupan_tel.append("號碼未知" if tel is None else tel)

# Build the spreadsheet rows: a header row followed by one row per listing.
# Columns: name, price, floor plans, address, status, URL, telephone.
excel_col = [[u'樓盤', u'價格', u'戶型', u'地址', u'狀態', u'網址', u'電話']]
field_columns = (
    loupan_title,
    loupan_price,
    loupan_huxing,
    loupan_address,
    loupan_panel,
    loupan_url,
    loupan_tel,
)
# Index-based on purpose: every list is read at each index of loupan_title.
for row_index in range(len(loupan_title)):
    excel_col.append([column[row_index] for column in field_columns])

#### Write the collected rows to an Excel workbook.
app = xlwt.Workbook()  # create the workbook
sheet1 = app.add_sheet(u'sheet1', cell_overwrite_ok=True)  # create the sheet

for icol, row_values in enumerate(excel_col):
    for jcol, cell_value in enumerate(row_values):
        sheet1.write(icol, jcol, cell_value)
# xlwt writes the legacy binary .xls format only, so the file must carry the
# .xls extension; the same bytes saved as ".xlsx" are rejected by Excel.
# The target directory must already exist.
app.save("C:/bz/" + str(city) + ".xls")  # save the file

結果展示: