python_爬取【安居客房源資訊】
阿新 • • 發佈:2019-02-16
最近在看房子，試著抓取了安居客上的房源資訊，供大家學習參考。
# -*- coding: utf-8 -*-
"""Scrape new-development listings from Anjuke and export them to Excel.

The script downloads ``html_sheet`` result pages for ``city``, cuts every
page into one text chunk per listing, extracts the listing fields with
simple marker-based line scanning and writes one spreadsheet row per
listing.
"""
import os
from urllib.request import urlopen

city = "sz"  # city abbreviation, used as the sub-domain
html_sheet = 5  # number of result pages to fetch
url = "https://" + str(city) + ".fang.anjuke.com/loupan/all/p"
output_dir = "C:/bz/"  # directory the workbook is written to

# Markers that identify the line carrying each field of a listing.
title_key_start = "<span class=\"items-name\">"
title_key_stop = "\"list-page\""
address_key = "<span class=\"list-map\" target=\"_blank\">"
huxing_key = "huxing"
url_key = "class=\"tags-wrap\" href=\""
panel_key = "class=\"tag-panel\""
price_key = "class=\"price\""
price_around_key = "\"favor-tag around-price\""
tel_key = "class=\"tel\">"

# "Floor area" label in the layout block.  The Traditional spelling is the
# one the script shipped with; the Simplified spelling is accepted as well
# because the site presumably serves Simplified Chinese -- TODO confirm.
area_markers = ("建築面積", "建筑面积")

# Column titles: name, price, layout, address, status, link, phone.
excel_header = [u'樓盤', u'價格', u'戶型', u'地址', u'狀態', u'網址', u'電話']


def split_listings(fragments):
    """Cut page markup into one text chunk per listing.

    ``fragments`` is any iterable whose items stringify to markup (for
    example the top-level nodes of a parsed page).  Everything before the
    first listing title is dropped and the last chunk is truncated at the
    pagination marker.  Returns an empty list when no listing is found.
    """
    pieces = []
    for fragment in fragments:
        parts = str(fragment).split(title_key_start)
        if len(parts) > 1:
            pieces.extend(parts)
    if not pieces:
        return []
    # pieces[0] is the page header; the final piece also holds the footer.
    return pieces[1:-1] + [str(pieces[-1]).split(title_key_stop)[0]]


def _inner_text(line):
    """Return the text between the first '>' and the next '<' of *line*."""
    parts = line.split(">")
    if len(parts) < 2:
        return ""
    return parts[1].split("<")[0]


def _collect_layout(lines, start):
    """Build the layout summary from the lines after the layout marker.

    Scans until a line equal to ``</a>`` or the end of the listing,
    whichever comes first, so a truncated listing cannot overrun.
    """
    summary = ""
    index = start
    while index < len(lines) and lines[index] != "</a>":
        line = lines[index]
        if "<span>" in line:
            for part in line.split("<span>"):
                if "<" not in part:
                    continue
                text = part.split("<")[0]
                if any(marker in text for marker in area_markers):
                    # The area follows the room types, separated by a space
                    # instead of the trailing "/".
                    summary = summary.rstrip("/") + " " + text
                else:
                    summary = summary + text + "/"
        else:
            summary = summary + line.strip()
        index += 1
    return summary


def _collect_panel(lines, start):
    """Join the tag texts after the status marker up to ``</div>``."""
    texts = []
    index = start
    while index < len(lines) and lines[index] != "</div>":
        texts.append(_inner_text(lines[index]))
        index += 1
    return " ".join(texts).strip()


def parse_listing(chunk):
    """Extract one spreadsheet row from a single listing chunk.

    Returns ``[title, price, layout, address, status, link, phone]``.
    All fields of a listing are gathered into one record so that a field
    missing from the page can never shift the columns of other listings;
    when a marker occurs more than once the first occurrence wins.
    """
    lines = str(chunk).split("\n")
    fields = {}
    for j, line in enumerate(lines):
        if j == 0:
            # The chunk starts right after the title marker.
            fields["title"] = line.split("<")[0]
            continue
        if address_key in line:
            fields.setdefault(
                "address", line.split(address_key)[1].split("<")[0])
            continue
        if huxing_key in line:
            fields.setdefault("huxing", _collect_layout(lines, j + 1))
            continue
        if url_key in line:
            fields.setdefault("url", line.split(url_key)[1].split("\"")[0])
            continue
        if panel_key in line:
            fields.setdefault("panel", _collect_panel(lines, j + 1))
            continue
        if price_key in line:
            fields.setdefault(
                "price",
                "".join(part.split("<")[0] for part in line.split(">")))
            continue
        if price_around_key in line:
            # The surrounding-price text sits on the following line.
            if j + 1 < len(lines):
                fields.setdefault(
                    "price",
                    "".join(part.split("<")[0].strip()
                            for part in lines[j + 1].split(">")))
            continue
        if tel_key in line:
            fields.setdefault("tel", line.split(tel_key)[1].split("<")[0])
            continue
    return [
        fields.get("title", ""),
        fields.get("price", "售價待定"),   # price not announced yet
        fields.get("huxing", "戶型未知"),  # layout unknown
        fields.get("address", ""),
        fields.get("panel", ""),
        fields.get("url", ""),
        fields.get("tel", "號碼未知"),     # phone number unknown
    ]


def main():
    """Fetch every result page, parse the listings and save the workbook."""
    # Third-party packages are imported here so that the parsing helpers
    # above stay importable without them.
    from bs4 import BeautifulSoup
    import xlrd  # noqa: F401  (retained from the original imports; unused)
    import xlwt

    excel_col = [list(excel_header)]
    for page_number in range(1, html_sheet + 1):
        real_url = url + str(page_number) + "/"
        print(real_url)
        with urlopen(real_url) as response:
            # Name the parser explicitly so results do not depend on which
            # optional parsers happen to be installed.
            anjuke_html = BeautifulSoup(response.read(), "html.parser")
        for chunk in split_listings(anjuke_html):
            excel_col.append(parse_listing(chunk))

    app = xlwt.Workbook()
    sheet1 = app.add_sheet(u'sheet1', cell_overwrite_ok=True)
    for row_index, row in enumerate(excel_col):
        for col_index, value in enumerate(row):
            sheet1.write(row_index, col_index, value)

    os.makedirs(output_dir, exist_ok=True)
    # xlwt produces the legacy binary format, so the file must be ".xls".
    app.save(output_dir + str(city) + ".xls")


if __name__ == "__main__":
    main()
結果展示: