1. 程式人生 > >爬蟲鏈家網站獲取信息

爬蟲鏈家網站獲取信息

url ping __name__ rsh .com lib pin ensure %d

import re
import json
from urllib.request import urlopen
import ssl
# Disable HTTPS certificate verification (skips the digital-signature certificate check).
ssl._create_default_https_context = ssl._create_unverified_context

ershoufang_url=https://bj.lianjia.com/ershoufang/rs/

def get_html_content(url):
    html=urlopen(url)
    content=html.read().decode(utf-8)
    # print(content)
return content def chuli(content): obj=re.compile(r<span.*?>關註</span></div><div.*?><span></span></div><div.*?><span></span></div><div class="price"><span>(?P<price>.*?)</span>萬</div></a><a.*?>(?P<title>.*?)</a><div class="info">.*?<span>/</span>.*?<span>/</span>(?P<pingmi>.*?)<span>/</span>(?P<fangxiang>.*?)<span>/</span>(?P<zhuangxiu>.*?)</div><div .*?>(?:<span .*?>.*?</span>)?<span.*?>(?P<fangben>.*?)</span>
,re.S) it=obj.finditer(content) for el in it: yield { 價格::el.group(price)+, 房屋信息::el.group(title), 平米數::el.group(pingmi), 朝向:el.group(fangxiang), 裝修::el.group(zhuangxiu).replace(<span>/</span>
,,), 房本信息::el.group(fangben).replace(隨時看房,無信息).replace(關註,無信息), } def xieru(jieguo): txt=json.dumps(jieguo,ensure_ascii=False) with open(houseInfo,mode=a,encoding=utf-8)as f: f.write(txt+\n) def main(): for i in range(1,101): if i ==1: new_content = get_html_content(ershoufang_url) else: dong_url=https://bj.lianjia.com/ershoufang/pg%d/%i new_content = get_html_content(dong_url) ret = chuli(new_content) for el in ret: xieru(el) print(el) if __name__==__main__: main()

爬蟲鏈家網站獲取信息