爬蟲鏈家網站獲取信息
阿新 • • 發佈:2018-10-14
url ping __name__ rsh .com lib pin ensure %d
"""Scrape second-hand housing listings from bj.lianjia.com into a JSON-lines file."""
import json
import re
import ssl
from urllib.request import urlopen

# Skip TLS certificate verification for every HTTPS request urllib makes in
# this process.
# NOTE(review): this disables protection against man-in-the-middle attacks
# process-wide; prefer a verified context if the site's certificate validates.
ssl._create_default_https_context = ssl._create_unverified_context

ershoufang_url = 'https://bj.lianjia.com/ershoufang/rs/'

# One listing card of the result page, compiled once and reused by chuli().
# re.S lets '.' span newlines, so a card may be spread over several lines.
# NOTE(review): the Chinese literals below (and in chuli) are Traditional
# characters; presumably the live site serves Simplified text, in which case
# this pattern would never match -- confirm against a real response.
_LISTING_RE = re.compile(
    r'<span.*?>關註</span></div>'
    r'<div.*?><span></span></div>'
    r'<div.*?><span></span></div>'
    r'<div class="price"><span>(?P<price>.*?)</span>萬</div></a>'
    r'<a.*?>(?P<title>.*?)</a>'
    r'<div class="info">.*?<span>/</span>.*?<span>/</span>'
    r'(?P<pingmi>.*?)<span>/</span>'
    r'(?P<fangxiang>.*?)<span>/</span>'
    r'(?P<zhuangxiu>.*?)</div>'
    r'<div .*?>(?:<span .*?>.*?</span>)?'
    r'<span.*?>(?P<fangben>.*?)</span>',
    re.S,
)


def get_html_content(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page content as a ``str``.
    """
    # The context manager closes the response even if read/decode raises.
    with urlopen(url) as response:
        return response.read().decode('utf-8')


def chuli(content):
    """Yield one dict per listing matched in *content*.

    Args:
        content: HTML text of a result page.

    Yields:
        A dict holding the captured price, title, area, orientation,
        decoration and tag text of each listing, in document order.
    """
    for match in _LISTING_RE.finditer(content):
        # The decoration capture can still contain '<span>/</span>'
        # separators; flatten them to commas.
        decoration = match.group('zhuangxiu').replace('<span>/</span>', ',')
        # When the captured tag is one of these two fixed labels, report it
        # as "no information" instead.
        deed = (
            match.group('fangben')
            .replace('隨時看房', '無信息')
            .replace('關註', '無信息')
        )
        yield {
            '價格:': match.group('price') + '萬',
            '房屋信息:': match.group('title'),
            '平米數:': match.group('pingmi'),
            '朝向': match.group('fangxiang'),
            '裝修:': decoration,
            '房本信息:': deed,
        }


def xieru(jieguo):
    """Append *jieguo* to the ``houseInfo`` file as a single JSON line.

    Args:
        jieguo: JSON-serialisable record, e.g. a dict yielded by chuli().
    """
    # ensure_ascii=False keeps the Chinese text readable in the output file.
    txt = json.dumps(jieguo, ensure_ascii=False)
    with open('houseInfo', mode='a', encoding='utf-8') as f:
        f.write(txt + '\n')


def main():
    """Fetch result pages 1-100, then store and print every listing found."""
    for i in range(1, 101):
        # Page 1 is requested through the base URL; later pages use /pgN/.
        if i == 1:
            page_url = ershoufang_url
        else:
            page_url = 'https://bj.lianjia.com/ershoufang/pg%d/' % i
        for el in chuli(get_html_content(page_url)):
            xieru(el)
            print(el)


if __name__ == '__main__':
    main()
爬蟲鏈家網站獲取信息