爬蟲08-鏈家
阿新 • • 發佈:2018-11-01
"""Scrape second-hand housing listings from sh.lianjia.com.

For every list page in a user-supplied page range the script prints, per
listing, its title and the label/value pairs found on the listing's detail
page, followed by the total prices, unit prices, community names and the
remaining house-info text found on the list page itself.
"""
import re
from typing import List, Optional, Tuple

LIST_URL_PREFIX = 'https://sh.lianjia.com/ershoufang/pg'

# Seconds to wait for a response before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# All patterns are compiled once here instead of on every loop iteration.
LISTING_BLOCK_RE = re.compile(
    '(?<=<div class="info clear">).*?(?=</div>)', re.S)
TITLE_RE = re.compile(r'(?<=data-sl=\"\">).*?(?=</a>)')
HREF_RE = re.compile(r'(?<=href=\").*?(?=\")')
DETAIL_PAIR_RE = re.compile('<span class="label">(.*?)</span>(.*?)</li>')
TOTAL_PRICE_RE = re.compile(
    r'<div class="totalPrice"><span>(.*?)</span>', re.S)
UNIT_PRICE_RE = re.compile(r'<div class="unitPrice" .*?<span>(.*?)</span>')
COMMUNITY_RE = re.compile(
    r'<div class="houseInfo">.*?region\">(.*?)</a>', re.S)
HOUSE_DETAILS_RE = re.compile(
    r'<div class=\"houseInfo\">.*?</a>(.*?)</div>', re.S)


def fetch_html(url: str, timeout: float = REQUEST_TIMEOUT) -> str:
    """Download *url* and return its body decoded as UTF-8.

    Undecodable bytes are dropped. Network errors and timeouts raised by
    ``requests`` propagate to the caller.
    """
    # Imported here so the pure parsing helpers in this module can be
    # imported (and tested) without the third-party dependency installed.
    import requests

    return requests.get(url, timeout=timeout).content.decode('utf-8', 'ignore')


def extract_listing_blocks(html: str) -> List[str]:
    """Return the inner HTML of each ``info clear`` div on a list page.

    The last match is discarded, as the original script did.
    NOTE(review): presumably the last match is not a real listing - confirm.
    Slicing (rather than ``pop()``) keeps this safe when nothing matched.
    """
    return LISTING_BLOCK_RE.findall(html)[:-1]


def parse_listing(block: str) -> Optional[Tuple[str, str]]:
    """Return ``(title, href)`` for one listing block.

    Returns ``None`` when either the title or the link is missing, so the
    caller can skip the block instead of failing with ``IndexError``.
    """
    title = TITLE_RE.search(block)
    href = HREF_RE.search(block)
    if title is None or href is None:
        return None
    return title.group(0), href.group(0)


def extract_detail_pairs(detail_html: str) -> List[Tuple[str, str]]:
    """Return the ``(label, value)`` pairs found on a listing detail page."""
    return DETAIL_PAIR_RE.findall(detail_html)


def extract_total_prices(html: str) -> List[str]:
    """Return each total price on a list page with the '萬' suffix added."""
    return [price + '萬' for price in TOTAL_PRICE_RE.findall(html)]


def extract_unit_prices(html: str) -> List[str]:
    """Return the unit-price text of every listing on a list page."""
    return UNIT_PRICE_RE.findall(html)


def extract_communities(html: str) -> List[str]:
    """Return the community (region link) name of every listing."""
    return COMMUNITY_RE.findall(html)


def extract_house_details(html: str) -> List[str]:
    """Return the house-info text that follows each community link.

    Per the original comment this text holds layout, area, orientation,
    decoration and elevator information.
    """
    return HOUSE_DETAILS_RE.findall(html)


def scrape_page(page: int) -> None:
    """Fetch one list page and print everything extracted from it."""
    html = fetch_html(LIST_URL_PREFIX + str(page))

    for block in extract_listing_blocks(html):
        listing = parse_listing(block)
        if listing is None:
            continue
        title, href = listing
        # The original extracted the title but never emitted it.
        print(title)
        # Data from the second (detail) page.
        print(extract_detail_pairs(fetch_html(href)))

    # NOTE(review): the original source was collapsed onto a single line;
    # these page-level sections are assumed to sit outside the per-listing
    # loop because they only read the list-page HTML - confirm.

    # Total price (the original built this string but never printed it).
    for total_price in extract_total_prices(html):
        print(total_price)

    # Unit price.
    for unit_price in extract_unit_prices(html):
        print(unit_price)

    # Community name.
    for community in extract_communities(html):
        print(community)

    # Layout, area, orientation, decoration, elevator.
    for details in extract_house_details(html):
        print(details)


def main() -> None:
    """Prompt for a page range and scrape every page in it (inclusive)."""
    start = int(input('起始頁碼:'))
    end = int(input('終止頁碼:'))
    for page in range(start, end + 1):
        scrape_page(page)


if __name__ == '__main__':
    main()