1. 程式人生 > 爬蟲08-鏈家

爬蟲08-鏈家

import requests
import re

# Scrape second-hand housing listings from sh.lianjia.com for a page range
# entered by the user, printing the fields extracted from each index page
# and from every listing's detail page.

# Seconds to wait for each HTTP response. Without a timeout, requests.get()
# can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Every pattern is compiled once here instead of on each loop iteration.
# Body of each '<div class="info clear">' listing block on an index page.
listing_re = re.compile('(?<=<div class="info clear">).*?(?=</div>)', re.S | re.M)
# First href attribute value inside a listing block (link to the detail page).
href_re = re.compile(r'(?<=href=\").*?(?=\")')
# (label, value) pairs on the detail page.
detail_re = re.compile('<span class="label">(.*?)</span>(.*?)</li>')
# Total price.
total_price_re = re.compile(r'<div class="totalPrice"><span>(.*?)</span>', re.S | re.M)
# Unit price.
unit_price_re = re.compile(r'<div class="unitPrice" .*?<span>(.*?)</span>')
# Residential community name.
community_re = re.compile(r'<div class="houseInfo">.*?region\">(.*?)</a>', re.S | re.M)
# Layout, area, orientation, decoration, elevator (text following the name).
house_info_re = re.compile(r'<div class=\"houseInfo\">.*?</a>(.*?)</div>', re.S | re.M)


def _fetch_html(url):
    """Download *url* and return its body decoded as UTF-8 (bad bytes dropped)."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return response.content.decode('utf-8', 'ignore')


start = int(input('起始頁碼:'))
end = int(input('終止頁碼:'))
for page in range(start, end + 1):
    html = _fetch_html('https://sh.lianjia.com/ershoufang/pg' + str(page))

    # The original code discarded the last match before iterating
    # (NOTE(review): presumably it is not a real listing - confirm).
    # Slicing keeps that behaviour but, unlike list.pop(), does not raise
    # when the page produced no matches at all.
    for listing in listing_re.findall(html)[:-1]:
        href_match = href_re.search(listing)
        if href_match is None:
            # No link in this block: skip it rather than crash on [0].
            continue
        # Data from the second (detail) page.
        detail_html = _fetch_html(href_match.group(0))
        print(detail_re.findall(detail_html))

    # Total price. The original built this string and then dropped it;
    # print it like every other extracted field.
    for total in total_price_re.findall(html):
        print(total + '萬')

    # Unit price.
    for unit in unit_price_re.findall(html):
        print(unit)

    # Residential community name.
    for house in community_re.findall(html):
        print(house)

    # Layout, area, orientation, decoration, elevator.
    for info in house_info_re.findall(html):
        print(info)