
Lianjia Data Scraping + Map-Based House Hunting

Part 1: Scraping Lianjia Data

(Lianjia caps any second-hand-home search at 100 result pages, i.e. at most 3,000 listings per query, so I crawl the results one district at a time instead.)

First, collect the URLs of the individual listing detail pages from the search result pages and save them to apartment_url.csv.

# -*- coding: utf-8 -*-
import csv
import random
import urllib2
from bs4 import BeautifulSoup

import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2: make unicode() default to UTF-8

# A pool of User-Agent headers; each request picks one at random
hds=[{'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},\
    {'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},\
    {'User-Agent':'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},\
    {'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},\
    {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},\
    {'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},\
    {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},\
    {'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},\
    {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},\
    {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},\
    {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},\
    {'User-Agent':'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},\
    {'User-Agent':'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}]


# District slugs for the search URL; Chaoyang and Haidian alone exceed the
# 100-page cap, so they are further split by price band (p1..p8).
websites = ["dongcheng","xicheng","chaoyang/p1p2p3p4","chaoyang/p5","chaoyang/p6","chaoyang/p7p8","haidian/p1p2p3p4p5p6","haidian/p7p8","fengtai","shijingshan","tongzhou","changping","daxing","yizhuangkaifaqu","shunyi","fangshan","mentougou","pinggu","huairou","miyun"]

# Result-page count for each slug above (same order, as observed at crawl time).
pages = [37,42,60,50,80,82,85,42,77,24,41,97,54,13,33,26,16,1,1,1]

pages_url = []         # all search-result page URLs

detail_pages_url = []  # detail-page URLs harvested from the result pages

def generate_pages_url():
    # One /pgN URL per result page per district slug.
    for i in range(len(websites)):
        for j in range(1,pages[i]+1):
            pages_url.append("https://bj.lianjia.com/ershoufang/"+websites[i] + "/pg" + str(j))

def generate_apartments_url():
    # Visit every result page and harvest the detail-page URLs...
    for url in pages_url:
        url_spider(url)

    # ...then write them to apartment_url.csv, one URL per row.
    apartment_w = open("apartment_url.csv", "a")
    csv_writer = csv.writer(apartment_w, dialect = "excel")
    for i in detail_pages_url:
        csv_writer.writerow([i])
    apartment_w.close()

def url_spider(url):
    try:
        req = urllib2.Request(url,headers=hds[random.randint(0,len(hds)-1)])
        source_code = urllib2.urlopen(req,timeout=5).read()
        plain_text = unicode(source_code, 'utf-8', 'ignore')
        soup = BeautifulSoup(plain_text, 'html.parser')
    except (urllib2.HTTPError, urllib2.URLError), e:
        print e
        return
    except Exception, e:
        print e
        return

    # Each listing on a result page sits in a <div class="info clear">; the
    # detail-page URL is the href of the <a> inside its title div.
    cj_list = soup.findAll('div',{'class':'info clear'})
    for cj in cj_list:
        title = cj.find('div',{'class':'title'})
        detail_pages_url.append(title.find('a').get('href'))

if __name__=="__main__":

    # Build the list of all search-result pages for second-hand homes on
    # sale in Beijing (stored in pages_url).
    generate_pages_url()

    # Harvest every detail-page URL (stored in detail_pages_url) and dump
    # them to apartment_url.csv.
    generate_apartments_url()
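
The script above fires its requests back-to-back, and sites like Lianjia tend to throttle clients that look automated. One common mitigation, not in the original code, is to sleep a random interval before each fetch. A minimal sketch reusing the hds pool above; polite_fetch and the delay bounds are hypothetical names and values of mine:

import random
import time
import urllib2

def polite_fetch(url, min_delay=0.5, max_delay=2.0):
    # Hypothetical helper: pause for a random interval so the crawl
    # does not arrive as one uninterrupted burst of requests.
    time.sleep(random.uniform(min_delay, max_delay))
    req = urllib2.Request(url, headers=hds[random.randint(0, len(hds) - 1)])
    return urllib2.urlopen(req, timeout=5).read()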

Next, fetch each detail page and extract the listing's attributes along with the URL of its thumbnail image.

# -*- coding: utf-8 -*-
import csv
import random
import urllib2
from bs4 import BeautifulSoup

import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2: make unicode() default to UTF-8

# A pool of User-Agent headers; each request picks one at random
hds=[{'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},\
    {'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},\
    {'User-Agent':'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},\
    {'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},\
    {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},\
    {'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},\
    {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},\
    {'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},\
    {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},\
    {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},\
    {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},\
    {'User-Agent':'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},\
    {'User-Agent':'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}]

def apartment_spider(url):
    print(url)
    try:
        req = urllib2.Request(url,headers=hds[random.randint(0,len(hds)-1)])
        source_code = urllib2.urlopen(req,timeout=5).read()
        plain_text = unicode(source_code, 'utf-8', 'ignore')
        soup = BeautifulSoup(plain_text, 'html.parser')
    except (urllib2.HTTPError, urllib2.URLError), e:
        print e
        return
    except Exception, e:
        print e
        return

    # Listing title
    title = soup.find('div',{'class':'sellDetailHeader'}).find('div',{'class':'title-wrapper'}).find('div',{'class':'content'}).find('div',{'class':'title'}).find('h1').get('title')

    overview = soup.find('div',{'class':'overview'}).find('div',{'class':'content'})

    # Thumbnail image URL (some listings have no photo)
    img_info = soup.find('div',{'class':'overview'}).find('div',{'class':'thumbnail'}).find('li')
    if img_info is not None:
        img = img_info.get('data-src')
    else:
        img = ""

    # Total price
    total_price = soup.find('div',{'class':'price'}).find('span',{'class':'total'}).next_element

    # Price per square meter
    unit_price = overview.find('div',{'class':'price'}).find('div',{'class':'text'}).find('div',{'class':'unitPrice'}).find('span').next_element

    # Orientation (e.g. south-facing)
    orientation = overview.find('div',{'class':'type'}).find('div',{'class':'mainInfo'}).next_element

    # Floor area; strip the trailing "平米" (square meters) suffix
    area = overview.find('div',{'class':'area'}).find('div',{'class':'mainInfo'}).next_element.split('平米')[0]

    # Construction-year line shown under the area block
    year_info = overview.find('div',{'class':'area'}).find('div',{'class':'subInfo'}).next_element

    aroundInfo = overview.find('div',{'class':'aroundInfo'})

    # Community (residential compound) name and its Lianjia id
    name_info = aroundInfo.find('div',{'class':'communityName'})

    name = name_info.find('a',{'class':'info'}).next_element
    name_id = name_info.find('a',{'class':'info'}).get('href').split('/')[2]

    # Nearby-subway note
    subway_info = aroundInfo.find('div',{'class':'areaName'}).find('a',{'class':'supplement'}).next_element

    place_info = aroundInfo.find('div',{'class':'areaName'}).find('span',{'class':'info'}).findAll('a')
    # District
    district = place_info[0].next_element

    # Neighborhood within the district
    place = place_info[1].next_element

    # Lianjia house-record id
    id = aroundInfo.find('div',{'class':'houseRecord'}).find('span',{'class':'info'}).next_element
    introContent = soup.find('div',{'class':'m-content'}).find('div',{'class':'box-l'}).find('div',{'id':'introduction'}).find('div',{'class':'introContent'})

    # Room layout and listing date, pulled out of the raw HTML by
    # splitting on the label markup
    room_type = str(introContent.find('div',{'class':'content'}).findAll('li')[0]).split('</span>')[1].split('</li>')[0]

    time = str(introContent.find('div',{'class':'transaction'}).findAll('li')[0]).split('</span>')[1].split('</li>')[0]

    # print "title: " + title
    # print "img: " + img
    # print "price: " + total_price
    # print "unit price: " + unit_price
    # print "orientation: " + orientation
    # print district
    # print place
    # print "area: " + area
    # print "name: " + name
    # print "name id: " + name_id
    # print time
    # print room_type
    # print "id: " + id
    # print "subway info:" + subway_info



    # One CSV row per listing
    row = [title, total_price, unit_price, orientation, district, place, area, name, name_id, time,
    room_type, id, subway_info]

    csv_writer.writerow(row)

    # Image URLs go to a second CSV, keyed by house id
    img_row = [id, img]
    csv_writer2.writerow(img_row)

if __name__=="__main__":

    # CSV for the listing details
    apartment_w = open("apartments19000+.csv", "a")
    csv_writer = csv.writer(apartment_w, dialect = "excel")

    # CSV for the image URLs
    img_w = open("img19000+.csv", "a")
    csv_writer2 = csv.writer(img_w, dialect = "excel")

    # Detail-page URLs collected by the previous script
    urls = open("19000+.csv", "r")

    for url in urls:
        # Each line ends with a newline; strip it before requesting.
        apartment_spider(url.strip())

    urls.close()
    apartment_w.close()
    img_w.close()
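
Every chained find in apartment_spider assumes the detail page still matches Lianjia's layout at the time of writing; since the try/except only guards the fetch, a single delisted or redesigned page raises AttributeError and aborts the whole crawl. A more defensive version of the main loop, my addition rather than part of the original script:

for url in urls:
    try:
        apartment_spider(url.strip())
    except AttributeError, e:
        # The page no longer matches the expected layout (e.g. the
        # listing was taken down mid-crawl); skip it and keep going.
        print e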

Finally, download the images from the URLs saved in img19000+.csv.

# -*- coding: utf-8 -*-
import urllib

def img(url,id):
    # Download one image and save it as imgs/<house id>.jpg
    # (the imgs/ directory must already exist).
    try:
        response = urllib.urlopen(url)
        get_img = response.read()
    except Exception, e:
        print e
        return
    with open('imgs/'+str(id)+'.jpg','wb') as fp:
        fp.write(get_img)
        print(id)
    return

if __name__=="__main__":

    lines = open("img19000+.csv", "r")

    count = 0

    for line in lines:
        # Each row is "<house id>,<image url>"; strip the trailing
        # newline from the URL before requesting it.
        id = line.split(',')[0]
        url = line.split(',')[1].strip()
        img(url,id)
        count = count + 1
        print count
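
Two easy ways for this downloader to fail: the imgs/ directory may not exist yet, and listings without a photo were recorded with an empty URL. A small guard around the loop, my addition rather than part of the original script, covers both:

import os

if not os.path.isdir('imgs'):
    os.makedirs('imgs')  # create the output directory on the first run

for line in lines:
    id, url = line.strip().split(',')[:2]
    if url:  # skip listings that were saved with an empty image URL
        img(url, id)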
Part 2: Using Kettle to Process the CSV Data and Load It into MySQL