Lianjia Data Scraping + Map-Based House Hunting
阿新 • Published: 2019-02-01
1. Lianjia Data Scraping
(Lianjia caps second-hand housing search results at 100 pages, i.e. at most 3,000 listings per query, so I scrape the results district by district instead.)
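As a quick sanity check on that limit, here is a minimal sketch of the page-count arithmetic behind the hard-coded `pages` list used below, assuming 30 listings per result page (which is what the 100-page / 3,000-result cap implies). The `district_totals` values are hypothetical placeholders; districts that still exceed the cap get split further, e.g. by price band, as the `chaoyang/p1p2p3p4`-style URLs below do:

```python
import math

PER_PAGE = 30    # implied by the cap: 3000 results / 100 pages
PAGE_CAP = 100   # Lianjia stops serving results after page 100

# Hypothetical per-district listing totals, read off the result pages by hand.
district_totals = {"dongcheng": 1100, "xicheng": 1250, "chaoyang": 8000}

for district, total in sorted(district_totals.items()):
    pages = int(math.ceil(total / float(PER_PAGE)))
    if pages > PAGE_CAP:
        # Over the cap even within one district: split the query again,
        # e.g. by price band (the /p1p2p3p4-style URL suffixes).
        print("%s: %d pages -> needs further splitting" % (district, pages))
    else:
        print("%s: %d pages" % (district, pages))
```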
First, collect the URL of each listing's detail page from the search result pages and save them to apartment_url.csv:
```python
# -*- coding: utf-8 -*-
import csv
import random
import urllib2
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

# A pool of User-Agent headers to rotate between requests
hds = [{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
       {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
       {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
       {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
       {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
       {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
       {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
       {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},
       {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
       {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
       {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
       {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
       {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}]

# District search URLs; Chaoyang and Haidian are split by price band (p1, p2, ...)
# to stay under the 100-page cap.
websites = ["dongcheng", "xicheng", "chaoyang/p1p2p3p4", "chaoyang/p5", "chaoyang/p6",
            "chaoyang/p7p8", "haidian/p1p2p3p4p5p6", "haidian/p7p8", "fengtai",
            "shijingshan", "tongzhou", "changping", "daxing", "yizhuangkaifaqu",
            "shunyi", "fangshan", "mentougou", "pinggu", "huairou", "miyun"]
# Number of result pages for each entry in websites
pages = [37, 42, 60, 50, 80, 82, 85, 42, 77, 24, 41, 97, 54, 13, 33, 26, 16, 1, 1, 1]
pages_url = []
detail_pages_url = []


def generate_pages_url():
    """Build the search result page URLs for every district/page combination."""
    for i in range(len(websites)):
        for j in range(1, pages[i] + 1):
            pages_url.append("https://bj.lianjia.com/ershoufang/" + websites[i] + "/pg" + str(j))


def generate_apartments_url():
    """Visit every search page and dump the collected detail URLs to CSV."""
    for url in pages_url:
        url_spider(url)
    apartment_w = open("apartment_url.csv", "a")
    csv_writer = csv.writer(apartment_w, dialect="excel")
    for i in detail_pages_url:
        csv_writer.writerow([i])
    apartment_w.close()


def url_spider(url):
    try:
        req = urllib2.Request(url, headers=hds[random.randint(0, len(hds) - 1)])
        source_code = urllib2.urlopen(req, timeout=5).read()
        plain_text = unicode(source_code)
        soup = BeautifulSoup(plain_text)
    except (urllib2.HTTPError, urllib2.URLError), e:
        print e
        return
    except Exception, e:
        print e
        return
    # Each listing on a search page sits in a <div class="info clear"> block
    cj_list = soup.findAll('div', {'class': 'info clear'})
    for cj in cj_list:
        title = cj.find('div', {'class': 'title'})
        detail_pages_url.append(title.find('a').get('href'))


if __name__ == "__main__":
    # Build the list of all search result pages for Beijing listings on sale (stored in pages_url)
    generate_pages_url()
    # Collect the detail page URL of every listing (stored in detail_pages_url) and save to CSV
    generate_apartments_url()
```

Next, fetch the listing details and the image URL from each detail page:
```python
# -*- coding: utf-8 -*-
import csv
import random
import urllib2
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

# A pool of User-Agent headers to rotate between requests
hds = [{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
       {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
       {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
       {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
       {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
       {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
       {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
       {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},
       {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
       {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
       {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
       {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
       {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}]


def apartment_spider(url):
    print(url)
    try:
        req = urllib2.Request(url, headers=hds[random.randint(0, len(hds) - 1)])
        source_code = urllib2.urlopen(req, timeout=5).read()
        plain_text = unicode(source_code)
        soup = BeautifulSoup(plain_text)
    except (urllib2.HTTPError, urllib2.URLError), e:
        print e
        return
    except Exception, e:
        print e
        return
    # Listing title
    title = soup.find('div', {'class': 'sellDetailHeader'}).find('div', {'class': 'title-wrapper'}).find('div', {'class': 'content'}).find('div', {'class': 'title'}).find('h1').get('title')
    overview = soup.find('div', {'class': 'overview'}).find('div', {'class': 'content'})
    # Image URL (img)
    img_info = soup.find('div', {'class': 'overview'}).find('div', {'class': 'thumbnail'}).find('li')
    if img_info is not None:
        img = img_info.get('data-src')
    else:
        img = ""
    # Total price (total_price)
    total_price = soup.find('div', {'class': 'price'}).find('span', {'class': 'total'}).next_element
    # Unit price (unit_price)
    unit_price = overview.find('div', {'class': 'price'}).find('div', {'class': 'text'}).find('div', {'class': 'unitPrice'}).find('span').next_element
    # Orientation (orientation)
    orientation = overview.find('div', {'class': 'type'}).find('div', {'class': 'mainInfo'}).next_element
    # Floor area (area), in square meters
    area = overview.find('div', {'class': 'area'}).find('div', {'class': 'mainInfo'}).next_element.split('平米')[0]
    year_info = overview.find('div', {'class': 'area'}).find('div', {'class': 'subInfo'}).next_element
    aroundInfo = overview.find('div', {'class': 'aroundInfo'})
    # Community name (name)
    name_info = aroundInfo.find('div', {'class': 'communityName'})
    name = name_info.find('a', {'class': 'info'}).next_element
    name_id = name_info.find('a', {'class': 'info'}).get('href').split('/')[2]
    subway_info = aroundInfo.find('div', {'class': 'areaName'}).find('a', {'class': 'supplement'}).next_element
    place_info = aroundInfo.find('div', {'class': 'areaName'}).find('span', {'class': 'info'}).findAll('a')
    # District (district)
    district = place_info[0].next_element
    # Neighborhood (place)
    place = place_info[1].next_element
    # Listing id (id)
    id = aroundInfo.find('div', {'class': 'houseRecord'}).find('span', {'class': 'info'}).next_element
    introContent = soup.find('div', {'class': 'm-content'}).find('div', {'class': 'box-l'}).find('div', {'id': 'introduction'}).find('div', {'class': 'introContent'})
    room_type = str(introContent.find('div', {'class': 'content'}).findAll('li')[0]).split('</span>')[1].split('</li>')[0]
    time = str(introContent.find('div', {'class': 'transaction'}).findAll('li')[0]).split('</span>')[1].split('</li>')[0]
    row = [title, total_price, unit_price, orientation, district, place, area, name, name_id, time, room_type, id, subway_info]
    csv_writer.writerow(row)
    img_row = [id, img]
    csv_writer2.writerow(img_row)


if __name__ == "__main__":
    # Records the listing details
    apartment_w = open("apartments19000+.csv", "a")
    csv_writer = csv.writer(apartment_w, dialect="excel")
    # Records the image URLs
    img_w = open("img19000+.csv", "a")
    csv_writer2 = csv.writer(img_w, dialect="excel")
    # Read the listing detail URLs collected earlier
    urls = open("19000+.csv", "r")
    for url in urls:
        apartment_spider(url.strip())  # strip the newline left by file iteration
    apartment_w.close()
    img_w.close()
```
Scrape the images:
```python
# -*- coding: utf-8 -*-
import urllib


def img(url, id):
    """Download one image and save it as imgs/<id>.jpg."""
    try:
        response = urllib.urlopen(url)
        get_img = response.read()
    except Exception, e:
        print e
        return
    with open('imgs/' + str(id) + '.jpg', 'wb') as fp:
        fp.write(get_img)
    print(id)


if __name__ == "__main__":
    lines = open("img19000+.csv", "r")
    count = 0
    for line in lines:
        # Each row is "id,url"
        id = line.split(',')[0]
        url = line.split(',')[1].strip()  # strip the trailing newline
        img(url, id)
        count = count + 1
        print count
```

2. Using Kettle to Process the CSV Data and Store It in MySQL
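The Kettle transformation itself (a CSV-input step wired to a table-output step) is built in the Spoon GUI rather than written as code, so it is not reproduced here. As a rough illustration of what that step does, here is a minimal Python sketch that loads the details CSV into MySQL; the table name `apartments`, its column names, and the connection settings are assumptions for illustration, not taken from the original transformation:

```python
# -*- coding: utf-8 -*-
import csv
import MySQLdb  # Python 2 MySQL driver, matching the rest of the scripts

# Assumed connection settings and schema; adjust to your environment.
conn = MySQLdb.connect(host="localhost", user="root", passwd="secret",
                       db="lianjia", charset="utf8")
cur = conn.cursor()

insert_sql = ("INSERT INTO apartments (title, total_price, unit_price, orientation, "
              "district, place, area, name, name_id, time, room_type, id, subway_info) "
              "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

with open("apartments19000+.csv", "rb") as f:  # 'rb' for the Python 2 csv module
    for row in csv.reader(f):
        if len(row) == 13:  # skip malformed rows
            cur.execute(insert_sql, row)

conn.commit()
conn.close()
```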