House Price Analysis with Big Data -- 1. Data Crawling
Posted by 阿新 on 2019-02-03
The data was scraped with Python 2.6 and the Scrapy crawler framework. My first version was a whole-site crawler that could start from a single seed URL and collect house price listings for every city on 58.com (58同城). The problem with that approach is that it needs proxy IPs, otherwise the crawler gets banned very quickly. My workaround was to run the crawler on a Linux box once every five minutes, scraping only one city's listings per run. The code is as follows.
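Both the spider and the pipeline read a number of project-specific keys from the Scrapy settings (MONGO_IP, MONGO_PORT, CITY_DB, CITY_COL, CITY, PROVIENCE, MONGO_DB, PROXY_DB, POOL_NAME, ALL). The original settings.py is not part of the post, so the sketch below is only a guess at what it might look like: the key names come from the code that follows, but every concrete value is an assumption. The "one city every five minutes" scheduling itself lives outside Scrapy, for example as a cron job on the Linux machine that runs scrapy crawl ershoufang every five minutes.

# settings.py -- a minimal sketch; only the key names are taken from the
# spider/pipeline code below, every concrete value here is an assumption.
BOT_NAME = 'ershoufang'

MONGO_IP = '127.0.0.1'     # MongoDB host
MONGO_PORT = 27017         # MongoDB port

CITY_DB = 'cities'         # database holding the list of 58.com cities
CITY_COL = 'city_list'     # documents: {_id, city, cityhost, provience, status}

CITY = ''                  # set to a city name to crawl only that city
PROVIENCE = ''             # optionally narrow the city lookup by province

MONGO_DB = 'house'         # database the scraped items are written to
ALL = 'ershoufang'         # collection that receives all house items
PROXY_DB = 'proxy'         # proxy-pool database
POOL_NAME = 'pool'         # proxy-pool collection

ITEM_PIPELINES = {
    'ershoufang.pipelines.ErshoufangPipeline': 300,
}

DOWNLOAD_DELAY = 1         # be polite when crawling without a proxy pool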
1.spiders
#encoding=utf-8
import sys
sys.path.append("..")
import re
import time

import pymongo
import scrapy
from lxml import html
from scrapy.spiders import Spider
from scrapy.utils.project import get_project_settings

from plug.utils import StringUtil, NumberUtil
from ershoufang.items import HouseItem


class erShouSpider(Spider):
    name = "ershoufang"
    allowed_domains = ["58.com"]

    def __init__(self):
        super(erShouSpider, self).__init__()
        self.settings = get_project_settings()
        self.client = pymongo.MongoClient(
            self.settings['MONGO_IP'],
            self.settings['MONGO_PORT'])
        self.cities_db = self.client[self.settings['CITY_DB']]
        self.cities_Col = self.cities_db[self.settings['CITY_COL']]
        self.fillUrl = ""
        self.cityhost = ""
        self.city = ""

    def get_specify_request(self):
        # Build the start request for the city named in the settings.
        condition = {"city": self.settings['CITY']}
        if self.settings['PROVIENCE'] and self.settings['PROVIENCE'] != "":
            condition = {"city": self.settings['CITY'],
                         "provience": self.settings['PROVIENCE']}
        content = self.cities_Col.find_one(condition)
        self.cityhost = content['cityhost']
        self.fillUrl = "http://%s.58.com/ershoufang/" % self.cityhost
        self.city = content["_id"]
        return [scrapy.Request(self.fillUrl)]

    def get_sequence_request(self):
        # Crawl the cities in sequence: pick the next city whose status is
        # still False, mark it as done, and start from its listing page.
        requests = []
        if self.cities_Col.count({"status": False}) <= 0:
            print("all cities done, resetting status to False")
            self.cities_Col.update({}, {"$set": {"status": False}}, multi=True)
        content = self.cities_Col.find_one({"status": False})
        self.cities_Col.update({"_id": content["_id"]}, {"$set": {"status": True}})
        self.client.close()
        self.cityhost = content['cityhost']
        self.fillUrl = "http://%s.58.com/ershoufang/" % self.cityhost
        self.city = content["_id"]
        requests.append(scrapy.Request(self.fillUrl))
        return requests

    def start_requests(self):
        if self.settings['CITY'] and self.settings['CITY'] != '':
            return self.get_specify_request()
        else:
            return self.get_sequence_request()

    def parseUrls(self, doc):
        # Keep only pagination links of the current city's listing pages.
        links = doc.xpath(".//a/@href")
        urls = []
        for link in links:
            if StringUtil.filtString(self.fillUrl + r"pn\d+?/", link):
                urls.append(link)
        return urls

    def parseItems(self, doc, url):
        houselist = doc.xpath(".//ul[@class='house-list-wrap']//div[@class='list-info']")
        items = []
        for houseinfo in houselist:
            detailurl = houseinfo.xpath(".//h2[1]/a/@href")
            title = "".join(houseinfo.xpath(".//h2[1]/a/text()"))
            roomNum = "".join(houseinfo.xpath(".//p[1]/span[1]/text()")[0].split())
            size = "".join(houseinfo.xpath(".//p[1]/span[2]/text()"))
            orient = "".join(houseinfo.xpath(".//p[1]/span[3]/text()"))
            floor = "".join(houseinfo.xpath(".//p[1]/span[4]/text()"))
            address = "".join(("".join(houseinfo.xpath(".//p[2]/span[1]//a/text()"))).split())
            sumprice = "".join(houseinfo.xpath("./following-sibling::div[1]//p[@class='sum']/b/text()"))
            unitprice = "".join(houseinfo.xpath("./following-sibling::div[@class='price']//p[@class='unit']/text()"))
            items.append(HouseItem(
                _id="".join(detailurl),
                title=title,
                roomNum=roomNum,
                size=NumberUtil.fromString(size),
                orient=orient,
                floor=floor,
                address=address,
                sumPrice=NumberUtil.fromString(sumprice),
                unitPrice=NumberUtil.fromString(unitprice),
                city=self.city,
                fromUrl=url,
                nowTime=time.time(),
                status="SUBSPENDING"))
        return items

    def parse(self, response):
        if not response.body:
            return
        doc = html.fromstring(response.body.decode("utf-8"))
        urls = self.parseUrls(doc)
        items = self.parseItems(doc, response.url)
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)
        for item in items:
            yield item
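StringUtil.filtString and NumberUtil.fromString are imported from a separate plug package that the post never shows. Judging only from the call sites, the first tests a URL against a regular expression and the second pulls a numeric value out of text such as "89.5㎡" or "120萬". A minimal stand-in, which is an assumption and not the author's actual implementation, could look like this:

# plug/utils.py -- minimal stand-ins for the helpers imported by the spider.
# Inferred from the call sites above; the original implementation is not shown.
import re


class StringUtil(object):
    @staticmethod
    def filtString(pattern, text):
        # True if the text matches the given regular expression from the start.
        return re.match(pattern, text) is not None


class NumberUtil(object):
    @staticmethod
    def fromString(text):
        # Extract the first number found in the text; 0 if there is none.
        match = re.search(r"\d+(?:\.\d+)?", text)
        return float(match.group()) if match else 0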
2.items
import scrapy


class HouseItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    roomNum = scrapy.Field()
    size = scrapy.Field()
    orient = scrapy.Field()
    floor = scrapy.Field()
    address = scrapy.Field()
    sumPrice = scrapy.Field()      # total price of the listing
    unitPrice = scrapy.Field()     # price per square metre
    _id = scrapy.Field()           # detail-page URL, doubles as the MongoDB primary key
    imageurl = scrapy.Field()
    fromUrl = scrapy.Field()       # listing page the item was parsed from
    city = scrapy.Field()
    nowTime = scrapy.Field()       # crawl timestamp
    status = scrapy.Field()
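The pipeline in the next section also imports a ProxyItem from ershoufang.items, which the post does not show. It presumably describes an entry for the proxy pool; a minimal placeholder, whose field names are purely an assumption, would be:

class ProxyItem(scrapy.Item):
    # Placeholder for the proxy-pool item imported by the pipeline below.
    # Field names are an assumption; the original definition is not shown.
    ip = scrapy.Field()
    port = scrapy.Field()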
3.pipelines
#coding: utf-8
import codecs
import json

import pymongo
from scrapy.utils.project import get_project_settings

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from ershoufang.items import ProxyItem


class ErshoufangPipeline(object):
    def __init__(self):
        self.settings = get_project_settings()
        self.client = pymongo.MongoClient(
            host=self.settings['MONGO_IP'],
            port=self.settings['MONGO_PORT'])
        self.db = self.client[self.settings['MONGO_DB']]
        self.proxyclient = self.client[self.settings['PROXY_DB']][self.settings['POOL_NAME']]
        self.itemNumber = 0

    def process_proxy(self, item):
        self.proxyclient.insert(dict(item))

    def process_item(self, item, spider):
        # Proxy items go into the proxy pool; everything else is a house record.
        if isinstance(item, ProxyItem):
            self.process_proxy(item)
            return item
        try:
            if not item['address']:
                print(item["fromUrl"] + " page looks broken, skipping")
                return item
            # (originally commented out) remove stale records for this listing:
            # if self.db.ershoufang.count({"_id": item["_id"], "city": item['city']}) <= 0:
            #     print("delete")
            #     self.db.ershoufang.remove({"_id": item["_id"]})
            coll = self.db[self.settings['ALL']]
            coll.insert(dict(item))
            self.itemNumber += 1
            print("scraped house #%s, address: %s" % (self.itemNumber, item['address']))
        except Exception as e:
            # _id is the detail URL, so inserting a duplicate raises here.
            print("house already exists: " + item['address'])
        return item

    def close_spider(self, spider):
        self.client.close()
        print("this run collected %s house records in total" % self.itemNumber)
The crawler ran for three days and collected more than two million records; the results are shown below.