
House Price Analysis Based on Big Data -- 1. Data Scraping

The data is scraped with Python 2.6 and the Scrapy framework. I first wrote a site-wide crawler that, starting from a single seed URL, could collect the housing price listings of every city on 58.com. The problem is that this approach requires proxy IPs, otherwise the crawler gets banned very quickly. My workaround is to run the crawler on Linux once every five minutes, scraping only one city's listings per run (a cron entry for this is sketched below). The code is as follows.
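A simple way to trigger the crawler every five minutes is a crontab entry like the one below. This is only a sketch of the scheduling setup, which the post does not show; the project directory and log path are placeholders, and only the spider name "ershoufang" comes from the code that follows.

# run the ershoufang spider every 5 minutes and append its output to a log
*/5 * * * * cd /path/to/ershoufang && scrapy crawl ershoufang >> /tmp/ershoufang.log 2>&1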

1.spiders

#encoding=utf-8
import sys
sys.path.append("..")

import re
import time

import pymongo
import scrapy
from scrapy.spiders import Spider
from scrapy.utils.project import get_project_settings
from lxml import html

from plug.utils import StringUtil, NumberUtil
from ershoufang.items import HouseItem


class erShouSpider(Spider):
    name = "ershoufang"
    allowed_domains = ["58.com"]

    def __init__(self):
        super(erShouSpider, self).__init__()
        self.settings = get_project_settings()
        self.client = pymongo.MongoClient(
            self.settings['MONGO_IP'], self.settings['MONGO_PORT'])
        self.cities_db = self.client[self.settings['CITY_DB']]
        self.cities_Col = self.cities_db[self.settings['CITY_COL']]
        self.fillUrl = ""
        self.cityhost = ""
        self.city = ""

    def get_specify_request(self):
        # Build the start request for the city specified in the settings
        condition = {"city": self.settings['CITY']}
        if self.settings['PROVIENCE'] and self.settings['PROVIENCE'] != "":
            condition = {"city": self.settings['CITY'],
                         "provience": self.settings['PROVIENCE']}
        content = self.cities_Col.find_one(condition)
        self.cityhost = content['cityhost']
        self.fillUrl = "http://%s.58.com/ershoufang/" % self.cityhost
        self.city = content["_id"]
        return [scrapy.Request(self.fillUrl)]

    def get_sequence_request(self):
        # Crawl the cities one by one: pick the next city whose status is
        # False, mark it as done, and start from its listing page
        requests = []
        if self.cities_Col.count({"status": False}) <= 0:
            print("all cities crawled, resetting status to False")
            self.cities_Col.update({}, {"$set": {"status": False}}, multi=True)
        content = self.cities_Col.find_one({"status": False})
        self.cities_Col.update({"_id": content["_id"]},
                               {"$set": {"status": True}})
        self.client.close()
        self.cityhost = content['cityhost']
        self.fillUrl = "http://%s.58.com/ershoufang/" % self.cityhost
        self.city = content["_id"]
        requests.append(scrapy.Request(self.fillUrl))
        return requests

    def start_requests(self):
        if self.settings['CITY'] and self.settings['CITY'] != '':
            return self.get_specify_request()
        else:
            return self.get_sequence_request()

    def parseUrls(self, doc):
        # Collect the pagination links (http://<city>.58.com/ershoufang/pnN/)
        links = doc.xpath(".//a/@href")
        urls = []
        for link in links:
            if StringUtil.filtString(self.fillUrl + r"pn\d+?/", link):
                urls.append(link)
        return urls

    def parseItems(self, doc, url):
        # Extract one HouseItem per listing block on the page
        houselist = doc.xpath(".//ul[@class='house-list-wrap']//div[@class='list-info']")
        items = []
        for houseinfo in houselist:
            detailurl = houseinfo.xpath(".//h2[1]/a/@href")
            title = "".join(houseinfo.xpath(".//h2[1]/a/text()"))
            roomNum = "".join(houseinfo.xpath(".//p[1]/span[1]/text()")[0].split())
            size = "".join(houseinfo.xpath(".//p[1]/span[2]/text()"))
            orient = "".join(houseinfo.xpath(".//p[1]/span[3]/text()"))
            floor = "".join(houseinfo.xpath(".//p[1]/span[4]/text()"))
            address = "".join(("".join(houseinfo.xpath(".//p[2]/span[1]//a/text()"))).split())
            sumprice = "".join(houseinfo.xpath("./following-sibling::div[1]//p[@class='sum']/b/text()"))
            unitprice = "".join(houseinfo.xpath("./following-sibling::div[@class='price']//p[@class='unit']/text()"))
            items.append(HouseItem(
                _id="".join(detailurl),
                title=title,
                roomNum=roomNum,
                size=NumberUtil.fromString(size),
                orient=orient,
                floor=floor,
                address=address,
                sumPrice=NumberUtil.fromString(sumprice),
                unitPrice=NumberUtil.fromString(unitprice),
                city=self.city,
                fromUrl=url,
                nowTime=time.time(),
                status="SUBSPENDING"))
        return items

    def parse(self, response):
        if not response.body:
            return
        doc = html.fromstring(response.body.decode("utf-8"))
        urls = self.parseUrls(doc)
        items = self.parseItems(doc, response.url)
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)
        for item in items:
            yield item
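The spider imports StringUtil and NumberUtil from my own plug.utils module, which is not included in this post. Judging from how they are called above, they are a small regex matcher and a number extractor; the sketch below is an assumed reimplementation, not the original code.

# plug/utils.py -- hypothetical sketch of the helpers used by the spider
import re

class StringUtil(object):
    @staticmethod
    def filtString(pattern, text):
        # True if `text` matches the regex `pattern` (used to keep pagination links)
        return re.match(pattern, text) is not None

class NumberUtil(object):
    @staticmethod
    def fromString(text):
        # Pull the first number out of strings like "89.5㎡" or "120萬"; 0 if none found
        match = re.search(r"\d+(\.\d+)?", text or "")
        return float(match.group()) if match else 0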

2.items

import scrapy


class HouseItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    roomNum = scrapy.Field()
    size = scrapy.Field()
    orient = scrapy.Field()
    floor = scrapy.Field()
    address = scrapy.Field()
    sumPrice = scrapy.Field()
    unitPrice = scrapy.Field()
    _id = scrapy.Field()
    imageurl = scrapy.Field()
    fromUrl = scrapy.Field()
    city = scrapy.Field()
    nowTime = scrapy.Field()
    status = scrapy.Field()
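The pipeline in the next section also imports a ProxyItem, which presumably carries scraped proxy addresses but is not shown in this post. A minimal sketch with assumed field names, placed in the same items.py (so scrapy is already imported), could look like this:

class ProxyItem(scrapy.Item):
    # hypothetical fields -- the original ProxyItem is not shown in the post
    ip = scrapy.Field()
    port = scrapy.Field()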

3.pipelines

#coding: utf-8
import codecs
import json
import pymongo
from scrapy.utils.project import get_project_settings       

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from ershoufang.items import ProxyItem

class ErshoufangPipeline(object):
    def __init__(self):
        self.settings = get_project_settings()
        self.client = pymongo.MongoClient(
            host=self.settings['MONGO_IP'],
            port=self.settings['MONGO_PORT'])
        self.db = self.client[self.settings['MONGO_DB']]
        self.proxyclient = self.client[self.settings['PROXY_DB']][self.settings['POOL_NAME']]
        self.itemNumber = 0

    def process_proxy(self, item):
        self.proxyclient.insert(dict(item))

    def process_item(self, item, spider):
        if isinstance(item, ProxyItem):
            self.process_proxy(item)
            return item
        try:
            if not item['address']:
                print(item["fromUrl"] + " page error")
                return item
            '''
            if self.db.ershoufang.count({"_id":item["_id"],"city":item['city']}) <= 0:
                print("delete")
                self.db.ershoufang.remove({"_id":item["_id"]})
            '''
            coll = self.db[self.settings['ALL']]
            coll.insert(dict(item))
            self.itemNumber += 1
            print("Scraped house #%s, address: %s" % (self.itemNumber, item['address']))
        except Exception as e:
            # a duplicate _id means this listing was stored in an earlier run
            print("House already exists: " + item['address'])
        return item

    def close_spider(self, spider):
        self.client.close()
        print("This run scraped %s house records in total" % self.itemNumber)

The crawler ran for three days and collected over two million records; a screenshot of the results is shown below.
[Screenshot of the scraped housing records]