1. 程式人生 > pyspider爬取TripAdvisor

pyspider爬取TripAdvisor

attr ems com () comment save format blog tex

 1 #!/usr/bin/env python
 2 # -*- encoding: utf-8 -*-
 3 # Created on 2017-06-11 10:10:53
 4 # Project: london
 5 
 6 from pyspider.libs.base_handler import *
 7 import pymongo
 8 
 9 
class Handler(BaseHandler):
    """pyspider handler that scrapes London attraction pages from TripAdvisor.

    The crawl starts at the London attractions listing, follows every
    attraction link and the pagination link, extracts a small record from
    each attraction page, and stores that record in MongoDB
    (database ``trip``, collection ``london``).
    """

    crawl_config = {
    }

    # Class-level MongoDB handle shared by all handler instances.
    client = pymongo.MongoClient('localhost')
    db = client['trip']

    # Common ancestor of the "duration" and "instruction" nodes on a detail
    # page; kept in one place so both selectors stay in sync.
    _DETAILS_SELECTOR = (
        '#MAP_AND_LISTING > div.main_section.listingbar > div'
        ' > div.above_fold_listing_details > div'
    )

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the London attractions listing page."""
        self.crawl(
            'https://www.tripadvisor.cn/Attractions-g186338-Activities-c47-London_England.html',
            callback=self.index_page,
        )

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every attraction on a listing page, then the next listing page.

        Args:
            response: pyspider response for a listing page.
        """
        for each in response.doc('.listing_title > a').items():
            self.crawl(each.attr.href, callback=self.detail_page)

        next_page = response.doc('.pagination .nav.next').attr.href
        # The href is empty when no "next" link matches (presumably the last
        # listing page); only schedule a follow-up when there is a target.
        if next_page:
            self.crawl(next_page, callback=self.index_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract one attraction record from a detail page.

        Args:
            response: pyspider response for an attraction page.

        Returns:
            dict with the keys ``name``, ``url``, ``comment``, ``address``,
            ``phone``, ``duration`` and ``instruction``; every value except
            ``url`` is the text of the node matched by its CSS selector.
        """
        doc = response.doc
        return {
            "name": doc('h1').text(),
            "url": response.url,
            'comment': doc('.heading_ratings .taLnk').text(),
            'address': doc('.addressReset > span.format_address').text(),
            'phone': doc('.phoneNumber').text(),
            'duration': doc(
                self._DETAILS_SELECTOR
                + ' > div:nth-child(5) > div > div:nth-child(1)'
            ).text(),
            'instruction': doc(
                self._DETAILS_SELECTOR + ' > div:nth-child(6) > div > b'
            ).text(),
        }

    def on_result(self, result):
        """Persist a non-empty result to MongoDB.

        Args:
            result: value returned by a callback; falsy results (callbacks
                that return nothing) are ignored.
        """
        if result:
            self.save_to_mongo(result)

    def save_to_mongo(self, result):
        """Insert ``result`` into the ``london`` collection and log success.

        Args:
            result: dict record to store.
        """
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3.0 and removed in 4.0.
        outcome = self.db['london'].insert_one(result)
        if outcome.acknowledged:
            print('saved to mongo', result)

pyspider爬取TripAdvisor