python爬蟲:案例三:去哪兒酒店價格資訊
# Qunar: scraping the data of a single page
#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
import urllib
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import requests
import os
from selenium.webdriver.common.action_chains import ActionChains
# file:// URL of the locally saved copy of the page (test.html).
flie_path='file:///'+os.path.abspath('test.html')
#flie_path="http://hotel.qunar.com/city/hangzhou/dt-11571/?tag=hangzhou#fromDate=2016-05-24&toDate=2016-05-25&q=%E5%A6%82%E5%AE%B6%E5%BF%AB%E6%8D%B7%E6%9D%AD%E5%B7%9E%E8%A5%BF%E6%B9%96%E9%BB%84%E9%BE%99%E6%97%85%E6%B8%B8%E9%9B%86%E6%95%A3%E4%B8%AD%E5%BF%83%E5%BA%97&from=qunarHotel|sug&fromFocusList=0&filterid=eaa9982b-b203-4ed2-bf7e-e063936143c3_A&showMap=0&qptype=hotelName|poi&haspoi=1&QHFP=ZSS_A7D48C73"


class Xc():
    """Scraper that reads the visible room prices from the hotel page."""

    def pc(self):
        """Collect the prices listed under every ``hotel-quote-list`` element.

        Loads ``flie_path`` in PhantomJS and, for each quote list, reads the
        text of its ``js-dprice`` elements. Every price is printed as it is
        read.

        Returns:
            dict: text of the ``h2`` inside the list's ``rtype`` element,
            mapped to the list of price strings with their first character
            (the currency sign) dropped.
        """
        driver = webdriver.PhantomJS()
        try:
            time.sleep(5)
            driver.get(flie_path)
            driver.implicitly_wait(30)
            time.sleep(5)
            dic = {}
            for quote_list in driver.find_elements_by_class_name('hotel-quote-list'):
                lists = []
                for price_node in quote_list.find_elements_by_class_name('js-dprice'):
                    price = price_node.text.strip()[1:]
                    print(price)
                    lists.append(price)
                heading = quote_list.find_element_by_class_name('rtype').find_element_by_tag_name('h2')
                dic[heading.text] = lists
            return dic
        finally:
            # quit must be *called*; a bare ``driver.quit`` is a no-op and
            # leaves the PhantomJS process running.
            driver.quit()


s = Xc()
print(s.pc())
結果:
212237212218218237256236236237237265237244244265265244244265254284254269284{u'\u5927\u5e8a\u623f': [u'237', u'256', u'236', u'236', u'237'], u'\u7279\u60e0\u5546\u52a1\u623f(\u65e0\u7a97)': [u'265', u'265', u'244', u'244', u'265'], u'\u5355\u4eba\u623f': [u'212', u'237', u'212', u'218', u'218'], u'\u6807\u51c6\u53cc\u4eba\u623f': [u'254', u'284', u'254', u'269', u'284'], u'\u7279\u60e0\u53cc\u4eba\u95f4': [u'237', u'265', u'237', u'244', u'244']}
這只是一個簡單的模型:數字是價格,unicode 字串是房型。上面的程式爬的是一個本地 html 檔案——因為我這裡的網速實在太慢,所以我把瀏覽器按 F12 後看到的原始碼儲存成一個 html 檔案,再從這個檔案爬取資料;這個頁面的 url 也貼在上面程式碼的註解裡。
上面的程式碼有點問題,取到的資料不全,因為有些價格資訊被隱藏了,看頁面上會有“檢視其他3條報價”之類的超連結
# Simulate a mouse click to expand the whole node
#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
import urllib
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import requests
import os
from selenium.webdriver.common.action_chains import ActionChains
# file:// URL of the locally saved copy of the page (test.html).
flie_path='file:///'+os.path.abspath('test.html')
#flie_path="http://hotel.qunar.com/city/hangzhou/dt-11571/?tag=hangzhou#fromDate=2016-05-24&toDate=2016-05-25&q=%E5%A6%82%E5%AE%B6%E5%BF%AB%E6%8D%B7%E6%9D%AD%E5%B7%9E%E8%A5%BF%E6%B9%96%E9%BB%84%E9%BE%99%E6%97%85%E6%B8%B8%E9%9B%86%E6%95%A3%E4%B8%AD%E5%BF%83%E5%BA%97&from=qunarHotel|sug&fromFocusList=0&filterid=eaa9982b-b203-4ed2-bf7e-e063936143c3_A&showMap=0&qptype=hotelName|poi&haspoi=1&QHFP=ZSS_A7D48C73"


class Xc():
    """Scraper that expands each quote list before reading its prices."""

    def pc(self):
        """Collect the prices of every ``hotel-quote-list`` after expanding it.

        Loads ``flie_path`` in PhantomJS. For each quote list it clicks the
        ``js-expand-more`` link (the "view other N quotes" link) when one is
        present, waits, and then reads the text of the ``js-dprice``
        elements. Every price is printed as it is read.

        Returns:
            dict: text of the ``h2`` inside the list's ``rtype`` element,
            mapped to the list of price strings with their first character
            (the currency sign) dropped.
        """
        driver = webdriver.PhantomJS()
        try:
            time.sleep(5)
            driver.get(flie_path)
            driver.implicitly_wait(30)
            time.sleep(5)
            dic = {}
            for quote_list in driver.find_elements_by_class_name('hotel-quote-list'):
                lists = []
                # find_elements returns an empty list instead of raising when
                # this quote list has nothing to expand.
                expand_links = quote_list.find_elements_by_class_name("js-expand-more")
                if expand_links:
                    # click() presses and releases; click_and_hold() only
                    # presses, so the page never receives a click.
                    ActionChains(driver).click(expand_links[0]).perform()
                    time.sleep(5)
                for price_node in quote_list.find_elements_by_class_name('js-dprice'):
                    price = price_node.text.strip()[1:]
                    print(price)
                    lists.append(price)
                heading = quote_list.find_element_by_class_name('rtype').find_element_by_tag_name('h2')
                dic[heading.text] = lists
            return dic
        finally:
            # quit must be *called*; a bare ``driver.quit`` is a no-op and
            # leaves the PhantomJS process running.
            driver.quit()


s = Xc()
print(s.pc())
因為我是本地檔案所以模擬也沒有效果,不過過程中沒有報錯,理論上應該沒有錯誤,如果有錯誤請指正!
上面的程式碼還是不能抓到完全的價格資料,去哪兒會把最後幾個房型的價格隱藏住,需要點開“展開報價”,然後再點開“檢視其他3條報價”之類的超連結,才能看到全部價格!
webdriver 的 class 定位有個限制:find_element_by_class_name() 只接受單一的 class 名稱,如果傳入含有空格的複合 class(例如 'room-item-inner room-item-wrapper')就會報錯、無法解析,所以我改用 xpath
#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
import urllib
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import requests
import os
from selenium.webdriver.common.action_chains import ActionChains
# file:// URL of the locally saved copy of the hotel page: test.html,
# resolved to an absolute path by os.path.abspath.
flie_path = 'file:///{0}'.format(os.path.abspath('test.html'))
#flie_path="http://hotel.qunar.com/city/hangzhou/dt-11571/?tag=hangzhou#fromDate=2016-05-24&toDate=2016-05-25&q=%E5%A6%82%E5%AE%B6%E5%BF%AB%E6%8D%B7%E6%9D%AD%E5%B7%9E%E8%A5%BF%E6%B9%96%E9%BB%84%E9%BE%99%E6%97%85%E6%B8%B8%E9%9B%86%E6%95%A3%E4%B8%AD%E5%BF%83%E5%BA%97&from=qunarHotel|sug&fromFocusList=0&filterid=eaa9982b-b203-4ed2-bf7e-e063936143c3_A&showMap=0&qptype=hotelName|poi&haspoi=1&QHFP=ZSS_A7D48C73"
class Xc():
    """Scraper that expands every room entry before reading its prices."""

    def pc(self):
        """Collect the quoted prices of every room entry on the page.

        Loads ``flie_path`` in PhantomJS. For each
        ``room-item-inner room-item-wrapper`` element it clicks the
        ``btn-book-ct`` link and then the ``js-expand-more`` link (when
        present) so that collapsed quotes are shown, and reads the text of
        the ``js-dprice`` elements. Every kept price is printed as it is
        read.

        Returns:
            dict: text of the ``h2`` inside the entry's ``rtype`` element,
            mapped to the list of price strings with their first character
            (the currency sign) dropped.
        """
        driver = webdriver.PhantomJS()
        try:
            time.sleep(5)
            driver.get(flie_path)
            driver.implicitly_wait(30)
            time.sleep(5)
            dic = {}
            rooms = driver.find_elements_by_xpath(
                "//div[@class='room-item-inner room-item-wrapper']")
            for room in rooms:
                # The leading "." keeps each XPath relative to this room. A
                # bare "//" searches the whole document even when called on
                # an element, so every room would hit the page's first link.
                self._click_first(driver, room, ".//p[@class='btn-book-ct']/a")
                # The "view other N quotes" link of this room.
                self._click_first(driver, room, ".//a[@class='js-expand-more']")

                lists = []
                for price_node in room.find_elements_by_class_name('js-dprice'):
                    price = price_node.text.strip()[1:]
                    # Unicode literal: the comparison then does not depend on
                    # the process-wide default-encoding override.
                    if not price.startswith(u'¥'):
                        lists.append(price)
                        print(price)
                heading = room.find_element_by_class_name('rtype').find_element_by_tag_name('h2')
                dic[heading.text] = lists
            return dic
        finally:
            # quit must be *called*; a bare ``driver.quit`` is a no-op and
            # leaves the PhantomJS process running.
            driver.quit()

    def _click_first(self, driver, scope, xpath):
        """Click the first element under ``scope`` matching ``xpath``, if any."""
        # find_elements returns an empty list instead of raising, so a room
        # without this link is skipped rather than aborting the whole run.
        matches = scope.find_elements_by_xpath(xpath)
        if not matches:
            return
        # click() presses and releases; click_and_hold() only presses, so
        # the page never receives a click.
        ActionChains(driver).click(matches[0]).perform()
        # Give the page time to render the expanded quotes.
        time.sleep(5)
# Run the scraper once and print the room-type -> prices mapping it returns.
s = Xc()
print(s.pc())
還是無法提供結果,因為我是本地檔案,跑了一遍沒有報錯,理論上沒有問題,如果你們執行報錯請指正,謝謝!