python 使用selenium和requests爬取頁面數據
阿新 • • 發佈:2018-12-04
ret pre tex 爬取 test user 發現 rom request
目的:獲取某網站某用戶下時長大於1000秒的視頻信息
1.本想通過接口獲得結果,但是使用post發送信息到接口,提示服務端錯誤。
2.通過requests獲取頁面結果,再使用html解析工具解析,發現既麻煩又得不到想要的結果。
3.直接通過selenium獲取控件的屬性信息(如圖片、視頻地址),再按視頻時長進行篩選,並將信息保存到以用戶id命名的文件夾下。
# -*- coding:utf-8 -*-
"""Collect the long videos listed on one user's page of a video site.

A Selenium driver walks every result page of the user, reads the duration,
link and thumbnail of each listed video, and keeps the entries whose duration
exceeds a threshold.  The thumbnails and a tab-separated ``video.txt`` index
are stored in a directory named after the user id.
"""
import os
import shutil
import sys

import requests
from selenium import webdriver


class GetUserVideo(object):
    """Scrape the video list of a single user.

    Args:
        driver: Selenium WebDriver used for every page load.  ``test()``
            quits it when it finishes.
        id: User id; it is interpolated into the page URL and is also the
            name of the output directory.
        min_seconds: Only videos strictly longer than this many seconds are
            kept.  Defaults to 1000.
    """

    # Seconds to wait for a thumbnail download before giving up, so that a
    # stalled connection cannot block the whole run.
    download_timeout = 30

    def __init__(self, driver, id, min_seconds=1000):
        self.id = str(id)
        self.driver = driver
        self.min_seconds = min_seconds
        self.base_url = "http://www.xxxxx.com/user/%s?t=2" % (self.id)

    def get_pagecounts(self):
        """Return the exclusive upper bound of the page numbers to visit.

        Loads the user's base URL and reads the text of the second-to-last
        link inside ``div.page``, then adds one so the result can be passed
        straight to ``range(1, ...)``.
        NOTE(review): presumably the last link is a "next page" control and
        the one before it carries the highest page number -- confirm against
        the site markup.

        Returns:
            int: last page number + 1, or 2 (i.e. only page 1) when the
            pagination links are missing or not numeric.
        """
        self.driver.get(self.base_url)
        page_links = self.driver.find_elements_by_xpath(
            "//div[@class='page']/a")
        try:
            return int(page_links[-2].text) + 1
        except (IndexError, ValueError):
            # No usable pagination on the page: treat it as a single page
            # instead of aborting the run.
            return 2

    @staticmethod
    def _duration_seconds(text):
        """Convert ``H:M:S``, ``M:S`` or ``S`` text to seconds.

        Returns:
            int or None: total seconds, or None when the text is not made of
            one to three colon-separated integers.
        """
        fields = text.strip().split(":")
        if len(fields) > 3:
            return None
        try:
            numbers = [int(field) for field in fields]
        except ValueError:
            return None
        seconds = 0
        for number in numbers:
            # Each field to the left is worth 60 times the one after it.
            seconds = seconds * 60 + number
        return seconds

    def get_video(self, driver, page, f):
        """Record the long videos found on the page currently loaded.

        For every video whose duration exceeds ``self.min_seconds`` a line
        ``<image name>\\t<duration text>\\t<video url>`` is written to ``f``
        and the thumbnail is downloaded into the ``self.id`` directory.

        Args:
            driver: WebDriver that already has the result page loaded.
            page: Page number; used as the prefix of the image file names.
            f: Writable text file object receiving the index lines.
        """
        video_times = driver.find_elements_by_xpath(
            "//i[@class='continue_time']")
        video_urls = driver.find_elements_by_xpath(
            "//div[@class='video']/a[@class='url']")
        video_imgs = driver.find_elements_by_xpath("//a[@class='url']/img")
        # zip() stops at the shortest list, so a page where the three
        # queries return different counts cannot raise IndexError.
        entries = zip(video_times, video_urls, video_imgs)
        for i, (time_element, url_element, img_element) in enumerate(entries):
            video_time = time_element.text
            seconds = self._duration_seconds(video_time)
            if seconds is None or seconds <= self.min_seconds:
                continue
            video_url = url_element.get_attribute('href')
            video_img = img_element.get_attribute("src")
            # Page and position prefixes keep names unique when two videos
            # share the same thumbnail file name.
            img_name = "%s_%s_%s" % (page, i, os.path.basename(video_img))
            f.write(img_name + "\t" + video_time + "\t" + video_url + "\n")
            response = requests.get(video_img, timeout=self.download_timeout)
            with open(os.path.join(self.id, img_name), "wb") as image_file:
                image_file.write(response.content)

    def test(self):
        """Run the whole scrape for this user.

        Recreates the ``self.id`` directory (an existing directory of the
        same name is deleted first), visits every result page and writes
        ``video.txt`` plus the thumbnails into it.  The driver is quit when
        the run ends, whether it succeeded or raised.
        """
        driver = self.driver
        try:
            if os.path.exists(self.id):
                shutil.rmtree(self.id)
            os.mkdir(self.id)
            page_counts = self.get_pagecounts()
            with open(os.path.join(self.id, "video.txt"), "w") as f:
                for page in range(1, page_counts):
                    detail_url = "&page=%s" % page
                    driver.get(self.base_url + detail_url)
                    self.get_video(driver, page, f)
        finally:
            # Always release the browser process, even after a failure.
            driver.quit()


def main():
    """Start PhantomJS from the project's driver directory and run the scrape.

    The driver binary is looked up relative to the ``SeleniumOfJenkins``
    directory found in ``sys.path[0]``.

    Raises:
        ValueError: if ``SeleniumOfJenkins`` is not a component of
            ``sys.path[0]``.
    """
    path = sys.path[0].split("/")
    index = path.index("SeleniumOfJenkins") + 1
    ph_driver = "/driver/phantomjs-2.1.1-macosx/bin/phantomjs"
    # When the script already sits in the project root the slice is the
    # whole path, so this single expression covers both cases.
    driver_path = "/".join(path[:index]) + ph_driver
    driver = webdriver.PhantomJS(executable_path=driver_path)
    driver.maximize_window()
    driver.implicitly_wait(10)
    test = GetUserVideo(driver, 123456)
    test.test()


if __name__ == "__main__":
    main()
python 使用selenium和requests爬取頁面數據