爬取鬥魚平臺
阿新 • • 發佈:2018-12-01
知識點:
1.運用selenium自動化驅動模組
2.運用find_elements_by_xpath()與find_element_by_xpath()的區別,以及對元素的定位,內容的提取
3.獲取請求下一頁方法,注:time.sleep()
程式碼:
# encoding=utf-8
"""Scrape room listings from the Douyu live-streaming platform with Selenium.

Walks every page of https://www.douyu.com/directory/all, extracting the
thumbnail URL, room title, and streamer info for each live room, and
prints each page's results.
"""
from selenium import webdriver
import time


class DouYu():
    def __init__(self):
        # Landing page that lists all live rooms.
        self.url = "https://www.douyu.com/directory/all"
        self.driver = webdriver.Chrome()

    def get_content_list(self):
        """Extract the room info shown on the current page.

        :return: tuple (rooms, next_page) where ``rooms`` is a list of dicts
                 with keys ``room_img``, ``room_name``, ``room_info``, and
                 ``next_page`` is the "next page" link element, or None when
                 there is no further page.
        """
        # One <li> per live room in the listing container.
        content_list = self.driver.find_elements_by_xpath(
            ".//ul[@id='live-list-contentbox']/li")
        get_contents_list = []
        for content in content_list:
            # Fix: was named `dict`, shadowing the builtin.
            room = {}
            room["room_img"] = content.find_element_by_xpath(
                ".//span[@class='imgbox']/img").get_attribute("src")
            room["room_name"] = content.find_element_by_xpath(
                ".//a").get_attribute("title")
            room["room_info"] = content.find_element_by_xpath(
                ".//div[@class='mes-tit']/span").text
            get_contents_list.append(room)
        # find_elements_* returns [] when no match, so this never raises;
        # take the first "next page" link or None on the last page.
        next_url = self.driver.find_elements_by_xpath(
            "//a[@class='shark-pager-next']")
        next_url = next_url[0] if len(next_url) > 0 else None
        return get_contents_list, next_url

    def run(self):
        """Fetch the listing and walk every page, printing each page's rooms."""
        try:
            self.driver.get(self.url)
            get_contents_list, next_url = self.get_content_list()
            # TODO: persist results instead of printing.
            print(get_contents_list)
            while next_url is not None:
                next_url.click()
                # Crude wait for the next page to render before re-scraping;
                # consider selenium's WebDriverWait for a robust wait.
                time.sleep(3)
                get_contents_list, next_url = self.get_content_list()
                print(get_contents_list)
        finally:
            # Fix: the browser process was never closed (resource leak),
            # leaving a Chrome instance behind on every run or crash.
            self.driver.quit()


if __name__ == "__main__":
    spider = DouYu()
    spider.run()