Python3-selenium\phantomjs\bs4爬取鬥魚頁面
阿新 • • 發佈:2019-02-04
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By


class douyuSelenium():
    """Print room titles and popularity counts from the Douyu directory pages.

    Intended call order: ``setup()`` -> ``testDouyu()`` -> ``shutdown()``.
    ``testDouyu()`` and ``shutdown()`` both read ``self.driver``, which only
    exists after ``setup()`` has run.
    """

    # Listing page opened by testDouyu().
    DIRECTORY_URL = 'https://www.douyu.com/directory/all'
    # Attribute filters for the room title and the popularity counter.
    TITLE_ATTRS = {'class': 'ellipsis'}
    NUM_ATTRS = {'class': 'dy-num fr'}
    # Marker looked for in the page source to detect a disabled "next" button.
    LAST_PAGE_MARKER = 'shark-pager-disable-next'
    # Class name of the "next page" pager button.
    NEXT_BUTTON_CLASS = 'shark-pager-next'
    # Seconds to pause before parsing each page.
    PAGE_WAIT_SECONDS = 2

    def setup(self):
        """Start the PhantomJS driver and store it on ``self.driver``."""
        self.driver = webdriver.PhantomJS()

    @staticmethod
    def _popularity_for(title):
        """Return the popularity ``<span>`` that belongs to ``title``, or None.

        Walks up from the title tag and returns the first popularity span
        found in an ancestor that still contains only this one title. As soon
        as an ancestor holds a second title we have left this room's markup,
        so no count is attributed to it.

        NOTE(review): assumes each room's title and count share an ancestor
        that contains no other room's title -- confirm against the live page.
        """
        for ancestor in title.parents:
            titles_here = ancestor.find_all(
                'h3', douyuSelenium.TITLE_ATTRS, limit=2)
            if len(titles_here) > 1:
                return None
            num = ancestor.find('span', douyuSelenium.NUM_ATTRS)
            if num is not None:
                return num
        return None

    def testDouyu(self):
        """Walk the directory pages and print one line per room.

        Each line has the form ``房間標題:<title>\\t人氣:<count>``. Paging stops
        when the page source contains the disabled-next marker, or when no
        "next" button can be located.
        """
        self.driver.get(self.DIRECTORY_URL)
        while True:
            # Fixed pause before reading the page (initial load or after a
            # click on "next").
            time.sleep(self.PAGE_WAIT_SECONDS)

            # Take one snapshot so parsing and the last-page check agree.
            page_html = self.driver.page_source
            soup = BeautifulSoup(page_html, 'lxml')

            # Pair each title with the count found near it in the tree rather
            # than zipping two independent lists: with zip(), one extra or
            # missing element shifts every following pair.
            for title in soup.find_all('h3', self.TITLE_ATTRS):
                num = self._popularity_for(title)
                popularity = num.text if num is not None else ''
                info = ('房間標題:' + title.text.strip() + '\t'
                        + '人氣:' + popularity)
                print(info)

            # Last page: the "next" button carries the disabled marker.
            if self.LAST_PAGE_MARKER in page_html:
                break

            # No pager button at all means there is nothing left to click.
            try:
                next_page = self.driver.find_element(
                    By.CLASS_NAME, self.NEXT_BUTTON_CLASS)
            except NoSuchElementException:
                break
            next_page.click()

    def shutdown(self):
        """Print the completion message and quit the driver."""
        print('載入完成。。。。')
        self.driver.quit()


if __name__ == '__main__':
    douyu = douyuSelenium()
    douyu.setup()
    try:
        douyu.testDouyu()
    finally:
        # Always release the browser process, even if scraping fails.
        douyu.shutdown()