爬蟲04-網易科技新聞
阿新 • 發佈:2018-11-01
""" __title__ = '' __author__ = 'Thompson' __mtime__ = '2018/7/26' # code is far away from bugs with the god animal protecting I love animals. They taste delicious. ┏┓ ┏┓ ┏┛┻━━━┛┻┓ ┃ ☃ ┃ ┃ ┳┛ ┗┳ ┃ ┃ ┻ ┃ ┗━┓ ┏━┛ ┃ ┗━━━┓ ┃ 神獸保佑 ┣┓ ┃ 永無BUG! ┏┛ ┗┓┓┏━┳┓┏┛ ┃┫┫ ┃┫┫ ┗┻┛ ┗┻┛ """ from selenium import webdriver import time import random from bs4 import BeautifulSoup import json browser = webdriver.Chrome() browser.get("http://tech.163.com/") last_height = browser.execute_script("return document.body.scrollHeight") while True: print('頁面載入中...') # 滑動一次 browser.execute_script("window.scrollTo(0, document.body.scrollHeight);") # 等待載入 time.sleep(random.random()*10) # 計算新的滾動高度並與上一個滾動高度進行比較 new_height = browser.execute_script("return document.body.scrollHeight") if new_height == last_height: break last_height = new_height html = browser.page_source #print(html) browser.close() # 資料提取 soup = BeautifulSoup(html,'lxml') #print(soup.prettify()) ls = soup.select('div.data_row.news_article.clearfix') print(len(ls)) file = open('./data/163tech.json', 'w', encoding='utf-8') for item in ls: title = item.select('h3 > a')[0].get_text() print('title:',title) url = item.select('h3 > a')[0]['href'] print('url:', url) content = json.dumps({'title':title,'url':url}, ensure_ascii=False) + "\n" file.write(content) file.close() file = open('./data/163tech.json', 'r', encoding='utf-8') ls = file.readlines() for it in ls: print(json.loads(it))