Scraping Sina Rolling News with Webdriver
Published: 2018-12-03
Initial Thoughts
I'm currently a 2016-intake undergraduate at the University of International Relations, majoring in Information Management and Information Systems. Honestly, I still don't quite know what this major is really about; the follow-up direction I've chosen is Data Science and Engineering, and I'm in my advisor's natural language processing group. Crawling is the foundation of NLP work, so learning how to scrape content before getting into machine learning seemed worthwhile. I originally wanted to crawl Sina Weibo, but that involves tedious issues like scroll-based loading and account login, so I decided to play with the rolling news first. To be honest, China News Service's rolling news page is much better built than Sina's, and the interface looks nicer too, but I only discovered that after I had finished crawling, hahaha.
Background
This project crawls Sina's rolling news pages (https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1) and saves each news item as a file, using the news title as the file name and the news content as the file body.
Project Details
Requirements
- Environment: Python 3.7+
- Packages: time, requests, random, BeautifulSoup (bs4), selenium, jieba, etc.
Files
- main: calls the relevant functions from all the other files, supplies the initial URL, and times how long crawling all the pages takes
- date_helper: adjusts the date in the page URL so the crawler steps through dates automatically
- data_helper: helpers that handle all the data paths (reading and writing files)
- spider: the main crawling file; calls Webdriver to open the sub-pages of the index page and fetch their contents
- article_spider: crawls the body text of each news article
Code
main
from date_helper import date_processing
from data_helper import pickle_writer
from spider import *
import time

start = time.time()  # wall-clock time (time.clock() is deprecated and measures CPU time on some platforms)

if __name__ == '__main__':
    url_org = 'http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&date={}&ch=01&k=&offset_page=0&offset_num=0&num=60&asc=&page='
    while True:
        date = date_processing()               # get the date to crawl
        output_list = []                       # list holding the crawled records
        url = url_org.format(date)             # build the URL for that date
        sina(url, output_list, date)           # run the crawler
        print(output_list)
        print(len(output_list))
        file_name = ''.format(date)            # output path template (left blank in the original post)
        pickle_writer(output_list, file_name)  # dump the records to a temporary pickle file

# only reached if the loop above is exited
end = time.time()
print('Running:%s seconds.' % (end - start))
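main relies on date_processing, which reads the last crawled date from a text file whose path is left blank in the post. Before the first run that file needs to hold a seed date; since date_processing steps the date back by one day before crawling, the seed should be one day after the first date you actually want. A minimal sketch, assuming a hypothetical path date.txt:

import codecs

# write a seed date; date_processing reads it, steps back one day,
# crawls that previous day, and overwrites the file with the new date
with codecs.open('date.txt', 'w', 'UTF-8') as f:   # 'date.txt' is a placeholder path
    f.write('2018-12-03')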
date_helper
import calendar
import codecs
import re
from data_helper import *

def count_days(year, month):
    # calendar.monthrange returns (weekday of the 1st, number of days in the month)
    cal = calendar.monthrange(year, month)
    pattern = re.compile(r'\d+')
    days = pattern.findall(str(cal))[1]
    return days

def month_sub(year, month):
    # step back one month, zero-padding single-digit months
    if month > 10:
        month -= 1
        month = str(month)
    elif month <= 10 and month > 1:
        month -= 1
        month = '0' + str(month)
    else:
        year -= 1
        month = 12
    return year, month

def date_sub(year, month, day):
    # step back one day, rolling over to the previous month when needed
    if day > 10:
        day -= 1
        day = str(day)
    elif day <= 10 and day > 1:
        day -= 1
        day = '0' + str(day)
    else:
        year, month = month_sub(int(year), int(month))
        days = count_days(year, int(month))
        day = days
    date = str(year) + '-' + str(month) + '-' + str(day)  # date format used by Sina rolling news
    return date

def date_processing():
    # the text file stores the last crawled date; its path was left blank in the original post
    date_txt = ""
    last_date = txt_load(date_txt)
    date = str(last_date[0])
    year = int(date.split("-")[0])
    month = date.split("-")[1]
    day = int(date.split("-")[2])
    date = date_sub(year, month, day)
    writer = codecs.open(date_txt, 'w', 'UTF-8')
    writer.write(date)
    writer.flush()
    writer.close()
    return date
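The hand-rolled date arithmetic above works, but the same "step back one day" logic can also be written with the standard library's datetime module. This is only an alternative sketch under the same YYYY-MM-DD format, not the author's code:

from datetime import datetime, timedelta

def date_sub_dt(date_str):
    # parse the current date, subtract one day, and re-format it
    # (equivalent to the date_sub / month_sub / count_days combination above)
    previous = datetime.strptime(date_str, '%Y-%m-%d') - timedelta(days=1)
    return previous.strftime('%Y-%m-%d')

# e.g. date_sub_dt('2018-12-01') -> '2018-11-30'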
data_helper
import re
import pickle
import codecs
import jieba

'''
Read the raw data
'''
def txt_load(path):
    reader = codecs.open(path, 'r', 'UTF-8')
    lines = reader.readlines()
    reader.close()
    return lines

def join_list(ss):
    # concatenate a list of strings into one string
    c = ""
    for k in ss:
        c += k
    return c

def pickle_writer(input_, name):
    '''
    :param input_: data to be saved
    :param name: output path
    '''
    writer = open(name, "wb")
    pickle.dump(input_, writer)
    writer.close()
    print("finish to write data")

# function for reading a pickle file
def pickle_load(input_):
    '''
    :param input_: path
    :return: the original data
    '''
    reader = open(input_, "rb")
    content = pickle.load(reader)
    reader.close()
    print("finish to read data")
    return content

def jieba_cut(content):
    '''
    :param content: str, a sentence to be segmented
    :return: list of tokens
    '''
    cut = jieba.cut(content)
    l = []
    for con in cut:
        if con != " ":
            l.append(con)
    return l

def is_chinese(uchar):
    """Return the character if it is a Chinese character, its lowercase form if it is a letter, '' otherwise"""
    if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
        return uchar
    elif uchar == re.sub('[^a-zA-Z]', '', uchar):
        return str(uchar).lower()
    else:
        return ''
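As a quick check of the pickle helpers above, a round trip looks like this (the file name and the record are made-up placeholders, not output from the actual crawler):

records = [['社會', 'news', '2018-12-01', '10:30', '示例標題', 'http://news.sina.com.cn/', '正文內容']]
pickle_writer(records, 'sina_2018-12-01.pkl')   # hypothetical output path
loaded = pickle_load('sina_2018-12-01.pkl')
print(loaded == records)                        # True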
spider
# -*- coding: utf-8 -*-
from selenium import webdriver
from article_spider import *
import re
import time

def get_pages(driver, url):
    '''
    :param driver: Webdriver instance
    :param url: link for the given date
    :return page_num: number of pages for that date
    '''
    start_url = url + '1'
    driver.get(start_url)
    time.sleep(2)
    driver.refresh()
    time.sleep(2)
    page_html = driver.page_source
    pagelist = re.findall('onclick="newsList.page.goTo(.*?);return false', page_html, re.S)
    pattern = re.compile('\d+')  # extract the page numbers
    page_num = pattern.findall(pagelist[len(pagelist) - 1])[0]
    return page_num

def Get_content(driver, page_num, url, output_list, date):
    '''
    :param driver: Webdriver instance
    :param page_num: number of pages for that date
    :param url: link for the given date
    :param output_list: output list
    :param date: the date being crawled
    '''
    k = 1
    while k <= int(page_num):
        driver.get(url + str(k))
        time.sleep(2.5)
        driver.refresh()
        for i in range(1, 11):
            for j in range(1, 6):
                classfy_cn = driver.find_element_by_xpath(
                    '//*[@id="d_list"]/ul[' + str(i) + ']/li[' + str(j) + ']/span[1]').text
                title = driver.find_element_by_xpath(
                    '//*[@id="d_list"]/ul[' + str(i) + ']/li[' + str(j) + ']/span[2]/a').text
                href = driver.find_element_by_xpath(
                    '//*[@id="d_list"]/ul[' + str(i) + ']/li[' + str(j) + ']/span[2]/a').get_attribute('href')
                times = driver.find_element_by_xpath(
                    '//*[@id="d_list"]/ul[' + str(i) + ']/li[' + str(j) + ']/span[3]').text
                pubtime = times.split(" ")[1]
                content, classfy_en = get_article(href)
                content_list = [classfy_cn, classfy_en, date, pubtime, title, href, content]
                test = '' + title + '.txt'  # output directory prefix was left blank in the original post
                with open(test, 'w') as f:
                    for content_list_element in content_list:
                        f.write(content_list_element)
                output_list.append(content_list)
                print(len(output_list))
        k = k + 1

def sina(url, output_list, date):
    '''
    :param url: the URL to crawl
    :param output_list: output list
    :param date: the date
    :return:
    '''
    driver = webdriver.Chrome()
    page_num = get_pages(driver, url)
    Get_content(driver, page_num, url, output_list, date)
    driver.close()
article_spider
#-*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from user_agents import agents
import requests
import time
import random

def get_article(url):
    '''
    :param url: link of the article
    :return content: article text
    :return classfy: article category (taken from the sub-domain)
    '''
    try:
        classfy = url.split('.')[0].split('//')[1]
        agent = random.choice(agents)  # pick a random User-Agent string
        header = {'User-Agent': agent}
        res = requests.get(url.rsplit('\r\n')[0], headers=header)
        time.sleep(1)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        newsArticle = getnewsArticle(soup.select('.art_p'))
        content = ''
        for con in newsArticle:
            content = content + con
        return content, classfy
    except Exception as e:
        print(e)
        return '', ''  # return empty values so the caller's unpacking does not fail

def getnewsArticle(news):
    '''
    :param news: the tags holding the article body
    :return newsArticle: the article paragraphs
    '''
    newsArticle = []
    for p in news:
        newsArticle.append(p.text.strip())
    return newsArticle
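article_spider imports agents from a local user_agents module that isn't shown in the post (as the postscript says, any User-Agent strings found online will do). A minimal sketch of what that file could look like, with placeholder strings rather than the author's actual list:

# user_agents.py -- a plain list of User-Agent strings for random.choice
agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0',
]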
Postscript
The crawled content turned out okay, even though the whole thing looks clumsy and painful. Also, if a news title contains the "/" character, the script seems to throw an error saying the folder doesn't exist; maybe in a second version of the code I'll figure out how to add some Chinese-character matching (?). As for the agents in the code, just grab a few from the web, it's nothing to worry about.
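For the "/" problem, instead of full Chinese-character matching, a simpler option would be to strip filesystem-unsafe characters from the title before using it as a file name. A minimal sketch (the replacement character is an arbitrary choice, not part of the original code):

import re

def safe_filename(title):
    # replace characters that are illegal in file names on most systems with '_'
    return re.sub(r'[\\/:*?"<>|]', '_', title)

# in Get_content this would become: test = '' + safe_filename(title) + '.txt'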
This is my first write-up, so there are bound to be plenty of problems. If anyone reads this and spots some, please point them out. A grateful heart, thankful for you.