
Scraping Sina Rolling News with Webdriver

Initial Thoughts

I'm currently a 2016-intake undergraduate at the University of International Relations, majoring in Information Management and Information Systems. Honestly, I'm still not sure what this major is really about; the follow-up direction I've chosen is Data Science and Engineering, and I'm in my advisor's natural language processing group. Crawling is the groundwork for NLP, so before diving into machine learning it seemed worthwhile to first learn how to scrape content. My original plan was to crawl Sina Weibo, but that involves fiddly problems like infinite scrolling and account login, so I decided to play with the rolling news first. To be honest, China News Service's rolling news page is much better made than Sina's, and its interface looks nicer too, but I only discovered that after the crawl was done, hahaha.

Background

This project crawls web pages from Sina rolling news (https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1), saving each news item as a file whose name is the news title and whose body is the news content.
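As a minimal sketch of that storage convention (the function name and output directory here are purely illustrative; the real logic lives in spider's Get_content below):

# Illustrative only: one news item is written to <title>.txt, with the article text as the file body
def save_news(title, content, out_dir='.'):
    path = out_dir + '/' + title + '.txt'     # the file is named after the news title
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)                      # the file body is the news content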

Project Details

Environment Requirements

  • Environment: Python 3.7+
  • Required packages: time, requests, random, BeautifulSoup (bs4), selenium, etc.; jieba is also imported by data_helper (a quick import check is sketched below)
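The following snippet is an optional sanity check (not part of the project files) that the third-party packages import correctly; time and random are in the standard library and need no install.

import importlib

# bs4 provides BeautifulSoup; jieba is only needed for data_helper's word segmentation
for pkg in ("requests", "bs4", "selenium", "jieba"):
    try:
        importlib.import_module(pkg)
        print(pkg, "OK")
    except ImportError:
        print(pkg, "missing - install it, e.g. with: pip install " + pkg)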

File Descriptions

  • main: calls the relevant functions from all the other files, feeds in the initial URL, and measures how long crawling all the pages takes
  • date_helper: adjusts the date in the page URL so the crawler moves through the listings automatically, one day at a time
  • data_helper: handles all of the data paths (read/write helper functions)
  • spider: the main crawling file; uses Webdriver to open the sub-pages of the index page and fetch their content
  • article_spider: fetches the body text of each news article

Code

main

from date_helper import date_processing
from data_helper import pickle_writer
from spider import *
import time

start = time.perf_counter()  # the original used time.clock(), which was removed in Python 3.8

if __name__ == '__main__':
    url_org = 'http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&date={}&ch=01&k=&offset_page=0&offset_num=0&num=60&asc=&page='
    while True:
        date = date_processing()                # get the date to crawl
        output_list = []                        # list that collects the output
        url = url_org.format(date)              # build the URL for that date
        sina(url, output_list, date)            # run the spider
        print(output_list)
        print(len(output_list))
        file_name = ''.format(date)             # output path (left empty in the original post)
        pickle_writer(output_list, file_name)   # dump to a temporary file
        end = time.perf_counter()
        print('Running:%s seconds.' % (end - start))

date_helper

import re
import calendar
import codecs
from data_helper import *

def count_days(year, month):
    # calendar.monthrange(year, month) returns (weekday of the 1st, number of days in the month);
    # the second number pulled out by the regex below is the month length
    cal = calendar.monthrange(year, month)
    pattern = re.compile(r'\d+')
    days = pattern.findall(str(cal))[1]
    return days

# Step back one month: single-digit results are zero-padded, and January rolls back to December of the previous year
def month_sub(year,month):
    if month > 10:
        month -= 1
        month = str(month)
    elif month <= 10 and month > 1 :
        month -= 1
        month = '0'+str(month)
    else:
        year -= 1
        month = 12
    return year,month

# Step back one day: the 1st of a month rolls back to the last day of the previous month
def date_sub(year,month,day):
    if day > 10:
        day -= 1
        day = str(day)
    elif day <= 10 and day > 1:
        day -= 1
        day = '0'+str(day)
    else:
        year, month = month_sub(int(year),int(month))
        days = count_days(year, int(month))
        day = days
    date = str(year)+'-'+str(month) +'-'+str(day)  # date format used in the Sina rolling-news URL
    return date

def date_processing():
    # Read the last crawled date from the date file, step it back one day, and write the new date back
    date_txt = ""  # path of the date file (left empty in the original post)
    last_date = txt_load(date_txt)
    date = str(last_date[0])
    year = int(date.split("-")[0])
    month = date.split("-")[1]
    day = int(date.split("-")[2])
    date = date_sub(year, month, day)
    writer = codecs.open(date_txt,'w','UTF-8')
    writer.write(date)
    writer.flush()
    return date
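
A quick illustration of the rollback behaviour (illustrative only, assuming the modules above are on the import path): stepping back from the first day of a month lands on the last day of the previous month.

from date_helper import date_sub

print(date_sub(2019, '03', 1))   # -> 2019-02-28: the month is decremented and the day becomes that month's last day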

data_helper

import re
import pickle
import codecs
import jieba
'''
Read the raw data
'''
def txt_load(path):
    reader = codecs.open(path,'r','UTF-8')
    lines = reader.readlines()
    return lines

def join_list(ss):
    c = ""
    for k in ss:
        c+=k
    return c

def pickle_writer(input_,name):
    '''
    :param input_: data to be saved
    :param name:  output path
    '''
    writer = open(name,"wb")
    pickle.dump(input_,writer)
    writer.close()
    print("finish to write data")

# helper function for reading a pkl file back
def pickle_load(input_):
    '''
    :param input_: path of the pickle file
    :return:  the original data
    '''
    reader = open(input_,"rb")
    content = pickle.load(reader)
    reader.close()
    print("finish to read data")
    return content

def jieba_cut(content):
    '''
    :param content: str, the sentence to be segmented
    :return: list of tokens
    '''
    cut = jieba.cut(content)
    l = []
    for con in cut:
        if con!=" ":
            l.append(con)
    return l

def is_chinese(uchar):
    """判斷一個unicode是否是漢字"""
    if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
        return uchar
    elif uchar == re.sub('[^a-zA-Z]', '', uchar):
        return str(uchar).lower()
    else:
        return ''
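
A minimal round-trip example for the two pickle helpers (the file name is just a placeholder):

from data_helper import pickle_writer, pickle_load

records = [['category_cn', 'category_en', '2019-03-01', '10:00', 'title', 'url', 'content']]
pickle_writer(records, 'demo.pkl')   # serialise the list to disk
restored = pickle_load('demo.pkl')   # read it back
print(restored == records)           # True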

spider

# -*- coding: utf-8 -*-

from selenium import webdriver
from article_spider import *
import re
import time  # used for the sleeps below (previously pulled in only via the star import)

def get_pages(driver,url):
    '''
    :param driver: Webdriver instance
    :param url: link for the specified date
    :return page_num: number of pages for that date
    '''
    start_url = url + '1'
    driver.get(start_url)
    time.sleep(2)
    driver.refresh()
    time.sleep(2)
    page_html = driver.page_source
    pagelist = re.findall('onclick="newsList.page.goTo(.*?);return false', page_html, re.S)
    pattern = re.compile(r'\d+')  # extract the page count
    page_num = pattern.findall(pagelist[len(pagelist)-1])[0]
    return (page_num)

def Get_content(driver,page_num,url,output_list,date):
    '''
    :param driver: Webdriver instance
    :param page_num: number of pages for the specified date
    :param url: link for the specified date
    :param output_list: output list
    :param date: the specified date
    '''
    k = 1
    while k <= int(page_num):
        driver.get(url + str(k))
        time.sleep(2.5)
        driver.refresh()
        for i in range(1, 11):
            for j in range(1, 6):
                classfy_cn = driver.find_element_by_xpath(
                    '//*[@id="d_list"]/ul[' + str(i) + ']/li[' + str(j) + ']/span[1]').text
                title = driver.find_element_by_xpath(
                    '//*[@id="d_list"]/ul[' + str(i) + ']/li[' + str(j) + ']/span[2]/a').text
                href = driver.find_element_by_xpath(
                    '//*[@id="d_list"]/ul[' + str(i) + ']/li[' + str(j) + ']/span[2]/a').get_attribute('href')
                times = driver.find_element_by_xpath(
                    '//*[@id="d_list"]/ul[' + str(i) + ']/li[' + str(j) + ']/span[3]').text
                pubtime = times.split(" ")[1]
                content, classfy_en = get_article(href)
                content_list = [classfy_cn, classfy_en, date, pubtime, title, href, content]
                test = '' + title + '.txt'  # output path; the directory prefix was left empty in the original post
                with open(test, 'w', encoding='utf-8') as f:
                    for content_list_element in content_list:
                        f.write(content_list_element)
                output_list.append(content_list)
                print(len(output_list))
        k = k + 1

def sina(url,output_list,date):
    '''
    :param url: URL to crawl
    :param output_list: output list
    :param date: date
    :return:
    '''
    driver = webdriver.Chrome()
    page_num = get_pages(driver, url)
    Get_content(driver, page_num, url, output_list, date)
    driver.close()

article_spider

#-*- coding:utf-8 -*-

from bs4 import BeautifulSoup
from user_agents import agents
import requests
import time
import random

def get_article(url):
    '''
    :param url: link of a single news article
    :return content: the article text
    :return classfy: the article's category (the sub-domain of the URL)
    '''
    try:
        classfy = url.split('.')[0].split('//')[1]
        agent = random.choice(agents)
        header = {'User-Agent': agent}
        res = requests.get(url.rsplit('\r\n')[0], headers=header)
        time.sleep(1)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        newsArticle = getnewsArticle(soup.select('.art_p'))
        content = ''
        for con in newsArticle:
            content = content + con
        return content, classfy
    except Exception as e:
        print(e)
        return '', ''  # return empty strings so the caller's tuple unpacking does not fail

def getnewsArticle(news):
    '''
    :param news: the selected paragraph elements of the article body
    :return newsArticle: list of paragraph texts from the article body
    '''
    newsArticle = []
    for p in news:
        newsArticle.append(p.text.strip())
    return newsArticle
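
The user_agents module imported above is not included in the post; as the postscript notes, any User-Agent strings copied from the web will do. A minimal stand-in might look like this:

# user_agents.py - a minimal stand-in; the actual list can be any set of common browser User-Agent strings
agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0',
]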

Postscript

The crawled content turned out reasonably OK, even though the whole thing looks tedious and painful. Also, it seems that if a news title contains the character " / ", the program throws an error saying the folder does not exist; maybe in a second version of the code I'll think about how to add some Chinese-character matching (?). As for the agents in the code, just grab a few from the web, no need to worry about them too much.
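One possible fix for a second version (a sketch only, not part of the current code): strip characters that are not allowed in file names before using the title as a file name.

import re

def safe_filename(title):
    # Replace characters that are illegal in file names (including "/") with an underscore
    return re.sub(r'[\\/:*?"<>|]', '_', title)

# In Get_content, the output path could then be built as:
# test = '' + safe_filename(title) + '.txt'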
This is my first write-up, so there are bound to be plenty of problems. If anyone reads this and spots them, please point them out. A grateful heart, thanks for being here.