
Scraping Sina Rolling News with Webdriver

Initial Thoughts

I'm currently a 2016-intake undergraduate at the University of International Relations, majoring in Information Management and Information Systems. Honestly, I'm still not sure what this major is really about; the follow-up direction I've chosen is Data Science and Engineering, and I'm in my advisor's natural language processing group. Crawling is the groundwork for NLP, so before diving into machine learning it seemed worthwhile to first learn how to scrape content. My original plan was to crawl Sina Weibo, but that involves fiddly problems like infinite scrolling and account login, so I decided to play with the rolling news first. To be honest, China News Service's rolling news page is much better made than Sina's, and its interface looks nicer too, but I only discovered that after the crawl was done, hahaha.

Background

This project crawls web pages from Sina rolling news (https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1), saving each news item as a file whose name is the news title and whose body is the news content.
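As a minimal sketch of that storage convention (the function name and output directory here are purely illustrative; the real logic lives in spider's Get_content below):

# Illustrative only: one news item is written to <title>.txt, with the article text as the file body
def save_news(title, content, out_dir='.'):
    path = out_dir + '/' + title + '.txt'     # the file is named after the news title
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)                      # the file body is the news content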

Project Details

Environment Requirements

  • Environment: Python 3.7+
  • Required packages: time, requests, random, BeautifulSoup (bs4), selenium, etc.; jieba is also imported by data_helper (a quick import check is sketched below)
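The following snippet is an optional sanity check (not part of the project files) that the third-party packages import correctly; time and random are in the standard library and need no install.

import importlib

# bs4 provides BeautifulSoup; jieba is only needed for data_helper's word segmentation
for pkg in ("requests", "bs4", "selenium", "jieba"):
    try:
        importlib.import_module(pkg)
        print(pkg, "OK")
    except ImportError:
        print(pkg, "missing - install it, e.g. with: pip install " + pkg)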

File Descriptions

  • main: calls the relevant functions from all the other files, feeds in the initial URL, and measures how long crawling all the pages takes
  • date_helper: adjusts the date in the page URL so the crawler moves through the listings automatically, one day at a time
  • data_helper: handles all of the data paths (read/write helper functions)
  • spider: the main crawling file; uses Webdriver to open the sub-pages of the index page and fetch their content
  • article_spider: fetches the body text of each news article

Code

main

from date_helper import date_processing
from data_helper import pickle_writer
from spider import *
import time

start = time.perf_counter()  # the original used time.clock(), which was removed in Python 3.8

if __name__ == '__main__':
    url_org = 'http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&date={}&ch=01&k=&offset_page=0&offset_num=0&num=60&asc=&page='
    while True:
        date = date_processing()                # get the date to crawl
        output_list = []                        # list that collects the output
        url = url_org.format(date)              # build the URL for that date
        sina(url, output_list, date)            # run the spider
        print(output_list)
        print(len(output_list))
        file_name = ''.format(date)             # output path (left empty in the original post)
        pickle_writer(output_list, file_name)   # dump to a temporary file
        end = time.perf_counter()
        print('Running:%s seconds.' % (end - start))

date_helper

import re
import calendar
import codecs
from data_helper import *

def count_days(year, month):
    # calendar.monthrange(year, month) returns (weekday of the 1st, number of days in the month);
    # the second number pulled out by the regex below is the month length
    cal = calendar.monthrange(year, month)
    pattern = re.compile(r'\d+')
    days = pattern.findall(str(cal))[1]
    return days

# Step back one month: single-digit results are zero-padded, and January rolls back to December of the previous year
def month_sub(year,month):
    if month > 10:
        month -= 1
        month = str(month)
    elif month <= 10 and month > 1 :
        month -= 1
        month = '0'+str(month)
    else:
        year -= 1
        month = 12
    return year,month

# Step back one day: the 1st of a month rolls back to the last day of the previous month
def date_sub(year,month,day):
    if day > 10:
        day -= 1
        day = str(day)
    elif day <= 10 and day > 1:
        day -= 1
        day = '0'+str(day)
    else:
        year, month = month_sub(int(year),int(month))
        days = count_days(year, int(month))
        day = days
    date = str(year)+'-'+str(month) +'-'+str(day)  # date format used in the Sina rolling-news URL
    return date

def date_processing():
    # Read the last crawled date from the date file, step it back one day, and write the new date back
    date_txt = ""  # path of the date file (left empty in the original post)
    last_date = txt_load(date_txt)
    date = str(last_date[0])
    year = int(date.split("-")[0])
    month = date.split("-")[1]
    day = int(date.split("-")[2])
    date = date_sub(year, month, day)
    writer = codecs.open(date_txt,'w','UTF-8')
    writer.write(date)
    writer.flush()
    return date
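
A quick illustration of the rollback behaviour (illustrative only, assuming the modules above are on the import path): stepping back from the first day of a month lands on the last day of the previous month.

from date_helper import date_sub

print(date_sub(2019, '03', 1))   # -> 2019-02-28: the month is decremented and the day becomes that month's last day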

data_helper

import re
import pickle
import codecs
import jieba
'''
Read the raw data
'''
def txt_load(path):
    reader = codecs.open(path,'r','UTF-8')
    lines = reader.readlines()
    return lines

def join_list(ss):
    c = ""
    for k in ss:
        c+=k
    return c

def pickle_writer(input_,name):
    '''
    :param input_: data to be saved
    :param name:  output path
    '''
    writer = open(name,"wb")
    pickle.dump(input_,writer)
    writer.close()
    print("finish to write data")

# helper function for reading a pkl file back
def pickle_load(input_):
    '''
    :param input_: path of the pickle file
    :return:  the original data
    '''
    reader = open(input_,"rb")
    content = pickle.load(reader)
    reader.close()
    print("finish to read data")
    return content

def jieba_cut(content):
    '''
    :param content: str, the sentence to be segmented
    :return: list of tokens
    '''
    cut = jieba.cut(content)
    l = []
    for con in cut:
        if con!=" ":
            l.append(con)
    return l

def is_chinese(uchar):
    """判斷一個unicode是否是漢字"""
    if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
        return uchar
    elif uchar == re.sub('[^a-zA-Z]', '', uchar):
        return str(uchar).lower()
    else:
        return ''
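
A minimal round-trip example for the two pickle helpers (the file name is just a placeholder):

from data_helper import pickle_writer, pickle_load

records = [['category_cn', 'category_en', '2019-03-01', '10:00', 'title', 'url', 'content']]
pickle_writer(records, 'demo.pkl')   # serialise the list to disk
restored = pickle_load('demo.pkl')   # read it back
print(restored == records)           # True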

spider

# -*- coding: utf-8 -*-

from selenium import webdriver
from article_spider import *
import re
import time  # used for the sleeps below (previously pulled in only via the star import)

def get_pages(driver,url):
    '''
    :param driver: Webdriver instance
    :param url: link for the specified date
    :return page_num: number of pages for that date
    '''
    start_url = url + '1'
    driver.get(start_url)
    time.sleep(2)
    driver.refresh()
    time.sleep(2)
    page_html = driver.page_source
    pagelist = re.findall('onclick="newsList.page.goTo(.*?);return false', page_html, re.S)
    pattern = re.compile(r'\d+')  # extract the page count
    page_num = pattern.findall(pagelist[len(pagelist)-1])[0]
    return (page_num)

def Get_content(driver,page_num,url,output_list,date):
    '''
    :param driver: Webdriver instance
    :param page_num: number of pages for the specified date
    :param url: link for the specified date
    :param output_list: output list
    :param date: the specified date
    '''
    k = 1
    while k <= int(page_num):
        driver.get(url + str(k))
        time.sleep(2.5)
        driver.refresh()
        for i in range(1, 11):
            for j in range(1, 6):
                classfy_cn = driver.find_element_by_xpath(
                    '//*[@id="d_list"]/ul[' + str(i) + ']/li[' + str(j) + ']/span[1]').text
                title = driver.find_element_by_xpath(
                    '//*[@id="d_list"]/ul[' + str(i) + ']/li[' + str(j) + ']/span[2]/a').text
                href = driver.find_element_by_xpath(
                    '//*[@id="d_list"]/ul[' + str(i) + ']/li[' + str(j) + ']/span[2]/a').get_attribute('href')
                times = driver.find_element_by_xpath(
                    '//*[@id="d_list"]/ul[' + str(i) + ']/li[' + str(j) + ']/span[3]').text
                pubtime = times.split(" ")[1]
                content, classfy_en = get_article(href)
                content_list = [classfy_cn, classfy_en, date, pubtime, title, href, content]
                test = '' + title + '.txt'  # output path; the directory prefix was left empty in the original post
                with open(test, 'w', encoding='utf-8') as f:
                    for content_list_element in content_list:
                        f.write(content_list_element)
                output_list.append(content_list)
                print(len(output_list))
        k = k + 1

def sina(url,output_list,date):
    '''
    :param url: URL to crawl
    :param output_list: output list
    :param date: date
    :return:
    '''
    driver = webdriver.Chrome()
    page_num = get_pages(driver, url)
    Get_content(driver, page_num, url, output_list, date)
    driver.close()

article_spider

#-*- coding:utf-8 -*-

from bs4 import BeautifulSoup
from user_agents import agents
import requests
import time
import random

def get_article(url):
    '''
    :param url: link of a single news article
    :return content: the article text
    :return classfy: the article's category (the sub-domain of the URL)
    '''
    try:
        classfy = url.split('.')[0].split('//')[1]
        agent = random.choice(agents)
        header = {'User-Agent': agent}
        res = requests.get(url.rsplit('\r\n')[0], headers=header)
        time.sleep(1)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        newsArticle = getnewsArticle(soup.select('.art_p'))
        content = ''
        for con in newsArticle:
            content = content + con
        return content, classfy
    except Exception as e:
        print(e)
        return '', ''  # return empty strings so the caller's tuple unpacking does not fail

def getnewsArticle(news):
    '''
    :param news: the selected paragraph elements of the article body
    :return newsArticle: list of paragraph texts from the article body
    '''
    newsArticle = []
    for p in news:
        newsArticle.append(p.text.strip())
    return newsArticle
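
The user_agents module imported above is not included in the post; as the postscript notes, any User-Agent strings copied from the web will do. A minimal stand-in might look like this:

# user_agents.py - a minimal stand-in; the actual list can be any set of common browser User-Agent strings
agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0',
]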

Postscript

The crawled content turned out reasonably OK, even though the whole thing looks tedious and painful. Also, it seems that if a news title contains the character " / ", the program throws an error saying the folder does not exist; maybe in a second version of the code I'll think about how to add some Chinese-character matching (?). As for the agents in the code, just grab a few from the web, no need to worry about them too much.
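One possible fix for a second version (a sketch only, not part of the current code): strip characters that are not allowed in file names before using the title as a file name.

import re

def safe_filename(title):
    # Replace characters that are illegal in file names (including "/") with an underscore
    return re.sub(r'[\\/:*?"<>|]', '_', title)

# In Get_content, the output path could then be built as:
# test = '' + safe_filename(title) + '.txt'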
This is my first write-up, so there are bound to be plenty of problems. If anyone reads this and spots them, please point them out. A grateful heart, thanks for being here.