1. 程式人生 > >Python3爬蟲實踐--網易科技滾動新聞爬取

Python3爬蟲實踐--網易科技滾動新聞爬取

開發十年,就只剩下這套架構體系了! >>>   

背景需求

完成作業的同時練習爬蟲,利用Xpath匹配出需要爬取的內容;

需要爬取的新聞介面

image

需要爬取的資訊

image

實現程式碼

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/3/13 13:08
# @Author  : cunyu
# @Site    : cunyu1943.github.io
# @File    : NetaseNewsSpider.py
# @Software: PyCharm

import requests
from lxml import etree
import xlwt


headers = {
	"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

# 根據url獲取剛網頁中的新聞詳情頁的網址列表
# Fetch one listing page and pull out the URLs of its article detail pages
def getNewsDetailUrlList(url):
	"""Return the article-detail URLs found on a single listing page.

	:param url: URL of one rolling-news listing page
	:return: list of article URLs (the ``href`` attributes of the headline links)
	"""
	page = requests.get(url, headers=headers)
	# The site serves GBK-encoded HTML, so decode the raw bytes explicitly.
	doc = etree.HTML(page.content.decode('gbk'))
	return doc.xpath('//ul[@id="news-flow-content"]//li//div[@class="titleBar clearfix"]//h3//a/@href')


# 獲取新聞標題
# Fetch an article page and extract its headline
def getNewsTitle(detailUrl):
	"""Return the article title text nodes for one detail page.

	:param detailUrl: URL of an article detail page
	:return: list of text fragments from the ``<h1>`` headline
	"""
	page = requests.get(detailUrl, headers=headers)
	# Pages are GBK-encoded; decode before handing the markup to lxml.
	doc = etree.HTML(page.content.decode('gbk'))
	return doc.xpath('//div[@class="post_content_main"]//h1/text()')


# 獲取新聞詳情內容
# Fetch an article page and extract its body paragraphs
def getNewsContent(detailUrl):
	"""Return the article body text for one detail page.

	:param detailUrl: URL of an article detail page
	:return: list of paragraph text fragments from the article body
	"""
	page = requests.get(detailUrl, headers=headers)
	# Pages are GBK-encoded; decode before handing the markup to lxml.
	doc = etree.HTML(page.content.decode('gbk'))
	return doc.xpath('//div[@class="post_text"]//p/text()')

# TODO: write the news title and content to a text file.
# (The bare, indented `TODO` token that stood here was not valid Python —
# it would raise an IndentationError when the script is run.)

# 獲取翻頁網址列表
# Build the list of paginated listing-page URLs
def getUrlList(baseUrl, num):
	"""Return the listing-page URLs for pages 1..num.

	:param baseUrl: URL of the first listing page
	:param num: how many pages to crawl
	:return: list of page URLs — page 1 is ``baseUrl`` itself, later pages
	         carry a zero-padded ``_NN`` suffix
	"""
	extraPages = ["{}_{}".format(baseUrl, str(page).zfill(2)) for page in range(2, num + 1)]
	return [baseUrl] + extraPages

if __name__ == '__main__':
	baseUrl = "http://tech.163.com/special/gd2016"
	num = int(input('輸入你要爬取的頁數: '))
	urlList = getUrlList(baseUrl, num)
	print(urlList)

	# Collect every article URL from all listing pages.
	detailUrl = []
	for url in urlList:
		detailUrl.extend(getNewsDetailUrlList(url))
	print(detailUrl)

	# Quick sanity check on the first article (guarded so an empty
	# scrape no longer raises IndexError).
	if detailUrl:
		print(getNewsTitle(detailUrl[0]))
		print(getNewsContent(detailUrl[0]))

	# Create the Excel workbook and a sheet for the scraped news.
	workbook = xlwt.Workbook(encoding='utf-8')
	news_sheet = workbook.add_sheet('news')
	news_sheet.write(0, 0, 'Title')
	news_sheet.write(0, 1, 'Content')

	for row, link in enumerate(detailUrl, start=1):
		# The xpath helpers return lists of text nodes, but xlwt can only
		# write scalar cell values — passing the raw list raises an
		# "Unexpected data type" exception. Join the fragments first.
		news_sheet.write(row, 0, ''.join(getNewsTitle(link)))
		news_sheet.write(row, 1, '\n'.join(getNewsContent(link)))

	# Persist the workbook to disk.
	workbook.save('網易新聞.xls')
	print('檔案寫入成功')

結果

  • 程式碼執行結果 image

  • 儲存的檔案

image

總結

總體來說比較簡單,程式碼也存在需要改進的地方,後續會改進更新,有其他想法的也歡迎在評論區交流討論。