Python 3 Web Scraping Practice: Crawling NetEase Tech Rolling News
By 阿新 · Published 2019-03-13
Background
Practice web scraping while finishing a homework assignment, using XPath to match the content that needs to be crawled; a minimal sketch of the XPath-based extraction follows the screenshots below.
The news page to crawl (screenshot)
The information to extract (screenshot)
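Before the full script, here is a minimal sketch of the XPath idea, assuming the rolling-news list page keeps the news-flow-content structure used in the implementation below (the simplified User-Agent string is just a placeholder):

import requests
from lxml import etree

# Quick check: fetch the first rolling-news page and list the article links
url = 'http://tech.163.com/special/gd2016'
headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get(url, headers=headers).content.decode('gbk')
selector = etree.HTML(html)
# Each list item keeps its link in an <a> inside the titleBar block
links = selector.xpath('//ul[@id="news-flow-content"]//li//div[@class="titleBar clearfix"]//h3//a/@href')
print(links[:5])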
Implementation
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/3/13 13:08
# @Author : cunyu
# @Site : cunyu1943.github.io
# @File : NetaseNewsSpider.py
# @Software: PyCharm

import requests
from lxml import etree
import xlwt

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}


# Get the list of news detail-page URLs contained in one list page
def getNewsDetailUrlList(url):
    """
    :param url: URL of one list page
    :return newsDetailList: detail-page URLs found on that page
    """
    response = requests.get(url, headers=headers)
    html = response.content.decode('gbk')
    selector = etree.HTML(html)
    newsDetailList = selector.xpath('//ul[@id="news-flow-content"]//li//div[@class="titleBar clearfix"]//h3//a/@href')
    return newsDetailList


# Get the news title
def getNewsTitle(detailUrl):
    """
    :param detailUrl: news detail-page URL
    :return newsTitle: news title
    """
    response = requests.get(detailUrl, headers=headers)
    html = response.content.decode('gbk')
    selector = etree.HTML(html)
    newsTitle = selector.xpath('//div[@class="post_content_main"]//h1/text()')
    return newsTitle


# Get the news body text
def getNewsContent(detailUrl):
    """
    :param detailUrl: news detail-page URL
    :return newsContent: news body paragraphs
    """
    response = requests.get(detailUrl, headers=headers)
    html = response.content.decode('gbk')
    selector = etree.HTML(html)
    newsContent = selector.xpath('//div[@class="post_text"]//p/text()')
    return newsContent


# Write the news titles and contents to a file  TODO


# Build the list of paginated list-page URLs
def getUrlList(baseUrl, num):
    """
    :param baseUrl: base URL
    :param num: number of pages to crawl
    :return urlList: list of list-page URLs
    """
    urlList = []
    urlList.append(baseUrl)
    for i in range(2, num + 1):
        urlList.append(baseUrl + "_" + str(i).zfill(2))
    return urlList


if __name__ == '__main__':
    baseUrl = "http://tech.163.com/special/gd2016"
    num = int(input('Number of pages to crawl: '))
    urlList = getUrlList(baseUrl, num)
    print(urlList)

    detailUrl = []
    for url in urlList:
        for i in getNewsDetailUrlList(url):
            detailUrl.append(i)
    print(detailUrl)

    print(getNewsTitle(detailUrl[0]))
    print(getNewsContent(detailUrl[0]))

    # Save the scraped text to a plain-text file
    # with open('news.txt', 'w', encoding='utf-8') as f:
    #     for i in detailUrl:
    #         f.write(''.join(getNewsTitle(i)))
    #         f.write('\n')
    #         f.write(''.join(getNewsContent(i)))
    #         f.write('\n')
    #     print('File written successfully')

    # Save the scraped text to an Excel file
    # Create an Excel workbook
    workbook = xlwt.Workbook(encoding='utf-8')
    news_sheet = workbook.add_sheet('news')
    news_sheet.write(0, 0, 'Title')
    news_sheet.write(0, 1, 'Content')
    for i in range(len(detailUrl)):
        # print(detailUrl[i])
        # xpath() returns a list, so join it into a string before writing the cell
        news_sheet.write(i + 1, 0, ''.join(getNewsTitle(detailUrl[i])))
        news_sheet.write(i + 1, 1, ''.join(getNewsContent(detailUrl[i])))
    # Save the workbook to the specified Excel file
    workbook.save('網易新聞.xls')
    print('File written successfully')
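One refinement worth noting: in the loop that fills the spreadsheet, every detail URL is requested twice (once for the title, once for the body). A hedged sketch of a combined helper, assuming the same page structure, could look like this; getNewsDetail is a new name introduced here for illustration, not part of the original script:

# Combined helper (hypothetical): fetch each detail page once and return both
# title and body, halving the number of requests. Reuses requests, etree and
# headers already defined in the script above.
def getNewsDetail(detailUrl):
    response = requests.get(detailUrl, headers=headers)
    html = response.content.decode('gbk')
    selector = etree.HTML(html)
    title = ''.join(selector.xpath('//div[@class="post_content_main"]//h1/text()'))
    content = ''.join(selector.xpath('//div[@class="post_text"]//p/text()'))
    return title, content

# Usage inside the writing loop:
# title, content = getNewsDetail(detailUrl[i])
# news_sheet.write(i + 1, 0, title)
# news_sheet.write(i + 1, 1, content)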
Results
Code execution result (screenshot)
The saved file (screenshot)
Summary
Overall this was fairly simple. The code still has room for improvement and will be updated later; anyone with other ideas is welcome to share them.
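For instance, one improvement I have in mind is making the fetching step more defensive, since some pages may time out or contain the odd byte that is not valid GBK. A minimal sketch under that assumption (fetchHtml is a hypothetical name, not part of the script above):

# Defensive fetch helper (sketch): retries on network errors and ignores
# stray bytes that cannot be decoded as GBK.
import requests

def fetchHtml(url, headers, retries=3):
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            # errors='ignore' drops the occasional byte that is not valid GBK
            return response.content.decode('gbk', errors='ignore')
        except requests.RequestException:
            if attempt == retries - 1:
                raise
    return ''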