Sharing a few small Python crawlers for your amusement (People's Daily top news --- to be continued)
阿新 · Published: 2019-02-02
-1- Scraping the People's Daily top-news section
Notes:
Packages used: lxml, requests, urllib2
Start URL: the People's Daily homepage (http://www.people.com.cn/)
Scrape target: People's Daily top news (要聞)
- article link
- article title
- article time
- article source
- article body
Output format: an HTML table file
Approach: first collect all the links to be scraped from the homepage, then crawl them one by one
Implementation code:
#-*-coding:utf8-*-
# This code is not written particularly well and many parts could be improved; experts, please go easy on me ^-^
import requests
import urllib2
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool  # only needed for the threaded variant

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'

def htmls(url):
    # Fetch an article page with urllib2 and return an lxml selector (or None on HTTP error)
    url = url.replace(" ", "")  # some hrefs contain stray spaces
    request = urllib2.Request(url, headers={'User-Agent': USER_AGENT})
    try:
        response = urllib2.urlopen(request)
        html = response.read()
        #html = unicode(html, "gb2312").encode("utf-8").decode('utf-8')
        selector = etree.HTML(html)
        return selector
    except urllib2.HTTPError:
        return None

def firsthtml(url):
    # Fetch the homepage with requests and return an lxml selector
    header = {'User-Agent': USER_AGENT}
    html = requests.get(url, headers=header)
    selector = etree.HTML(html.text)
    return selector

def urls(url):
    # Collect the article links from the top-news column of the homepage
    selector = firsthtml(url)
    content_field1 = selector.xpath('/html/body/section[5]/div[2]/ul/li/strong/a')
    content_field2 = selector.xpath('/html/body/section[5]/div[2]/ul/li/a')
    content = content_field1 + content_field2
    urlss = []
    for node in content:
        urlss.append(node.attrib['href'])
    return urlss

def spider(url):  # handle a single article URL
    print 'Processing link ' + str(num) + ': ', url
    selector = htmls(url)
    if selector is None:
        print 'Link not found -_-'
        return
    temp = {}
    try:
        # layout of an ordinary article page
        title_path = selector.xpath('/html/body/div[4]/h1')
        content_path = selector.xpath('//*[@id="rwb_zw"]/p')
        time_path = selector.xpath('/html/body/div[4]/div/div[1]')
        source_path = selector.xpath('/html/body/div[4]/div/div[1]/a')
        temp['time'] = time_path[0].text[0:19]
        temp['source'] = source_path[0].text
        temp['title'] = title_path[0].text
    except:
        # fall back to the layout of a picture-gallery page
        title_path = selector.xpath('/html/body/div[@class="pic_content clearfix"]/div[@class="title"]/h1')
        content_path = selector.xpath('/html/body/div[@class="content clear clearfix"]/p')
        source_path = selector.xpath('//*[@id="picG"]/div[2]/div[2]/a')
        time_path = selector.xpath('//*[@id="picG"]/div[2]/div[2]/text()[2]')
        try:
            temp['time'] = time_path[0][0:23]
            temp['source'] = source_path[0].text
            temp['title'] = title_path[0].text
        except:
            print 'Failed to scrape this link -_-'
            return
    scontent = ''
    for para in content_path:
        if para.text:  # skip paragraphs whose text is None
            scontent = scontent + para.text
    temp['content'] = scontent
    temp['url'] = url
    articles.append(temp)
    print 'Successfully scraped this link ^.^'

def tohtml(datas):
    # Dump the scraped articles into a simple HTML table
    fout = open('content.html', 'w')
    fout.write("<html>")
    fout.write("<head>")
    fout.write("<meta charset=utf-8>")
    fout.write("<title>人民日報要聞</title>")
    fout.write("<style type='text/css'>table{border-collapse: collapse;}table td{border:1px solid black;}</style>")
    fout.write("</head>")
    fout.write("<body>")
    fout.write("<table>")
    for data in datas:
        fout.write("<tr>")
        fout.write("<td>%s</td>" % data['url'])
        fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
        fout.write("<td>%s</td>" % data['time'].encode('utf-8'))
        fout.write("<td>%s</td>" % data['source'].encode('utf-8'))
        fout.write("<td>%s</td>" % data['content'].encode('utf-8'))
        fout.write("</tr>")
    fout.write("</table>")
    fout.write("</body>")
    fout.write("</html>")
    fout.close()

if __name__ == '__main__':
    num = 1
    articles = []
    urlss = urls('http://www.people.com.cn/')
    # pool = ThreadPool(4)
    for x in urlss:
        spider(x)
        num = num + 1
    # results = pool.map(spider, urlss)
    tohtml(articles)
    # pool.close()
    # pool.join()
# I originally wanted to make this multithreaded but was too lazy to write it; if you are interested, give it a try, it is not hard ^-^
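For anyone who does want to try it, here is a minimal sketch of the threaded variant hinted at by the commented-out Pool lines, assuming the urls(), spider() and tohtml() functions above are left unchanged. In CPython, appending to the shared articles list from worker threads is safe thanks to the GIL, though the num progress counter is no longer advanced per link:

# minimal sketch of the threaded variant, reusing urls()/spider()/tohtml() from above
from multiprocessing.dummy import Pool as ThreadPool

if __name__ == '__main__':
    num = 1            # spider() still reads this global; it simply stays at 1 here
    articles = []      # appended to concurrently by the worker threads
    urlss = urls('http://www.people.com.cn/')
    pool = ThreadPool(4)         # 4 worker threads
    pool.map(spider, urlss)      # fetch and parse the links in parallel
    pool.close()
    pool.join()
    tohtml(articles)

With only the handful of links on the front page the speedup is modest, but the same pattern scales to longer link lists.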
Run results:
The number of articles scraped depends on how many items the top-news column on the people.com.cn (人民網) homepage contains.
A successful run produces a content.html file,
which can be opened directly in a browser.
I am a tiny little divider line, and I am still the first one ^-^