1. 程式人生 > >分享幾個小小的python爬蟲供大家娛樂(人民日報要聞---to be continued )

分享幾個小小的python爬蟲供大家娛樂(人民日報要聞---to be continued )



使用包 : lxml,requests,urllib2
起始url :人民日報主頁(http://www.people.com.cn/)
爬取目標 :人民日報要聞

  1. 要聞連結
  2. 要聞標題
  3. 要聞時間
  4. 要聞來源
  5. 要聞內容

輸出格式: HTML表格檔案
思路 : 首先收集要爬取頁面的所有連結,之後逐個進行爬取


# -*- coding: utf-8 -*-
"""Scrape the headline news linked from the People's Daily home page.

Python 2 script.  It first collects every headline link found on the start
page, then fetches each linked article in turn and finally writes the url,
title, time, source and body text of every article that could be parsed
into an HTML table (``content.html``).
"""
from __future__ import print_function

import HTMLParser
import io
import urllib2
from multiprocessing.dummy import Pool as ThreadPool
from xml.sax.saxutils import escape

import requests
from lxml import etree

START_URL = 'http://www.people.com.cn/'
OUTPUT_FILE = 'content.html'
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')

# Order of the columns written for each article row.
_COLUMNS = ('url', 'title', 'time', 'source', 'content')

# What a layout parser raises when the page does not match it: an empty
# XPath result (IndexError) or a node without text (TypeError/AttributeError).
_PARSE_ERRORS = (IndexError, TypeError, AttributeError)


def htmls(url):
    """Download an article page with urllib2 and parse it.

    Args:
        url: Article address; embedded spaces are stripped before use.

    Returns:
        The lxml tree returned by ``etree.HTML``, or ``None`` when the page
        could not be fetched.
    """
    url = url.replace(" ", "")
    # The original built this User-Agent string but never attached it to the
    # request; it is now sent as a header.
    request = urllib2.Request(url, headers={'User-Agent': USER_AGENT})
    try:
        response = urllib2.urlopen(request)
    except urllib2.URLError:
        # URLError is the base class of HTTPError, so HTTP error statuses and
        # connection failures both skip this link instead of ending the crawl.
        return None
    try:
        html = response.read()
    finally:
        response.close()
    # The raw bytes are passed on undecoded; lxml is left to work out the
    # page encoding itself.
    return etree.HTML(html)


def firsthtml(url):
    """Download the start page with requests and return its lxml tree."""
    response = requests.get(url, headers={'User-Agent': USER_AGENT})
    return etree.HTML(response.text)


def urls(url):
    """Return the href of every headline anchor found on the start page.

    The two absolute XPaths are presumably tied to the home-page layout at
    the time of writing; anchors without an ``href`` attribute are skipped.
    """
    selector = firsthtml(url)
    anchors = (selector.xpath('/html/body/section[5]/div[2]/ul/li/strong/a') +
               selector.xpath('/html/body/section[5]/div[2]/ul/li/a'))
    return [a.attrib['href'] for a in anchors if 'href' in a.attrib]


def _parse_article(selector):
    """Extract the fields of the first page layout tried (plain article)."""
    title = selector.xpath('/html/body/div[4]/h1')
    paragraphs = selector.xpath('//*[@id="rwb_zw"]/p')
    stamp = selector.xpath('/html/body/div[4]/div/div[1]')
    source = selector.xpath('/html/body/div[4]/div/div[1]/a')
    record = {
        # Only the first 19 characters of the node text are kept as the time.
        'time': stamp[0].text[0:19],
        'source': source[0].text,
        'title': title[0].text,
    }
    return record, paragraphs


def _parse_gallery(selector):
    """Extract the fields of the fallback page layout (``picG`` pages)."""
    title = selector.xpath(
        '/html/body/div[@class="pic_content clearfix"]/div[@class="title"]/h1')
    paragraphs = selector.xpath(
        '/html/body/div[@class="content clear clearfix"]/p')
    source = selector.xpath('//*[@id="picG"]/div[2]/div[2]/a')
    stamp = selector.xpath('//*[@id="picG"]/div[2]/div[2]/text()[2]')
    record = {
        # text() yields a string here; its first 23 characters are kept.
        'time': stamp[0][0:23],
        'source': source[0].text,
        'title': title[0].text,
    }
    return record, paragraphs


def spider(url, index=None):
    """Fetch and parse a single article.

    Args:
        url: Address of the article page.
        index: Optional running number shown in the progress message.

    Returns:
        A dict with the keys ``time``, ``source``, ``title``, ``content`` and
        ``url``, or ``None`` when the page could not be fetched or matched
        neither of the two known layouts.
    """
    label = '' if index is None else str(index)
    print('正在處理銜接' + label + ": ", url)
    selector = htmls(url)
    if selector is None:
        print('該連結未找到 -_-')
        return None
    try:
        record, paragraphs = _parse_article(selector)
    except _PARSE_ERRORS:
        try:
            record, paragraphs = _parse_gallery(selector)
        except _PARSE_ERRORS:
            print('該連結爬取失敗 -_-')
            return None
    # A <p> that starts with a child element has text None; treat it as
    # empty instead of failing on the concatenation.
    record['content'] = ''.join([p.text or '' for p in paragraphs])
    record['url'] = url
    print("成功爬取該連結 ^.^")
    return record


def _cell(value):
    """Return one escaped ``<td>`` cell as unicode text."""
    if value is None:
        value = u''
    elif isinstance(value, bytes):
        # ASCII-only values arrive as Python 2 byte strings.
        value = value.decode('utf-8')
    # Scraped text is untrusted: escape &, < and > so it cannot break the table.
    return u"<td>%s</td>" % escape(value)


def tohtml(datas):
    """Write the scraped records to ``content.html`` as a UTF-8 HTML table.

    Args:
        datas: Iterable of dicts as returned by ``spider``.
    """
    parts = [
        u"<html>",
        u"<head>",
        u"<meta charset=utf-8>",
        u"<title>人民日報要聞</title>",
        u"<style type='text/css'>table{border-collapse: collapse;}"
        u"table td{border:1px solid black;}</style>",
        u"</head>",
        u"<body>",
        u"<table>",
    ]
    for data in datas:
        parts.append(u"<tr>")
        parts.extend([_cell(data[key]) for key in _COLUMNS])
        parts.append(u"</tr>")
    parts.extend([u"</table>", u"</body>", u"</html>"])
    with io.open(OUTPUT_FILE, 'w', encoding='utf-8') as fout:
        fout.write(u"".join(parts))


def main():
    """Crawl every headline link of the start page and write the report."""
    records = []
    # NOTE: the links are fetched one after another.  ThreadPool is imported
    # above and could be used to fetch them concurrently (the author left
    # that as an exercise for the reader).
    for index, link in enumerate(urls(START_URL), 1):
        record = spider(link, index)
        if record is not None:
            records.append(record)
    tohtml(records)


if __name__ == '__main__':
    main()





我是一條小小的分割線,我還是第一條 ^-^

to be continued 。。。時間不夠了-_-!