How a Python crawler parses JSON embedded in page markup with XPath
This post covers three techniques: concurrent crawling with the map method of multiprocessing.dummy's Pool (a thread pool, despite the module name), extracting page content from embedded JSON, and parsing the page with XPath. The example is this Tieba thread:
http://tieba.baidu.com/p/3522395718?pn=1
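The crawler at the end of this post distributes pages across a thread pool with Pool.map, which applies one function to every URL in a list concurrently. As a warm-up, here is that pattern on its own; fetch_status and the two sample pages are placeholders for illustration, not part of the final crawler:

from multiprocessing.dummy import Pool as ThreadPool  # a thread pool, despite the module name
import requests

def fetch_status(url):
    # Placeholder worker: fetch a page and return its HTTP status code.
    return requests.get(url).status_code

pages = ['http://tieba.baidu.com/p/3522395718?pn=' + str(i) for i in (1, 2)]
pool = ThreadPool(4)                      # four worker threads
statuses = pool.map(fetch_status, pages)  # one call per URL, run concurrently
pool.close()
pool.join()
print(statuses)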
The page markup (note that the JSON inside data-field is escaped with &quot; entities):
<div class="l_post j_l_post l_post_bright " data-field="{"author":{"user_id":503570759,"user_name":"\u9893\u5e9f\u4e86\u8c01\u7684\u6e05\u7eaf","name_u":"%E9%A2%93%E5%BA%9F%E4%BA%86%E8%B0%81%E7%9A%84%E6%B8%85%E7%BA%AF&ie=utf-8","user_sex":2,"portrait":"47e1e9a293e5ba9fe4ba86e8b081e79a84e6b885e7baaf031e","is_like":1,"level_id":14,"level_name":"\u4f20\u5947\u679c\u7c89","cur_score":20947,"bawu":0,"props":null},"content":{"post_id":62866847607,"is_anonym":false,"open_id":"tbclient","open_type":"apple","date":"2015-01-11
16:39","vote_crypt":"","post_no":6,"type":"0","comment_num":123,"ptype":"0","is_saveface":false,"props":null,"post_index":4,"pb_tpoint":null}}">
The crawler code:
def spider(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)
    # Class attribute copied verbatim from the page markup above.
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright "]')
    for each in content_field:
        # lxml has already decoded &quot; to ", so the attribute value is valid JSON;
        # the replace() only strips any leftover entity text.
        reply_info = json.loads(each.xpath('@data-field')[0].replace('&quot;',''))
        author = reply_info['author']['user_name']
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content "]/text()')[0]
        reply_time = reply_info['content']['date']
        item = {}
        item['user_name'] = author
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_time
        towrite(item)
For pages that ship their data as embedded JSON like this, json.loads turns the attribute string into Python objects. lxml decodes the &quot; entities when it reads the attribute, so by the time the string reaches json.loads it already contains plain quotes; the replace call only strips any leftover entity text:
reply_info = json.loads(each.xpath('@data-field')[0].replace('&quot;',''))
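To see the extraction in isolation, the sketch below parses a stripped-down version of the markup shown earlier; the single author key is a placeholder standing in for the full data-field payload:

import json
from lxml import etree

# Stripped-down stand-in for the real div; note the &quot; entities.
html = '<div class="l_post j_l_post l_post_bright " data-field="{&quot;author&quot;:{&quot;user_name&quot;:&quot;demo&quot;}}"></div>'
selector = etree.HTML(html)
raw = selector.xpath('//div/@data-field')[0]
print(raw)                 # {"author":{"user_name":"demo"}} -- entities already decoded by lxml
reply_info = json.loads(raw)
print(reply_info['author']['user_name'])   # demo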
The data-field value also nests dictionaries inside dictionaries; to reach an inner value, index one level at a time:
author = reply_info['author']['user_name']
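Since json.loads returns a dictionary whose values are themselves dictionaries, chained subscripts walk down one level per bracket. A small sketch with values taken from the data-field above:

reply_info = {
    'author': {'user_id': 503570759, 'user_name': '\u9893\u5e9f\u4e86\u8c01\u7684\u6e05\u7eaf'},
    'content': {'date': '2015-01-11 16:39', 'post_no': 6},
}
print(reply_info['author']['user_name'])  # outer key 'author', then inner key 'user_name'
print(reply_info['content']['date'])      # 2015-01-11 16:39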
The complete code for crawling each reply's user_name, content, and post time from the thread follows:
# -*- coding: utf-8 -*-
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import json

'''Delete content.txt before re-running: the file is opened in append mode, so old results pile up.'''

def towrite(contentdict):
    f.write('Reply time: ' + str(contentdict['topic_reply_time']) + '\n')
    f.write('Reply content: ' + contentdict['topic_reply_content'] + '\n')
    f.write('Replier: ' + contentdict['user_name'] + '\n\n')

def spider(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)
    # Class attribute copied verbatim from the page markup.
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright "]')
    for each in content_field:
        reply_info = json.loads(each.xpath('@data-field')[0].replace('&quot;',''))
        author = reply_info['author']['user_name']
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content "]/text()')[0]
        reply_time = reply_info['content']['date']
        item = {}
        item['user_name'] = author
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_time
        towrite(item)

if __name__ == '__main__':
    pool = ThreadPool(4)
    f = open('content.txt', 'a', encoding='utf-8')
    page = []
    for i in range(1, 22):
        newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
        page.append(newpage)
    results = pool.map(spider, page)
    pool.close()
    pool.join()
    f.close()
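One caveat about this design: all four worker threads call towrite() on the same shared file handle f. Buffering usually keeps whole lines intact, but interleaved records are possible. Below is a minimal sketch of a lock-protected towrite; the threading.Lock is an addition for illustration, not part of the original code:

import threading

write_lock = threading.Lock()  # added: serializes writes from the worker threads

def towrite(contentdict):
    # Hold the lock so one reply's three lines are written as a unit.
    with write_lock:
        f.write('Reply time: ' + str(contentdict['topic_reply_time']) + '\n')
        f.write('Reply content: ' + contentdict['topic_reply_content'] + '\n')
        f.write('Replier: ' + contentdict['user_name'] + '\n\n')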