爬蟲Spider--爬取京東某產品的評價
阿新 • • 發佈:2018-12-26
本篇部落格提供了4種方式,有簡略版僅能完成要求卻簡陋,也有較為完整的方式
1.
# -*- coding:utf-8 -*-
"""JD.com comment scraper (version 1): regex-scrapes one page of reviews.

Ported from Python 2 (urllib2, print statements, the reload(sys)/
setdefaultencoding hack) to Python 3, which handles Unicode natively.
"""
import re
import json
import urllib.request


class JDSpider:
    """Fetch one page of JD.com product reviews and append them to a text file."""

    # JSONP endpoint for product 100001906474, page 1, 10 comments per page.
    COMMENTS_URL = (
        "https://sclub.jd.com/comment/productPageComments.action"
        "?callback=fetchJSON_comment98vv566&productId=100001906474"
        "&score=0&sortType=5&page=1&pageSize=10&isShadowSku=0&rid=0&fold=1"
    )
    OUTPUT_PATH = "d:/124/jindong1.txt"

    def loadPage(self):
        """Download the comment page, extract (content, nickname) pairs, save them.

        Returns the list of (content, nickname) tuples that were written.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/63.0.3239.132 Safari/537.36"
        }
        req = urllib.request.Request(self.COMMENTS_URL, headers=headers)
        # The endpoint serves GBK-encoded text; decode before regex matching
        # (the original matched against raw bytes, a Python 2 leftover).
        with urllib.request.urlopen(req) as response:
            html = response.read().decode("gbk", errors="replace")
        item_list = self._parse_comments(html)
        print(item_list)
        for item in item_list:
            self.writetoFile(item)
        return item_list

    @staticmethod
    def _parse_comments(html):
        """Extract (content, nickname) tuples from the raw JSONP response text."""
        # NOTE(review): regex-scraping JSON is fragile; json.loads on the
        # unwrapped payload (as in versions 2/3 of this post) is more robust.
        pattern = re.compile(r'"content":"(.*?)".*?"nickname":"(.*?)"')
        return pattern.findall(html)

    def writetoFile(self, test):
        """Append one scraped item to OUTPUT_PATH, followed by a separator line."""
        # 'with' closes the file; the original's explicit close() was redundant,
        # and Python 3's json.dump no longer takes an 'encoding' argument.
        with open(self.OUTPUT_PATH, "a+", encoding="utf-8") as my_file:
            json.dump(test, my_file, ensure_ascii=False)
            my_file.write("\n-------------------------------------\n")


if __name__ == "__main__":
    # ====================== JD.com comment scraper ======================
    mySpider = JDSpider()
    mySpider.loadPage()
2.
# -*- coding:utf-8 -*-
"""JD.com comment scraper (version 2): JSON-parses ten pages of reviews.

Ported from Python 2 (urllib2, print statements, the reload(sys)/
setdefaultencoding hack) to Python 3.
"""
import json
import urllib.request


class Comment:
    """Fetch JD.com review pages 0-9 and record content / nickname / time."""

    # Base URL; the 0-based page number is appended after '&page='.
    # BUG FIX: the original appended str(i) after '&fold=1', so the 'page'
    # parameter was stuck at 0 and all ten iterations fetched the same page.
    BASE_URL = (
        'https://sclub.jd.com/comment/productPageComments.action'
        '?callback=fetchJSON_comment98vv562&productId=100001906474'
        '&score=0&sortType=5&pageSize=10&isShadowSku=0&fold=1&page='
    )

    def Commets(self):
        """Crawl pages 0-9, echo each payload, and write fields to 02.txt / jd.txt."""
        # 'with' guarantees 02.txt is closed (the original leaked the handle).
        with open('02.txt', 'w', encoding='utf-8') as f:
            for page in range(10):
                request = urllib.request.Request(self.BASE_URL + str(page))
                with urllib.request.urlopen(request) as response:
                    html = response.read().decode('GBK')
                payload = self._strip_jsonp(html)
                print(payload)
                for comment in payload['comments']:
                    self.writeToFile(comment["content"])
                    f.write(comment["content"] + '\n')
                    self.writeToFile(comment["nickname"])
                    f.write(comment["nickname"] + '\n')
                    self.writeToFile(comment["referenceTime"] + '\n\n')

    @staticmethod
    def _strip_jsonp(html):
        """Remove the JSONP wrapper 'fetchJSON_comment98vv562(...);' and parse the JSON."""
        html = html.replace('fetchJSON_comment98vv562(', '')
        html = html.replace(');', '')
        return json.loads(html)

    def writeToFile(self, text):
        """Append *text* plus a separator line to jd.txt."""
        # 'with' closes the file; the original's explicit close() was redundant.
        with open("d:/124/jd.txt", 'a', encoding='utf-8') as my_file:
            my_file.write(text)
            my_file.write("\n-----------------------------------------------")


if __name__ == '__main__':
    Comment = Comment()
    Comment.Commets()
3.
# - * - coding: UTF-8 - * - import urllib2 import json import sys reload(sys) sys.setdefaultencoding('utf8') f = open('01.txt', 'w') for i in range(0, 10): url = 'https://sclub.jd.com/comment/productPageComments.action?cal lback=fetchJSON_comment98vv562&productId=100001906474&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1' + str(i) + '&pageSize=10&isShadowSku=0&fold=1' # 實現爬多頁 print url request = urllib2.Request(url) response = urllib2.urlopen(request) html = response.read().decode('GBK') html = html.replace('fetchJSON_comment98vv562(', '') html = html.replace(');', '') # 去掉多餘的字元 b = json.loads(html) for k in b['comments']: content = k["content"].encode('utf-8') print content f.write(k["content"].encode('utf-8') + '\n') referenceName = k["referenceName"].encode('utf-8') print referenceName f.write(k["referenceName"].encode('utf-8') + '\n') referenceTime = k["referenceTime"].encode('utf-8') print referenceTime f.write(k["referenceTime"].encode('utf-8') + '\n\n')