Pytorch-中文文字分類
阿新 • • 發佈:2020-08-20
1. 爬蟲
JD.py
import requests
from urllib.parse import quote, urlencode
from lxml import etree
import logging
import json
import time


class JDSpider:
    """Scrape JD.com product reviews for one product category.

    Construct with a category keyword (e.g. "手機"); the constructor fetches
    the JD search page and collects product ids. Call ``getData`` to crawl
    reviews of one sentiment and append them to a tab-separated file under
    ./data/ (the directory must already exist).
    """

    def __init__(self, categlory):
        # JD search page for the (URL-encoded) category keyword.
        self.startUrl = "https://search.jd.com/Search?keyword=%s&enc=utf-8" % (quote(categlory))
        # AJAX endpoint that serves paginated review data as JSON.
        self.commentBaseUrl = "https://club.jd.com/comment/productPageComments.action?"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        # Product ids scraped from the search results page.
        self.productsId = self.getId()
        # Internal sentiment code -> label used in the output file name.
        # (Typo fix: "nagetive" -> "negative".)
        self.comtype = {0: "negative", 1: "medium", 2: "positive"}
        self.categlory = categlory

    def getParamUrl(self, productid, page, score):
        """Build the query parameters and full URL for one review page.

        ``score`` is the JD API sentiment code: 1 = negative, 2 = medium,
        3 = positive. Every parameter below is required — omitting any of
        them makes JD detect the crawler and return no data.
        """
        params = {
            "productId": "%s" % (productid),
            "score": "%s" % (score),
            "page": "%s" % (page),
            "sortType": "5",
            "pageSize": "10",
            "isShadowSku": "0",
            "rid": "0",
            "fold": "1"
        }
        url = self.commentBaseUrl + urlencode(params)
        return params, url

    def getHeaders(self, productid):
        """Per-product headers: unlike ``self.headers``, these carry a
        Referer pointing at the product page, which JD expects."""
        header = {
            "Referer": "https://item.jd.com/%s.html" % (productid),
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        return header

    def getId(self):
        """Fetch the search page and return the list of product ids
        (``data-sku`` attributes). Returns [] on a non-200 response
        instead of parsing the error page (bug fix)."""
        response = requests.get(self.startUrl, headers=self.headers)
        if response.status_code != 200:
            logging.warning("狀態碼錯誤,爬蟲異常!")
            return []
        html = etree.HTML(response.text)
        return html.xpath('//li[@class="gl-item"]/@data-sku')

    def getData(self, maxPage, score):
        """Crawl up to ``maxPage - 1`` review pages (10 reviews each) for
        every collected product and append them to a TSV file.

        score: internal sentiment code — 0 negative, 1 medium, 2 positive.
        Negative and positive reviews usually have very different page
        counts (positive >> negative > medium); the loop stops early for a
        product once a page comes back empty.
        """
        # Bug fix: JD's API encodes sentiment as 1/2/3 (neg/med/pos) while
        # this class uses 0/1/2. The raw internal code used to be sent, so
        # e.g. "positive" actually fetched medium reviews. Shift by one.
        api_score = score + 1
        comments = []
        scores = []
        for j, pid in enumerate(self.productsId):
            header = self.getHeaders(pid)
            for page in range(1, maxPage):
                param, url = self.getParamUrl(pid, page, api_score)
                print(">>>>>>>>>>>>>>>>第:%d 個,第 %d 頁" % (j, page))
                try:
                    response = requests.get(url, headers=header, params=param)
                except Exception as e:
                    # Network failure: give up on this product, move on.
                    logging.warning(e)
                    break
                if response.status_code != 200:
                    logging.warning("狀態碼錯誤,爬蟲連線異常")
                    continue
                time.sleep(2)  # throttle so JD does not block the crawler
                if response.text == '':
                    logging.warning("未爬取到資訊")
                    continue
                try:
                    res_json = json.loads(response.text)
                except Exception as e:
                    logging.warning(e)
                    continue
                if len(res_json['comments']) == 0:
                    # Past the last page for this product.
                    logging.warning("頁面次數已到:%d,超出範圍" % (page))
                    break
                logging.info("正在爬取%s %s 第 %d" % (self.categlory, self.comtype[score], page))
                for cdit in res_json['comments']:
                    # Flatten newlines so each review stays on one TSV row.
                    comment = cdit['content'].replace("\n", ' ').replace('\r', ' ')
                    comments.append(comment)
                    scores.append(cdit['score'])
                    print(comment)
        savepath = './data/' + self.categlory + '_' + self.comtype[score] + '.csv'
        logging.warning("已爬取%d 條 %s 評價資訊" % (len(comments), self.comtype[score]))
        # Append mode: repeated runs accumulate rows (row index restarts at 0).
        with open(savepath, 'a+', encoding='utf8') as f:
            for idx, (star, text) in enumerate(zip(scores, comments)):
                f.write("%d\t%s\t%s\n" % (idx, star, text))
        logging.warning("資料已儲存在 %s" % (savepath))


if __name__ == '__main__':
    # Renamed from "list" to avoid shadowing the builtin.
    categories = ['電腦', '手機', '耳機']
    for item in categories:
        spider = JDSpider(item)
        spider.getData(10, 2)  # 好評 (positive)
        spider.getData(10, 1)  # 中評 (medium)
        spider.getData(10, 0)  # 差評 (negative)