爬取淘寶評論時出現list index out of range問題
阿新 • • 發佈:2019-02-13
list index out of range 列表越界
Traceback (most recent call last):
File "G:/workSpace/Python/TB_Crawler/Crawler_train.py", line 71, in <module>
print(getCommentsList(ItemURL2))
File "G:/workSpace/Python/TB_Crawler/Crawler_train.py", line 46, in getCommentsList
comment = getComment(newURL, i)['content']
File "G:/workSpace/Python/TB_Crawler/Crawler_train.py", line 27, in getComment
comment = jd['comments'][num]
IndexError: list index out of range
提示錯誤在getComment方法的comment = jd['comments'][num]中
因為淘寶評論每一頁最多20條評論,所以爬取評論時對每頁固定遍歷20個索引。當某頁評論少於20條時,num仍然是從0到19遍歷(range(0, 20)),一旦num超過該頁實際的評論數,jd['comments'][num]就會出現列表越界的問題。
解決辦法:每處理一條評論就累加count,當count達到評論總數(maxCount)、也就是遍歷到最後一條評論時break,跳出for迴圈;此時外層while的條件也不再成立,爬取結束。
# Endpoint returning one page of reviews for an item. The placeholders are the
# item id and the 1-based page number.
# NOTE: the published snippet showed "¤tPageNum" here, which is what
# "&currentPageNum" turns into when "&curren" is rendered as an HTML entity;
# the parameter name is restored below.
_RATE_LIST_URL = ('https://rate.taobao.com/feedRateList.htm'
                  '?auctionNumId={}&userNumId=352740130&currentPageNum={}')

# Matches the "id" query parameter only (not e.g. "user_id") and captures its
# digits, so neighbouring parameters never leak into the item id.
_ITEM_ID_PATTERN = re.compile(r'(?:^|[?&#])id=(\d+)')

# Auto-generated review texts that carry no user content and are skipped.
# NOTE(review): these literals are kept exactly as published; confirm they
# match the text the service really returns (the article may have been
# script-converted after it was written).
_PLACEHOLDER_COMMENTS = frozenset((
    '15天內買家未作出評價',
    '評價方未及時做出評價,系統預設好評!',
    '此使用者沒有填寫評價。',
))


def _fetchRateJson(url):
    """Download *url* and decode its parenthesis-wrapped JSON payload."""
    res = requests.get(url)
    # The body is JSON wrapped in "(...)"; drop the wrapper before parsing.
    return json.loads(res.text.strip().strip('()'))


# Get the item id
def getItemID(url):
    """Return the item id taken from the ``id`` query parameter of *url*.

    Raises:
        ValueError: if *url* contains no numeric ``id`` parameter.
    """
    m = _ITEM_ID_PATTERN.search(url)
    if m is None:
        raise ValueError('no item id found in url: {!r}'.format(url))
    return m.group(1)


# Get the number of comments
def getCommentCount(url):
    """Return the ``total`` field reported by the first review page of the item."""
    return _fetchRateJson(_RATE_LIST_URL.format(getItemID(url), 1))['total']


# Get a single comment
def getComment(url, num):
    """Return ``{'content': ...}`` for the comment at index *num* of page *url*.

    *url* is a review-page URL (not an item URL). Every call downloads the
    page again, so prefer ``getCommentsList`` when reading many comments.

    Raises:
        IndexError: if the page holds *num* comments or fewer.
    """
    comment = _fetchRateJson(url)['comments'][num]
    return {'content': comment['content']}


# Collect the comments into a list
def getCommentsList(url):
    """Return the item's comments as a list of ``{'content': ...}`` dicts.

    Pages are fetched one at a time (a single request per page) until the
    number of comments seen reaches the reported total or a page comes back
    empty. Auto-generated placeholder reviews are counted but not returned.
    """
    itemID = getItemID(url)
    maxCount = getCommentCount(url)
    commentList = []
    count = 0
    page = 1
    while count < maxCount:
        jd = _fetchRateJson(_RATE_LIST_URL.format(itemID, page))
        comments = jd.get('comments') or []
        if not comments:
            # The service returned fewer comments than it reported; stop
            # instead of requesting empty pages forever.
            break
        page += 1
        # Walk what the page actually contains rather than a fixed 0..19
        # range, so a short last page can never raise IndexError.
        for comment in comments:
            content = comment['content']
            if content not in _PLACEHOLDER_COMMENTS:
                commentList.append({'content': content})
            count += 1
            if count >= maxCount:
                break
    return commentList