python爬取京東評論
阿新 • • 發佈:2022-03-16
一.分析
1.找到京東商品評論所在位置(記得點選商品評論,否則找不到productPageComments.action)
2.解析檔案
開啟後發現是json資料,但不是那麼規範,所以需要去掉前面的字串和括號,還有最後一行的分號和括號
3.放到json解析器可以看到資料的結構
4.解析網址
裡面的引數:
productid:產品id;不同的id不同的商品
score:0是全部評論,1是差評,2是中評,3是好評,4是晒圖評價,5是追評
page:頁數,評論較多的最多顯示100頁,雖然評論是20萬+,但是也只能爬取一百頁
所以根據更改page可以爬取多個頁面評論
5.爬取多個商品
要爬取多個商品就要多個productid,所以就要去找它
通過搜尋xxxx,可以在search中找到sku_id,將它解析出來即可,同上
二.編寫爬蟲程式碼
# -*- coding: utf-8 -*-
import gzip
import urllib.request
import json
import time
import random
import demjson as dj
import requests
import itertools

# NOTE(review): both header sets embed session cookies captured from a
# logged-in browser. They are credentials, and presumably expire; consider
# loading them from configuration instead of keeping them in source.
# NOTE(review): "br" is advertised in Accept-Encoding; decoding brotli
# responses may require an extra package alongside requests -- TODO confirm.

# Header set captured from a browser session. Neither function in this
# script references it; both use `headers2`.
headers = {
    "Cookie": (
        "__jdu=1507876332; shshshfpa=2ea021ee-52dd-c54e-1be1-f5aa9e333af2-1640075639; areaId=5; PCSYCityID=CN_0_0_0; shshshfpb=n7UymiTWOsGPvQfCup%2B3J1g%3D%3D; ipLoc-djd=5-142-42547-54561; jwotest_product=99; pinId=S4TjgVP4kjjnul02leqp07V9-x-f3wj7; pin=jd_60a1ab2940be3; unick=jd_60a1ab2940be3; ceshi3.com=000; _tp=672TNfWmOtaDFuqCPQqYycXMpi6F%2BRiwrhIuumNmoJ4%3D; _pst=jd_60a1ab2940be3; __jdc=122270672; shshshfp=4e8d45f57897e469586da47a0016f20e; ip_cityCode=142; CCC_SE=ADC_rzqTR2%2bUDTtHDYjJdX25PEGvHsBpPY%2bC9pRDVdNK7pU%2fwikRihpN3XEXZ1cn4Jd4w5OWdpJuduhBFwUvdeB6X1VFb7eIZkqL0OJvBn9RB6AJYo4An%2fGTiU%2b8rvqQwYxBI4QCM8a9w9kYQczygSjPxPjn1pbQLtBgo%2fzKBhwfKhAWs563NfBjmnRlkGHPX6E7jy6%2fEdfEhtkNSTCQod238cEpUFpKiQ%2bWV%2bW8MiaL3ti7d7ozdlNbZ03ylqRbI1XrXylDiqzW%2b2uALhF5H1eHuk3yH3t4ojXZmRbDy3k2OoZFk%2bcmrXD0eWhcIaD5RnhHbToYLuX%2byx7otaPuemTVAG4Z7CSyEfmUBAj7QuGmHg647a7KuoaR3hoCvxj%2f3woXdd2H9b40oqmJ5PO958Z1g%2fr7Jbk8a5w2CU547IaXRzakehLhW9xzG57Ak0Jhv85Jlt9A5N6hl%2ft4DSAwh%2bGhwg%3d%3d; unpl=JF8EAJJnNSttDBxWAxxSEkUVQg4EW1QKTx9TazcCAV8KSFICE1FIF0N7XlVdXhRKFR9vYhRUW1NPVA4ZBysSEXteVV1YCE0TAGlnNWRtW0tkBCsCHxMWQltTXF8LeycDZ2M1VFxZSlYGHQEbEBBCbWRbXQlKFQBpYQVQbVl7VTVZbEJTDBkCBxNdDEoRCmlgB1ZeaEpkBg; JSESSIONID=347F847A6818E35675648739BD4BA9FF.s1; __jda=122270672.1507876332.1640075637.1647251498.1647261295.13; thor=8D225D1673AA75681B9D3811417B0D325568BB2DD7F2729798D3AECF0428F59F7C70EA7504347F8E059F895AEE7D6E2662F565665845F0D94F2D7D56739CF3BC2B15F5F6E2ADDB891DDA80A9E9F88B7BA0BA95147512F78D28D8095E52379AB78550E451558DB6595C2270A1D5CFA2E211FF20F22ADA1987C6AE9E864DA6A7364D5BFD3EE08DA597D2EF2B37444CFD7A47134EFFD71B3A70B0C8BD55D51F274F; token=397b2c7c58f4021bbe9a9bbe9eeda694,3,915145; __tk=46fbcc7e51f75824dcdc2e8820904365,3,915145; shshshsID=5c5095f0b5728a839c0397308d625da5_1_1647261360535; __jdb=122270672.2.1507876332|13.1647261295; "
        "__jdv=122270672|jd.idey.cn|t_2024175271_|tuiguang|ef376a8f48ba48359a5a6d3c2769bb4b|1647261360584; 3AB9D23F7A4B3C9B=24HI5ARAA3SK7RJERTWUDZKA2NYJIXX3ING24VG466VC3ANKUALJLLD7VBYLQ3QPRYUSO3R6QBJYLVTVXBDIGJLGBA"
    ),
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.39",
}

# Header set sent with every request made by crawlProductComment and
# getProduct.
headers2 = {
    "accept": "*/*",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "cookie": (
        "__jdu=1507876332; shshshfpa=2ea021ee-52dd-c54e-1be1-f5aa9e333af2-1640075639; areaId=5; PCSYCityID=CN_0_0_0; ipLoc-djd=5-142-42547-54561; pinId=S4TjgVP4kjjnul02leqp07V9-x-f3wj7; pin=jd_60a1ab2940be3; unick=jd_60a1ab2940be3; _tp=672TNfWmOtaDFuqCPQqYycXMpi6F%2BRiwrhIuumNmoJ4%3D; _pst=jd_60a1ab2940be3; user-key=a2aaf011-2c1e-4dea-bf76-3392d16b1fb1; __jdc=122270672; wlfstk_smdl=jlwwba2gmccq62touff9evvbp3fk8gbr; ceshi3.com=000; shshshfp=4e8d45f57897e469586da47a0016f20e; ip_cityCode=142; shshshfpb=n7UymiTWOsGPvQfCup%2B3J1g%3D%3D; joyya=1647305570.1647305909.27.0kop377; __jda=122270672.1507876332.1640075637.1647314039.1647318046.22; token=d5899471c4530886f6a9658cbea3ca94,3,915176; __tk=1570759a7dd1a720b0db2dec5df8d044,3,915176; CCC_SE=ADC_Wj0UWzuXioxsiUvbIxw9PbW9q011vNMASHkfjXFO%2fZlkeGDtZUHe5qgaEpWv8RDEkCruGSGmCItsvHjIZ3aHbh9heUjNIZh6WZl9ZDfDokk66kRX6I%2by%2bDsdf4JtPOQUuULSsWOA%2fcDyP7Bb91YuHOwNnciLtS97UIKO7XA5sAd34Rf4XDKijy6Fw1DFTx%2b7izzme6YALuLp9Y%2bByC6aUTDzU9te7g1BZXPXtfGGwqu52ZVkdVId2jpxPnhX24fFD9WI9aX1qgswZ1PPZSGYKswUkqXhIf2S9aLFkjXW2n61LVzw2ZeqJRQI8QIcmi%2fF7WHOHLbWScnKwG594WIk0SRiCa0n2aEJAhVlXmzEE%2f5%2f%2bXWsKhlneTLduVs52ST5m96zdx%2bLnNGgDERqznFNu3AT5zvLcN0PyVq08n4keSv2ngLLTZK4QQJslS4he9MT3XJoEUfe9L8beZNh1239eLHYF6w4KWMCWWTfwxdCUOY%3d; "
        "unpl=JF8EAJZnNSttDEhSAkwDE0dEGAoEWw8LSh9TbjRVXV5QHFIDGwMfGhd7XlVdXhRKFR9vYxRUXlNIUw4ZBysSEXteVV1YCE0TAGlnNWRtW0tkBCsCHxMWQltTXF8LeycDZ2M1VFxZSlYGGwcTEhhObWRbXQlKFQBpYQVQbVl7VTVNbBsTEUpcVVteDENaA2tmA11bX0lWBisDKxE; __jdv=122270672|jd.idey.cn|t_2024175271_|tuiguang|e276f09debfa4c209a0ba829f7710596|1647318395561; thor=8D225D1673AA75681B9D3811417B0D325568BB2DD7F2729798D3AECF0428F59F4C39726C44E930AA2DD868FC4BCA33EA0D52228F39A68FC9F5C1157433CAACF1110B20B6975502864453B70E6B21C0ED165B733359002643CD05BDBA37E4A673AF38CC827B6013BCB5961ADA022E57DB6811E99E10E9C4E6410D844CD129071F7646EC7CE120A0B3D2F768020B044A010452D9F8ABD67A59D41880DD1991935C; 3AB9D23F7A4B3C9B=24HI5ARAA3SK7RJERTWUDZKA2NYJIXX3ING24VG466VC3ANKUALJLLD7VBYLQ3QPRYUSO3R6QBJYLVTVXBDIGJLGBA; __jdb=122270672.5.1507876332|22.1647318046; shshshsID=d7a96097b296c895558adfd840546a72_5_1647318650562"
    ),
    "referer": "https://search.jd.com/",
}


def crawlProductComment(url):
    """Fetch one page of product comments and return the comment list.

    The endpoint answers with JSONP, i.e. a JSON object wrapped as
    ``fetchJSON_comment98(...);``. The wrapper is removed, the JSON is
    parsed, and the value stored under the ``'comments'`` key is returned.

    Args:
        url: Full URL of the comment page to request.

    Returns:
        The value of ``'comments'`` from the decoded response. An empty
        list is returned when the request fails or the body cannot be
        decoded, so callers can always apply ``len()`` to the result.
    """
    try:
        # requests' exceptions derive from IOError, so connection errors
        # and timeouts both land in the handler below.
        raw = requests.get(url=url, headers=headers2, timeout=10).text
    except IOError:
        print("Error: gbk不合適")
        return []

    # Drop the JSONP callback prefix and the trailing ");" (tolerating
    # surrounding whitespace) so that only the JSON document remains.
    payload = raw.replace("fetchJSON_comment98(", "").strip().strip(');')
    print(payload)

    try:
        return json.loads(payload)['comments']
    except (ValueError, KeyError, TypeError) as err:
        # An empty or non-JSON body, or a document without 'comments',
        # previously aborted the whole crawl with an unhandled exception.
        print("Error: could not parse comment response:", err)
        return []
def getProduct(url):
    """Fetch one search-result page and return the product ids it lists.

    The search endpoint answers with JSONP wrapped as
    ``jQuery1544821(...)``. After the wrapper is removed, the list stored
    under key ``'291'`` is read and the ``'sku_id'`` of each entry is
    collected.

    Args:
        url: Full URL of the search page to request.

    Returns:
        A list with the ``'sku_id'`` value of every entry on the page, or
        an empty list when the request fails or the response does not have
        the expected structure.
    """
    try:
        # requests' exceptions derive from IOError.
        raw = requests.get(url=url, headers=headers2, timeout=10).text
    except IOError as err:
        print("Error: search request failed:", err)
        return []

    # Remove the JSONP callback prefix and the closing ")" -- also accept
    # a trailing ";" or whitespace after it.
    payload = raw.replace("jQuery1544821(", "").strip().rstrip(';').strip(')')

    try:
        # NOTE(review): '291' presumably corresponds to the `ad_ids=291:33`
        # query parameter used by the caller -- confirm against a response.
        entries = json.loads(payload)['291']
        ids = [entry['sku_id'] for entry in entries]
    except (ValueError, KeyError, TypeError) as err:
        print("Error: could not parse search response:", err)
        return []

    print(ids)
    return ids
import os  # used only to create the output directories below

# Collect product ids from the first ten search-result pages; `ids` ends up
# as a list of per-page id lists.
ids = []
for i in range(0, 10):
    product_id = getProduct(
        "https://search-x.jd.com/Search?callback=jQuery1544821&area=5&enc=utf-8&keyword=%E7%94%B7%E5%A3%AB%E8%BF%90%E5%8A%A8%E9%9E%8B&adType=7&page="
        + str(i)
        + "&ad_ids=291%3A33&xtest=new_search&_=1647325621019")
    # Random pause between search requests.
    time.sleep(random.randint(1, 3))
    ids.append(product_id)

data = []
count = 0

# Create the output directories up front so a missing folder cannot abort
# the run after comments have already been downloaded.
os.makedirs('data2', exist_ok=True)
os.makedirs('data', exist_ok=True)

# Flatten the per-page lists and drop duplicate product ids so that no
# product is crawled twice.
for k in set(itertools.chain.from_iterable(ids)):
    # Walk through the comment pages by changing the `page` parameter.
    for i in range(0, 100):
        url = ('https://club.jd.com/comment/productPageComments.action'
               '?callback=fetchJSON_comment98&productId={}'
               '&score=1&sortType=5&page={}'
               '&pageSize=10&isShadowSku=0&fold=1').format(k, i)
        comments = crawlProductComment(url)
        # Stop paging on an empty page. `not comments` also covers a None
        # result, which would make len() raise TypeError.
        if not comments:
            break
        print(comments)
        data.extend(comments)
        # Random pause between comment requests.
        time.sleep(random.randint(1, 5))
        print('-------', i)
    print("這是第{}類商品".format(count))
    # Checkpoint after every product so an interrupted crawl keeps its
    # results. Each file is rewritten in full with everything collected so
    # far (appending would produce several top-level JSON arrays in one
    # file, which cannot be parsed as a single document).
    with open('data2/shoes' + str(count) + '.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    count += 1

# Write the final combined file.
with open('data/shoes_all.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
三.爬蟲結果:
結果以json形式輸出,之後需要什麼資料,直接解析即可
[{ "id": 16517111016, "guid": "742b5c0124eb4da7c673f0405cd9b512", "content": "上個月買了同款網眼的,穿的非常舒服,看到有冬款又買了,做工和穿著舒適度沒得說!款式與某國外品牌類似!但價格相差十萬八千里!", "creationTime": "2021-10-25 18:25:44", "isDelete": false, "isTop": false, "userImageUrl": "storage.360buyimg.com/i.imageUpload/6a645f3532393535313732373035373831343931363631383932333836_sma.jpg", "topped": 0, "replyCount": 0, "score": 5, "imageStatus": 1, "usefulVoteCount": 0, "userClient": 4, "discussionId": 983275343, "imageCount": 4, "anonymousFlag": 1, "plusAvailable": 201, "mobileVersion": "10.2.0", "images": [ { "id": 1577587353, "imgUrl": "//img30.360buyimg.com/n0/s128x96_jfs/t1/135814/13/20705/98256/61768626E72015f0f/167476690ac02dea.jpg", "imgTitle": "", "status": 0 }, { "id": 1577587354, "imgUrl": "//img30.360buyimg.com/n0/s128x96_jfs/t1/162488/7/27962/106230/61768627Ed0036d75/6e6987536e0bc545.jpg", "imgTitle": "", "status": 0 }, { "id": 1577587355, "imgUrl": "//img30.360buyimg.com/n0/s128x96_jfs/t1/156456/26/25453/120365/61768627E7cb4ead7/4030e3fe60102e8d.jpg", "imgTitle": "", "status": 0 }, { "id": 1577587356, "imgUrl": "//img30.360buyimg.com/n0/s128x96_jfs/t1/200969/9/13143/81333/61768628E2433528c/8a7607cae7c80ce4.jpg", "imgTitle": "", "status": 0 } ], "mergeOrderStatus": 2, "productColor": "黑色豎口繫帶", "productSize": "39", "textIntegral": 40, "imageIntegral": 40, "status": 1, "referenceId": "30424977080", "referenceTime": "2021-10-08 21:17:48", "nickname": "****3", "replyCount2": 0, "userImage": "storage.360buyimg.com/i.imageUpload/6a645f3532393535313732373035373831343931363631383932333836_sma.jpg", "orderId": 0, "integral": 80, "productSales": "[]", "referenceImage": "jfs/t1/130672/22/26171/157596/622e31d5E051db120/2c09c9b8c5711f0b.jpg", "referenceName": "金利來男鞋休閒運動皮鞋真皮軟底英倫鞋子潮流戶外旅遊鞋波鞋男運動鞋 黑色豎口繫帶 41", "firstCategory": 11729, "secondCategory": 11730, "thirdCategory": 6908, "aesPin": null, "days": 17, "afterDays": 0 }]