
Scraping Dianping Shop Reviews with Python

Dianping obfuscates review text by swapping characters for `<svgmtsi>` tags whose CSS class points into an SVG glyph sheet. The script below downloads the review page, its CSS file and the SVG sheets, rebuilds the class-to-character mapping, restores the plain-text HTML, and writes each review into a MySQL table.

```python
import re

import parsel
import pymysql
import requests
from lxml import etree


def download_data(url, cookie):
    '''
    Download the obfuscated page source, the CSS file and the SVG glyph sheets.
    :return:
    '''
    headers = {
        "Cookie": cookie,
        "Referer": "http://www.dianping.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    # Fetch the original (obfuscated) page
    ret = requests.get(url=url, headers=headers).text
    with open('01 原始網頁_加密.html', 'w', encoding='utf-8') as f:
        f.write(ret)

    # Fetch the CSS file that positions the obfuscated glyphs
    css_url = re.findall('<link rel="stylesheet" type="text/css" href="(//s3plus.meituan.*?)">', ret)
    css_url = 'https:' + css_url[0]
    css_response = requests.get(css_url).text
    with open('02 css樣式.css', 'w', encoding='utf-8') as f:
        f.write(css_response)

    # Fetch the SVG lookup sheets referenced by the CSS
    svg_urls = re.findall(r'.*?\[class\^="(.*?)"\]\{.*?background-image: url\((.*?)\);', css_response)
    for name, url in svg_urls:
        svg_url = 'https:' + url
        svg_response = requests.get(svg_url).text
        with open(f'03 svg對照表{name}.svg', 'w', encoding='utf-8') as f:
            f.write(svg_response)


def crack_data():
    '''
    Decrypt the data: build the mapping from CSS class names to SVG characters.
    :return:
    '''
    # Change the file name to match the SVG sheet that was actually downloaded
    with open('03 svg對照表zpd.svg', 'r', encoding='utf-8') as f:
        svg_html = f.read()
    sel = parsel.Selector(svg_html)
    texts = sel.css('textPath')
    paths = sel.css('path')
    path_dict = {}
    for path in paths:
        # Map each path id to the y coordinate in its "d" attribute
        path_dict[path.css('path::attr(id)').get()] = path.css('path::attr(d)').get().split(' ')[1]
    count = 1
    zpd_svg_dict = {}  # maps a y coordinate to its row of characters
    for text in texts:
        zpd_svg_dict[path_dict[str(count)]] = text.css('textPath::text').get()
        count += 1
    print(zpd_svg_dict)

    with open('02 css樣式.css', 'r', encoding='utf-8') as f:
        css_html = f.read()

    # Adjust the regex to match the class prefix used in the downloaded CSS file
    css_paths = re.findall(r'''
\.(zpd.*?) {
    background: -(\d+)\.0px -(\d+)\.0px;
\}
''', css_html)
    print(css_paths)
    last_map = {}
    for css_name, x, y in css_paths:
        index = int(int(x) / 14)  # each glyph is 14 px wide
        for i in zpd_svg_dict:
            if int(y) > int(i):
                pass
            else:
                last_map[css_name] = zpd_svg_dict[i][index]
                break
    return last_map


def decryption(last_map):
    '''
    Return the decrypted HTML.
    :param last_map:
    :return:
    '''
    with open('01 原始網頁_加密.html', 'r', encoding='utf-8') as f:
        ret = f.read()
    svg_list = re.findall('<svgmtsi class="(.*?)"></svgmtsi>', ret)
    for svg in svg_list:
        print(svg, last_map[svg])
        ret = ret.replace(f'<svgmtsi class="{svg}"></svgmtsi>', last_map[svg])
    return ret


def write_data(ret):
    '''
    Extract the review data and write it into the database.
    :param ret:
    :return:
    '''
    # Strip a div that only some reviews carry; it gets in the way of the XPath below
    ret = ret.replace(' <div class="richtitle">消費後評價</div>', '')
    etre = etree.HTML(ret)
    li_list = etre.xpath('//*[@id="review-list"]/div[2]/div[3]/div[3]/div[3]/ul/li')

    # Initialise the database connection
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, database='review',
                         charset='utf8mb4')
    cursor = db.cursor()
    count = 0
    for li in li_list:
        name = li.xpath('./div[@class="main-review"]/div[1]/a/text()')[0].strip()
        score = re.findall('sml-rank-stars sml-str(.*?) star',
                           li.xpath('./div[1]/div[2]/span[1]/@class')[0])[0].strip()
        time = li.xpath('//div[@class="misc-info clearfix"]/span[1]/text()')[count].strip()
        shop_name = li.xpath('//div[@class="misc-info clearfix"]/span[2]/text()')[count].strip()
        comment = ','.join([i.replace('\n', '').strip() for i in li.xpath('./div/div[4]/text()')])
        count += 1
        print(name, score, time, shop_name, comment)
        # Write one row per review
        sql = 'insert into dianping(name,score,time,shop_name,comment)values(%s,%s,%s,%s,%s)'
        cursor.execute(sql, (name, score, time, shop_name, comment))
        db.commit()
    # Close the connection
    db.close()


if __name__ == '__main__':
    # The cookie expires from time to time and has to be replaced
    cookie = "s_ViewType=10; _lxsdk_cuid=175e331ad79c8-0996df2d570671-46460e2a-1fa400-175e331ad79c8; _lxsdk=175e331ad79c8-0996df2d570671-46460e2a-1fa400-175e331ad79c8; _hc.v=c4dfac1c-01af-6a87-d803-2cd6b8db107a.1605834485; fspop=test; ctu=ef0b64e4cabf67f148563284ea8c8d0555a008f7ca0dee097831c90b52822812; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1605834487,1605835298,1606093773; cy=2; cye=beijing; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1606098153; expand=yes; _lxsdk_s=175f2cc7d23-6-9d5-75e%7C%7C532"
    # Review page of one shop; swap in any other shop id
    url = 'http://www.dianping.com/shop/130096343/review_all'
    try:
        download_data(url, cookie)
    except Exception:
        # Too many requests trigger a captcha, which this script does not solve
        print('出現驗證碼驗證')
    map_dict = {}
    try:
        map_dict = crack_data()
    except Exception:
        # The CSS class prefix changed, so the regex in crack_data() no longer matches
        print('css類屬性發生變化')
    ret = decryption(map_dict)
    write_data(ret)
```
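
The heart of `crack_data()` is a small piece of coordinate arithmetic: every glyph in the SVG sheet is 14 px wide, so the negative x offset from a class's `background` rule divided by 14 gives the character's column, while the y offset selects the first `textPath` row whose y coordinate is at least as large. A minimal sketch of that lookup, with class names, offsets and glyph rows invented purely for illustration:

```python
# A minimal sketch of the lookup crack_data() performs. The class names,
# offsets and glyph rows below are invented for illustration only.
rows = {                    # y coordinate of each <textPath> row -> its characters
    '2250': '州市區縣鎮村路街號',
    '2290': '好吃貴便宜服務快慢',
}
css_rules = {               # CSS class -> (x, y) taken from "background: -x.0px -y.0px;"
    'zpdabc': (42, 2240),   # 42 / 14 = column 3
    'zpdxyz': (70, 2280),   # 70 / 14 = column 5
}

last_map = {}
for css_name, (x, y) in css_rules.items():
    index = x // 14                    # each glyph in the SVG sheet is 14 px wide
    for row_y in rows:                 # pick the first row whose y is >= the CSS offset
        if y <= int(row_y):
            last_map[css_name] = rows[row_y][index]
            break

print(last_map)  # {'zpdabc': '縣', 'zpdxyz': '服'}
```

The real `crack_data()` builds `rows` from the downloaded SVG file and `css_rules` from the downloaded CSS, but the arithmetic is the same, and its output is exactly the class-to-character map that `decryption()` later substitutes into the HTML.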
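
`write_data()` expects a MySQL database named `review` that already contains a `dianping` table; the script never creates it. Below is a minimal sketch of a compatible table, assuming column types of my own choosing (only the connection settings, the table name and the column names come from the script):

```python
# A minimal sketch of the target table. Only the connection settings, the table
# name and the column names come from the script above; the column types are assumptions.
import pymysql

ddl = '''
CREATE TABLE IF NOT EXISTS dianping (
    id         INT AUTO_INCREMENT PRIMARY KEY,
    name       VARCHAR(255),   -- reviewer nickname
    score      VARCHAR(10),    -- star rating parsed from the class name, e.g. "40"
    time       VARCHAR(50),    -- review time as shown on the page
    shop_name  VARCHAR(255),   -- branch name shown next to the time
    comment    TEXT            -- review body, newlines stripped
) DEFAULT CHARSET = utf8mb4
'''

db = pymysql.connect(host='localhost', user='root', password='123456',
                     port=3306, database='review', charset='utf8mb4')
with db.cursor() as cursor:
    cursor.execute(ddl)
db.commit()
db.close()
```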