
Python crawler: scraping Meituan food shop information

Preface

This article walks through scraping the Meituan food (meishi) pages on the web version of the site.


Overall approach

By inspecting the page, we find that the Meituan food listing data is loaded through an ajax request.
[Figure: the ajax response carrying the Meituan food listing data]

So all we actually need to do is call that endpoint ourselves. Let's analyse the request header of this endpoint. It looks a bit complicated (don't panic, the answer is coming right up).

[Figure: the Meituan food listing request, with the relevant fields marked]

A few notes on the parts I marked in the screenshot:

  • The address in the url field can be scraped from the city-switch page (that part is easy).
  • The tricky part is the token, which is encoded:
    • The token is zlib-compressed first and then base64-encoded (you can roughly guess the scheme from the character set and length of the string); decoding it is simply the reverse. Also, once the token is decoded there is a sign parameter inside it, which uses exactly the same encoding.

Let me pull a token apart as a quick demo; you'll get it at a glance.
[Figure: decoding a Meituan _token value]
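
If you would rather see that as code than as a screenshot, here is a minimal decoding sketch. It assumes the decoded bytes are JSON (which is what I observed); decode_token and captured_token are illustrative names, and a _token copied out of the URL is percent-encoded, so unquote it first.

import base64, zlib, json
from urllib import parse

def decode_token(token):
    # reverse the two steps: base64-decode, then zlib-decompress
    payload = json.loads(zlib.decompress(base64.b64decode(token)))
    # the embedded sign parameter uses the same encoding, so unwrap it too
    payload['sign'] = zlib.decompress(base64.b64decode(payload['sign'])).decode('utf8')
    return payload

# captured_token = '...'  # paste a _token value from the browser's network tab
# print(decode_token(parse.unquote(captured_token)))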

With the decoded token in hand, the whole ajax request is essentially transparent to you. What's left is to forge our own token and then call the API to fetch the shop information.
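
Forging a token is just the same pipeline run forwards: build the payload, serialise it to JSON, zlib-compress, base64-encode, and percent-encode the result into the _token query parameter. A minimal sketch of that pipeline (the actual payload fields I send, such as ts, sign and brVD, are assembled in get_token in the script below):

import base64, zlib, json
from urllib import parse

def encode_token(payload):
    # JSON -> zlib compress -> base64 -> percent-encode for use in the URL
    compressed = zlib.compress(json.dumps(payload, separators=(',', ':')).encode('utf8'))
    return parse.quote(base64.b64encode(compressed).decode('utf8'))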


Reference code

Below is the code I wrote (I haven't put much work into dodging anti-scraping measures yet, but a random User-Agent header plus proxies should be good enough; a quick proxies example follows the script).

from requests import RequestException
from fake_useragent import UserAgent
from lxml.html import etree
import base64, zlib, json
import requests
from urllib import parse
import time


class Spider(object):
    def __init__(self):
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Proxy-Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    # Fetch the city-switch page, which lists every city sub-domain
    def get_cities(self):
        self.headers['User-Agent'] = UserAgent().random  # random User-Agent header
        self.headers['Host'] = 'www.meituan.com'
        self.headers['Referer'] = 'https://www.meituan.com/changecity/'
        try:
            response = requests.get('https://www.meituan.com/changecity/', headers=self.headers)
            if response.status_code == 200:
                return self.parse_cities(response.text)
        except RequestException as e:
            pass

    # Parse the city list and crawl each city's /meishi/ section
    def parse_cities(self, html):
        html = etree.HTML(html)
        cities = html.xpath('//div[@class="alphabet-city-area"]//a')
        for city in cities:
            self.get_meishi('http:' + city.xpath('./@href')[0] + '/meishi/',
                            city.xpath('./text()')[0])

    # Build the _token: a zlib-compressed, base64-encoded JSON payload
    # whose inner "sign" field is encoded the same way
    def get_token(self, url, city, page):
        sign = '"areaId=0&cateId=0&cityName={}&dinnerCountAttrId=&optimusCode=1' \
               '&originUrl={}&page={}&partner=126&platform=1&riskLevel=1&sort=' \
               '&userId=&uuid=5bb9712c812a4ee18eb2.1544868815.1.0.0"'.format(city, url, page)
        token = {
            "rId": 100900,
            "ver": "1.0.6",
            "ts": int(time.time() * 1000),
            "cts": int(time.time() * 1000) + 100,
            "brVD": [290, 667],
            "brR": [[1920, 1080], [1920, 1040], 24, 24],
            "bI": ["{}".format(url), ""],
            "mT": ["255,230"],
            "kT": [],
            "aT": [],
            "tT": [],
            "aM": '',
            "sign": str(base64.b64encode(zlib.compress(bytes(sign, encoding='utf8'))), encoding='utf8')
        }
        # serialise the dict as JSON-ish text, compress and base64-encode it
        return str(base64.b64encode(zlib.compress(bytes(str(token).replace(' ', '').replace("'", '"'),
                                                        encoding='utf8'))), encoding='utf8')

    # Request the getPoiList API page by page for one city
    def get_meishi(self, url, city):
        # drop the Host header set for the changecity request so it does not
        # clash with the city sub-domain we are about to hit
        self.headers.pop('Host', None)
        for page in range(1, 33):
            self.headers['User-Agent'] = UserAgent().random
            requests_url = '{}api/poi/getPoiList?cityName={}&cateId=0&areaId=0&sort=&dinnerCountAttrId=&page={}&userId=' \
                           '&uuid=5bb9712c812a4ee18eb2.1544868815.1.0.0&platform=1&partner=126&originUrl={}&riskLevel=1' \
                           '&optimusCode=1&_token={}'.format(url, parse.quote(city), page,
                                                             parse.quote(url + 'pn{}/'.format(page)),
                                                             parse.quote(self.get_token(url, city, page)))
            print(requests_url)
            try:
                response = requests.get(requests_url, headers=self.headers)
                if response.status_code == 200:
                    self.parse_meishi(response.text)
            except RequestException as e:
                pass

    # The API returns JSON; just print the parsed result for now
    def parse_meishi(self, html):
        try:
            result = json.loads(html)
            if result:
                print(result)
        except:
            pass

    def run(self):
        self.get_cities()


if __name__ == '__main__':
    Spider().run()
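
One note on the proxy part mentioned earlier: the script above only randomises the User-Agent. requests accepts a proxies mapping on each call, so wiring in a proxy is a one-line change inside get_meishi; the address below is a placeholder, not a working proxy.

proxies = {'http': 'http://127.0.0.1:1080', 'https': 'http://127.0.0.1:1080'}
response = requests.get(requests_url, headers=self.headers, proxies=proxies)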