Python crawler: Meituan food shop information
阿新 · Published: 2018-12-31
Preface
This article covers crawling the Meituan food listing pages (web version).
Overall approach
Through analysis we find that the Meituan food data is loaded via an ajax request, so all we need to do is call that interface ourselves. Looking at the interface's request headers, things get a little complicated (don't panic, the answer is coming).
A few points about the request worth highlighting:
- The address in the url field can be scraped from the city-switch page (that part is easy)
- The main obstacle is that the token parameter is encrypted
- The token is first zlib-compressed and then base64-encoded (you can roughly guess the scheme from the character set and length of the string); decoding is simply the reverse. Also, once the token is decoded there is a sign field inside it, encoded the same way
Let me pull a token apart as a quick demo; you'll get it at a glance (a small round-trip sketch follows).
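Here is a minimal round-trip sketch of that encoding. The field values are illustrative only, not a real captured token: encoding is zlib compress followed by base64, decoding is the reverse.

import base64, json, zlib

# Encode: JSON text -> zlib compress -> base64 (what the page's JS does to build _token)
def encode_token(payload):
    return base64.b64encode(zlib.compress(payload.encode('utf8'))).decode('utf8')

# Decode: base64 -> zlib decompress -> JSON text (the reverse)
def decode_token(token):
    return zlib.decompress(base64.b64decode(token)).decode('utf8')

sample = json.dumps({"rId": 100900, "ver": "1.0.6", "ts": 1544868815000})  # illustrative fields only
token = encode_token(sample)
print(token)                # an opaque base64 blob, like the _token seen in the request
print(decode_token(token))  # back to readable JSON

# A real _token copied from the browser decodes the same way (urllib.parse.unquote it
# first if it came from the URL bar), and the sign field inside it is again zlib + base64.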
With the token decoded, the whole ajax request is essentially transparent; all that's left is to build your own token and call the api to fetch the shop information.
Code reference
Below is the code I wrote (anti-scraping countermeasures are minimal so far, but random User-Agent headers plus proxies should hold up; a proxy sketch follows the code).
from requests.exceptions import RequestException
from fake_useragent import UserAgent
from lxml import etree
import base64, zlib, json
import requests
from urllib import parse
import time


class Spider(object):
    def __init__(self):
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Proxy-Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    # Request the city list page
    def get_cities(self):
        self.headers['User-Agent'] = UserAgent().random  # random User-Agent header
        self.headers['Host'] = 'www.meituan.com'
        self.headers['Referer'] = 'https://www.meituan.com/changecity/'
        try:
            response = requests.get('https://www.meituan.com/changecity/', headers=self.headers)
            if response.status_code == 200:
                return self.parse_cities(response.text)
        except RequestException as e:
            pass

    # Parse the city list and crawl each city's food pages
    def parse_cities(self, html):
        html = etree.HTML(html)
        cities = html.xpath('//div[@class="alphabet-city-area"]//a')
        for city in cities:
            self.get_meishi('http:' + city.xpath('./@href')[0] + '/meishi/', city.xpath('./text()')[0])

    # Build the _token parameter: the sign field is zlib-compressed + base64-encoded,
    # then the whole token object is encoded the same way
    def get_token(self, url, city, page):
        sign = '"areaId=0&cateId=0&cityName={}&dinnerCountAttrId=&optimusCode=1' \
               '&originUrl={}&page={}&partner=126&platform=1&riskLevel=1&sort=' \
               '&userId=&uuid=5bb9712c812a4ee18eb2.1544868815.1.0.0"'.format(city, url, page)
        token = {
            "rId": 100900,
            "ver": "1.0.6",
            "ts": int(time.time() * 1000),
            "cts": int(time.time() * 1000) + 100,
            "brVD": [290, 667],
            "brR": [[1920, 1080], [1920, 1040], 24, 24],
            "bI": ["{}".format(url), ""],
            "mT": ["255,230"],
            "kT": [],
            "aT": [],
            "tT": [],
            "aM": '',
            "sign": str(base64.b64encode(zlib.compress(bytes(sign, encoding='utf8'))), encoding='utf8')
        }
        # the replace() calls turn the dict repr into JSON-style text before encoding
        return str(base64.b64encode(zlib.compress(bytes(str(token).replace(' ', '').replace("'", '"'), encoding='utf8'))), encoding='utf8')

    # Request the getPoiList ajax interface page by page
    def get_meishi(self, url, city):
        for page in range(1, 33):
            self.headers['User-Agent'] = UserAgent().random
            requests_url = '{}api/poi/getPoiList?cityName={}&cateId=0&areaId=0&sort=&dinnerCountAttrId=&page={}&userId=' \
                           '&uuid=5bb9712c812a4ee18eb2.1544868815.1.0.0&platform=1&partner=126&originUrl={}&riskLevel=1' \
                           '&optimusCode=1&_token={}'.format(url, parse.quote(city), page,
                                                             parse.quote(url + 'pn{}/'.format(page)),
                                                             parse.quote(self.get_token(url, city, page)))
            print(requests_url)
            try:
                response = requests.get(requests_url, headers=self.headers)
                if response.status_code == 200:
                    self.parse_meishi(response.text)
            except RequestException as e:
                pass

    # Parse the JSON response containing the shop information
    def parse_meishi(self, html):
        try:
            result = json.loads(html)
            if result:
                print(result)
        except:
            pass

    def run(self):
        self.get_cities()


if __name__ == '__main__':
    Spider().run()
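On the proxies mentioned earlier: the spider above only randomizes the User-Agent, but requests also accepts a per-request proxies dict. A minimal sketch, assuming a hypothetical proxy pool (the addresses are placeholders, not real endpoints):

import random
import requests

PROXY_POOL = [
    'http://127.0.0.1:8001',  # placeholder proxy addresses -- substitute your own pool
    'http://127.0.0.1:8002',
]

def get_with_proxy(url, headers):
    proxy = random.choice(PROXY_POOL)
    # route both http and https traffic through the randomly chosen proxy
    return requests.get(url, headers=headers, proxies={'http': proxy, 'https': proxy}, timeout=10)

Swapping the requests.get calls in get_cities and get_meishi for get_with_proxy would rotate a proxy on every request.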