動態爬取,酒店評論
阿新 • • 發佈:2018-12-03
使用Python爬取動態網頁,獲取評論
python2.7.15
酒店的評論都在js檔案中它們連著資料庫,是動態載入的,找到js檔案的URL和它們的規律,爬取就成功了一半。
我獲取了評論中的五項。這五項中有的可能不存在:因為是在字典中取值,鍵不存在時會拋出 KeyError,所以在查詢和寫入時要加一個 try-except——如果有就獲取並寫入,沒有就賦值為 'Null'。
headers 字典和 data 字典是必須的,要從自己的瀏覽器裡找:按 F12 或右鍵「審查元素」即可看到。
程式碼如下
# coding=utf-8 import urllib2 import re import MySQLdb import json import requests conn=MySQLdb.connect(host="127.0.0.1",user="root",passwd="199855pz",db="pz",charset='utf8') print '連線成功' cursor = conn.cursor() cursor.execute("DROP TABLE IF EXISTS yilong") sql = '''CREATE TABLE yilong(姓名 char(10) ,評價 char(100) ,商品 char(10) ,日期 char(10) ,評分 char(10))''' cursor.execute(sql) def hotelname(shoplist,n): header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'} nameurl = "http://hotel.elong.com/" + shoplist[n] + "/" request = urllib2.Request(nameurl, headers=header) response = urllib2.urlopen(request) cont = response.read() pattern = re.compile('<title>【(.*?)】地址:.*?藝龍旅行網</title>') name = re.findall(pattern, cont) name = name[0] print name return name url = "http://hotel.elong.com/ajax/tmapilist/asyncsearch" data = {'code':9559991, 'listRequest.areaID':'', 'listRequest.bookingChannel':'1', 'listRequest.cardNo':192928, 'listRequest.checkInDate':'2018-11-08 00:00:00', 'listRequest.checkOutDate':'2018-11-09 00:00:00', 'listRequest.cityName':'上海市', 'listRequest.customLevel':11, 'listRequest.distance':20, 'listRequest.orderFromID':50426} headers = {'Accept':'application/json, text/javascript, */*; q=0.01', 'Accept-Encoding':'gzip, deflate', 'Accept-Language':'zh-CN,zh;q=0.9', 'Connection':'keep-alive', 'Content-Length':'1686', 'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8', 'Cookie':'CookieGuid=afc5a26f-f88c-4c8b-aee3-284aa693a358; _fid=afc5a26f-f88c-4c8b-aee3-284aa693a358; CitySearchHistory=0101%23%E5%8C%97%E4%BA%AC%E5%B8%82%23beijing%23; s_eVar44=pz360sem; SHBrowseHotel=cn=92947173%2C%2C%2C%2C%2C%2C%3B91282394%2C%2C%2C%2C%2C%2C%3B92385687%2C%2C%2C%2C%2C%2C%3B&; SessionGuid=befebc42-ecba-4f5b-91c3-d48efe3f9e6a; Esid=5d309ea3-4aea-4f53-9fbe-1edd8113ce9e; semid=pz360sem; outerFrom=pz360sem; 
com.eLong.CommonService.OrderFromCookieInfo=Status=1&Orderfromtype=5&Isusefparam=0&Pkid=50426&Parentid=4300&Coefficient=0.0&Makecomefrom=0&Cookiesdays=0&Savecookies=0&Priority=9001; fv=pcweb; ext_param=bns%3D4%26ct%3D3; s_cc=true; s_visit=1; newjava2=5db7fb36946d2a8fdb8546870157311e; JSESSIONID=6287F52D048B99E57C52BE155FDD0435; anti_token=32F13A20-A664-4F0B-A84B-6B2FD7DCF052; ShHotel=CityID=0201&CityNameCN=%E4%B8%8A%E6%B5%B7%E5%B8%82&CityName=%E4%B8%8A%E6%B5%B7%E5%B8%82&OutDate=2018-11-09&CityNameEN=shanghai&InDate=2018-11-08; s_sq=%5B%5BB%5D%5D; __tctmc=0.244490012; __tctmd=0.52917361; __tccgd=0.1; __tctmb=0.3937967179605747.1541661372067.1541661372067.1', 'Host':'hotel.elong.com', 'Origin':'http://hotel.elong.com', 'Referer':'http://hotel.elong.com/shanghai/', 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36', 'X-Requested-With':'XMLHttpRequest'} request = requests.post(url, headers=headers, data = data).content shoplist = json.loads(request) shoplist = shoplist['value']['hotelIds'] shoplist = shoplist.encode('utf-8') shoplist = shoplist.split(",",19) print shoplist for n in range(21): hn = hotelname(shoplist, n) f = open(hn.decode('utf-8')+'.txt', 'a+') for p in range(1,26) : url = "http://hotel.elong.com/ajax/comment/getcommentbypage/?hotelId=" + shoplist[n] + "&recommendedType=0&pageIndex=" + str(p) + "&mainTagId=0&subTagId=0&rankType=0&eToken=afc5a26f-f88c-4c8b-aee3-284aa693a358&code=9342551&_=1541592274486" header = {'Accept':'application/json, text/javascript, */*; q=0.01', 'Accept-Encoding':'gzip, deflate', 'Accept-Language':'zh-CN,zh;q=0.9', 'Connection':'keep-alive', 'Host':'hotel.elong.com', 'Referer':'http://hotel.elong.com/92947173/', 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36', 'X-Requested-With':'XMLHttpRequest'} request = requests.get(url, headers=header).content pl = json.loads(request) for q 
in range(20): Nickname = pl['value']['Comments'][q]['CommentUser']['NickName'] f.write(Nickname.encode('utf-8') + '\n') print Nickname pinglun = pl['value']['Comments'][q]['Content'] f.write(pinglun.encode('utf-8') + '\n') print pinglun try : room = pl['value']['Comments'][q]['CommentExt']['Order']['RoomTypeName'] f.write(room.encode('utf-8') + '\n') print room except : room = 'Null' print room time = pl['value']['Comments'][q]['CreateTime'] f.write(time.encode('utf-8') + '\n') print time try : score = pl['value']['Comments'][q]['CommentScore']['Score'] score = str(score) f.write(score.encode('utf-8') + '\n') print score except : score = 'Null' print score insert_yilong = ("INSERT INTO yilong(姓名 , 評價 , 商品 , 日期 , 評分)" "VALUES(%s,%s,%s,%s,%s)") data_yilong = (Nickname, pinglun, room, time, score) cursor.execute(insert_yilong, data_yilong) conn.commit() f.close()