爬取藝龍網站酒店評論
阿新 • • 發佈:2018-11-11
import urllib.request
import requests
import demjson
import pymysql
import re
from bs4 import BeautifulSoup
# Pattern matching every character outside the Basic Multilingual Plane
# (U+10000..U+10FFFF), which is where emoji live. Compiled once at import
# time instead of on every call.
try:
    _NON_BMP_RE = re.compile(u'[\U00010000-\U0010ffff]')
except re.error:
    # Fallback for interpreters that cannot compile the wide range directly:
    # match a UTF-16 surrogate pair instead.
    _NON_BMP_RE = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')


def remove_emoji(comment, restr=''):
    """Return *comment* with emoji (all non-BMP characters) replaced.

    Note that the filter is broader than emoji: any code point above
    U+FFFF is replaced, including rare supplementary-plane CJK characters.

    Args:
        comment: The text to clean.
        restr: Replacement inserted for each removed character; defaults to
            the empty string, i.e. the character is simply dropped.

    Returns:
        The cleaned string.
    """
    return _NON_BMP_RE.sub(restr, comment)
# Scrape hotel reviews for Hangzhou from hotel.elong.com.
#
# Flow:
#   1. (Re)create the MySQL table `comment`.
#   2. POST to the asynchronous hotel-list endpoint, pages 1-9, to obtain the
#      hotel ids and the rendered list HTML of each page.
#   3. For every hotel id, GET comment pages 1-9 and append each reviewer
#      name, review text, date and score to a per-hotel UTF-8 text file.

# NOTE(review): connection credentials are hard-coded exactly as published;
# move them to configuration before reusing this script.
conn = pymysql.connect(host='localhost',
                       user='root',
                       password='123',
                       db='test',
                       charset='utf8',
                       )
try:
    cursor = conn.cursor()
    cursor.execute("DROP TABLE IF EXISTS comment")
    sql = """CREATE TABLE comment(name char(25),分數 char(110),評價 text(10000),日期 char (25) )"""
    cursor.execute(sql)

    # Hotel list endpoint (answers an XHR POST with JSON).
    url = 'http://hotel.elong.com/ajax/tmapilist/asyncsearch'
    # Headers copied from a browser session. Two deliberate differences from
    # the published text:
    #   * 'Accept' is restored to the value jQuery sends; the published
    #     string had lost its ", " separator and the "*/*" wildcard.
    #   * The hard-coded 'Content-Length: 1665' is dropped; it did not match
    #     the form body sent here and requests computes the real length.
    # NOTE(review): the cookie is a captured 2018 session and is presumably
    # expired - confirm whether the endpoint still needs one.
    header = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'hotel.elong.com',
        'Origin': 'http://hotel.elong.com',
        'Referer': 'http://hotel.elong.com/hangzhou/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'cookie': 'CookieGuid=ab7f1877-dfda-4882-b9c8-6116d3a41166; s_eVar44=brand360sem; fid=e607b469-3334-454a-90ef-e7282f7a4a45; __guid=206901770.676507428123648000.1541419138672.057; SHBrowseHotel=cn=91450084%2C%2C%2C%2C%2C%2C%3B92375725%2C%2C%2C%2C%2C%2C%3B21201436%2C%2C%2C%2C%2C%2C%3B&; ShHotel=CityID=1201&CityNameCN=%E6%9D%AD%E5%B7%9E%E5%B8%82&CityName=%E6%9D%AD%E5%B7%9E%E5%B8%82&OutDate=2018-11-09&CityNameEN=hangzhou&InDate=2018-11-08; SessionGuid=da5a119d-ff37-4b56-8f62-5b9b0615858a; Esid=59a9e2d7-1b62-4aa6-a390-56a3f43613c3; semtcid=cd67cc75-c72d-42a7-a755-180e4d22f6e9; semid=brand360sem; outerFrom=brand360sem; com.eLong.CommonService.OrderFromCookieInfo=Status=1&Orderfromtype=1&Isusefparam=0&Pkid=50792&Parentid=4300&Coefficient=0.0&Makecomefrom=0&Cookiesdays=0&Savecookies=0&Priority=9001; fv=pcweb; ext_param=bns%3D4%26ct%3D3; s_cc=true; s_visit=1; newjava2=0b48025058222f4e0d14b79dbc0d2df5; JSESSIONID=B97F1B21A3B60E349023A0C13129345D; anti_token=42126823-FBE1-4301-8FE6-50A6812CF3D5; __tctmb=0.2119295649609267.1541672954728.1541672954728.1; __tccgd=0.0; __tctmc=0.136017320; monitor_count=37; s_sq=elongcom%3D%2526pid%253Dhotel.elong.com%25252Fhangzhou%2526pidt%253D1%2526oid%253Djavascript%25253Avoid(0)%2526ot%253DA; __tctmd=0.1',
    }

    # Comment endpoint and its headers; both are loop-invariant, so they are
    # built once here rather than once per request.
    comment_url = 'http://hotel.elong.com/ajax/comment/getcommentbypage/'
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'hotel.elong.com',
        'Referer': 'http://hotel.elong.com/21201502/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }

    for list_page in range(1, 10):
        data = {
            'code': '8999595',
            'listRequest.pageIndex': list_page,
            'listRequest.pageSize': '20',
            'listRequest.cityName': '杭州市',
        }
        # print(data['listRequest.pageIndex'])
        html = requests.post(url, data=data, headers=header)
        value = html.json()['value']  # parse the JSON body once

        # 'hotelIds' is a comma-separated string; drop empty fragments so an
        # empty page does not trigger requests for hotelId=''.
        hotel_ids = [hid for hid in value['hotelIds'].split(',') if hid]

        soup = BeautifulSoup(value['hotelListHtml'], 'html.parser')
        hotel_names = [img['alt'] for img in soup.find_all('img')
                       if img.get('alt')]
        # print(hotel_names)

        # The published script opened one file per <img> but wrote every
        # hotel's reviews to the last handle only (and never closed any).
        # NOTE(review): this assumes the list HTML carries exactly one <img>
        # per hotel, in the same order as 'hotelIds' - confirm against a live
        # response. When the counts differ the hotel id is used as the file
        # name so reviews are never filed under the wrong hotel.
        names_aligned = len(hotel_names) == len(hotel_ids)

        for position, hotel_id in enumerate(hotel_ids):
            print(hotel_id)
            label = hotel_names[position] if names_aligned else hotel_id
            with open(label + '.txt', 'a+', encoding='UTF-8') as f:
                for comment_page in range(1, 10):
                    # Query string passed as a dict so requests handles the
                    # encoding. The published URL had a stray '+' after the
                    # hotel id and an empty key before the timestamp.
                    # NOTE(review): '_' is presumably jQuery's cache-busting
                    # parameter whose underscore was lost - confirm.
                    params = {
                        'hotelId': hotel_id,
                        'recommendedType': '1',
                        'pageIndex': str(comment_page),
                        'mainTagId': '0',
                        'subTagId': '0',
                        'rankType': '0',
                        'eToken': 'e607b469-3334-454a-90ef-e7282f7a4a45',
                        'code': '7051534',
                        '_': '1541673964193',
                    }
                    html1 = requests.get(comment_url, params=params,
                                         headers=headers)
                    # Iterate over whatever the page actually returned rather
                    # than a fixed 20 entries, which raised IndexError on
                    # short or empty pages.
                    comments = html1.json()['value'].get('Comments') or []
                    for entry in comments:
                        name = remove_emoji(entry['CommentUser']['NickName'],
                                            restr='')
                        f.write(name + '\n')
                        print(name)

                        comment = remove_emoji(entry['Content'], restr='')
                        f.write(comment + '\n')
                        print(comment)

                        created = entry['createTimeString']
                        f.write(created + '\n')
                        print(created)

                        score = entry['Source']
                        # str() in case the API delivers the score as a number.
                        f.write(str(score) + '\n')
                        print(score)

                        # NOTE: the INSERT into `comment` was disabled in the
                        # published script and stays disabled here. If it is
                        # re-enabled, pass (name, score, comment, created) as
                        # query parameters to cursor.execute() instead of
                        # %-formatting them into the SQL text, then commit.
finally:
    # Release the database connection even when a request or parse fails.
    conn.close()
學習總結:
1.由於藝龍網站的頁面都是動態網頁,要爬取的內容並不在原始的HTML原始碼中,而是由Ajax介面另外回傳;
2.在該任務中,學會了用get與post請求獲取內容。