
Visualizing Qunar Travel Data


I. Abstract

  This project scrapes travel data from Qunar (去哪兒網) and visualizes it with the pyecharts library.

II. Background

  China's tourism industry has been developing rapidly. The pandemic turned domestic travel into a new trend, and because the domestic outbreak was contained effectively, China was among the first countries to reopen for tourism.

This project visualizes domestic travel data to identify suitable times and destinations for a trip.

III. Process and Code

1. The scraper for Qunar travelogue pages

import requests
from bs4 import BeautifulSoup
import re
import time
import csv
import random
# Crawl every page of the travelogue list and collect each travelogue's id
fb = open(r'url.txt', 'w')
url = 'http://travel.qunar.com/travelbook/list.htm?page={}&order=hot_heat&avgPrice=1_2'
# Request headers; the cookies can be copied from the browser's developer tools
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.360',
    'cookies': 'JSESSIONID=5E9DCED322523560401A95B8643B49DF; QN1=00002b80306c204d8c38c41b; QN300=s%3Dbaidu; QN99=2793; QN205=s%3Dbaidu; QN277=s%3Dbaidu; QunarGlobal=10.86.213.148_-3ad026b5_17074636b8f_-44df|1582508935699; QN601=64fd2a8e533e94d422ac3da458ee6e88; _i=RBTKSueZDCmVnmnwlQKbrHgrodMx; QN269=D32536A056A711EA8A2FFA163E642F8B; QN48=6619068f-3a3c-496c-9370-e033bd32cbcc; fid=ae39c42c-66b4-4e2d-880f-fb3f1bfe72d0; QN49=13072299; csrfToken=51sGhnGXCSQTDKWcdAWIeIrhZLG86cka; QN163=0; Hm_lvt_c56a2b5278263aa647778d304009eafc=1582513259,1582529930,1582551099,1582588666; viewdist=298663-1; uld=1-300750-1-1582590496|1-300142-1-1582590426|1-298663-1-1582590281|1-300698-1-1582514815; _vi=6vK5Gry4UmXDT70IFohKyFF8R8Mu0SvtUfxawwaKYRTq9NKud1iKUt8qkTLGH74E80hXLLVOFPYqRGy52OuTFnhpWvBXWEbkOJaDGaX_5L6CnyiQPPOYb2lFVxrJXsVd-W4NGHRzYtRQ5cJmiAbasK8kbNgDDhkJVTC9YrY6Rfi2; viewbook=7562814|7470570|7575429|7470584|7473513; QN267=675454631c32674; Hm_lpvt_c56a2b5278263aa647778d304009eafc=1582591567; QN271=c8712b13-2065-4aa7-a70b-e6156f6fc216',
    'referer': 'http://travel.qunar.com/travelbook/list.htm?page=1&order=hot_heat&avgPrice=1'
}
count = 1
# 200 list pages in total
for i in range(1, 201):
    url_ = url.format(i)
    try:
        response = requests.get(url=url_, headers=headers)
        response.encoding = 'utf-8'
        html = response.text
        soup = BeautifulSoup(html, 'lxml')
        all_url = soup.find_all('li', attrs={'class': 'list_item'})
        print('Crawling list page %s' % count)
        for each in all_url:
            each_url = each.find('h2')['data-bookid']   # the travelogue id
            fb.write(each_url)
            fb.write('\n')
        time.sleep(random.randint(3, 5))   # randomized delay between requests
        count += 1
    except Exception as e:
        print(e)
fb.close()

# Build the full travelogue URLs from the collected ids
url_list = []
with open('url.txt', 'r') as f:
    for i in f.readlines():
        url_list.append(i.strip())
the_url_list = []
for i in range(len(url_list)):
    the_url_list.append('http://travel.qunar.com/youji/' + str(url_list[i]))

last_list = []

def spider():
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.360',
        'cookies': 'QN1=00002b80306c204d8c38c41b; QN300=s%3Dbaidu; QN99=2793; QN205=s%3Dbaidu; QN277=s%3Dbaidu; QunarGlobal=10.86.213.148_-3ad026b5_17074636b8f_-44df|1582508935699; QN601=64fd2a8e533e94d422ac3da458ee6e88; _i=RBTKSueZDCmVnmnwlQKbrHgrodMx; QN269=D32536A056A711EA8A2FFA163E642F8B; QN48=6619068f-3a3c-496c-9370-e033bd32cbcc; fid=ae39c42c-66b4-4e2d-880f-fb3f1bfe72d0; QN49=13072299; csrfToken=51sGhnGXCSQTDKWcdAWIeIrhZLG86cka; QN163=0; Hm_lvt_c56a2b5278263aa647778d304009eafc=1582513259,1582529930,1582551099,1582588666; viewdist=298663-1; uld=1-300750-1-1582590496|1-300142-1-1582590426|1-298663-1-1582590281|1-300698-1-1582514815; viewbook=7575429|7473513|7470584|7575429|7470570; QN267=67545462d93fcee; _vi=vofWa8tPffFKNx9MM0ASbMfYySr3IenWr5QF22SjnOoPp1MKGe8_-VroXhkC0UNdM0WdUnvQpqebgva9VacpIkJ3f5lUEBz5uyCzG-xVsC-sIV-jEVDWJNDB2vODycKN36DnmUGS5tvy8EEhfq_soX6JF1OEwVFXk2zow0YZQ2Dr; Hm_lpvt_c56a2b5278263aa647778d304009eafc=1582603181; QN271=fc8dd4bc-3fe6-4690-9823-e27d28e9718c',
        'Host': 'travel.qunar.com'
    }
    count = 1
    for i in range(len(the_url_list)):
        try:
            print('Crawling travelogue %s' % count)
            response = requests.get(url=the_url_list[i], headers=headers)
            response.encoding = 'utf-8'
            html = response.text
            soup = BeautifulSoup(html, 'lxml')
            # The breadcrumb holds the location and the travelogue title
            information = soup.find('p', attrs={'class': 'b_crumb_cont'}).text.strip().replace(' ', '')
            info = information.split('>')
            if len(info) > 2:
                location = info[1].replace('\xa0', '').replace('旅遊攻略', '')
                introduction = info[2].replace('\xa0', '')
            else:
                location = info[0].replace('\xa0', '')
                introduction = info[1].replace('\xa0', '')
            # The foreword list holds departure date, days, cost, companions and play styles
            other_information = soup.find('ul', attrs={'class': 'foreword_list'})
            when = other_information.find('li', attrs={'class': 'f_item when'})
            time1 = when.find('p', attrs={'class': 'txt'}).text.replace('出發日期', '').strip()
            howlong = other_information.find('li', attrs={'class': 'f_item howlong'})
            day = howlong.find('p', attrs={'class': 'txt'}).text.replace('天數', '').replace('/', '').replace('天', '').strip()
            howmuch = other_information.find('li', attrs={'class': 'f_item howmuch'})
            money = howmuch.find('p', attrs={'class': 'txt'}).text.replace('人均費用', '').replace('/', '').replace('元', '').strip()
            who = other_information.find('li', attrs={'class': 'f_item who'})
            people = who.find('p', attrs={'class': 'txt'}).text.replace('人物', '').replace('/', '').strip()
            how = other_information.find('li', attrs={'class': 'f_item how'})
            play = how.find('p', attrs={'class': 'txt'}).text.replace('玩法', '').replace('/', '').strip()
            Look = soup.find('span', attrs={'class': 'view_count'}).text.strip()
            # Fall back to '-' for any field the page does not provide
            Time = time1 if time1 else '-'
            Day = day if day else '-'
            Money = money if money else '-'
            People = people if people else '-'
            Play = play if play else '-'
            last_list.append([location, introduction, Time, Day, Money, People, Play, Look])
            time.sleep(random.randint(3, 5))   # crawl delay
            count += 1
        except Exception as e:
            print(e)
    # Write the results to csv
    with open('Travel.csv', 'a', encoding='utf-8-sig', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(['地點', '短評', '出發時間', '天數', '人均費用', '人物', '玩法', '瀏覽量'])
        for rows in last_list:
            writer.writerow(rows)

if __name__ == '__main__':
    spider()
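
One refinement worth noting (a sketch, not part of the original script): a requests.Session keeps the TCP connection alive and carries the headers across all 200 list-page requests automatically:

import requests

session = requests.Session()
session.headers.update(headers)   # reuse the same user-agent / cookies / referer as above
resp = session.get('http://travel.qunar.com/travelbook/list.htm?page=1&order=hot_heat&avgPrice=1_2')
resp.encoding = 'utf-8'
print(resp.status_code)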

Following the page structure, the scraper extracts the required fields and writes them to a CSV file; in total, data from 1,603 pages was scraped.
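
As a quick sanity check (a minimal sketch, assuming the scraper wrote Travel.csv to the working directory), the row count should be close to the number of pages crawled:

import pandas as pd

# Confirm the number of records and the eight scraped columns
check = pd.read_csv('Travel.csv')
print(check.shape)    # roughly (1603, 8)
print(check.head())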

2. The visualization code

(1) Read the scraped The_Travel.csv file

import pandas as pd
data = pd.read_csv('The_Travel.csv')
data

(2) Inspect the DataFrame

data.info()

(3) Clean the data by filtering on several conditions

data = data[~data['地點'].isin(['攻略'])]
data = data[~data['天數'].isin(['99+'])]
data['天數'] = data['天數'].astype(int)
data = data[data['人均費用'].values>200]
data = data[data['天數']<=15]
data = data.reset_index(drop=True)
data
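
A short check (assuming the filters above ran without error) confirms every remaining row satisfies the conditions:

# No trip should exceed 15 days or cost 200 or less per person
assert (data['天數'] <= 15).all()
assert (data['人均費用'] > 200).all()
print(len(data), 'rows remain after cleaning')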

(4) Extract the travel month

def Month(e):
    # The month sits at index 2 of the '/'-separated departure-date string
    months = {'01': '一月', '02': '二月', '03': '三月', '04': '四月',
              '05': '五月', '06': '六月', '07': '七月', '08': '八月',
              '09': '九月', '10': '十月', '11': '十一月', '12': '十二月'}
    m = str(e).split('/')[2]
    return months.get(m)
data['旅行月份'] = data['出發時間'].apply(Month)
data['出發時間']=pd.to_datetime(data['出發時間'])
data
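
For example, if the scraped dates carry a leading separator as in '/2021/07/15' (an assumption implied by the index 2 used above):

print(Month('/2021/07/15'))   # -> 七月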

(5) Parse the view counts and show the first few rows

import re
def Look(e):
    # View counts like '2.3萬' (2.3 x 10,000) are converted to plain numbers
    if '萬' in e:
        num1 = re.findall('(.*?)萬', e)
        return float(num1[0]) * 10000
    else:
        return float(e)
data['瀏覽次數'] = data['瀏覽量'].apply(Look)
data.drop(['瀏覽量'], axis=1, inplace=True)
data['瀏覽次數'] = data['瀏覽次數'].astype(int)
data.head()
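
A couple of sample conversions:

print(Look('2.3萬'))   # 23000.0
print(Look('864'))     # 864.0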

(6) Count and rank the most frequent destination cities

data1 = data   # note: this aliases data rather than copying it; data.copy() would leave data untouched
data1['地點'].value_counts().head(10)

(7) Compute the mean per-capita cost for the top ten cities

loc = data1['地點'].value_counts().head(10).index.tolist()
print(loc)
loc_data = data1[data1['地點'].isin(loc)]
price_mean = round(loc_data['人均費用'].groupby(loc_data['地點']).mean(),1)
print(price_mean)
# The printed means, re-entered by hand to line up with the order of loc for plotting
price_mean2 = [1630.1,1862.9,1697.9,1743.4,1482.4,1586.4,1897.0,1267.5,1973.8,1723.7]
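
Retyping printed values is error-prone; a small sketch that builds the same list directly from the price_mean computed above:

# Pull the rounded means out in the order of loc
price_mean2 = [price_mean[c] for c in loc]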

(8) Bar chart of per-capita cost for the top ten destinations

from pyecharts import Bar
bar = Bar('目的地Top10人均費用',width = 800,height = 500,title_text_size = 20)
bar.add('',loc,price_mean2,is_label_show = True,is_legend_show= True)
bar.render('人均費用.html')
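
These charts use the pre-1.0 pyecharts API (from pyecharts import Bar). On pyecharts 1.x the import paths and call style changed; a rough equivalent sketch, assuming pyecharts >= 1.0 is installed:

from pyecharts.charts import Bar
from pyecharts import options as opts

bar = (
    Bar(init_opts=opts.InitOpts(width='800px', height='500px'))
    .add_xaxis(loc)
    .add_yaxis('', price_mean2)
    .set_global_opts(title_opts=opts.TitleOpts(title='目的地Top10人均費用'))
)
bar.render('人均費用.html')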

(9) Derive the trip-duration label

data1['旅行時長'] = data1['天數'].apply(lambda x: str(x) + '天')   # e.g. 7 -> '7天'
data1

(10) Count the travel-companion types

data1['人物'].value_counts()

(11) Sort the records by view count

# Reorder the rows by descending view count
m = data1['瀏覽次數'].sort_values(ascending=False).index.tolist()
data1 = data1.loc[m]
data1 = data1.reset_index(drop=True)
data1

(12) Rank the months by number of trips

data1['旅行月份'].value_counts()

(13) Extract the play-style keywords and count them

word_list = []
for i in data1['玩法']:
    s = re.split('\xa0', i)   # play styles are separated by non-breaking spaces
    word_list.append(s)
word_dict = {}                # keyword -> occurrence count
for j in range(len(word_list)):
    for i in word_list[j]:
        if i not in word_dict:
            word_dict[i] = 1
        else:
            word_dict[i] += 1
word_pairs = []
for item in word_dict.items():
    word_pairs.append(item)
# Simple bubble sort: order the (keyword, count) pairs by descending count
for i in range(1, len(word_pairs)):
    for j in range(0, len(word_pairs) - 1):
        if word_pairs[j][1] < word_pairs[j + 1][1]:
            word_pairs[j], word_pairs[j + 1] = word_pairs[j + 1], word_pairs[j]
print(word_pairs)
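
The counting and the hand-written bubble sort can also be done with the standard library; a one-line sketch using collections.Counter:

from collections import Counter

# Count every keyword and get (keyword, count) pairs pre-sorted by frequency
word_pairs = Counter(w for tags in data1['玩法'] for w in tags.split('\xa0')).most_common()
print(word_pairs)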

(14) Donut chart of companion types

from pyecharts import Pie

m1 = data1['人物'].value_counts().index.tolist()
n1 = data1['人物'].value_counts().values.tolist()
pie =Pie('出遊結伴方式',background_color = 'white',width = 800,height = 500,title_text_size = 20)
pie.add('',m1,n1,is_label_show = True,is_legend_show= True,radius=[40, 75])
pie.render('1.html')

(15) Bar chart of the top ten destinations

from pyecharts import Bar

m2 = data1['地點'].value_counts().head(10).index.tolist()
n2 = data1['地點'].value_counts().head(10).values.tolist()

bar = Bar('',width = 800,height = 500,title_text_size = 20)
bar.add('',m2,n2,is_label_show = True,is_legend_show= True)
bar.render('前十目的地.html')

(16) Line chart of 2021 departures

from pyecharts import Line

m3 = data1['出發時間'].value_counts().sort_index()
m4 = m3['2021'].index    # departure dates in 2021
n4 = m3['2021'].values   # trips per date

m3['2021'].sort_values().tail(10)   # the ten busiest departure dates of 2021

line = Line('出遊時間曲線',width = 800,height = 500,title_text_size = 20)
line.add('',m4,n4,is_legend_show= True)
line.render('出遊曲線.html')
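
The daily counts make a jagged curve; an alternative sketch (assuming 出發時間 was converted with pd.to_datetime in step (4)) aggregates by month first:

# Trips per month in 2021; a smoother series to plot
monthly = data1.set_index('出發時間').loc['2021'].resample('M').size()
print(monthly)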

(17) Bar chart of play styles

m5 = []
n5 = []
for i in range(20):                 # top 20 play-style keywords
    m5.append(word_pairs[i][0])
    n5.append(word_pairs[i][1])
m5.reverse()                        # reverse so the biggest bar sits at the top
n5.reverse()
bar = Bar('出遊玩法', width=1000, height=600, title_text_size=30)
bar.add('', m5, n5, is_convert=True, is_label_show=True, label_pos='right')
bar.render('出遊玩法.html')

(18) Filter trips in July and August with companion type '三五好友', keeping the descending view-count order

data_mo = data1[((data1['旅行月份'] =='七月')|(data1['旅行月份'] =='八月'))&(data1['人物']=='三五好友')].drop(['旅行時長'],axis = 1)
data_mo.head(10)
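
The same filter pattern generalizes to any month/companion combination; top_trips below is a hypothetical helper, not part of the original notebook:

def top_trips(df, months, companion, n=10):
    # df is assumed to be sorted by view count already (step (11))
    mask = df['旅行月份'].isin(months) & (df['人物'] == companion)
    return df[mask].drop(['旅行時長'], axis=1).head(n)

top_trips(data1, ['七月', '八月'], '三五好友')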

IV. Conclusion

Taken together, the data let us analyze and rank China's domestic tourist cities using Qunar's travelogues, helping travelers make better-informed choices and showing how travel to each city has fared since the pandemic.