【網站點選流資料分析】06-ETL
阿新 • • 發佈:2018-12-14
該專案的資料分析過程在hadoop叢集上實現,主要應用hive資料倉庫工具,因此,採集並經過預處理後的資料,需要載入到hive資料倉庫中,以進行後續的挖掘分析。
1、建立原始資料表
在hive倉庫中建貼源資料表
-- Source-aligned (ODS) table that receives the cleaned web-log records.
-- Every field is kept as a plain string, rows are stored as text with '\001'
-- as the field separator, and the table is partitioned by datestr.
DROP TABLE IF EXISTS ods_weblog_origin;

CREATE TABLE ods_weblog_origin (
    valid           STRING,
    remote_addr     STRING,
    remote_user     STRING,
    time_local      STRING,
    request         STRING,
    status          STRING,
    body_bytes_sent STRING,
    http_referer    STRING,
    http_user_agent STRING
)
PARTITIONED BY (datestr STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001';
點選流模型pageviews表
-- Click-stream "pageviews" model table.
-- All columns are strings, rows are '\001'-delimited text, and the table is
-- partitioned by datestr like the other ODS tables.
DROP TABLE IF EXISTS ods_click_pageviews;

CREATE TABLE ods_click_pageviews (
    session         STRING,
    remote_addr     STRING,
    time_local      STRING,
    request         STRING,
    visit_step      STRING,
    page_staylong   STRING,
    http_referer    STRING,
    http_user_agent STRING,
    body_bytes_sent STRING,
    status          STRING
)
PARTITIONED BY (datestr STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001';
點選流visit模型表 click_stream_visit
-- Click-stream "visit" model table, partitioned by datestr.
-- The drop statement must read IF EXISTS; "if exist" is a syntax error.
DROP TABLE IF EXISTS click_stream_visit;

CREATE TABLE click_stream_visit (
    session     STRING,
    remote_addr STRING,
    inTime      STRING,
    outTime     STRING,
    inPage      STRING,
    outPage     STRING,
    referal     STRING,
    pageVisits  INT
)
PARTITIONED BY (datestr STRING)
-- '\001' is Hive's default field delimiter for text tables; it is spelled out
-- here only so this table reads the same way as the other ODS tables.
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001';
時間維表建立
-- Time dimension table stored as comma-delimited text.
-- The drop targets dim_time itself; IF EXISTS goes before the table name.
DROP TABLE IF EXISTS dim_time;

CREATE TABLE dim_time (
    year  STRING,
    month STRING,
    day   STRING,
    hour  STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
2、匯入資料
匯入清洗結果資料到貼源資料表ods_weblog_origin。
-- Load the preprocessed (cleaned) log files into the 2013-09-18 partition of
-- ods_weblog_origin. OVERWRITE replaces whatever the partition already holds.
-- INPATH without LOCAL refers to the cluster filesystem, so the source files
-- are moved into the table's partition directory.
LOAD DATA INPATH '/weblog/preprocessed/16-02-24-16/'
OVERWRITE INTO TABLE ods_weblog_origin
PARTITION (datestr = '2013-09-18');
0: jdbc:hive2://localhost:10000> show partitions ods_weblog_origin;
+---------------------+--+
| partition |
+---------------------+--+
| datestr=2013-09-18 |
+---------------------+--+
0: jdbc:hive2://localhost:10000> select count(*) from ods_weblog_origin;
+--------+--+
| _c0 |
+--------+--+
| 11347 |
+--------+--+
匯入點選流模型pageviews資料到ods_click_pageviews表。
-- Load the click-stream pageviews model output into the 2013-09-18 partition
-- of ods_click_pageviews, replacing the partition's current contents.
LOAD DATA INPATH '/weblog/clickstream/pageviews'
OVERWRITE INTO TABLE ods_click_pageviews
PARTITION (datestr = '2013-09-18');
0: jdbc:hive2://hdp-node-01:10000> select count(1) from ods_click_pageviews;
+------+--+
| _c0 |
+------+--+
| 66 |
+------+--+
匯入點選流模型visit資料到click_stream_visit表。
-- Load the click-stream visit model output into the 2013-09-18 partition of
-- click_stream_visit, replacing the partition's current contents.
-- NOTE(review): this load uses LOCAL (path read from the local filesystem),
-- while the other two loads in this walkthrough read from the cluster
-- filesystem - confirm that LOCAL is intended here.
LOAD DATA LOCAL INPATH '/weblog/clickstream/visits'
OVERWRITE INTO TABLE click_stream_visit
PARTITION (datestr = '2013-09-18');
3、生成ODS層明細寬表
3.1、需求概述
整個資料分析的過程是按照資料倉庫的層次分層進行的,總體來說,是從ODS原始資料中整理出一些中間表(比如,為後續分析方便,將原始資料中的時間、url等非結構化資料作結構化抽取,將各種欄位資訊進行細化,形成明細表),然後再在中間表的基礎之上統計出各種指標資料。
3.2、ETL實現
建表——明細表 (源:ods_weblog_origin) (目標:ods_weblog_detail)
-- Detail (wide) table derived from ods_weblog_origin: the original log fields
-- plus the split-out date/time parts and the parsed referrer URL components.
-- IF EXISTS keeps the script re-runnable and matches the other DROP statements.
drop table if exists ods_weblog_detail;
create table ods_weblog_detail(
valid string, --validity flag
remote_addr string, --source IP
remote_user string, --user identifier
time_local string, --full access timestamp
daystr string, --access date
timestr string, --access time
month string, --access month
day string, --access day
hour string, --access hour
request string, --requested url
status string, --response code
body_bytes_sent string, --bytes transferred
http_referer string, --referrer url
ref_host string, --referrer host
ref_path string, --referrer path
ref_query string, --referrer query string
ref_query_id string, --value of the "id" parameter in the referrer query
http_user_agent string --client user agent
)
partitioned by(datestr string);
抽取refer_url到中間表 "t_ods_tmp_referurl"
將來訪url分離出host path query query id
-- Intermediate table: every column of ods_weblog_origin plus the referrer URL
-- split into host, path, query and the value of the "id" query parameter.
-- Double quotes are stripped from http_referer before it is parsed.
-- a.* / b.* are kept on purpose so the resulting schema stays exactly what
-- the downstream t_ods_tmp_detail step (which selects b.*) expects.
DROP TABLE IF EXISTS t_ods_tmp_referurl;

CREATE TABLE t_ods_tmp_referurl AS
SELECT
    a.*,
    b.*
FROM ods_weblog_origin a
LATERAL VIEW parse_url_tuple(
    regexp_replace(http_referer, "\"", ""),
    'HOST', 'PATH', 'QUERY', 'QUERY:id'
) b AS host, path, query, query_id;
抽取轉換time_local欄位到中間明細表 "t_ods_tmp_detail"
-- Intermediate detail table: everything from t_ods_tmp_referurl plus the
-- date/time parts cut out of time_local.
-- NOTE(review): the offsets assume time_local looks like 'yyyy-MM-dd HH:mm:ss'
-- (the layout implied by the month/day offsets of the rewritten insert in
-- this walkthrough) - confirm against the preprocessing output.
--   positions 1-10  -> date        (yyyy-MM-dd)
--   position  12..  -> time of day (HH:mm:ss), skipping the separator at 11
--   positions 6-7   -> month, 9-10 -> day, 12-13 -> hour
-- Hive string positions are 1-based; a start of 0 is treated like 1, so the
-- daystr expression is unchanged in effect and merely written explicitly.
DROP TABLE IF EXISTS t_ods_tmp_detail;

CREATE TABLE t_ods_tmp_detail AS
SELECT
    b.*,
    substring(time_local, 1, 10) AS daystr,
    substring(time_local, 12)    AS tmstr,
    substring(time_local, 6, 2)  AS month,
    substring(time_local, 9, 2)  AS day,
    substring(time_local, 12, 2) AS hour
FROM t_ods_tmp_referurl b;
以上語句可以改寫成:
-- Single-statement version of the two intermediate steps: parse the referrer
-- URL and cut the date/time parts out of time_local, writing straight into
-- one partition of zs.ods_weblog_detail.
-- The select list is positional and must stay in the column order of
-- ods_weblog_detail (valid ... http_user_agent).
-- NOTE(review): '$day_01' is not HiveQL; presumably a wrapping shell script
-- substitutes the date before the statement reaches Hive - confirm.
-- NOTE(review): offsets assume time_local is 'yyyy-MM-dd HH:mm:ss' - confirm.
-- The hour is taken from positions 12-13 so it no longer carries the
-- separator character that sits at position 11.
INSERT INTO TABLE zs.ods_weblog_detail PARTITION (datestr = '$day_01')
SELECT
    a.valid,
    a.remote_addr,
    a.remote_user,
    a.time_local,
    substring(a.time_local, 1, 10) AS daystr,
    substring(a.time_local, 12)    AS tmstr,
    substring(a.time_local, 6, 2)  AS month,
    substring(a.time_local, 9, 2)  AS day,
    substring(a.time_local, 12, 2) AS hour,
    a.request,
    a.status,
    a.body_bytes_sent,
    a.http_referer,
    b.ref_host,
    b.ref_path,
    b.ref_query,
    b.ref_query_id,
    a.http_user_agent
FROM zs.ods_weblog_origin a
LATERAL VIEW parse_url_tuple(
    regexp_replace(http_referer, "\"", ""),
    'HOST', 'PATH', 'QUERY', 'QUERY:id'
) b AS ref_host, ref_path, ref_query, ref_query_id;
0: jdbc:hive2://localhost:10000> show partitions ods_weblog_detail;
+---------------------+--+
| partition |
+---------------------+--+
| dd=18%2FSep%2F2013 |
+---------------------+--+
1 row selected (0.134 seconds)