【大資料】hive 分析 nginx 日誌
阿新 • • 發佈:2020-08-25
目錄
1.nginx 日誌收集
# Validate the nginx configuration.
nginx -t
# Inspect the logging configuration (access_log / log_format directives).
less /etc/nginx/nginx.conf
# List the available log files.
cd /var/log/nginx
ls -l
# Merge the current log and all rotated, gzip-compressed logs into one file.
# The first redirect (>) creates/truncates nginx.log; the second MUST append
# (>>), otherwise the contents of access.log written just above are overwritten.
cat access.log > nginx.log
gunzip -c access.log*gz >> nginx.log
# Compress the merged file (produces nginx.log.gz) and send it to the local
# machine with sz (presumably the lrzsz ZMODEM tool; needs a compatible terminal).
gzip nginx.log
sz nginx.log.gz
2.hive 建表載入資料
-- Parse each nginx access-log line with a regular expression: every capture
-- group (...) maps, in order, to one column of the table. Backslashes in the
-- pattern are doubled because it is written inside a Hive string literal.
-- The pattern presumably targets nginx's "combined" log format; verify it
-- against the log_format directive in nginx.conf.
-- NOTE: lines that do not match the pattern come back with every column NULL.
drop table if exists spider.nginx_log;

create table spider.nginx_log (
    remote_addr     STRING,
    remote_user     STRING,
    time_local      STRING,
    request         STRING,
    status          STRING,
    body_bytes_sent STRING,
    http_referer    STRING,
    http_user_agent STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    -- "output.format.string" was removed: serde2's RegexSerDe only
    -- deserializes, so that property is not used.
    "input.regex" = '(.*?) - (.*?) \\[(.*?)\\] "(.*?)" (\\d+) (\\d+) "(.*?)" "(.*?)"'
);

-- Load the merged log from the local filesystem
-- (3236712 rows in the original run).
load data local inpath '/home/getway/tmp/way/nginx.log' into table spider.nginx_log;
3.分析資料
-- Peek at a handful of parsed rows to sanity-check the regex mapping.
select
    remote_addr,
    remote_user,
    time_local,
    request,
    status,
    body_bytes_sent,
    http_referer,
    http_user_agent
from spider.nginx_log
limit 10;
-- Requests per client IP, busiest first.
-- Sorting uses a named alias: positional "order by 2" only works when Hive's
-- position-alias setting is enabled.
select
    remote_addr,
    count(1) as request_cnt
from spider.nginx_log
group by remote_addr
order by request_cnt desc;

-- Most-viewed pages (PV) among request lines that contain 'comics'.
select
    request,
    count(1) as pv
from spider.nginx_log
where request rlike 'comics'
group by request
order by pv desc;

-- Requests per day.
-- The first 11 characters of time_local are the date part (dd/MMM/yyyy, e.g.
-- 25/Aug/2020). Sorting that text is not chronological, so it is converted to
-- yyyy-MM-dd first. NOTE(review): parsing 'MMM' assumes English month names
-- in the JVM locale; an unparseable value yields NULL. Confirm on your cluster.
select
    from_unixtime(unix_timestamp(substring(time_local, 1, 11), 'dd/MMM/yyyy'), 'yyyy-MM-dd') as log_date,
    count(1) as request_cnt
from spider.nginx_log
group by from_unixtime(unix_timestamp(substring(time_local, 1, 11), 'dd/MMM/yyyy'), 'yyyy-MM-dd')
order by log_date;

-- Requests per hour of day.
-- Characters 13-14 of time_local are the two-digit hour, so a plain text sort
-- is already in hour order.
select
    substring(time_local, 13, 2) as log_hour,
    count(1) as request_cnt
from spider.nginx_log
group by substring(time_local, 13, 2)
order by log_hour;
4.資料視覺化
(此部分尚未完成,後續補充。)