[Big Data] Analyzing nginx Logs with Hive

1. Collecting the nginx logs

# Check the nginx configuration
nginx -t

# Check the logging configuration
less /etc/nginx/nginx.conf

# Inspect the logs
cd /var/log/nginx;
ll

# Merge the current and rotated logs, then package them
cat access.log > nginx.log;
gunzip -c access.log*gz >> nginx.log;
gzip nginx.log;
sz nginx.log.gz;
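
For reference, the table definition and regex in the next section assume nginx's default combined log format: $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent". A typical line (a made-up example, not taken from the real logs) looks like this:

203.0.113.7 - - [02/Jan/2021:13:05:22 +0800] "GET /comics/123 HTTP/1.1" 200 5316 "https://example.com/" "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"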

2. Creating the Hive table and loading the data

-- Parse with the regex SerDe: each () capture group maps to one column; mind the escaping
drop table if exists spider.nginx_log;
create table spider.nginx_log(
remote_addr STRING,
remote_user STRING,
time_local STRING,
request STRING,
status STRING,
body_bytes_sent STRING,
http_referer STRING,
http_user_agent STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = '(.*?) - (.*?) \\[(.*?)\\] "(.*?)" (\\d+) (\\d+) "(.*?)" "(.*?)"',
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s"
);

-- Load the data: 3,236,712 rows in total
load data local inpath '/home/getway/tmp/way/nginx.log' into table spider.nginx_log;
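
After loading, it is worth checking that the regex actually matched the data: the built-in RegexSerDe returns NULL for every column of a line that does not match the pattern. A quick sanity check (a sketch against the table above):

-- Total rows vs. rows the regex failed to parse (all columns NULL on a mismatch)
select count(1) as total_rows,
       sum(case when remote_addr is null then 1 else 0 end) as unparsed_rows
from spider.nginx_log;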

3. Analyzing the data

-- Preview a few rows
select * from spider.nginx_log limit 10;

-- Requests per IP
select remote_addr, count(1) as cnt
from spider.nginx_log
group by remote_addr
order by cnt desc;
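
If the full per-IP list is too long, a top-N variant that also shows each IP's share of total traffic can be more readable (a sketch using a window function; the limit of 20 is arbitrary):

-- Top 20 IPs by request count, with their share of all requests
select remote_addr,
       count(1) as cnt,
       round(count(1) / sum(count(1)) over () * 100, 2) as pct
from spider.nginx_log
group by remote_addr
order by cnt desc
limit 20;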

-- Pages with the highest PV
select request, count(1) as pv
from spider.nginx_log
where request rlike 'comics'
group by request
order by pv desc;
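
Note that request holds the whole request line ("GET /comics/123 HTTP/1.1"), so grouping by it splits the same page across HTTP methods and protocol versions. A rough variant that groups by the request path only (a sketch; split(request, ' ')[1] assumes well-formed "METHOD path HTTP/x.x" lines):

-- PV per URL path, ignoring method and protocol version
select split(request, ' ')[1] as req_path, count(1) as pv
from spider.nginx_log
where request rlike 'comics'
group by split(request, ' ')[1]
order by pv desc;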

-- Requests per day
select substring(time_local, 1, 11) as dt, count(1) as cnt
from spider.nginx_log
group by substring(time_local, 1, 11)
order by dt;
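
The fixed-width prefix works because time_local looks like "02/Jan/2021:13:05:22 +0800", but ordering that string sorts by day-of-month first, so days from different months interleave. To get a proper yyyy-MM-dd value that sorts and plots correctly, the timestamp can be parsed explicitly (a sketch using Hive's unix_timestamp/from_unixtime; the 'MMM' pattern assumes English month abbreviations):

-- Requests per calendar day, with time_local parsed into a real date
select from_unixtime(unix_timestamp(time_local, 'dd/MMM/yyyy:HH:mm:ss Z'), 'yyyy-MM-dd') as dt,
       count(1) as cnt
from spider.nginx_log
group by from_unixtime(unix_timestamp(time_local, 'dd/MMM/yyyy:HH:mm:ss Z'), 'yyyy-MM-dd')
order by dt;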

-- Requests per hour of the day
select substring(time_local, 13, 2) as hr, count(1) as cnt
from spider.nginx_log
group by substring(time_local, 13, 2)
order by hr;

4. Data visualization

To be covered in a follow-up post.