Hive學習視訊心得（四）常用查詢函式

阿新 • • 發佈：2021-01-15

技術標籤：大資料——數倉工具Hive hive 大資料資料倉庫 hadoop mapreduce

文章目錄

常用查詢函式

常用查詢函式

1、空欄位賦值（NVL）

給值為NULL的資料賦值，它的格式是NVL(value, default_value)。它的功能是：如果value為NULL，則NVL函式返回default_value的值，否則返回value的值；如果兩個引數都為NULL，則返回NULL。

-- Syntax: NVL(value, default_value)
-- List every employee's bonus next to the same bonus with NULL replaced by -1.
SELECT
    comm,
    NVL(comm, -1)
FROM emp;

2、CASE WHEN（相當於java的switch case）

--舉例：求出不同部門男女各多少人
--輸入資料
name	dept_id	  sex
悟空		A		男
大海		A		男
宋宋		B		男
鳳姐		A		女
婷姐		B		女
婷婷		B		女
--需求資料
A     2     1
B     1     2

-- Step 1: create the table; the fields of each input line are tab-separated.
CREATE TABLE emp_sex (
    name    STRING,
    dept_id STRING,
    sex     STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY "\t";

-- Step 2: load the sample rows from the local file system.
LOAD DATA LOCAL INPATH '/usr/local/soft/hive-1.2.1/data/emp_sex.txt'
INTO TABLE emp_sex;

-- Step 3: count male and female employees per department.
-- Each CASE yields 1 for a matching row and 0 otherwise, so SUM works as a conditional count.
SELECT
    dept_id,
    SUM(CASE WHEN sex = '男' THEN 1 ELSE 0 END) AS male,
    SUM(CASE WHEN sex = '女' THEN 1 ELSE 0 END) AS female
FROM emp_sex
GROUP BY dept_id;

3、行轉列（聚合）

COLLECT_SET(col)：函式只接受基本資料型別，它的主要作用是將某欄位的值進行去重彙總，產生array型別欄位。COLLECT_LIST(col)用法相同，但不去重。CONCAT_WS(separator, str1, str2, ...)：用指定的分隔符拼接多個字串或字串陣列。

--舉例：把星座和血型一樣的人歸類到一起。
--輸入資料
name	constellation	blood_type
孫悟空		白羊座			  A
大  海	 射手座		   A
宋  宋	 白羊座		   B
豬八戒		白羊座			  A
鳳  姐	 射手座		   A
蒼老師		白羊座			  B
--需求資料
射手座,A            大  海|鳳  姐
白羊座,A            孫悟空|豬八戒
白羊座,B            宋  宋|蒼老師

-- Step 1: create the table; the fields of each input line are tab-separated.
CREATE TABLE person_info (
    name          STRING,
    constellation STRING,
    blood_type    STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY "\t";

-- Step 2: load the sample rows from the local file system.
LOAD DATA LOCAL INPATH '/usr/local/soft/hive-1.2.1/data/constellation.txt'
INTO TABLE person_info;

-- Step 3: one row per (constellation, blood_type) pair.
-- con: the pair joined with a comma; ren: the names of the group joined with "|".
SELECT
    CONCAT(constellation, ",", blood_type) AS con,
    CONCAT_WS("|", COLLECT_LIST(name))     AS ren
FROM person_info
GROUP BY
    constellation,
    blood_type;

4、列轉行

EXPLODE(col)：將hive一列中複雜的array或者map結構拆分成多行。

--將電影分類中的陣列資料展開
--輸入資料 
   movie		  category
《疑犯追蹤》	  懸疑,動作,科幻,劇情
《Lie to me》  懸疑,警匪,動作,心理,劇情
《戰狼2》       戰爭,動作,災難
--需求資料
《疑犯追蹤》      懸疑
《疑犯追蹤》      動作
《疑犯追蹤》      科幻
《疑犯追蹤》      劇情
《Lie to me》   懸疑
《Lie to me》   警匪
《Lie to me》   動作
《Lie to me》   心理
《Lie to me》   劇情
《戰狼2》        戰爭
《戰狼2》        動作
《戰狼2》        災難

-- Step 1: create the table.
-- The input separates movie and category with a tab, and the individual categories
-- with commas. COLLECTION ITEMS TERMINATED BY "," is needed so that the category
-- column is parsed into one array element per category. Without it Hive uses its
-- default collection delimiter, the whole comma-separated string is loaded as a
-- single element, and EXPLODE would emit only one row per movie.
CREATE TABLE movie_info (
    movie    STRING,
    category ARRAY<STRING>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY "\t"
    COLLECTION ITEMS TERMINATED BY ",";

-- Step 2: load the sample rows from the local file system.
LOAD DATA LOCAL INPATH '/usr/local/soft/hive-1.2.1/data/movie.txt'
INTO TABLE movie_info;

-- Step 3: flatten the category array so that each (movie, category) pair gets its own row.
SELECT
    movie,
    category_name
FROM movie_info
LATERAL VIEW EXPLODE(category) table_tmp AS category_name;

拓展：將上述原表按照電影類別劃分電影，輸出如下：

| 劇情               | ["《疑犯追蹤》","《Lie to me》"]          |
| 動作               | ["《疑犯追蹤》","《Lie to me》","《戰狼2》"]  |
| 心理               | ["《Lie to me》"]                   |
| 懸疑               | ["《疑犯追蹤》","《Lie to me》"]          |
| 戰爭               | ["《戰狼2》"]                         |
| 災難               | ["《戰狼2》"]                         |
| 科幻               | ["《疑犯追蹤》"]                        |
| 警匪               | ["《Lie to me》"]                   |

-- Group the movies by category: explode the array first, then collect the
-- movie titles that belong to each category.
WITH b AS (
    -- One row per (movie, category) pair.
    SELECT
        movie,
        category_name
    FROM movie_info
    LATERAL VIEW EXPLODE(category) exploded AS category_name
)
SELECT
    b.category_name,
    COLLECT_SET(b.movie)
FROM b
GROUP BY b.category_name;

5、視窗函式（開窗函式）

OVER()：指定分析函式工作的資料視窗大小，這個資料視窗大小可能會隨著行的變化而變化。
CURRENT ROW：當前行
n PRECEDING：往前n行資料
n FOLLOWING：往後n行資料
UNBOUNDED：起點，UNBOUNDED PRECEDING 表示從前面的起點， UNBOUNDED FOLLOWING表示到後面的終點
LAG(col,n,default_val)：往前第n行資料
LEAD(col,n, default_val)：往後第n行資料
NTILE(n)：把有序視窗的行分發到指定數量（n個）的組中，各個組有編號，編號從1開始，對於每一行，NTILE返回此行所屬的組的編號。注意：n必須為int型別。

--輸入資料
jack,2017-01-01,10
tony,2017-01-02,15
jack,2017-02-03,23
tony,2017-01-04,29
jack,2017-01-05,46
jack,2017-04-06,42
tony,2017-01-07,50
jack,2017-01-08,55
mart,2017-04-08,62
mart,2017-04-09,68
neil,2017-05-10,12
mart,2017-04-11,75
neil,2017-06-12,80
mart,2017-04-13,94

-- Create the table; the fields of each input line are comma-separated.
CREATE TABLE business (
    name      STRING,
    orderdate STRING,
    cost      INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

-- Load the sample rows from the local file system.
LOAD DATA LOCAL INPATH "/usr/local/soft/hive-1.2.1/data/business.txt"
INTO TABLE business;

--需求：
/* Requirement 1: customers who bought in April 2017 and how many of them there are
   (SUBSTRING extracts the "yyyy-MM" prefix of the order date). */
-- 1) Basic version: only the number of distinct customers.
SELECT COUNT(DISTINCT name)
FROM business
WHERE SUBSTRING(orderdate, 1, 7) = "2017-04";
-- 2) Advanced version: list each customer together with the total number of customers.
-- GROUP BY collapses the April orders to one row per customer before the window
-- function runs, so COUNT(*) OVER () counts customers. With SELECT DISTINCT, standard
-- evaluation order computes the window over the individual order rows first, which
-- reports the number of April orders instead of the number of customers.
SELECT
    name,
    COUNT(*) OVER ()
FROM business
WHERE SUBSTRING(orderdate, 1, 7) = "2017-04"
GROUP BY name;

/* Requirement 2: every order line plus the total cost of the month it falls in. */
-- The window is partitioned by the "yyyy-MM" prefix only, so the total covers
-- all customers' orders of that month.
SELECT
    name,
    cost,
    orderdate,
    SUM(cost) OVER (PARTITION BY SUBSTRING(orderdate, 1, 7))
FROM business;

/* Requirement 3: same listing, plus each customer's running total of cost by date. */
-- mc: total cost of the order's month.
-- lc: partition by customer, order by date, and sum from the first row of the
--     partition (UNBOUNDED PRECEDING) through the current row.
SELECT
    name,
    cost,
    orderdate,
    SUM(cost) OVER (PARTITION BY SUBSTRING(orderdate, 1, 7)) AS mc,
    SUM(cost) OVER (
        PARTITION BY name
        ORDER BY orderdate ASC
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS lc
FROM business;
-- Extension: the same SUM evaluated under different window specifications.
SELECT
    name,
    orderdate,
    cost,
    -- sample1: total over all rows
    SUM(cost) OVER () AS sample1,
    -- sample2: total within each customer
    SUM(cost) OVER (PARTITION BY name) AS sample2,
    -- sample3: running total within each customer, ordered by date (default frame)
    SUM(cost) OVER (PARTITION BY name ORDER BY orderdate) AS sample3,
    -- sample4: same running total as sample3 with the frame written out explicitly,
    -- from the start of the partition to the current row
    -- NOTE(review): the default frame is RANGE-based, so sample3 and sample4 can
    -- differ if a customer has several orders with the same orderdate
    SUM(cost) OVER (
        PARTITION BY name
        ORDER BY orderdate
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS sample4,
    -- sample5: current row plus the one row before it
    SUM(cost) OVER (
        PARTITION BY name
        ORDER BY orderdate
        ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
    ) AS sample5,
    -- sample6: current row plus one row before and one row after
    SUM(cost) OVER (
        PARTITION BY name
        ORDER BY orderdate
        ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
    ) AS sample6,
    -- sample7: current row plus all following rows
    SUM(cost) OVER (
        PARTITION BY name
        ORDER BY orderdate
        ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
    ) AS sample7
FROM business;
-- Order details plus the comma-separated set of customers who bought in the same month.
SELECT
    name,
    orderdate,
    cost,
    CONCAT_WS(
        ",",
        COLLECT_SET(name) OVER (PARTITION BY SUBSTRING(orderdate, 1, 7))
    ) AS a
FROM business;

/* Requirement 4: each customer's previous purchase date. */
-- LAG(col, n, default_val) reads col from the row n positions earlier in the window.
-- Here n = 1 within each customer ordered by date, and "1970-01-01" is returned for
-- a customer's first order, which has no earlier row.
SELECT
    name,
    orderdate,
    cost,
    LAG(orderdate, 1, "1970-01-01") OVER (
        PARTITION BY name
        ORDER BY orderdate
    ) AS last_order
FROM business;
-- Extension: LEAD mirrors LAG and reads the following row, giving each customer's
-- next purchase date. No default value is supplied, so a customer's last order
-- gets NULL.
SELECT
    name,
    orderdate,
    cost,
    LEAD(orderdate, 1) OVER (
        PARTITION BY name
        ORDER BY orderdate
    ) AS next_order
FROM business;

/* Requirement 5: the orders that fall in the earliest 20% by order date. */
-- NTILE(5) distributes the date-ordered rows into 5 numbered groups;
-- group 1 holds the earliest fifth.
WITH t1 AS (
    SELECT
        name,
        orderdate,
        cost,
        NTILE(5) OVER (ORDER BY orderdate ASC) AS n
    FROM business
)
SELECT *
FROM t1
WHERE t1.n = 1;
-- Extension: same idea as the NTILE query above, expressed with PERCENT_RANK,
-- which reports each order's relative position in date order.
SELECT
    name,
    orderdate,
    cost,
    PERCENT_RANK() OVER (ORDER BY orderdate) AS pr
FROM business;

注意：rows必須跟在Order by 子句之後，對排序的結果進行限制，使用固定的行數來限制分割槽中的資料行數量

6、Rank（排名）

RANK()：排序相同時排名會重複，其後的名次會跳過，總數不會變

DENSE_RANK()：排序相同時排名會重複，其後的名次不跳過，總數會減少

ROW_NUMBER()：按順序依次編號，排序相同時也不重複

輸入資料

name	subject	score
孫悟空	語文	87
孫悟空	數學	95
孫悟空	英語	68
大海	語文	94
大海	數學	56
大海	英語	84
宋宋	語文	64
宋宋	數學	86
宋宋	英語	84
婷婷	語文	65
婷婷	數學	85
婷婷	英語	78

按需求查詢資料--需求：計算每門學科成績排名
-- Step 1: create the table; the fields of each input line are tab-separated.
CREATE TABLE score (
    name    STRING,
    subject STRING,
    score   INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY "\t";

-- Step 2: load the sample rows from the local file system.
LOAD DATA LOCAL INPATH '/usr/local/soft/hive-1.2.1/data/score.txt'
INTO TABLE score;

-- Step 3: rank the scores within each subject.
-- ORDER BY score DESC so that the highest score receives rank 1; the default
-- ascending order would assign rank 1 to the lowest score.
SELECT
    *,
    RANK() OVER (PARTITION BY subject ORDER BY score DESC) AS r
FROM score;
-- Extension: RANK, DENSE_RANK and ROW_NUMBER side by side over the same window,
-- to show how they differ.
SELECT
    *,
    RANK()       OVER (PARTITION BY subject ORDER BY score DESC) AS r1,
    DENSE_RANK() OVER (PARTITION BY subject ORDER BY score DESC) AS r2,
    ROW_NUMBER() OVER (PARTITION BY subject ORDER BY score DESC) AS r3
FROM score;
--比較結果
+-------------+----------------+--------------+-----+-----+-----+--+
| score.name  | score.subject  | score.score  | r1  | r2  | r3  |
+-------------+----------------+--------------+-----+-----+-----+--+
| 孫悟空        | 數學             | 95           | 1   | 1   | 1   |
| 宋宋          | 數學             | 86           | 2   | 2   | 2   |
| 婷婷          | 數學             | 85           | 3   | 3   | 3   |
| 大海          | 數學             | 56           | 4   | 4   | 4   |
| 宋宋          | 英語             | 84           | 1   | 1   | 1   |
| 大海          | 英語             | 84           | 1   | 1   | 2   |
| 婷婷          | 英語             | 78           | 3   | 2   | 3   |
| 孫悟空        | 英語             | 68           | 4   | 3   | 4   |
| 大海          | 語文             | 94           | 1   | 1   | 1   |
| 孫悟空        | 語文             | 87           | 2   | 2   | 2   |
| 婷婷          | 語文             | 65           | 3   | 3   | 3   |
| 宋宋          | 語文             | 64           | 4   | 4   | 4   |
+-------------+----------------+--------------+-----+-----+-----+--+

7、日期函式（部分）

-- CURRENT_DATE returns today's date.
SELECT CURRENT_DATE();

-- Date arithmetic
-- Example 1: the date 90 days after today.
SELECT DATE_ADD(CURRENT_DATE(), 90);
-- Example 2: the date 90 days before today.
SELECT DATE_SUB(CURRENT_DATE(), 90);

-- Date difference
-- Example: number of days between today and 1996-06-04.
-- The date literal must match the date named in the example.
SELECT DATEDIFF(CURRENT_DATE(), "1996-06-04");

-- Convert a formatted string into a standard date string.
-- Example: '2021年01月14日' -> '2021-01-14'
-- UNIX_TIMESTAMP parses the text with the given pattern into a Unix timestamp, and
-- FROM_UNIXTIME renders that timestamp with the target pattern.
SELECT FROM_UNIXTIME(
    UNIX_TIMESTAMP('2021年01月14日', 'yyyy年MM月dd日'),
    'yyyy-MM-dd'
);

-- Get the current Unix timestamp.
SELECT UNIX_TIMESTAMP();

-- Format a Unix timestamp as a date-time string.
-- Example: 1610611142 -> 'yyyy/MM/dd HH:mm:ss'
-- Use lowercase 'yyyy' (calendar year). Uppercase 'YYYY' means the week-based year
-- in Java date patterns and produces the wrong year for days around New Year.
SELECT FROM_UNIXTIME(1610611142, 'yyyy/MM/dd HH:mm:ss');

8、練習題

題目：有哪些顧客連續兩天來過店裡，資料是business表

-- Course solution
-- Idea: number each customer's orders by date, subtract that number (as days) from
-- the order date, and count the rows that share the same result. Orders on
-- consecutive days end up with the same shifted date.
WITH t1 AS (
    -- rn: position of the order within the customer's date-ordered orders.
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY name ORDER BY orderdate) AS rn
    FROM business
),
t2 AS (
    -- temp: the order date shifted back by rn days.
    SELECT
        *,
        DATE_SUB(orderdate, rn) AS temp
    FROM t1
)
SELECT
    name,
    temp,
    COUNT(*) AS c
FROM t2
GROUP BY
    name,
    temp
HAVING c >= 2;
      
-- Alternative solution
-- Pair every order with the same customer's previous order date via LAG, then keep
-- the customers that have a gap of exactly one day between two orders.
-- The filter is d = 1 rather than d <= 1: a gap of 0 means two orders on the same
-- day, which is not "two consecutive days". A customer's first order has no
-- previous date, so d is NULL there and the row is filtered out.
SELECT DISTINCT name
FROM (
    SELECT
        name,
        DATEDIFF(a.now_date, a.last_date) AS d
    FROM (
        SELECT
            name,
            LAG(orderdate, 1) OVER (PARTITION BY name ORDER BY orderdate) AS last_date,
            orderdate AS now_date
        FROM business
    ) a
) b
WHERE b.d = 1;