hive常用函式全集

阿新 • • 發佈：2020-12-15

Hive 引言

# 簡介
> hive是facebook開源，並捐獻給了apache組織，作為apache組織的頂級專案(hive.apache.org)。 hive是一個基於大資料技術的資料倉庫(DataWareHouse)技術，主要是通過將使用者書寫的SQL語句翻譯成MapReduce程式碼，然後釋出任務給MR框架執行，完成SQL 到 MapReduce的轉換。可以將結構化的資料檔案對映為一張資料庫表，並提供類SQL查詢功能。
>
> **總結**
>
> - Hive是一個數據倉庫(資料庫)
> - Hive構建在HDFS上，可以儲存海量資料。
> - Hive允許程式設計師使用**SQL命令**來完成資料的分散式計算，計算構建在yarn之上。(Hive會將**SQL轉化為MR操作**)
>
> 優點：
> 	簡化程式設計師的開發難度，寫SQL即可，避免了去寫mapreduce,減少開發人員的學習成本
> 缺點：
> 	 延遲較高(MapReduce本身延遲，Hive SQL向MapReduce轉化優化提交)，適合做大資料的離線處理(TB PB級別的資料，統計結果延遲1天產出)
> Hive不適合場景：
> 	1：小資料量, MySQL。
> 	2：實時計算：Flink/Spark HBase

- 資料庫  DataBase
  - 資料量級小，資料價值高
- 資料倉庫 DataWareHouse
  - 資料體量大，資料價值低

啟動hive

# 本地模式啟動 【管理員模式】
# 啟動hive伺服器，同時進入hive的客戶端。只能通過本地方式訪問。
[root@hadoop10 ~]# hive
Logging initialized using configuration in jar:file:/opt/installs/hive1.2.1/lib/hive-common-1.2.1.jar!/hive-log4j.properties
hive>


# 啟動hive的伺服器，可以允許遠端連線方式訪問。
// 前臺啟動
[root@hadoop10 ~]# hiveserver2 
// 後臺啟動
[root@hadoop10 ~]# hiveserver2 &

# beeline客戶端
# 啟動客戶端
[root@hadoop10 ~]# beeline
beeline> !connect jdbc:hive2://hadoop10:10000
回車輸入使用者名稱(HiveServer2的登入使用者，不是MySQL使用者)
回車輸入密碼(HiveServer2對應的密碼，不是MySQL密碼)

HQL高階


-- 0. Accessing fields of complex types (array, map, struct)
--    array: column[index]; map: column[key]; struct: column.attribute
select name, salary, hobbies[1], cards['123456'], addr.city
from t_person;

-- 1. Conditional query: = != >= <=
select *
from t_person
where addr.city = '鄭州';

-- 2. and / or / between ... and
--    array_contains(column, value): tests whether an array column contains the given value
select *
from t_person
where salary > 5000
  and array_contains(hobbies, '抽菸');

-- 3. order by (a MapReduce job is launched underneath to do the sort)
select *
from t_person
order by salary desc;

-- 4. limit (takes only a row count here; no start offset)
select *
from t_person
sort by salary desc
limit 5;

-- 5. Removing duplicates with distinct (both spellings are equivalent)
select distinct addr.city
from t_person;

select distinct(addr.city)
from t_person;

單行函式(show functions) --對一行資料進行操作

-- List every function known to Hive
show functions;
-- Usage: function_name(arguments)

-- 1. array_contains(column, value): tests whether an array column contains the value
select name, hobbies
from t_person
where array_contains(hobbies, '喝酒');

-- 2. length(column): length of the value
select length('123123');

-- 3. concat(column, column): concatenation
select concat('123123', 'aaaa');

-- 4. to_date('1999-9-9'): converts a string to a date
select to_date('1999-9-9');

-- 5. year(date): year of a date; month(date): month of a date

-- 6. date_add(date, n): adds n days to a date (n is -9 in the example below)
select name, date_add(birthday, -9)
from t_person;

炸裂函式(集合函式)：由一行資料計算完成之後獲得多行資料

-- List every hobby by expanding the hobbies array with explode
select explode(hobbies) as hobby
from t_person;

常見的函式


# lateral view 
-- 為指定表的邊緣拼接一個列。(類似表連線)
-- lateral view：為表拼接一個列(炸裂結果)
-- 語法：from 表 lateral view explode(陣列欄位) 別名 as 欄位名;


# collect_list(組函式)
作用：對分組後的，每個組的某個列的值進行收集彙總。
語法：select collect_list(列) from 表 group by 分組列;

例: select username,collect_list(video_name) from t_visit_video group by username;
資料: 
      username   collect_list(video_name)
      1           ["a","b"]
      2           ["a","a","b"]


# collect_set(組函式)
作用：對分組後的，每個組的某個列的值進行收集彙總，並去掉重複值。
語法：select collect_set(列) from 表 group by 分組列;

例: select username,collect_set(video_name) from t_visit_video group by username;
資料: 
      username   collect_set(video_name)
      1           ["a","b"]
      2           ["a","C","b"]


# concat_ws(單行函式)：指定分隔符
作用：如果某個欄位是陣列，對該值的多個元素使用指定分隔符拼接。
select id,name,concat_ws(',',hobbies) from t_person;

--# 將t_visit_video資料轉化為如下效果
--統計每個人，2020-3-21看過的電影。
例: select username,concat_ws(',',collect_set(video_name)) from t_visit_video group by username;
資料: 
      username   concat_ws(',',collect_set(video_name))
      1           a,b
      2           a,C,b

全排序和區域性排序

# 全域性排序
語法：select * from 表 order by 欄位 asc|desc;


# 區域性排序(分割槽排序)
概念：啟動多個reduceTask，對資料進行排序(預排序)，區域性有序。
	區域性排序關鍵詞 sort by
	預設reducetask個數只有1個，所以分割槽也只有一個。所以預設和全排序效果一樣。
語法：select * from 表 distribute by 分割槽欄位 sort by 欄位 asc|desc;

外部表和分割槽表


-- Table creation syntax:
--   external                                      -> the table is an external table
--   row format delimited                          -> custom delimiters are declared
--   partitioned by (country string, city string)  -> the table is a partitioned table
create external table t_personout(
    id int,
    name string,
    salary double,
    birthday date,
    sex char(1),
    hobbies array<string>,
    cards map<string,string>,
    addr struct<city:string,zipCode:string>
)
-- Optional: uncomment the next line to declare the table as partitioned.
-- partitioned by(country string,city string)
-- "row format delimited" must be present for the delimiter clauses below to parse.
row format delimited
    -- separator between columns
    fields terminated by ','
    -- separator between array elements, struct attributes and map key/value pairs
    collection items terminated by '-'
    -- separator between a map key and its value
    map keys terminated by '|'
    lines terminated by '\n'
location '/file';


-- Loading data (run inside the Hive command line).
-- Syntax: load data [local] inpath '/opt/datas/person.txt' [overwrite] into table t_person
--   local     -> the path is a local path; when omitted the file is read from HDFS
--   overwrite -> overwrite the existing data; may be omitted
-- In essence the file is uploaded into HDFS, where the data is managed by Hive.

-- A runnable load command
load data local inpath '/opt/data/person.txt' into table t_person;

自定義函式 UDF和UDTF

# 0. 匯入hive依賴
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>1.2.1</version>
</dependency>
# 1.定義一個類繼承UDF
1. 必須繼承UDF(單行函式)或GenericUDTF(炸裂函式)
2. 繼承UDF時方法名必須是evaluate；繼承GenericUDTF時需實現initialize、process、close

-- Create a permanent function bound to the already-built Java class.
-- 'UDF.FileUdf' is the fully qualified class name; the jar is given by its path on HDFS.
create function base_analizer as 'UDF.FileUdf'
using jar 'hdfs://synthesize60:9000/user/hive/jars/aofflineforwarehouse.jar';
-- Drop the permanent function created above.
drop function base_analizer;

表資料轉存匯入操作


# 1.將檔案資料匯入hive表中，
load data local inpath '檔案的路徑' overwrite into table 表;
# 2.直接將查詢結果，放入一個新建立的表中。(執行查詢的建立)
	create table 表 as select語句...
		1. 執行select語句
		2. 建立一個新的表，將查詢結果存入表中。
# 3.將查詢結果，匯入已經存在表。
	insert into 表 
	select語句...
# 4.將HDFS中已經存在檔案，匯入新建的hive表中
	create table Xxx(
		...
	)row format delimited 
    fields terminated by ','
    location 'hdfs的表資料對應的目錄'

# 將SQL的執行結果插入到另一個表中
    create table 表 as select語句
--## 例子:
--統計每個人，2020-3-21看過的電影，將結果存入hive的表：t_video_log_20200321
create table t_video_log_20200321 as select ...;