HIVE(二)Hive基本SQL操作
阿新 • • 發佈:2020-12-24
一:Hive DDL
1. 資料庫的基本操作
1.1 檢視資料庫列表
show databases;
1.2 使用資料庫
-- Switch the current session to the given database.
-- Fix: original had a typo "databse_name".
use database_name;
1.3 新建資料庫
-- Create a database. DATABASE and SCHEMA are interchangeable keywords.
CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name
  [COMMENT database_comment]                                -- free-text description of the database
  [LOCATION hdfs_path]                                      -- storage location on HDFS
  [WITH DBPROPERTIES (property_name=property_value, ...)];  -- extra key/value properties
-- Create the "test" database if it does not already exist, with a comment.
-- Fix: original statement was missing its terminating semicolon.
create database if not exists test comment 'hive database for test';
1.4 檢視資料庫資訊
DESC DATABASE [EXTENDED] db_name; --EXTENDED 表示是否顯示額外屬性
desc database test;
1.5 刪除資料庫
DROP (DATABASE|SCHEMA) [IF EXISTS] database_name [RESTRICT|CASCADE];
drop database if exists test;
2. 資料庫表的基本操作
2.1 建立表
-- Full CREATE TABLE syntax. (TEMPORARY: Hive 0.14.0+)
-- Fix: in the original, the closing bracket of the optional SKEWED BY clause
-- was misplaced before its ON (...) list; ON and STORED AS DIRECTORIES belong
-- inside the SKEWED BY option.
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
  [(col_name data_type [column_constraint_specification] [COMMENT col_comment], ...
    [constraint_specification])]
  [COMMENT table_comment]
  [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
  [CLUSTERED BY (col_name, col_name, ...)
    [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
  [SKEWED BY (col_name, col_name, ...)                     -- Hive 0.10.0+
    ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)
    [STORED AS DIRECTORIES]]
  [
    [ROW FORMAT row_format]
    [STORED AS file_format]
    | STORED BY 'storage.handler.class.name'
      [WITH SERDEPROPERTIES (...)]                         -- Hive 0.6.0+
  ]
  [LOCATION hdfs_path]
  [TBLPROPERTIES (property_name=property_value, ...)]      -- Hive 0.6.0+
  [AS select_statement];  -- Hive 0.5.0+; not supported for external tables

-- Clone an existing table/view definition (schema only, no data is copied).
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
  LIKE existing_table_or_view_name
  [LOCATION hdfs_path];
資料型別
-- Column data types supported by Hive.
data_type
  : primitive_type
  | array_type
  | map_type
  | struct_type
  | union_type                   -- Hive 0.7.0+

primitive_type
  : TINYINT | SMALLINT | INT | BIGINT
  | BOOLEAN
  | FLOAT | DOUBLE
  | DOUBLE PRECISION             -- Hive 2.2.0+
  | STRING
  | BINARY                       -- Hive 0.8.0+
  | TIMESTAMP                    -- Hive 0.8.0+
  | DECIMAL                      -- Hive 0.11.0+
  | DECIMAL(precision, scale)    -- Hive 0.13.0+
  | DATE                         -- Hive 0.12.0+
  | VARCHAR                      -- Hive 0.12.0+
  | CHAR                         -- Hive 0.13.0+

array_type  : ARRAY < data_type >
map_type    : MAP < primitive_type, data_type >
struct_type : STRUCT < col_name : data_type [COMMENT col_comment], ... >
union_type  : UNIONTYPE < data_type, data_type, ... >  -- Hive 0.7.0+
(1)建立內部表
-- Plain managed (internal) table using Hive's default row format.
-- Fix: the first two CREATE statements were missing terminating semicolons.
create table psn (
  id int,
  name string,
  likes array<string>,
  address map<string,string>
);

-- Managed table with a custom delimited row format.
create table psn2 (
  id int,
  name string,
  likes array<string>,
  address map<string,string>
)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':';

-- Managed table spelling out Hive's default delimiters (^A, ^B, ^C).
create table psn3 (
  id int,
  name string,
  likes array<string>,
  address map<string,string>
)
row format delimited
fields terminated by '\001'
collection items terminated by '\002'
map keys terminated by '\003';

-- Equivalent: omitting ROW FORMAT uses those same default delimiters.
create table psn3 (
  id int,
  name string,
  likes array<string>,
  address map<string,string>
);
(2)建立外部表
-- External table: requires the EXTERNAL keyword plus a LOCATION on HDFS.
create external table psn4 (
  id int,
  name string,
  likes array<string>,
  address map<string,string>
)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
location '/data';
內部表與外部表的區別?
1、hive內部表建立的時候資料儲存在hive的預設儲存目錄中,外部表在建立的時候需要指定額外的目錄
2、hive內部表刪除的時候,會將元資料和資料都刪除,而外部表只會刪除元資料,不會刪除資料
(3)建立分割槽表
-- Table partitioned on a single column (gender).
create table psn5 (
  id int,
  name string,
  likes array<string>,
  address map<string,string>
)
partitioned by(gender string)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':';

-- Table partitioned on multiple columns (gender, age).
create table psn6 (
  id int,
  name string,
  likes array<string>,
  address map<string,string>
)
partitioned by(gender string,age int)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':';
新增分割槽與刪除分割槽
-- Add a partition value to a partitioned table.
-- Fix: original statements were missing terminating semicolons.
alter table table_name add partition(col_name=col_value);

-- Drop a partition value from a partitioned table.
alter table table_name drop partition(col_name=col_value);

/*
 Notes:
 1. When adding a partition to a multi-column partitioned table,
    every partition column must be given a value.
 2. When dropping, the specified partition can be removed from either
    a single-column or a multi-column partitioned table.
*/
修復分割槽
在使用hive外部表的時候,可以先將資料上傳到hdfs的某一個目錄中,然後再建立外部表建立對映關係,如果在上傳資料的時候,參考分割槽表的形式也建立了多級目錄,那麼此時建立完表之後,是查詢不到資料的,原因是分割槽的元資料沒有儲存在mysql中,因此需要修復分割槽,將元資料同步更新到mysql中,此時才可以查詢到資料
-- Shell: create partition-style directories on HDFS and upload the data files.
--   hdfs dfs -mkdir /msb
--   hdfs dfs -mkdir /msb/age=10
--   hdfs dfs -mkdir /msb/age=20
--   hdfs dfs -put /root/data/data /msb/age=10
--   hdfs dfs -put /root/data/data /msb/age=20

-- Map an external partitioned table onto the pre-existing directories.
create external table psn7 (
  id int,
  name string,
  likes array<string>,
  address map<string,string>
)
partitioned by(age int)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
location '/msb';

-- Query returns nothing: partition metadata is not yet in the metastore.
select * from psn7;

-- Sync the partition metadata from HDFS into the metastore.
msck repair table psn7;

-- Query now returns the uploaded rows.
select * from psn7;
(4)建立分桶表
-- Bucketed external table: rows are hashed on empno into 4 buckets,
-- each bucket sorted by empno ascending.
CREATE EXTERNAL TABLE emp_bucket(
    empno    INT,
    ename    STRING,
    job      STRING,
    mgr      INT,
    hiredate TIMESTAMP,
    sal      DECIMAL(7,2),
    comm     DECIMAL(7,2),
    deptno   INT)
    CLUSTERED BY(empno) SORTED BY(empno ASC) INTO 4 BUCKETS
    ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
    LOCATION '/hive/emp_bucket';
2.2 修改表
ALTER TABLE table_name RENAME TO new_table_name;
ALTER TABLE emp_temp RENAME TO new_emp; --把 emp_temp 表重新命名為 new_emp
2.3 修改列
ALTER TABLE table_name [PARTITION partition_spec] CHANGE [COLUMN] col_old_name col_new_name column_type [COMMENT col_comment] [FIRST|AFTER column_name] [CASCADE|RESTRICT];
-- Rename a column and (re)declare its type.
ALTER TABLE emp_temp CHANGE empno empno_new INT;

-- Rename column sal and move it to the position after the ename column.
-- Fix: the original comment said "after empno", but the statement
-- actually uses AFTER ename.
ALTER TABLE emp_temp CHANGE sal sal_new decimal(7,2) AFTER ename;

-- Attach a comment to a column.
ALTER TABLE emp_temp CHANGE mgr mgr_new INT COMMENT 'this is column mgr';
2.4 新增列
ALTER TABLE emp_temp ADD COLUMNS (address STRING COMMENT 'home address');
2.5 清空表
-- 清空整個表或表指定分割槽中的資料 TRUNCATE TABLE table_name [PARTITION (partition_column = partition_col_value, ...)];
目前只有內部表才能執行 TRUNCATE 操作,外部表執行時會丟擲異常 Cannot truncate nonmanaged table xxx
2.6 刪除表
DROP TABLE [IF EXISTS] table_name [PURGE];
2.7 查看錶資訊
DESCRIBE|Desc [EXTENDED|FORMATTED] table_name --FORMATTED 以友好的展現方式查看錶詳情
2.8 查看錶的列表
SHOW TABLES [IN database_name] ['identifier_with_wildcards'];
2.9 查看錶的分割槽列表
SHOW PARTITIONS table_name;
2.10 查看錶/檢視的建立語句
SHOW CREATE TABLE ([db_name.]table_name|view_name);
二:Hive DML
1. 資料匯入表的方式
1.1 載入檔案資料到表
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
LOAD DATA INPATH "hdfs://hadoop001:8020/mydir/emp.txt" OVERWRITE INTO TABLE emp_ptn PARTITION (deptno=20);
1.2 通過查詢插入資料
-- Replace the contents of a table/partition with a query result.
INSERT OVERWRITE TABLE tablename1
  [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]]
  select_statement1 FROM from_statement;

-- Append a query result to a table/partition.
INSERT INTO TABLE tablename1
  [PARTITION (partcol1=val1, partcol2=val2 ...)]
  select_statement1 FROM from_statement;
insert into table t1 PARTITION (deptno=20) select * from t2
動態分割槽
INSERT OVERWRITE TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement; INSERT INTO TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement;
insert into table t1 PARTITION (deptno) select * from t2
動態分割槽的相關配置
配置 | 預設值 | 說明 |
hive.error.on.empty.partition | false | Whether to throw an exception if dynamic partition insert generates empty results |
hive.exec.dynamic.partition | true | Needs to be set to true to enable dynamic partition inserts |
hive.exec.dynamic.partition.mode | strict | In strict mode, the user must specify at least one static partition in case the user accidentally overwrites all partitions; in nonstrict mode all partitions are allowed to be dynamic |
hive.exec.max.created.files | 100000 | Maximum number of HDFS files created by all mappers/reducers in a MapReduce job |
hive.exec.max.dynamic.partitions | 1000 | Maximum number of dynamic partitions allowed to be created in total |
hive.exec.max.dynamic.partitions.pernode | 100 | Maximum number of dynamic partitions allowed to be created in each mapper/reducer node |
1.3 多插入模式
-- Multi-insert: scan from_statement once, write to several targets.
-- OVERWRITE-first form:
FROM from_statement
INSERT OVERWRITE TABLE tablename1
  [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2]
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...;

-- INTO-first form:
FROM from_statement
INSERT INTO TABLE tablename1
  [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2]
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2] ...;
1.4 查詢語句中建立表並載入資料
create table table1 as select 指定欄位 from table2;
1.5建立表時通過location指定載入資料路徑(外部表)
2. 資料匯出表的方式
2.1 匯出到檔案系統
INSERT OVERWRITE [LOCAL] DIRECTORY directory1 [ROW FORMAT row_format] [STORED AS file_format] SELECT ... FROM ...
2.1.1 本地檔案系統
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/output' ROW FORMAT DELIMITED FIELDS TERMINATED by ',' select * from testA;
2.1.2 匯出到HDFS
INSERT OVERWRITE DIRECTORY '/home/hadoop/output' select * from testA;
2.2 Hadoop命令
dfs -get /export/servers/exporthive/000000_0 /export/servers/exporthive/local.txt;
2.3採用hive的-e和-f引數來匯出資料
引數為:-e的使用方式,後面接SQL語句。>>後面為輸出檔案路徑
[hadoop@hadoop01 bin]$ ./hive -e "select * from psn" >> /home/hadoop/output/testA.txt
引數為:-f的使用方式,後面接存放sql語句的檔案。>>後面為輸出檔案路徑
[hadoop@hadoop01 bin]$ ./hive -f /home/hadoop/output/sql.sql >> /home/hadoop/output/testB.txt
2.4export匯出到HDFS上(全表匯出)
-- Export the whole table (data + metadata) to an HDFS directory.
-- Fix: original used typographic quotes (‘ ’), which are invalid in SQL;
-- replaced with plain single quotes.
export table score to '/export/exporthive/score';
2.5 通過sqoop匯出
3. 更新與刪除表資料
更新和刪除的語法比較簡單,和關係型資料庫一致。需要注意的是這兩個操作都只能在支援 ACID 的
表,也就是事務表上才能執行。
-- Update rows (only valid on ACID/transactional tables).
UPDATE tablename SET column = value [, column = value ...] [WHERE expression]
-- Delete rows (only valid on ACID/transactional tables).
DELETE FROM tablename [WHERE expression]
在hive的hive-site.xml中新增如下配置:
<!-- hive-site.xml settings required to enable ACID transactions -->
<property>
  <name>hive.support.concurrency</name>
  <value>true</value>
</property>
<property>
  <name>hive.enforce.bucketing</name>
  <value>true</value>
</property>
<property>
  <name>hive.exec.dynamic.partition.mode</name>
  <value>nonstrict</value>
</property>
<property>
  <name>hive.txn.manager</name>
  <value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
</property>
<property>
  <name>hive.compactor.initiator.on</name>
  <value>true</value>
</property>
<property>
  <name>hive.compactor.worker.threads</name>
  <value>1</value>
</property>
-- Example statements.
-- Fix 1: "//" is not a valid HiveQL comment marker; use "--".
-- Fix 2: the UPDATE referenced a non-existent column "id"; the
--        transactional table's key column is user_id.

-- Transactional (ACID) table: must be bucketed and stored as ORC.
create table test_trancaction (user_id Int, name String)
clustered by (user_id) into 3 buckets
stored as orc
TBLPROPERTIES ('transactional'='true');

-- Plain staging table used to load the raw comma-delimited data.
create table test_insert_test(id int, name string)
row format delimited fields TERMINATED BY ',';

-- Copy the staged rows into the transactional table.
insert into test_trancaction select * from test_insert_test;

-- ACID update on the transactional table.
update test_trancaction set name='jerrick_up' where user_id=1;

-- Sample data file (loaded into test_insert_test):
-- 1,jerrick
-- 2,tom
-- 3,jerry
-- 4,lily
-- 5,hanmei
-- 6,limlei
-- 7,lucky