1. 程式人生 > 實用技巧 >HIVE(二)Hive基本SQL操作

HIVE(二)Hive基本SQL操作

一:Hive DDL

1. 資料庫的基本操作

1.1 檢視資料庫列表

show databases;

1.2 使用資料庫

-- switch the current session to the given database
use database_name;

1.3 新建資料庫

CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name --DATABASE|SCHEMA 是等價
[COMMENT database_comment] --資料庫註釋
[LOCATION hdfs_path] --儲存在 HDFS 上的位置
[WITH DBPROPERTIES (property_name=property_value, ...)]
; --指定額外屬性
-- example: create the "test" database (statement must end with a semicolon)
create database if not exists test
comment 'hive database for test';

1.4 檢視資料庫資訊

DESC DATABASE [EXTENDED] db_name; --EXTENDED 表示是否顯示額外屬性
desc database test;

1.5 刪除資料庫

DROP (DATABASE|SCHEMA) [IF EXISTS] database_name [RESTRICT|CASCADE];
drop database if exists test;

2. 資料庫表的基本操作

2.1 建立表

CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name    -- (Note: TEMPORARY available in Hive 0.14.0 and later)
  [(col_name data_type [column_constraint_specification] [COMMENT col_comment], ... [constraint_specification])]
  [COMMENT table_comment]
  [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
  [CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
  [SKEWED BY (col_name, col_name, ...)                     -- (Note: Available in Hive 0.10.0 and later)
     ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)
     [STORED AS DIRECTORIES]]
  [
   [ROW FORMAT row_format]
   [STORED AS file_format]
     | STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)]  -- (Note: Available in Hive 0.6.0 and later)
  ]
  [LOCATION hdfs_path]
  [TBLPROPERTIES (property_name=property_value, ...)]      -- (Note: Available in Hive 0.6.0 and later)
  [AS select_statement];                                   -- (Note: Available in Hive 0.5.0 and later; not supported for external tables)

CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
  LIKE existing_table_or_view_name
  [LOCATION hdfs_path];

資料型別

data_type
  : primitive_type
  | array_type
  | map_type
  | struct_type
  | union_type  -- (Note: Available in Hive 0.7.0 and later)
 
primitive_type
  : TINYINT
  | SMALLINT
  | INT
  | BIGINT
  | BOOLEAN
  | FLOAT
  | DOUBLE
  | DOUBLE PRECISION -- (Note: Available in Hive 2.2.0 and later)
  | STRING
  | BINARY      -- (Note: Available in Hive 0.8.0 and later)
  | TIMESTAMP   -- (Note: Available in Hive 0.8.0 and later)
  | DECIMAL     -- (Note: Available in Hive 0.11.0 and later)
  | DECIMAL(precision, scale)  -- (Note: Available in Hive 0.13.0 and later)
  | DATE        -- (Note: Available in Hive 0.12.0 and later)
  | VARCHAR     -- (Note: Available in Hive 0.12.0 and later)
  | CHAR        -- (Note: Available in Hive 0.13.0 and later)
 
array_type
  : ARRAY < data_type >
 
map_type
  : MAP < primitive_type, data_type >
 
struct_type
  : STRUCT < col_name : data_type [COMMENT col_comment], ...>
 
union_type
   : UNIONTYPE < data_type, data_type, ... >  -- (Note: Available in Hive 0.7.0 and later)

(1)建立內部表

--plain managed table (no ROW FORMAT clause, so Hive's default SerDe and
--default delimiters ^A/^B/^C apply); statement must end with a semicolon
    create table psn
    (
    id int,
    name string,
    likes array<string>,
    address map<string,string>
    );
--建立自定義行格式的hive表
    create table psn2
    (
    id int,
    name string,
    likes array<string>,
    address map<string,string>
    )
    row format delimited
    fields terminated by ','
    collection items terminated by '-'
    map keys terminated by ':';
--建立預設分隔符的hive表(^A、^B、^C)
    create table psn3
    (
    id int,
    name string,
    likes array<string>,
    address map<string,string>
    )
    row format delimited
    fields terminated by '\001'
    collection items terminated by '\002'
    map keys terminated by '\003';
    --or equivalently: omit ROW FORMAT entirely and the same default
    --delimiters (^A, ^B, ^C) are used; semicolon terminates the statement
    create table psn3
    (
    id int,
    name string,
    likes array<string>,
    address map<string,string>
    );

(2)建立外部表

--建立hive的外部表(需要新增external和location的關鍵字)
    create external table psn4
    (
    id int,
    name string,
    likes array<string>,
    address map<string,string>
    )
    row format delimited
    fields terminated by ','
    collection items terminated by '-'
    map keys terminated by ':'
    location '/data';

內部表與外部表的區別?

1、hive內部表建立的時候資料儲存在hive的預設儲存目錄中,外部表在建立的時候需要指定額外的目錄
2、hive內部表刪除的時候,會將元資料和資料都刪除,而外部表只會刪除元資料,不會刪除資料

(3)建立分割槽表

--建立單分割槽表
    create table psn5
    (
    id int,
    name string,
    likes array<string>,
    address map<string,string>
    )
    partitioned by(gender string)
    row format delimited
    fields terminated by ','
    collection items terminated by '-'
    map keys terminated by ':';
--建立多分割槽表
    create table psn6
    (
    id int,
    name string,
    likes array<string>,
    address map<string,string>
    )
    partitioned by(gender string,age int)
    row format delimited
    fields terminated by ','
    collection items terminated by '-'
    map keys terminated by ':';    

新增分割槽與刪除分割槽

--add a partition (a value for the partition column) to a partitioned table
    alter table table_name add partition(col_name=col_value);
--drop a partition by its partition-column value
    alter table table_name drop partition(col_name=col_value);
/*
    Notes:
        1. When adding a partition to a table partitioned by multiple columns,
           a value must be supplied for every partition column.
        2. When dropping, the specified partition can be removed from either a
           single- or a multi-column partitioned table.
*/

修復分割槽

在使用hive外部表的時候,可以先將資料上傳到hdfs的某一個目錄中,然後再建立外部表建立對映關係,如果在上傳資料的時候,參考分割槽表的形式也建立了多級目錄,那麼此時建立完表之後,是查詢不到資料的,原因是分割槽的元資料沒有儲存在mysql中,因此需要修復分割槽,將元資料同步更新到mysql中,此時才可以查詢到元資料

--在hdfs建立目錄並上傳檔案
    hdfs dfs -mkdir /msb
    hdfs dfs -mkdir /msb/age=10
    hdfs dfs -mkdir /msb/age=20
    hdfs dfs -put /root/data/data /msb/age=10
    hdfs dfs -put /root/data/data /msb/age=20
--建立外部表
    create external table psn7
    (
    id int,
    name string,
    likes array<string>,
    address map<string,string>
    )
    partitioned by(age int)
    row format delimited
    fields terminated by ','
    collection items terminated by '-'
    map keys terminated by ':'
    location '/msb';
--查詢結果(沒有資料)
    select * from psn7;
--修復分割槽
    msck repair table psn7;
--查詢結果(有資料)
    select * from psn7;

(4)建立分桶表

CREATE EXTERNAL TABLE emp_bucket(
empno INT,
ename STRING,
job STRING,
mgr INT,
hiredate TIMESTAMP,
sal DECIMAL(7,2),
comm DECIMAL(7,2),
deptno INT)
-- hash rows into 4 buckets by employee number (comment must stay on one
-- commented line; the original wrapped onto a bare line, breaking the DDL)
CLUSTERED BY(empno) SORTED BY(empno ASC) INTO 4 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
LOCATION '/hive/emp_bucket';

2.2 修改表

ALTER TABLE table_name RENAME TO new_table_name;
ALTER TABLE emp_temp RENAME TO new_emp; --把 emp_temp 表重新命名為 new_emp

2.3 修改列

ALTER TABLE table_name [PARTITION partition_spec] CHANGE [COLUMN] col_old_name
col_new_name column_type
[COMMENT col_comment] [FIRST|AFTER column_name] [CASCADE|RESTRICT];
-- 修改欄位名和型別
ALTER TABLE emp_temp CHANGE empno empno_new INT;
-- 修改欄位 sal 的名稱 並將其放置到 empno 欄位後
ALTER TABLE emp_temp CHANGE sal sal_new decimal(7,2) AFTER ename;
-- 為欄位增加註釋
ALTER TABLE emp_temp CHANGE mgr mgr_new INT COMMENT 'this is column mgr';

2.4 新增列

ALTER TABLE emp_temp ADD COLUMNS (address STRING COMMENT 'home address');

2.5 清空表

-- 清空整個表或表指定分割槽中的資料
TRUNCATE TABLE table_name [PARTITION (partition_column = partition_col_value,
...)];

目前只有內部表才能執行 TRUNCATE 操作,外部表執行時會丟擲異常 Cannot truncate non-managed table xxx

2.6 刪除表

DROP TABLE [IF EXISTS] table_name [PURGE];

2.7 查看錶資訊

DESCRIBE|Desc [EXTENDED|FORMATTED] table_name --FORMATTED 以友好的展現方式查看錶詳情

2.8 查看錶的列表

SHOW TABLES [IN database_name] ['identifier_with_wildcards'];

2.9 查看錶的分割槽列表

SHOW PARTITIONS table_name;

2.10 查看錶/檢視的建立語句

SHOW CREATE TABLE ([db_name.]table_name|view_name);

二:Hive DML

1. 資料匯入表的方式

1.1 載入檔案資料到表

LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE]
INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
LOAD DATA INPATH "hdfs://hadoop001:8020/mydir/emp.txt" OVERWRITE INTO TABLE
emp_ptn PARTITION (deptno=20);

1.2 通過查詢插入資料

INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1 FROM from_statement;
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 FROM from_statement;
insert into table t1 PARTITION (deptno=20) select * from t2

動態分割槽

INSERT OVERWRITE TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement;
INSERT INTO TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement;
insert into table t1 PARTITION (deptno) select * from t2

動態分割槽的相關配置

配置 預設值 說明
hive.error.on.empty.partition false Whether to throw an exception if dynamic partition insert generates empty results
hive.exec.dynamic.partition true Needs to be set to true to enable dynamic partition inserts
hive.exec.dynamic.partition.mode strict In strict mode, the user must specify at least one static partition in case the user accidentally overwrites all partitions; in nonstrict mode all partitions are allowed to be dynamic
hive.exec.max.created.files 100000 Maximum number of HDFS files created by all mappers/reducers in a MapReduce job
hive.exec.max.dynamic.partitions 1000 Maximum number of dynamic partitions allowed to be created in total
hive.exec.max.dynamic.partitions.pernode 100 Maximum number of dynamic partitions allowed to be created in each mapper/reducer node

1.3 多插入模式

FROM from_statement
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2]
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...;
FROM from_statement
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2]
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2] ...;

1.4 查詢語句中建立表並載入資料

create table table1 as select 指定欄位 from table2;

1.5建立表時通過location指定載入資料路徑(外部表)

2. 資料匯出表的方式

2.1 匯出到檔案系統

INSERT OVERWRITE [LOCAL] DIRECTORY directory1
[ROW FORMAT row_format] [STORED AS file_format]
SELECT ... FROM ...

2.1.1 本地檔案系統

INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/output' ROW FORMAT DELIMITED FIELDS TERMINATED by ',' select * from testA;  

2.1.2 匯出到HDFS

INSERT OVERWRITE DIRECTORY '/home/hadoop/output' select * from testA;  

2.2 Hadoop命令

dfs -get /export/servers/exporthive/000000_0 /export/servers/exporthive/local.txt;

2.3採用hive的-e和-f引數來匯出資料

引數為:-e的使用方式,後面接SQL語句。>>後面為輸出檔案路徑

[hadoop@hadoop01 bin]$ ./hive -e "select * from psn" >> /home/hadoop/output/testA.txt 

引數為:-f的使用方式,後面接存放sql語句的檔案。>>後面為輸出檔案路徑

[hadoop@hadoop01 bin]$ ./hive -f /home/hadoop/output/sql.sql >> /home/hadoop/output/testB.txt  

2.4export匯出到HDFS上(全表匯出)

-- path must use straight single quotes; curly quotes are not valid HiveQL
export table score to '/export/exporthive/score';

2.5 通過sqoop匯出

3. 更新與刪除表資料

更新和刪除的語法比較簡單,和關係型資料庫一致。需要注意的是這兩個操作都只能在支援 ACID 的
表,也就是事務表上才能執行。

-- 更新
UPDATE tablename SET column = value [, column = value ...] [WHERE expression]
--刪除
DELETE FROM tablename [WHERE expression]

在hive的hive-site.xml中新增如下配置:

<!-- hive-site.xml snippet: settings required to enable ACID (UPDATE/DELETE)
     on transactional tables, per the standard Hive transactions setup. -->
<property>
        <name>hive.support.concurrency</name>
        <value>true</value>
    </property>
    <!-- ACID tables must be bucketed; enforce bucketing on insert -->
    <property>
        <name>hive.enforce.bucketing</name>
        <value>true</value>
    </property>
    <!-- allow fully dynamic partition inserts (no static partition required) -->
    <property>
        <name>hive.exec.dynamic.partition.mode</name>
        <value>nonstrict</value>
    </property>
    <!-- transaction manager implementing ACID semantics -->
    <property>
        <name>hive.txn.manager</name>
        <value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
    </property>
    <!-- run the compaction initiator on this metastore instance -->
    <property>
        <name>hive.compactor.initiator.on</name>
        <value>true</value>
    </property>
    <!-- at least one worker thread is needed for compactions to run -->
    <property>
        <name>hive.compactor.worker.threads</name>
        <value>1</value>
    </property>

//操作語句
    create table test_trancaction (user_id Int,name String) clustered by (user_id) into 3 buckets stored as orc TBLPROPERTIES ('transactional'='true');
    create table test_insert_test(id int,name string) row format delimited fields TERMINATED BY ',';
    insert into test_trancaction select * from test_insert_test;
    update test_trancaction set name='jerrick_up' where id=1;
//資料檔案
    1,jerrick
    2,tom
    3,jerry
    4,lily
    5,hanmei
    6,limlei
    7,lucky