hive的基本語法操作
基本操作
DDL操作
-- ---------------------------------------------------------------
-- Databases
-- ---------------------------------------------------------------

-- 1. Create a database. By default it is stored on HDFS under
--    /user/hive/warehouse/<database_name>.db
create database if not exists hive;

-- 2. List databases.
show databases;

-- When there are many databases, filter with a pattern; this lists the
-- databases whose names start with "h".
show databases like 'h*';

-- 3. Show database information.
describe database hive;

-- 4. Drop a database.
drop database if exists hive;
-- To force-drop a non-empty database, append CASCADE:
--   drop database if exists hive cascade;
二. 表
建立表
建立表之前,最好使用use 資料庫名;選擇資料庫,否則表會預設建立在default資料庫中;
-- (1) Managed (internal) table.
--     * "fields terminated by" sets the column delimiter. The default is ^A,
--       so it must be chosen to match the delimiter of the uploaded file.
--       The default row delimiter is '\n'.
--     * "stored as" sets the storage format.
--     * No LOCATION clause is given, so the default storage path is used.
create table if not exists student (
    id   int,
    name string,
    age  int
)
row format delimited fields terminated by '\t'
stored as textfile;

-- (2) External table. LOCATION sets the storage path explicitly.
create external table if not exists stu_external2 (
    id   int,
    name string,
    age  int
)
row format delimited fields terminated by '\t'
location '/shiny/hive.db/stu_external2';
-- (3) Partitioned tables.

-- Single-level partitioning.
-- PARTITIONED BY names the column to partition on; that column must not
-- also be declared in the table's regular column list.
create table if not exists stu_partition (
    id   int,
    name string,
    age  int
)
partitioned by (sex string)
row format delimited fields terminated by '\t';

-- Two-level partitioning.
create table if not exists stu_partition2 (
    id   int,
    name string,
    age  int
)
partitioned by (classname string, sex string)
row format delimited fields terminated by '\t';
-- Load data into the "female" partitions.
load data local inpath '/home/shiny/Desktop/data/female.txt'
into table stu_partition
partition (sex = 'female');

load data local inpath '/home/shiny/Desktop/data/female.txt'
into table stu_partition2
partition (classname = '1101', sex = 'female');

-- Load data into the "male" partitions.
load data local inpath '/home/shiny/Desktop/data/male.txt'
into table stu_partition
partition (sex = 'male');

load data local inpath '/home/shiny/Desktop/data/male.txt'
into table stu_partition2
partition (classname = '1101', sex = 'male');

-- Query all rows of the table.
select * from stu_partition;

-- List the table's partitions.
show partitions stu_partition;
-- (4) Bucketed table.
--     CLUSTERED BY sets the bucketing column, SORTED BY the sort column,
--     and INTO n BUCKETS the number of buckets.
create table if not exists stu_buck (
    id   int,
    name string,
    age  int
)
clustered by (id) sorted by (id desc) into 3 buckets
row format delimited fields terminated by '\t';

-- Populate the bucketed table from student, distributing and sorting rows
-- on the same column the table is bucketed and sorted on.
insert into table stu_buck
select *
from student
distribute by id
sort by id desc;
-- ---------------------------------------------------------------
-- 2. Alter tables
-- ---------------------------------------------------------------

-- (1) Rename a table.
alter table student rename to stu_internal;

-- (2) Add a column.
alter table stu_partition add columns (address string);

-- Show the table structure.
desc stu_partition;

-- Show detailed table structure information.
desc formatted stu_partition;

-- (3) Change a column: rename id to number and change its type to string.
alter table stu_partition change id number string;

-- (4) Replace / remove columns: the new list replaces the existing one.
-- NOTE(review): this turns the first column from string back to int; some
-- Hive configurations may reject such a type change - confirm on the target
-- version.
alter table stu_partition replace columns (id int, name string, age int);

-- (5) Add a partition.
alter table stu_partition add partition (sex = 'weizhi');

-- (6) Drop a partition.
alter table stu_partition drop partition (sex = 'weizhi');

-- ---------------------------------------------------------------
-- 3. Drop tables
-- ---------------------------------------------------------------
drop table if exists stu_external2;

-- List every table in the current database.
show tables;
-- ---------------------------------------------------------------
-- 1. LOAD: load data into a table
-- ---------------------------------------------------------------

-- (1) Load local data (the file is copied).
load data local inpath '/home/shiny/Desktop/data/female.txt'
into table stu_internal;

-- (2) Load HDFS data (the file is moved).
load data inpath '/data/male.txt'
into table stu_internal;

-- (3) Load local data, overwriting the table's current contents.
load data local inpath '/home/shiny/Desktop/data/female.txt'
overwrite into table stu_internal;

-- ---------------------------------------------------------------
-- 2. INSERT: insert data
-- ---------------------------------------------------------------

-- (1) Single-row insert (not normally used).
insert into table stu_internal values (1116, 'bob', 23);

-- (2) Insert a query result into a new table (the new table must be
--     created manually beforehand).
-- Copy a table: only the existing table structure is copied, not the data.
create table student like stu_internal;

-- Insert the data into the new table.
-- "insert into" appends; write "insert overwrite table student ..." instead
-- to replace the table's existing contents.
insert into table student
select *
from stu_internal
where age >= 23;
-- (3) Multi-insert (the target tables must be created beforehand).

-- Create the second target table.
create table stu_insert (
    id   int,
    name string
)
row format delimited fields terminated by '\t';

-- Multi-insert: stu_internal is named once in FROM and feeds both inserts.
from stu_internal
insert into table student
    select * where age < 23
insert into table stu_insert
    select id, name;

-- (4) CTAS (create table ... as select ...): the new table does not need to
--     be created beforehand. If the SELECT fails for any reason, the new
--     table is not created.
create table stu_ctas as
select
    id,
    age
from stu_internal
where age < 23;
-- ---------------------------------------------------------------
-- 3. INSERT: export data (must be OVERWRITE; INTO cannot be used)
-- ---------------------------------------------------------------

-- (1) Single export to the local file system.
--     ^A (ctrl+A) is the column delimiter and \n the row delimiter.
insert overwrite local directory '/home/shiny/Desktop/data/student'
select * from student;

-- (2) Single export to HDFS.
--     ^A (ctrl+A) is the column delimiter and \n the row delimiter.
insert overwrite directory '/student'
select * from student;
-- ---------------------------------------------------------------
-- 4. SELECT: query data
-- ---------------------------------------------------------------

-- Create the table.
create table if not exists score (
    id     int,
    name   string,
    course string,
    score  int
)
row format delimited fields terminated by '\t';

-- Load local data.
load data local inpath '/home/shiny/Desktop/data/score.txt'
into table score;

-- (1) GROUP BY: total score of each student.
--     Every selected column must either appear in the GROUP BY clause or be
--     wrapped in an aggregate function (sum, count, ...); referencing a
--     non-grouped column directly is not supported.
--     The aggregate is aliased total_score rather than count, since it is a
--     sum and "count" is also the name of an aggregate function.
select
    id,
    name,
    sum(score) as total_score
from score
group by id, name;

-- (2) ORDER BY: the student with the highest total score overall.
--     ORDER BY is a global sort; the default direction is ascending (asc).
select
    id,
    name,
    sum(score) as total_score
from score
group by id, name
order by total_score desc
limit 1;
-- (3) SORT BY: student rows sorted by id descending (a local sort).
-- Set the number of reducers to 2.
set mapreduce.job.reduces=2;

create table stu_sort as
select *
from student
sort by id desc;

-- (4) DISTRIBUTE BY + SORT BY: a combined bucketing and sorting operation.
--     Rows are bucketed on id, then sorted by age descending and, for equal
--     age, by id descending.
-- Enable bucketing enforcement.
-- NOTE(review): this property may be obsolete on newer Hive releases -
-- confirm against the target version.
SET hive.enforce.bucketing = true;
-- Set the number of reduce tasks, i.e. the number of buckets.
SET mapreduce.job.reduces=3;

insert overwrite local directory '/home/shiny/Desktop/data/distr'
select *
from student
distribute by id
sort by age desc, id desc;

-- (5) CLUSTER BY: bucket on id and sort on id ascending.
--     CLUSTER BY combines bucketing and sorting and equals
--     DISTRIBUTE BY + SORT BY when both use the same column.
insert overwrite local directory '/home/shiny/Desktop/data/cluster'
select *
from student
cluster by id; -- equivalent to: distribute by id sort by id
-- ---------------------------------------------------------------
-- Joins
-- ---------------------------------------------------------------

-- Inner join: returns only the rows that satisfy the join condition.
select *
from studenta a
join studentb b
    on a.id = b.id;

-- Left outer join: the left table is the base; unmatched right-side
-- columns are NULL.
select *
from studenta a
left join studentb b
    on a.id = b.id;

-- Right outer join: the right table is the base.
select *
from studenta a
right join studentb b
    on a.id = b.id;

-- Full outer join: both tables are the base; matched rows appear once.
-- The left table is studenta, matching the other join examples (the
-- original referenced "student" here).
select *
from studenta a
full join studentb b
    on a.id = b.id;

-- Left semi join: returns only the left table's columns for rows that
-- have a match in the right table.
select *
from studenta a
left semi join studentb b
    on a.id = b.id;