Hive資料倉庫

阿新 • • 發佈：2021-09-24

以下全部是在Zeppelin上操作

一、內部表：

1.向內部表插入資料

%hive
-- Seed rows for the table (commented out, presumably because they were already inserted):
-- INSERT INTO mydemo.xxx VALUES (1,'zhangsan',25),(2,'wangwu',27)
SELECT *
FROM mydemo.xxx

2.建立1個壓縮格式為orc的內部表

%hive
-- Managed table whose data is stored in the ORC file format.
-- NOTE(review): the field-delimiter clause presumably has no effect on ORC storage — confirm.
CREATE TABLE mydemo.yyy (
    id   STRING,
    name STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS ORC

3.向表中插入資料

%hive
-- Insert sample rows into the ORC table mydemo.yyy.
-- (This step previously repeated the CREATE TABLE statement, which fails once the table exists.)
-- Both columns are declared as STRING, so the id values are quoted.
INSERT INTO mydemo.yyy VALUES ('1','zhangsan'),('2','wangwu')

二、外部表

1.建立一個外部表

首先在Hadoop上建立一個數據目錄，後面將資料檔案上傳到此，再建立外部表對映到這裡

-- 在Hadoop上建立一個數據目錄
%sh
# Create the HDFS directory that will hold the external table's data files.
# -p keeps the command from failing when the directory already exists.
hdfs dfs -mkdir -p /tab


-- 上傳資料檔案到外部表的資料夾
%sh
# Upload the local data file into the HDFS directory /tab.
local_file=/opt/data/data.txt
hdfs_dir=/tab
hdfs dfs -put "$local_file" "$hdfs_dir"



-- 建立一個外部表，對映到Hadoop上的資料檔案位置

%hive
-- External table mapped onto the data files under /tab.
-- Fields are separated by a space, collection items (array/struct/map entries) by ',',
-- and map keys from their values by ':'.
CREATE EXTERNAL TABLE mydemo.uss (
    id      STRING,
    name    STRING,
    job     ARRAY<string>,
    sex_age STRUCT<sex:int,age:string>,
    skill   MAP<string,string>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ' '
    COLLECTION ITEMS TERMINATED BY ','
    MAP KEYS TERMINATED BY ':'
LOCATION '/tab'

-- 檢視資料表

%hive
-- Read one value out of each complex column: the first array element,
-- a struct field, and a map entry looked up by key.
SELECT
    id,
    job[0],
    sex_age.age,
    skill['skill1']
FROM mydemo.uss

2.csv檔案作為資料來源

%sh
# Create the HDFS directory for the CSV data (-p: no error if it already exists).
csv_dir=/tab1
hdfs dfs -mkdir -p "$csv_dir"


%sh
# Upload the local CSV file into the HDFS directory /tab1.
csv_file=/opt/data/data.csv
csv_dir=/tab1
hdfs dfs -put "$csv_file" "$csv_dir"


%hive
-- External table over the CSV file(s) in /tab1, parsed with OpenCSVSerde.
-- 'skip.header.line.count' = '1' skips the first (header) line.
--
-- Alternative definition using a plain delimited row format (kept for reference):
--   row format delimited fields terminated by ','
--   location '/tab1'
--   tblproperties('skip.header.line.count'='1')
CREATE EXTERNAL TABLE mydemo.usi (
    id    STRING,
    name  STRING,
    birth STRING,
    hoby  STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
    'separatorChar' = ',',
    'quoteChar'     = '\"',
    'escapeChar'    = '\\'
)
LOCATION '/tab1'
TBLPROPERTIES ('skip.header.line.count' = '1')

三、根據已有的表建立新表，只能建內部表

%hive
-- Copy both structure and data:
-- CREATE TABLE mydemo.usi1 AS SELECT * FROM mydemo.usi

-- Copy the structure only:
-- CREATE TABLE mydemo.usi2 LIKE mydemo.usi

-- CTE form: not recommended for creating tables, but very convenient in queries.
CREATE TABLE mydemo.usi3 AS
WITH
    r1 AS (
        SELECT userid id, username name
        FROM mydemo.userinfos
    ),
    r2 AS (
        SELECT id, name
        FROM mydemo.usi
    )
SELECT id, name FROM r1
UNION ALL
SELECT id, name FROM r2

四、使用load向表中傳資料

1.分別建立一張內部表和一張外部表：

-- 內部表
%hive
-- Managed table stored as plain text with ',' as the field separator.
-- DROP TABLE mydemo.test1
CREATE TABLE mydemo.test1 (
    id   STRING,
    name STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE

-- External-table counterpart (left commented out):
-- CREATE EXTERNAL TABLE mydemo.test2 (
--     id   STRING,
--     name STRING
-- )
-- ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
-- LOCATION '/tab2'

2.用load載入hive中的資料 ---> ETL 資料格式不能轉換

%hive
-- LOCAL means the path is on the local Linux filesystem; without LOCAL the file is loaded from Hadoop.
-- Without OVERWRITE the load appends: existing data stays and new data is added after it.
LOAD DATA LOCAL INPATH '/opt/data/dd.txt'
INTO TABLE mydemo.test1

-- With OVERWRITE every load replaces the existing data (full-refresh style):
-- LOAD DATA LOCAL INPATH '/opt/data/dd.txt' OVERWRITE INTO TABLE mydemo.test2

五、建立分割槽表

　　分割槽表操作：實際上就是建立一個個的資料夾，將資料按照你的分割槽約定，分別存放進去。分為靜態分割槽和動態分割槽

　　建立一個分割槽表，id，name，birthmonth三列，其中按birthmonth分割槽：

%hive
-- Table partitioned by birthmonth; the partition column is declared in
-- PARTITIONED BY, outside the regular column list.
CREATE TABLE mydemo.my_part (
    id   STRING,
    name STRING
)
PARTITIONED BY (birthmonth STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','

1.手工建立一個靜態分割槽

%hive
-- Adding a static partition by hand is, in effect, creating a directory manually.
ALTER TABLE mydemo.my_part
ADD PARTITION (birthmonth = '01')

　　此時去你的50070埠，你會發現在/hive/warehouse/mydemo.db/my_part路徑下，為你生成好了一個 ‘birthmonth=01’資料夾

2.向指定分割槽裡面插入Linux本地的資料檔案進去（兩種方法：靜態和動態）

一：靜態將資料以全量表（覆蓋）的方式插入
%hive
-- Load a local file into the birthmonth='01' partition, replacing what the partition already holds.
LOAD DATA LOCAL INPATH '/opt/data/dd.txt'
OVERWRITE INTO TABLE mydemo.my_part
PARTITION (birthmonth = '01')

二：動態插入一些資料
%hive
-- Append the id/name rows of mydemo.usi to the birthmonth='01' partition.
INSERT INTO mydemo.my_part PARTITION (birthmonth = '01')
SELECT
    id,
    name
FROM mydemo.usi

3.再建立一個動態分割槽，並向其中插入資料

%hive
-- Dynamic partitioning: one partition per distinct value of the partition
-- column in the query result (the column's cardinality).
-- Session settings (left commented out here):
-- set hive.exec.dynamic.partition=true;
-- set hive.exec.dynamic.partition.mode=nonstrict;
-- NOTE(review): mydemo.my_part1 is not created anywhere in this walkthrough —
-- presumably it has the same layout as mydemo.my_part; confirm.

-- Load the rows; the birthmonth column of the query supplies the partition value.
INSERT INTO mydemo.my_part1 PARTITION (birthmonth)
SELECT
    id,
    name,
    month(regexp_replace(birth,'/','-')) AS birthmonth
FROM mydemo.usi

六、分桶表

1.先建立一個臨時表

%hive
-- Temporary staging table for the raw rows (definition left commented out):
-- CREATE TEMPORARY TABLE mydemo.tmp (
--     id     STRING,
--     name   STRING,
--     gender STRING
-- )
-- ROW FORMAT DELIMITED FIELDS TERMINATED BY ','

-- Replace the staging table's contents with the local CSV file.
LOAD DATA LOCAL INPATH '/opt/data/ddd.csv'
OVERWRITE INTO TABLE mydemo.tmp

2.建立分桶表

%hive
-- Bucketed table definition (left commented out):
-- CREATE TABLE mydemo.bck_user (
--     id     STRING,
--     name   STRING,
--     gender STRING
-- )
-- CLUSTERED BY (gender)   -- column the rows are bucketed on
-- INTO 2 BUCKETS          -- number of buckets
-- ROW FORMAT DELIMITED FIELDS TERMINATED BY ','

-- Session settings (left commented out):
-- set hive.enforce.bucketing=true
-- set mapreduce.job.reduces=2

-- Copy every staged row into the bucketed table.
INSERT INTO TABLE mydemo.bck_user
SELECT *
FROM mydemo.tmp

3.檢視分桶表

%hive
-- Sample the bucketed table (this step previously queried the staging table mydemo.tmp).
-- TABLESAMPLE(BUCKET x OUT OF y ON col):
--   n = total number of buckets of the table (2 here)
--   x = the bucket at which sampling starts
--   y = sampling divisor; n / y buckets are read, here 2 / 2 = 1 bucket
SELECT *
FROM mydemo.bck_user TABLESAMPLE (BUCKET 2 OUT OF 2 ON gender)

七、建立檢視

%hive
create view mydemo.view_user as
select id,name,if(gender='男',1,0) sex from mydemo.bck_user

%hive
-- Read back every column the view exposes.
SELECT
    id,
    name,
    sex
FROM mydemo.view_user

八、explode函式和posexplode函式

%hive
-- SELECT * FROM mydemo.usi
-- A field may hold several values (an array). explode() pulls all of them out,
-- values only. An alias is required.
-- SELECT id, name, birth, ind FROM mydemo.usi LATERAL VIEW explode(split(hoby,',')) b AS ind

-- Building on the above, posexplode() pulls out each array index together with
-- its value, one pair per row. Aliases are required.
SELECT
    id,
    name,
    birth,
    ind,
    schem
FROM mydemo.usi
LATERAL VIEW posexplode(split(hoby,',')) b AS ind, schem