
Data Definition and Description


create table employee (
name string,
work_place array<string>,         -- accessed as array_name[0]
gender_age struct<gender:string, age:int>,  -- struct<col_name:type, ...>  similar to an HBase column family; accessed as struct_name.col_name
skills_score map<string, int>,    -- map_name[key]
apart_title map<string, array<string>>
)
row format delimited
fields terminated by "|"
collection items terminated by ","
map keys terminated by ":";

!table employee   -- not needed
!column employee  -- not needed
describe formatted employee;  -- use this instead; the output is more readable
load data local inpath "/home/centos/hive essential/ch03/employee.txt" overwrite into table employee; 
# query the whole array 
select work_place from employee; 
select work_place[0] as col_1, work_place[1] as col_2, work_place[2] as col_3 from employee;
# query the whole struct and its fields
select gender_age from employee;
select gender_age.gender , gender_age.age from employee;
# query the whole map and each key as a column:
select skills_score from employee;
select name, 
skills_score["DB"] as DB, 
skills_score["Perl"] as Perl,
skills_score["Python"] as Python,
skills_score["Sales"] as Sales,
skills_score["HR"] as HR
from employee;
# query the composite type
select apart_title from employee;
select name,
apart_title["Product"] as Product,
apart_title["Test"] as Test,
apart_title["COE"] as COE,
apart_title["Sales"] as Sales
from employee;

DDL
Database operations
create database if not exists myhivebook
comment "cho3 hive database in practice"  --添加描述
location "/hdfs/hive"   --hdfs上的路徑
with dbproperties ("name"="MengRui", "date"="2018-08-20");

show databases;
describe database myhivebook;   -- print information about the specified database

use myhivebook;

drop database if exists myhivebook; -- drop an empty database
drop database if exists myhivebook cascade; -- drop a database that still contains tables

alter database myhivebook   -- set database properties
set dbproperties ("edited by"="dog");
alter database myhivebook
set owner user dog;

Table operations
create external table external_employee (
name string,
work_place array<string>,         -- accessed as array_name[0]
gender_age struct<gender:string, age:int>,  -- struct<col_name:type, ...>  accessed as struct_name.col_name
skills_score map<string, int>,    -- map_name[key]
apart_title map<string, array<string>>
)
comment "this is a external table" --屬性位置固定,否則會報錯
row format delimited
fields terminated by "|"
collection items terminated by ","
map keys terminated by ":"
stored as textfile
location "/user/ch03/employee"; -- this path must not contain other subdirectories, otherwise queries fail; if the path does not exist, Hive creates it automatically
    
load data local inpath "/home/centos/hive essential/ch03/employee.txt" overwrite into table external_employee;
create temporary table temporary_name... ???
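A minimal sketch of the temporary-table syntax the note above points at, assuming the employee table defined earlier; the name temp_employee is illustrative. A temporary table is visible only to the current session and is dropped automatically when the session ends.

create temporary table temp_employee as   -- illustrative name; session-scoped
select name, work_place from employee;    -- dropped automatically when the session closes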

--CTAS copy metadata and data to new table
create table ctas_employee as 
select * from external_employee;

-- CTAS with a CTE (Common Table Expression)
-- From the male employees, select the name "Michael"; also select the names of all female employees
create table cte_employee as  -- CTAS
with r1 as (select name from r2 where name = "Michael"), -- CTE
r2 as (select name from employee where gender_age.gender = "Male"),
r3 as (select name from employee where gender_age.gender = "Female")
select * from r1 union all select * from r3;
select * from cte_employee;

-- create an empty table
-- create table empty_ctas_employee as  -- CTAS launches mappers and is slow; not recommended
-- select * from employee where 1 = 2;
create table empty_like_employee  -- LIKE replicates only the metadata
like employee; --like [table or view]

-- count the rows
select count(*) as row_counts from employee;

-- completely drop an internal table: removes the metadata and moves the data to the Trash
drop table if exists empty_ctas_employee; 

-- remove all the rows from an internal table
truncate table cte_employee;

ALTER only changes the metadata
-- alter table rename
alter table internal_employee rename to empty_employee;

alter table employee set -- add or update table properties
tblproperties("comment" = "this is internal table");

alter table employee set
serdeproperties("field.delim" = "$");

alter table employee set
location "hdfs://mycluster/user/hive/warehouse/new_employee";  -- 設置路徑,hive不會自動創建路徑,路徑必須為hdfs中的絕對路徑

alter table external_employee partition(year = 2012, month = 1, day = 1) enable no_drop;    -- prevent the partition from being dropped
alter table external_employee enable offline;      -- prevent the data (not the metadata) of the partitions from being queried

alter table employee concatenate;  -- merge small files into larger files; only the RCFile and ORC file formats are supported at the moment

alter table employee set fileformat rcfile;    -- set the file format
alter table employee set fileformat textfile;

--check column type
desc employee;
 
alter table empty_employee  -- the operations below only change metadata; the data files must match the updated schema
change column name employee_name string  -- rename the column (old_name new_name type)
after work_place;  -- move the column after work_place

alter table empty_employee 
add columns (wife string);  -- add a new column

alter table empty_employee 
replace columns(wife string);  -- replace all existing columns with this single column

Partitioned tables
-- create a partitioned table; it greatly reduces query time and bandwidth
create table partition_employee (
name string,
work_place array<string>,         -- accessed as array_name[0]
gender_age struct<gender:string, age:int>,  -- struct<col_name:type, ...>  accessed as struct_name.col_name
skills_score map<string, int>,    -- map_name[key]
apart_title map<string, array<string>>
)
partitioned by (year int, month int)
row format delimited
fields terminated by "|"          
collection items terminated by ","
map keys terminated by ":";
-- check the partitions
show partitions partition_employee;  
-- 1) a newly created table has no partitions; add them manually
alter table partition_employee add
partition (year = 2017, month = 07)
partition (year = 2017, month = 08);
--2)load data into partitions
load data local inpath "/home/centos/hive essential/ch03/employee.txt" -- local: load from the local file system
overwrite into table partition_employee 
partition (year = 2017, month = 7);

-- before querying the partitioned data, first set:
set hive.strict.checks.large.query=false;
set hive.mapred.mode=nonstrict;
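A hedged usage sketch reusing the partition loaded above: the settings allow a full scan, but the usual pattern is to filter on the partition columns so Hive prunes the partitions it reads.

select name, work_place
from partition_employee
where year = 2017 and month = 7;   -- partition pruning: only files under year=2017/month=7 are scanned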

-- drop the partition
alter table partition_employee
drop if exists partition (year = 2017, month = 7);

Bucketed tables
-- 1) prepare another dataset and table for the bucketed table
create table employee_id (
name string,
employee_id int,   -- bucket column 
work_place array<string>,         -- accessed as array_name[0]
gender_age struct<gender:string, age:int>,  -- struct<col_name:type, ...>  accessed as struct_name.col_name
skills_score map<string, int>,    -- map_name[key]
apart_title map<string, array<string>>
)
row format delimited
fields terminated by "|"
collection items terminated by ","
map keys terminated by ":";
load data local inpath "/home/centos/hive essential/ch03/employee_id.txt"
overwrite into table employee_id;
-- 2) create the bucketed table
create table employee_id_buckets (
name string,
employee_id int,   -- bucket column
work_place array<string>,         -- accessed as array_name[0]
gender_age struct<gender:string, age:int>,  -- struct<col_name:type, ...>  accessed as struct_name.col_name
skills_score map<string, int>,    -- map_name[key]
apart_title map<string, array<string>>
)
clustered by (employee_id) into 2 buckets  -- bucket size: close to two HDFS blocks of data (~256 MB); number of buckets: a power of two (2^N)
row format delimited
fields terminated by "|"
collection items terminated by ","           --tuple1,tuple2,...
map keys terminated by ":";

-- 3) settings for bucketing
set mapred.reduce.tasks = 2;  -- the number of reducers should equal the number of buckets
set hive.enforce.bucketing = true;  -- let Hive enforce bucketing on insert

-- 4)populate data into buckets
insert overwrite table employee_id_buckets     -- must use INSERT rather than LOAD: only INSERT hashes rows into buckets
select * from employee_id;
-- 5) verify the buckets in the HDFS
dfs -ls /user/hive/warehouse/employee_id_buckets;

Views
Views reduce query complexity and improve data security.
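No view example appears above, so here is a minimal sketch assuming the employee table from the earlier examples; the view name employee_skills is illustrative.

create view employee_skills as
select name, skills_score["DB"] as db_score    -- expose only selected columns to hide the rest
from employee;

select * from employee_skills;    -- queried like a table; the underlying query runs at query time
drop view if exists employee_skills;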

References

Programming Hive
Apache Hive Essentials
