1. 程式人生 > 實用技巧 >hive新增欄位和修改欄位的影響

hive新增欄位和修改欄位的影響

-- ORC storage format
--
-- Scratch experiment on a partitioned external ORC table: what happens to old
-- and new partitions when a column is added or renamed.
-- NOTE(review): the statements are kept in their original order, which looks
-- like an interactive session rather than a batch script. Some queries only
-- succeed after a later ALTER or INSERT has run - see the notes on each step.

-- Step 1: start from a clean two-column table partitioned by inc_day.
DROP TABLE IF EXISTS tmp_dm_test_a.t_aa_orc;
USE tmp_dm_test_a;
CREATE EXTERNAL TABLE IF NOT EXISTS tmp_dm_test_a.t_aa_orc (
    user_id  STRING COMMENT '使用者id',
    all_addr STRING COMMENT '常用地址'
)
PARTITIONED BY (
    inc_day STRING COMMENT 'inc_day used by partition'
)
STORED AS ORC
TBLPROPERTIES (
    'orc.compress' = 'SNAPPY'
);

-- Session settings for the dynamic-partition inserts below.
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.fetch.task.conversion=more;
SET hive.exec.parallel=true;
SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
SET mapreduce.output.fileoutputformat.compress.type=BLOCK;

-- Step 2: load partitions 20180101 and 20180102 with the two-column layout.
-- The last column feeds the dynamic partition and is aliased inc_day so the
-- INSERT can name every column instead of relying on SELECT * ordering.
WITH tmp AS (
    SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, '20180101' AS inc_day
    UNION ALL
    SELECT 'sf2222' AS user_id, '江西省' AS all_addr, '20180101' AS inc_day
    UNION ALL
    SELECT 'sf3333' AS user_id, '上東省' AS all_addr, '20180101' AS inc_day
    UNION ALL
    SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, '20180102' AS inc_day
    UNION ALL
    SELECT 'sf2222' AS user_id, '江西省' AS all_addr, '20180102' AS inc_day
    UNION ALL
    SELECT 'sf3333' AS user_id, '上東省' AS all_addr, '20180102' AS inc_day
)
INSERT OVERWRITE TABLE tmp_dm_test_a.t_aa_orc PARTITION (inc_day)
SELECT
    user_id,
    all_addr,
    inc_day
FROM tmp;

-- Step 3: verification queries.
-- SELECT * is deliberate here: the point is to see whichever columns the
-- current table definition exposes.
SELECT * FROM tmp_dm_test_a.t_aa_orc;

-- The next three queries reference original_union_id, which the table only
-- has once the ADD COLUMNS in step 4 (or the re-create in step 5) has run.
-- Partition 20180103 is only written by the INSERT in step 6.
SELECT user_id, all_addr, original_union_id
FROM tmp_dm_test_a.t_aa_orc;

SELECT user_id, all_addr, original_union_id
FROM tmp_dm_test_a.t_aa_orc
WHERE inc_day = '20180101';

SELECT user_id, all_addr, original_union_id
FROM tmp_dm_test_a.t_aa_orc
WHERE inc_day = '20180103';

-- ----
-- phone_number only exists after the rename in step 7.
SELECT phone_number, all_addr, original_union_id
FROM tmp_dm_test_a.t_aa_orc;

-- Step 4: add the new column through ALTER TABLE.
-- First at table level with CASCADE, then on the single partition 20180101.
ALTER TABLE tmp_dm_test_a.t_aa_orc ADD COLUMNS (original_union_id STRING) CASCADE;

ALTER TABLE tmp_dm_test_a.t_aa_orc PARTITION (inc_day = '20180101') ADD COLUMNS (original_union_id STRING);

-- Step 5: alternative way to get the new column - drop the table definition,
-- re-create it with three columns, then re-register the partitions.
-- NOTE(review): this relies on the table being EXTERNAL so that the earlier
-- data files are still there for MSCK REPAIR to find - confirm for your setup.
DROP TABLE IF EXISTS tmp_dm_test_a.t_aa_orc;
USE tmp_dm_test_a;
CREATE EXTERNAL TABLE IF NOT EXISTS tmp_dm_test_a.t_aa_orc (
    user_id           STRING COMMENT '使用者id',
    all_addr          STRING COMMENT '常用地址',
    original_union_id STRING
)
PARTITIONED BY (
    inc_day STRING COMMENT 'inc_day used by partition'
)
STORED AS ORC
TBLPROPERTIES (
    'orc.compress' = 'SNAPPY'
);

MSCK REPAIR TABLE tmp_dm_test_a.t_aa_orc;

-- Step 6: load partitions 20180103 and 20180104 with the three-column layout.
WITH tmp AS (
    SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, 'sf0x1111' AS original_union_id, '20180103' AS inc_day
    UNION ALL
    SELECT 'sf2222' AS user_id, '江西省' AS all_addr, 'sf0x2211' AS original_union_id, '20180103' AS inc_day
    UNION ALL
    SELECT 'sf3333' AS user_id, '上東省' AS all_addr, 'sf0x3311' AS original_union_id, '20180103' AS inc_day
    UNION ALL
    SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, 'sf0x4411' AS original_union_id, '20180104' AS inc_day
    UNION ALL
    SELECT 'sf2222' AS user_id, '江西省' AS all_addr, 'sf0x5511' AS original_union_id, '20180104' AS inc_day
    UNION ALL
    SELECT 'sf3333' AS user_id, '上東省' AS all_addr, 'sf0x6611' AS original_union_id, '20180104' AS inc_day
)
INSERT OVERWRITE TABLE tmp_dm_test_a.t_aa_orc PARTITION (inc_day)
SELECT
    user_id,
    all_addr,
    original_union_id,
    inc_day
FROM tmp;

-- ********************
-- Step 7: rename user_id to phone_number.
-- NOTE(review): the two statements look like alternative forms of the same
-- rename (without and with CASCADE). Run only one of them - once user_id has
-- been renamed, the other statement has no user_id column left to change.
ALTER TABLE tmp_dm_test_a.t_aa_orc CHANGE COLUMN user_id phone_number STRING;
ALTER TABLE tmp_dm_test_a.t_aa_orc CHANGE COLUMN user_id phone_number STRING CASCADE;

-- -------------
-- Step 8: inspect the resulting table definition.
SHOW CREATE TABLE tmp_dm_test_a.t_aa_orc;

parquet儲存格式

-- Parquet experiment, step 1: start from a clean two-column table
-- partitioned by inc_day and stored as Snappy-compressed Parquet.
DROP TABLE IF EXISTS tmp_dm_test_a.t_aa;

USE tmp_dm_test_a;

CREATE EXTERNAL TABLE IF NOT EXISTS tmp_dm_test_a.t_aa (
    user_id  STRING COMMENT '使用者id',
    all_addr STRING COMMENT '常用地址'
)
PARTITIONED BY (
    inc_day STRING COMMENT 'inc_day used by partition'
)
STORED AS PARQUET
TBLPROPERTIES (
    'parquet.compression' = 'SNAPPY'
);



-- Session settings used by the inserts and queries in this experiment.

-- Dynamic partitioning: the INSERT statements take inc_day from the data
-- and give no static partition value.
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;

-- Query execution.
SET hive.fetch.task.conversion=more;
SET hive.exec.parallel=true;

-- MapReduce output compression.
SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
SET mapreduce.output.fileoutputformat.compress.type=BLOCK;

-- Load partitions 20180101 and 20180102 with the two-column layout.
-- The last column feeds the dynamic partition. It is aliased inc_day so the
-- INSERT can name every column instead of relying on SELECT * ordering and an
-- auto-generated column name.
WITH tmp AS (
    SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, '20180101' AS inc_day
    UNION ALL
    SELECT 'sf2222' AS user_id, '江西省' AS all_addr, '20180101' AS inc_day
    UNION ALL
    SELECT 'sf3333' AS user_id, '上東省' AS all_addr, '20180101' AS inc_day
    UNION ALL
    SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, '20180102' AS inc_day
    UNION ALL
    SELECT 'sf2222' AS user_id, '江西省' AS all_addr, '20180102' AS inc_day
    UNION ALL
    SELECT 'sf3333' AS user_id, '上東省' AS all_addr, '20180102' AS inc_day
)
INSERT OVERWRITE TABLE tmp_dm_test_a.t_aa PARTITION (inc_day)
SELECT
    user_id,
    all_addr,
    inc_day
FROM tmp;

-- Verification queries for the Parquet table.
-- SELECT * is deliberate: it shows whichever columns the current table
-- definition exposes.
SELECT *
FROM tmp_dm_test_a.t_aa;

-- Whole table, then partition 20180101 only, reading the added column.
-- NOTE(review): original_union_id is not part of the two-column CREATE TABLE,
-- so these presumably are meant to be run after the column has been added.
SELECT
    user_id,
    all_addr,
    original_union_id
FROM tmp_dm_test_a.t_aa;

SELECT
    user_id,
    all_addr,
    original_union_id
FROM tmp_dm_test_a.t_aa
WHERE inc_day = '20180101';

----
-- NOTE(review): phone_number is the name user_id is changed to by the
-- CHANGE COLUMN statements elsewhere in this script - presumably run after
-- that rename.
SELECT
    phone_number,
    all_addr,
    original_union_id
FROM tmp_dm_test_a.t_aa;


 -- Add original_union_id to the table definition (no CASCADE here).
 ALTER TABLE tmp_dm_test_a.t_aa
     ADD COLUMNS (original_union_id STRING);

 -- Add the same column on the single partition inc_day=20180101.
 ALTER TABLE tmp_dm_test_a.t_aa PARTITION (inc_day = '20180101')
     ADD COLUMNS (original_union_id STRING);
 
 
 
 
-- Alternative to ALTER TABLE: drop the table definition, re-create it with
-- original_union_id as a third column, then re-register the partitions.
-- NOTE(review): this relies on the table being EXTERNAL so that the earlier
-- data files are still there for MSCK REPAIR to find - confirm for your setup.
DROP TABLE IF EXISTS tmp_dm_test_a.t_aa;

USE tmp_dm_test_a;

CREATE EXTERNAL TABLE IF NOT EXISTS tmp_dm_test_a.t_aa (
    user_id           STRING COMMENT '使用者id',
    all_addr          STRING COMMENT '常用地址',
    original_union_id STRING
)
PARTITIONED BY (
    inc_day STRING COMMENT 'inc_day used by partition'
)
STORED AS PARQUET
TBLPROPERTIES (
    'parquet.compression' = 'SNAPPY'
);

MSCK REPAIR TABLE tmp_dm_test_a.t_aa;

-- Load partitions 20180103 and 20180104 with the three-column layout
-- (original_union_id populated).
-- The last column feeds the dynamic partition. It is aliased inc_day so the
-- INSERT can name every column instead of relying on SELECT * ordering and an
-- auto-generated column name.
WITH tmp AS (
    SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, 'sf0x1111' AS original_union_id, '20180103' AS inc_day
    UNION ALL
    SELECT 'sf2222' AS user_id, '江西省' AS all_addr, 'sf0x2211' AS original_union_id, '20180103' AS inc_day
    UNION ALL
    SELECT 'sf3333' AS user_id, '上東省' AS all_addr, 'sf0x3311' AS original_union_id, '20180103' AS inc_day
    UNION ALL
    SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, 'sf0x4411' AS original_union_id, '20180104' AS inc_day
    UNION ALL
    SELECT 'sf2222' AS user_id, '江西省' AS all_addr, 'sf0x5511' AS original_union_id, '20180104' AS inc_day
    UNION ALL
    SELECT 'sf3333' AS user_id, '上東省' AS all_addr, 'sf0x6611' AS original_union_id, '20180104' AS inc_day
)
INSERT OVERWRITE TABLE tmp_dm_test_a.t_aa PARTITION (inc_day)
SELECT
    user_id,
    all_addr,
    original_union_id,
    inc_day
FROM tmp;


-- ********************
-- Rename user_id to phone_number on the Parquet table.
-- NOTE(review): the two statements look like alternative forms of the same
-- rename (without and with CASCADE). Run only one of them - once user_id has
-- been renamed, the other statement has no user_id column left to change.
ALTER TABLE tmp_dm_test_a.t_aa CHANGE COLUMN user_id phone_number STRING;
ALTER TABLE tmp_dm_test_a.t_aa CHANGE COLUMN user_id phone_number STRING CASCADE;

-- *********************************************

結論:

1、parquet和orc格式,舊分割槽中資料檔案內容不可變。

2、parquet和orc格式:欄位增加後,舊資料檔案中無新欄位內容;新產生的分割槽中資料檔案才會有新欄位內容。

3、parquet和orc格式:通過add語句新增欄位後,舊分割槽和新分割槽都可以查詢新欄位,只是舊分割槽資料中新欄位的值為NULL。

4、parquet格式:修改欄位名後,無法從舊資料解析原欄位內容,相當於新舊欄位名沒有映射關係。select不能以新欄位名解析舊資料,因為舊資料中只有舊欄位名、沒有新欄位名,而且新欄位名無法映射到舊欄位名。

5、orc格式:修改欄位名後,可以從舊資料解析原欄位內容,相當於新舊欄位名有映射關係。select可以解析新欄位名,因為雖然舊資料中只有舊欄位名、沒有新欄位名,但是新欄位名映射到了舊欄位名,相當於一個別名。