1. 程式人生 > >hive不同格式資料大小,無重複資料

hive不同格式資料大小,無重複資料

-- 重點,目標表無重複資料

-- Baseline table for the storage-format size comparison: a plain-text copy of
-- one slice of dbName.num_result (the source is stated to hold no duplicate records).
-- Drop first so the script can be re-run; a bare CREATE TABLE fails when the
-- table already exists.
DROP TABLE IF EXISTS dbName.test_textfile;
CREATE TABLE dbName.test_textfile (
  `key`    string,
  `value`  string,
  `p_key`  string,
  `p_key2` string
)
STORED AS TEXTFILE;

-- Load the slice p_key = '9', p_key2 = '0' into the baseline table.
-- NOTE(review): SELECT * maps source columns to target columns by position;
-- this presumably relies on num_result exposing exactly (key, value, p_key,
-- p_key2) in that order -- confirm against the source schema before replacing
-- * with an explicit column list.
INSERT OVERWRITE TABLE dbName.test_textfile
SELECT *
FROM dbName.num_result
WHERE p_key = '9'
  AND p_key2 = '0';

-- Copy the text-format baseline into ORC, RCFile and Parquet tables with the
-- same four string columns, then compare row counts and on-disk size.
-- Every statement sits on its own line: a `--` comment runs to end of line, so
-- an inline comment must never share a line with the statements that follow it.
-- Each table is dropped with IF EXISTS first so the script can be re-run.
-- The source table is always referenced as dbName.test_textfile so the result
-- does not depend on the session's current database.

-- ORC copy
DROP TABLE IF EXISTS dbName.test_orcfile;
CREATE TABLE dbName.test_orcfile (
  `key`    string,
  `value`  string,
  `p_key`  string,
  `p_key2` string
)
STORED AS ORC;
INSERT OVERWRITE TABLE dbName.test_orcfile
SELECT *
FROM dbName.test_textfile;

-- RCFile copy
DROP TABLE IF EXISTS dbName.test_rcfile;
CREATE TABLE dbName.test_rcfile (
  `key`    string,
  `value`  string,
  `p_key`  string,
  `p_key2` string
)
STORED AS RCFILE;
INSERT OVERWRITE TABLE dbName.test_rcfile
SELECT *
FROM dbName.test_textfile;

-- Parquet copy
DROP TABLE IF EXISTS dbName.test_parquet;
CREATE TABLE dbName.test_parquet (
  `key`    string,
  `value`  string,
  `p_key`  string,
  `p_key2` string
)
STORED AS PARQUET;
INSERT OVERWRITE TABLE dbName.test_parquet
SELECT *
FROM dbName.test_textfile;

-- Row counts: all four tables should report the same number of rows.
SELECT COUNT(1) AS cnt FROM dbName.test_textfile;
SELECT COUNT(1) AS cnt FROM dbName.test_orcfile;
SELECT COUNT(1) AS cnt FROM dbName.test_rcfile;
SELECT COUNT(1) AS cnt FROM dbName.test_parquet;

-- File sizes: summarized, human-readable disk usage of each table directory.
dfs -du -s -h hdfs://nameservice1/user/hive/warehouse/dbName.db/test_text*;
dfs -du -s -h hdfs://nameservice1/user/hive/warehouse/dbName.db/test_par*;
dfs -du -s -h hdfs://nameservice1/user/hive/warehouse/dbName.db/test_rc*;
dfs -du -s -h hdfs://nameservice1/user/hive/warehouse/dbName.db/test_orc*;
1.0 G  3.1 G  hdfs://nameNode/user/hive/warehouse/dbName.db/test_textfile
1.1 G  3.3 G  hdfs://nameNode/user/hive/warehouse/dbName.db/test_parquet
984.0 M  2.9 G  hdfs://nameNode/user/hive/warehouse/dbName.db/test_rcfile
470.0 M  1.4 G  hdfs://nameNode/user/hive/warehouse/dbName.db/test_orcfile

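從結果可以看出,在無重複資料的情況下,parquet 佔用的空間(1,167,366,639 位元組)比 textfile(1,110,741,501 位元組)還大,rcfile(1,031,774,688 位元組)只略小於 textfile,而 ORC(492,795,434 位元組,約為 textfile 的 44%)是壓縮最強的檔案格式。需注意:上述建表語句都沒有指定壓縮參數,因此比較的是各格式的預設設定;Hive 中 ORC 預設啟用 ZLIB 壓縮,而 Parquet 預設通常不壓縮(請依實際版本確認),所以這個結果反映的不只是格式本身的差異。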

 

hive (dbName)> dfs -du -s hdfs://nameNode/user/hive/warehouse/dbName.db/test_text*;
1110741501  3332224503  hdfs://nameNode/user/hive/warehouse/dbName.db/test_textfile
hive (dbName)> dfs -du -s hdfs://nameNode/user/hive/warehouse/dbName.db/test_par*;
1167366639  3502099917  hdfs://nameNode/user/hive/warehouse/dbName.db/test_parquet
hive (dbName)> dfs -du -s hdfs://nameNode/user/hive/warehouse/dbName.db/test_rc*;
1031774688  3095324064  hdfs://nameNode/user/hive/warehouse/dbName.db/test_rcfile
hive (dbName)> dfs -du -s hdfs://nameNode/user/hive/warehouse/dbName.db/test_orc*;
492795434  1478386302  hdfs://nameNode/user/hive/warehouse/dbName.db/test_orcfile