1. 程式人生 > 其它 >Hive 表複雜型別欄位使用

Hive 表複雜型別欄位使用

1. Hive中複雜資料型別

  1>. 複雜型別定義

     1.1 map結構資料定義  map<string,string>
     1.2 array結構資料定義 array<string>
     1.3 struct結構資料定義 struct<id:int,name:string,age:int>
     1.4 struct和array巢狀定義 array<struct<id:int,name:string,age:int>>

  2>. 複雜型別資料封裝

     2.1 map型別
         map(key1,val1,key2,val2,....) --使用map函式

     2.2 struct型別
         struct(val1,val2,val3,..) --使用struct構造器函式,對應列名預設是col1,col2,col3,...
         named_struct(name1,val1,name2,val2,..) --使用帶名稱struct構造器函式,指定對應列名

     2.3 array型別
         array(val1,val2,val3,...)
         collect_list() 函式
         collect_set() 函式

  3>. 複雜型別資料訪問

    3.1 map
        map[key] --獲取key對應的value
    3.2 struct
        struct.columnName --columnName代表列名
    3.3 array
        array[index] --index表示索引值

2. 具體使用案例

  1>. 資料準備: 建立一個複雜型別的表+簡單型別表

-- Target table: one row per employee, with ARRAY, MAP and ARRAY<STRUCT> columns.
-- Stored as delimited text: tab between fields, comma between collection
-- items, colon between a map key and its value.
CREATE TABLE test.employee (
    name         STRING,
    salary       FLOAT,
    subordinates ARRAY<STRING>,
    deductions   MAP<STRING, STRING>,
    address      ARRAY<STRUCT<stree:STRING, city:STRING, state:STRING, zip:INT>>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY ','
    MAP KEYS TERMINATED BY ':'
STORED AS TEXTFILE;

-- Flat source table holding only primitive columns; the complex columns of
-- test.employee are assembled from these rows.
CREATE TABLE test.emp (
    name   STRING,
    salary FLOAT,
    subord STRING,
    dedkey STRING,
    dedval FLOAT,
    stree  STRING,
    city   STRING,
    state  STRING,
    zip    INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

-- Seed data: three rows for the same employee 'u001', each carrying a
-- different subordinate, deduction pair and address.
-- NOTE(review): zip is passed as a quoted literal although the column is INT.
INSERT INTO test.emp (name, salary, subord, dedkey, dedval, stree, city, state, zip)
VALUES ('u001', 25000, 'sub001', 'ded-k01', 10.01, 'china', 'beijing', 'use', '100000');

INSERT INTO test.emp (name, salary, subord, dedkey, dedval, stree, city, state, zip)
VALUES ('u001', 25000, 'sub002', 'ded-k02', 20.02, 'china-02', 'shanghai', 'use', '100001');

INSERT INTO test.emp (name, salary, subord, dedkey, dedval, stree, city, state, zip)
VALUES ('u001', 25000, 'sub003', 'ded-k03', 30.03, 'china-03', 'lanzhou', 'use', '100002');

  2>. 根據簡單型別表資料組裝複雜型別表中資料

     1). 使用collect_list()組裝 ARRAY<string>欄位
             -- Gather all subord values of each employee into a single array column.
             SELECT
                 name,
                 collect_list(subord) AS subordinates
             FROM test.emp
             GROUP BY name;
                 name    subordinates
                 u001    ["sub002","sub003","sub001"]
        2). 使用collect_set()組裝 ARRAY<STRUCT<stree:string,city:string,state:string,zip:int>>欄位
            -- Wrap each row's address columns in a named struct, then collect the
            -- structs of each employee into one array column.
            SELECT
                name,
                collect_set(
                    named_struct(
                        'stree', stree,
                        'city',  city,
                        'state', state,
                        'zip',   zip
                    )
                ) AS address
            FROM test.emp
            GROUP BY name;
                name    address
                u001    [{"stree":"china-02","city":"shanghai","state":"use","zip":100001},{"stree":"china-03","city":"lanzhou","state":"use","zip":100002},{"stree":"china","city":"beijing","state":"use","zip":100000}]

        3). 組裝Map型別欄位
            -- Variant: represent each deduction as a {dedkey, dedval} struct and
            -- collect the structs per employee (yields an array, not a map).
            SELECT
                name,
                collect_set(
                    named_struct(
                        'dedkey', dedkey,
                        'dedval', dedval
                    )
                ) AS page_stats
            FROM test.emp
            GROUP BY name;
                u001    [{"dedkey":"ded-k02","dedval":20.02},{"dedkey":"ded-k03","dedval":30.03},{"dedkey":"ded-k01","dedval":10.01}]

            -- Variant: render each deduction as a 'key=value' string and collect
            -- the strings per employee (yields an array of strings).
            SELECT
                name,
                collect_set(
                    concat_ws('=', dedkey, CAST(dedval AS STRING))
                ) AS page_stats
            FROM test.emp
            GROUP BY name;
                u001    ["ded-k02=20.02","ded-k03=30.03","ded-k01=10.01"]

            --  第一步: 將key-value欄位組裝成一個字串,藉助於concat_ws
             -- Step 1: one 'key:value' string per source row.
             -- dedval is cast to STRING so it can be concatenated with dedkey.
             SELECT
                 name,
                 concat_ws(':', dedkey, CAST(dedval AS STRING)) AS kvs
             FROM test.emp;
                     name    kvs
                    u001    ded-k03:30.03
                    u001    ded-k01:10.01
                    u001    ded-k02:20.02
            --  第二步: 將所有屬於同一個人的資料組合在一起,藉助於collect_set
             -- Step 2: collect the per-row 'key:value' strings of each employee
             -- into one array. The inner query builds the pair for every row;
             -- the outer query groups them by name.
             SELECT
                 pairs.name,
                 collect_set(pairs.kv) AS kvs
             FROM (
                 SELECT
                     name,
                     concat_ws(':', dedkey, CAST(dedval AS STRING)) AS kv
                 FROM test.emp
             ) pairs
             GROUP BY pairs.name;
                    name    kvs
                    u001    ["ded-k02:20.02","ded-k03:30.03","ded-k01:10.01"]

            --  第三步: 將陣列變成一個字串,藉助於concat_ws
             -- Step 3: flatten the collected 'key:value' strings of each employee
             -- into a single comma-separated string.
             SELECT
                 pairs.name,
                 concat_ws(',', collect_set(pairs.kv)) AS kvs
             FROM (
                 SELECT
                     name,
                     concat_ws(':', dedkey, CAST(dedval AS STRING)) AS kv
                 FROM test.emp
             ) pairs
             GROUP BY pairs.name;
                    name    kvs
                    u001    ded-k02:20.02,ded-k03:30.03,ded-k01:10.01

            --  第四步:將字串轉成map 使用函式str_to_map(text, delimiter1, delimiter2)
                --  text:是字串
                --  delimiter1:多個鍵值對之間的分隔符
                --  delimiter2:key和value之間的分隔符
             -- Step 4: turn the 'k1:v1,k2:v2,...' string of each employee into a map.
             -- str_to_map(text, delimiter1, delimiter2): "," separates the pairs,
             -- ":" separates a key from its value.
             -- The map column is aliased so the result header reads `deductions`,
             -- the same name the final insert statement gives this expression.
             SELECT
                 name,
                 str_to_map(
                     concat_ws(',', collect_set(concat_ws(':', dedkey, CAST(dedval AS STRING)))),
                     ",",
                     ":"
                 ) AS deductions
             FROM test.emp
             GROUP BY name;
                name    deductions
                u001    {"ded-k02":"20.02","ded-k03":"30.03","ded-k01":"10.01"}

  3>.最終插入複雜表的SQL

-- Populate test.employee with one row per name, built from the flat test.emp rows.
--
-- All complex columns are aggregated in a single GROUP BY over test.emp.
-- No joins are needed, because every column comes from the same table
-- grouped by the same key. This also removes a join-key problem: chaining
-- FULL JOINs on one side's name column fails to match rows whose name on
-- that side is NULL.
--
-- The select list follows the column order of test.employee:
--   name, salary, subordinates, deductions, address.
INSERT INTO TABLE test.employee
SELECT
    name,
    -- Salary is read from the source rows instead of being a fixed literal.
    -- Assumes every row of one employee carries the same salary, so max()
    -- just picks that value — TODO confirm against the real data.
    max(salary) AS salary,
    -- ARRAY<string>: every subord value of the employee.
    collect_list(subord) AS subordinates,
    -- MAP<string,string>: 'key:value' pairs joined with ',' and parsed back
    -- with str_to_map ("," between pairs, ":" between key and value).
    str_to_map(
        concat_ws(',', collect_set(concat_ws(':', dedkey, CAST(dedval AS STRING)))),
        ",",
        ":"
    ) AS deductions,
    -- ARRAY<STRUCT<...>>: one named struct per source row.
    collect_set(
        named_struct(
            'stree', stree,
            'city',  city,
            'state', state,
            'zip',   zip
        )
    ) AS address
FROM test.emp
GROUP BY name;

  4>.最終複雜表中資料查詢

-- Read back all columns of the assembled complex-type table to check the result.
SELECT *
FROM test.employee;
  employee.name    employee.salary        employee.subordinates            employee.deductions
   u001                25000            ["sub002","sub003","sub001"]    {"ded-k02":"20.02","ded-k03":"30.03","ded-k01":"10.01"}
  employee.address
  [{"stree":"china-02","city":"shanghai","state":"use","zip":100001},{"stree":"china-03","city":"lanzhou","state":"use","zip":100002},{"stree":"china","city":"beijing","state":"use","zip":100000}]