1. 程式人生 > >hive-分析函數

hive-分析函數

uil 訪問 查詢 重用 計數 into 最終 創建索引 UC

bitmap
--------------
    位圖/位映射。
    5
    
hive
--------------
    分區表
    udtf函數
    wordcount
    lateral view            //和udtf配合使用。
    order by                //數據傾斜
    sort by                    //reduce內排序
    distribute by            //分區
    cluster by                //sort by + distribute by

hive
-------------- --顯示表頭 hive>set hive.cli.print.header=true ; hive分析函數 -------------- 1.準備數據 create table emp ( id int , empno string , age int , salary float , deptno int ) ROW FORMAT DELIMITED FIELDS TERMINATED BY
',' STORED AS TEXTFILE; 2.數據文件內容(emp.txt) 1,tom1,28,3000,1 2,tom2,29,5000,2 3,tom3,30,5400,1 4,tom4,28,6600,3 5,tom5,27,8000,2 6,tom6,35,10000,3 3.加載數據 load data local inpath '/home/centos/emp.txt' into table emp ; hive分析函數應用 ----------------- 0.簡介 hive分析函數可以連接每條記錄形成數據集,字段甚至可以不是分組字段,使用一次mr 完成聚合查詢。 常規的sql下的分組聚合有諸多限制,其中select字段必須是分組字段,有時需要多次mr. select deptno , max(salary) from emp group by deptno ;
1.分析函數 -- over , 只是分區 SELECT id, empno, salary ,deptno ,max(salary) OVER (PARTITION BY deptno) AS max from emp ; -- OVER + ORDER BY , 分區並在分區內排序 SELECT empno, deptno, salary ,SUM(salary) OVER(PARTITION BY deptno ORDER BY salary) AS t1 from emp; -- OVER ... rows unbounded preceding 基於前導所有行的匯總操作。 SELECT empno, deptno, salary , SUM(salary) OVER(ORDER BY deptno, empno rows unbounded preceding) AS t3 from emp ; -- RANK, 排名操作 ,計算每個部門內按照工資的降序進行排名(有縫,並列情況存在縫隙)絕對排名。 SELECT empno, deptno, salary, RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ; -- dense_rank()密度排名,無縫。絕對排名。 SELECT empno, deptno, salary, Dense_RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ; -- percent_rank()百分比排名,相對排名. SELECT empno, deptno, salary, percent_RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ; --NTILE(n) ,分桶操作,將數據均勻分散到各個桶中。 SELECT empno, deptno, salary , NTILE(4) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ; -- lead()前導操作,查詢從當前開始,後續第幾行的操作。 SELECT empno, deptno, salary, LEAD(salary, 2) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ; --lag,從當前行計數,訪問之前的第幾行salary,如果超過窗口範圍返回null。 SELECT empno, deptno, salary, lag(salary, 1) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ; --first_value() SELECT empno, deptno, salary, first_value(salary) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ; --last_value() SELECT empno, deptno, salary, last_value(salary) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ; --使用range開窗函數 RANGE BETWEEN ... AND ...,在分區內再劃分記錄範圍。 SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS t1 from emp ; SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS t1 from emp ; --RANGE : 對值的+/-. 
SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN UNBOUNDED PRECEDING AND current row) AS t1 from emp ; SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN UNBOUNDED PRECEDING AND current row) AS t1 from emp ; --range : 計算的是值 SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN 2000 PRECEDING AND 2000 FOLLOWING) AS t1 from emp ; --rows : 計算的是行 SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN 2 PRECEDING AND 2 FOLLOWING) AS t1 from emp ; --窗口重用 SELECT empno, deptno, salary , MAX(salary) OVER w1 AS mx,MIN(salary) OVER w1 AS mn,AVG(salary) OVER w1 AS ag from emp WINDOW w1 AS (PARTITION BY deptno ORDER BY salary desc ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) ; [開窗函數] range|rows between ... and ... ; range是值偏移,rows是行偏移。 2.統計員工數量 select max(salary) from emp group by deptno union select max(salary) from emp group by age union select max(salary) from emp group by deptno,age ; hive高級聚合 --------------- 1.grouping sets 作用等同於union. select deptno , age , count(1) from emp group by deptno,age grouping sets(deptno ,age ,(deptno,age) ) ; 3.rollup select ... from ... GROUP BY a,b,c WITH ROLLUP ; select ... from ... GROUP BY a,b,c GROUPING SETS ((a,b,c),(a,b),(a),()) select deptno , age , count(1) from emp group by deptno,age with rollup ; 4.cube select ... from ... GROUP BY a,b,c WITH cube ; select ... from ... 
GROUP BY a,b,c GROUPING SETS ((a),(a,b),(a,c),(a,b,c) ,(b),(b,c),(c),()) select deptno , age , count(1) from emp group by deptno,age with cube ; hive優化 ------------------- 1.創建索引 本質上就是表,對於檢索單條記錄是有優勢的。 排序的。 --創建索引 CREATE INDEX idx_emp ON TABLE emp (empno) AS 'COMPACT' WITH DEFERRED REBUILD; --生成索引 ALTER INDEX idx_emp ON emp REBUILD; --查詢是通過索引列查詢 select * from emp where empno = 'tom1' ; 2.文件格式 [列存儲格式] parquet , orcfile , orc // 投影查詢時,發揮磁盤的線性讀寫。 select id,empno from emp ; //創建表,存儲成parquet格式 create table pq1(id int , empno string, age int ,salary float , deptno int ) stored as parquet ; //加載數據 insert into pq1 select * from emp ; [行存儲] txt 3.壓縮 減少數據傳輸量,降低網絡IO的負載。 --在多級job中,job之間的結果是否需要壓縮。 SET hive.exec.compress.intermediate=true ; SET hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec ; SET hive.intermediate.compression.codec=org.apache.hadoop.io.compress.GzipCodec ; SET hive.intermediate.compression.type=record|block|none ; --控制job的最終輸出是否壓縮. SET hive.exec.compress.output=true; SET mapred.output.compression.codec= org.apache.hadoop.io.compress.GzipCodec; 4.大量小文件 導致map過多。 1.歸檔 hadoop Archive and HAR進行歸檔。 2.轉換成序列文件 將小文件合成SequenceFile. 3.org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat alter table emp set inputformat=org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat --修改默認輸入格式 set hive.input.format=org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat ; --建新表 create table emp2 ( id int , empno string , age int , salary float , deptno int ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS INPUTFORMAT 'org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' ;

hive-分析函數