Hive之Order,Sort,Cluster and Distribute By
阿新 • • 發佈:2018-11-11
- 測試資料
create table sort_test( id int, name string ) row format delimited fields terminated by '\t' lines terminated by '\n' stored as textfile; [[email protected] ~]# cat sort_test.log 4679 aaa 4728 aaa 3040 aaa 4207 aaa 2231 aaa 1279 aaa 7954 aaa 582 aaa 7096 aaa 4878 aaa 9684 aaa 1540 aaa 4826 aaa 2543 aaa 2323 aaa 1420 aaa 5083 aaa 8965 aaa 1391 aaa 9719 aaa 9901 aaa 2393 aaa 6024 aaa 444 aaa 1574 aaa 8881 aaa 5739 aaa 8689 aaa 1614 aaa 9340 aaa 6726 aaa 109 aaa 6941 aaa 9562 aaa 9019 aaa 4945 aaa 2206 aaa 5910 aaa 8552 aaa 1795 aaa 2720 aaa 9007 aaa 8377 aaa 2179 aaa 3683 aaa 5869 aaa 5448 aaa 5223 aaa 5127 aaa 4616 aaa 2340 aaa 1268 aaa 4332 aaa 2989 aaa 19 aaa 7880 aaa 505 aaa 5975 aaa 5288 aaa 5682 aaa 376 aaa 7502 aaa 6448 aaa 3774 aaa 5541 aaa 9636 aaa 2037 aaa 246 aaa 6151 aaa 7837 aaa 1506 aaa 3749 aaa 9335 aaa 3973 aaa 5160 aaa 7929 aaa 834 aaa 3451 aaa 1766 aaa 6228 aaa 8961 aaa 8177 aaa 2340 aaa 4245 aaa 3226 aaa 2670 aaa 784 aaa 7699 aaa 2054 aaa 6006 aaa 4204 aaa 8905 aaa 6182 aaa 1271 aaa 5415 aaa 5164 aaa 4320 aaa 3736 aaa 2287 aaa 6559 aaa
- Order By
- Job中只會啟動一個reduce做全域性排序,資料量大時,耗時會很久
- 在strict模式(hive.mapred.mode=strict)下,必須新增limit語句限制返回條數
# 語法格式 colOrder: ( ASC | DESC ) colNullOrder: (NULLS FIRST | NULLS LAST) -- (Note: Available in Hive 2.1.0 and later) orderBy: ORDER BY colName colOrder? colNullOrder? (',' colName colOrder? colNullOrder?)* query: SELECT expression (',' expression)* FROM src orderBy # 排序 select * from sort_test order by id desc; +---------------+-----------------+--+ | sort_test.id | sort_test.name | +---------------+-----------------+--+ | 9901 | aaa | | 9719 | aaa | | 9684 | aaa | | 9636 | aaa | | 9562 | aaa | | 9340 | aaa | | 9335 | aaa | | 9019 | aaa | | 9007 | aaa | | 8965 | aaa | | 8961 | aaa | | 8905 | aaa | | 8881 | aaa | | 8689 | aaa | | 8552 | aaa | | 8377 | aaa | | 8177 | aaa | | 7954 | aaa | | 7929 | aaa | | 7880 | aaa | | 7837 | aaa | | 7699 | aaa | | 7502 | aaa | | 7096 | aaa | | 6941 | aaa | | 6726 | aaa | | 6559 | aaa | | 6448 | aaa | | 6228 | aaa | | 6182 | aaa | | 6151 | aaa | | 6024 | aaa | | 6006 | aaa | | 5975 | aaa | | 5910 | aaa | | 5869 | aaa | | 5739 | aaa | | 5682 | aaa | | 5541 | aaa | | 5448 | aaa | | 5415 | aaa | | 5288 | aaa | | 5223 | aaa | | 5164 | aaa | | 5160 | aaa | | 5127 | aaa | | 5083 | aaa | | 4945 | aaa | | 4878 | aaa | | 4826 | aaa | | 4728 | aaa | | 4679 | aaa | | 4616 | aaa | | 4332 | aaa | | 4320 | aaa | | 4245 | aaa | | 4207 | aaa | | 4204 | aaa | | 3973 | aaa | | 3774 | aaa | | 3749 | aaa | | 3736 | aaa | | 3683 | aaa | | 3451 | aaa | | 3226 | aaa | | 3040 | aaa | | 2989 | aaa | | 2720 | aaa | | 2670 | aaa | | 2543 | aaa | | 2393 | aaa | | 2340 | aaa | | 2340 | aaa | | 2323 | aaa | | 2287 | aaa | | 2231 | aaa | | 2206 | aaa | | 2179 | aaa | | 2054 | aaa | | 2037 | aaa | | 1795 | aaa | | 1766 | aaa | | 1614 | aaa | | 1574 | aaa | | 1540 | aaa | | 1506 | aaa | | 1420 | aaa | | 1391 | aaa | | 1279 | aaa | | 1271 | aaa | | 1268 | aaa | | 834 | aaa | | 784 | aaa | | 582 | aaa | | 505 | aaa | | 444 | aaa | | 376 | aaa | | 246 | aaa | | 109 | aaa | | 19 | aaa | +---------------+-----------------+--+
- Sort By
- 一個job會啟動多個reduce,每個reduce只對自己收到的資料做區域性排序;未搭配Distribute By時,資料並不是按排序欄位分割槽,而是大致均勻地分發到各個reduce(從下方輸出可見,兩個reduce各自都包含從大到小的完整範圍)
- 如果有limit語句,會再次啟動一個job,取出每個區域性排好序的前n條,再進行全域性排序
- 只保證區域性有序,不保證全域性有序
# Sort By語法 colOrder: ( ASC | DESC ) sortBy: SORT BY colName colOrder? (',' colName colOrder?)* query: SELECT expression (',' expression)* FROM src sortBy # 設定開啟的reduce個數 set mapreduce.job.reduces=2; 0: jdbc:hive2://> set mapreduce.job.reduces; +--------------------------+--+ | set | +--------------------------+--+ | mapreduce.job.reduces=2 | +--------------------------+--+ # 執行區域性排序(未帶limit) 0: jdbc:hive2://> select * from sort_test sort by id desc; +---------------+-----------------+--+ | sort_test.id | sort_test.name | +---------------+-----------------+--+ | 9901 | aaa | | 9684 | aaa | | 9340 | aaa | | 9019 | aaa | | 9007 | aaa | | 8965 | aaa | | 8961 | aaa | | 8689 | aaa | | 8552 | aaa | | 8177 | aaa | | 7837 | aaa | | 7699 | aaa | | 7502 | aaa | | 6559 | aaa | | 6448 | aaa | | 6228 | aaa | | 6024 | aaa | | 6006 | aaa | | 5975 | aaa | | 5910 | aaa | | 5869 | aaa | | 5739 | aaa | | 5682 | aaa | | 5541 | aaa | | 5448 | aaa | | 5415 | aaa | | 5288 | aaa | | 5164 | aaa | | 5160 | aaa | | 5083 | aaa | | 4878 | aaa | | 4826 | aaa | | 4679 | aaa | | 4616 | aaa | | 4245 | aaa | | 4207 | aaa | | 3736 | aaa | | 3451 | aaa | | 3226 | aaa | | 3040 | aaa | | 2989 | aaa | | 2720 | aaa | | 2670 | aaa | | 2340 | aaa | | 2231 | aaa | | 2206 | aaa | | 2054 | aaa | | 2037 | aaa | | 1766 | aaa | | 1614 | aaa | | 1540 | aaa | | 1506 | aaa | | 1420 | aaa | | 1268 | aaa | | 834 | aaa | | 784 | aaa | | 582 | aaa | | 444 | aaa | | 376 | aaa | | 246 | aaa | | 19 | aaa | | 9719 | aaa | | 9636 | aaa | | 9562 | aaa | | 9335 | aaa | | 8905 | aaa | | 8881 | aaa | | 8377 | aaa | | 7954 | aaa | | 7929 | aaa | | 7880 | aaa | | 7096 | aaa | | 6941 | aaa | | 6726 | aaa | | 6182 | aaa | | 6151 | aaa | | 5223 | aaa | | 5127 | aaa | | 4945 | aaa | | 4728 | aaa | | 4332 | aaa | | 4320 | aaa | | 4204 | aaa | | 3973 | aaa | | 3774 | aaa | | 3749 | aaa | | 3683 | aaa | | 2543 | aaa | | 2393 | aaa | | 2340 | aaa | | 2323 | aaa | | 2287 | aaa | | 2179 | aaa | | 1795 | aaa | | 1574 | aaa | | 1391 | aaa | 
| 1279 | aaa | | 1271 | aaa | | 505 | aaa | | 109 | aaa | +---------------+-----------------+--+ # 帶limit排序(會額外再啟動一個job進行全域性排序) 0: jdbc:hive2://> select * from sort_test sort by id desc limit 300; +---------------+-----------------+--+ | sort_test.id | sort_test.name | +---------------+-----------------+--+ | 9901 | aaa | | 9719 | aaa | | 9684 | aaa | | 9636 | aaa | | 9562 | aaa | | 9340 | aaa | | 9335 | aaa | | 9019 | aaa | | 9007 | aaa | | 8965 | aaa | | 8961 | aaa | | 8905 | aaa | | 8881 | aaa | | 8689 | aaa | | 8552 | aaa | | 8377 | aaa | | 8177 | aaa | | 7954 | aaa | | 7929 | aaa | | 7880 | aaa | | 7837 | aaa | | 7699 | aaa | | 7502 | aaa | | 7096 | aaa | | 6941 | aaa | | 6726 | aaa | | 6559 | aaa | | 6448 | aaa | | 6228 | aaa | | 6182 | aaa | | 6151 | aaa | | 6024 | aaa | | 6006 | aaa | | 5975 | aaa | | 5910 | aaa | | 5869 | aaa | | 5739 | aaa | | 5682 | aaa | | 5541 | aaa | | 5448 | aaa | | 5415 | aaa | | 5288 | aaa | | 5223 | aaa | | 5164 | aaa | | 5160 | aaa | | 5127 | aaa | | 5083 | aaa | | 4945 | aaa | | 4878 | aaa | | 4826 | aaa | | 4728 | aaa | | 4679 | aaa | | 4616 | aaa | | 4332 | aaa | | 4320 | aaa | | 4245 | aaa | | 4207 | aaa | | 4204 | aaa | | 3973 | aaa | | 3774 | aaa | | 3749 | aaa | | 3736 | aaa | | 3683 | aaa | | 3451 | aaa | | 3226 | aaa | | 3040 | aaa | | 2989 | aaa | | 2720 | aaa | | 2670 | aaa | | 2543 | aaa | | 2393 | aaa | | 2340 | aaa | | 2340 | aaa | | 2323 | aaa | | 2287 | aaa | | 2231 | aaa | | 2206 | aaa | | 2179 | aaa | | 2054 | aaa | | 2037 | aaa | | 1795 | aaa | | 1766 | aaa | | 1614 | aaa | | 1574 | aaa | | 1540 | aaa | | 1506 | aaa | | 1420 | aaa | | 1391 | aaa | | 1279 | aaa | | 1271 | aaa | | 1268 | aaa | | 834 | aaa | | 784 | aaa | | 582 | aaa | | 505 | aaa | | 444 | aaa | | 376 | aaa | | 246 | aaa | | 109 | aaa | | 19 | aaa | +---------------+-----------------+--+
- Order By 和 Sort By區別
- Order By全域性排序,Sort By區域性排序
- 取TopN時,Sort By 比 Order By效率更高:每個reduce先各自排序並取出前N條,最後只需對「reduce個數 × N」條資料做一次全域性排序,而不必由單一reduce排序全部資料
- Distribute By
- 依指定欄位的值將資料分發到各個reduce,欄位值相同的資料會進入同一個reduce(只負責分發,不保證reduce內的資料有序,也不同於Group By的聚合分組)
- 通常結合Sort By語句使用,比如要讓同一個地區的資料進入同一個reduce,並在地區內對不同商家排序時,就可以寫成 Distribute By 地區 Sort By 商家
- Cluster By
- 分發且排序:Cluster By col 等價於 Distribute By col Sort By col(分發欄位與排序欄位相同),且只能按升序排序,不能指定 ASC / DESC;若需要不同的排序欄位或降序,請改用 Distribute By 搭配 Sort By
-- Usage examples. t1 is a placeholder table with columns col1 and col2.
-- Each statement is terminated with ';' so the examples can be run as a script.

-- CLUSTER BY: shorthand for DISTRIBUTE BY col1 SORT BY col1.
-- Rows with the same col1 go to the same reducer and are sorted by col1
-- (ascending only) within that reducer.
SELECT
    col1,
    col2
FROM t1
CLUSTER BY col1;

-- DISTRIBUTE BY alone: rows with the same col1 go to the same reducer,
-- but no ordering inside a reducer is guaranteed.
SELECT
    col1,
    col2
FROM t1
DISTRIBUTE BY col1;

-- DISTRIBUTE BY + SORT BY: same distribution as above, with an explicit
-- per-reducer sort order (col1 ascending, then col2 descending).
SELECT
    col1,
    col2
FROM t1
DISTRIBUTE BY col1
SORT BY col1 ASC, col2 DESC;