hive進行詞頻統計
統計文件信息:
$ /opt/cdh-5.3.6/hadoop-2.5.0/bin/hdfs dfs -text /user/hadoop/wordcount/input/wc.input
hadoop spark
spark hadoop
oracle mysql postgresql
postgresql oracle mysql
mysql mongodb
hdfs yarn mapreduce
yarn hdfs
zookeeper
針對於以上文件使用hive做詞頻統計:
-- Word-frequency count in Hive: stage the raw text, then materialize per-word totals.

-- One row per input line; the whole line is kept in a single string column.
create table docs (line string);

-- NOTE: `load data inpath` without `local` moves the HDFS file into the table's
-- storage location instead of copying it, so the file no longer exists at the
-- source path afterwards.
load data inpath '/user/hadoop/wordcount/input/wc.input' into table docs;

-- split() turns each line into an array of space-separated tokens, explode()
-- emits one row per token, and the outer query counts occurrences of each token.
-- A trailing space in a line yields an empty-string token, which is counted too.
create table word_counts as
select word, count(1) as count from
(select explode(split(line, ' ')) as word from docs) word
group by word
order by word;
分段解釋:
-- Use split() to break each line of the table into an array of tokens, with a single space as the delimiter:
select split(line, ' ') from docs;
["hadoop","spark",""]
["spark","hadoop"]
["oracle","mysql","postgresql"]
["postgresql","oracle","mysql"]
["mysql","mongodb"]
["hdfs","yarn","mapreduce"]
["yarn","hdfs"]
["zookeeper"]
-- Use explode() to expand each array produced by split() into one output row per element:
select explode(split(line, ' ')) as word from docs;
word
hadoop
spark
spark
hadoop
oracle
mysql
postgresql
postgresql
oracle
mysql
mysql
mongodb
hdfs
yarn
mapreduce
yarn
hdfs
zookeeper
-- The exploded rows are now in a shape that can be aggregated, so count them with plain SQL:
select word, count(1) as count from
(select explode(split(line, ' ')) as word from docs) word
group by word
order by word;
word count
1
hadoop 2
hdfs 2
mapreduce 1
mongodb 1
mysql 3
oracle 2
postgresql 2
spark 2
yarn 2
zookeeper 1
hive進行詞頻統計