
Hive: Integrating MySQL to Store Metadata, with Basic Operation Examples

I. Overview

In real-world work, Hive needs a unified storage medium for its metadata, and that medium is usually a MySQL database. So how do you get Hive to store its metadata in MySQL? Today, in this Hive installment of the big-data introduction series, we walk through integrating MySQL as the metadata store.

II. Environment Setup

1. Install the MySQL database

2. Install Hadoop

(1) Pseudo-distributed installation

(2) Cluster installation

(3) High-availability (HA) cluster installation

3. Install Hive

The Hive release used here is hive-0.12.0.tar.gz; other Hive versions work as well.

3.1 Upload and extract

tar -zxvf hive-0.12.0.tar.gz

3.2 Edit the configuration file

Go into the conf directory under HIVE_HOME and rename the file hive-default.xml.template to hive-site.xml, then edit its contents.
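A minimal sketch of the rename step, assuming HIVE_HOME points at the extracted Hive directory:

cd $HIVE_HOME/conf
mv hive-default.xml.template hive-site.xml

The edited hive-site.xml looks like this: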

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>


<!-- Hive Execution Parameters -->
<property>
	<!-- JDBC URL of the MySQL database that holds the metastore;
	     createDatabaseIfNotExist creates the hive database on first connection -->
	<name>javax.jdo.option.ConnectionURL</name>
	<value>jdbc:mysql://127.0.0.1:3306/hive?createDatabaseIfNotExist=true</value>
</property>
<property>
	<!-- JDBC driver class for MySQL -->
	<name>javax.jdo.option.ConnectionDriverName</name>
	<value>com.mysql.jdbc.Driver</value>
</property>
<property>
	<!-- MySQL user name -->
	<name>javax.jdo.option.ConnectionUserName</name>
	<value>root</value>
</property>
<property>
	<!-- MySQL password -->
	<name>javax.jdo.option.ConnectionPassword</name>
	<value>root</value>
</property>


</configuration>
Then copy the MySQL JDBC driver JAR into the lib directory under HIVE_HOME.

Once that is configured, start Hive.
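For example (a sketch; the connector JAR version here is only illustrative, use the one matching your MySQL server, and the credentials are the ones configured above):

cp mysql-connector-java-5.1.39-bin.jar $HIVE_HOME/lib/
$HIVE_HOME/bin/hive

On first startup Hive creates its metadata tables in the hive database, which you can verify from MySQL:

mysql -uroot -proot -e 'use hive; show tables;'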

III. Basic Hive Operation Examples

1. Create two tables in Hive

create table trade_detail (id bigint, account string, income double, expenses double, time string) row format delimited fields terminated by '\t';
create table user_info (id bigint, account string, name  string, age int) row format delimited fields terminated by '\t';

2. Import data from MySQL directly into Hive (using Sqoop)

sqoop import --connect jdbc:mysql://192.168.1.10:3306/lyz --username root --password 123 --table trade_detail --hive-import --hive-overwrite --hive-table trade_detail --fields-terminated-by '\t'
sqoop import --connect jdbc:mysql://192.168.1.10:3306/lyz --username root --password 123 --table user_info --hive-import --hive-overwrite --hive-table user_info --fields-terminated-by '\t'
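To sanity-check the import, you can count the rows from the shell (a sketch; table names as above):

hive -e 'select count(*) from trade_detail'
hive -e 'select count(*) from user_info'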

3. Create a result table to store the output of the aggregation query

create table result row format delimited fields terminated by '\t' as select t2.account, t2.name, t1.income, t1.expenses, t1.surplus from user_info t2 join (select account, sum(income) as income, sum(expenses) as expenses, sum(income-expenses) as surplus from trade_detail group by account) t1 on (t1.account = t2.account);
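A quick way to inspect the aggregated output (a sketch):

select * from result limit 10;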

4. Load data from the local file system into Hive

create table user (id int, name string) row format delimited fields terminated by '\t';
load data local inpath '/root/user.txt' into table user;
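For reference, /root/user.txt is expected to be tab-separated, one record per line; made-up sample contents:

1	tom
2	jerry

select * from user;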

5. Create an external table

The difference between Hive managed (internal) tables and external tables: when you drop a managed table, Hive deletes the table's data files along with its metadata, whereas dropping an external table leaves the data files untouched. External tables are declared with the external keyword.

create external table stubak (id int, name string) row format delimited fields terminated by '\t' location '/stubak';
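To see the external-table behavior in practice, drop the table and check that the files under /stubak are still there (a sketch):

hive -e 'drop table stubak'
hadoop fs -ls /stubak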

6. Create a partitioned table

Difference between regular and partitioned tables: when data keeps arriving in large volumes, create a partitioned table so queries can prune to the relevant partitions instead of scanning the whole table.

create table book (id bigint, name string) partitioned by (pubdate string) row format delimited fields terminated by '\t'; 

7. Load data into a partitioned table

load data local inpath './book.txt' overwrite into table book partition (pubdate='2017-10-21'); 
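The partition column then behaves like a pseudo-column, so queries can prune by it (a sketch using the table above):

show partitions book;
select * from book where pubdate='2017-10-21';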

IV. Other Operation Examples

//print column headers in CLI query results
set hive.cli.print.header=true;

CREATE TABLE page_view(viewTime INT, userid BIGINT,
     page_url STRING, referrer_url STRING,
     ip STRING COMMENT 'IP Address of the User')
 COMMENT 'This is the page view table'
 PARTITIONED BY(dt STRING, country STRING)
 ROW FORMAT DELIMITED
   FIELDS TERMINATED BY '\001'
STORED AS SEQUENCEFILE;

//sequencefile: a table stored in Hadoop SequenceFile format
create table tab_ip_seq(id int,name string,ip string,country string) 
    row format delimited
    fields terminated by ','
    stored as sequencefile;
    
//bulk-insert data with a select statement
insert overwrite table tab_ip_seq select * from tab_ext;


//create & load
create table tab_ip(id int,name string,ip string,country string) 
    row format delimited
    fields terminated by ','
    stored as textfile;
    
//load data from the local file system into a hive table (this essentially uploads the file into the table's directory under Hive's warehouse on HDFS)
load data local inpath '/home/hadoop/ip.txt' into table tab_ext;

//load data from hdfs into a hive table (this essentially moves the file from its original directory into the hive-managed directory)
load data inpath 'hdfs://ns1/aa/bb/data.log' into table tab_user;


//external table
CREATE EXTERNAL TABLE tab_ip_ext(id int, name string,
     ip STRING,
     country STRING)
 ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
 STORED AS TEXTFILE
 LOCATION '/external/user';
 

// CTAS: derive the new table's schema from a select statement
CREATE TABLE tab_ip_ctas
   AS
SELECT id new_id, name new_name, ip new_ip,country new_country
FROM tab_ip_ext
SORT BY new_id;


//CLUSTER (bucketed tables; a somewhat more advanced feature you can come back to later)
create table tab_ip_cluster(id int,name string,ip string,country string)
clustered by(id) into 3 buckets;

//note: a plain "load data" just copies the file and does NOT bucket the rows;
//bucketing only happens via insert ... select with hive.enforce.bucketing enabled
load data local inpath '/home/hadoop/ip.txt' overwrite into table tab_ip_cluster;
set hive.enforce.bucketing=true;
insert into table tab_ip_cluster select * from tab_ip;

//bucket sampling: take bucket 2 of 3, hashing on id
select * from tab_ip_cluster tablesample(bucket 2 out of 3 on id);



//PARTITION: partitioned table
create table tab_ip_part(id int,name string,ip string,country string) 
    partitioned by (year string)
    row format delimited fields terminated by ',';
    
load data local inpath '/home/hadoop/data.log' overwrite into table tab_ip_part
     partition(year='1990');
    
    
load data local inpath '/home/hadoop/data2.log' overwrite into table tab_ip_part
     partition(year='2000');

select * from tab_ip_part;

select * from tab_ip_part where year='2000';
select count(*) from tab_ip_part where year='2000';


//rename/retype a column, and add a partition with an explicit location
alter table tab_ip change id id_alter string;
ALTER TABLE tab_cts ADD PARTITION (partCol = 'dt') location '/external/hive/dt';

show partitions tab_ip_part;

//insert from select: bulk-insert query results into another table
create table tab_ip_like like tab_ip;
insert overwrite table tab_ip_like
    select * from tab_ip;
   
//write query results out to a directory, local or on hdfs
//(the path is treated as a directory, even when it looks like a file name)
insert overwrite local directory '/home/hadoop/hivetemp/test.txt' select * from tab_ip_part where year='1990';
insert overwrite directory '/hiveout.txt' select * from tab_ip_part where year='1990';

//cli shell: run hql from the OS shell in silent mode
hive -S -e 'select country,count(*) from tab_ext group by country' > /home/hadoop/hivetemp/e.txt

select * from tab_ext sort by id desc limit 5;

select a.ip,b.book from tab_ext a join tab_ip_book b on(a.name=b.name);


//array 
create table tab_array(a array<int>,b array<string>)
row format delimited
fields terminated by '\t'
collection items terminated by ',';

select a[0] from tab_array;
select * from tab_array where array_contains(b,'word');
insert into table tab_array select array(0),array(name,ip) from tab_ext t; 

//map
create table tab_map(name string,info map<string,string>)
row format delimited
fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':';

load data local inpath '/home/hadoop/hivetemp/tab_map.txt' overwrite into table tab_map;
insert into table tab_map select name,map('name',name,'ip',ip) from tab_ext; 

//struct
create table tab_struct(name string,info struct<age:int,tel:string,addr:string>)
row format delimited
fields terminated by '\t'
collection items terminated by ',';

load data local inpath '/home/hadoop/hivetemp/tab_st.txt' overwrite into table tab_struct;
insert into table tab_struct select name,named_struct('age',id,'tel',name,'addr',country) from tab_ext;



//UDF
select if(id=1,'first','no-first'),name from tab_ext;

hive> add jar /home/hadoop/myudf.jar;
hive> CREATE TEMPORARY FUNCTION fanyi AS 'cn.lyz.hive.Fanyi';
select id,name,ip,fanyi(country) from tab_ip_ext;
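The original does not show the UDF class itself; below is a minimal hypothetical sketch of what cn.lyz.hive.Fanyi might look like against the Hive 0.12-era UDF API (the translation logic here is made up purely for illustration):

package cn.lyz.hive;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

// Hypothetical "fanyi" (translate) UDF: Hive resolves evaluate() by reflection.
public class Fanyi extends UDF {
    public Text evaluate(Text country) {
        if (country == null) {
            return null;
        }
        // Toy mapping; a real UDF would carry the actual translation logic.
        if ("china".equalsIgnoreCase(country.toString())) {
            return new Text("中国");
        }
        return country;
    }
}

Compile it against the Hive and Hadoop jars, package it as myudf.jar, and register it with the add jar / CREATE TEMPORARY FUNCTION statements shown above.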