hive UDF 開發示例
阿新 • • 發佈:2018-12-19
一、建立一個java專案
對應的pom檔案
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.credithc</groupId>
  <artifactId>hive_udf_v1.0</artifactId>
  <version>1.0-SNAPSHOT</version>

  <!-- Version parameters for the hadoop/hive stack being targeted. -->
  <properties>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <maven-compiler-plugin.version>3.7.0</maven-compiler-plugin.version>
  </properties>

  <!-- CDH hadoop/hive artifacts live in Cloudera's own repository; -->
  <!-- this block is unnecessary when using Apache-distribution hadoop/hive. -->
  <repositories>
    <repository>
      <id>cloudera</id>
      <url>http://repository.cloudera.com/artifactory/cloudera-repos</url>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>org.elasticsearch</groupId>
      <artifactId>elasticsearch-hadoop-mr</artifactId>
      <version>5.6.3</version>
    </dependency>
    <dependency>
      <groupId>org.elasticsearch</groupId>
      <artifactId>elasticsearch-hadoop-hive</artifactId>
      <version>5.6.3</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-exec</artifactId>
      <version>2.0.0</version>
    </dependency>
    <!-- junit: Java unit-testing framework, test scope only. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.10</version>
      <scope>test</scope>
    </dependency>
    <!-- NOTE: maven-resources-plugin was previously declared here as a
         <dependency>; it is a build plugin, not a library, and must not be
         on the compile classpath. It has been removed from <dependencies>. -->
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <!-- Use the property defined above instead of a conflicting
             hardcoded 3.5.1, so the version is configured in one place. -->
        <version>${maven-compiler-plugin.version}</version>
        <configuration>
          <source>${maven.compiler.source}</source>
          <target>${maven.compiler.target}</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
二、UDF函式建立開發:
package com.credithc.rc.kg.udf;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Hive UDF that extracts a single field from an SMS-style message string.
 *
 * Usage in HiveQL: message_udf(str, params) where {@code params} selects the
 * field to extract: "time" for a date expression, "bankName" for the bank
 * name enclosed in square brackets. Returns NULL for any other key, for
 * empty input, or when nothing matches.
 *
 * Created by glin on 2018/11/1 0001. com.credithc.rc.kg.udf.MessageDecodeUdf
 */
public class MessageDecodeUdf extends UDF {

    // Patterns are compiled once per JVM instead of once per row; a UDF's
    // evaluate() is invoked for every row of the query.
    //
    // FIX: the original separator classes were [-|/|.], which makes '|' a
    // literal member of the character class (inside [] '|' is not an
    // alternation operator), so "2018|12|19" would wrongly match. Use [-/.].
    private static final Pattern TIME_PATTERN = Pattern.compile(
            "\\d{4}年\\d{1,2}月\\d{1,2}日|\\d{1,2}月\\d{1,2}日|\\d{4}[-/.]\\d{1,2}[-/.]\\d{1,2}");
    // Captures the bank name inside [...], e.g. "[建設銀行]" -> "建設銀行".
    private static final Pattern BANK_PATTERN = Pattern.compile("\\[(.+?銀行)\\]");

    public MessageDecodeUdf() {
    }

    /**
     * Entry point called by Hive for each row.
     *
     * @param str    the raw message text to parse
     * @param params extraction key: "time" or "bankName"
     * @return the extracted substring, or null when input is empty, the key
     *         is unknown, or no match is found
     */
    public String evaluate(String str, String params) {
        if (StringUtils.isEmpty(str) || StringUtils.isEmpty(params)) {
            return null;
        }
        String re = null;
        try {
            switch (params) {
                // Date/time extraction
                case "time":
                    re = parserTime(str);
                    break;
                // Bank-name extraction
                case "bankName":
                    re = parserBankName(str);
                    break;
                default:
                    // Unknown extraction key -> NULL (explicit, was implicit before)
                    break;
            }
        } catch (Exception e) {
            // Deliberate best-effort: a malformed row yields NULL instead of
            // failing the whole Hive query.
        }
        return re;
    }

    /**
     * Extracts the first date expression: "yyyy年M月d日", "M月d日",
     * or "yyyy-M-d" with '-', '/' or '.' as the separator.
     *
     * @return the matched date text, or null when absent
     */
    public String parserTime(String str) {
        Matcher m = TIME_PATTERN.matcher(str);
        return m.find() ? m.group(0) : null;
    }

    /**
     * Extracts the first bank name written as "[...銀行]".
     *
     * @return the bank name without the surrounding brackets, or null
     */
    public String parserBankName(String str) {
        Matcher m = BANK_PATTERN.matcher(str);
        return m.find() ? m.group(1) : null;
    }

    // FIX: in the original source, main() was placed AFTER the class's
    // closing brace, which does not compile; it now lives inside the class.
    public static void main(String[] args) {
        MessageDecodeUdf dd = new MessageDecodeUdf();
        System.out.println(dd.evaluate(" 。下載“中國建設銀行”手機銀行APP 。[建設銀行]", "time"));
    }
}
測試執行結果:
三、匯出 jar包:
四、上傳hive測試:
找到該函式,滑鼠右鍵選擇Copy Reference 獲得該函式的全路徑:com.credithc.rc.kg.udf.MessageDecodeUdf
-- 先將jar包上傳到HDFS,再透過 add jar 將其匯入到classpath變數裡
hdfs dfs -put /home/sd/test/hive_udf_v1.0-1.0-SNAPSHOT.jar /user/sd/hive_udf/
add jar hdfs:///user/sd/hive_udf/hive_udf_v1.0-1.0-SNAPSHOT.jar;
list jars; -- 檢視匯入的jar包
create temporary function message_udf as 'com.credithc.rc.kg.udf.MessageDecodeUdf'; -- 建立一個臨時函式,關聯該jar包
使用測試:
select message_udf ( str, params) from kkkk;