
Hive-UDF&GenericUDF&Hive-UDTF&Hive-UDAF

Hive UDF Introduction

In Hive, users can define custom functions to extend HiveQL; these are called UDFs (user-defined functions). Besides plain UDFs there are two further kinds: UDAFs (user-defined aggregate functions) and UDTFs (user-defined table-generating functions). Before getting to UDAF and UDTF, this chapter covers the simpler implementations, UDF and GenericUDF, and the following chapters build on them to cover UDAF and UDTF.

Hive offers two different interfaces for writing UDFs: the basic UDF interface and the more complex GenericUDF interface.

org.apache.hadoop.hive.ql.exec.UDF: a basic UDF reads and returns primitive types, i.e. the Hadoop and Hive Writable types such as Text, IntWritable, LongWritable, and DoubleWritable.

org.apache.hadoop.hive.ql.udf.generic.GenericUDF: the more complex GenericUDF can also handle Map, List, and Set types.

Using the annotation:

The @Description annotation is optional and documents the function. Its _FUNC_ placeholder stands for the function name and is replaced with the actual name when the DESCRIBE FUNCTION command is used. @Description has three attributes:

  • name: the function name in Hive.
  • value: describes the function's parameters.
  • extended: additional documentation, e.g. an example; printed by DESCRIBE FUNCTION EXTENDED name.

To use a UDF in Hive, compile the Java file, package it into a jar, add the jar to the CLASSPATH, and finally register the Java class with a CREATE FUNCTION statement:

hive> ADD JAR /root/experiment/hive/hive-0.0.1-SNAPSHOT.jar;
hive> CREATE TEMPORARY FUNCTION hello AS "edu.wzm.hive.HelloUDF";
hive> DROP TEMPORARY FUNCTION IF EXISTS hello;

UDF

Implementing a simple UDF is straightforward: extend UDF and implement the evaluate() method. evaluate() may be overloaded (an overloaded variant is sketched after the example below).

An example:

@Description(  
    name = "hello",  
    value = "_FUNC_(str) - from the input string"  
        + "returns the value that is \"Hello $str\" ",  
    extended = "Example:\n"  
        + " > SELECT _FUNC_(str) FROM src;"  
)  
public class HelloUDF extends UDF{  
      
    public String evaluate(String str){  
        try {  
            return "Hello " + str;  
        } catch (Exception e) {  
            // TODO: handle exception  
            e.printStackTrace();  
            return "ERROR";  
        }  
    }  
}
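
Since the basic UDF interface works directly with Hadoop Writable types and evaluate() may be overloaded, the following is a minimal sketch of an overloaded, Writable-based variant. The class name HelloOverloadedUDF and its behavior are illustrative assumptions, not part of the original example:

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

@Description(
    name = "hello",
    value = "_FUNC_(str) - returns \"Hello $str\"; _FUNC_(n) - returns \"Hello\" repeated n times"
)
public class HelloOverloadedUDF extends UDF {

    // Variant 1: greet a string value.
    public Text evaluate(Text str) {
        if (str == null) {
            return null;                 // NULL input produces NULL output
        }
        return new Text("Hello " + str.toString());
    }

    // Variant 2 (overload): repeat the greeting n times.
    public Text evaluate(IntWritable n) {
        if (n == null || n.get() <= 0) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < n.get(); i++) {
            sb.append("Hello ");
        }
        return new Text(sb.toString().trim());
    }
}

Hive resolves the overload from the argument types at the call site, so hello('world') and hello(3) end up in different evaluate() methods.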

GenericUDF

Implementing a GenericUDF is more involved: extend GenericUDF, work with ObjectInspectors, and validate the number and types of the arguments you receive. A GenericUDF must implement the following three methods:

// Called exactly once, before any call to evaluate(). It receives an array of ObjectInspectors
// and is where the argument count and argument types are checked.
abstract ObjectInspector initialize(ObjectInspector[] arguments);

// The counterpart of UDF's evaluate(): processes the actual arguments and returns the final result.
abstract Object evaluate(GenericUDF.DeferredObject[] arguments);

// Used when the GenericUDF fails: Hive prints the string returned by this method as the hint message.
abstract String getDisplayString(String[] children);

An example: check whether an array contains a given value.

package org.apache.hadoop.hive.ql.udf.generic;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.BooleanWritable;

@Description(name = "array_contains", value = "_FUNC_(array, value) - Returns TRUE if the array contains value.", extended = "Example:\n  > SELECT _FUNC_(array(1, 2, 3), 2) FROM src LIMIT 1;\n  true")
public class GenericUDFArrayContains extends GenericUDF {
    private static final int ARRAY_IDX = 0;
    private static final int VALUE_IDX = 1;
    private static final int ARG_COUNT = 2;
    private static final String FUNC_NAME = "ARRAY_CONTAINS";
    private transient ObjectInspector valueOI;
    private transient ListObjectInspector arrayOI;
    private transient ObjectInspector arrayElementOI;
    private BooleanWritable result;

    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        if (arguments.length != 2) {
            throw new UDFArgumentException("The function ARRAY_CONTAINS accepts 2 arguments.");
        }

        if (!(arguments[0].getCategory().equals(ObjectInspector.Category.LIST))) {
            throw new UDFArgumentTypeException(0, "\"array\" expected at function ARRAY_CONTAINS, but \""
                    + arguments[0].getTypeName() + "\" " + "is found");
        }

        this.arrayOI = ((ListObjectInspector) arguments[0]);
        this.arrayElementOI = this.arrayOI.getListElementObjectInspector();

        this.valueOI = arguments[1];

        if (!(ObjectInspectorUtils.compareTypes(this.arrayElementOI, this.valueOI))) {
            throw new UDFArgumentTypeException(1,
                    "\"" + this.arrayElementOI.getTypeName() + "\"" + " expected at function ARRAY_CONTAINS, but "
                            + "\"" + this.valueOI.getTypeName() + "\"" + " is found");
        }

        if (!(ObjectInspectorUtils.compareSupported(this.valueOI))) {
            throw new UDFArgumentException("The function ARRAY_CONTAINS does not support comparison for \""
                    + this.valueOI.getTypeName() + "\"" + " types");
        }

        this.result = new BooleanWritable(false);

        return PrimitiveObjectInspectorFactory.writableBooleanObjectInspector;
    }

    public Object evaluate(GenericUDF.DeferredObject[] arguments) throws HiveException {
        this.result.set(false);

        Object array = arguments[0].get();
        Object value = arguments[1].get();

        int arrayLength = this.arrayOI.getListLength(array);

        if ((value == null) || (arrayLength <= 0)) {
            return this.result;
        }

        for (int i = 0; i < arrayLength; ++i) {
            Object listElement = this.arrayOI.getListElement(array, i);
            if ((listElement == null)
                    || (ObjectInspectorUtils.compare(value, this.valueOI, listElement, this.arrayElementOI) != 0))
                continue;
            this.result.set(true);
            break;
        }

        return this.result;
    }

    public String getDisplayString(String[] children) {
        assert (children.length == 2);
        return "array_contains(" + children[0] + ", " + children[1] + ")";
    }
}

Summary

When writing a Hive UDF there are two options: extend the UDF class or extend the abstract class GenericUDF. The difference is that GenericUDF can handle complex-typed arguments and is also more efficient, because the plain UDF class requires Hive to invoke it via reflection.
A UDF operates on a single row at a time.

Hive-UDTF

UDTF

The previous part covered the basic UDF implementations, UDF and GenericUDF; this part introduces the more complex user-defined table-generating functions (UDTFs). A UDTF takes zero or more inputs and produces output with multiple columns or multiple rows, like explode(). To implement a UDTF, extend org.apache.hadoop.hive.ql.udf.generic.GenericUDTF and implement three methods:

// Declares the input and output: the input ObjectInspectors and the output struct.
abstract StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException;

// Processes one input record and emits the output via forward().
abstract void process(Object[] record) throws HiveException;

// Notifies the UDTF that there are no more rows to process; use it for cleanup or to emit any remaining output.
abstract void close() throws HiveException;

Note: in 0.13.0 the initialize(ObjectInspector[]) variant is deprecated and does not need to be implemented.

GenericUDTF is defined as follows:

public abstract class GenericUDTF {
    Collector collector;

    public GenericUDTF() {
        this.collector = null;
    }

    public void configure(MapredContext mapredContext) {
    }

    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        List inputFields = argOIs.getAllStructFieldRefs();
        ObjectInspector[] udtfInputOIs = new ObjectInspector[inputFields.size()];
        for (int i = 0; i < inputFields.size(); ++i) {
            udtfInputOIs[i] = ((StructField) inputFields.get(i)).getFieldObjectInspector();
        }
        return initialize(udtfInputOIs);
    }

    @Deprecated
    public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
        throw new IllegalStateException("Should not be called directly");
    }

    public abstract void process(Object[] paramArrayOfObject) throws HiveException;

    public abstract void close() throws HiveException;

    public final void setCollector(Collector collector) {
        this.collector = collector;
    }

    protected final void forward(Object o) throws HiveException {
        this.collector.collect(o);
    }
}

Let's look at an example:

_FUNC_(a) - separates the elements of array a into multiple rows, or the elements of a map into multiple rows and columns

package org.apache.hadoop.hive.ql.udf.generic;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.TaskExecutionException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;

@Description(name = "explode", value = "_FUNC_(a) - separates the elements of array a into multiple rows, or the elements of a map into multiple rows and columns ")
public class GenericUDTFExplode extends GenericUDTF {
    private transient ObjectInspector inputOI;
    private final transient Object[] forwardListObj;
    private final transient Object[] forwardMapObj;

    public GenericUDTFExplode() {
        this.inputOI = null;

        this.forwardListObj = new Object[1];
        this.forwardMapObj = new Object[2];
    }

    public void close() throws HiveException {
    }

    public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
        if (args.length != 1) {
            throw new UDFArgumentException("explode() takes only one argument");
        }

        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();

        switch (args[0].getCategory()) {
        case LIST:
            this.inputOI = args[0];
            fieldNames.add("col");
            fieldOIs.add(((ListObjectInspector) this.inputOI).getListElementObjectInspector());
            break;
        case MAP:
            this.inputOI = args[0];
            fieldNames.add("key");
            fieldNames.add("value");
            fieldOIs.add(((MapObjectInspector) this.inputOI).getMapKeyObjectInspector());
            fieldOIs.add(((MapObjectInspector) this.inputOI).getMapValueObjectInspector());
            break;
        default:
            throw new UDFArgumentException("explode() takes an array or a map as a parameter");
        }

        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    public void process(Object[] o) throws HiveException {
        switch (this.inputOI.getCategory()) {
        case LIST:
            ListObjectInspector listOI = (ListObjectInspector) this.inputOI;
            List<?> list = listOI.getList(o[0]);
            if (list == null) {
                return;
            }
            for (Object r : list) {
                this.forwardListObj[0] = r;
                forward(this.forwardListObj);
            }
            break;
        case MAP:
            MapObjectInspector mapOI = (MapObjectInspector) this.inputOI;
            Map<?, ?> map = mapOI.getMap(o[0]);
            if (map == null) {
                return;
            }
            for (Map.Entry<?, ?> r : map.entrySet()) {
                this.forwardMapObj[0] = r.getKey();
                this.forwardMapObj[1] = r.getValue();
                forward(this.forwardMapObj);
            }
            break;
        default:
            throw new TaskExecutionException("explode() can only operate on an array or a map");
        }
    }

    public String toString() {
        return "explode";
    }
}

An example that splits a string into two fields:

@Description(  
    name = "explode_name",  
    value = "_FUNC_(col) - The parameter is a column name."  
        + " The return value is two strings.",  
    extended = "Example:\n"  
        + " > SELECT _FUNC_(col) FROM src;"  
        + " > SELECT _FUNC_(col) AS (name, surname) FROM src;"  
        + " > SELECT adTable.name,adTable.surname"  
        + " > FROM src LATERAL VIEW _FUNC_(col) adTable AS name, surname;"  
)  
public class ExplodeNameUDTF extends GenericUDTF{  
  
    @Override  
    public StructObjectInspector initialize(ObjectInspector[] argOIs)  
            throws UDFArgumentException {  
          
        if(argOIs.length != 1){  
            throw new UDFArgumentException("ExplodeStringUDTF takes exactly one argument.");  
        }  
        if(argOIs[0].getCategory() != ObjectInspector.Category.PRIMITIVE
                || ((PrimitiveObjectInspector)argOIs[0]).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING){
            throw new UDFArgumentTypeException(0, "ExplodeStringUDTF takes a string as a parameter.");
        }
          
        ArrayList<String> fieldNames = new ArrayList<String>();  
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();  
        fieldNames.add("name");  
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);  
        fieldNames.add("surname");  
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);  
              
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);  
    }  
      
    @Override
    public void process(Object[] args) throws HiveException {
        // Split the input on a space and forward the parts as a single output row (name, surname).
        String input = args[0].toString();
        String[] name = input.split(" ");
        forward(name);
    }
  
    @Override
    public void close() throws HiveException {
        // Nothing to clean up.
    }
  
}

Remember: call forward() at the end to emit the output.
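
The example above emits exactly one row per input record. Since each call to forward() produces one output row, emitting multiple rows is simply a matter of calling it in a loop. Below is a minimal, self-contained sketch of that pattern; the class name ExplodeTokensUDTF and its one-token-per-row behavior are illustrative assumptions, not something from the article:

import java.util.ArrayList;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class ExplodeTokensUDTF extends GenericUDTF {

    @Override
    public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
        if (argOIs.length != 1) {
            throw new UDFArgumentException("ExplodeTokensUDTF takes exactly one argument.");
        }
        // Output struct: a single string column named "token".
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("token");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        if (args[0] == null) {
            return;                                  // a NULL input produces no output rows
        }
        String input = args[0].toString();
        for (String token : input.split("\\s+")) {
            forward(new Object[] { token });         // one forward() call per output row
        }
    }

    @Override
    public void close() throws HiveException {
        // Nothing is buffered, so there is nothing to flush here.
    }
}

Each input row can therefore fan out into zero, one, or many output rows, which is exactly what distinguishes a UDTF from a plain UDF.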

Hive-UDAF

UDAF

The previous two parts covered basic UDFs and UDTFs; this part introduces the most complex case, user-defined aggregate functions (UDAFs). A UDAF takes zero or more columns across zero or more rows and returns a single value, like sum() or count(). To implement a UDAF, we need to implement the following classes:

org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver

org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator

AbstractGenericUDAFResolver checks the input parameters and decides which evaluator to use. In the resolver only one method has to be implemented:

public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException;  

The main processing logic, however, lives in the evaluator. We extend GenericUDAFEvaluator and implement the following methods:

// Both the input and the output are ObjectInspectors.
public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException;

// The AggregationBuffer stores the intermediate result of the aggregation.
abstract AggregationBuffer getNewAggregationBuffer() throws HiveException;

// Resets the AggregationBuffer.
public void reset(AggregationBuffer agg) throws HiveException;

// Processes one input record.
public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException;

// Returns the partial aggregation over the portion of the data processed so far.
public Object terminatePartial(AggregationBuffer agg) throws HiveException;

// Merges a partial aggregation into the buffer.
public void merge(AggregationBuffer agg, Object partial) throws HiveException;

// Returns the final result.
public Object terminate(AggregationBuffer agg) throws HiveException;

Before going through the implementation, look at the UDAF enum GenericUDAFEvaluator.Mode. There are four modes (a sketch of how init() typically branches on the mode follows the list):

  • PARTIAL1: the mapper phase, going from raw data to a partial aggregation; iterate() and terminatePartial() are called.
  • PARTIAL2: the combiner phase, merging mapper output on the map side, going from partial aggregation to partial aggregation; merge() and terminatePartial() are called.
  • FINAL: the reducer phase, going from partial aggregations to the complete aggregation; merge() and terminate() are called.
  • COMPLETE: used when the job has only mappers and no reducer, so the mapper emits the final result directly, going from raw data to the complete aggregation; iterate() and terminate() are called.
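
In init(), the mode determines what the parameters array holds: in PARTIAL1 and COMPLETE these are the ObjectInspectors of the original input columns, while in PARTIAL2 and FINAL the single parameter describes the partial result produced by terminatePartial(). Below is a minimal sketch of that common branching pattern; the class name and fields are hypothetical and only illustrate the idea (GenericUDAFSumLong later in this article skips the branch: it casts to PrimitiveObjectInspector in every mode, which works because its partial result is itself a primitive long):

import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

// Hypothetical evaluator fragment showing how init() can branch on the mode.
public abstract class ModeAwareEvaluator extends GenericUDAFEvaluator {
    private PrimitiveObjectInspector inputOI;   // original column, PARTIAL1 / COMPLETE
    private LongObjectInspector partialOI;      // partial result, PARTIAL2 / FINAL

    @Override
    public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
        super.init(m, parameters);
        if (m == Mode.PARTIAL1 || m == Mode.COMPLETE) {
            // Raw rows: parameters describe the original input column(s).
            this.inputOI = (PrimitiveObjectInspector) parameters[0];
        } else {
            // PARTIAL2 / FINAL: the single parameter is the output of terminatePartial().
            this.partialOI = (LongObjectInspector) parameters[0];
        }
        // Here the partial and final results share the same writable long type.
        return PrimitiveObjectInspectorFactory.writableLongObjectInspector;
    }
}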

GenericUDAFResolver2

@Deprecated
public abstract interface GenericUDAFResolver {
    public abstract GenericUDAFEvaluator getEvaluator(TypeInfo[] paramArrayOfTypeInfo) throws SemanticException;
}

The interface above is deprecated; GenericUDAFResolver2 replaces it:

public abstract interface GenericUDAFResolver2 extends GenericUDAFResolver {
    public abstract GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo paramGenericUDAFParameterInfo)
            throws SemanticException;
}

GenericUDAFEvaluator

@UDFType(deterministic = true)
public abstract class GenericUDAFEvaluator implements Closeable {
    Mode mode;

    public static boolean isEstimable(AggregationBuffer buffer) {
        if (buffer instanceof AbstractAggregationBuffer) {
            Class clazz = buffer.getClass();
            AggregationType annotation = (AggregationType) clazz.getAnnotation(AggregationType.class);
            return ((annotation != null) && (annotation.estimable()));
        }
        return false;
    }

    public void configure(MapredContext mapredContext) {
    }

    public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
        this.mode = m;
        return null;
    }

    public abstract AggregationBuffer getNewAggregationBuffer() throws HiveException;

    public abstract void reset(AggregationBuffer paramAggregationBuffer) throws HiveException;

    public void close() throws IOException {
    }

    public void aggregate(AggregationBuffer agg, Object[] parameters) throws HiveException {
        if ((this.mode == Mode.PARTIAL1) || (this.mode == Mode.COMPLETE)) {
            iterate(agg, parameters);
        } else {
            assert (parameters.length == 1);
            merge(agg, parameters[0]);
        }
    }

    public Object evaluate(AggregationBuffer agg) throws HiveException {
        if ((this.mode == Mode.PARTIAL1) || (this.mode == Mode.PARTIAL2)) {
            return terminatePartial(agg);
        }
        return terminate(agg);
    }

    public abstract void iterate(AggregationBuffer paramAggregationBuffer, Object[] paramArrayOfObject)
            throws HiveException;

    public abstract Object terminatePartial(AggregationBuffer paramAggregationBuffer) throws HiveException;

    public abstract void merge(AggregationBuffer paramAggregationBuffer, Object paramObject) throws HiveException;

    public abstract Object terminate(AggregationBuffer paramAggregationBuffer) throws HiveException;

    public static abstract class AbstractAggregationBuffer implements GenericUDAFEvaluator.AggregationBuffer {
        public int estimate() {
            return -1;
        }
    }

    public static abstract interface AggregationBuffer {
    }

    public static enum Mode {
        PARTIAL1, PARTIAL2, FINAL, COMPLETE;
    }

    public static @interface AggregationType {
        public abstract boolean estimable();
    }
}

Examples

count

package org.apache.hadoop.hive.ql.udf.generic;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.LongWritable;

@Description(name = "count", value = "_FUNC_(*) - Returns the total number of retrieved rows, including rows containing NULL values.\n_FUNC_(expr) - Returns the number of rows for which the supplied expression is non-NULL.\n_FUNC_(DISTINCT expr[, expr...]) - Returns the number of rows for which the supplied expression(s) are unique and non-NULL.")
public class GenericUDAFCount implements GenericUDAFResolver2 {
    private static final Log LOG;

    public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
        return new GenericUDAFCountEvaluator();
    }

    public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo paramInfo) throws SemanticException {
        TypeInfo[] parameters = paramInfo.getParameters();

        if (parameters.length == 0) {
            if (!(paramInfo.isAllColumns())) {
                throw new UDFArgumentException("Argument expected");
            }
            assert (!(paramInfo.isDistinct())) : "DISTINCT not supported with *";
        } else {
            if ((parameters.length > 1) && (!(paramInfo.isDistinct()))) {
                throw new UDFArgumentException("DISTINCT keyword must be specified");
            }
            assert (!(paramInfo.isAllColumns())) : "* not supported in expression list";
        }

        return new GenericUDAFCountEvaluator().setCountAllColumns(paramInfo.isAllColumns());
    }

    static {
        LOG = LogFactory.getLog(GenericUDAFCount.class.getName());
    }

    public static class GenericUDAFCountEvaluator extends GenericUDAFEvaluator {
        private boolean countAllColumns;
        private LongObjectInspector partialCountAggOI;
        private LongWritable result;

        public GenericUDAFCountEvaluator() {
            this.countAllColumns = false;
        }

        public ObjectInspector init(GenericUDAFEvaluator.Mode m, ObjectInspector[] parameters) throws HiveException {
            super.init(m, parameters);
            this.partialCountAggOI = PrimitiveObjectInspectorFactory.writableLongObjectInspector;

            this.result = new LongWritable(0L);
            return PrimitiveObjectInspectorFactory.writableLongObjectInspector;
        }

        private GenericUDAFCountEvaluator setCountAllColumns(boolean countAllCols) {
            this.countAllColumns = countAllCols;
            return this;
        }

        public GenericUDAFEvaluator.AggregationBuffer getNewAggregationBuffer() throws HiveException {
            CountAgg buffer = new CountAgg();
            reset(buffer);
            return buffer;
        }

        public void reset(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
            ((CountAgg) agg).value = 0L;
        }

        public void iterate(GenericUDAFEvaluator.AggregationBuffer agg, Object[] parameters) throws HiveException {
            if (parameters == null) {
                return;
            }
            if (this.countAllColumns) {
                assert (parameters.length == 0);
                ((CountAgg) agg).value += 1L;
            } else {
                assert (parameters.length > 0);
                boolean countThisRow = true;
                for (Object nextParam : parameters) {
                    if (nextParam == null) {
                        countThisRow = false;
                        break;
                    }
                }
                if (countThisRow)
                    ((CountAgg) agg).value += 1L;
            }
        }

        public void merge(GenericUDAFEvaluator.AggregationBuffer agg, Object partial) throws HiveException {
            if (partial != null) {
                long p = this.partialCountAggOI.get(partial);
                ((CountAgg) agg).value += p;
            }
        }

        public Object terminate(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
            this.result.set(((CountAgg) agg).value);
            return this.result;
        }

        public Object terminatePartial(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
            return terminate(agg);
        }

        @GenericUDAFEvaluator.AggregationType(estimable = true)
        static class CountAgg extends GenericUDAFEvaluator.AbstractAggregationBuffer {
            long value;

            public int estimate() {
                return 8;
            }
        }
    }
}

sum

A UDAF is used together with GROUP BY in HiveQL; for each group, GROUP BY returns only a single row.

Developing a generic UDAF involves two steps: writing the resolver class and writing the evaluator class. The resolver is responsible for type checking and operator overloading; the evaluator implements the actual UDAF logic.

Take sum as an example.

A resolver can implement GenericUDAFResolver2 directly, but it is recommended to extend AbstractGenericUDAFResolver instead, which insulates the implementation from future changes to the Hive interfaces.

public class GenericUDAFSum extends AbstractGenericUDAFResolver {
    static final Log LOG = LogFactory.getLog(GenericUDAFSum.class.getName());

    public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
        if (parameters.length != 1) {
            throw new UDFArgumentTypeException(parameters.length - 1, "Exactly one argument is expected.");
        }

        if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
            throw new UDFArgumentTypeException(0,
                    "Only primitive type arguments are accepted but " + parameters[0].getTypeName() + " is passed.");
        }

        switch (((org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) parameters[0]).getPrimitiveCategory()) {
        case BYTE:
        case SHORT:
        case INT:
        case LONG:
            return new GenericUDAFSumLong();
        case TIMESTAMP:
        case FLOAT:
        case DOUBLE:
        case STRING:
        case VARCHAR:
        case CHAR:
            return new GenericUDAFSumDouble();
        case DECIMAL:
            return new GenericUDAFSumHiveDecimal();
        case BOOLEAN:
        case DATE:
        default:
            throw new UDFArgumentTypeException(0,
                    "Only numeric or string type arguments are accepted but " + parameters[0].getTypeName() + " is passed.");
        }
    }

    // The evaluator inner classes (GenericUDAFSumLong, GenericUDAFSumDouble, GenericUDAFSumHiveDecimal) follow.
}

This is the code skeleton of the UDAF: create a Log object and override getEvaluator(), which returns the correct evaluator for the parameter types passed in from the SQL. This is essentially where operator overloading is implemented.

Implementing the evaluator

GenericUDAFSumLong is used as the example below.

public static class GenericUDAFSumLong extends GenericUDAFEvaluator {
        private PrimitiveObjectInspector inputOI;
        private LongWritable result;
        private boolean warned;

        public GenericUDAFSumLong() {
            this.warned = false;
        }
        // Returns the UDAF's return type; here the return type is long.
        public ObjectInspector init(GenericUDAFEvaluator.Mode m, ObjectInspector[] parameters) throws HiveException {
            assert (parameters.length == 1);
            super.init(m, parameters);
            this.result = new LongWritable(0L);
            this.inputOI = ((PrimitiveObjectInspector) parameters[0]);
            return PrimitiveObjectInspectorFactory.writableLongObjectInspector;
        }

        // Allocates the buffer needed for a new aggregation, used to store the running sum during the mapper, combiner, and reducer stages.
        public GenericUDAFEvaluator.AggregationBuffer getNewAggregationBuffer() throws HiveException {
            SumLongAgg result = new SumLongAgg();
            reset(result);
            return result;
        }

        // MapReduce reuses mapper and reducer objects, so for compatibility the buffer must be reusable (resettable) as well.
        public void reset(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
            SumLongAgg myagg = (SumLongAgg) agg;
            myagg.empty = true;
            myagg.sum = 0L;
        }
        
        // Map phase: just add the input parameter to the object agg that stores the running sum.
        public void iterate(GenericUDAFEvaluator.AggregationBuffer agg, Object[] parameters) throws HiveException {
            assert (parameters.length == 1);
            try {
                merge(agg, parameters[0]);
            } catch (NumberFormatException e) {
                if (!(this.warned)) {
                    this.warned = true;
                    GenericUDAFSum.LOG.warn(super.getClass().getSimpleName() + " " + StringUtils.stringifyException(e));
                }
            }
        }

        // The result returned when the mapper finishes and when the combiner finishes.
        public Object terminatePartial(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
            return terminate(agg);
        }
        
        // The combiner merges the results returned by the map phase, and the reducer merges the results returned by mappers or combiners.
        public void merge(GenericUDAFEvaluator.AggregationBuffer agg, Object partial) throws HiveException {
            if (partial != null) {
                SumLongAgg myagg = (SumLongAgg) agg;
                myagg.sum += PrimitiveObjectInspectorUtils.getLong(partial, this.inputOI);
                myagg.empty = false;
            }
        }

        // The reducer returns the final result, or, when there is only a mapper and no reducer, the mapper returns it directly.
        public Object terminate(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
            SumLongAgg myagg = (SumLongAgg) agg;
            if (myagg.empty) {
                return null;
            }
            this.result.set(myagg.sum);
            return this.result;
        }
        
        // The class that stores the sum value.
        @GenericUDAFEvaluator.AggregationType(estimable = true)
        static class SumLongAgg extends GenericUDAFEvaluator.AbstractAggregationBuffer {
            boolean empty;
            long sum;

            public int estimate() {
                return 12;
            }
        }
    }

Link: https://www.jianshu.com/p/7ebc8f9c9b78