impala udf函式實現中文擷取
阿新 • • 發佈:2019-01-26
目前,impala 的substr函式及substring函式都不支援中文的擷取,因此,需要通過udf函式實現。具體的實現效果需要與substr的英文效果相同。具體如下:
SUBSTR("abcde",3)=cde
SUBSTR("abcde",-2)=de
SUBSTR("abcde",3,2)=cd
SUBSTR("abcde",-4,2)=bc
對於impala udf函式,可以採用C++ 和 Java , 但出於效率考慮,一般採用c++(https://www.cloudera.com/documentation/enterprise/5-5-x/topics/search_prepare_install_search.html 以及
http://blog.csdn.net/yu616568/article/details/52746332)。
此處要求支援2個引數和3個引數,因此,在udf中,需要新增2個過載方法。
具體步驟:
1、下載impala-udf-devel 這個包。方法:
> git clone https://github.com/laserson/impala-udf-devel.git
> cd impala-udf-devel/
> cmake .
2、在 impala-udf-devel 目錄下編輯 udf-substr.cc 和 udf-substr.h 兩個檔案,可以先將 udf 目錄下的 udf.cc、udf.h 兩個檔案拷貝到父目錄,具體如下:
udf-substr.cc
#include "udf-substr.h"
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <string>
using namespace std;
// Single-bit masks for the five most significant bits of a byte; together
// they classify the first byte of a UTF-8 sequence.
const unsigned char kFirstBitMask = 128; // 1000 0000
const unsigned char kSecondBitMask = 64; // 0100 0000
const unsigned char kThirdBitMask = 32; // 0010 0000
const unsigned char kFourthBitMask = 16; // 0001 0000
const unsigned char kFifthBitMask = 8; // 0000 1000

/**
 * Returns how many bytes the UTF-8 sequence introduced by firstByte occupies.
 *
 * @param firstByte  First byte of a UTF-8 encoded character.
 * @return 1 for ASCII (0xxxxxxx), 2 for 110xxxxx, 3 for 1110xxxx and
 *         4 for 11110xxx. Bytes that cannot start a sequence (a stray
 *         continuation byte 10xxxxxx, or 11111xxx) are reported as 1 so that a
 *         caller scanning malformed input advances one byte at a time and
 *         never skips over the next valid lead byte.
 */
int utf8_char_len(char firstByte)
{
    // Work on the unsigned value so the bit tests do not depend on whether
    // plain char is signed on this platform.
    const unsigned char lead = static_cast<unsigned char>(firstByte);

    if(!(lead & kFirstBitMask))
    {
        return 1; // 0xxxxxxx: plain ASCII
    }
    if(!(lead & kSecondBitMask))
    {
        return 1; // 10xxxxxx: continuation byte, not a valid lead byte
    }
    if(!(lead & kThirdBitMask))
    {
        return 2; // 110xxxxx: two-byte sequence
    }
    if(!(lead & kFourthBitMask))
    {
        return 3; // 1110xxxx: three-byte sequence
    }
    if(!(lead & kFifthBitMask))
    {
        return 4; // 11110xxx: four-byte sequence
    }
    return 1; // 11111xxx: never valid in UTF-8
}
int getStringLength( const StringVal& str){
int index = 0;
int cnt = 0;
int i = 0;
std::string content((const char *)str.ptr,str.len);
while(index < str.len){
i = utf8_char_len(content[index]);
index = index + i;
cnt++;
}
return cnt;
}
/**
 * Character-aware SUBSTR for UTF-8 text (three-argument overload).
 *
 * Positions and lengths are counted in UTF-8 characters rather than bytes, so
 * multi-byte characters such as Chinese are never split.
 *
 * @param context  UDF context, used to allocate the returned buffer.
 * @param str      Input string (UTF-8 bytes).
 * @param start    1-based character position of the first character to return;
 *                 a negative value counts back from the end of the string
 *                 (-1 is the last character).
 * @param offsets  Maximum number of characters to return.
 * @return The selected characters. An empty (non-NULL) string is returned when
 *         any argument is NULL, start is 0, offsets is negative, start lies
 *         outside the string, or no characters are selected. If the result
 *         buffer cannot be allocated, the NULL StringVal produced by the
 *         allocating constructor is returned.
 */
StringVal udf_substr(FunctionContext* context, const StringVal& str , const IntVal& start , const IntVal& offsets){
    if(str.is_null || start.is_null || offsets.is_null){
        return "";
    }
    if(str.len <= 0 || start.val == 0 || offsets.val < 0){
        return "";
    }
    const int byte_len = str.len;

    // Widen before negating so the most negative int cannot overflow.
    const long long start_pos = start.val;
    const long long magnitude = start_pos < 0 ? -start_pos : start_pos;
    if(magnitude > byte_len){
        // Cheap rejection: a string never holds more characters than bytes.
        return "";
    }

    // Number of characters that precede the requested substring.
    long long chars_to_skip = start_pos - 1;
    if(start_pos < 0){
        chars_to_skip = getStringLength(str) + start_pos;
        if(chars_to_skip < 0){
            // Negative start reaches back past the first character.
            return "";
        }
    }

    // Translate the character offset into a byte offset.
    int begin = 0;
    for(long long skipped = 0; skipped < chars_to_skip && begin < byte_len; ++skipped){
        begin += utf8_char_len(static_cast<char>(str.ptr[begin]));
    }
    if(begin >= byte_len){
        // start points past the last character.
        return "";
    }

    // Advance over at most offsets.val characters to find the end byte offset.
    int end = begin;
    for(int taken = 0; taken < offsets.val && end < byte_len; ++taken){
        end += utf8_char_len(static_cast<char>(str.ptr[end]));
    }
    if(end > byte_len){
        // A truncated trailing sequence must not make us read past the buffer.
        end = byte_len;
    }

    const int out_len = end - begin;
    if(out_len == 0){
        return "";
    }

    StringVal result(context, out_len);
    if(result.is_null){
        // Allocation failed; do not write through the missing buffer.
        return result;
    }
    memcpy(result.ptr, str.ptr + begin, out_len);
    return result;
}
/**
 * Character-aware SUBSTR for UTF-8 text (two-argument overload).
 *
 * Returns everything from the given character position to the end of the
 * string. The work is delegated to the three-argument overload so that the
 * argument checks and the character-to-byte translation exist in one place
 * only.
 *
 * @param context  UDF context, forwarded to the three-argument overload.
 * @param str      Input string (UTF-8 bytes).
 * @param start    1-based character position; a negative value counts back
 *                 from the end of the string.
 * @return The tail of str starting at start, or an empty string when str is
 *         NULL or the three-argument overload rejects the arguments.
 */
StringVal udf_substr(FunctionContext* context, const StringVal& str , const IntVal& start ){
    if(str.is_null){
        return "";
    }
    // The byte length is an upper bound on the character count, so asking for
    // that many characters always reaches the end of the string.
    return udf_substr(context, str, start, IntVal(str.len));
}
3、修改CMakeList 檔案。如下所示:
主要修改這一段:
# Build the UDA/UDFs into a shared library. You can have multiple UDFs per
# file, and/or specify multiple files here.
# The target name substr_udf is what gives the library its file name
# (libsubstr_udf.so) in the build output.
add_library(substr_udf SHARED udf-substr.cc)
# The resulting LLVM IR module will have the same name as the .cc file.
# This step only runs when CLANG_EXECUTABLE is set.
if (CLANG_EXECUTABLE)
COMPILE_TO_IR(udf-substr.cc)
# COMPILE_TO_IR(my-udf-file-2.cc)
endif(CLANG_EXECUTABLE)
4、編譯
執行cmake . , 然後再執行make。在build 目錄下會生成一個libsubstr_udf.so檔案
5、上傳so檔案到hdfs目錄(具體目錄看情況)
> hadoop fs -put libsubstr_udf.so hdfs://namenode-or-nameservice/tmp/nrpt/
6、在impala-shell 或者hue 的impala 查詢介面新增udf函式
-- Three-argument overload. symbol must name the C++ entry point, udf_substr.
create function substr_udf(string,INT,INT) returns string location 'hdfs://namenode-or-nameservice/tmp/nrpt/libsubstr_udf.so' symbol='udf_substr';
-- Two-argument overload, backed by the same C++ function name.
create function substr_udf(string,INT) returns string location 'hdfs://namenode-or-nameservice/tmp/nrpt/libsubstr_udf.so' symbol='udf_substr';
7、如果要解除安裝或者刪除某個udf ,執行:
drop function substr_udf(string,INT,INT)
8、測試效果:
-- The function is registered under the SQL name substr_udf, so call it by that name.
select substr_udf("adbw我當時aad",-4); -- returns: 時aad
select substr_udf("adbw我當時aad",-6,3); -- returns: 我當時
select substr_udf("adbw我當時aad",6); -- returns: 當時aad
select substr_udf("adbw我當時aad",6,3); -- returns: 當時a
select substr_udf("adbw我當時aad",16,3); -- returns: ""