1. 程式人生 > >自定義UDF函式:將漢字轉換成拼音

自定義UDF函式:將漢字轉換成拼音

工作上需要將漢字轉換成拼音,因此自定義了一個 Hive UDF 函式:
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import com.sun.tools.javadoc.Main;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Hive UDF that converts the Chinese characters of a string into Hanyu Pinyin.
 *
 * <p>The output is lowercase, has no tone marks and uses the pinyin4j
 * {@code WITH_V} option (the "ü" sound is written as {@code v}). Syllables are
 * concatenated without a separator.
 *
 * <p>Known limitation: polyphonic characters are not disambiguated by context,
 * so the reading chosen may be wrong for the word at hand (see the sample run
 * in {@link #main(String[])}).
 */
@Description(name = "pinyin"
        , value = "_FUNC_(string) - get pinyin by given chinese."
        , extended = "Example:\n > select _FUNC_(string) from src;")
// Class name: UDFChineseToPinYin
public class UDFChineseToPinYin extends UDF {
    /** Output format shared by every conversion; never mutated after construction. */
    private static final HanyuPinyinOutputFormat PINYIN_FORMAT = buildFormat();

    /** Reused output holder so that a new Text is not allocated for every row. */
    private Text result = new Text();

    public UDFChineseToPinYin() {
    }

    /** Builds the fixed output format: lowercase, no tone marks, WITH_V. */
    private static HanyuPinyinOutputFormat buildFormat() {
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);
        return format;
    }

    /**
     * UDF entry point invoked by Hive for each row.
     *
     * @param chinese the input text; may be {@code null}
     * @return the pinyin rendering of {@code chinese}, or {@code null} when the
     *         input is {@code null} or the conversion fails. The returned
     *         object is reused between calls.
     */
    public Text evaluate(Text chinese) {
        if (chinese == null) {
            return null;
        }

        String pinyin = ConvertToPinyin(chinese.toString());
        if (pinyin == null) {
            // Text.set(String) cannot accept null; report a failed conversion as NULL.
            return null;
        }

        result.set(pinyin);
        return result;
    }

    /**
     * Converts a Java string to pinyin using the fixed output format.
     *
     * @param name the text to convert; may be {@code null}
     * @return the converted text, or {@code null} when {@code name} is
     *         {@code null} or pinyin4j rejects the output format combination
     */
    public String ConvertToPinyin(String name) {
        if (name == null) {
            return null;
        }

        try {
            return PinyinHelper.toHanyuPinyinString(name, PINYIN_FORMAT, "");
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            // Preserve the original contract: a rejected format yields null.
            return null;
        }
    }

    /** Manual smoke test: converts two sample phrases and prints the pinyin. */
    public static void main(String[] args) {
        UDFChineseToPinYin udfpy = new UDFChineseToPinYin();
        System.out.println(udfpy.evaluate(new Text("你好,祖瑪朗瑪峰")));
        System.out.println(udfpy.evaluate(new Text("你好,不著調,著急")));
    }
}
//測試結果如下。很明顯,遇到多音字時會得到錯誤的讀音(例如「著急」應為 zhaoji,卻輸出 zheji):
nihao,zumalangmafeng
nihao,buzhediao,zheji