自定義UDF函式:將漢字轉換成拼音
阿新 • • 發佈:2019-01-03
工作需求要將漢字轉換成拼音,因此自定義UDF函式:
import net.sourceforge.pinyin4j.PinyinHelper; import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType; import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat; import net.sourceforge.pinyin4j.format.HanyuPinyinToneType; import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType; import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; import com.sun.tools.javadoc.Main; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * 該方法主要實現將漢字轉換成對應的拼音。 */ @Description(name = "pinyin" , value = "_FUNC_(string) - get pinyin by given chinese." , extended = "Example:\n > select _FUNC_(string) from src;") //方法名稱:UDFChineseToPinYin public class UDFChineseToPinYin extends UDF { private Text result = new Text(); public UDFChineseToPinYin() { } public Text evaluate(Text chinese) { if (chinese == null) { return null; } result.set(ConvertToPinyin(chinese.toString())); return result; } public String ConvertToPinyin(String name) { HanyuPinyinOutputFormat pyFormat = new HanyuPinyinOutputFormat(); pyFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE); pyFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE); pyFormat.setVCharType(HanyuPinyinVCharType.WITH_V); String result = null; try { result = PinyinHelper.toHanyuPinyinString(name, pyFormat, ""); } catch (BadHanyuPinyinOutputFormatCombination e) { return null; } return result; } //main方法裡面測試,將漢語轉換成拼音輸出 public static void main(String[] args) { UDFChineseToPinYin udfpy = new UDFChineseToPinYin(); System.out.println(udfpy.evaluate(new Text("你好,祖瑪朗瑪峰"))); System.out.println(udfpy.evaluate(new Text("你好,不著調,著急"))); } } //測試結果如下:很明顯,當出現多音字時則會出現錯誤的情況。 nihao,zumalangmafeng nihao,buzhediao,zheji