1. 程式人生 > >java使用unicode過濾emoji表情

java使用unicode過濾emoji表情

最近在做一個微信公眾號的專案,如果微信使用者名稱含有表情符號,在我們的系統裡就會儲存失敗。最初的解決方案是把資料庫的字元集從 utf8 改為 utf8mb4,這樣就能相容儲存 emoji 表情符號。後來又發現,儲存的表情符號有的能正常顯示、有的不行;經過幾次驗證才發現,用 iOS 自帶輸入法輸入的表情與安卓輸入法輸入的表情編碼不一樣,導致有些符號不能顯示,而且有些 iOS 上有的符號安卓機上還沒有。如果要統一就必須進行轉碼,工作量比較大,無法在短時間內解決,所以目前最好的方案是過濾:把 emoji 表情符號替換成 □。於是去查了 Unicode 碼表,做了一個過濾的工具類,使用正則表示式過濾表情字元:

/**
 * Regular-expression character class listing the ranges of the Basic
 * Multilingual Plane (U+0000..U+FFFF) that are treated as ordinary text.
 * {@code emojiChange} compiles this string and replaces every char that
 * does NOT match it with a placeholder square.
 *
 * Things to know when editing this table:
 * - The range bounds are written as Java source-level Unicode escapes, so
 *   the regex engine receives the literal boundary characters.
 * - The two UTF-16 surrogate ranges are left commented out, so a surrogate
 *   half never matches this class.
 * - The N'Ko line is disabled; as written its upper bound is lower than
 *   its lower bound.
 * - The first two ranges (CJK Unified Ideographs, Yijing Hexagram Symbols)
 *   appear a second time further down; the repetition does not change what
 *   the class matches.
 * - The block names in the trailing comments are the original author's
 *   labels. NOTE(review): several of them may not agree with the current
 *   Unicode block assignments -- verify against the Unicode block list
 *   before relying on a label.
 * - NOTE(review): some symbols that are commonly rendered as emoji are
 *   encoded inside ranges listed here (for example within Miscellaneous
 *   Symbols and Dingbats) and therefore pass the filter -- confirm that
 *   this is intended.
 */
public final static String unicodeReg= "["+
				"\u4E00-\u9FBF"+// CJK Unified Ideographs (listed again further down)
				"\u4DC0-\u4DFF"+// Yijing Hexagram Symbols (listed again further down)
				"\u0000-\u007F"+// C0 Controls and Basic Latin
				"\u0080-\u00FF"+// C1 Controls and Latin-1 Supplement
				"\u0100-\u017F"+// Latin Extended-A
				"\u0180-\u024F"+// Latin Extended-B
				"\u0250-\u02AF"+// IPA Extensions
				"\u02B0-\u02FF"+// Spacing Modifier Letters
				"\u0300-\u036F"+// Combining Diacritical Marks
				"\u0370-\u03FF"+// Greek and Coptic
				"\u0400-\u04FF"+// Cyrillic
				"\u0500-\u052F"+// Cyrillic Supplement
				"\u0530-\u058F"+// Armenian
				"\u0590-\u05FF"+// Hebrew
				"\u0600-\u06FF"+// Arabic
				"\u0700-\u074F"+// Syriac
				"\u0750-\u077F"+// Arabic Supplement
				"\u0780-\u07BF"+// Thaana
				//"\u07C0-\u077F"+// N'Ko -- disabled; the range end is below the range start
				"\u0800-\u085F"+// labelled "Avestan and Pahlavi" by the author
				"\u0860-\u087F"+// labelled "Mandaic" by the author
				"\u0880-\u08AF"+// labelled "Samaritan" by the author
				"\u0900-\u097F"+// Devanagari
				"\u0980-\u09FF"+// Bengali
				"\u0A00-\u0A7F"+// Gurmukhi
				"\u0A80-\u0AFF"+// Gujarati
				"\u0B00-\u0B7F"+// Oriya
				"\u0B80-\u0BFF"+// Tamil
				"\u0C00-\u0C7F"+// Telugu
				"\u0C80-\u0CFF"+// Kannada
				"\u0D00-\u0D7F"+// Malayalam
				"\u0D80-\u0DFF"+// Sinhala
				"\u0E00-\u0E7F"+// Thai
				"\u0E80-\u0EFF"+// Lao
				"\u0F00-\u0FFF"+// Tibetan
				"\u1000-\u109F"+// Myanmar
				"\u10A0-\u10FF"+// Georgian
				"\u1100-\u11FF"+// Hangul Jamo
				"\u1200-\u137F"+// Ethiopic
				"\u1380-\u139F"+// Ethiopic Supplement
				"\u13A0-\u13FF"+// Cherokee
				"\u1400-\u167F"+// Unified Canadian Aboriginal Syllabics
				"\u1680-\u169F"+// Ogham
				"\u16A0-\u16FF"+// Runic
				"\u1700-\u171F"+// Tagalog
				"\u1720-\u173F"+// Hanunoo
				"\u1740-\u175F"+// Buhid
				"\u1760-\u177F"+// Tagbanwa
				"\u1780-\u17FF"+// Khmer
				"\u1800-\u18AF"+// Mongolian
				"\u18B0-\u18FF"+// labelled "Cham" by the author
				"\u1900-\u194F"+// Limbu
				"\u1950-\u197F"+// Tai Le
				"\u1980-\u19DF"+// New Tai Lue
				"\u19E0-\u19FF"+// Khmer Symbols
				"\u1A00-\u1A1F"+// Buginese
				"\u1A20-\u1A5F"+// labelled "Batak" by the author
				"\u1A80-\u1AEF"+// labelled "Lanna" by the author
				"\u1B00-\u1B7F"+// Balinese
				"\u1B80-\u1BB0"+// Sundanese
				"\u1BC0-\u1BFF"+// labelled "Pahawh Hmong" by the author
				"\u1C00-\u1C4F"+// Lepcha
				"\u1C50-\u1C7F"+// Ol Chiki
				"\u1C80-\u1CDF"+// labelled "Meithei/Manipuri" by the author
				"\u1D00-\u1D7F"+// Phonetic Extensions
				"\u1D80-\u1DBF"+// Phonetic Extensions Supplement
				"\u1DC0-\u1DFF"+// Combining Diacritical Marks Supplement
				"\u1E00-\u1EFF"+// Latin Extended Additional
				"\u1F00-\u1FFF"+// Greek Extended
				"\u2000-\u206F"+// General Punctuation
				"\u2070-\u209F"+// Superscripts and Subscripts
				"\u20A0-\u20CF"+// Currency Symbols
				"\u20D0-\u20FF"+// Combining Diacritical Marks for Symbols
				"\u2100-\u214F"+// Letterlike Symbols
				"\u2150-\u218F"+// Number Forms
				"\u2190-\u21FF"+// Arrows
				"\u2200-\u22FF"+// Mathematical Operators
				"\u2300-\u23FF"+// Miscellaneous Technical
				"\u2400-\u243F"+// Control Pictures
				"\u2440-\u245F"+// Optical Character Recognition
				"\u2460-\u24FF"+// Enclosed Alphanumerics
				"\u2500-\u257F"+// Box Drawing
				"\u2580-\u259F"+// Block Elements
				"\u25A0-\u25FF"+// Geometric Shapes
				"\u2600-\u26FF"+// Miscellaneous Symbols
				"\u2700-\u27BF"+// Dingbats
				"\u27C0-\u27EF"+// Miscellaneous Mathematical Symbols-A
				"\u27F0-\u27FF"+// Supplemental Arrows-A
				"\u2800-\u28FF"+// Braille Patterns
				"\u2900-\u297F"+// Supplemental Arrows-B
				"\u2980-\u29FF"+// Miscellaneous Mathematical Symbols-B
				"\u2A00-\u2AFF"+// Supplemental Mathematical Operators
				"\u2B00-\u2BFF"+// Miscellaneous Symbols and Arrows
				"\u2C00-\u2C5F"+// Glagolitic
				"\u2C60-\u2C7F"+// Latin Extended-C
				"\u2C80-\u2CFF"+// Coptic
				"\u2D00-\u2D2F"+// Georgian Supplement
				"\u2D30-\u2D7F"+// Tifinagh
				"\u2D80-\u2DDF"+// Ethiopic Extended
				"\u2E00-\u2E7F"+// Supplemental Punctuation
				"\u2E80-\u2EFF"+// CJK Radicals Supplement
				"\u2F00-\u2FDF"+// Kangxi Radicals
				"\u2FF0-\u2FFF"+// Ideographic Description Characters
				"\u3000-\u303F"+// CJK Symbols and Punctuation
				"\u3040-\u309F"+// Hiragana
				"\u30A0-\u30FF"+// Katakana
				"\u3100-\u312F"+// Bopomofo
				"\u3130-\u318F"+// Hangul Compatibility Jamo
				"\u3190-\u319F"+// Kanbun
				"\u31A0-\u31BF"+// Bopomofo Extended
				"\u31C0-\u31EF"+// CJK Strokes
				"\u31F0-\u31FF"+// Katakana Phonetic Extensions
				"\u3200-\u32FF"+// Enclosed CJK Letters and Months
				"\u3300-\u33FF"+// CJK Compatibility
				"\u3400-\u4DBF"+// CJK Unified Ideographs Extension A
				"\u4DC0-\u4DFF"+// Yijing Hexagram Symbols (second occurrence)
				"\u4E00-\u9FBF"+// CJK Unified Ideographs (second occurrence)
				"\uA000-\uA48F"+// Yi Syllables
				"\uA490-\uA4CF"+// Yi Radicals
				"\uA500-\uA61F"+// Vai
				"\uA660-\uA6FF"+// labelled "Unified Canadian Aboriginal Syllabics Supplement" by the author
				"\uA700-\uA71F"+// Modifier Tone Letters
				"\uA720-\uA7FF"+// Latin Extended-D
				"\uA800-\uA82F"+// Syloti Nagri
				"\uA840-\uA87F"+// Phags-pa
				"\uA880-\uA8DF"+// Saurashtra
				"\uA900-\uA97F"+// labelled "Javanese" by the author
				"\uA980-\uA9DF"+// labelled "Chakma" by the author
				"\uAA00-\uAA3F"+// labelled "Varang Kshiti" by the author
				"\uAA40-\uAA6F"+// labelled "Sorang Sompeng" by the author
				"\uAA80-\uAADF"+// labelled "Newari" by the author
				"\uAB00-\uAB5F"+// labelled "Tai Viet" by the author
				"\uAB80-\uABA0"+// labelled "Kayah Li" by the author
				"\uAC00-\uD7AF"+// Hangul Syllables
				//"\uD800-\uDBFF"+// High-half (lead surrogate) zone of UTF-16 -- left disabled
				//"\uDC00-\uDFFF"+// Low-half (trail surrogate) zone of UTF-16 -- left disabled
				"\uE000-\uF8FF"+// Private Use Zone
				"\uF900-\uFAFF"+// CJK Compatibility Ideographs
				"\uFB00-\uFB4F"+// Alphabetic Presentation Forms
				"\uFB50-\uFDFF"+// Arabic Presentation Forms-A
				"\uFE00-\uFE0F"+// Variation Selectors
				"\uFE10-\uFE1F"+// Vertical Forms
				"\uFE20-\uFE2F"+// Combining Half Marks
				"\uFE30-\uFE4F"+// CJK Compatibility Forms
				"\uFE50-\uFE6F"+// Small Form Variants
				"\uFE70-\uFEFF"+// Arabic Presentation Forms-B
				"\uFF00-\uFFEF"+// Halfwidth and Fullwidth Forms
				"\uFFF0-\uFFFF]";// Specials; the closing bracket ends the character class
		/**
		 * Renders every UTF-16 code unit of a string as a six-character escape:
		 * a backslash, a lower-case {@code u}, and four upper-case hexadecimal
		 * digits (for example the letter {@code a} becomes backslash-u 0061).
		 *
		 * <p>The prefix letter is kept lower-case so the result is a conventional
		 * Java-style escape sequence that {@code revert} can recognise; only the
		 * hexadecimal digits are upper-cased. Characters outside the BMP are
		 * emitted as two escapes, one per surrogate half.
		 *
		 * @param str the string to convert; {@code null} is treated as empty
		 * @return the escaped form of {@code str}, never {@code null}
		 */
		public static String convert(String str)
		{
			if (str == null)
			{
				return "";
			}
			final String hexDigits = "0123456789ABCDEF";
			// Each input char expands to exactly six output chars.
			StringBuilder sb = new StringBuilder(str.length() * 6);
			for (int i = 0; i < str.length(); i++)
			{
				char c = str.charAt(i);
				sb.append('\\').append('u');
				// Emit the four nibbles from most to least significant; this pads
				// with leading zeros without any string-length checks.
				sb.append(hexDigits.charAt((c >>> 12) & 0xF));
				sb.append(hexDigits.charAt((c >>> 8) & 0xF));
				sb.append(hexDigits.charAt((c >>> 4) & 0xF));
				sb.append(hexDigits.charAt(c & 0xF));
			}
			return sb.toString();
		}


		
		/**
		 * Decodes backslash-u escapes (a backslash, the letter u, and exactly four
		 * hexadecimal digits) back into the characters they denote. This is the
		 * inverse of {@code convert}.
		 *
		 * <p>Decoding rules:
		 * <ul>
		 *   <li>The prefix letter is matched case-insensitively, so fully
		 *       upper-cased escape strings are decoded as well.</li>
		 *   <li>Hexadecimal digits may be upper- or lower-case.</li>
		 *   <li>Every escape is decoded, including the last one in the string.</li>
		 *   <li>Any text that is not a well-formed escape (plain characters, a
		 *       truncated escape, non-hex digits) is copied through unchanged.</li>
		 * </ul>
		 *
		 * @param str the string to decode; {@code null} is treated as empty
		 * @return the decoded string, never {@code null}
		 */
		public static String revert(String str)
		{
			if (str == null)
			{
				return "";
			}
			final int length = str.length();
			StringBuilder sb = new StringBuilder(length);
			int i = 0;
			while (i < length)
			{
				char current = str.charAt(i);
				// An escape needs six chars: backslash, prefix letter, four hex digits.
				if (current == '\\' && i + 6 <= length
						&& (str.charAt(i + 1) == 'u' || str.charAt(i + 1) == 'U'))
				{
					int code = 0;
					boolean wellFormed = true;
					for (int j = i + 2; j < i + 6; j++)
					{
						char h = str.charAt(j);
						// Restrict to ASCII so that non-Latin digits are not
						// silently accepted as hexadecimal.
						int digit = (h < 0x80) ? Character.digit(h, 16) : -1;
						if (digit < 0)
						{
							wellFormed = false;
							break;
						}
						code = (code << 4) | digit;
					}
					if (wellFormed)
					{
						sb.append((char) code);
						i += 6;
						continue;
					}
				}
				// Not (the start of) a valid escape: keep the character as it is.
				sb.append(current);
				i++;
			}
			return sb.toString();
		}

		/** Compiled once: the character class in {@code unicodeReg} is large and never changes. */
		private static final Pattern EMOJI_FILTER_PATTERN = Pattern.compile(unicodeReg);

		/**
		 * Replaces every character that is not covered by {@code unicodeReg} with
		 * the placeholder square.
		 *
		 * <p>The input is walked by Unicode code point rather than by UTF-16 code
		 * unit, so a character outside the BMP (stored as a surrogate pair) is
		 * replaced by a single placeholder instead of one placeholder per
		 * surrogate half. An unpaired surrogate is also replaced by one
		 * placeholder. Characters matched by {@code unicodeReg} are copied through
		 * unchanged.
		 *
		 * <p>This method no longer writes per-character debugging output to
		 * standard output.
		 *
		 * @param string the text to filter; may be {@code null}
		 * @return the filtered text, or an empty string when {@code string} is
		 *         {@code null}
		 */
		public static String emojiChange(String string){
			if (string == null) {
				return "";
			}
			StringBuilder filtered = new StringBuilder(string.length());
			int index = 0;
			while (index < string.length()) {
				int codePoint = string.codePointAt(index);
				int width = Character.charCount(codePoint);
				String unit = string.substring(index, index + width);
				// unicodeReg only lists BMP ranges, so anything two chars wide can
				// be rejected without consulting the pattern.
				if (width == 1 && EMOJI_FILTER_PATTERN.matcher(unit).matches()) {
					filtered.append(unit);
				} else {
					filtered.append("□");
				}
				index += width;
			}
			return filtered.toString();
		}

這樣就先解決了表情符號無法儲存和顯示的問題。至於要真正顯示這些符號……感覺還有很長的路要走,而且 iOS 每次更新都會新增幾個表情符號,如果微信更新不及時,微信裡面顯示的也會是方塊,或者被替換成對應的文字。

參考