C++ 實現unicode到utf-8的轉碼
阿新 • • 發佈:2019-01-02
思路:
獲取字串中的 Unicode 轉義部分(形如反斜線加 u 再加四位十六進位數字),然後將該部分轉換為 UTF-8 格式的字元,最後將字串裡面的所有 Unicode 轉義替換為對應的 UTF-8 即可。
廢話不多說,直接上程式碼:
標頭檔案:
/*
 * charsetEncode.h
 *
 *  Created on: Jul 25, 2016
 *      Author: root
 */

#ifndef COMMONSERVER_INCLUDE_CHARSETENCODE_H_
#define COMMONSERVER_INCLUDE_CHARSETENCODE_H_

// C library headers
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

// C++ standard library headers
#include <algorithm>
#include <iostream>
#include <string>

// NOTE(review): a using-directive in a header leaks into every includer; it is
// kept because the code that uses this header relies on unqualified names.
using namespace std;

/**
 * Converts textual Unicode escapes (a backslash, the letter 'u' and four
 * hexadecimal digits) embedded in a string into their UTF-8 byte sequences.
 */
class CcharsetEncode
{
public:
    /**
     * Rewrites `source` in place, replacing Unicode escapes with UTF-8 bytes.
     *
     * @param source  text to convert; modified in place
     * @return always 0
     */
    int unicode_to_utf8(string &source);

    /**
     * Replaces the first occurrence of `strSrc` in `strContent` with
     * `strDest`. The match is exact (case-sensitive); nothing happens when
     * `strSrc` is not found.
     *
     * @param strContent  string to modify in place
     * @param strSrc      NUL-terminated text to search for
     * @param strDest     NUL-terminated replacement text
     */
    void ReplaceStr(string &strContent, const char *strSrc, const char *strDest);

private:
    /**
     * Encodes one code point as UTF-8 into `pOutput`.
     *
     * @param unic     code point value to encode
     * @param pOutput  destination buffer; must not be null
     * @param outSize  capacity of `pOutput`; asserted to be at least 6
     * @return number of bytes written (1-6), or 0 if `unic` exceeds 0x7FFFFFFF
     */
    int enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize);

    /**
     * Tests whether `src` is exactly one escape: six characters made of a
     * backslash, 'u' and four hex digits (either case).
     *
     * @return 1 if it is, 0 otherwise
     */
    int isUnicode(const string &src);

    /**
     * Parses a hexadecimal string into its numeric value, e.g. "1a3f" gives
     * 0x1a3f. Characters that are not hex digits are skipped.
     */
    unsigned int xstrtoshortint(const char *str);
};

#endif /* COMMONSERVER_INCLUDE_CHARSETENCODE_H_ */
原始檔:
/*
 * charsetEncode.cpp
 *
 *  Created on: Jul 25, 2016
 *      Author: root
 */

#include "charsetEncode.h"

/**
 * Replaces, in place, every textual Unicode escape in `source` (a backslash,
 * the letter 'u' and four hex digits) with the UTF-8 encoding of that value.
 *
 * A high-surrogate escape (D800-DBFF) immediately followed by a low-surrogate
 * escape (DC00-DFFF) is combined into one supplementary code point and
 * emitted as a single 4-byte sequence. A lone surrogate is encoded on its own
 * as three bytes. The escape for U+0000 yields one 0x00 byte, so the result
 * may contain embedded NULs.
 *
 * @param source  text to convert; modified in place
 * @return always 0
 */
int CcharsetEncode::unicode_to_utf8(string &source)
{
    const string::size_type kEscapeLen = 6; // backslash + 'u' + 4 hex digits
    unsigned char encoded[8];

    string::size_type index = 0;
    // "<=" so that an escape ending exactly at the end of the string is seen.
    while (index + kEscapeLen <= source.size()) {
        if (isUnicode(source.substr(index, kEscapeLen)) != 1) {
            ++index;
            continue;
        }

        unsigned long codePoint = xstrtoshortint(source.substr(index + 2, 4).c_str());
        string::size_type consumed = kEscapeLen;

        // Combine a UTF-16 surrogate pair into the code point it represents.
        if (codePoint >= 0xD800 && codePoint <= 0xDBFF
                && index + 2 * kEscapeLen <= source.size()
                && isUnicode(source.substr(index + kEscapeLen, kEscapeLen)) == 1) {
            const unsigned long low =
                xstrtoshortint(source.substr(index + kEscapeLen + 2, 4).c_str());
            if (low >= 0xDC00 && low <= 0xDFFF) {
                codePoint = 0x10000 + ((codePoint - 0xD800) << 10) + (low - 0xDC00);
                consumed = 2 * kEscapeLen;
            }
        }

        memset(encoded, 0, sizeof(encoded));
        const int num = enc_unicode_to_utf8_one(codePoint, encoded, sizeof(encoded));

        // Replace exactly the escape at `index`, with an explicit byte count
        // so that a 0x00 output byte is not treated as a terminator.
        source.replace(index, consumed, reinterpret_cast<const char *>(encoded), num);

        // Resume right after the bytes just written; the output length varies
        // from 1 to 4 bytes, so a fixed step would skip or rescan characters.
        index += num;
    }
    return 0;
}

/**
 * Encodes a single code point as UTF-8.
 *
 *   U-00000000 - U-0000007F: 0xxxxxxx
 *   U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
 *   U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 *   U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * @param unic     code point value to encode
 * @param pOutput  destination buffer; must not be null
 * @param outSize  capacity of `pOutput`; asserted to be at least 6
 * @return number of bytes written (1-6), or 0 if `unic` exceeds 0x7FFFFFFF
 */
int CcharsetEncode::enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize)
{
    assert(pOutput != NULL);
    assert(outSize >= 6);

    if (unic <= 0x0000007F) {
        pOutput[0] = static_cast<unsigned char>(unic & 0x7F);
        return 1;
    }

    // Pick the sequence length and the marker bits of the lead byte.
    int length;
    unsigned char leadMarker;
    if (unic <= 0x000007FF) {
        length = 2;
        leadMarker = 0xC0;
    } else if (unic <= 0x0000FFFF) {
        length = 3;
        leadMarker = 0xE0;
    } else if (unic <= 0x001FFFFF) {
        length = 4;
        leadMarker = 0xF0;
    } else if (unic <= 0x03FFFFFF) {
        length = 5;
        leadMarker = 0xF8;
    } else if (unic <= 0x7FFFFFFF) {
        length = 6;
        leadMarker = 0xFC;
    } else {
        return 0;
    }

    // Fill continuation bytes from the last one backwards, six bits each.
    for (int i = length - 1; i > 0; --i) {
        pOutput[i] = static_cast<unsigned char>((unic & 0x3F) | 0x80);
        unic >>= 6;
    }
    // The range checks above guarantee the remaining bits fit the lead byte.
    pOutput[0] = static_cast<unsigned char>(leadMarker | unic);
    return length;
}

/**
 * Tests whether `src` is exactly one escape: six characters made of a
 * backslash, 'u' and four hex digits (upper or lower case).
 *
 * @return 1 if it is, 0 otherwise
 */
int CcharsetEncode::isUnicode(const string &src)
{
    if (src.size() != 6 || src[0] != '\\' || src[1] != 'u') {
        return 0;
    }
    for (int i = 2; i <= 5; i++) {
        const char c = src[i];
        const bool isHex = (c >= '0' && c <= '9')
                        || (c >= 'a' && c <= 'f')
                        || (c >= 'A' && c <= 'F');
        if (!isHex) {
            return 0;
        }
    }
    return 1;
}

/**
 * Parses a NUL-terminated hexadecimal string into its numeric value, e.g.
 * "1a3f" gives 0x1a3f. Characters that are not hex digits are skipped.
 */
unsigned int CcharsetEncode::xstrtoshortint(const char *str)
{
    unsigned int ivalue = 0;
    for (const char *p = str; *p != '\0'; ++p) {
        if (*p >= '0' && *p <= '9') {
            ivalue = ivalue * 16 + (*p - '0');
        } else if (*p >= 'a' && *p <= 'f') {
            ivalue = ivalue * 16 + (*p - 'a') + 10;
        } else if (*p >= 'A' && *p <= 'F') {
            ivalue = ivalue * 16 + (*p - 'A') + 10;
        }
    }
    return ivalue;
}

/**
 * Replaces the first occurrence of `strSrc` in `strContent` with `strDest`.
 * The match is exact (case-sensitive). Does nothing when `strSrc` is not
 * found or when either pointer is null.
 *
 * @param strContent  string to modify in place
 * @param strSrc      NUL-terminated text to search for
 * @param strDest     NUL-terminated replacement text
 */
void CcharsetEncode::ReplaceStr(string &strContent, const char *strSrc, const char *strDest)
{
    if (strSrc == NULL || strDest == NULL) {
        return;
    }
    const string::size_type srclen = strlen(strSrc);
    // Search the content directly; no copy of it is needed.
    const string::size_type pos = strContent.find(strSrc, 0, srclen);
    if (pos != string::npos) {
        strContent.replace(pos, srclen, strDest);
    }
}
主函式測試:
// Demo driver: feeds the converter a string made of textual Unicode escapes
// and prints the string after conversion.
int main()
{
    CcharsetEncode encode;
    // The backslashes are doubled so the string holds the escape text itself
    // (a backslash, 'u' and four hex digits). With single backslashes the
    // compiler translates the universal-character-names on its own and the
    // converter would have nothing to do.
    string src = "\\u300a\\u58eb\\u5175\\u7a81\\u51fb\\u300b";
    encode.unicode_to_utf8(src);
    cout<<" unicode: "<<src<<endl;
    return 0;
}