1. 程式人生 > >C++ 實現unicode到utf-8的轉碼

C++ 實現unicode到utf-8的轉碼

思路:

獲取字串中的Unicode部分,然後將該部分轉換為utf-8格式的字元,最後將字串裡面的所有Unicode替換為utf-8即可。

廢話不多說,直接上程式碼:

標頭檔案:

/*
 * charsetEncode.h
 *
 *  Created on: Jul 25, 2016
 *      Author: root
 */


#ifndef COMMONSERVER_INCLUDE_CHARSETENCODE_H_
#define COMMONSERVER_INCLUDE_CHARSETENCODE_H_


#include <iostream>
#include <algorithm>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>


using namespace std;


/**
 * Helper for turning literal "\uXXXX" escape sequences embedded in a string
 * into UTF-8 bytes.
 */
class CcharsetEncode
{
public:
    /**
     * Scans `source` for "\uXXXX" escapes (backslash, 'u', four hex digits)
     * and substitutes UTF-8 bytes for them, editing `source` in place.
     *
     * @param source  text to convert; modified in place.
     * @return 0.
     */
    int unicode_to_utf8(std::string &source);

    /**
     * Replaces the first occurrence of `strSrc` inside `strContent` with
     * `strDest`. The match is an exact, case-sensitive comparison; when
     * `strSrc` does not occur, `strContent` is left untouched.
     *
     * @param strContent  string to edit in place.
     * @param strSrc      NUL-terminated text to look for.
     * @param strDest     NUL-terminated replacement text.
     */
    void ReplaceStr(std::string &strContent, const char *strSrc, const char *strDest);

private:
    /**
     * Encodes one code point as UTF-8 into `pOutput`.
     *
     * @param unic     code point value.
     * @param pOutput  destination buffer; must not be null.
     * @param outSize  capacity of `pOutput`; must be at least 6.
     * @return number of bytes written (1-6), or 0 when `unic` is above 0x7FFFFFFF.
     */
    int enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize);

    /**
     * Tests whether `src` is exactly six characters of the form "\uXXXX",
     * where each X is a hexadecimal digit.
     *
     * @return 1 when it is, 0 otherwise.
     */
    int isUnicode(const std::string &src);

    /**
     * Reads the hexadecimal digits in `str` into an unsigned value.
     * Characters that are not hex digits are skipped, so "0x1a3f" gives 0x1a3f.
     */
    unsigned int xstrtoshortint(const char *str);
};

#endif /* COMMONSERVER_INCLUDE_CHARSETENCODE_H_ */




原始檔:
/*
 * charsetEncode.cpp
 *
 *  Created on: Jul 25, 2016
 *      Author: root
 */


#include "charsetEncode.h"


/**
 * Converts every literal "\uXXXX" escape in `source` to UTF-8, in place.
 *
 * An escape is a backslash, a lowercase 'u' and exactly four hex digits.
 * A high surrogate escape (D800-DBFF) immediately followed by a low surrogate
 * escape (DC00-DFFF) is combined into one supplementary code point and
 * written as a single 4-byte sequence. An unpaired surrogate is passed to
 * the encoder as-is. "\u0000" becomes an embedded NUL byte in `source`.
 *
 * @param source  text to convert; modified in place.
 * @return always 0.
 */
int CcharsetEncode::unicode_to_utf8(string &source)
{
    const string::size_type kEscapeLen = 6; // strlen("\\uXXXX")
    unsigned char encoded[8];
    string::size_type index = 0;

    // "index + kEscapeLen <= size()" (not "<") so that an escape ending exactly
    // at the end of the string is still converted. size() is re-read every
    // iteration because each replacement shortens the string.
    while (index + kEscapeLen <= source.size())
    {
        if (isUnicode(source.substr(index, kEscapeLen)) != 1)
        {
            ++index;
            continue;
        }

        unsigned long codePoint = xstrtoshortint(source.substr(index + 2, 4).c_str());
        string::size_type consumed = kEscapeLen;

        // UTF-16 surrogate pair: merge both escapes into one code point,
        // otherwise the output would be two 3-byte sequences, which is not
        // valid UTF-8.
        if (codePoint >= 0xD800UL && codePoint <= 0xDBFFUL
            && index + 2 * kEscapeLen <= source.size()
            && isUnicode(source.substr(index + kEscapeLen, kEscapeLen)) == 1)
        {
            const unsigned long low =
                xstrtoshortint(source.substr(index + kEscapeLen + 2, 4).c_str());
            if (low >= 0xDC00UL && low <= 0xDFFFUL)
            {
                codePoint = 0x10000UL + ((codePoint - 0xD800UL) << 10) + (low - 0xDC00UL);
                consumed = 2 * kEscapeLen;
            }
        }

        memset(encoded, 0, sizeof(encoded));
        const int num = enc_unicode_to_utf8_one(codePoint, encoded,
                                                static_cast<int>(sizeof(encoded)));

        // Replace exactly the escape at `index` (not the first textual match
        // anywhere in the string) and pass the byte count explicitly so the
        // result does not depend on NUL termination.
        source.replace(index, consumed, reinterpret_cast<const char *>(encoded),
                       static_cast<string::size_type>(num));

        // Step over just the bytes that were written; a fixed step could jump
        // past the start of an adjacent escape after a 1- or 2-byte result.
        index += static_cast<string::size_type>(num);
    }
    return 0;
}


/**
 * Encodes a single code point as a UTF-8 byte sequence.
 *
 * Sequence length by value range:
 *   U-00000000 - U-0000007F: 1 byte   0xxxxxxx
 *   U-00000080 - U-000007FF: 2 bytes  110xxxxx 10xxxxxx
 *   U-00000800 - U-0000FFFF: 3 bytes  1110xxxx 10xxxxxx 10xxxxxx
 *   U-00010000 - U-001FFFFF: 4 bytes  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   U-00200000 - U-03FFFFFF: 5 bytes  111110xx 10xxxxxx (x4)
 *   U-04000000 - U-7FFFFFFF: 6 bytes  1111110x 10xxxxxx (x5)
 *
 * @param unic     code point value to encode.
 * @param pOutput  destination buffer; asserted non-null.
 * @param outSize  capacity of `pOutput`; asserted to be at least 6.
 * @return number of bytes written, or 0 (nothing written) when `unic`
 *         is greater than 0x7FFFFFFF.
 */
int CcharsetEncode::enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize)
{
    assert(pOutput != NULL);
    assert(outSize >= 6);

    // Single-byte case has no continuation bytes and no prefix bits.
    if (unic <= 0x7FUL)
    {
        pOutput[0] = static_cast<unsigned char>(unic & 0x7F);
        return 1;
    }

    // Pick the sequence length together with the lead byte's prefix bits and
    // the mask for the payload bits that fit into the lead byte.
    int length = 0;
    unsigned long leadPrefix = 0;
    unsigned long leadMask = 0;
    if (unic <= 0x7FFUL)
    {
        length = 2; leadPrefix = 0xC0; leadMask = 0x1F;
    }
    else if (unic <= 0xFFFFUL)
    {
        length = 3; leadPrefix = 0xE0; leadMask = 0x0F;
    }
    else if (unic <= 0x1FFFFFUL)
    {
        length = 4; leadPrefix = 0xF0; leadMask = 0x07;
    }
    else if (unic <= 0x3FFFFFFUL)
    {
        length = 5; leadPrefix = 0xF8; leadMask = 0x03;
    }
    else if (unic <= 0x7FFFFFFFUL)
    {
        length = 6; leadPrefix = 0xFC; leadMask = 0x01;
    }
    else
    {
        return 0;
    }

    // Fill continuation bytes from the last one backwards, six payload bits
    // each, then put whatever bits remain into the lead byte.
    unsigned long remaining = unic;
    for (int pos = length - 1; pos > 0; --pos)
    {
        pOutput[pos] = static_cast<unsigned char>((remaining & 0x3F) | 0x80);
        remaining >>= 6;
    }
    pOutput[0] = static_cast<unsigned char>((remaining & leadMask) | leadPrefix);
    return length;
}


int CcharsetEncode::isUnicode(const string &src)
{
if(src.size() != 6)
return 0;
if(src.find("\\u", 0) == 0)
{
for(int i = 2; i <= 5; i++)
{
if(!((src[i] >= 'a' && src[i] <= 'f')
|| (src[i] >= 'A' && src[i] <= 'F')
|| (src[i] >= '0' && src[i] <= '9')))
{
return 0;
}
}
return 1;
}
else
{
return 0;
}
}


/**
 * Accumulates the hexadecimal digits of a NUL-terminated string into an
 * unsigned value. Any character that is not a hex digit is skipped without
 * affecting the result, so "0x1a3f" yields 0x1a3f.
 *
 * @param str  NUL-terminated text to read.
 * @return the accumulated value (0 when no hex digit is present).
 */
unsigned int CcharsetEncode::xstrtoshortint(const char *str)
{
    unsigned int value = 0;
    for (const char *cursor = str; *cursor != '\0'; ++cursor)
    {
        const char ch = *cursor;
        unsigned int digit = 0;
        if (ch >= '0' && ch <= '9')
        {
            digit = static_cast<unsigned int>(ch - '0');
        }
        else if (ch >= 'a' && ch <= 'f')
        {
            digit = static_cast<unsigned int>(ch - 'a') + 10;
        }
        else if (ch >= 'A' && ch <= 'F')
        {
            digit = static_cast<unsigned int>(ch - 'A') + 10;
        }
        else
        {
            continue; // not a hex digit: ignore it
        }
        value = value * 16 + digit; // base 16; change the multiplier for another radix
    }
    return value;
}


/**
 * Replaces the first occurrence of `strSrc` in `strContent` with `strDest`.
 * The search is an exact (case-sensitive) match starting at the beginning of
 * `strContent`; when nothing matches, `strContent` is left unchanged.
 *
 * @param strContent  string to edit in place.
 * @param strSrc      NUL-terminated text to search for.
 * @param strDest     NUL-terminated replacement text.
 */
void CcharsetEncode::ReplaceStr(string &strContent, const char *strSrc, const char *strDest)
{
    const string needle(strSrc);
    const string::size_type hit = strContent.find(needle);
    if (hit == string::npos)
    {
        return;
    }
    strContent.replace(hit, needle.size(), strDest);
}




主函式測試:
// Demo: feeds a string of literal "\uXXXX" escapes through the converter and
// prints the converted string.
int main()
{
    CcharsetEncode encode;
    // The backslashes are doubled so the string really contains the text
    // "\u300a..." at run time. Written with a single backslash, \u300a is a
    // universal-character-name that the compiler encodes itself, leaving no
    // escape sequences for unicode_to_utf8() to convert.
    string src = "\\u300a\\u58eb\\u5175\\u7a81\\u51fb\\u300b";
    encode.unicode_to_utf8(src);
    cout<<" unicode: "<<src<<endl;
    return 0;
}