C++判斷字串編碼格式(ANSI\UTF16_LE\UTF16_BE\UTF8\UTF8_BOM)
阿新 • • 發佈:2022-05-30
/// Text encodings distinguished by DetectEncode().
/// Deliberately an unscoped enum so callers may write either `ANSI` or `Encode::ANSI`.
enum Encode {
    ANSI = 1,  ///< Fallback: no BOM and not recognised as multi-byte UTF-8 (includes pure 7-bit ASCII).
    UTF16_LE,  ///< Buffer starts with the byte-order mark FF FE.
    UTF16_BE,  ///< Buffer starts with the byte-order mark FE FF.
    UTF8_BOM,  ///< Buffer starts with the byte-order mark EF BB BF.
    UTF8       ///< No BOM, but the bytes form structurally valid multi-byte UTF-8.
};

/// Classifies a BOM-less buffer as UTF-8 or ANSI by checking UTF-8 byte structure.
///
/// @param data  Pointer to the bytes to inspect; not dereferenced when @p size is 0.
/// @param size  Number of bytes available at @p data.
/// @return Encode::UTF8 when the buffer contains at least one multi-byte sequence and
///         every sequence is complete and well-formed (lead byte followed by the right
///         number of 10xxxxxx continuation bytes); Encode::ANSI otherwise, including
///         for empty input and for pure 7-bit ASCII.
static inline Encode IsUtf8Data(const uint8_t* data, size_t size)
{
    bool sawMultiByte = false;  // set once any byte >= 0x80 starts a sequence
    int32_t pending = 0;        // continuation bytes still expected for the current sequence

    // Index with size_t: an `int` counter would be compared against an unsigned size
    // and overflow on buffers larger than INT_MAX.
    for (size_t i = 0; i < size; ++i) {
        const uint8_t ch = data[i];

        if (pending > 0) {
            // Inside a sequence: only 10xxxxxx is acceptable.
            if ((ch & 0xC0) != 0x80) {
                return Encode::ANSI;
            }
            --pending;
            continue;
        }

        if (ch < 0x80) {
            continue;  // plain ASCII byte, nothing to track
        }

        sawMultiByte = true;
        if (ch >= 0xFE) {
            // 0xFE and 0xFF never occur in UTF-8; previously they were mistaken
            // for the lead byte of a 5-byte sequence.
            return Encode::ANSI;
        } else if (ch >= 0xFC) {
            pending = 5;  // legacy 6-byte form (not part of RFC 3629, kept for compatibility)
        } else if (ch >= 0xF8) {
            pending = 4;  // legacy 5-byte form (not part of RFC 3629, kept for compatibility)
        } else if (ch >= 0xF0) {
            pending = 3;
        } else if (ch >= 0xE0) {
            pending = 2;
        } else if (ch >= 0xC0) {
            pending = 1;
        } else {
            // 0x80..0xBF: a continuation byte with no lead byte before it.
            return Encode::ANSI;
        }
    }

    // A truncated trailing sequence, or no multi-byte content at all, is not UTF-8.
    if (pending > 0 || !sawMultiByte) {
        return Encode::ANSI;
    }
    return Encode::UTF8;
}

/// Detects the encoding of a raw byte buffer.
///
/// A leading byte-order mark decides the result when present; otherwise the content
/// is handed to IsUtf8Data() for structural inspection.
///
/// @param data  Pointer to the bytes to inspect; not dereferenced when @p size is 0.
/// @param size  Number of bytes available at @p data.
/// @return The detected Encode value.
static inline Encode DetectEncode(const uint8_t* data, size_t size)
{
    // `>=` rather than `>`: a buffer consisting of nothing but the BOM must still
    // be recognised.
    if (size >= 2 && data[0] == 0xFF && data[1] == 0xFE) {
        return Encode::UTF16_LE;
    }
    if (size >= 2 && data[0] == 0xFE && data[1] == 0xFF) {
        return Encode::UTF16_BE;
    }
    if (size >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
        return Encode::UTF8_BOM;
    }
    return IsUtf8Data(data, size);
}
呼叫例子:
// Load the file as raw bytes. FILE_READER and `sv` are defined elsewhere;
// NOTE(review): `s` is assumed to be a std::string-like byte container — confirm.
auto s = FILE_READER(sv.begin()->c_str(), std::ios::binary);

// Builds a wide string from the UTF-16 payload that follows the 2-byte BOM in `s`.
// Code units are assembled byte by byte so that:
//   * big-endian input is actually byte-swapped (previously the BE branch was a
//     copy of the LE branch and produced garbled text), and
//   * the byte buffer is never reinterpreted as `const wchar_t*`, which was an
//     alignment/aliasing hazard and assumed sizeof(wchar_t) == 2.
// A trailing odd byte is ignored, as before.
const auto decodeUtf16AfterBom = [&s](bool bigEndian) {
    std::wstring wide;
    wide.reserve(s.size() / 2);
    for (size_t i = 2; i + 1 < s.size(); i += 2) {
        const unsigned first = static_cast<unsigned char>(s[i]);
        const unsigned second = static_cast<unsigned char>(s[i + 1]);
        const unsigned unit = bigEndian ? ((first << 8) | second)
                                        : ((second << 8) | first);
        wide.push_back(static_cast<wchar_t>(unit));
    }
    return wide;
};

// Normalise every recognised encoding through StringConvertUtils::WToA.
// NOTE(review): WToA / UTF8ToW are project helpers; presumably wide->ANSI and
// UTF-8->wide conversions — confirm against their definitions.
switch (DetectEncode(reinterpret_cast<const uint8_t*>(s.data()), s.size())) {
case ANSI:
    // Left untouched.
    break;
case UTF16_LE:
    s = StringConvertUtils::Instance()->WToA(decodeUtf16AfterBom(false));
    break;
case UTF16_BE:
    s = StringConvertUtils::Instance()->WToA(decodeUtf16AfterBom(true));
    break;
case UTF8_BOM:
    // Drop the 3-byte BOM (EF BB BF) in one operation, then convert like plain UTF-8.
    s.erase(s.begin(), s.begin() + 3);
    s = StringConvertUtils::Instance()->WToA(StringConvertUtils::Instance()->UTF8ToW(s));
    break;
case UTF8:
    s = StringConvertUtils::Instance()->WToA(StringConvertUtils::Instance()->UTF8ToW(s));
    break;
default:
    break;
}