/* C: determine whether a string is UTF-8 encoded and count its characters. */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/****************************************************************************
Unicode符號範圍 | UTF-8編碼方式
(十六進位制) | (二進位制)
0000 0000-0000 007F:0xxxxxxx
0000 0080-0000 07FF:110xxxxx 10xxxxxx
0000 0800-0000 FFFF:1110xxxx 10xxxxxx 10xxxxxx
0001 0000-001F FFFF:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
0020 0000-03FF FFFF:111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
0400 0000-7FFF FFFF:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
**************************************************************************/
/*
 * Lookup table indexed by a byte value (0x00-0xFF). Each entry is the
 * length, in bytes, of the UTF-8 sequence that the byte would introduce:
 *   0x00-0xBF -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3,
 *   0xF0-0xF7 -> 4, 0xF8-0xFB -> 5, 0xFC-0xFD -> 6, 0xFE-0xFF -> 1.
 * Continuation bytes (0x80-0xBF) and 0xFE/0xFF map to 1, not to an error
 * value, so a lookup never signals an invalid byte.
 */
unsigned char utf8_look_for_table[] =
{
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/*
 * Look up the sequence length for byte x.
 * The argument is converted to unsigned char first so that a plain `char`
 * holding a byte >= 0x80 cannot turn into a negative array index on
 * platforms where `char` is signed.
 */
#define UTFLEN(x) (utf8_look_for_table[(unsigned char)(x)])
/*
 * Return the total number of bytes in the UTF-8 sequence introduced by the
 * lead byte `ch`, or 0 when `ch` cannot start a sequence.
 *
 *   0xxxxxxx -> 1      11110xxx -> 4
 *   110xxxxx -> 2      111110xx -> 5
 *   1110xxxx -> 3      1111110x -> 6
 *
 * Continuation bytes (10xxxxxx) and the bytes 0xFE/0xFF match none of the
 * patterns above and therefore yield 0.
 *
 * This is an ordinary external function rather than a bare `inline` one:
 * in C99/C11 an `inline` definition without `extern` or `static` emits no
 * external definition, so the program can fail to link whenever the
 * compiler chooses not to inline a call.
 */
int
GetUtf8charByteNum(unsigned char ch)
{
    if ((ch & 0x80) == 0x00)
        return 1;
    if ((ch & 0xE0) == 0xC0)
        return 2;
    if ((ch & 0xF0) == 0xE0)
        return 3;
    if ((ch & 0xF8) == 0xF0)
        return 4;
    if ((ch & 0xFC) == 0xF8)
        return 5;
    if ((ch & 0xFE) == 0xFC)
        return 6;

    /* 0x80-0xBF (continuation bytes) and 0xFE/0xFF are not lead bytes. */
    return 0;
}
/*
 * Check whether a NUL-terminated string has the byte structure of UTF-8.
 *
 * Every byte that starts a character must be accepted by
 * GetUtf8charByteNum(), and must be followed by exactly the announced
 * number of continuation bytes of the form 10xxxxxx. Only this structure
 * is examined.
 *
 * Returns 1 when the whole string passes (an empty string passes), and 0
 * when `str` is NULL, a lead byte is rejected, a continuation byte is
 * malformed, or the string ends in the middle of a sequence.
 */
int
IsUtf8Format(const char *str)
{
    const unsigned char *cur;
    int pending = 0;    /* bytes of the current character still to be read */

    if (str == NULL)
        return 0;

    for (cur = (const unsigned char *)str; *cur != '\0'; ++cur) {
        if (pending > 0) {
            /* Inside a multi-byte character: byte must be 10xxxxxx. */
            if ((*cur & 0xC0) != 0x80)
                return 0;
            --pending;
            continue;
        }

        /* Start of a character: the lead byte fixes the sequence length. */
        pending = GetUtf8charByteNum(*cur);
        if (pending == 0)
            return 0;
        --pending;      /* the lead byte itself has been consumed */
    }

    /* Leftover expected bytes mean the final character was cut short. */
    return (pending > 0) ? 0 : 1;
}
/*
 * Count the characters in a NUL-terminated UTF-8 string.
 *
 * The string is walked one character at a time, using
 * GetUtf8charByteNum() on each lead byte to learn how many bytes to skip.
 * Continuation bytes are skipped without being inspected.
 *
 * Returns the number of characters, or 0 when `str` is NULL, the string is
 * empty, a lead byte is rejected by GetUtf8charByteNum(), or the final
 * sequence announces more bytes than remain in the string.
 */
int
GetUtf8Length(char *str)
{
    size_t total;       /* length of the string in bytes */
    size_t offset = 0;  /* byte position of the current lead byte */
    int count = 0;      /* characters seen so far */

    if (NULL == str)
        return 0;

    total = strlen(str);
    while (offset < total) {
        int byteNum = GetUtf8charByteNum((unsigned char)str[offset]);

        if (byteNum == 0)
            return 0;

        /*
         * Never step beyond the terminator: a truncated trailing sequence
         * would otherwise move the cursor outside the buffer and the next
         * read would be out of bounds.
         */
        if ((size_t)byteNum > total - offset)
            return 0;

        offset += (size_t)byteNum;
        count++;
    }
    return count;
}
/*
 * Compute the number of billable SMS units for a text of `len` characters.
 *
 *   len <= 0          -> 0
 *   1  <= len <= 70   -> 1
 *   71 <= len <= 500  -> ceil(len / 67)
 *   len > 500         -> 1
 *
 * The previous test `!len % 67` parsed as `(!len) % 67`, which is always 0
 * in the 71..500 range, so exact multiples of 67 were charged one unit too
 * many; ceiling division gives the intended result.
 *
 * NOTE(review): 70 and 67 presumably are the single-message and
 * per-segment character limits - confirm. Lengths above 500 yield 1, as
 * in the original code; verify that this is intended.
 */
int
GetChargeNum(int len)
{
    enum {
        SINGLE_MSG_MAX = 70,    /* longest text charged as one unit */
        MULTI_MSG_MAX  = 500,   /* upper bound of the segmented range */
        SEGMENT_CHARS  = 67     /* characters per unit when segmented */
    };

    if (len <= 0)
        return 0;

    if (len > SINGLE_MSG_MAX && len <= MULTI_MSG_MAX)
        return (len + SEGMENT_CHARS - 1) / SEGMENT_CHARS;

    return 1;
}
/*
 * Command-line driver: takes the text to analyse as argv[1].
 *
 * Echoes the text, verifies it with IsUtf8Format(), then prints its
 * character count (GetUtf8Length) and its charge count (GetChargeNum).
 *
 * Returns 1 when all three steps succeed and 0 in every other case
 * (missing argument, invalid text, zero length, zero charge count).
 * NOTE(review): this is the reverse of the usual "0 means success" exit
 * status convention - confirm before relying on it in scripts.
 */
int
main(int argc, char **argv)
{
    char *text;
    int charCount;
    int chargeCount;

    if (argc < 2)
        return 0;

    text = argv[1];
    printf("%s\n", text);

    if (IsUtf8Format(text) == 0) {
        printf("the text is not the Format of utf8\n");
        return 0;
    }

    charCount = GetUtf8Length(text);
    if (charCount == 0)
        return 0;
    printf("the length of text: %d\n", charCount);

    chargeCount = GetChargeNum(charCount);
    if (chargeCount == 0)
        return 0;
    printf("the chargeNumber of sms: %d\n", chargeCount);

    return 1;
}