• C语言判断字符串是否为UTF-8编码,并计算字符个数


    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>
    
    /****************************************************************************
    Unicode符号范围 | UTF-8编码方式
        (十六进制) | (二进制)
    0000 0000-0000 007F:0xxxxxxx 
    0000 0080-0000 07FF:110xxxxx 10xxxxxx
    0000 0800-0000 FFFF:1110xxxx 10xxxxxx 10xxxxxx
    0001 0000-001F FFFF:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    0020 0000-03FF FFFF:111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    0400 0000-7FFF FFFF:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    **************************************************************************/
    
    /*
     * Lookup table indexed by a byte value (0x00-0xFF) giving a sequence length
     * in bytes. Unlike GetUtf8charByteNum(), this table never yields 0:
     * the entries for 0x80-0xBF and for 0xFE/0xFF are 1.
     */
    unsigned char utf8_look_for_table[] =
        {
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0x8F */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90-0x9F */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xA0-0xAF */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xB0-0xBF */
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF */
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */
            3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0-0xEF */
            4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1}; /* 0xF0-0xFF */
    
    /*
     * Table lookup by byte value. The table has 256 entries, so x must be in
     * 0..255; convert a plain char to unsigned char before passing it, because
     * a negative index would read before the start of the table.
     */
    #define UTFLEN(x) utf8_look_for_table[(x)]
    
    /*
     * Return the total byte length of the UTF-8 sequence introduced by the
     * lead byte `ch`, using the legacy 1-6 byte UTF-8 layout.
     *
     * Returns 1..6 for a byte that can start a sequence, and 0 for a byte that
     * cannot: continuation bytes (0x80-0xBF) and 0xFE/0xFF, which never occur
     * in UTF-8.
     *
     * Declared `static inline`: a plain `inline` definition in C99/C11 emits no
     * external symbol, so the program fails to link whenever the compiler
     * decides not to inline a call.
     */
    static inline int GetUtf8charByteNum(unsigned char ch)
    {
        if (ch < 0x80)
            return 1; /* 0xxxxxxx */
        if (ch < 0xC0)
            return 0; /* 10xxxxxx: continuation byte, not a lead byte */
        if (ch < 0xE0)
            return 2; /* 110xxxxx */
        if (ch < 0xF0)
            return 3; /* 1110xxxx */
        if (ch < 0xF8)
            return 4; /* 11110xxx */
        if (ch < 0xFC)
            return 5; /* 111110xx */
        if (ch < 0xFE)
            return 6; /* 1111110x */
    
        return 0; /* 0xFE / 0xFF are never valid */
    }
    
    /*
     * Check whether the NUL-terminated string `str` is structurally valid UTF-8.
     *
     * Every character must start with a byte accepted by GetUtf8charByteNum()
     * and be followed by exactly the announced number of continuation bytes
     * (bit pattern 10xxxxxx). Only this byte pattern is checked; the decoded
     * code point values are not inspected.
     *
     * Returns 1 if the string is well-formed (an empty string counts as
     * well-formed), 0 if `str` is NULL, contains an invalid lead or
     * continuation byte, or ends in the middle of a multi-byte sequence.
     */
    int IsUtf8Format(const char *str)
    {
        int pending = 0; /* continuation bytes still owed by the current character */
        const unsigned char *ptr = (const unsigned char *)str;
    
        if (NULL == str)
            return 0;
    
        for (; *ptr != '\0'; ptr++)
        {
            if (pending == 0)
            {
                /* Start of a character: the lead byte announces the total length. */
                int byteNum = GetUtf8charByteNum(*ptr);
    
                if (byteNum == 0)
                    return 0;
                pending = byteNum - 1;
            }
            else
            {
                /* Inside a multi-byte character: must look like 10xxxxxx. */
                if ((*ptr & 0xC0) != 0x80)
                    return 0;
                pending--;
            }
        }
    
        /* A non-zero count here means the last character was cut short. */
        return pending == 0;
    }
    
    /*
     * Count the characters (not bytes) in the NUL-terminated UTF-8 string `str`.
     *
     * The string is walked lead byte by lead byte, skipping the number of bytes
     * that GetUtf8charByteNum() reports for each one. Continuation bytes are not
     * validated here; call IsUtf8Format() first if the input is untrusted.
     *
     * Returns the character count, or 0 when `str` is NULL or empty, when a byte
     * that cannot start a character is found at a character boundary, or when
     * the final character announces more bytes than the string still holds.
     */
    int GetUtf8Length(char *str)
    {
        size_t remaining; /* bytes not yet consumed, excluding the terminator */
        int count = 0;
        const char *ptr = str;
    
        if (NULL == str)
            return 0;
    
        remaining = strlen(str);
        while (remaining > 0)
        {
            int byteNum = GetUtf8charByteNum((unsigned char)*ptr);
    
            /*
             * Reject a truncated trailing sequence instead of stepping the
             * pointer past the terminating NUL and reading out of bounds.
             */
            if (byteNum == 0 || (size_t)byteNum > remaining)
                return 0;
    
            ptr += byteNum;
            remaining -= (size_t)byteNum;
            count++;
        }
    
        return count;
    }
    
    /*
     * Compute the number of billable SMS segments for a text of `len` characters.
     *
     *   len <= 0          -> 0
     *   1 <= len <= 70    -> 1
     *   71 <= len <= 500  -> ceil(len / 67)
     *   len > 500         -> 1
     *
     * NOTE(review): returning 1 for len > 500 is carried over unchanged from the
     * original code; confirm whether texts that long should instead be rejected
     * or split into segments like the 71..500 range.
     */
    int GetChargeNum(int len)
    {
        if (len <= 0)
            return 0;
    
        /*
         * Round up to whole 67-character segments. The original test
         * `!len % 67` parsed as `(!len) % 67`, which is always 0 here, so exact
         * multiples of 67 were charged one segment too many.
         */
        if (len > 70 && len <= 500)
            return (len + 66) / 67;
    
        return 1;
    }
    
    /*
     * Command-line driver: takes the text to inspect as argv[1], echoes it,
     * checks that it is UTF-8, then prints its character count and the number
     * of SMS segments it would be charged as.
     *
     * Returns 1 when all three steps succeed and 0 otherwise (missing argument,
     * non-UTF-8 text, zero length, or zero charge number).
     *
     * NOTE(review): these exit codes are the reverse of the usual convention
     * (0 = success). They are kept as-is so existing callers see no change;
     * confirm before switching to EXIT_SUCCESS / EXIT_FAILURE.
     */
    int main(int argc, char **argv)
    {
        char *str;
        int len = 0;
        int num = 0;
    
        if (argc < 2)
            return 0;
    
        str = argv[1];
        printf("%s\n", str);
    
        if (!IsUtf8Format(str))
        {
            printf("the text is not the Format of utf8\n");
            return 0;
        }
    
        len = GetUtf8Length(str);
        if (len == 0)
            return 0;
        printf("the length of text: %d\n", len);
    
        num = GetChargeNum(len);
        if (num == 0)
            return 0;
        printf("the chargeNumber of sms: %d\n", num);
    
        return 1;
    }
    

      

    参考:

    http://blog.sina.com.cn/s/blog_62b2318d0101d7kb.html

    http://www.cnblogs.com/jiu0821/p/6371544.html

  • 相关阅读:
    观察是快速成长的一个牛逼技能
    linux下使用lftp的小结(转)
    关于升级cocos2d-x网络库来支持ipv6、https,以及socket怎么支持ipv6
    cocos2dx支持arm64
    android studio 命令行编译cocos 3.15.1 安卓工程
    认识Android.mk和Application.mk
    mac os x下Android Studio3.0 配置本地 Gradle
    图片转成base64编码
    集成pbc
    6、SpringMVC:结果跳转方式 和 数据提交时的处理
  • 原文地址:https://www.cnblogs.com/yaosj/p/6930319.html
Copyright © 2020-2023  润新知