• 字符编码


    开发中经常需要处理汉字编码问题,常见的几种编码如下:

    • UTF-8:   常用汉字3字节一个字符(ASCII字符1字节)
    • UNICODE(UTF-16): 常用汉字2字节一个字符
    • GB2312:  汉字2字节一个字符(ASCII字符1字节)

    例子:
    “你”字的UTF-8编码: E4 BD A0        11100100 10111101 10100000
    “你”的Unicode编码: 4F 60            01001111 01100000

    按照UTF-8的编码规则,分解如下:xxxx0100 xx111101 xx100000,把除了x之外的数字拼接在一起,就变成“你”的Unicode编码了。
    注意UTF-8首字节最前面的3个1(即1110),表示该字符的UTF-8编码由3个字节构成;后续每个字节都以10开头。
    多字节字符经过UTF-8编码之后,每个字节的最高位始终为1,因此不会与ASCII范围内的敏感字符(如'\0'、'/'、'\\')冲突。

     /// Stateless helpers for converting Chinese text between GB2312 (double-byte),
     /// UTF-16 code units (wchar_t) and UTF-8.
     ///
     /// The single-character routines convert exactly one character per call; the
     /// string routines walk a byte buffer and convert it character by character.
     class CChineseEncode{
     public:
         /// Decodes one UTF-8 sequence starting at pText into *pOut.
         /// NOTE(review): the definition is not part of this listing; callers here
         /// assume it consumes a 3-byte sequence -- confirm against the implementation.
         static void UTF8_To_Unicode(wchar_t *pOut, char *pText);

         /// Encodes the code unit at *pText as a UTF-8 sequence of 3 bytes written
         /// to pOut (no terminating NUL).
         static void Unicode_To_UTF8(char *pOut, wchar_t *pText);

         /// Converts one UTF-16 code unit to its double-byte representation in pOut
         /// (at most 2 bytes, no terminating NUL).
         static void Unicode_To_GB2312(char *pOut, wchar_t uData);

         /// Converts the 2-byte character at gbBuffer to one UTF-16 code unit in *pOut.
         static void GB2312_To_Unicode(wchar_t *pOut, char *gbBuffer);

         /// Converts pLen bytes of GB2312 text at pText to UTF-8 and stores it in pOut.
         static void GB2312_To_UTF8(std::string& pOut, char *pText, int pLen);

         /// Converts pLen bytes of UTF-8 text at pText to GB2312 and stores it in pOut.
         /// This declaration was missing although the member is defined out of class.
         static void UTF8_To_GB2312(std::string& pOut, char *pText, int pLen);
     };
     9 
    10 void CChineseEncode::Unicode_To_UTF8(char *pOut, wchar_t *pText)
    11 {
    12     char *pchar = (char *)pText;
    13     pOut[0] = (0xE0 | ((pchar[1] & 0xF0) >> 4));
    14     pOut[1] = (0xE0 | ((pchar[1] & 0xF0) << 2)) + ((pchar[0] & 0xc0) >>6);
    15     pOut[1] = (0xE0 | (pchar[0] & 0x3F));
    16     return ;
    17 }
    18 
    19 void CChineseEncode::Unicode_To_GB2312(char *pOut, wchar_t uData)
    20 {
    21     WideCharToMultiByte(CP_ACP, NULL, &uData, 1, pOut, sizeof(wchar_t), NULL, NULL);
    22     return ;
    23 }
    24 
    25 void CChineseEncode::GB2312_To_Unicode(wchar_t *pOut, char *gbBuffer)
    26 {
    27     MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, gbBuffer, 2, pOut, 1);
    28     return ;
    29 }
    30 
    31 void CChineseEncode::GB2312_To_UTF8(string &pOut, char *pText, int pLen)
    32 {
    33     char buf[4];
    34     int nLength = pLen * 3;
    35     char *rst = new char[nLength];
    36     
    37     memset(buf, 0, 4);
    38     memset(rst, 0, nLength);
    39     
    40     int i = 0, j = 0;
    41     while(i < pLen)
    42     {
    43         if( *(pText + i) >= 0)
    44             rst[j++] = pText[i++];
    45         else
    46         {
    47             wchar_t pbuffer;
    48             GB2312_To_Unicode(&pbuffer, pText + i);
    49             Unicode short int tmp = 0;
    50             tmp = rst[j] = buf[0];
    51             tmp = rst[j+1] = buf[1];
    52             tmp = rst[j+2] = buf[2];
    53             j += 3;
    54             i += 2;
    55         }
    56     }
    57     rst[j] = '';
    58     
    59     pOut = rst;
    60     delete[] rst;
    61     return ;
    62 }
    63 
    64 void CChineseEncode::UTF8_To_GB2312(string &pOut, char *pText, int pLen)
    65 {
    66     char *newbuf = new char[pLen];
    67     char Ctemp[4];
    68     memset(Ctemp, 0, 4);
    69     int i = 0, j = 0;
    70     
    71     while(i < pLen)
    72     {
    73         if(pText > 0)
    74             newBuf[j++] = pText[i++];
    75         else
    76         {
    77             WCHAR Wtemp;
    78             UTF8_To_Unicode(&Wtemp, pText + i);
    79             Unicode_To_GB2312(Ctemp, Wtemp);
    80             newBuf[j] = Ctemp[0];
    81             newBuf[j+1] = Ctemp[1];
    82             
    83             i+=3;
    84             j+=2;
    85         }
    86     }
    87     
    88     newBuf[j] = '';
    89     pOut = newBuf;
    90     delete[] newBuf;
    91     return ;
    92 }
  • 相关阅读:
    mac下通过复制启动两个tomcat
    搭建一个redis集群
    ubantu系统下永久修改主机名
    民宿项目知识_截取最后一个逗号
    民宿项目知识_string判断是否为空
    民宿项目知识_enum
    民宿项目中的知识点_动态删除tr
    笔记:迁移来自xinlang的笔记
    SVN使用笔记
    iOS性能优化笔记
  • 原文地址:https://www.cnblogs.com/foundwant/p/3313666.html
Copyright © 2020-2023  润新知