/* UTF-8 valid format list: 0xxxxxxx 110xxxxx 10xxxxxx 1110xxxx 10xxxxxx 10xxxxxx 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ char *filter_none_utf8_chars(char *src, int *len) { unsigned char *p; unsigned char *pSub; unsigned char *pStrEnd; unsigned char *pCharEnd; int bytes; unsigned char *filtered; unsigned char *pDest; unsigned char *pInvalidCharStart; pStrEnd = (unsigned char *)src + (*len); p = (unsigned char *)src; pInvalidCharStart = NULL; while (p < pStrEnd) { if (*p < 0x80) { p++; continue; } if ((*p & 0xE0) == 0xC0) //110xxxxx { bytes = 1; } else if ((*p & 0xF0) == 0xE0) //1110xxxx { bytes = 2; } else if ((*p & 0xF8) == 0xF0) //11110xxx { bytes = 3; } else if ((*p & 0xFC) == 0xF8) //111110xx { bytes = 4; } else if ((*p & 0xFE) == 0xFC) //1111110x { bytes = 5; } else { pInvalidCharStart = p; break; } p++; pCharEnd = p + bytes; if (pCharEnd > pStrEnd) { pInvalidCharStart = p - 1; break; } for (; p<pCharEnd; p++) { if ((*p & 0xC0) != 0x80) { break; } } if (p != pCharEnd) { pInvalidCharStart = pCharEnd - (bytes + 1); break; } } if (pInvalidCharStart == NULL) //all chars are valid { return src; } filtered = (unsigned char *)malloc(sizeof(char) * (*len)); if (filtered == NULL) { *len = 0; *src = '