/*
 * filename: isutf8.c
 * Time:     2016-12-9 20:27
 * Author:   Albert Wang
 * email:    albertofwb@gmail.com
 * Function: detect whether a text file's encoding is utf-8 format
 *
 * Define ISUTF8_NO_MAIN before compiling to leave out main(), e.g. when the
 * checker is embedded in another program or driven by a test harness.
 */
#include <stdio.h>
#include <stdlib.h>     /* malloc(), free(), exit() */

#ifdef _WIN32
#include <io.h>         /* _access(): detect a file's existence */
#else
#include <unistd.h>     /* access(): detect a file's existence */
#endif

#define True    1
#define False   0

typedef char Bool;
typedef unsigned char Uchar;

/*
 * Read exactly FileSize bytes from the start of FileName into buf.
 *
 * buf must have room for at least FileSize bytes.
 * Returns 0 on success, -1 if an argument is invalid, the file cannot be
 * opened, or fewer than FileSize bytes could be read.
 */
int DumpFromFile(const char *FileName, char *buf, size_t FileSize)
{
    FILE *fp;
    int rc = 0;

    if (FileName == NULL || (buf == NULL && FileSize > 0)) {
        return -1;
    }
    if ((fp = fopen(FileName, "rb")) == NULL) {
        return -1;
    }
    /* A short read means the file shrank or an I/O error occurred. */
    if (FileSize > 0 && fread(buf, 1, FileSize, fp) != FileSize) {
        rc = -1;
    }
    fclose(fp);
    return rc;
}

/*
 * Store the size in bytes of FileName in *FileSize.
 *
 * Returns 0 on success, -1 if an argument is NULL, the file cannot be
 * opened, or its size cannot be determined; *FileSize is left untouched
 * on failure.
 */
int GetFileSize(const char *FileName, size_t *FileSize)
{
    FILE *fp;
    long end;
    int rc = -1;

    if (FileName == NULL || FileSize == NULL) {
        return -1;
    }
    if ((fp = fopen(FileName, "rb")) == NULL) {
        return -1;
    }
    if (fseek(fp, 0, SEEK_END) == 0) {
        end = ftell(fp);
        /* ftell() reports failure as -1L; never store that in a size_t. */
        if (end >= 0) {
            *FileSize = (size_t)end;
            rc = 0;
        }
    }
    fclose(fp);
    return rc;
}

/*
 * Check that buf[0..len) is a well-formed UTF-8 byte sequence (RFC 3629).
 *
 * Rejects stray continuation bytes, truncated sequences, lead bytes that
 * can never appear in UTF-8 (0xC0, 0xC1, 0xF5..0xFF), overlong encodings,
 * UTF-16 surrogates (U+D800..U+DFFF) and code points above U+10FFFF.
 * An empty buffer is valid.
 */
static Bool IsUtf8Buffer(const Uchar *buf, size_t len)
{
    size_t i = 0;

    if (buf == NULL && len > 0) {
        return False;
    }

    while (i < len) {
        Uchar lead = buf[i];
        size_t trail;       /* continuation bytes that must follow lead */
        Uchar lo = 0x80;    /* allowed range of the FIRST continuation byte; */
        Uchar hi = 0xBF;    /* narrowed below for the special lead bytes     */
        size_t k;

        if (lead < 0x80) {  /* plain ASCII */
            i += 1;
            continue;
        }

        if (lead >= 0xC2 && lead <= 0xDF) {
            trail = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            trail = 2;
            if (lead == 0xE0) {
                lo = 0xA0;  /* below that is an overlong encoding */
            } else if (lead == 0xED) {
                hi = 0x9F;  /* above that is a UTF-16 surrogate */
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            trail = 3;
            if (lead == 0xF0) {
                lo = 0x90;  /* below that is an overlong encoding */
            } else if (lead == 0xF4) {
                hi = 0x8F;  /* above that exceeds U+10FFFF */
            }
        } else {
            /* 0x80..0xBF (stray continuation), 0xC0, 0xC1, 0xF5..0xFF */
            return False;
        }

        /* i < len holds here, so len - i - 1 cannot wrap around. */
        if (len - i - 1 < trail) {
            return False;   /* sequence is cut off by the end of the data */
        }
        if (buf[i + 1] < lo || buf[i + 1] > hi) {
            return False;
        }
        for (k = 2; k <= trail; k++) {
            if ((buf[i + k] & 0xC0) != 0x80) {  /* must be 10xx xxxx */
                return False;
            }
        }
        i += trail + 1;
    }
    return True;
}

/*
 * Report whether the whole content of FileName is well-formed UTF-8.
 *
 * Returns True (1) for valid UTF-8, including an empty or pure-ASCII file,
 * and False (0) for invalid content or when the file cannot be read or
 * buffered in memory.
 */
Bool IsUtf8(const char *FileName)
{
    size_t FileSize = 0;
    char *fileBuf;
    Bool ret;

    if (GetFileSize(FileName, &FileSize) != 0) {
        return False;
    }
    if (FileSize == 0) {
        return True;    /* nothing to validate; also avoids malloc(0) */
    }

    fileBuf = malloc(FileSize);
    if (fileBuf == NULL) {
        return False;
    }

    if (DumpFromFile(FileName, fileBuf, FileSize) != 0) {
        ret = False;
    } else {
        ret = IsUtf8Buffer((const Uchar *)fileBuf, FileSize);
    }

    free(fileBuf);
    return ret;
}

#ifndef ISUTF8_NO_MAIN
/* Return non-zero when FileName exists, using the platform's access call. */
static int FileExists(const char *FileName)
{
#ifdef _WIN32
    return _access(FileName, 0) == 0;
#else
    return access(FileName, F_OK) == 0;
#endif
}

/* Usage: isutf8 <FileName> -- prints "[<FileName>] True" or "... False". */
int main(int argc, char *argv[])
{
    const char *const result[] = { "False", "True" };
    const char *FileName;

    if (argc != 2) {
        printf("Usage: %s <FileName> ", argv[0]);
        exit(1);
    }

    FileName = argv[1];
    if (!FileExists(FileName)) {
        printf("%s not exists! ", FileName);
        exit(1);
    }

    /* IsUtf8() only ever returns False (0) or True (1). */
    printf("[%s] %s ", FileName, result[IsUtf8(FileName) ? 1 : 0]);
    return 0;
}
#endif /* ISUTF8_NO_MAIN */

/*
 * Reference:
 * http://www.ruanyifeng.com/blog/2007/10/ascii_unicode_and_utf-8.html
 */
运行结果
使用 WinHex 以 UTF-8 编码查看样本文件:
文件