• C语言 检测一个文本文件的编码是否为utf-8


    /*
        filename: isutf8.c
        Time:     2016-12-9 20:27
        Author:   Albert Wang
        email:    albertofwb@gmail.com
        Function: detect whether a text file's encoding is utf-8 format
    */
    
    #include <stdio.h>
    #include <stdlib.h>  // exit()
    #include <io.h>  // _access() detect a file's existence
    
    #define True  1
    #define False 0
    
    typedef char Bool;
    typedef unsigned char Uchar;
    
    /*
     * Read exactly FileSize bytes from the start of FileName into buf.
     *
     * FileName: path of the file to read (opened in binary mode).
     * buf:      destination buffer; the caller must provide at least
     *           FileSize bytes of storage.
     * FileSize: number of bytes to read.
     *
     * Returns 0 when all FileSize bytes were read, -1 when the file could
     * not be opened or fewer than FileSize bytes could be read.
     */
    int DumpFromFile(const char *FileName, char *buf, size_t FileSize)
    {
        FILE   *fp;
        size_t  got = 0;

        if ((fp = fopen(FileName, "rb")) == NULL)
        {
            return -1;
        }

        // Skip fread entirely for a zero-length request so that buf is
        // never touched (it may legitimately be NULL in that case).
        if (FileSize > 0)
        {
            got = fread(buf, 1, FileSize, fp);
        }
        fclose(fp);

        // A short read means buf is only partially filled: report failure
        // rather than letting the caller scan uninitialised bytes.
        return (got == FileSize) ? 0 : -1;
    }
    
    
    /*
     * Determine the size in bytes of FileName by seeking to its end.
     *
     * FileName: path of the file to measure (opened in binary mode).
     * FileSize: output; receives the size on success and is left
     *           untouched on failure.
     *
     * Returns 0 on success, -1 when the file cannot be opened or when
     * fseek/ftell report an error.
     */
    int GetFileSize(const char *FileName, size_t *FileSize)
    {
        FILE *fp;
        long  endPos = -1;
        int   rc = -1;

        if ((fp = fopen(FileName, "rb")) == NULL)
        {
            return -1;
        }

        if (fseek(fp, 0, SEEK_END) == 0)
        {
            endPos = ftell(fp);
        }

        // ftell returns -1L on error; converting that to size_t would
        // yield a huge bogus size, so only accept non-negative offsets.
        if (endPos >= 0)
        {
            *FileSize = (size_t)endPos;
            rc = 0;
        }

        fclose(fp);

        return rc;
    }
    
    Bool IsUtf8(const char* FileName)
    {
        FILE *fp = NULL;
        size_t FileSize = 0;
        char *fileBuf = NULL;
    
    
        GetFileSize(FileName, &FileSize);
        fileBuf = (char *)malloc(FileSize);
        DumpFromFile(FileName, fileBuf, FileSize);
    
        size_t i = 0;
        Bool ret = True;
    
        for ( ; ret && (i < FileSize); i++)
        {
            Uchar hexchar = fileBuf[i];
            // ignore ascii code
            if (!(hexchar & 0x80))
            {
                continue;
            }
    
            // calculate how many serial "1"
            int   BitOneCount = 0;
            Uchar num = hexchar;
            while (num & 0x80)
            {
                if (num & 0x80)
                {
                    BitOneCount += 1;
                }
                num <<= 1;
            }
    
            BitOneCount -= 1;
            while (BitOneCount > 0)
            {
                i += 1;
                num = fileBuf[i];   // num suppose to be 10xx xxxx
                num >>= 6;            // num = 0000 0010
                if (2 != num)
                {
                    ret = False;
                    //printf("i = %d num = %d hexchar = 0x%x BitOneCount= %d
    ", i, num, hexchar, BitOneCount);
                    break;
                }
                BitOneCount -= 1;
            }
    
        //end for
        }
    
    
        free(fileBuf);
        return ret;
    }
    
    /*
     * Command-line entry point: isutf8 <FileName>
     *
     * Prints "[<FileName>] True" or "[<FileName>] False" depending on the
     * result of IsUtf8(). Exits with status 1 when the argument count is
     * wrong or the file does not exist, 0 otherwise.
     */
    int main(int argc, char *argv[])
    {
        if (argc != 2)
        {
            printf("Usage: %s <FileName>\n", argv[0]);
            return 1;
        }

        const char *FileName = argv[1];
        // Indexed by the boolean result: 0 -> "False", 1 -> "True".
        const char *result[] = {
            "False", "True"
        };

        // Mode 0 asks _access() to test for existence only.
        if (-1 == _access(FileName, 0))
        {
            printf("%s not exists!\n", FileName);
            return 1;
        }

        // Bool is a plain char; normalise to 0/1 before using it as an index.
        printf("[%s] %s\n", FileName, result[IsUtf8(FileName) ? 1 : 0]);

        return 0;
    }
    
    /*
        参考链接: http://www.ruanyifeng.com/blog/2007/10/ascii_unicode_and_utf-8.html
    */

    运行结果

    使用 winhex 以utf8 的编码查看样本文件:

    文件

  • 相关阅读:
    Spring创建对象的原理
    java.io.WriteAbortedException异常
    在servlet中返回json数据
    Java中导入导出Excel -- POI技术
    Java文件下载
    MySql 分页关键字(limit)
    从dao层查出的数据到页面时数值都是零的异常
    注解
    事务的四大特性
    Java-事务管理
  • 原文地址:https://www.cnblogs.com/albertofwb/p/6151484.html
Copyright © 2020-2023  润新知