• C语言 检测一个文本文件的编码是否为utf-8


    /*
        filename: isutf8.c
        Time:     2016-12-9 20:27
        Author:   Albert Wang
        email:    albertofwb@gmail.com
        Function: detect whether a text file is encoded as UTF-8
    */
    
    #include <stdio.h>
    #include <stdlib.h>  // exit()
    #include <io.h>  // _access() detect a file's existence
    
    /* Boolean constants; used as the values of the Bool type declared below. */
    #define True  1
    #define False 0
    
    /* Bool holds True or False; it is a char, so it can also index a 2-element table. */
    typedef char Bool;
    /* Uchar is an unsigned byte, used when examining raw file bytes bit by bit. */
    typedef unsigned char Uchar;
    
    /*
        Read exactly FileSize bytes from the start of FileName into buf.

        FileName: path of the file to read (opened in binary mode).
        buf:      destination buffer; must have room for FileSize bytes.
        FileSize: number of bytes the caller expects to read.

        Returns 0 when all FileSize bytes were read, -1 when the file cannot
        be opened or fewer than FileSize bytes could be read.
    */
    int DumpFromFile(const char *FileName, char *buf, size_t FileSize)
    {
        FILE   *fp;
        size_t  bytesRead = 0;

        if ((fp = fopen(FileName, "rb")) == NULL)
        {
            return -1;
        }

        // Nothing to read for an empty request; also avoids touching buf,
        // which may legitimately be NULL when FileSize is 0.
        if (FileSize > 0)
        {
            bytesRead = fread(buf, 1, FileSize, fp);
        }
        fclose(fp);

        // A short read would leave part of buf uninitialised: report it.
        if (bytesRead != FileSize)
        {
            return -1;
        }

        return 0;
    }
    
    
    /*
        Determine the size in bytes of FileName.

        FileName: path of the file to measure (opened in binary mode).
        FileSize: out-parameter; written only on success.

        Returns 0 on success, -1 when the file cannot be opened or its
        end position cannot be determined.
    */
    int GetFileSize(const char *FileName, size_t *FileSize)
    {
        FILE *fp;
        long  endPos = -1L;

        if ((fp = fopen(FileName, "rb")) == NULL)
        {
            return -1;
        }

        // Only trust ftell() when the seek itself succeeded.
        if (fseek(fp, 0, SEEK_END) == 0)
        {
            endPos = ftell(fp);
        }
        fclose(fp);

        // ftell() reports failure as -1L; converting that to size_t would
        // yield an enormous bogus size, so reject it here.
        if (endPos < 0)
        {
            return -1;
        }

        *FileSize = (size_t)endPos;
        return 0;
    }
    
    Bool IsUtf8(const char* FileName)
    {
        FILE *fp = NULL;
        size_t FileSize = 0;
        char *fileBuf = NULL;
    
    
        GetFileSize(FileName, &FileSize);
        fileBuf = (char *)malloc(FileSize);
        DumpFromFile(FileName, fileBuf, FileSize);
    
        size_t i = 0;
        Bool ret = True;
    
        for ( ; ret && (i < FileSize); i++)
        {
            Uchar hexchar = fileBuf[i];
            // ignore ascii code
            if (!(hexchar & 0x80))
            {
                continue;
            }
    
            // calculate how many serial "1"
            int   BitOneCount = 0;
            Uchar num = hexchar;
            while (num & 0x80)
            {
                if (num & 0x80)
                {
                    BitOneCount += 1;
                }
                num <<= 1;
            }
    
            BitOneCount -= 1;
            while (BitOneCount > 0)
            {
                i += 1;
                num = fileBuf[i];   // num suppose to be 10xx xxxx
                num >>= 6;            // num = 0000 0010
                if (2 != num)
                {
                    ret = False;
                    //printf("i = %d num = %d hexchar = 0x%x BitOneCount= %d
    ", i, num, hexchar, BitOneCount);
                    break;
                }
                BitOneCount -= 1;
            }
    
        //end for
        }
    
    
        free(fileBuf);
        return ret;
    }
    
    /*
        Entry point: isutf8 <FileName>

        Prints "[<FileName>] True" or "[<FileName>] False" depending on the
        result of IsUtf8(). Exits with status 1 when the argument count is
        wrong or the file does not exist, 0 otherwise.
    */
    int main(int argc, char *argv[])
    {
        if (argc != 2)
        {
            printf("Usage: %s <FileName>\n", argv[0]);
            exit(1);
        }

        const char *FileName = argv[1];
        // Indexed by the boolean result: 0 -> "False", 1 -> "True".
        const char *result[] = {
            "False", "True"
        };

        // Mode 0 asks _access() to test for existence only.
        if (-1 == _access(FileName, 0))
        {
            printf("%s not exists!\n", FileName);
            exit(1);
        }

        // Normalise to 0/1 so any non-zero Bool still indexes inside the table.
        printf("[%s] %s\n", FileName, result[IsUtf8(FileName) ? 1 : 0]);

        return 0;
    }
    
    /*
        Reference: http://www.ruanyifeng.com/blog/2007/10/ascii_unicode_and_utf-8.html
    */

    运行结果

    使用 winhex 以utf8 的编码查看样本文件:

    文件

  • 相关阅读:
    谷歌浏览器插件开发Tutorial: Getting Started (Hello, World!) 教程:准备开始(你好,世界!)
    Android ViewPager多页面滑动切换以及动画效果
    4.4 我同意条款—CheckBox的isCheck属性
    4.2设计具有背景图的按钮—ImageButton的焦点及事件处理
    【文件打开】浏览打开窗口
    【原创】PE检测工具
    emu8086注册算法分析及KeyGen实现
    学破解 <一> PE格式之MSDOS MZ header
    学破解 <二> PE格式之IMAGE_NT_HEADERS
    反虚拟机程序测试
  • 原文地址:https://www.cnblogs.com/albertofwb/p/6151484.html
Copyright © 2020-2023  润新知