• 判断文件是否为UTF8编码(以前收集的)


      1        private bool CheckEncoding(string strFileName)
      2        {
      3            using (FileStream stream = new FileStream(strFileName, FileMode.Open))
      4            {
      5                byte[] bs = new byte[stream.Length];
      6                stream.Read(bs, 0, bs.Length);
      7                if (utf8_probability(bs) > 0return true;
      8                else return false;
      9
     10                /*
     11                if (stream != null && stream.Length >= 2)
     12                {     
     13                    //保存文件流的前4个字节
     14                    byte byte1 = 0;
     15                    byte byte2 = 0;
     16                    byte byte3 = 0;
     17                    byte byte4 = 0;
     18                    //保存当前Seek位置
     19                    long origPos = stream.Seek(0, SeekOrigin.Begin);
     20                    stream.Seek(0, SeekOrigin.Begin);
     21                    int nByte = stream.ReadByte();
     22                    byte1 = Convert.ToByte(nByte);
     23                    byte2 = Convert.ToByte(stream.ReadByte());
     24                    if (stream.Length >= 3)
     25                    {
     26                        byte3 = Convert.ToByte(stream.ReadByte());
     27                    }
     28                    if (stream.Length >= 4)
     29                    {
     30                        byte4 = Convert.ToByte(stream.ReadByte());
     31                    }
     32
     33                    //根据文件流的前4个字节判断Encoding
     34                    //Unicode {0xFF, 0xFE};
     35                    //BE-Unicode {0xFE, 0xFF};
     36                    //UTF8 = {0xEF, 0xBB, 0xBF};
     37                    if (byte1 == 0xFE && byte2 == 0xFF)//UnicodeBe
     38                    {
     39                        targetEncoding = Encoding.BigEndianUnicode;
     40                    }
     41                    if (byte1 == 0xFF && byte2 == 0xFE && byte3 != 0xFF)//Unicode
     42                    {
     43                        targetEncoding = Encoding.Unicode;
     44                    }
     45                    if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF)//UTF8
     46                    {
     47                        targetEncoding = Encoding.UTF8;
     48                    }
     49                    //恢复Seek位置       
     50                    stream.Seek(origPos, SeekOrigin.Begin);
     51                  
     52                }*/

     53            }

     54        }

     55        
     56        
     57        private int utf8_probability(byte[] rawtext)
     58        {
     59            int score = 0;
     60            int i, rawtextlen = 0;
     61            int goodbytes = 0, asciibytes = 0;
     62
     63            // Maybe also use UTF8 Byte Order Mark:  EF BB BF
     64
     65            // Check to see if characters fit into acceptable ranges
     66            rawtextlen = rawtext.Length;
     67            for (i = 0; i < rawtextlen; i++)
     68            {
     69                if ((rawtext[i] & (byte)0x7F== rawtext[i])
     70                {  // One byte
     71                    asciibytes++;
     72                    // Ignore ASCII, can throw off count
     73                }

     74                else
     75                {
     76                    int m_rawInt0 = Convert.ToInt16(rawtext[i]);
     77                    int m_rawInt1 = Convert.ToInt16(rawtext[i + 1]);
     78                    int m_rawInt2 = Convert.ToInt16(rawtext[i + 2]);
     79
     80                    if (256 - 64 <= m_rawInt0 && m_rawInt0 <= 256 - 33 && // Two bytes
     81                     i + 1 < rawtextlen &&
     82                     256 - 128 <= m_rawInt1 && m_rawInt1 <= 256 - 65)
     83                    {
     84                        goodbytes += 2;
     85                        i++;
     86                    }

     87                    else if (256 - 32 <= m_rawInt0 && m_rawInt0 <= 256 - 17 && // Three bytes
     88                     i + 2 < rawtextlen &&
     89                     256 - 128 <= m_rawInt1 && m_rawInt1 <= 256 - 65 &&
     90                     256 - 128 <= m_rawInt2 && m_rawInt2 <= 256 - 65)
     91                    {
     92                        goodbytes += 3;
     93                        i += 2;
     94                    }

     95                }

     96            }

     97
     98            if (asciibytes == rawtextlen) return 0; }
     99
    100            score = (int)(100 * ((float)goodbytes / (float)(rawtextlen - asciibytes)));
    101
    102            // If not above 98, reduce to zero to prevent coincidental matches
    103            // Allows for some (few) bad formed sequences
    104            if (score > 98)
    105            {
    106                return score;
    107            }

    108            else if (score > 95 && goodbytes > 30)
    109            {
    110                return score;
    111            }

    112            else
    113            {
    114                return 0;
    115            }

    116
    117        }
  • 相关阅读:
    fiddler 抓取 安卓模拟器 https包
    heidiSQL使用简介
    weblogic重启脚本
    svn命令在linux下的使用
    LVS之NAT和DR服务脚本
    LVS之NAT模型、DR模型配置和持久连接
    apache ab压力测试报错apr_socket_recv
    LVS负载均衡模型及算法概述
    Orcale11g单机安装与卸载
    IPC相关的命令
  • 原文地址:https://www.cnblogs.com/sxlfybb/p/803100.html
Copyright © 2020-2023  润新知