• 利用偏移量快速定位数据内容


    本项目需要把数据存档为二进制文件,载入时只载入文件索引,再通过索引快速定位到数据内容,从而实现最小存储、最快速查找。下面的代码是初步实现;在此基础上扩展,还可以实现搜索引擎的关键字匹配度、权重、分词等效果,这是后话,这里先把最基础的"通过偏移量快速查找"分享一下。

    /// <summary>
        /// One entry of the index file: the key of a record together with the
        /// position and size of that record inside the data file.
        /// </summary>
        struct Token
        {
            /// <summary>
            /// Key under which the record is looked up.
            /// </summary>
            public string ID;

            /// <summary>
            /// Byte offset of the record, counted from the start of the stored content.
            /// </summary>
            public int Offset;

            /// <summary>
            /// Size of the record in bytes.
            /// </summary>
            public int Length;
        }
    

      

    /// <summary>
        /// Writes a small index file plus a data file, and looks records up in the
        /// data file by the byte offset stored in the index.
        /// </summary>
        /// <remarks>
        /// File formats (unchanged, so files written by earlier versions still load):
        /// "index.txt" is a sequence of BinaryWriter strings, each "id,offset,length";
        /// "data.txt" is one BinaryWriter string holding every record concatenated.
        /// Offsets are relative to the first content byte of that string, i.e. they
        /// do not include the length prefix BinaryWriter puts in front of it.
        /// </remarks>
        class Search
        {
            private const string IndexPath = "index.txt";
            private const string DataPath = "data.txt";

            // Number of sample records BuildFile generates.
            private const int RecordCount = 15;

            // BinaryWriter encodes a string length 7 bits per byte, so an Int32
            // length never needs more than 5 prefix bytes.
            private const int MaxLengthPrefixBytes = 5;

            // Accumulates the content of the data file while the index is being built.
            private static StringBuilder _mainContent = new StringBuilder();

            /// <summary>
            /// Generates the sample index file and data file, replacing any existing ones.
            /// </summary>
            public void BuildFile()
            {
                // The buffer is static: without this reset a second call would append
                // to the previous content and every offset written below would be wrong.
                _mainContent.Length = 0;

                // FileMode.Create truncates an existing file, so no separate delete is needed.
                using (FileStream indexFile = new FileStream(IndexPath, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
                using (BinaryWriter writer = new BinaryWriter(indexFile, Encoding.UTF8))
                {
                    Random random = new Random();
                    int offset = 0;

                    for (int i = 0; i < RecordCount; i++)
                    {
                        string id = i.ToString() + DateTime.Today.ToString("yyyyMMdd");
                        string record = id + "|test programe" + random.Next(10, 305000).ToString();

                        // Offsets and lengths are in UTF-8 bytes, not characters,
                        // because the reader seeks in the raw file.
                        int length = Encoding.UTF8.GetByteCount(record);

                        _mainContent.Append(record);
                        writer.Write(id + "," + offset + "," + length);

                        // Running total replaces re-encoding the whole buffer each pass.
                        offset += length;
                    }
                }

                using (FileStream dataFile = new FileStream(DataPath, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
                using (BinaryWriter writer = new BinaryWriter(dataFile, Encoding.UTF8))
                {
                    writer.Write(_mainContent.ToString());
                }
            }

            /// <summary>
            /// Loads the whole index file into memory.
            /// </summary>
            /// <returns>A dictionary mapping each record id to its index entry.</returns>
            public Dictionary<string, Token> GetTokenDic()
            {
                Dictionary<string, Token> dic = new Dictionary<string, Token>();

                using (FileStream indexFile = new FileStream(IndexPath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                using (BinaryReader reader = new BinaryReader(indexFile, Encoding.UTF8))
                {
                    // Read until the end of the file instead of assuming a fixed
                    // number of entries.
                    while (indexFile.Position < indexFile.Length)
                    {
                        string[] parts = reader.ReadString().Split(',');
                        if (parts.Length < 3)
                        {
                            // Not an "id,offset,length" entry; ignore it.
                            continue;
                        }

                        Token token = new Token();
                        token.ID = parts[0];
                        token.Offset = Convert.ToInt32(parts[1]);
                        token.Length = Convert.ToInt32(parts[2]);

                        dic.Add(token.ID, token);
                    }
                }

                return dic;
            }

            /// <summary>
            /// Looks <paramref name="key"/> up in the index, reads the matching record
            /// from the data file and writes it to the console.
            /// </summary>
            /// <param name="key">Record id as stored in the index file.</param>
            /// <remarks>
            /// When the key is not in the index an empty line is written, as before,
            /// but the data file is no longer opened.
            /// </remarks>
            public void ReadFile(string key)
            {
                Dictionary<string, Token> dic = GetTokenDic();

                Token token;
                if (!dic.TryGetValue(key, out token))
                {
                    Console.WriteLine(string.Empty);
                    return;
                }

                using (FileStream dataFile = new FileStream(DataPath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                using (BinaryReader reader = new BinaryReader(dataFile, Encoding.UTF8))
                {
                    // The content starts after the variable-size length prefix; a
                    // fixed "+ 2" is only right for content of 128 to 16383 bytes.
                    SkipLengthPrefix(dataFile);
                    dataFile.Seek(token.Offset, SeekOrigin.Current);

                    // ReadBytes keeps reading until the count is satisfied or the
                    // stream ends, unlike a single Stream.Read call.
                    byte[] payload = reader.ReadBytes(token.Length);
                    Console.WriteLine(Encoding.UTF8.GetString(payload));
                }
            }

            // Advances the stream past the 7-bit-encoded length that
            // BinaryWriter.Write(string) emits before the string bytes.
            private static void SkipLengthPrefix(Stream stream)
            {
                for (int i = 0; i < MaxLengthPrefixBytes; i++)
                {
                    int b = stream.ReadByte();

                    // A clear high bit marks the last prefix byte; -1 is end of stream.
                    if (b == -1 || (b & 0x80) == 0)
                    {
                        return;
                    }
                }
            }
        }
    

      

    static void Main(string[] args)
            {
                // BuildFile() is intentionally not called here, so index.txt and
                // data.txt must already exist; call search.BuildFile() first to
                // regenerate them.
                Search search = new Search();
                search.ReadFile("1420130825");

                // Wait for console input before returning.
                Console.Read();
            }
    

      

  • 相关阅读:
    接口的上溯造型——《Thinking in Java》随笔015
    数据库
    小结
    异常及String
    多态&接口
    继承&封装
    Java 类 对象 包
    Java 方法的应用
    Java数组的运用
    Java代码运用及算法思路养成——用*号输出形状
  • 原文地址:https://www.cnblogs.com/kevinke/p/3281627.html
Copyright © 2020-2023  润新知