• 蛙蛙推荐:蛙蛙牌XML压缩算法


    摘要:

    在用XML传输数据的时候,XML本身的元素名称,属性名称可能比有效的信息量占的地方还要大,本文示例一种简单实用的算法来进行XML压缩,主要思路是把XML标签和属性用整数来表示以便降低传输量。

    单元测试代码

    /// <summary>
    /// Console demo that compares the size of a sample XML document when it is
    /// Deflate-compressed, encoded with <c>XmlZip</c>, and encoded with
    /// <c>XmlZip</c> followed by Deflate.
    /// </summary>
    class Program {
        /// <summary>
        /// Sample customer/order document. It is used both as the dictionary
        /// source passed to <c>XmlZip.Init</c> and as the payload to compress.
        /// </summary>
        public static string XML = @"<?xml version=""1.0"" encoding=""utf-16""?>
        <Customer>
    <CustomerID>ALFKI</CustomerID>
    <PO>9572658</PO>
    <Address AddressType=""work"">
        <Street>One Main Street</Street>
        <City>Anywhere</City>
        <State>NJ</State>
        <Zip>08080</Zip>
    </Address>
    <Order>
        <OrderID>10966</OrderID >
        <LineItem>
            <ProductID>37</ProductID>
            <UnitPrice>26.50 </UnitPrice>
            <Quantity>8</Quantity>
            <Description>Gravad lax </Description>             
        </LineItem>
        <LineItem>
            <ProductID>56 </ProductID>
            <UnitPrice>38.00</UnitPrice>
            <Quantity>12</Quantity>
            <Description>Gnocchi di nonna Alice</Description>             
        </LineItem>
    </Order>    
    </Customer>
    ";

        /// <summary>
        /// Prints, in order: the UTF-8 size of the sample, its Deflate size, its
        /// XmlZip size, the size and text of the document restored from the
        /// XmlZip bytes, and the size of the XmlZip bytes after Deflate. Then
        /// waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args) {
            XmlZip zip = new XmlZip();

            byte[] bs = Encoding.UTF8.GetBytes(XML);
            Console.WriteLine("原始文件长度:{0}", bs.Length);
            Console.WriteLine("Deflate压缩后长度: {0}", DeflatedLength(bs));

            zip.Init(XML);
            bs = zip.XmlToBytes(XML);
            Console.WriteLine("XML压缩后长度:{0}", bs.Length);

            string str = zip.BytesToXml(bs);
            Console.WriteLine("还原后长度:{0}", Encoding.UTF8.GetByteCount(str));
            Console.WriteLine(str);

            Console.WriteLine("先XML压缩,再Deflate压缩后的长度:{0}", DeflatedLength(bs));
            Console.ReadKey();
        }

        /// <summary>
        /// Deflate-compresses <paramref name="data"/> into memory and returns the
        /// number of compressed bytes.
        /// </summary>
        /// <param name="data">Bytes to compress.</param>
        /// <returns>Length of the compressed output in bytes.</returns>
        static long DeflatedLength(byte[] data) {
            using (MemoryStream ms = new MemoryStream()) {
                // leaveOpen = true so the MemoryStream can still be measured after
                // the DeflateStream has been disposed (which flushes the final block).
                using (DeflateStream deflate = new DeflateStream(ms, CompressionMode.Compress, true)) {
                    deflate.Write(data, 0, data.Length);
                }
                return ms.Length;
            }
        }
    }

    测试输出

    原始文件长度:740
    Deflate压缩后长度: 438
    XML压缩后长度:295
    还原后长度:727
    <?xml version="1.0" encoding="utf-16"?>
    <Customer>
      <CustomerID>ALFKI</CustomerID>
      <PO>9572658</PO>
      <Address AddressType="work">
        <Street>One Main Street</Street>
        <City>Anywhere</City>
        <State>NJ</State>
        <Zip>08080</Zip>
      </Address>
      <Order>
        <OrderID>10966</OrderID>
        <LineItem>
          <ProductID>37</ProductID>
          <UnitPrice>26.50 </UnitPrice>
          <Quantity>8</Quantity>
          <Description>Gravad lax </Description>             
        </LineItem>
        <LineItem>
          <ProductID>56 </ProductID>
          <UnitPrice>38.00</UnitPrice>
          <Quantity>12</Quantity>
          <Description>Gnocchi di nonna Alice</Description>             
        </LineItem>
      </Order>
    </Customer>
    先XML压缩,再Deflate压缩后的长度:357

    可以看到,XML压缩后的数据约是原始数据的40%(295/740字节),可能没有其它专有的压缩算法的压缩率高,但效果还算是满意吧,而且我的算法是比较通用的,只要通信双方知道了XML Schema,甚至双方只需要有一份完整的示例XML文档,就可以进行压缩通信。只做了功能测试,没做性能测试,大家可以先借鉴下思路。

    完整代码

    大致原理,就是通信双方各持有一个XML文档节点名称,属性名称的一个字典,然后发送方传输的时候用ushort代替原有的XML标签和属性名,接收方通过字典把ushort再转换成原始的元素名和属性名,这样大量不必要的重复的标签等就省去了。

    代码只做本文的示例,写的比较随意,没有什么防御性和健壮性。

    /// <summary>
    /// Kind of XML name that a dictionary entry stands for.
    /// </summary>
    internal enum ItemType {
        /// <summary>An element (tag) name.</summary>
        Element = 0,

        /// <summary>
        /// An attribute name. The identifier's spelling is kept as-is because
        /// other types in this file refer to it by this name.
        /// </summary>
        Attritube = 1
    }
    /// <summary>
    /// One dictionary entry: an element or attribute name together with the
    /// path string under which it was found.
    /// </summary>
    internal class XmlNodeItem {
        /// <summary>Path string of the node, e.g. "/Customer/Address/@AddressType".</summary>
        public string Xpath { get; set; }

        /// <summary>Name of the element or attribute.</summary>
        public string Text { get; set; }

        /// <summary>Whether this entry names an element or an attribute.</summary>
        public ItemType ItemType { get; set; }

        /// <summary>Returns <see cref="Xpath"/>.</summary>
        public override string ToString() {
            return Xpath;
        }
    }
    /// <summary>
    /// Stack-like builder for a path string made of "/name" (element) and
    /// "/@name" (attribute) segments.
    /// </summary>
    internal class MyXpath {
        private readonly LinkedList<string> _node = new LinkedList<string>();

        /// <summary>Appends an element segment "/<paramref name="name"/>".</summary>
        /// <param name="name">Element name.</param>
        public void AddElement(string name) {
            _node.AddLast(string.Format("/{0}", name));
        }

        /// <summary>Appends an attribute segment "/@<paramref name="name"/>".</summary>
        /// <param name="name">Attribute name.</param>
        public void AddAttribute(string name) {
            _node.AddLast(string.Format("/@{0}", name));
        }

        /// <summary>
        /// Removes the most recently added segment (element or attribute).
        /// </summary>
        /// <exception cref="InvalidOperationException">The path has no segments.</exception>
        public void RemoveLastElement() {
            _node.RemoveLast();
        }

        /// <summary>
        /// Concatenates all segments in insertion order.
        /// </summary>
        /// <returns>The path string, or an empty string when there are no segments.</returns>
        public override string ToString() {
            StringBuilder sb = new StringBuilder();
            // foreach handles the empty list; dereferencing _node.First directly
            // would throw NullReferenceException when no segment has been added.
            foreach (string segment in _node) {
                sb.Append(segment);
            }
            return sb.ToString();
        }
    }
    /// <summary>
    /// Encodes XML as a compact byte stream by replacing element and attribute
    /// names with 16-bit indices taken from a dictionary built from a sample
    /// document, and decodes such a stream back to XML.
    /// </summary>
    /// <remarks>
    /// Stream layout, as written by <see cref="XmlToBytes"/>:
    /// element start = index; attribute = index, byte length (ushort), UTF-8
    /// bytes; text = 0, byte length (ushort), UTF-8 bytes; element end =
    /// <c>ushort.MaxValue</c>. Only element, attribute and text nodes are
    /// encoded; every other node type is skipped.
    /// </remarks>
    class XmlZip {
        // Marker that precedes a text node's length-prefixed value.
        private const ushort TextMarker = 0;
        // Marker that closes the most recently opened element.
        private const ushort EndElementMarker = ushort.MaxValue;

        // index -> dictionary entry (used when decoding).
        Dictionary<ushort, XmlNodeItem> _map = new Dictionary<ushort, XmlNodeItem>();
        // path string -> index (used when encoding).
        Dictionary<string, ushort> _map2 = new Dictionary<string, ushort>();

        /// <summary>
        /// Builds the name dictionary from a sample document. Every element and
        /// attribute occurrence gets the next index, starting at 1; when the same
        /// path occurs more than once, the last index assigned to it is the one
        /// used for encoding. Any previously loaded dictionary is discarded.
        /// </summary>
        /// <param name="xmlInput">Sample XML that contains every element and attribute path to be encoded later.</param>
        /// <exception cref="ArgumentException">The sample has so many nodes that an index would collide with the end-of-element marker.</exception>
        public void Init(string xmlInput) {
            // Indices restart at 1, so entries left over from an earlier Init
            // would point at unrelated names; start from a clean dictionary.
            _map.Clear();
            _map2.Clear();

            MyXpath path = new MyXpath();
            ushort i = 1;
            using (StringReader sr = new StringReader(xmlInput))
            using (XmlReader reader = XmlReader.Create(sr)) {
                while (reader.Read()) {
                    switch (reader.NodeType) {
                        case XmlNodeType.Element:
                            path.AddElement(reader.Name);
                            Register(i++, path, reader.Name, ItemType.Element);
                            // On an element node MoveToNextAttribute moves to the
                            // first attribute, so one loop visits all of them.
                            while (reader.MoveToNextAttribute()) {
                                path.AddAttribute(reader.Name);
                                Register(i++, path, reader.Name, ItemType.Attritube);
                                path.RemoveLastElement();
                            }
                            reader.MoveToElement();
                            // An empty element (<a/>) produces no EndElement node.
                            if (reader.IsEmptyElement) {
                                path.RemoveLastElement();
                            }
                            break;

                        case XmlNodeType.EndElement:
                            path.RemoveLastElement();
                            break;

                        default:
                            break;
                    }
                }
            }
        }

        /// <summary>
        /// Encodes <paramref name="xmlInput"/> using the dictionary built by
        /// <see cref="Init"/>. Attributes whose path is not in the dictionary are
        /// omitted from the output.
        /// </summary>
        /// <param name="xmlInput">XML text to encode.</param>
        /// <returns>The encoded bytes.</returns>
        /// <exception cref="ArgumentException">
        /// An element's path is not in the dictionary, or an attribute/text value
        /// is longer than 65535 UTF-8 bytes.
        /// </exception>
        public byte[] XmlToBytes(string xmlInput) {
            MyXpath path = new MyXpath();
            using (MemoryStream ms = new MemoryStream())
            using (BinaryWriter bw = new BinaryWriter(ms))
            using (StringReader sr = new StringReader(xmlInput))
            using (XmlReader reader = XmlReader.Create(sr)) {
                while (reader.Read()) {
                    ushort index;
                    switch (reader.NodeType) {
                        case XmlNodeType.Element:
                            path.AddElement(reader.Name);
                            if (!_map2.TryGetValue(path.ToString(), out index)) {
                                // Skipping only the start marker while still
                                // writing the end marker would yield an
                                // unbalanced stream that cannot be decoded.
                                throw new ArgumentException(
                                    "No dictionary entry for element path '" + path + "'.", "xmlInput");
                            }
                            bw.Write(index);

                            while (reader.MoveToNextAttribute()) {
                                path.AddAttribute(reader.Name);
                                bool known = _map2.TryGetValue(path.ToString(), out index);
                                // Always pop the attribute segment, whether or not
                                // it is known, so the path stays in step with the reader.
                                path.RemoveLastElement();
                                if (known) {
                                    bw.Write(index);
                                    WriteValue(bw, reader.Value);
                                }
                            }
                            reader.MoveToElement();

                            // An empty element (<a/>) produces no EndElement node,
                            // so it is closed here.
                            if (reader.IsEmptyElement) {
                                path.RemoveLastElement();
                                bw.Write(EndElementMarker);
                            }
                            break;

                        case XmlNodeType.EndElement:
                            path.RemoveLastElement();
                            bw.Write(EndElementMarker);
                            break;

                        case XmlNodeType.Text:
                            bw.Write(TextMarker);
                            WriteValue(bw, reader.Value);
                            break;

                        default:
                            break;
                    }
                }
                bw.Flush();
                return ms.ToArray();
            }
        }

        /// <summary>
        /// Decodes a byte stream produced by <see cref="XmlToBytes"/> back into
        /// indented XML text. Markers that are neither a dictionary index, the
        /// text marker nor the end-of-element marker are ignored.
        /// </summary>
        /// <param name="bytes">Encoded bytes.</param>
        /// <returns>The reconstructed XML text.</returns>
        public string BytesToXml(byte[] bytes) {
            StringBuilder sb = new StringBuilder();
            XmlWriterSettings settings = new XmlWriterSettings();
            settings.Indent = true;

            using (MemoryStream ms = new MemoryStream(bytes))
            using (BinaryReader br = new BinaryReader(ms))
            using (StringWriter sw = new StringWriter(sb))
            using (XmlWriter writer = XmlWriter.Create(sw, settings)) {
                // Compare stream positions instead of calling PeekChar(): PeekChar
                // decodes the next bytes as text and can throw on binary data.
                while (ms.Position < ms.Length) {
                    ushort readFlag = br.ReadUInt16();
                    XmlNodeItem item;
                    if (_map.TryGetValue(readFlag, out item)) {
                        if (item.ItemType == ItemType.Element) {
                            writer.WriteStartElement(item.Text);
                        }
                        else if (item.ItemType == ItemType.Attritube) {
                            writer.WriteAttributeString(item.Text, ReadValue(br));
                        }
                    }
                    else if (readFlag == TextMarker) {
                        writer.WriteString(ReadValue(br));
                    }
                    else if (readFlag == EndElementMarker) {
                        writer.WriteEndElement();
                    }
                }
                writer.Flush();
            }
            // Read the builder only after the writer has been closed.
            return sb.ToString();
        }

        /// <summary>Stores one dictionary entry under <paramref name="index"/> in both lookup tables.</summary>
        private void Register(ushort index, MyXpath path, string name, ItemType type) {
            if (index == EndElementMarker) {
                throw new ArgumentException(
                    "The sample document has too many nodes to index with 16 bits.", "xmlInput");
            }
            string xpath = path.ToString();
            _map[index] = new XmlNodeItem() {
                Xpath = xpath,
                Text = name,
                ItemType = type
            };
            // Filled here rather than by enumerating _map afterwards: Dictionary
            // enumeration order is unspecified, and for repeated paths the index
            // that wins must be the same on both sides of the connection.
            _map2[xpath] = index;
        }

        /// <summary>Writes <paramref name="value"/> as a ushort byte length followed by its UTF-8 bytes.</summary>
        private static void WriteValue(BinaryWriter bw, string value) {
            byte[] bs = Encoding.UTF8.GetBytes(value);
            if (bs.Length > ushort.MaxValue) {
                // A plain (ushort) cast would silently truncate the length and
                // corrupt everything that follows in the stream.
                throw new ArgumentException(
                    "A text or attribute value exceeds 65535 UTF-8 bytes.", "xmlInput");
            }
            bw.Write((ushort)bs.Length);
            bw.Write(bs);
        }

        /// <summary>Reads a value written by <see cref="WriteValue"/>.</summary>
        private static string ReadValue(BinaryReader br) {
            int len = br.ReadUInt16();
            byte[] bs = br.ReadBytes(len);
            return Encoding.UTF8.GetString(bs);
        }
    }

    参考链接

    XML压缩和传输性能的改善

    http://blog.csdn.net/BruceWayen/archive/2006/03/13/623483.aspx

    XQzip XML压缩技术(1)--介绍

    http://qiyanfeng.blog.51cto.com/503144/105203

    XQzip:可查询XML压缩算法分析(1)

    http://qiyanfeng.blog.51cto.com/503144/105578

    WAP Binary XML Content Format

    http://www.w3.org/TR/wbxml/

  • 相关阅读:
    删除mysql服务
    取消jQuery validate验证
    评估期已过.有关如何升级您的测试软件
    修复fiddler无法抓包抓取https问题
    Win10开启高性能模式
    cmd git批量检出当前目录下的所有模块指定远程分支
    解决代码没有编译错误运行却报错: “程序包xxx不存在“的问题,最终解决办法
    尾递归
    idea Project maven根目录不见了找回
    秒转时分秒
  • 原文地址:https://www.cnblogs.com/onlytiancai/p/XmlCompression.html
Copyright © 2020-2023  润新知