• ASP.NET:对比两个网页(HTML)的差异并生成修改痕迹,用于展示编辑器内容的历史版本


    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Collections;
    
    namespace Util
    {
        public class HtmlDiff
        {
    
            // Buffer that accumulates the diff markup; created in the constructor and returned by Build().
            private StringBuilder content;
            // The two HTML documents being compared, exactly as passed to the constructor.
            private string oldText, newText;
            // Token ("word") lists derived from oldText / newText by SplitInputsToWords().
            private string[] oldWords, newWords;
            // Maps each distinct token of newWords to the ascending list of positions at which it
            // occurs; rebuilt by IndexNewWords() and consulted by FindMatch().
            Dictionary<string, List<int>> wordIndices;
            // Regex patterns recognising opening inline-formatting tags (<strong>, <b>, <i>, ...).
            // InsertTag tests them against the first pending word with RegexOptions.IgnoreCase.
            private string[] specialCaseOpeningTags = new string[] { "<strong[\\>\\s]+", "<b[\\>\\s]+", "<i[\\>\\s]+", "<big[\\>\\s]+", "<small[\\>\\s]+", "<u[\\>\\s]+", "<sub[\\>\\s]+", "<sup[\\>\\s]+", "<strike[\\>\\s]+", "<s[\\>\\s]+" };
            // Literal closing tags matching the list above. InsertTag compares them with
            // Contains(), i.e. exact, case-sensitive string equality.
            private string[] specialCaseClosingTags = new string[] { "</strong>", "</b>", "</i>", "</big>", "</small>", "</u>", "</sub>", "</sup>", "</strike>", "</s>" };
    
            // Unused: the <img> pattern is written inline in InsertTag instead.
            //private string[] imgOpenTag = new string[] { "<img[\\>\\s]+" };
    
            /// <summary>
            /// Initializes a new instance of the <see cref="HtmlDiff"/> class.
            /// </summary>
            /// <param name="oldText">The old (previous) HTML text. <c>null</c> is treated as an empty document.</param>
            /// <param name="newText">The new (current) HTML text. <c>null</c> is treated as an empty document.</param>
            public HtmlDiff(string oldText, string newText)
            {
                // Normalise null to empty so the tokenizer never receives a null string;
                // a missing version then simply diffs as "everything inserted/deleted".
                this.oldText = oldText ?? string.Empty;
                this.newText = newText ?? string.Empty;

                this.content = new StringBuilder();
            }
    
            /// <summary>
            /// Builds the HTML diff output: tokenizes both inputs, indexes the new tokens,
            /// computes the list of edit operations and renders each of them into markup.
            /// </summary>
            /// <returns>HTML diff markup</returns>
            public string Build()
            {
                // Start from an empty buffer so that calling Build() more than once returns
                // the same markup instead of appending a second copy to the first.
                this.content.Length = 0;

                this.SplitInputsToWords();
                this.IndexNewWords();

                foreach (var operation in this.Operations())
                {
                    this.PerformOperation(operation);
                }

                return this.content.ToString();
            }
    
            /// <summary>
            /// Rebuilds <c>wordIndices</c>: for every distinct token in <c>newWords</c>, the list of
            /// positions at which it occurs. Positions are appended in increasing order.
            /// </summary>
            private void IndexNewWords()
            {
                this.wordIndices = new Dictionary<string, List<int>>();

                for (int position = 0; position < this.newWords.Length; position++)
                {
                    string word = this.newWords[position];

                    // Single lookup: fetch the existing position list or register a new one.
                    List<int> positions;
                    if (!this.wordIndices.TryGetValue(word, out positions))
                    {
                        positions = new List<int>();
                        this.wordIndices.Add(word, positions);
                    }

                    positions.Add(position);
                }
            }
    
            /// <summary>
            /// Tokenizes the old and the new text (in that order) and stores the results in
            /// <c>oldWords</c> and <c>newWords</c>.
            /// </summary>
            private void SplitInputsToWords()
            {
                string[] oldCharacters = this.Explode(this.oldText);
                this.oldWords = ConvertHtmlToListOfWords(oldCharacters);

                string[] newCharacters = this.Explode(this.newText);
                this.newWords = ConvertHtmlToListOfWords(newCharacters);
            }
    
            /// <summary>
            /// Groups a sequence of one-character strings into the tokens ("words") that are diffed.
            /// A token is one of:
            /// a complete tag (everything from "&lt;" up to and including the next "&gt;"),
            /// a run of consecutive whitespace characters, or
            /// one single non-whitespace character (text is compared character by character).
            /// </summary>
            /// <param name="characterString">The input split into one-character strings; empty entries are tolerated and contribute nothing.</param>
            /// <returns>The tokens in input order. An unterminated tag at the end of input is returned as-is.</returns>
            private string[] ConvertHtmlToListOfWords(string[] characterString)
            {
                Mode mode = Mode.character;
                // StringBuilder instead of string concatenation: a long tag (e.g. an inline
                // base64 image) would otherwise cost quadratic time to assemble.
                StringBuilder currentWord = new StringBuilder();
                List<string> words = new List<string>();

                foreach (var character in characterString)
                {
                    if (mode == Mode.tag)
                    {
                        // Inside a tag everything is accumulated verbatim until ">" closes it.
                        currentWord.Append(character);

                        if (this.IsEndOfTag(character))
                        {
                            words.Add(currentWord.ToString());
                            currentWord.Length = 0;
                            // ">" is never whitespace, so the tokenizer always resumes in character mode.
                            mode = Mode.character;
                        }

                        continue;
                    }

                    bool startsTag = this.IsStartOfTag(character);
                    bool isWhitespace = !startsTag && this.IsWhiteSpace(character);

                    if (isWhitespace && mode == Mode.whitespace)
                    {
                        // Extend the current whitespace run.
                        currentWord.Append(character);
                        continue;
                    }

                    // Every other transition begins a new token, so emit the pending one first.
                    if (currentWord.Length > 0)
                    {
                        words.Add(currentWord.ToString());
                        currentWord.Length = 0;
                    }

                    currentWord.Append(character);

                    if (startsTag)
                    {
                        mode = Mode.tag;
                    }
                    else if (isWhitespace)
                    {
                        mode = Mode.whitespace;
                    }
                    else
                    {
                        mode = Mode.character;
                    }
                }

                if (currentWord.Length > 0)
                {
                    words.Add(currentWord.ToString());
                }

                return words.ToArray();
            }
    
            /// <summary>Returns true when <paramref name="val"/> is exactly the string "&lt;".</summary>
            private bool IsStartOfTag(string val)
            {
                return string.Equals(val, "<");
            }
    
            /// <summary>Returns true when <paramref name="val"/> is exactly the string "&gt;".</summary>
            private bool IsEndOfTag(string val)
            {
                return string.Equals(val, ">");
            }
    
            /// <summary>
            /// Returns true when <paramref name="value"/> contains at least one character matched
            /// by the regex class <c>\s</c>. The pattern is not anchored, so a longer string
            /// containing whitespace anywhere also yields true.
            /// </summary>
            private bool IsWhiteSpace(string value)
            {
                bool containsWhitespace = Regex.IsMatch(value, "\\s", RegexOptions.IgnoreCase);
                return containsWhitespace;
            }
    
            /// <summary>
            /// Splits a string into its individual characters, one string per character.
            /// A UTF-16 surrogate pair (a character outside the Basic Multilingual Plane, e.g. an
            /// emoji) is kept together as a single two-char string so that the diff can never
            /// separate its halves and emit broken characters.
            /// </summary>
            /// <param name="value">The text to split; <c>null</c> or empty yields an empty array.</param>
            /// <returns>The characters of <paramref name="value"/> in order, with no empty entries.</returns>
            private string[] Explode(string value)
            {
                if (string.IsNullOrEmpty(value))
                {
                    return new string[0];
                }

                List<string> pieces = new List<string>(value.Length);

                for (int i = 0; i < value.Length; i++)
                {
                    bool isSurrogatePair = char.IsHighSurrogate(value[i])
                        && i + 1 < value.Length
                        && char.IsLowSurrogate(value[i + 1]);

                    if (isSurrogatePair)
                    {
                        pieces.Add(value.Substring(i, 2));
                        i++; // the low surrogate has been consumed together with the high one
                    }
                    else
                    {
                        pieces.Add(value[i].ToString());
                    }
                }

                return pieces.ToArray();
            }
    
            /// <summary>
            /// Renders one edit operation into <c>content</c> by dispatching on its action.
            /// Deletions use the CSS class "diffdel", insertions "diffins"; replacements are
            /// delegated to ProcessReplaceOperation. <c>Action.none</c> (and any unrecognised
            /// value) produces no output.
            /// </summary>
            /// <param name="operation">The operation to render.</param>
            private void PerformOperation(Operation operation)
            {
                Action action = operation.Action;

                if (action == Action.equal)
                {
                    this.ProcessEqualOperation(operation);
                }
                else if (action == Action.delete)
                {
                    this.ProcessDeleteOperation(operation, "diffdel");
                }
                else if (action == Action.insert)
                {
                    this.ProcessInsertOperation(operation, "diffins");
                }
                else if (action == Action.replace)
                {
                    this.ProcessReplaceOperation(operation);
                }
            }
    
            /// <summary>
            /// Renders a replacement as a deletion of the old words followed by an insertion of
            /// the new words, both marked with the CSS class "diffmod".
            /// </summary>
            /// <param name="operation">The replace operation to render.</param>
            private void ProcessReplaceOperation(Operation operation)
            {
                const string modifiedCssClass = "diffmod";

                this.ProcessDeleteOperation(operation, modifiedCssClass);
                this.ProcessInsertOperation(operation, modifiedCssClass);
            }
    
            /// <summary>
            /// Renders the new words in the half-open range [StartInNew, EndInNew) as inserted
            /// content by passing them to InsertTag with the "ins" tag.
            /// </summary>
            /// <param name="operation">Supplies the range within <c>newWords</c>.</param>
            /// <param name="cssClass">CSS class for the generated tag.</param>
            private void ProcessInsertOperation(Operation operation, string cssClass)
            {
                // Clamp to the array so out-of-range bounds are ignored rather than thrown on.
                int first = Math.Max(operation.StartInNew, 0);
                int stop = Math.Min(operation.EndInNew, this.newWords.Length);

                List<string> insertedWords = new List<string>();
                for (int i = first; i < stop; i++)
                {
                    insertedWords.Add(this.newWords[i]);
                }

                this.InsertTag("ins", cssClass, insertedWords);
            }
    
            /// <summary>
            /// Renders the old words in the half-open range [StartInOld, EndInOld) as deleted
            /// content by passing them to InsertTag with the "del" tag.
            /// </summary>
            /// <param name="operation">Supplies the range within <c>oldWords</c>.</param>
            /// <param name="cssClass">CSS class for the generated tag.</param>
            private void ProcessDeleteOperation(Operation operation, string cssClass)
            {
                // Clamp to the array so out-of-range bounds are ignored rather than thrown on.
                int first = Math.Max(operation.StartInOld, 0);
                int stop = Math.Min(operation.EndInOld, this.oldWords.Length);

                List<string> deletedWords = new List<string>();
                for (int i = first; i < stop; i++)
                {
                    deletedWords.Add(this.oldWords[i]);
                }

                this.InsertTag("del", cssClass, deletedWords);
            }
    
            /// <summary>
            /// Copies the unchanged words in the half-open range [StartInNew, EndInNew) of
            /// <c>newWords</c> to the output verbatim, without any wrapping markup.
            /// </summary>
            /// <param name="operation">Supplies the range within <c>newWords</c>.</param>
            private void ProcessEqualOperation(Operation operation)
            {
                int first = Math.Max(operation.StartInNew, 0);
                int stop = Math.Min(operation.EndInNew, this.newWords.Length);

                for (int i = first; i < stop; i++)
                {
                    this.content.Append(this.newWords[i]);
                }
            }
    
    
            /// <summary>
            /// Encloses words within the specified tag (ins or del) and appends the result to "content",
            /// with a twist: if the words contain tags, several ins/del elements are created so that
            /// the ins/del elements themselves never enclose a tag. This handles cases like
            /// old: '&lt;p&gt;a&lt;/p&gt;'
            /// new: '&lt;p&gt;ab&lt;/p&gt;&lt;p&gt;c&lt;/p&gt;'
            /// diff result: '&lt;p&gt;a&lt;ins&gt;b&lt;/ins&gt;&lt;/p&gt;&lt;p&gt;&lt;ins&gt;c&lt;/ins&gt;&lt;/p&gt;'
            /// This still doesn't guarantee valid HTML (hint: think about diffing a text containing ins or
            /// del tags), but handles correctly more cases than the earlier version.
            ///
            /// Each loop iteration first wraps the leading run of non-tag words, then emits the
            /// following run of tag words unwrapped. When the pending list starts with a tag, three
            /// special cases apply: an opening formatting tag (see specialCaseOpeningTags) emits
            /// "&lt;ins class='mod'&gt;", a closing formatting tag (see specialCaseClosingTags) emits
            /// "&lt;/ins&gt;", and an img tag is wrapped in the ins/del element itself.
            ///
            /// NOTE(review): all three special cases call words.Clear(), which discards every word
            /// still pending (for the formatting-tag cases this includes the tag itself). Any text
            /// that follows such a tag inside the same operation is therefore not written to the
            /// output. Confirm whether this is intended.
            ///
            /// P.S.: Spare a thought for people who write HTML browsers. They live in this ... every day.
            /// </summary>
            /// <param name="tag">Name of the wrapping element, "ins" or "del" at the call sites in this class.</param>
            /// <param name="cssClass">CSS class placed on the wrapping element.</param>
            /// <param name="words">Words to render. The list is consumed: it is empty when the method returns.</param>
            private void InsertTag(string tag, string cssClass, List<string> words)
            {
                while (true)
                {
                    if (words.Count == 0)
                    {
                        break;
                    }
    
                    // Removes and returns the leading words that are NOT tags (may be empty).
                    var nonTags = ExtractConsecutiveWords(words, x => !this.IsTag(x));
    
                    string specialCaseTagInjection = string.Empty;
                    bool specialCaseTagInjectionIsBefore = false;                       // whether the injected markup goes before (true) or after (false) the extracted tags
    
                    //string text = this.WrapText(string.Join("", words.ToArray()), tag, cssClass);
    
                    //this.content.Append(text);
    
    
                    if (nonTags.Length != 0)
                    {
                        string text = this.WrapText(string.Join("", nonTags), tag, cssClass);
    
                        this.content.Append(text);
                    }
                    else
                    {
                        // No leading text, so words[0] is a tag. Check whether it is an opening
                        // formatting tag such as <strong> (matched case-insensitively).
    
                        if (this.specialCaseOpeningTags.FirstOrDefault(x => Regex.IsMatch(words[0], x, RegexOptions.IgnoreCase)) != null)
                        {
                            specialCaseTagInjection = "<ins class='mod'>";
    
                            // Check whether it is an image; if so set specialCaseTagInjectionIsBefore=true
                            //if (imgOpenTag.FirstOrDefault(x => Regex.IsMatch(words[0], x)) != null)
                            //{
                            //    specialCaseTagInjectionIsBefore = true;
                            //}
    
                            // NOTE(review): drops the tag and everything after it, for both ins and del.
                            //if (tag == "del")
                            //{
                            words.Clear();
                            //}
                        }
                        // Closing formatting tag; compared by exact (case-sensitive) string equality.
                        else if (this.specialCaseClosingTags.Contains(words[0]))
                        {
                            specialCaseTagInjection = "</ins>";
                            specialCaseTagInjectionIsBefore = true;
                            // NOTE(review): drops the tag and everything after it, for both ins and del.
                            //if (tag == "del")
                            //{
                            words.Clear();
                            //}
                        }
                        else if (Regex.IsMatch(words[0], "<img[\\>\\s]+")) // image tag (this match is case-sensitive)
                        {
                            // Emit the opening ins/del element followed by the run of tags that
                            // starts with the image, then queue the closing element for the append below.
                            specialCaseTagInjectionIsBefore = true;
                            specialCaseTagInjection = string.Format("<{0} class='{1}'>", tag, cssClass);
                            this.content.Append(specialCaseTagInjection + String.Join("", this.ExtractConsecutiveWords(words, x => this.IsTag(x))));
                            specialCaseTagInjection = string.Format("</{0}>", tag);
                            // NOTE(review): any words after the tag run are discarded here.
                            words.Clear();
                        }
    
                    }
    
                    // Nothing left to emit: neither pending words nor queued markup.
                    if (words.Count == 0 && specialCaseTagInjection.Length == 0)
                    {
                        break;
                    }
    
                    // Emit the next run of tag words unwrapped, with the queued markup (if any)
                    // placed before or after it.
                    if (specialCaseTagInjectionIsBefore)
                    {
                        this.content.Append(specialCaseTagInjection + String.Join("", this.ExtractConsecutiveWords(words, x => this.IsTag(x))));
                    }
                    else
                    {
                        this.content.Append(String.Join("", this.ExtractConsecutiveWords(words, x => this.IsTag(x))) + specialCaseTagInjection);
                    }
                }
            }
    
            /// <summary>
            /// Wraps text in an element of the given name carrying the given CSS class,
            /// e.g. <c>&lt;ins class='diffins'&gt;text&lt;/ins&gt;</c>. The text is inserted as-is (not escaped).
            /// </summary>
            /// <param name="text">Content placed between the opening and closing tag.</param>
            /// <param name="tagName">Element name used for both the opening and the closing tag.</param>
            /// <param name="cssClass">Value of the single-quoted class attribute.</param>
            /// <returns>The wrapped markup.</returns>
            private string WrapText(string text, string tagName, string cssClass)
            {
                const string template = "<{0} class='{1}'>{2}</{0}>";
                object[] arguments = new object[] { tagName, cssClass, text };

                return string.Format(template, arguments);
            }
    
            /// <summary>
            /// Removes the longest leading run of words that satisfy <paramref name="condition"/>
            /// from <paramref name="words"/> and returns that run.
            /// </summary>
            /// <param name="words">The pending words; the extracted prefix is removed from this list.</param>
            /// <param name="condition">Predicate a word must satisfy to belong to the run.</param>
            /// <returns>The extracted words in order; an empty array when the first word fails the predicate or the list is empty.</returns>
            private string[] ExtractConsecutiveWords(List<string> words, Func<string, bool> condition)
            {
                // Count how many words at the front satisfy the predicate.
                int runLength = 0;
                while (runLength < words.Count && condition(words[runLength]))
                {
                    runLength++;
                }

                string[] extracted = words.GetRange(0, runLength).ToArray();
                words.RemoveRange(0, runLength);

                return extracted;
            }
    
            /// <summary>
            /// Determines whether a word is an HTML tag such as &lt;img&gt; or &lt;b&gt;
            /// (opening or closing form).
            /// </summary>
            /// <param name="item">The word to test.</param>
            /// <returns>True when the word is recognised by IsOpeningTag or IsClosingTag.</returns>
            private bool IsTag(string item)
            {
                if (IsOpeningTag(item))
                {
                    return true;
                }

                return IsClosingTag(item);
            }
    
            /// <summary>
            /// Returns true when the whole word, ignoring surrounding whitespace, has the form
            /// "&lt;...&gt;" with no "&gt;" in between. Note that the pattern does not exclude a
            /// leading "/", so closing tags match as well.
            /// </summary>
            private bool IsOpeningTag(string item)
            {
                const string tagPattern = "^\\s*<[^>]+>\\s*$";

                return Regex.IsMatch(item, tagPattern, RegexOptions.IgnoreCase);
            }
    
            /// <summary>
            /// Returns true when the whole word, ignoring surrounding whitespace, has the form
            /// "&lt;/...&gt;" with no "&gt;" in between.
            /// </summary>
            private bool IsClosingTag(string item)
            {
                const string closingTagPattern = "^\\s*</[^>]+>\\s*$";

                return Regex.IsMatch(item, closingTagPattern, RegexOptions.IgnoreCase);
            }
    
    
            /// <summary>
            /// Converts the matching blocks between the old and new word lists into an ordered
            /// list of edit operations. The gap before each match becomes a replace, insert or
            /// delete operation, and each non-empty match becomes an equal operation.
            /// </summary>
            /// <returns>The operations in document order.</returns>
            private List<Operation> Operations()
            {
                List<Operation> result = new List<Operation>();

                List<Match> matches = this.MatchingBlocks();

                // Zero-length sentinel at the end of both inputs so that trailing unmatched
                // words are flushed by the same gap logic as everything else.
                matches.Add(new Match(this.oldWords.Length, this.newWords.Length, 0));

                int oldCursor = 0;
                int newCursor = 0;

                foreach (Match match in matches)
                {
                    bool gapInOld = oldCursor != match.StartInOld;
                    bool gapInNew = newCursor != match.StartInNew;

                    Action gapAction;
                    if (gapInOld)
                    {
                        // Unmatched old words: replaced when new words fill the gap, else deleted.
                        gapAction = gapInNew ? Action.replace : Action.delete;
                    }
                    else
                    {
                        // No unmatched old words: either pure insertion or nothing at all
                        // (the match starts right at the current position in both versions).
                        gapAction = gapInNew ? Action.insert : Action.none;
                    }

                    if (gapAction != Action.none)
                    {
                        result.Add(new Operation(gapAction, oldCursor, match.StartInOld, newCursor, match.StartInNew));
                    }

                    if (match.Size != 0)
                    {
                        result.Add(new Operation(Action.equal, match.StartInOld, match.EndInOld, match.StartInNew, match.EndInNew));
                    }

                    oldCursor = match.EndInOld;
                    newCursor = match.EndInNew;
                }

                return result;
            }
    
            /// <summary>
            /// Collects all matching blocks between the complete old and new word lists.
            /// </summary>
            /// <returns>The matches, in the order FindMatchingBlocks adds them.</returns>
            private List<Match> MatchingBlocks()
            {
                var blocks = new List<Match>();

                int oldLength = this.oldWords.Length;
                int newLength = this.newWords.Length;
                this.FindMatchingBlocks(0, oldLength, 0, newLength, blocks);

                return blocks;
            }
    
    
            /// <summary>
            /// Recursively finds matching blocks inside the given half-open ranges: takes the
            /// best match returned by FindMatch, recurses into the region before it, records it,
            /// then recurses into the region after it.
            /// </summary>
            /// <param name="startInOld">Inclusive start of the range in the old words.</param>
            /// <param name="endInOld">Exclusive end of the range in the old words.</param>
            /// <param name="startInNew">Inclusive start of the range in the new words.</param>
            /// <param name="endInNew">Exclusive end of the range in the new words.</param>
            /// <param name="matchingBlocks">Receives the matches that are found.</param>
            private void FindMatchingBlocks(int startInOld, int endInOld, int startInNew, int endInNew, List<Match> matchingBlocks)
            {
                Match best = this.FindMatch(startInOld, endInOld, startInNew, endInNew);
                if (best == null)
                {
                    return;
                }

                // Only recurse when both sides still have words before the match.
                bool hasRegionBefore = startInOld < best.StartInOld && startInNew < best.StartInNew;
                if (hasRegionBefore)
                {
                    this.FindMatchingBlocks(startInOld, best.StartInOld, startInNew, best.StartInNew, matchingBlocks);
                }

                matchingBlocks.Add(best);

                // Likewise for the words after the match.
                bool hasRegionAfter = best.EndInOld < endInOld && best.EndInNew < endInNew;
                if (hasRegionAfter)
                {
                    this.FindMatchingBlocks(best.EndInOld, endInOld, best.EndInNew, endInNew, matchingBlocks);
                }
            }
    
    
            /// <summary>
            /// Finds the longest run of consecutive equal words shared by the given half-open
            /// ranges of the old and new word lists. When several runs have the same length,
            /// the first one encountered is kept (the comparison is strictly "greater than").
            /// </summary>
            /// <param name="startInOld">Inclusive start of the range in the old words.</param>
            /// <param name="endInOld">Exclusive end of the range in the old words.</param>
            /// <param name="startInNew">Inclusive start of the range in the new words.</param>
            /// <param name="endInNew">Exclusive end of the range in the new words.</param>
            /// <returns>The best match, or <c>null</c> when the ranges share no word.</returns>
            private Match FindMatch(int startInOld, int endInOld, int startInNew, int endInNew)
            {
                int bestMatchInOld = startInOld;
                int bestMatchInNew = startInNew;
                int bestMatchSize = 0;

                // matchLengthAt[j] = length of the common run that ends at the previous old
                // word and at new word j.
                Dictionary<int, int> matchLengthAt = new Dictionary<int, int>();

                for (int indexInOld = startInOld; indexInOld < endInOld; indexInOld++)
                {
                    Dictionary<int, int> newMatchLengthAt = new Dictionary<int, int>();

                    List<int> occurrencesInNew;
                    if (this.wordIndices.TryGetValue(this.oldWords[indexInOld], out occurrencesInNew))
                    {
                        foreach (int indexInNew in occurrencesInNew)
                        {
                            if (indexInNew < startInNew)
                            {
                                continue;
                            }

                            // Positions were indexed in ascending order, so nothing later can be in range.
                            if (indexInNew >= endInNew)
                            {
                                break;
                            }

                            // Extend the run ending just before this pair; TryGetValue leaves
                            // previousLength at 0 when no such run exists.
                            int previousLength;
                            matchLengthAt.TryGetValue(indexInNew - 1, out previousLength);

                            int newMatchLength = previousLength + 1;
                            newMatchLengthAt[indexInNew] = newMatchLength;

                            if (newMatchLength > bestMatchSize)
                            {
                                bestMatchInOld = indexInOld - newMatchLength + 1;
                                bestMatchInNew = indexInNew - newMatchLength + 1;
                                bestMatchSize = newMatchLength;
                            }
                        }
                    }

                    // A word absent from the new text leaves this empty, which ends all running matches.
                    matchLengthAt = newMatchLengthAt;
                }

                return bestMatchSize != 0 ? new Match(bestMatchInOld, bestMatchInNew, bestMatchSize) : null;
            }
    
        }
    
        /// <summary>
        /// A block of <see cref="Size"/> consecutive words that is identical in the old and
        /// the new word list.
        /// </summary>
        public class Match
        {
            /// <summary>Index of the first matching word in the old word list.</summary>
            public int StartInOld { get; set; }

            /// <summary>Index of the first matching word in the new word list.</summary>
            public int StartInNew { get; set; }

            /// <summary>Number of words in the block.</summary>
            public int Size { get; set; }

            /// <summary>Exclusive end index in the old word list (start plus size).</summary>
            public int EndInOld
            {
                get { return StartInOld + Size; }
            }

            /// <summary>Exclusive end index in the new word list (start plus size).</summary>
            public int EndInNew
            {
                get { return StartInNew + Size; }
            }

            /// <summary>Creates a match with the given start positions and length.</summary>
            /// <param name="startInOld">Index of the first matching word in the old word list.</param>
            /// <param name="startInNew">Index of the first matching word in the new word list.</param>
            /// <param name="size">Number of matching words.</param>
            public Match(int startInOld, int startInNew, int size)
            {
                StartInOld = startInOld;
                StartInNew = startInNew;
                Size = size;
            }
        }
    
        /// <summary>
        /// One edit step of the diff: an action together with the half-open word ranges it
        /// covers in the old and in the new word list.
        /// </summary>
        public class Operation
        {
            /// <summary>Creates an operation covering the given ranges.</summary>
            /// <param name="action">The kind of edit.</param>
            /// <param name="startInOld">Inclusive start index in the old word list.</param>
            /// <param name="endInOld">Exclusive end index in the old word list.</param>
            /// <param name="startInNew">Inclusive start index in the new word list.</param>
            /// <param name="endInNew">Exclusive end index in the new word list.</param>
            public Operation(Action action, int startInOld, int endInOld, int startInNew, int endInNew)
            {
                Action = action;
                StartInOld = startInOld;
                EndInOld = endInOld;
                StartInNew = startInNew;
                EndInNew = endInNew;
            }

            /// <summary>The kind of edit this operation represents.</summary>
            public Action Action { get; set; }

            /// <summary>Inclusive start index in the old word list.</summary>
            public int StartInOld { get; set; }

            /// <summary>Exclusive end index in the old word list.</summary>
            public int EndInOld { get; set; }

            /// <summary>Inclusive start index in the new word list.</summary>
            public int StartInNew { get; set; }

            /// <summary>Exclusive end index in the new word list.</summary>
            public int EndInNew { get; set; }
        }
    
        /// <summary>
        /// Tokenizer state used by HtmlDiff.ConvertHtmlToListOfWords while scanning the input.
        /// </summary>
        public enum Mode
        {
            // Outside a tag, handling non-whitespace text; this is the initial state.
            character,
            // Inside a tag: everything is accumulated until ">" is seen.
            tag,
            // Inside a run of consecutive whitespace characters.
            whitespace,
        }
    
        /// <summary>
        /// The kind of edit an <see cref="Operation"/> represents.
        /// </summary>
        public enum Action
        {
            // Words are identical in both versions; emitted unchanged.
            equal,
            // Words exist only in the old version.
            delete,
            // Words exist only in the new version.
            insert,
            // No edit; HtmlDiff.Operations() uses this to mean "nothing to emit for this gap".
            none,
            // Old words were replaced by new words; rendered as a delete followed by an insert.
            replace
        }
    }

    使用方法:

    HtmlDiff html = new HtmlDiff(旧版本, 新版本);   // 两个参数均为 string 类型的 HTML 文本
    string 比对后字符 = html.Build();
    
    
  • 相关阅读:
    字,字节,字长,位的概念与区分
    Prim算法、Kruskal算法、Dijkstra算法
    关联容器
    各种排序算法的性能特点
    随机种子
    实参&形参
    C++中的I/O输入输出问题
    NLPIR智能KGB知识图谱引擎可视化数据挖掘
    NLPIR-KGB知识图谱引擎突破传统数据挖掘束缚
    NLPIR大数据语义系统KGB技术引领新方向
  • 原文地址:https://www.cnblogs.com/iwenwen/p/3129060.html
Copyright © 2020-2023  润新知