• 网页格式化排版代码,专用于信息采集后的内容整理


            /// <summary>
            /// Reduces an HTML fragment to a minimal set of tags for clean article bodies.
            /// Only <c>p</c>, <c>img</c> and <c>strong</c> tags survive: <c>p</c> tags lose all
            /// attributes, <c>img</c> tags keep only their quoted <c>src</c>, <c>br</c> tags and
            /// raw CR/LF/TAB characters become paragraph breaks, every other tag is removed
            /// (its inner text is kept), empty paragraphs are dropped, and each paragraph is
            /// indented with two leading spaces.
            /// </summary>
            /// <param name="content">HTML fragment to clean; may be null or empty.</param>
            /// <returns>The cleaned markup, or an empty string when <paramref name="content"/> is null or empty.</returns>
            public static string ClearHtml(string content) {
                if (string.IsNullOrEmpty(content)) {
                    return string.Empty;
                }

                // Comments, doctype declarations and processing instructions carry no article text.
                content = Regex.Replace(content, @"<!--.*?-->|<[!?][^>]*>", "", RegexOptions.Singleline);

                // Single pass over every real tag. The tag name is captured up to the first
                // whitespace, '/' or '>' and compared exactly, so <pre> is never mistaken for <p>,
                // stripping <s>/<b>/<i> can no longer swallow <strong>/<br>/<img>, and a bare '<'
                // in running text (e.g. "1 < 2") is not treated as a tag at all.
                content = Regex.Replace(content, @"</?(?<name>[a-zA-Z][^\s/>]*)[^>]*>", tag => {
                    bool closing = tag.Value[1] == '/';
                    switch (tag.Groups["name"].Value.ToLowerInvariant()) {
                        case "p":
                            // Strip every attribute from paragraph tags.
                            return closing ? "</p>" : "<p>";
                        case "strong":
                            return tag.Value;
                        case "br":
                            // Line breaks become paragraph breaks in the next step.
                            return "\r\n";
                        case "img":
                            // Keep only the src attribute; the lookbehind skips data-src and similar.
                            Match src = Regex.Match(
                                tag.Value,
                                @"(?<![\w-])src\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)')",
                                RegexOptions.IgnoreCase);
                            if (!src.Success) {
                                // No quoted src found: leave the tag untouched.
                                return tag.Value;
                            }
                            // The output attribute is single-quoted, so encode any single quote in the URL.
                            return "<img src='" + src.Groups["url"].Value.Replace("'", "&#39;") + "' />";
                        default:
                            return "";
                    }
                });

                // Every run of CR/LF/TAB closes the current paragraph and opens a new one.
                content = Regex.Replace(content, @"[\r\n\t]+", "</p><p>").Trim();

                // A break at the very start or end leaves a dangling half of the pair; drop it.
                if (content.StartsWith("</p>", System.StringComparison.Ordinal)) {
                    content = content.Substring(4);
                }
                if (content.EndsWith("<p>", System.StringComparison.Ordinal)) {
                    content = content.Substring(0, content.Length - 3);
                }

                // Remove whitespace and &nbsp; padding at the start of each paragraph.
                content = Regex.Replace(content, @"<p>(?:\s|&nbsp;)+", "<p>", RegexOptions.IgnoreCase);

                // Collapse directly nested openers and closers produced by the steps above.
                content = Regex.Replace(content, @"(?:<p>\s*)+<p>", "<p>");
                content = Regex.Replace(content, @"(?:</p>\s*)+</p>", "</p>");

                // Drop paragraphs that contain nothing but whitespace or &nbsp;.
                content = Regex.Replace(content, @"<p>(?:\s|&nbsp;)*</p>", "");

                // Indent the first line of every paragraph.
                return content.Replace("<p>", "<p>  ");
            }

    剔除了除p、img、strong之外的其他标签,对p、img的各种属性也进行了清除,专门用于生成干净的网页正文,可用于信息采集后的内容整理和格式化排版。自用代码,算法效率可能不高,但是足以满足目前需求了。

  • 相关阅读:
    四校联考【20171001】
    C语言基础知识
    页表和TLB
    python
    Cache组织方式
    On the Spectre and Meltdown Processor Security Vulnerabilities
    latex-组织文本
    深入理解计算机系统
    深入理解计算机系统-计算机系统漫游
    逻辑地址到物理地址的转换
  • 原文地址:https://www.cnblogs.com/theluther/p/4762435.html
Copyright © 2020-2023  润新知