• 正则表达式过滤HTML、JS、CSS


    功能用途

    主要用于提取 HTML 页面内容(过滤掉其中的脚本、样式和标签)。

    示例代码

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.IO;
    using System.Net;
    using System.Net.NetworkInformation;
    using System.Net.Sockets;
    using System.Threading;
    using System.Text.RegularExpressions;
    namespace HtmlRegex
    {
        /// <summary>
        /// Helpers for downloading an HTML page and reducing it to plain text by
        /// stripping script/style blocks, comments, tags and character entities.
        /// </summary>
        public class BaseRegex
        {
            private const RegexOptions Options = RegexOptions.Singleline | RegexOptions.IgnoreCase;

            // The patterns never change, so they are built once instead of on every call.

            // A <script> or <style> element together with its (possibly empty) body.
            private static readonly Regex ScriptOrStyleBlock =
                new Regex(@"<(script|style)\b[^>]*>.*?</\1\s*>", Options);

            // An HTML comment, including the empty comment <!---->.
            private static readonly Regex Comment = new Regex(@"<!--.*?-->", Options);

            // Any remaining tag, including self-closing tags.
            private static readonly Regex Tag = new Regex(@"<[^>]*>", Options);

            // Named (&nbsp;), decimal (&#160;) and hexadecimal (&#xA0;) character references.
            private static readonly Regex Entity =
                new Regex(@"&(?:#x?[0-9a-f]+|[a-z][a-z0-9]+);", Options);

            // Runs of two or more line-break/space characters. Spelled out with escapes so
            // the pattern does not depend on this source file's line endings or indentation.
            private static readonly Regex BlankRun = new Regex(@"[\r\n ]{2,}", Options);

            private readonly WebClient web = new WebClient();

            /// <summary>
            /// Appends <paramref name="content"/> followed by a line terminator to the file
            /// at <paramref name="path"/>.
            /// </summary>
            /// <param name="path">File to append to.</param>
            /// <param name="encoding">1 selects UTF-8; any other value selects <see cref="Encoding.Default"/>.</param>
            /// <param name="content">Text to write.</param>
            public void DeBug(string path, int encoding, string content)
            {
                // 'using' closes the writer even when WriteLine throws; Dispose also flushes.
                using (StreamWriter writer = new StreamWriter(path, true, ResolveEncoding(encoding)))
                {
                    writer.WriteLine(content);
                }
            }

            /// <summary>
            /// Downloads the resource at <paramref name="url"/> and decodes the bytes as text.
            /// </summary>
            /// <param name="url">Address of the resource to download.</param>
            /// <param name="encoding">1 selects UTF-8; any other value selects <see cref="Encoding.Default"/>.</param>
            /// <returns>The downloaded bytes decoded with the selected encoding.</returns>
            public string getPageContent(string url, int encoding)
            {
                byte[] buff = web.DownloadData(url);
                return ResolveEncoding(encoding).GetString(buff);
            }

            /// <summary>
            /// Strips script and style blocks, comments, tags and character entities from
            /// <paramref name="html"/>, then removes runs of line breaks/spaces and every
            /// remaining space character.
            /// </summary>
            /// <param name="html">Markup to clean; may be null or empty.</param>
            /// <returns>The remaining text, or an empty string when the input is null or empty.</returns>
            public string checkHtml(string html)
            {
                if (string.IsNullOrEmpty(html))
                {
                    return string.Empty;
                }

                // Script and style blocks go first so their bodies are not mistaken for text.
                html = ScriptOrStyleBlock.Replace(html, "");
                // Comments must be removed before the generic tag pass; otherwise a comment
                // containing '>' is cut at that character and its tail is left behind.
                html = Comment.Replace(html, "");
                html = Tag.Replace(html, "");
                html = Entity.Replace(html, "");
                html = BlankRun.Replace(html, "");
                html = html.Replace(" ", "");
                return html;
            }

            // Maps the integer encoding flag used by the public methods to an Encoding.
            private static Encoding ResolveEncoding(int encoding)
            {
                return encoding == 1 ? Encoding.UTF8 : Encoding.Default;
            }
        }
    }
  • 相关阅读:
    would clobber existing tag
    已成功与服务器建立连接,但是在登录前的握手期间发生错误。 (provider: TCP 提供程序, error: 0
    C#搭建简单的http服务器,访问静态资源
    使用iis反向代理
    WorkerServices部署为Windows服务
    mongo 操作数据库的方式
    odoo db_name 指定多个数据库
    odoo 如何设置字段变更跟踪
    odoo qweb 视图使用widget
    odoo 代码片段比较全的扩展
  • 原文地址:https://www.cnblogs.com/shya/p/2439443.html
Copyright © 2020-2023  润新知