• 3种方法从Html中取文本


    /// <summary>
    /// Reduces an HTML fragment to its text content and returns that text HTML-encoded.
    /// </summary>
    /// <param name="Htmlstring">The HTML markup to clean. May be null or empty.</param>
    /// <returns>
    /// The trimmed, HTML-encoded text with scripts, comments, tags, angle brackets and
    /// CRLF sequences removed; an empty string when the input is null or empty.
    /// </returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Singleline lets '.' cross line breaks so multi-line scripts and comments are removed whole.
        const RegexOptions blockOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

        // Remove scripts.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", blockOptions);
        // Remove complete comments before the tag pass; otherwise a '>' inside a comment
        // ends the "tag" early and the rest of the comment leaks into the text.
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*?-->", "", blockOptions);
        // Remove HTML tags.
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        // Remove a line break together with the whitespace that follows it.
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        // Remove leftovers of unbalanced comments.
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

        // Decode the supported entities in ONE pass so that text such as "&amp;lt;"
        // becomes "&lt;" and is not decoded a second time into "<".
        Htmlstring = Regex.Replace(
            Htmlstring,
            @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
            new MatchEvaluator(DecodeNoHtmlEntity),
            RegexOptions.IgnoreCase);

        // Strings are immutable: the result of Replace must be assigned back.
        Htmlstring = Htmlstring.Replace("<", "");
        Htmlstring = Htmlstring.Replace(">", "");
        Htmlstring = Htmlstring.Replace("\r\n", "");

        // WebUtility does not need an active HttpContext, so this also works outside a web request.
        return System.Net.WebUtility.HtmlEncode(Htmlstring).Trim();
    }

    /// <summary>
    /// Maps one matched character entity to its replacement text for <see cref="NoHTML"/>.
    /// Numeric entities without an explicit mapping are dropped.
    /// </summary>
    private static string DecodeNoHtmlEntity(Match entity)
    {
        switch (entity.Groups[1].Value.ToLowerInvariant())
        {
            case "quot":
            case "#34":
                return "\"";
            case "amp":
            case "#38":
                return "&";
            case "lt":
            case "#60":
                return "<";
            case "gt":
            case "#62":
                return ">";
            case "nbsp":
            case "#160":
                return " ";
            case "iexcl":
            case "#161":
                return "\u00A1";
            case "cent":
            case "#162":
                return "\u00A2";
            case "pound":
            case "#163":
                return "\u00A3";
            case "copy":
            case "#169":
                return "\u00A9";
            default:
                return string.Empty;
        }
    }

    using System.Text.RegularExpressions;//需要引用

      // 利用正则表达式去掉"<"和">"之间的内容
      private string StripHT(string strHtml)
      {
       Regex regex=new Regex("<.+?>",RegexOptions.IgnoreCase);
       string strOutput=regex.Replace(strHtml,"");
       return strOutput;
      }

     

     

    2

    using System.Text.RegularExpressions;
    /// <summary>
    /// HtmlExtract 抽取html里面的文本信息
    /// </summary>
    public class HtmlExtract
    {
       
            #region private attributes
            private string _strHtml;
            #endregion
            #region public mehtods
             public HtmlExtract(string inStrHtml)
            { _strHtml = inStrHtml;}
            public string ExtractText()
            {
                string result = _strHtml;
                result = RemoveComment(result);
                result = RemoveScript(result);
                result = RemoveStyle(result);
                result = RemoveTags(result);
                return result.Trim();
            }
            #endregion
         #region private methods
           private string RemoveComment(string input)
    {
    string result = input;
    //remove comment
    result = Regex.Replace(result, @"<!--[^-]*-->", string.Empty, RegexOptions.IgnoreCase);
    return result;
    }

    3

    using System.Text.RegularExpressions;

    /// <summary>
    /// Extracts the text content from an HTML string by removing comments,
    /// script/noscript/style blocks and tags, then decoding character entities.
    /// </summary>
    public class HtmlExtract
    {
        #region private attributes
        private readonly string _strHtml;
        #endregion

        #region public methods
        /// <summary>
        /// Creates an extractor for the given HTML.
        /// </summary>
        /// <param name="inStrHtml">The HTML markup; null is treated as an empty string.</param>
        public HtmlExtract(string inStrHtml)
        {
            _strHtml = inStrHtml ?? string.Empty;
        }

        /// <summary>
        /// Removes comments, scripts, styles and tags (in that order) from the stored HTML.
        /// </summary>
        /// <returns>The remaining text with leading and trailing whitespace trimmed.</returns>
        public string ExtractText()
        {
            string result = _strHtml;
            result = RemoveComment(result);
            result = RemoveScript(result);
            result = RemoveStyle(result);
            result = RemoveTags(result);
            return result.Trim();
        }
        #endregion

        #region private methods
        /// <summary>
        /// Removes HTML comments.
        /// </summary>
        private string RemoveComment(string input)
        {
            // A lazy match with Singleline ends at the first "-->" and may cross line breaks;
            // unlike a "[^-]*" body it also handles comments that contain hyphens.
            return Regex.Replace(input, @"<!--.*?-->", string.Empty, RegexOptions.Singleline);
        }

        /// <summary>
        /// Removes style elements together with their content.
        /// </summary>
        private string RemoveStyle(string input)
        {
            return Regex.Replace(input, @"<style[^>]*?>.*?</style>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
        }

        /// <summary>
        /// Removes script and noscript elements together with their content.
        /// </summary>
        private string RemoveScript(string input)
        {
            string result = input;
            result = Regex.Replace(result, @"<script[^>]*?>.*?</script>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
            result = Regex.Replace(result, @"<noscript[^>]*?>.*?</noscript>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
            return result;
        }

        /// <summary>
        /// Converts line-break tags to CRLF, strips all remaining tags and decodes character entities.
        /// </summary>
        private string RemoveTags(string input)
        {
            string result = input;
            // Turn <br>, <br/> and <br /> (any letter case) into line breaks before tags are stripped.
            result = Regex.Replace(result, @"<br\s*/?>", "\r\n", RegexOptions.IgnoreCase);
            result = Regex.Replace(result, @"<[\s\S]*?>", string.Empty);
            // Decode entities only after the tags are gone, so escaped text such as
            // "&lt;b&gt;" survives as "<b>" instead of being stripped as a tag.
            result = System.Net.WebUtility.HtmlDecode(result);
            // "&nbsp;" decodes to U+00A0; map it to a regular space.
            result = result.Replace('\u00A0', ' ');
            return result;
        }
        #endregion
    }

     

  • 相关阅读:
    java几种数据的默认扩容机制
    web.xml配置详解
    Bootstrap文件上传组件
    JAVA四则运算算法
    Oracle 和 mysql 的批量操作Sql语句 的区别
    JAVA使用ItextPDF
    c# 状态机实现
    c++11模拟boost元占位符placeholder
    vs2012 函数参数内存对齐引发编译错误
    windows下matplotlib编译安装备忘
  • 原文地址:https://www.cnblogs.com/glume/p/1997500.html
Copyright © 2020-2023  润新知