• HtmlEntities


    #region GetOnlyTextFromHtmlCode + RemoveHtmlChars + RemoveTagFromHtmlCode
            // Adapted from the ZetaHtmlEditControl sources:
            // http://www.codeproject.com/script/Content/ViewAssociatedFile.aspx?rzp=%2FKB%2Fedit%2FZetaHtmlEditControl%2F%2FZetaHtmlEditControl-Source.zip&zep=Control%2FHtmlEditControl.cs&obid=43954&obtid=2&ovid=13
            /// <summary>
            /// Reduces an HTML fragment to its plain text content.
            /// </summary>
            /// <remarks>
            /// The steps run in this order: source line breaks become single spaces,
            /// closing paragraph tags become two line breaks, HTML comments are removed,
            /// line-break tags become one line break, style and script elements are
            /// removed together with their content, every remaining tag is stripped,
            /// the known character entities are unescaped, and runs of spaces are
            /// collapsed to a single space.
            /// </remarks>
            /// <param name="htmlCode">The HTML markup to convert.</param>
            /// <returns>
            /// The text of <paramref name="htmlCode"/> without markup. Null or empty
            /// input is returned unchanged.
            /// </returns>
            private static string getOnlyTextFromHtmlCode(string htmlCode)
            {
                if (string.IsNullOrEmpty(htmlCode))
                {
                    return htmlCode;
                }

                // Line breaks in the markup itself carry no meaning; turn them into spaces.
                // "\r\n" has to be handled first so that it yields one space, not two.
                htmlCode = htmlCode.Replace("\r\n", @" ");
                htmlCode = htmlCode.Replace("\r", @" ");
                htmlCode = htmlCode.Replace("\n", @" ");

                // End of a paragraph: leave an empty line between paragraphs.
                htmlCode = htmlCode.Replace(@"</p>", Environment.NewLine + Environment.NewLine);
                htmlCode = htmlCode.Replace(@"</P>", Environment.NewLine + Environment.NewLine);

                // HTML comments.
                htmlCode = Regex.Replace(
                    htmlCode,
                    @"<!--.*?-->",
                    string.Empty,
                    RegexOptions.Singleline | RegexOptions.IgnoreCase);

                // Line-break tags (with or without attributes / self-closing slash).
                htmlCode = Regex.Replace(
                    htmlCode,
                    @"<br[^>]*>",
                    Environment.NewLine,
                    RegexOptions.Singleline | RegexOptions.IgnoreCase);

                // Elements whose content is not text: drop them including the content.
                htmlCode = removeTagFromHtmlCode(@"style", htmlCode);
                htmlCode = removeTagFromHtmlCode(@"script", htmlCode);

                // Every remaining tag. This must run after the paragraph and line-break
                // handling above, otherwise those tags would be removed without a trace.
                htmlCode = Regex.Replace(
                    htmlCode,
                    "<(.|\n)+?>",
                    string.Empty,
                    RegexOptions.Singleline | RegexOptions.IgnoreCase);

                // Character entities (umlauts and a few symbols).
                htmlCode = unescapeHtmlEntities(htmlCode);

                // Collapse runs of spaces; line breaks inserted above are not affected.
                htmlCode = Regex.Replace(
                    htmlCode,
                    @" +",
                    @" ",
                    RegexOptions.Singleline | RegexOptions.IgnoreCase);

                return htmlCode;
            }
            /// <summary>
            /// Replaces a fixed set of named HTML character entities with the characters
            /// they stand for. Entity reference: http://dev.w3.org/html5/html-author/charref
            /// </summary>
            /// <remarks>
            /// Only the entities listed in the method body are handled; any other entity
            /// is left in the text as is. Entity names are case-sensitive, so the upper-
            /// and lower-case variants are mapped separately. The non-breaking space is
            /// mapped to an ordinary space.
            /// </remarks>
            /// <param name="htmlCode">Text that may contain named character entities.</param>
            /// <returns>
            /// The text with the known entities replaced. Null or empty input is
            /// returned unchanged.
            /// </returns>
            private static string unescapeHtmlEntities(string htmlCode)
            {
                if (string.IsNullOrEmpty(htmlCode))
                {
                    return htmlCode;
                }

                htmlCode = htmlCode.Replace(@"&nbsp;", @" ");

                // German umlauts and sharp s. Each entity maps to the character of the
                // same case (the previous table mapped &Auml; to the lower-case letter
                // and contained misspelled entity names for the remaining umlauts).
                htmlCode = htmlCode.Replace(@"&Auml;", @"Ä");
                htmlCode = htmlCode.Replace(@"&auml;", @"ä");
                htmlCode = htmlCode.Replace(@"&Ouml;", @"Ö");
                htmlCode = htmlCode.Replace(@"&ouml;", @"ö");
                htmlCode = htmlCode.Replace(@"&Uuml;", @"Ü");
                htmlCode = htmlCode.Replace(@"&uuml;", @"ü");
                htmlCode = htmlCode.Replace(@"&szlig;", @"ß");

                // Symbols.
                htmlCode = htmlCode.Replace(@"&pound;", @"£");
                htmlCode = htmlCode.Replace(@"&sect;", @"§");
                htmlCode = htmlCode.Replace(@"&copy;", @"©");
                htmlCode = htmlCode.Replace(@"&reg;", @"®");
                htmlCode = htmlCode.Replace(@"&micro;", @"µ");
                htmlCode = htmlCode.Replace(@"&para;", @"¶");
                htmlCode = htmlCode.Replace(@"&Oslash;", @"Ø");
                htmlCode = htmlCode.Replace(@"&oslash;", @"ø");
                htmlCode = htmlCode.Replace(@"&divide;", @"÷");
                htmlCode = htmlCode.Replace(@"&times;", @"×");

                return htmlCode;
            }
    
            /// <summary>
            /// Removes every element with the given tag name from the markup, including
            /// everything between its opening and closing tag.
            /// </summary>
            /// <remarks>
            /// A match starts at "&lt;tag" and ends at the nearest following "&lt;/tag&gt;".
            /// Matching ignores case and spans line breaks. The tag name is inserted into
            /// the regular expression as is, without escaping.
            /// </remarks>
            /// <param name="tag">Name of the element to remove, e.g. "script".</param>
            /// <param name="htmlCode">The HTML markup to clean.</param>
            /// <returns>The markup without the matched elements.</returns>
            private static string removeTagFromHtmlCode(
                string tag,
                string htmlCode)
            {
                const RegexOptions matchOptions =
                    RegexOptions.Singleline | RegexOptions.IgnoreCase;

                // Lazy quantifier: stop at the first closing tag after each opening tag.
                string elementPattern = string.Format(@"<{0}.*?</{1}>", tag, tag);

                Regex elementMatcher = new Regex(elementPattern, matchOptions);
                return elementMatcher.Replace(htmlCode, string.Empty);
            }
            #endregion
    

      

  • 相关阅读:
    [转][html5]网页横屏
    [转][EasyUI]扩展 DateBox
    [转][C#]枚举的遍历Enum
    [转][C#]单例模式之懒加载
    [C#][Quartz]添加监听器
    [C#][Quartz]帮助类
    [转]Win 10 的 Win 按键没反应
    转来的--轻松自动化---selenium-webdriver(python) (七)---定位iframe——转来的
    安装charles
    遇到的问题汇总
  • 原文地址:https://www.cnblogs.com/geovindu/p/4310328.html
Copyright © 2020-2023  润新知