#region GetOnlyTextFromHtmlCode + RemoveHtmlChars + RemoveTagFromHtmlCode

/// <summary>
/// Reduces an HTML fragment to plain text.
/// Processing order: line breaks inside the markup become spaces, closing
/// paragraph tags become a blank line, HTML comments are dropped,
/// &lt;br&gt; becomes a line break, &lt;style&gt;/&lt;script&gt; elements are
/// removed together with their content, all remaining tags are stripped,
/// the known character entities are decoded and runs of spaces are collapsed.
/// Reference:
/// http://www.codeproject.com/script/Content/ViewAssociatedFile.aspx?rzp=%2FKB%2Fedit%2FZetaHtmlEditControl%2F%2FZetaHtmlEditControl-Source.zip&zep=Control%2FHtmlEditControl.cs&obid=43954&obtid=2&ovid=13
/// </summary>
/// <param name="htmlCode">The HTML markup to convert. May be null or empty.</param>
/// <returns>
/// The text content of <paramref name="htmlCode"/>, or an empty string when
/// the input is null or empty.
/// </returns>
private static string getOnlyTextFromHtmlCode(string htmlCode)
{
    if (string.IsNullOrEmpty(htmlCode))
    {
        return string.Empty;
    }

    const RegexOptions options = RegexOptions.Singleline | RegexOptions.IgnoreCase;
    string paragraphBreak = Environment.NewLine + Environment.NewLine;

    // Line breaks in the markup itself are insignificant whitespace; the only
    // line breaks in the result must come from </p> and <br> below.
    htmlCode = htmlCode.Replace("\r\n", " ");
    htmlCode = htmlCode.Replace("\r", " ");
    htmlCode = htmlCode.Replace("\n", " ");

    // End of paragraph -> blank line (matches both </p> and </P>).
    htmlCode = Regex.Replace(htmlCode, @"</p>", paragraphBreak, options);

    // HTML comments.
    htmlCode = Regex.Replace(htmlCode, @"<!--.*?-->", string.Empty, options);

    // <br>, <br/>, <br clear="all"> ... -> single line break.
    htmlCode = Regex.Replace(htmlCode, @"<br[^>]*>", Environment.NewLine, options);

    // Elements whose content is not text and must vanish completely.
    htmlCode = removeTagFromHtmlCode(@"style", htmlCode);
    htmlCode = removeTagFromHtmlCode(@"script", htmlCode);

    // Every remaining tag. Under Singleline '.' already matches line breaks,
    // so no "(.|\n)" alternation is needed.
    htmlCode = Regex.Replace(htmlCode, @"<.+?>", string.Empty, options);

    // Character entities (umlauts and a few symbols).
    htmlCode = unescapeHtmlEntities(htmlCode);

    // Collapse runs of spaces; line breaks inserted above are left alone.
    htmlCode = Regex.Replace(htmlCode, @" +", @" ", options);

    return htmlCode;
}

/// <summary>
/// Entity name / replacement pairs handled by <see cref="unescapeHtmlEntities"/>.
/// Replacement characters are written as \u escapes so the table does not
/// depend on the encoding of this source file.
/// Entity list: http://dev.w3.org/html5/html-author/charref
/// </summary>
private static readonly string[,] knownHtmlEntities = new string[,]
{
    { "&nbsp;",   " " },       // non-breaking space, rendered as a plain space
    { "&Auml;",   "\u00C4" },  // A with diaeresis
    { "&auml;",   "\u00E4" },  // a with diaeresis
    { "&Ouml;",   "\u00D6" },  // O with diaeresis
    { "&ouml;",   "\u00F6" },  // o with diaeresis
    { "&Uuml;",   "\u00DC" },  // U with diaeresis
    { "&uuml;",   "\u00FC" },  // u with diaeresis
    { "&szlig;",  "\u00DF" },  // sharp s
    { "&pound;",  "\u00A3" },  // pound sign
    { "&sect;",   "\u00A7" },  // section sign
    { "&copy;",   "\u00A9" },  // copyright sign
    { "&reg;",    "\u00AE" },  // registered sign
    { "&micro;",  "\u00B5" },  // micro sign
    { "&para;",   "\u00B6" },  // pilcrow
    { "&Oslash;", "\u00D8" },  // O with stroke
    { "&oslash;", "\u00F8" },  // o with stroke
    { "&divide;", "\u00F7" },  // division sign
    { "&times;",  "\u00D7" }   // multiplication sign
};

/// <summary>
/// Replaces the named character entities listed in
/// <see cref="knownHtmlEntities"/> with the characters they stand for.
/// Entity names are case-sensitive; entities not in the table are left as-is.
/// </summary>
/// <param name="htmlCode">Text that may contain named entities. May be null or empty.</param>
/// <returns>The text with the known entities decoded; null/empty input is returned unchanged.</returns>
private static string unescapeHtmlEntities(string htmlCode)
{
    if (string.IsNullOrEmpty(htmlCode))
    {
        return htmlCode;
    }

    int entityCount = knownHtmlEntities.GetLength(0);
    for (int i = 0; i < entityCount; i++)
    {
        htmlCode = htmlCode.Replace(knownHtmlEntities[i, 0], knownHtmlEntities[i, 1]);
    }

    return htmlCode;
}

/// <summary>
/// Removes every occurrence of the given element, from its opening tag up to
/// and including the nearest closing tag, content included.
/// </summary>
/// <param name="tag">Element name without angle brackets, e.g. "script".</param>
/// <param name="htmlCode">The HTML markup to clean. May be null or empty.</param>
/// <returns>
/// The markup without the matched elements; returned unchanged when
/// <paramref name="tag"/> or <paramref name="htmlCode"/> is null or empty.
/// </returns>
private static string removeTagFromHtmlCode(
    string tag,
    string htmlCode)
{
    if (string.IsNullOrEmpty(htmlCode) || string.IsNullOrEmpty(tag))
    {
        return htmlCode;
    }

    // Escape the name so it is matched literally, and require a word boundary
    // after it so that e.g. "s" does not start matching at "<span".
    string escapedTag = Regex.Escape(tag);
    string pattern = string.Format(@"<{0}\b.*?</{0}\s*>", escapedTag);

    return Regex.Replace(
        htmlCode,
        pattern,
        string.Empty,
        RegexOptions.Singleline | RegexOptions.IgnoreCase);
}

#endregion