using System;
using System.IO;
using System.Linq;
using HtmlAgilityPack;

namespace Wino.Services.Extensions
{
    /// <summary>
    /// HtmlAgilityPack helpers for stripping content from HTML documents and
    /// flattening HTML into plain text.
    /// </summary>
    public static class HtmlAgilityPackExtensions
    {
        // Line break emitted for <p> and <br> while flattening HTML to text.
        // GetPreviewText strips this exact sequence again, so the writer and the
        // cleanup must share one constant instead of relying on Environment.NewLine.
        private const string LineBreak = "\r\n";

        /// <summary>
        /// Clears out the src attribute for all `img` and `v:fill` tags.
        /// </summary>
        /// <param name="document">Document to modify in place.</param>
        /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
        public static void ClearImages(this HtmlDocument document)
        {
            if (document == null)
            {
                throw new ArgumentNullException(nameof(document));
            }

            // NOTE(review): the body of this method was truncated in the reviewed
            // source (only the start of an InnerHtml.Contains(...) pre-check was
            // visible). It is rebuilt here from the documented contract above;
            // confirm against the original implementation before merging.
            // Descendants() is used so that a document without matching tags is
            // simply a no-op. Materialize first so we are not mutating nodes
            // while the lazy traversal is still running.
            var imageNodes = document.DocumentNode
                .Descendants()
                .Where(n => n.Name.Equals("img", StringComparison.OrdinalIgnoreCase)
                         || n.Name.Equals("v:fill", StringComparison.OrdinalIgnoreCase))
                .ToList();

            foreach (var imageNode in imageNodes)
            {
                imageNode.Attributes.Remove("src");
            }
        }

        /// <summary>
        /// Removes all `script` and `style` elements and all comment nodes from the document.
        /// </summary>
        /// <param name="document">Document to modify in place.</param>
        /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
        public static void ClearStyles(this HtmlDocument document)
        {
            if (document == null)
            {
                throw new ArgumentNullException(nameof(document));
            }

            // Snapshot the matches before removing anything: Remove() changes the
            // tree that Descendants() is lazily walking.
            var nodesToRemove = document.DocumentNode
                .Descendants()
                .Where(n => n.Name.Equals("script", StringComparison.OrdinalIgnoreCase)
                         || n.Name.Equals("style", StringComparison.OrdinalIgnoreCase)
                         || n.Name.Equals("#comment", StringComparison.OrdinalIgnoreCase))
                .ToList();

            foreach (var node in nodesToRemove)
            {
                node.Remove();
            }
        }

        /// <summary>
        /// Returns plain text from the HTML content.
        /// </summary>
        /// <param name="htmlContent">Content to get preview from.</param>
        /// <returns>
        /// Text body for the html with every CRLF sequence removed, or an empty
        /// string when <paramref name="htmlContent"/> is null or empty.
        /// </returns>
        public static string GetPreviewText(string htmlContent)
        {
            if (string.IsNullOrEmpty(htmlContent))
            {
                return string.Empty;
            }

            var doc = new HtmlDocument();
            doc.LoadHtml(htmlContent);

            using (var writer = new StringWriter())
            {
                ConvertTo(doc.DocumentNode, writer);

                // Strip the same literal that ConvertTo writes. Using
                // Environment.NewLine here would leave stray '\r' characters on
                // platforms where it is not "\r\n".
                return writer.ToString().Replace(LineBreak, "");
            }
        }

        /// <summary>
        /// Writes the text of every child of <paramref name="node"/> to <paramref name="outText"/>.
        /// </summary>
        private static void ConvertContentTo(HtmlNode node, TextWriter outText)
        {
            foreach (HtmlNode subnode in node.ChildNodes)
            {
                ConvertTo(subnode, outText);
            }
        }

        /// <summary>
        /// Recursively writes the text content of <paramref name="node"/> to <paramref name="outText"/>.
        /// Comments and text inside `script`/`style` are skipped; `p` and `br`
        /// elements emit a CRLF before their content.
        /// </summary>
        private static void ConvertTo(HtmlNode node, TextWriter outText)
        {
            switch (node.NodeType)
            {
                case HtmlNodeType.Comment:
                    // Comments never contribute to the text output.
                    break;

                case HtmlNodeType.Document:
                    ConvertContentTo(node, outText);
                    break;

                case HtmlNodeType.Text:
                {
                    // Script and style bodies must not be output.
                    string parentName = node.ParentNode?.Name;
                    if (parentName == "script" || parentName == "style")
                    {
                        break;
                    }

                    string text = ((HtmlTextNode)node).Text;

                    // Is it in fact a special closing node output as text?
                    if (HtmlNode.IsOverlappedClosingElement(text))
                    {
                        break;
                    }

                    // Only emit text that is more than a run of whitespace.
                    if (!string.IsNullOrWhiteSpace(text))
                    {
                        outText.Write(HtmlEntity.DeEntitize(text));
                    }

                    break;
                }

                case HtmlNodeType.Element:
                    // Treat paragraphs and explicit breaks as CRLF.
                    if (node.Name == "p" || node.Name == "br")
                    {
                        outText.Write(LineBreak);
                    }

                    if (node.HasChildNodes)
                    {
                        ConvertContentTo(node, outText);
                    }

                    break;
            }
        }
    }
}