I have a requirement to render/display plain text from HTML content, and I would like to know how I can do it using C#.
For example, below is my HTML code
<p>Some text here</p>
<div>Some more <strong>text</strong></div>
I would like to have output as below in plain text
Some text here. Some more text
Any sample code or tutorial to do it?
You can create a helper class to convert your HTML content into plain text and use the below method written in C#
/// <summary>
/// Converts HTML markup into plain text: drops the head, script and style
/// blocks, strips all remaining tags, and decodes HTML entities.
/// </summary>
/// <param name="HTMLCode">The HTML markup to convert. Null or empty input yields an empty string.</param>
/// <returns>
/// The visible text. A "\n" is emitted where the markup contains "&lt;br&gt;",
/// "&lt;br " or "&lt;p "; all other whitespace runs are collapsed to one space.
/// </returns>
public static string HTMLToText(string HTMLCode)
{
    if (string.IsNullOrEmpty(HTMLCode))
    {
        return string.Empty;
    }

    // New lines, tabs and repeated white space are not visible in rendered
    // HTML, so collapse every whitespace run into a single space.
    string text = Regex.Replace(HTMLCode, "\\s+", " ");

    // Remove blocks whose content is never shown to the reader.
    RegexOptions blockOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;
    text = Regex.Replace(text, "<head\\b.*?</head>", "", blockOptions);
    text = Regex.Replace(text, "<script\\b.*?</script>", "", blockOptions);
    text = Regex.Replace(text, "<style\\b.*?</style>", "", blockOptions);

    // Mark line breaks (<br>) and paragraphs with attributes (<p ...>)
    // before the tags themselves are removed.
    text = text.Replace("<br>", "\n<br>")
               .Replace("<br ", "\n<br ")
               .Replace("<p ", "\n<p ");

    // Remove all remaining HTML tags.
    text = Regex.Replace(text, "<[^>]*>", "");

    // Decode entities only AFTER the tags are gone: decoding first would turn
    // escaped text such as "&lt;b&gt;" into "<b>" and the tag stripper would
    // then delete it. A single decoding pass also keeps "&amp;lt;" as "&lt;".
    text = System.Net.WebUtility.HtmlDecode(text);

    // &nbsp; decodes to U+00A0; return it as a regular space.
    return text.Replace('\u00A0', ' ');
}
The above method takes the HTML content, removes all the HTML tags and code, and returns the output as plain text. You can check the online sample here: http://rextester.com/AKMG13869
Go to the above link and run it, you can see output as
Some text here Some more text
done.
OR
you can also use HtmlAgilityPack to convert HTML to text in C#
Example:
// `html` is a string variable holding the markup to convert.
// NOTE(review): HtmlUtilities is presumably a project-level helper built on HtmlAgilityPack — confirm it exists before relying on this call.
var sampleText = HtmlUtilities.ConvertToPlainText(html);
Thanks.
You can also use HtmlAgilityPack for .NET to convert HTML directly into plain text in C#. Here is the sample code:
using System;
using HtmlAgilityPack;
using System.IO;
/// <summary>
/// Sample program that converts HTML to plain text by walking the node tree
/// produced by HtmlAgilityPack and writing the text nodes to a TextWriter.
/// </summary>
public class Program
{
    /// <summary>
    /// Converts a hard-coded HTML document and prints the resulting text.
    /// </summary>
    public static void Main()
    {
        var html = @"<html><head><meta name=""Generator"" content=""Microsoft Exchange Server"">
<!-- converted from text -->
<style><!-- .EmailQuote { margin-left: 1pt; padding-left: 4pt; border-left: #800000 2px solid; } --></style></head>
<body>
<font size=""2""><span style=""font-size:11pt;""><div class=""PlainText""> <p>Hello world, how are you doing today?</p><p>This is a separate <b>paragraph</b></p></div></span></font>
</body>
</html>";
        var plainText = ConvertHtml(html);
        Console.WriteLine(plainText);
    }

    /// <summary>
    /// Loads the HTML document at <paramref name="path"/> and returns its plain text.
    /// </summary>
    /// <param name="path">Path handed to HtmlDocument.Load.</param>
    /// <returns>The text written by <see cref="ConvertTo"/> for the whole document.</returns>
    public static string Convert(string path)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load(path);
        return RenderText(doc);
    }

    /// <summary>
    /// Parses <paramref name="html"/> and returns its plain text.
    /// </summary>
    /// <param name="html">HTML markup handed to HtmlDocument.LoadHtml.</param>
    /// <returns>The text written by <see cref="ConvertTo"/> for the whole document.</returns>
    public static string ConvertHtml(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        return RenderText(doc);
    }

    /// <summary>
    /// Renders an already loaded document into a string; shared by
    /// <see cref="Convert"/> and <see cref="ConvertHtml"/>.
    /// </summary>
    private static string RenderText(HtmlDocument doc)
    {
        using (StringWriter sw = new StringWriter())
        {
            ConvertTo(doc.DocumentNode, sw);
            sw.Flush();
            return sw.ToString();
        }
    }

    /// <summary>
    /// Writes the text of every child of <paramref name="node"/> to
    /// <paramref name="outText"/>, in document order.
    /// </summary>
    public static void ConvertContentTo(HtmlNode node, TextWriter outText)
    {
        foreach (HtmlNode subnode in node.ChildNodes)
        {
            ConvertTo(subnode, outText);
        }
    }

    /// <summary>
    /// Writes the plain-text form of <paramref name="node"/> to
    /// <paramref name="outText"/>, recursing into child nodes.
    /// Comments, script/style text and whitespace-only text are skipped;
    /// "p" and "br" elements emit a CRLF.
    /// </summary>
    public static void ConvertTo(HtmlNode node, TextWriter outText)
    {
        string html;
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                ConvertContentTo(node, outText);
                break;

            case HtmlNodeType.Text:
                // script and style must not be output; the parent is checked
                // for null so a detached text node does not throw
                HtmlNode parent = node.ParentNode;
                if (parent != null && (parent.Name == "script" || parent.Name == "style"))
                {
                    break;
                }

                // get text
                html = ((HtmlTextNode)node).Text;

                // is it in fact a special closing node output as text?
                if (HtmlNode.IsOverlappedClosingElement(html))
                {
                    break;
                }

                // check the text is meaningful and not a bunch of whitespaces
                if (html.Trim().Length > 0)
                {
                    outText.Write(HtmlEntity.DeEntitize(html));
                }
                break;

            case HtmlNodeType.Element:
                switch (node.Name)
                {
                    case "p":
                    case "br":
                        // treat paragraphs and explicit line breaks as crlf
                        outText.Write("\r\n");
                        break;
                }

                if (node.HasChildNodes)
                {
                    ConvertContentTo(node, outText);
                }
                break;
        }
    }
}
Working Fiddle link: https://dotnetfiddle.net/pXgXly
Hope it helps.
Subscribe to our weekly Newsletter & Keep getting latest article/questions in your inbox weekly