C# .NET: Remove HTML Markup From a String

11/25/2013 9:59:51 PM

.NET Framework

The following method removes all HTML markup from a string. It has an option to keep line breaks, in which case they are converted to <br /> tags.

It uses Regular Expressions and requires clean HTML.

/// <summary>
/// Removes every HTML tag from a string, optionally keeping line breaks in the
/// result as <c>&lt;br /&gt;</c> tags.
/// </summary>
/// <param name="html">
/// The markup to clean. Tags are matched with the simple pattern
/// <c>&lt;[^&gt;]*&gt;</c>, so a <c>&gt;</c> character inside an attribute value
/// or comment ends the match early; the input is expected to be clean HTML.
/// </param>
/// <param name="keepLineBreaks">
/// When <c>true</c>, each closing <c>&lt;/p&gt;</c> tag and each <c>&lt;br&gt;</c>
/// tag (any letter case, with or without whitespace, attributes or a trailing
/// slash) is emitted as <c>&lt;br /&gt;</c> in the result. Any
/// <see cref="Environment.NewLine"/> sequence already present in the input is
/// converted to <c>&lt;br /&gt;</c> as well.
/// </param>
/// <returns>
/// The input with all tags removed, or the input itself unchanged when it is
/// <c>null</c> or empty.
/// </returns>
public static string RemoveHtml(string html, bool keepLineBreaks = false)
{
	if (string.IsNullOrEmpty(html))
	{
		return html;
	}

	if (keepLineBreaks)
	{
		// Turn every paragraph end and <br> variant into a plain newline so the
		// break survives the tag strip below. Matching is case-insensitive and
		// tolerant of whitespace/attributes, so <BR>, <br  /> and </P> count too.
		// \b keeps the pattern from matching unrelated tags that merely start
		// with "br".
		html = System.Text.RegularExpressions.Regex.Replace(
			html,
			@"</p\s*>|<br\b[^>]*>",
			Environment.NewLine,
			System.Text.RegularExpressions.RegexOptions.IgnoreCase
				| System.Text.RegularExpressions.RegexOptions.CultureInvariant);
	}

	// Strip all remaining tags (this also removes opening <p ...> tags). The
	// static Regex.Replace overload reuses the framework's cached pattern rather
	// than constructing a new Regex on each call.
	html = System.Text.RegularExpressions.Regex.Replace(html, "<[^>]*>", "");

	if (keepLineBreaks)
	{
		// Re-introduce the preserved breaks as markup.
		html = html.Replace(Environment.NewLine, "<br />");
	}

	return html;
}