如何在不支付组件费用的情况下将HTML转换为.NET中的RTF(富文本格式)?
时间:2020-03-06 14:53:30 来源:igfitidea点击:
是否有免费的第三方或者.NET类将HTML转换为RTF(用于启用了富文本格式的Windows Forms控件)?
"免费"的要求来自以下事实:我仅在原型上工作,并且可以加载BrowserControl并根据需要渲染HTML(即使速度很慢),并且Developer Express将发布自己的此类代码。控制很快。
我不想学习手工编写RTF,而且我已经知道HTML,因此我认为这是快速发布一些可证明代码的最快方法。
解决方案
也许我们需要的是用于编辑HTML的控件?
当然,它并不是完美的,但是这是我用来将HTML转换为纯文本的代码。
(我不是原始作者,而是根据网络上的代码改编而成的)
public static string ConvertHtmlToText(string source) { string result; // Remove HTML Development formatting // Replace line breaks with space // because browsers inserts space result = source.Replace("\r", " "); // Replace line breaks with space // because browsers inserts space result = result.Replace("\n", " "); // Remove step-formatting result = result.Replace("\t", string.Empty); // Remove repeating speces becuase browsers ignore them result = System.Text.RegularExpressions.Regex.Replace(result, @"( )+", " "); // Remove the header (prepare first by clearing attributes) result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*head([^>])*>", "<head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*head( )*>)", "</head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(<head>).*(</head>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // remove all scripts (prepare first by clearing attributes) result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*script([^>])*>", "<script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*script( )*>)", "</script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); //result = System.Text.RegularExpressions.Regex.Replace(result, // @"(<script>)([^(<script>\.</script>)])*(</script>)", // string.Empty, // System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<script>).*(</script>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // remove all styles (prepare first by clearing attributes) result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*style([^>])*>", "<style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*style( )*>)", "</style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(<style>).*(</style>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // insert tabs in spaces of <td> tags result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*td([^>])*>", "\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // insert line breaks in places of <BR> and <LI> tags result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*br( )*>", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*li( )*>", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // insert line paragraphs (double line breaks) in place // if <P>, <DIV> and <TR> tags result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*div([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*tr([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*p([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove remaining tags like <a>, links, images, // comments etc - anything thats enclosed inside < > result = System.Text.RegularExpressions.Regex.Replace(result, @"<[^>]*>", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // replace special characters: result = System.Text.RegularExpressions.Regex.Replace(result, @" ", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"•", " * ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"‹", "<", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"›", ">", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"™", "(tm)", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"⁄", "/", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"<", "<", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @">", ">", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"©", "(c)", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"®", "(r)", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove all others. More can be added, see // http://hotwired.lycos.com/webmonkey/reference/special_characters/ result = System.Text.RegularExpressions.Regex.Replace(result, @"&(.{2,6});", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // make line breaking consistent result = result.Replace("\n", "\r"); // Remove extra line breaks and tabs: // replace over 2 breaks with 2 and over 4 tabs with 4. // Prepare first to remove any whitespaces inbetween // the escaped characters and remove redundant tabs inbetween linebreaks result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\t)", "\t\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\r)", "\t\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\t)", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove redundant tabs result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove multible tabs followind a linebreak with just one tab result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Initial replacement target string for linebreaks string breaks = "\r\r\r"; // Initial replacement target string for tabs string tabs = "\t\t\t\t\t"; for (int index = 0; index < result.Length; index++) { result = result.Replace(breaks, "\r\r"); result = result.Replace(tabs, "\t\t\t\t"); breaks = breaks + "\r"; tabs = tabs + "\t"; } // Thats it. return result; }
在XHTML2RTF上查看此CodeProject文章。