1987kevin
View Code
 // Saves the uploaded Word document and converts it to HTML by automating Word.
 // Requires a reference to Microsoft.Office.Interop.Word (and Word installed on the server).
 string wordPath = Server.MapPath("/Fileword/" + FileUpload1.FileName);
 string htmlPath = Server.MapPath("/Fileword/测试.html");
 // Every HTML file saved by Word has a companion folder ("<html file name>.files")
 // holding its images and other resources. Only the extension is swapped here, so a
 // ".html" that happens to appear elsewhere in the physical path is left alone.
 string resourceDir = Path.ChangeExtension(htmlPath, ".files");

 // Make sure the target directory exists before anything is written into it.
 Directory.CreateDirectory(Path.GetDirectoryName(wordPath));

 // Upload the Word file.
 FileUpload1.SaveAs(wordPath);

 #region File format conversion
 ApplicationClass word = null;
 Documents docs = null;
 Document doc = null;
 try
 {
     word = new ApplicationClass();
     docs = word.Documents;

     // Open the file. Late binding lets the many optional arguments be omitted;
     // the three passed are FileName, ConfirmConversions and ReadOnly.
     // NOTE(review): ConfirmConversions = true may show a dialog for non-Word files - confirm this is wanted on a server.
     object fileName = wordPath;
     doc = (Document)docs.GetType().InvokeMember("Open", BindingFlags.InvokeMethod, null, docs, new object[] { fileName, true, true });

     // Remove the output of any previous conversion so SaveAs starts from a clean slate.
     if (File.Exists(htmlPath))
     {
         File.Delete(htmlPath);
     }
     if (Directory.Exists(resourceDir))
     {
         Directory.Delete(resourceDir, true);
     }

     // Convert the format by calling Word's "Save As" with the HTML format.
     object saveFileName = htmlPath;
     doc.GetType().InvokeMember("SaveAs", BindingFlags.InvokeMethod, null, doc, new object[] { saveFileName, WdSaveFormat.wdFormatHTML });
 }
 finally
 {
     try
     {
         // Always quit Word, even when the conversion failed; otherwise every failed
         // request leaves a WINWORD.EXE process behind. Discard pending changes so
         // Word never waits on a "save changes?" prompt nobody can answer.
         if (word != null)
         {
             word.GetType().InvokeMember("Quit", BindingFlags.InvokeMethod, null, word, new object[] { WdSaveOptions.wdDoNotSaveChanges });
         }
     }
     finally
     {
         // Release the COM wrappers so the Word process can actually exit.
         if (doc != null)
         {
             System.Runtime.InteropServices.Marshal.ReleaseComObject(doc);
         }
         if (docs != null)
         {
             System.Runtime.InteropServices.Marshal.ReleaseComObject(docs);
         }
         if (word != null)
         {
             System.Runtime.InteropServices.Marshal.ReleaseComObject(word);
         }
     }
 }
 #endregion

这样生成的 HTML 会包含大量冗余标记,可以用下面的方法将其清除:

View Code
 /// <summary>
 /// Strips the redundant markup that Word writes when a document is saved as HTML:
 /// comments, style and xml blocks, class attributes, Office-namespaced and wrapper
 /// tags, and empty paragraphs. Remaining style attributes are normalised to double
 /// quotes and reduced to their "background:" and "color:" declarations.
 /// </summary>
 /// <param name="html">HTML produced by Word; null or empty is returned unchanged.</param>
 /// <returns>The cleaned HTML.</returns>
 public static string CleanWordHtml(string html)
 {
     if (string.IsNullOrEmpty(html))
     {
         return html;
     }

     // Applied in order, case-insensitively; every match is deleted.
     string[] junkPatterns =
     {
         // Complete comments first, so a '>' inside a comment cannot cut the match short.
         @"<!--[\s\S]*?-->",
         // Leftover comment openers that never reach a closing "-->".
         @"<!--[\s\S]+?>",
         @"<style\b[^>]*>[\s\S]*?</style>",
         @"\s?class=\w+",
         @"<(meta|link|/?o:|/?font|/?strong|/?st\d|/?head|/?html|body|/?body|/?w:|/?m:|/?v:|!\[)[^>]*?>",
         // Empty paragraphs: opening tags wrapping nothing but a non-breaking space.
         @"(<[^>]+>)+&nbsp;(</\w+>)+",
         // xml islands together with everything inside them.
         @"<xml\b[^>]*>[\s\S]*?</xml>",
     };

     foreach (string pattern in junkPatterns)
     {
         html = Regex.Replace(html, pattern, "", RegexOptions.IgnoreCase);
     }

     // Collapse runs of blank lines into a single line break (CRLF or bare LF),
     // keeping the kind of line break the document already uses.
     html = Regex.Replace(html, @"(\r?\n){2,}", "$1");

     // Word writes style='...' with double quotes inside (e.g. font names). Blank out
     // the inner double quotes, then switch the attribute itself to double quotes.
     html = Regex.Replace(
         html,
         "style='[^']+'",
         m => m.Value.Replace('"', ' ').Replace("'", "\""),
         RegexOptions.IgnoreCase);

     // Keep only the declarations that start with "background:" or "color:".
     html = Regex.Replace(html, @"(?<=style=['""])[^'""]*(?=['""])", delegate(Match m)
     {
         return string.Join(";", m.Value.Split(';').Where(t => Regex.IsMatch(t.Trim(), @"^(background|color):")).ToArray());
     });

     return html;
 }

可以根据自己的需求修改上面的正则表达式。

 

分类:

技术点:

相关文章: