以下解决方案会删除所有无效的 XML 字符,并且我认为它在性能上已尽可能优化:只有在确定字符串中确实包含无效字符之后,才会分配新的 StringBuilder 和新的字符串。因此热点路径只是一个遍历字符的 for 循环,对每个字符通常只需不超过两次大于/小于的数值比较。如果没有发现无效字符,就直接返回原始字符串。当绝大多数字符串本来就合法时,这一点特别有用:这些字符串可以尽快原样通过(没有多余的分配等)。
-- 更新--
下文还介绍了如何直接将包含这些无效字符的 XElement 写成字符串,其实现仍然使用这里的代码 --
部分代码受到 Tom Bogle 先生的解决方案(https://stackoverflow.com/a/13039301/264031)的启发。另请参阅同一讨论串中 superlogical 的帖子里的有用信息。不过,那些方案无论如何都会实例化一个新的 StringBuilder 和新的字符串。
用法:
// Returns the same string instance when no invalid XML 1.0 characters are found.
string xmlStrBack = XML.ToValidXmlCharactersString("any string");
测试:
/// <summary>
/// Demonstrates XML.ToValidXmlCharactersString on a string that contains an
/// invalid XML character and on a string that is already clean.
/// </summary>
public static void TestXmlCleanser()
{
    // The invalid character (U+0001, between "Mont" and "oya") is written as an
    // escape sequence so it is visible in source and cannot be lost by editors
    // or copy/paste.
    string badString = "My name is Inigo Mont\u0001oya";
    string goodString = "My name is Inigo Montoya!";

    string back1 = XML.ToValidXmlCharactersString(badString); // invalid char removed
    string back2 = XML.ToValidXmlCharactersString(goodString); // returns same string

    XElement x1 = new XElement("test", back1);
    XElement x2 = new XElement("test", back2);
    XElement x3WithBadString = new XElement("test", badString);

    string xml1 = x1.ToString();
    string xml2 = x2.ToString().Print(); // Print() is an extension method not shown in this snippet

    // Expected to throw: the element text still contains the invalid character.
    string xmlShouldFail = x3WithBadString.ToString();
}
// --- 代码 ---(这些方法位于一个名为 XML 的静态工具类中)
/// <summary>
/// Removes every character that is not allowed by the XML 1.0 specification.
/// When no invalid character is found the original string instance is returned,
/// so no StringBuilder or new string is allocated on that path.
/// A well-formed UTF-16 surrogate pair (a code point in U+10000..U+10FFFF) is
/// valid XML and is kept; only unpaired surrogates are removed.
/// </summary>
/// <param name="s">String to clean. Null or empty input is returned unchanged.</param>
/// <param name="startIndex">
/// The index to begin checking at. Characters before the first invalid character
/// found from this index are copied without being checked. Negative values are
/// treated as 0. The index should not point into the middle of a surrogate pair.
/// </param>
/// <returns>
/// <paramref name="s"/> itself when nothing had to be removed, otherwise a new
/// string without the invalid characters.
/// </returns>
public static string ToValidXmlCharactersString(string s, int startIndex = 0)
{
    int firstInvalid = IndexOfFirstInvalidXMLChar(s, startIndex);
    if (firstInvalid < 0)
        return s;

    int len = s.Length;
    var sb = new StringBuilder(len);

    // Everything before the first invalid character is copied in one call.
    sb.Append(s, 0, firstInvalid);

    for (int i = firstInvalid; i < len; i++)
    {
        char c = s[i];
        if (IsLegalXmlChar(c))
        {
            sb.Append(c);
        }
        else if (char.IsSurrogatePair(s, i))
        {
            // High + low surrogate together encode one valid supplementary
            // code point: keep both code units and skip the low half.
            sb.Append(c).Append(s[i + 1]);
            i++;
        }
        // Anything else (control chars, lone surrogates, U+FFFE/U+FFFF) is dropped.
    }
    return sb.ToString();
}

/// <summary>
/// Gets the index of the first invalid XML 1.0 character in this string, else returns -1.
/// A well-formed surrogate pair counts as one valid character; an unpaired
/// surrogate is reported as invalid.
/// </summary>
/// <param name="s">String to scan. Null or empty input yields -1.</param>
/// <param name="startIndex">
/// Index at which scanning starts. Negative values are treated as 0; a value at
/// or beyond the string length yields -1.
/// </param>
/// <returns>Zero-based index of the first invalid character, or -1 when there is none.</returns>
public static int IndexOfFirstInvalidXMLChar(string s, int startIndex = 0)
{
    if (string.IsNullOrEmpty(s) || startIndex >= s.Length)
        return -1;

    if (startIndex < 0)
        startIndex = 0;

    int len = s.Length;
    for (int i = startIndex; i < len; i++)
    {
        if (IsLegalXmlChar(s[i]))
            continue;

        if (char.IsSurrogatePair(s, i))
        {
            i++; // skip the low half of a valid pair
            continue;
        }
        return i;
    }
    return -1;
}

/// <summary>
/// Indicates whether a single UTF-16 code unit is, on its own, a valid character
/// according to the XML 1.0 spec: U+0009, U+000A, U+000D, U+0020..U+D7FF or
/// U+E000..U+FFFD. Surrogate code units return false here because they are only
/// valid as a high/low pair, which the callers check separately.
/// This code represents an optimized version of Tom Bogle's on SO:
/// https://stackoverflow.com/a/13039301/264031.
/// </summary>
/// <param name="c">The UTF-16 code unit to test.</param>
/// <returns>True when <paramref name="c"/> is a legal XML 1.0 character by itself.</returns>
public static bool IsLegalXmlChar(char c)
{
    // Most common case first: U+0020..U+D7FF.
    if (c > '\u001F' && c <= '\uD7FF')
        return true;

    // Below U+0020 only tab, line feed and carriage return are allowed.
    if (c < '\u0020')
        return c == '\t' || c == '\n' || c == '\r';

    // Remaining candidates are U+D800..U+FFFF; only U+E000..U+FFFD are legal.
    // A char can never exceed U+FFFF, so no upper-plane comparison is needed.
    return c >= '\uE000' && c <= '\uFFFD';
}
======== ======== ========
直接写 XElement.ToString
======== ======== ========
首先,这个扩展方法的用法:
// xelem is an XElement; with the default arguments invalid characters are deleted and the output is indented.
string result = xelem.ToStringIgnoreInvalidChars();
-- 更全面的测试--
/// <summary>
/// Demonstrates ToStringIgnoreInvalidChars in both modes: deleting the invalid
/// character, and leaving it for the writer to encode as a character entity.
/// </summary>
public static void TestXmlCleanser()
{
    // The invalid character (U+0001, between "Mont" and "oya") is written as an
    // escape sequence so it is visible in source and cannot be lost by editors
    // or copy/paste.
    string badString = "My name is Inigo Mont\u0001oya";
    XElement x = new XElement("test", badString);

    string xml1 = x.ToStringIgnoreInvalidChars();
    // result: <test>My name is Inigo Montoya</test>

    string xml2 = x.ToStringIgnoreInvalidChars(deleteInvalidChars: false);
    // result: the invalid char is entity-encoded by the writer instead of removed,
    // e.g. <test>My name is Inigo Mont&#x1;oya</test>
}
--- 代码---
/// <summary>
/// Serializes the element to a string through an
/// <c>XmlTextWriterIgnoreInvalidChars</c>, so that invalid XML characters are
/// either removed while writing or left for that writer to encode as entities,
/// rather than causing the exception that the default XElement writer raises.
/// </summary>
/// <param name="xml">Element to serialize; a null element yields null.</param>
/// <param name="deleteInvalidChars">True to delete invalid characters, false to have them entity encoded.</param>
/// <param name="indent">True for indented output, false for no formatting.</param>
/// <param name="indentChar">Character used for indentation; null keeps the writer's default.</param>
/// <returns>The serialized XML, or null when <paramref name="xml"/> is null.</returns>
public static string ToStringIgnoreInvalidChars(this XElement xml, bool deleteInvalidChars = true, bool indent = true, char? indentChar = null)
{
    if (xml == null)
    {
        return null;
    }

    var buffer = new StringWriter();
    using (var xmlWriter = new XmlTextWriterIgnoreInvalidChars(buffer, deleteInvalidChars))
    {
        // XmlTextWriter exposes no settable Settings object (it is null), so
        // options such as newLineOnAttributes or omitXmlDeclaration cannot be
        // specified here; only the formatting properties below are available.
        if (indent)
        {
            xmlWriter.Formatting = Formatting.Indented;
        }
        else
        {
            xmlWriter.Formatting = Formatting.None;
        }

        if (indentChar.HasValue)
        {
            xmlWriter.IndentChar = indentChar.Value;
        }

        xml.WriteTo(xmlWriter);
    }

    // Read the text only after the writer has been disposed.
    return buffer.ToString();
}
-- 上述方法使用以下 XmlTextWriter 子类 --
/// <summary>
/// An <see cref="XmlTextWriter"/> whose <see cref="WriteString"/> can strip
/// invalid XML characters from the text before handing it to the base writer.
/// </summary>
public class XmlTextWriterIgnoreInvalidChars : XmlTextWriter
{
    /// <summary>
    /// When true, text passed to <see cref="WriteString"/> is first cleaned with
    /// XML.ToValidXmlCharactersString; when false it is forwarded unchanged.
    /// </summary>
    public bool DeleteInvalidChars { get; set; }

    /// <summary>
    /// Creates the writer on top of the given text writer.
    /// </summary>
    /// <param name="w">Destination text writer, passed to the base constructor.</param>
    /// <param name="deleteInvalidChars">Initial value of <see cref="DeleteInvalidChars"/>.</param>
    public XmlTextWriterIgnoreInvalidChars(TextWriter w, bool deleteInvalidChars = true)
        : base(w)
    {
        this.DeleteInvalidChars = deleteInvalidChars;
    }

    /// <summary>
    /// Writes the text content, removing invalid XML characters first when
    /// <see cref="DeleteInvalidChars"/> is set. Null text is forwarded as is.
    /// </summary>
    /// <param name="text">Text to write; may be null.</param>
    public override void WriteString(string text)
    {
        bool mustClean = text != null && DeleteInvalidChars;
        string output = mustClean ? XML.ToValidXmlCharactersString(text) : text;
        base.WriteString(output);
    }
}