我需要一些更强大的东西,所以我从这里获取并创建了这个......这个解决方案不太优雅,也更冗长,但在我的测试中(使用 1,000,000 行样本),我发现了这个要快 2 到 3 倍。此外,它还可以处理非转义的嵌入式引号。由于我的解决方案的要求,我使用了字符串分隔符和限定符而不是字符。我发现找到一个好的通用 CSV 解析器比我预期的要困难,所以我希望这个解析算法可以帮助某人。
public static string[] SplitRow(string record, string delimiter, string qualifier, bool trimData)
{
// In-Line for example, but I implemented as string extender in production code
Func <string, int, int> IndexOfNextNonWhiteSpaceChar = delegate (string source, int startIndex)
{
if (startIndex >= 0)
{
if (source != null)
{
for (int i = startIndex; i < source.Length; i++)
{
if (!char.IsWhiteSpace(source[i]))
{
return i;
}
}
}
}
return -1;
};
var results = new List<string>();
var result = new StringBuilder();
var inQualifier = false;
var inField = false;
// We add new columns at the delimiter, so append one for the parser.
var row = $"{record}{delimiter}";
for (var idx = 0; idx < row.Length; idx++)
{
// A delimiter character...
if (row[idx]== delimiter[0])
{
// Are we inside qualifier? If not, we've hit the end of a column value.
if (!inQualifier)
{
results.Add(trimData ? result.ToString().Trim() : result.ToString());
result.Clear();
inField = false;
}
else
{
result.Append(row[idx]);
}
}
// NOT a delimiter character...
else
{
// ...Not a space character
if (row[idx] != ' ')
{
// A qualifier character...
if (row[idx] == qualifier[0])
{
// Qualifier is closing qualifier...
if (inQualifier && row[IndexOfNextNonWhiteSpaceChar(row, idx + 1)] == delimiter[0])
{
inQualifier = false;
continue;
}
else
{
// ...Qualifier is opening qualifier
if (!inQualifier)
{
inQualifier = true;
}
// ...It's a qualifier inside a qualifier.
else
{
inField = true;
result.Append(row[idx]);
}
}
}
// Not a qualifier character...
else
{
result.Append(row[idx]);
inField = true;
}
}
// ...A space character
else
{
if (inQualifier || inField)
{
result.Append(row[idx]);
}
}
}
}
return results.ToArray<string>();
}
一些测试代码:
//var input = "111,222,\"33,44,55\",666,\"77,88\",\"99\"";
var input =
"111, 222, \"99\",\"33,44,55\" , \"666 \"mark of a man\"\", \" spaces \"77,88\" \"";
Console.WriteLine("Split with trim");
Console.WriteLine("---------------");
var result = SplitRow(input, ",", "\"", true);
foreach (var r in result)
{
Console.WriteLine(r);
}
Console.WriteLine("");
// Split 2
Console.WriteLine("Split with no trim");
Console.WriteLine("------------------");
var result2 = SplitRow(input, ",", "\"", false);
foreach (var r in result2)
{
Console.WriteLine(r);
}
Console.WriteLine("");
// Time Trial 1
Console.WriteLine("Experimental Process (1,000,000) iterations");
Console.WriteLine("-------------------------------------------");
watch = Stopwatch.StartNew();
for (var i = 0; i < 1000000; i++)
{
var x1 = SplitRow(input, ",", "\"", false);
}
watch.Stop();
elapsedMs = watch.ElapsedMilliseconds;
Console.WriteLine($"Total Process Time: {string.Format("{0:0.###}", elapsedMs / 1000.0)} Seconds");
Console.WriteLine("");
结果
Split with trim
---------------
111
222
99
33,44,55
666 "mark of a man"
spaces "77,88"
Split with no trim
------------------
111
222
99
33,44,55
666 "mark of a man"
spaces "77,88"
Original Process (1,000,000) iterations
-------------------------------
Total Process Time: 7.538 Seconds
Experimental Process (1,000,000) iterations
--------------------------------------------
Total Process Time: 3.363 Seconds