您的示例数据在 'double qoute' (sic) 之后有一个单双引号,它终止了字段的双引号开头,但您必须继续阅读,直到下一个逗号或行尾。这是一个格式错误的多行字段,但您会看到 Excel 对它的作用。您可以在(优秀的)书籍The Practice of Programming(死链接 - Internet Archive copy;Princeton University 的实时链接)中找到对此的描述,其中包括 C 中的 CSV 解析库和重新实现C++,并讨论了这个级别的细节。 RFC 4180“Common Format and MIME Type for Comma-Separated Values”中还有一个 CSV 的标准,您还可以研究 Wikipedia 关于该主题。
另一个答案是一些示例代码,仍在测试中。它在其限制范围内非常有弹性。这里修改成SSCCE(Short, Self-Contained, Correct Example)。
#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>
extern const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline);
// Returns a pointer to the start of the next field,
// or zero if this is the last field in the CSV
// p is the start position of the field
// sep is the separator used, i.e. comma or semicolon
// newline says whether the field ends with a newline or with a comma
const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline)
{
// Parse quoted sequences
if ('"' == p[0]) {
p++;
while (1) {
// Find next double-quote
p = wcschr(p, L'"');
// If we don't find it or it's the last symbol
// then this is the last field
if (!p || !p[1])
return 0;
// Check for "", it is an escaped double-quote
if (p[1] != '"')
break;
// Skip the escaped double-quote
p += 2;
}
}
// Find next newline or comma.
wchar_t newline_or_sep[4] = L"\n\r ";
newline_or_sep[2] = sep;
p = wcspbrk(p, newline_or_sep);
// If no newline or separator, this is the last field.
if (!p)
return 0;
// Check if we had newline.
*newline = (p[0] == '\r' || p[0] == '\n');
// Handle "\r\n", otherwise just increment
if (p[0] == '\r' && p[1] == '\n')
p += 2;
else
p++;
return p;
}
static void dissect(const wchar_t *line)
{
const wchar_t *start = line;
const wchar_t *next;
bool eol;
wprintf(L"Input: %d [%.*ls]\n", wcslen(line), wcslen(line)-1, line);
while ((next = nextCsvField(start, L',', &eol)) != 0)
{
wprintf(L"Field: [%.*ls] (eol = %d)\n", (next - start - eol), start, eol);
start = next;
}
}
static const wchar_t multiline[] =
L"First field of first row,\"This field is multiline\n"
"\n"
"but that's OK because it's enclosed in double quotes, and this\n"
"is an escaped \"\" double quote\" but this one \"\" is not\n"
" \"This is second field of second row, but it is not multiline\n"
" because it doesn't start \n"
" with an immediate double quote\"\n"
;
int main(void)
{
wchar_t line[1024];
while (fgetws(line, sizeof(line)/sizeof(line[0]), stdin))
dissect(line);
dissect(multiline);
return 0;
}
示例输出
$ cat csv.data
a,bb, c ,d""e,f
1,"2","",,"""",4
$ ./wcsv < csv.data
Input: 16 [a,bb, c ,d""e,f]
Field: [a,] (eol = 0)
Field: [bb,] (eol = 0)
Field: [ c ,] (eol = 0)
Field: [d""e,] (eol = 0)
Field: [f] (eol = 1)
Input: 17 [1,"2","",,"""",4]
Field: [1,] (eol = 0)
Field: ["2",] (eol = 0)
Field: ["",] (eol = 0)
Field: [,] (eol = 0)
Field: ["""",] (eol = 0)
Field: [4] (eol = 1)
Input: 296 [First field of first row,"This field is multiline
but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not
"This is second field of second row, but it is not multiline
because it doesn't start
with an immediate double quote"]
Field: [First field of first row,] (eol = 0)
Field: ["This field is multiline
but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not] (eol = 1)
Field: [ "This is second field of second row,] (eol = 0)
Field: [ but it is not multiline] (eol = 1)
Field: [ because it doesn't start ] (eol = 1)
Field: [ with an immediate double quote"] (eol = 1)
$
我说的是“在其限制范围内”;它的局限性是什么?
主要是隔离原始字段,而不是转换后的字段。因此,必须修改它隔离的字段以产生“真实”值,去掉封闭的双引号,并将内部双引号替换为单引号。将原始字段转换为实际值模仿了nextCsvField() 函数中的大量代码。输入是字段的开头和字段的结尾(分隔符)。这是第二个 SSCCE,带有一个额外的函数 csvFieldData(),上面显示的 dissect() 函数修改为调用它。解剖输出的格式略有不同,因此看起来更好:
#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>
extern const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline);
// Returns a pointer to the start of the next field,
// or zero if this is the last field in the CSV
// p is the start position of the field
// sep is the separator used, i.e. comma or semicolon
// newline says whether the field ends with a newline or with a comma
const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline)
{
// Parse quoted sequences
if ('"' == p[0]) {
p++;
while (1) {
// Find next double-quote
p = wcschr(p, L'"');
// If we don't find it or it's the last symbol
// then this is the last field
if (!p || !p[1])
return 0;
// Check for "", it is an escaped double-quote
if (p[1] != '"')
break;
// Skip the escaped double-quote
p += 2;
}
}
// Find next newline or comma.
wchar_t newline_or_sep[4] = L"\n\r ";
newline_or_sep[2] = sep;
p = wcspbrk(p, newline_or_sep);
// If no newline or separator, this is the last field.
if (!p)
return 0;
// Check if we had newline.
*newline = (p[0] == '\r' || p[0] == '\n');
// Handle "\r\n", otherwise just increment
if (p[0] == '\r' && p[1] == '\n')
p += 2;
else
p++;
return p;
}
static wchar_t *csvFieldData(const wchar_t *fld_s, const wchar_t *fld_e, wchar_t *buffer, size_t buflen)
{
wchar_t *dst = buffer;
wchar_t *end = buffer + buflen - 1;
const wchar_t *src = fld_s;
if (*src == L'"')
{
const wchar_t *p = src + 1;
while (p < fld_e && dst < end)
{
if (p[0] == L'"' && p+1 < fld_s && p[1] == L'"')
{
*dst++ = p[0];
p += 2;
}
else if (p[0] == L'"')
{
p++;
break;
}
else
*dst++ = *p++;
}
src = p;
}
while (src < fld_e && dst < end)
*dst++ = *src++;
if (dst >= end)
return 0;
*dst = L'\0';
return(buffer);
}
static void dissect(const wchar_t *line)
{
const wchar_t *start = line;
const wchar_t *next;
bool eol;
wprintf(L"Input %3zd: [%.*ls]\n", wcslen(line), wcslen(line)-1, line);
while ((next = nextCsvField(start, L',', &eol)) != 0)
{
wchar_t buffer[1024];
wprintf(L"Raw Field: [%.*ls] (eol = %d)\n", (next - start - eol), start, eol);
if (csvFieldData(start, next-1, buffer, sizeof(buffer)/sizeof(buffer[0])) != 0)
wprintf(L"Field %3zd: [%ls]\n", wcslen(buffer), buffer);
start = next;
}
}
static const wchar_t multiline[] =
L"First field of first row,\"This field is multiline\n"
"\n"
"but that's OK because it's enclosed in double quotes, and this\n"
"is an escaped \"\" double quote\" but this one \"\" is not\n"
" \"This is second field of second row, but it is not multiline\n"
" because it doesn't start \n"
" with an immediate double quote\"\n"
;
int main(void)
{
wchar_t line[1024];
while (fgetws(line, sizeof(line)/sizeof(line[0]), stdin))
dissect(line);
dissect(multiline);
return 0;
}
示例输出
$ ./wcsv < csv.data
Input 16: [a,bb, c ,d""e,f]
Raw Field: [a,] (eol = 0)
Field 1: [a]
Raw Field: [bb,] (eol = 0)
Field 2: [bb]
Raw Field: [ c ,] (eol = 0)
Field 3: [ c ]
Raw Field: [d""e,] (eol = 0)
Field 4: [d""e]
Raw Field: [f] (eol = 1)
Field 1: [f]
Input 17: [1,"2","",,"""",4]
Raw Field: [1,] (eol = 0)
Field 1: [1]
Raw Field: ["2",] (eol = 0)
Field 1: [2]
Raw Field: ["",] (eol = 0)
Field 0: []
Raw Field: [,] (eol = 0)
Field 0: []
Raw Field: ["""",] (eol = 0)
Field 2: [""]
Raw Field: [4] (eol = 1)
Field 1: [4]
Input 296: [First field of first row,"This field is multiline
but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not
"This is second field of second row, but it is not multiline
because it doesn't start
with an immediate double quote"]
Raw Field: [First field of first row,] (eol = 0)
Field 24: [First field of first row]
Raw Field: ["This field is multiline
but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not] (eol = 1)
Field 140: [This field is multiline
but that's OK because it's enclosed in double quotes, and this
is an escaped " double quote" but this one "" is not]
Raw Field: [ "This is second field of second row,] (eol = 0)
Field 38: [ "This is second field of second row]
Raw Field: [ but it is not multiline] (eol = 1)
Field 24: [ but it is not multiline]
Raw Field: [ because it doesn't start ] (eol = 1)
Field 28: [ because it doesn't start ]
Raw Field: [ with an immediate double quote"] (eol = 1)
Field 34: [ with an immediate double quote"]
$
我没有测试过\r\n(或普通\r)行尾。