尝试扫描您的大文件而不将其全部存储到内存中,一次只在局部变量中保存一条记录:
/*
 * Scan a CSV stream record by record without keeping the file in memory.
 *
 * Each record is expected to look like "int,int,int,text,int\n"; only the
 * current record is held, in the local variable c.  Scanning stops at the
 * first record that fscanf cannot convert completely (including end of
 * file).  The number of records parsed is printed on stdout.
 */
void csvReader(FILE *f) {
    T_structCDT c;
    int count = 0;

    c.string = malloc(1000);
    if (c.string == NULL) {
        fprintf(stderr, "csvReader: out of memory\n");
        return;
    }
    /* NOTE(review): the buffer is stored in c.string but the text field is
     * scanned into c.c -- confirm against the definition of T_structCDT
     * which member is meant to receive the (at most 999 character) field. */
    while (fscanf(f, "%d,%d,%d,%999[^,],%d\n", &c.a, &c.b, &c.vivienda, c.c, &c.d) == 5) {
        // nothing for now
        count++;
    }
    /* The original call passed no argument for %d, which is undefined. */
    printf("%d records parsed\n", count);
    free(c.string);
}
测量这个简单的解析器所花费的时间:
如果速度足够快,请在解析阶段直接执行选择测试,并在找到少数匹配记录时逐条输出。由于只有少数记录匹配,这些步骤带来的额外时间应该相当少。
如果时间太长,你就需要一个更精巧的 CSV 解析器。这需要不少工作,但可以做到很快,特别是当你可以假设输入文件的所有记录都使用这种简单格式时。这个话题过于宽泛,无法在此详述,但可达到的速度应该接近 cat csvfile > /dev/null 或 grep a_short_string_not_present csvfile。
在我的系统上(普通硬盘的普通linux服务器),从冷启动开始解析4000万行总计2GB的时间不到20秒,第二次不到4秒:磁盘I/O似乎是瓶颈。
如果您需要经常执行此选择,您可能应该使用不同的数据格式,可能是数据库系统。如果偶尔对格式固定的数据执行扫描,则使用 SSD 等更快的存储会有所帮助,但不要指望奇迹。
编辑:为了将上述说法付诸实践,我编写了一个简单的生成器和提取器:
这是一个生成 CSV 数据的简单程序:
#include <stdio.h>
#include <stdlib.h>
/*
 * Word list used to build the random text fields.  No entry contains a
 * comma (punctuation is written as ';' or '.'), presumably so that a
 * generated text field never contains the CSV separator.
 */
const char *dict[] = {
    "Lorem", "ipsum", "dolor", "sit", "amet;", "consectetur", "adipiscing", "elit;",
    "sed", "do", "eiusmod", "tempor", "incididunt", "ut", "labore", "et",
    "dolore", "magna", "aliqua.", "Ut", "enim", "ad", "minim", "veniam;",
    "quis", "nostrud", "exercitation", "ullamco", "laboris", "nisi", "ut", "aliquip",
    "ex", "ea", "commodo", "consequat.", "Duis", "aute", "irure", "dolor",
    "in", "reprehenderit", "in", "voluptate", "velit", "esse", "cillum", "dolore",
    "eu", "fugiat", "nulla", "pariatur.", "Excepteur", "sint", "occaecat", "cupidatat",
    "non", "proident;", "sunt", "in", "culpa", "qui", "officia", "deserunt",
    "mollit", "anim", "id", "est", "laborum.",
};

/*
 * Write `lines` lines of pseudo-random text to stdout, each built from fmt.
 *
 * fmt is interpreted character by character:
 *   digits  accumulate an optional decimal count for the next directive
 *   d       a signed integer in [-(count-1), count-1]   (default count 101)
 *   u       an unsigned integer in [0, count-1]         (default count 101)
 *   s       1 to count words from dict, space separated (default count 4)
 *   other   copied verbatim; any pending count is discarded
 *
 * A line that does not fit in the internal 1024 byte buffer is not written
 * and not counted; a new attempt is made for it.
 *
 * Counts are assumed small enough that 2 * count - 1 fits in an int.
 *
 * Returns 0 on success, 1 if fmt is empty or a write to stdout fails.
 */
int csvgen(const char *fmt, long lines) {
    char buf[1024];

    if (*fmt == '\0')
        return 1;

    while (lines > 0) {
        size_t pos = 0;
        int count = 0;

        for (const char *p = fmt; *p && pos < sizeof(buf); p++) {
            switch (*p) {
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
                count = count * 10 + *p - '0';
                continue;
            case 'd':
                if (!count) count = 101;
                /* 2 * count - 1 distinct values centred on zero; the
                 * previous "2 + count - 1" gave the lopsided range
                 * [-(count-1), 1]. */
                pos += snprintf(buf + pos, sizeof(buf) - pos, "%d",
                                rand() % (2 * count - 1) - count + 1);
                count = 0;
                continue;
            case 'u':
                if (!count) count = 101;
                pos += snprintf(buf + pos, sizeof(buf) - pos, "%u",
                                rand() % count);
                count = 0;
                continue;
            case 's':
                if (!count) count = 4;
                count = rand() % count + 1;
                while (count-- > 0 && pos < sizeof(buf)) {
                    pos += snprintf(buf + pos, sizeof(buf) - pos, "%s ",
                                    dict[rand() % (sizeof(dict) / sizeof(*dict))]);
                }
                if (pos < sizeof(buf)) {
                    /* drop the space written after the last word */
                    pos--;
                }
                count = 0;
                continue;
            default:
                buf[pos++] = *p;
                count = 0;
                continue;
            }
        }
        /* snprintf returns the untruncated length, so pos >= sizeof(buf)
         * means the line overflowed: skip it without consuming a line. */
        if (pos < sizeof(buf)) {
            buf[pos++] = '\n';
            if (fwrite(buf, 1, pos, stdout) != pos) {
                /* stop instead of spinning through the remaining lines */
                return 1;
            }
            lines--;
        }
    }
    return 0;
}
/*
 * Entry point of the generator: csvgen FORMAT NUMBER.
 * Exits with status 2 and a usage message when fewer than two arguments
 * are given; otherwise the exit status is whatever csvgen returns.
 */
int main(int argc, char *argv[]) {
    if (argc >= 3) {
        const char *format = argv[1];
        /* base 0: decimal, octal (0...) and hex (0x...) are all accepted */
        long nlines = strtol(argv[2], NULL, 0);

        return csvgen(format, nlines);
    }
    fprintf(stderr, "usage: csvgen format number\n");
    return 2;
}
这是一个具有 3 种不同解析方法的提取器:
#include <stdio.h>
#include <stdlib.h>
/*
 * Convert the run of decimal digits starting at p to an unsigned int.
 *
 * No sign or leading white space is accepted.  *pp receives the address of
 * the first character that is not a digit (p itself when there is none, in
 * which case the result is 0).  Overflow wraps around silently.
 */
static inline unsigned int getuint(const char *p, const char **pp) {
    unsigned int value = 0;

    for (;;) {
        /* any character below '0' wraps to a huge unsigned value, so a
         * single comparison rejects everything outside '0'..'9' */
        unsigned int digit = (unsigned int)(*p - '0');
        if (digit > 9)
            break;
        value = value * 10 + digit;
        p++;
    }
    *pp = p;
    return value;
}
/*
 * Read records of the form "a,b,c,text,d\n" from f, print those matching
 * the select() condition on stdout and a summary on stderr.
 *
 * method selects the parsing strategy:
 *   0  fscanf on the stream; scanning ends at the first record that cannot
 *      be converted completely (or at end of file)
 *   1  fgets + strtol, the text field is copied into the record
 *   2  fgets + getuint, non-negative numbers only, the text field is left
 *      in the line buffer and only its position and length are recorded
 *
 * With methods 1 and 2 a line that does not end right after the last
 * number is reported on stderr and skipped.  fgets reads at most 1023
 * characters per call, so a longer physical line is seen as several pieces.
 *
 * Returns 0 on success, 1 if method is not one of the above.
 */
int csvgrep(FILE *f, int method) {
    struct {
        int a, b, c, d;
        int spos, slen;
        char s[1000];
    } c;
    int count = 0, line = 0;

    // select 500 out of 43M
#define select(c) ((c).a == 100 && (c).b == 100 && (c).c > 74 && (c).d > 50)

    if (method == 0) {
        // default method: fscanf
        while (fscanf(f, "%d,%d,%d,%999[^,],%d\n", &c.a, &c.b, &c.c, c.s, &c.d) == 5) {
            line++;
            if (select(c)) {
                count++;
                printf("%d,%d,%d,%s,%d\n", c.a, c.b, c.c, c.s, c.d);
            }
        }
    } else
    if (method == 1) {
        // use fgets and simple parser
        char buf[1024];
        while (fgets(buf, sizeof(buf), f)) {
            char *p = buf;
            size_t i = 0;
            int too_long = 0;

            line++;
            c.a = strtol(p, &p, 10);
            p += (*p == ',');
            c.b = strtol(p, &p, 10);
            p += (*p == ',');
            c.c = strtol(p, &p, 10);
            p += (*p == ',');
            /* buf can hold a longer field than c.s: never store past the
             * end of c.s, but keep scanning so p ends up on the separator */
            for (; *p && *p != ','; p++) {
                if (i < sizeof(c.s) - 1)
                    c.s[i++] = *p;
                else
                    too_long = 1;
            }
            c.s[i] = '\0';
            p += (*p == ',');
            c.d = strtol(p, &p, 10);
            if (too_long || *p != '\n') {
                fprintf(stderr, "csvgrep: invalid format at line %d\n", line);
                continue;
            }
            if (select(c)) {
                count++;
                printf("%d,%d,%d,%s,%d\n", c.a, c.b, c.c, c.s, c.d);
            }
        }
    } else
    if (method == 2) {
        // use fgets and hand coded parser, positive numbers only, no string copy
        char buf[1024];
        while (fgets(buf, sizeof(buf), f)) {
            const char *p = buf;

            line++;
            c.a = getuint(p, &p);
            p += (*p == ',');
            c.b = getuint(p, &p);
            p += (*p == ',');
            c.c = getuint(p, &p);
            p += (*p == ',');
            /* remember where the text field lives inside buf */
            c.spos = p - buf;
            while (*p && *p != ',') p++;
            c.slen = p - buf - c.spos;
            p += (*p == ',');
            c.d = getuint(p, &p);
            if (*p != '\n') {
                fprintf(stderr, "csvgrep: invalid format at line %d\n", line);
                continue;
            }
            if (select(c)) {
                count++;
                printf("%d,%d,%d,%.*s,%d\n", c.a, c.b, c.c, c.slen, buf + c.spos, c.d);
            }
        }
    } else {
        fprintf(stderr, "csvgrep: unknown method: %d\n", method);
        return 1;
    }
    fprintf(stderr, "csvgrep: %d records selected from %d lines\n", count, line);
    return 0;
}
/*
 * Entry point of the extractor: csvgrep [METHOD [BIGBUF]] < input.
 * METHOD defaults to 0; a non-zero BIGBUF gives stdin a 1 MiB buffer.
 * The exit status is whatever csvgrep returns.
 */
int main(int argc, char *argv[]) {
    int method = 0;

    if (argc > 1) {
        method = strtol(argv[1], NULL, 0);
    }
    if (argc > 2 && strtol(argv[2], NULL, 0) != 0) {
        // non zero second argument -> set a 1M I/O buffer
        setvbuf(stdin, NULL, _IOFBF, 1024 * 1024);
    }
    return csvgrep(stdin, method);
}
以下是一些比较基准数据:
$ time ./csvgen "u,u,u,s,u" 43000000 > 43m
real 0m34.428s user 0m32.911s sys 0m1.358s
$ time grep zz 43m
real 0m10.338s user 0m10.069s sys 0m0.211s
$ time wc -lc 43m
43000000 1195458701 43m
real 0m1.043s user 0m0.839s sys 0m0.196s
$ time cat 43m > /dev/null
real 0m0.201s user 0m0.004s sys 0m0.195s
$ time ./csvgrep 0 < 43m > x0
csvgrep: 508 records selected from 43000000 lines
real 0m14.271s user 0m13.856s sys 0m0.341s
$ time ./csvgrep 1 < 43m > x1
csvgrep: 508 records selected from 43000000 lines
real 0m8.235s user 0m7.856s sys 0m0.331s
$ time ./csvgrep 2 < 43m > x2
csvgrep: 508 records selected from 43000000 lines
real 0m3.892s user 0m3.555s sys 0m0.312s
$ time ./csvgrep 2 1 < 43m > x3
csvgrep: 508 records selected from 43000000 lines
real 0m3.706s user 0m3.488s sys 0m0.203s
$ cmp x0 x1
$ cmp x0 x2
$ cmp x0 x3
如您所见,专门解析方法提供了近 50% 的增益,而手动编码整数转换和字符串扫描又获得了 50%。使用 1 MB 缓冲区而不是默认大小只能提供 0.2 秒的边际增益。
为了进一步提高速度,可以使用mmap()绕过I/O流接口,对文件内容做更强的假设。在上面的代码中,仍然可以优雅地处理无效格式,但是您可以删除一些测试并以可靠性为代价将执行时间额外减少 5%。
上述基准测试是在配备 SSD 的系统上执行的,并且文件 43m 可以完全放入 RAM,因此计时中几乎不包含磁盘 I/O 延迟。grep 出奇地慢,增加搜索字符串的长度会让它更慢……wc -lc 为扫描性能设定了一个目标:它比上面最快的 csvgrep 方法还快约 4 倍,而 cat 的速度似乎遥不可及。