本文对C的正则库regex和pcre在做域名验证的场景下做评测。
验证DNS域名的正则表达式为:
"^[0-9a-zA-Z_-]+(\\.[0-9a-zA-Z_-]+)*(\\.[a-zA-Z]{2,}\\.)$"
对于正常DNS请求日志中的6177578条日志做正则验证处理。
1,pcre
评测所用的pcre的版本号是:7.8.3
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
#include <stdio.h>#include <string.h>#include <pcre.h>#define OVECCOUNT 30 /* should be a multiple of 3 */#define EBUFLEN 128#define BUFLEN 1024int main(int argc, char *argv[])
{ pcre *re;
const char *error;
int erroffset;
FILE *fd;
int ovector[OVECCOUNT];
int rc, i;
int succ = 0, fail = 0;
char src[1024];
char pattern[] = "^[0-9a-zA-Z_-]+(\\.[0-9a-zA-Z_-]+)*(\\.[a-zA-Z]{2,}\\.)$";
printf("Pattern: %s\n", pattern);
re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
if (re == NULL) {
printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
return 1;
}
if ((fd = fopen(argv[1], "r")) == NULL) {
printf("open file error\n");
return 1;
}
while(fgets(src, 1024, fd)) {
rc = pcre_exec(re, NULL, src, strlen(src), 0, 0, ovector, OVECCOUNT);
if (rc < 0) {
fail++;
} else {
succ++;
}
}
printf("success:%d fail:%d\n", succ, fail);
fclose(fd);
free(re);
return 0;
} |
处理完所有数据的耗时是:
$time ./pcre_t query_domains
Pattern: ^[0-9a-zA-Z_-]+(\.[0-9a-zA-Z_-]+)*(\.[a-zA-Z]{2,}\.)$
success:6177443 fail:135
real 0m8.257s
user 0m8.194s
sys 0m0.058s
2,regex
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
|
#include <stdio.h>
#include <string.h>
#include <regex.h>

#define SUBSLEN 100
#define EBUFLEN 1280
#define BUFLEN 1024

/*
 * POSIX regex benchmark driver for DNS-name validation.
 *
 * Usage: regex_t <file>
 *
 * Compiles the domain pattern once, then reads <file> line by line and
 * runs every line through regexec(), counting matching and
 * non-matching lines.  The totals are printed at the end.
 *
 * Returns 0 on success, 1 on a usage error, a pattern compilation
 * failure, or when the input file cannot be opened.
 */
int main(int argc, char *argv[])
{
    regex_t re;
    regmatch_t subs[SUBSLEN];
    char errbuf[EBUFLEN];
    char line[BUFLEN];
    int err;
    int succ = 0, fail = 0;
    FILE *fd;
    /*
     * NOTE: the last label here uses "+", whereas the PCRE variant of
     * this benchmark uses "{2,}".  The pattern is left unchanged so the
     * program still matches the published run output.
     */
    char pattern[] = "^[0-9a-zA-Z_-]+(\\.[0-9a-zA-Z_-]+)*(\\.[a-zA-Z]+\\.)$";

    /* argv[1] is dereferenced below, so make sure it exists. */
    if (argc < 2) {
        fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }

    printf("Pattern: %s\n", pattern);

    /*
     * Keep regcomp()'s return value: regerror() needs the real error
     * code to produce a meaningful message.
     * REG_NEWLINE lets '$' match before the newline that fgets() keeps.
     */
    err = regcomp(&re, pattern, REG_EXTENDED | REG_NEWLINE);
    if (err != 0) {
        regerror(err, &re, errbuf, sizeof(errbuf));
        printf("error: regcomp: %s\n", errbuf);
        return 1;
    }

    if ((fd = fopen(argv[1], "r")) == NULL) {
        printf("open file error\n");
        regfree(&re);
        return 1;
    }

    while (fgets(line, sizeof(line), fd)) {
        err = regexec(&re, line, (size_t)SUBSLEN, subs, 0);
        /* Only a zero return is a match; REG_NOMATCH and errors fail. */
        if (err == 0) {
            succ++;
        } else {
            fail++;
        }
    }

    printf("success:%d, fails:%d\n", succ, fail);
    fclose(fd);
    regfree(&re);
    return 0;
}
处理完所有数据耗时:
$time ./regex_t query_domains
Pattern: ^[0-9a-zA-Z_-]+(\.[0-9a-zA-Z_-]+)*(\.[a-zA-Z]+\.)$
success:6177443, fails:135
real 0m50.876s
user 0m50.783s
sys 0m0.058s
3,结论。
可以看到,对于域名验证的场景,pcre明显优于POSIX regex库。在规则已经编译好的情况下,pcre每秒大约处理74.8万条域名,而regex每秒大约处理12.1万条。
本文对C的正则库regex和pcre在做域名验证的场景下做评测。
验证DNS域名的正则表达式为:
"^[0-9a-zA-Z_-]+(\\.[0-9a-zA-Z_-]+)*(\\.[a-zA-Z]{2,}\\.)$"
对于正常DNS请求日志中的6177578条日志做正则验证处理。
1,pcre
评测所用的pcre的版本号是:7.8.3
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
#include <stdio.h>#include <string.h>#include <pcre.h>#define OVECCOUNT 30 /* should be a multiple of 3 */#define EBUFLEN 128#define BUFLEN 1024int main(int argc, char *argv[])
{ pcre *re;
const char *error;
int erroffset;
FILE *fd;
int ovector[OVECCOUNT];
int rc, i;
int succ = 0, fail = 0;
char src[1024];
char pattern[] = "^[0-9a-zA-Z_-]+(\\.[0-9a-zA-Z_-]+)*(\\.[a-zA-Z]{2,}\\.)$";
printf("Pattern: %s\n", pattern);
re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
if (re == NULL) {
printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
return 1;
}
if ((fd = fopen(argv[1], "r")) == NULL) {
printf("open file error\n");
return 1;
}
while(fgets(src, 1024, fd)) {
rc = pcre_exec(re, NULL, src, strlen(src), 0, 0, ovector, OVECCOUNT);
if (rc < 0) {
fail++;
} else {
succ++;
}
}
printf("success:%d fail:%d\n", succ, fail);
fclose(fd);
free(re);
return 0;
} |
处理完所有数据的耗时是:
$time ./pcre_t query_domains
Pattern: ^[0-9a-zA-Z_-]+(\.[0-9a-zA-Z_-]+)*(\.[a-zA-Z]{2,}\.)$
success:6177443 fail:135
real 0m8.257s
user 0m8.194s
sys 0m0.058s
2,regex
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
|
#include <stdio.h>
#include <string.h>
#include <regex.h>

#define SUBSLEN 100
#define EBUFLEN 1280
#define BUFLEN 1024

/*
 * POSIX regex benchmark driver for DNS-name validation.
 *
 * Usage: regex_t <file>
 *
 * Compiles the domain pattern once, then reads <file> line by line and
 * runs every line through regexec(), counting matching and
 * non-matching lines.  The totals are printed at the end.
 *
 * Returns 0 on success, 1 on a usage error, a pattern compilation
 * failure, or when the input file cannot be opened.
 */
int main(int argc, char *argv[])
{
    regex_t re;
    regmatch_t subs[SUBSLEN];
    char errbuf[EBUFLEN];
    char line[BUFLEN];
    int err;
    int succ = 0, fail = 0;
    FILE *fd;
    /*
     * NOTE: the last label here uses "+", whereas the PCRE variant of
     * this benchmark uses "{2,}".  The pattern is left unchanged so the
     * program still matches the published run output.
     */
    char pattern[] = "^[0-9a-zA-Z_-]+(\\.[0-9a-zA-Z_-]+)*(\\.[a-zA-Z]+\\.)$";

    /* argv[1] is dereferenced below, so make sure it exists. */
    if (argc < 2) {
        fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }

    printf("Pattern: %s\n", pattern);

    /*
     * Keep regcomp()'s return value: regerror() needs the real error
     * code to produce a meaningful message.
     * REG_NEWLINE lets '$' match before the newline that fgets() keeps.
     */
    err = regcomp(&re, pattern, REG_EXTENDED | REG_NEWLINE);
    if (err != 0) {
        regerror(err, &re, errbuf, sizeof(errbuf));
        printf("error: regcomp: %s\n", errbuf);
        return 1;
    }

    if ((fd = fopen(argv[1], "r")) == NULL) {
        printf("open file error\n");
        regfree(&re);
        return 1;
    }

    while (fgets(line, sizeof(line), fd)) {
        err = regexec(&re, line, (size_t)SUBSLEN, subs, 0);
        /* Only a zero return is a match; REG_NOMATCH and errors fail. */
        if (err == 0) {
            succ++;
        } else {
            fail++;
        }
    }

    printf("success:%d, fails:%d\n", succ, fail);
    fclose(fd);
    regfree(&re);
    return 0;
}
处理完所有数据耗时:
$time ./regex_t query_domains
Pattern: ^[0-9a-zA-Z_-]+(\.[0-9a-zA-Z_-]+)*(\.[a-zA-Z]+\.)$
success:6177443, fails:135
real 0m50.876s
user 0m50.783s
sys 0m0.058s
3,结论。
可以看到,对于域名验证的场景,pcre明显优于POSIX regex库。在规则已经编译好的情况下,pcre每秒大约处理74.8万条域名,而regex每秒大约处理12.1万条。