解决方案是测试一系列标记/非标记单元,而不是尝试测试分隔单元的正确计数或标记的正确计数。如果标记是分隔符并且您需要最小数量的分隔单元,则要求标记/非标记单元的计数等于所需的单元数少一。正如我们将看到的,这个解决方案具有惊人的性能。
最少行数
此函数检查最少行数,其中\n 分隔行而不是严格结束行,允许最后一行为空:
/**
 * Reports whether `text` has at least `minLineCount` lines, where "\n"
 * separates lines rather than terminating them (so "a\n" counts as two
 * lines, the second one empty).
 *
 * @param {string} text - Text to examine.
 * @param {number} minLineCount - Minimum number of lines required.
 * @returns {boolean} True when `text` contains at least `minLineCount` lines.
 */
function hasMinLineCount(text, minLineCount) {
  if (minLineCount <= 1)
    return true; // always 1+ lines, though perhaps empty
  // N lines need N-1 separators. The pattern is anchored with ^ because an
  // unanchored pattern that fails at offset 0 is retried from every later
  // offset, making the "too few lines" case quadratic in the text length.
  // Anchoring cannot change the answer: if enough newlines exist anywhere,
  // they also exist when counting from the start. The group is non-capturing
  // because its contents are never read.
  var r = new RegExp('^(?:[^\n]*\n){' + (minLineCount - 1) + '}');
  return r.test(text);
}
或者,\n 可以被假定为结束行,而不是纯粹界定它们,对非空的最后一行进行例外处理。例如,"apple\npear\n" 将是两行,而"apple\npear\ngrape" 将是三行。以下函数以这种方式计算行数:
/**
 * Reports whether `text` has at least `minLineCount` lines, where "\n"
 * terminates a line and a non-empty final run without a trailing "\n" also
 * counts as a line. "apple\npear\n" is two lines; "apple\npear\ngrape" is
 * three; the empty string is zero lines.
 *
 * @param {string} text - Text to examine.
 * @param {number} minLineCount - Minimum number of lines required.
 * @returns {boolean} True when `text` contains at least `minLineCount` lines.
 */
function hasMinLineCount(text, minLineCount) {
  // Zero or fewer required lines is always satisfied. Without this guard a
  // negative count would be spliced in as "{-1}", which JavaScript treats as
  // literal text rather than a quantifier, giving a wrong `false`.
  if (minLineCount <= 0)
    return true;
  // Anchored with ^ so a failing match is not retried from every offset
  // (quadratic). Counting from the start always yields the most lines, so
  // the anchor does not change the result.
  var r = new RegExp('^(?:[^\n]*\n|[^\n]+$){' + minLineCount + '}');
  return r.test(text);
}
字符串分隔符和标记
更一般地,对于由字符串分隔符分隔的任何单位:
var _ = require('lodash');
/**
 * Reports whether `text` contains at least `minUnitCount` units, where units
 * are separated by the literal string `unitDelim` (the last unit may be
 * empty).
 *
 * @param {string} text - Text to examine.
 * @param {number} minUnitCount - Minimum number of units required.
 * @param {string} unitDelim - Literal delimiter; regex metacharacters in it
 *   are escaped with lodash's `_.escapeRegExp`.
 * @returns {boolean} True when at least `minUnitCount - 1` delimiters occur.
 */
function hasMinUnitCount(text, minUnitCount, unitDelim) {
  if (minUnitCount <= 1)
    return true; // always 1+ units, though perhaps empty
  var escDelim = _.escapeRegExp(unitDelim);
  // Each repetition consumes "anything that does not start a delimiter",
  // then one delimiter:
  //  - [\s\S] is used instead of "." because "." does not match line
  //    terminators, so delimiters on different lines would never be counted
  //    together.
  //  - The (?!delim) guard makes the filler unable to swallow a delimiter,
  //    so there is only one way to split the text. The lazy form
  //    (.*?delim){n} explores exponentially many splits when there are too
  //    few delimiters.
  //  - ^ stops the engine from retrying the whole pattern at every offset.
  var r = new RegExp(
    '^(?:(?:(?!' + escDelim + ')[\\s\\S])*' + escDelim + '){' +
    (minUnitCount - 1) + '}');
  return r.test(text);
}
我们还可以测试是否存在最少数量的字符串标记:
var _ = require('lodash');
/**
 * Reports whether the literal string `token` occurs at least `minTokenCount`
 * times (non-overlapping) in `text`.
 *
 * @param {string} text - Text to examine.
 * @param {number} minTokenCount - Minimum number of occurrences required.
 * @param {string} token - Literal token; regex metacharacters in it are
 *   escaped with lodash's `_.escapeRegExp`.
 * @returns {boolean} True when `token` occurs at least `minTokenCount` times.
 */
function hasMinTokenCount(text, minTokenCount, token) {
  // Zero or fewer required tokens is always satisfied. A negative count
  // would otherwise be spliced in as "{-1}", which JavaScript treats as
  // literal text rather than a quantifier.
  if (minTokenCount <= 0)
    return true;
  var escToken = _.escapeRegExp(token);
  // Filler is "any character (including newlines) that does not start a
  // token", so tokens on different lines are counted and the match has only
  // one possible split. The lazy form (.*?token){n} cannot cross line
  // terminators and backtracks exponentially when there are too few tokens.
  // ^ prevents retrying from every offset.
  var r = new RegExp(
    '^(?:(?:(?!' + escToken + ')[\\s\\S])*' + escToken + '){' +
    minTokenCount + '}');
  return r.test(text);
}
正则表达式分隔符和标记
我们可以通过允许单位分隔符和标记包含正则表达式字符来进一步推广。只需确保分隔符或标记在连续(背靠背)出现时仍能被无歧义地匹配。示例正则表达式分隔符包括"<br */>" 和"[|,]"。这些参数是字符串,而不是 RegExp 对象。
function hasMinUnitCount(text, minUnitCount, unitDelimRegexStr) {
if (minUnitCount <= 1)
return true; // always 1+ units, though perhaps empty
var r = new RegExp(
'(.*?'+ unitDelimRegexStr +'){' + (minUnitCount-1) + '}');
return r.test(text);
}
/**
 * Reports whether the regular expression given as a string matches at least
 * `minTokenCount` times (non-overlapping, scanning left to right) in `text`.
 *
 * @param {string} text - Text to examine.
 * @param {number} minTokenCount - Minimum number of matches required.
 * @param {string} tokenRegexStr - Regex source for the token (a string, not
 *   a RegExp object).
 * @returns {boolean} True when the token occurs at least `minTokenCount` times.
 */
function hasMinTokenCount(text, minTokenCount, tokenRegexStr) {
  // Zero or fewer required tokens is always satisfied. A negative count
  // would otherwise be spliced in as "{-1}", which JavaScript treats as
  // literal text rather than a quantifier.
  if (minTokenCount <= 0)
    return true;
  // Group the caller's pattern so a top-level alternation such as "a|b"
  // stays a single token.
  var token = '(?:' + tokenRegexStr + ')';
  // Filler is "any character (including newlines) at which a token does not
  // start". The lazy form (.*?token){n} cannot cross line terminators and
  // backtracks exponentially when there are too few tokens. ^ prevents
  // retrying from every offset.
  var r = new RegExp(
    '^(?:(?:(?!' + token + ')[\\s\\S])*' + token + '){' +
    minTokenCount + '}');
  return r.test(text);
}
计算成本
泛型函数之所以有效,是因为它们的正则表达式对字符进行非贪婪匹配(注意.*?),直到下一个分隔符或标记。这是一个计算成本高昂的前瞻和回溯过程,因此相对于更硬编码的表达式(例如上面的 hasMinLineCount() 中的表达式)而言,这些会降低性能。
让我们重新审视最初的问题,即我们是否可以胜过使用正则表达式测试拆分字符串。回想一下,我们唯一的目标是测试最少的行数。我使用benchmark.js 进行测试,假设我们知道需要多行。代码如下:
// Benchmark script: compares five ways of checking that `text` has at least
// MIN_LINE_COUNT lines, using a Suite from the `benchmark` package.
var Benchmark = require('benchmark');
var suite = new Benchmark.Suite;
// Sample input: 100 copies of one line, each ending in "\n".
var line = "Go faster faster faster!\n";
var text = line.repeat(100);
// Threshold under test; the sample text contains more newlines than this.
var MIN_LINE_COUNT = 50;
// Regexes built once, outside the timed functions. The first uses the lazy
// ".*?" filler, the second the negated class "[^\n]*".
var preBuiltBackingRegex = new RegExp('(.*?\n){'+ MIN_LINE_COUNT +'}');
var preBuiltNoBackRegex = new RegExp('([^\n]*\n){'+ MIN_LINE_COUNT +'}');
// Case 1: split the whole text on "\n" and compare the piece count.
// NOTE(review): this compares the number of split pieces, while the regex
// cases below require MIN_LINE_COUNT newline-terminated segments, so the
// conditions are not strictly identical.
suite.add('split string', function() {
if (text.split("\n").length >= MIN_LINE_COUNT)
// Bare string expression statement: it has no effect and only fills the
// if-body. The same placeholder is used in every case below.
'has minimum lines';
})
// Case 2: lazy-filler regex, constructed inside the timed function.
.add('backtracking on-the-fly regex', function() {
if (new RegExp('(.*?\n){'+ MIN_LINE_COUNT +'}').test(text))
'has minimum lines';
})
// Case 3: lazy-filler regex, built once above.
.add('backtracking pre-built regex', function() {
if (preBuiltBackingRegex.test(text))
'has minimum lines';
})
// Case 4: negated-class regex, constructed inside the timed function.
.add('no-backtrack on-the-fly regex', function() {
if (new RegExp('([^\n]*\n){'+ MIN_LINE_COUNT +'}').test(text))
'has minimum lines';
})
// Case 5: negated-class regex, built once above.
.add('no-backtrack pre-built regex', function() {
if (preBuiltNoBackRegex.test(text))
'has minimum lines';
})
// Logs the string form of event.target on each 'cycle' event (presumably
// the benchmark that just finished; confirm against the Benchmark.js docs).
.on('cycle', function(event) {
console.log(String(event.target));
})
// On 'complete', logs the name(s) selected by filter('fastest'). This
// handler reads `this`, so it must stay a regular function, not an arrow
// function.
.on('complete', function() {
console.log('Fastest is ' + this.filter('fastest').map('name'));
})
// Starts the suite with the `async` option set to true.
.run({ 'async': true });
以下是三轮运行的结果:
split string x 263,260 ops/sec ±0.68% (85 runs sampled)
backtracking on-the-fly regex x 492,671 ops/sec ±1.01% (82 runs sampled)
backtracking pre-built regex x 607,033 ops/sec ±0.72% (87 runs sampled)
no-backtrack on-the-fly regex x 581,681 ops/sec ±0.77% (84 runs sampled)
no-backtrack pre-built regex x 723,075 ops/sec ±0.72% (89 runs sampled)
Fastest is no-backtrack pre-built regex
split string x 260,962 ops/sec ±0.82% (85 runs sampled)
backtracking on-the-fly regex x 502,410 ops/sec ±0.79% (84 runs sampled)
backtracking pre-built regex x 606,220 ops/sec ±0.67% (88 runs sampled)
no-backtrack on-the-fly regex x 578,193 ops/sec ±0.83% (86 runs sampled)
no-backtrack pre-built regex x 741,864 ops/sec ±0.68% (84 runs sampled)
Fastest is no-backtrack pre-built regex
split string x 262,266 ops/sec ±0.76% (87 runs sampled)
backtracking on-the-fly regex x 495,697 ops/sec ±0.82% (87 runs sampled)
backtracking pre-built regex x 608,178 ops/sec ±0.72% (88 runs sampled)
no-backtrack on-the-fly regex x 574,640 ops/sec ±0.92% (87 runs sampled)
no-backtrack pre-built regex x 739,629 ops/sec ±0.72% (86 runs sampled)
Fastest is no-backtrack pre-built regex
所有的正则表达式测试(甚至包括回溯版本)显然都比通过拆分字符串来检查行数更快。我想我会采用正则表达式测试。