我一直使用这种单一比较方法(我认为它的流水线更好),因为它比最多进行四次比较要快。
unsigned((ch&(~(1<<5))) - 'A') <= 'Z' - 'A'
我对几种不同的方法进行了基准测试,并考虑了查找表方法的 TLB 缓存未命中。我在 Windows 上运行了基准测试。以下是字符集为 '0'..'z' 的时间:
lookup tbl no tlb miss: 4.8265 ms
lookup table with tlb miss: 7.0217 ms
unsigned((ch&(~(1<<5))) - 'A') <= 'Z' - 'A': 10.5075 ms
(ch>='A' && ch<='Z') || (ch>='a' && ch<='z'): 17.2290 ms
isalpha: 28.0504 ms
您可以清楚地看到语言环境代码是有代价的。
如果字符集为 0..255,则以下是时间:
tbl no tlb miss: 12.6833 ms
unsigned((ch&(~(1<<5))) - 'A') <= 'Z' - 'A': 29.2403 ms
(ch>='A' && ch<='Z') || (ch>='a' && ch<='z'): 34.8818 ms
isalpha: 78.0317 ms
tbl with tlb miss: 143.7135 ms
时间更长,因为测试了更多字符。在第二次测试中,我用于 tlb“flush”的段数更大。可能是表查找方法比第一次运行所表明的 tlb 未命中受到的影响更大。您还可以看到,当字符为 alpha 时,单个 cmp 方法效果更好。
如果比较一行中的多个字符,查找表方法是最好的,但它并不比单个 cmp 方法好多少。如果您在这里和那里比较字符,那么 tlb 缓存未命中可能会使 tbl 方法比单个 cmp 方法更差。当字符更可能是字母时,单个 cmp 方法效果最好。
代码如下:
__forceinline bool isalpha2(BYTE ch) {
return (ch>='A' && ch<='Z') || (ch>='a' && ch<='z');
}
__forceinline bool isalpha1(BYTE ch) {
return unsigned((ch&(~(1<<5))) - 'A') <= 'Z' - 'A';
}
bool* aTbl[256];
int main(int argc, char* argv[])
{
__int64 n = 0, cTries = 100000;
int b=63;
int ch0=' ', ch1 ='z'+1;
ch0=0, ch1 = 256;
// having 255 tables instead of one lets us "flush" the tlb.
// Intel tlb should have about ~32 entries (depending on model!) in it,
// so using more than 32 tables should have a noticable effect.
for (int i1=0 ; i1<256 ; ++i1) {
aTbl[i1] = (bool*)malloc(16384);
for (int i=0 ; i<256 ; ++i)
aTbl[i1][i] = isalpha(i);
}
{ CBench Bench("tbl with tlb miss");
for (int i=1 ; i<cTries ; ++i) {
for (int ch = ch0 ; ch < ch1 ; ++ ch)
n += aTbl[ch][ch]; // tlb buster
}
}
{ CBench Bench("tbl no tlb miss");
for (int i=1 ; i<cTries ; ++i) {
for (int ch = ch0 ; ch < ch1 ; ++ ch)
n += aTbl[0][ch];
}
}
{ CBench Bench("isalpha");
for (int i=1 ; i<cTries ; ++i) {
for (int ch = ch0 ; ch < ch1 ; ++ ch)
n += isalpha(ch);
}
}
{ CBench Bench("unsigned((ch&(~(1<<5))) - 'A') <= 'Z' - 'A'");
for (int i=1 ; i<cTries ; ++i) {
for (int ch = ch0 ; ch < ch1 ; ++ ch)
n += isalpha1(ch);
}
}
{ CBench Bench("(ch>='A' && ch<='Z') || (ch>='a' && ch<='z')");
for (int i=1 ; i<cTries ; ++i) {
for (int ch = ch0 ; ch < ch1 ; ++ ch)
n += isalpha2(ch);
}
}
return n;
}
class CBench {
public:
__declspec(noinline) CBench(CBench* p) : m_fAccumulate(false), m_nTicks(0),
m_cCalls(0), m_pBench(p), m_desc(NULL), m_nStart(GetBenchMark()) { }
__declspec(noinline) CBench(const char *desc_in, bool fAccumulate=false) :
m_fAccumulate(fAccumulate), m_nTicks(0), m_cCalls(0), m_pBench(NULL),
m_desc(desc_in), m_nStart(GetBenchMark()) { }
__declspec(noinline) ~CBench() {
__int64 n = (m_fAccumulate) ? m_nTicks : GetBenchMark() - m_nStart;
if (m_pBench) {
m_pBench->m_nTicks += n;
m_pBench->m_cCalls++;
return;
} else if (!m_fAccumulate) {
m_cCalls++;
}
__int64 nFreq;
QueryPerformanceFrequency((LARGE_INTEGER*)&nFreq);
double ms = ((double)n * 1000)/nFreq;
printf("%s took: %.4f ms, calls: %d, avg:%f\n", m_desc, ms, m_cCalls,
ms/m_cCalls);
}
__declspec(noinline) __int64 GetBenchMark(void) {
__int64 nBenchMark;
QueryPerformanceCounter((LARGE_INTEGER*)&nBenchMark);
return nBenchMark;
}
LPCSTR m_desc;
__int64 m_nStart, m_nTicks;
DWORD m_cCalls;
bool m_fAccumulate;
CBench* m_pBench;
};