下面是一些简单而高效的函数,用于计算 64 个字节值(每个 8 位)的最小值和最大值。这些值以位切片的形式编码在 8 个 uint64_t 中:s[i] 保存全部 64 个值的第 i 位,每个值占用其中的一个比特位。
#include <stdint.h>
/*
 * Return the largest of 64 byte values stored in bit-sliced form.
 *
 * s[k] holds bit k of every value; bit position p taken across the eight
 * words forms value number p.
 */
uint8_t maxslice(const uint64_t s[8]) {
    uint64_t candidates = ~(uint64_t)0; /* positions still tied for the maximum */
    unsigned result = 0;

    /* Scan the bit planes from the most significant to the least. */
    for (int plane = 7; plane >= 0; plane--) {
        uint64_t hits = candidates & s[plane];
        if (hits != 0) {
            /* Some candidate has this bit set, so the maximum has it too;
             * candidates without it can no longer be the maximum. */
            result |= 1u << plane;
            candidates = hits;
        }
    }
    return (uint8_t)result;
}
/*
 * Return the smallest of 64 byte values stored in bit-sliced form.
 *
 * s[k] holds bit k of every value; bit position p taken across the eight
 * words forms value number p.
 */
uint8_t minslice(const uint64_t s[8]) {
    uint64_t candidates = ~(uint64_t)0; /* positions still tied for the minimum */
    unsigned result = 0xFF;             /* bits are cleared as zeros are found */

    /* Scan the bit planes from the most significant to the least. */
    for (int plane = 7; plane >= 0; plane--) {
        uint64_t zeros = candidates & ~s[plane];
        if (zeros != 0) {
            /* Some candidate has a 0 here, so the minimum has a 0 here too;
             * candidates with a 1 can no longer be the minimum. */
            result &= ~(1u << plane);
            candidates = zeros;
        }
    }
    return (uint8_t)result;
}
正如可以在 Godbolt 的 Compiler Explorer 上验证的那样,clang 会为这两个函数生成无分支代码。
对于扩展目标——计算以同样方式组织的更大数据集 uint64_t slices[8][100] 的最小值——只需对数组的每一列重复上述代码,并逐步更新最小值即可。在循环的每一步检查是否已经找到绝对最小值 0 并提前退出,可能是值得的。棘手之处在于数组的组织方式:
uint64_t slices[8][100] 定义了一个包含 8 个子数组的数组,每个子数组由 100 个 uint64_t 组成。换句话说,内存中的布局是:先是 6400 个最低位(权重为 1),然后是 6400 个权重为 2 的位,……,最后是 6400 个权重为 128 的位。
/*
 * Return the smallest of 6400 byte values stored in bit-sliced form.
 *
 * s[b][j] holds bit b of the 64 values belonging to column j. Each column is
 * reduced on its own; the running result is kept in complemented form, so the
 * overall minimum corresponds to the largest complemented column minimum.
 */
uint8_t minarray(const uint64_t s[8][100]) {
    unsigned best = 0; /* largest complemented column minimum seen so far */

    for (int col = 0; col < 100; col++) {
        uint64_t candidates = ~(uint64_t)0; /* positions still tied in this column */
        unsigned inverted = 0;              /* complement of this column's minimum */

        /* Scan the bit planes from the most significant to the least. */
        for (int plane = 7; plane >= 0; plane--) {
            uint64_t zeros = candidates & ~s[plane][col];
            if (zeros != 0) {
                inverted |= 1u << plane;
                candidates = zeros;
            }
        }

        if (inverted <= best) {
            continue;
        }
        best = inverted;
        if (best == 0xFF) {
            /* A value of 0 was found; nothing can be smaller. */
            break;
        }
    }
    return (uint8_t)~best;
}
为了让这段代码能够向量化,可以交换两层循环的顺序:把 x 和 mask 改为各含 100 个 uint64_t 的数组来进行计算,结果相同,但编译器可以对其中的一些内层循环进行向量化:
/*
 * Return the smallest of 6400 byte values stored in bit-sliced form.
 *
 * s[b][j] holds bit b of the 64 values belonging to column j. The bit planes
 * form the outer loop and the columns the inner loops, so each inner loop is
 * a simple pass over 100 consecutive words.
 */
uint8_t minarray1(const uint64_t s[8][100]) {
    enum { COLS = 100 };

    /* Per-column sets of positions still tied for the overall minimum. */
    uint64_t candidates[COLS];
    for (int col = 0; col < COLS; col++) {
        candidates[col] = ~(uint64_t)0;
    }

    unsigned inverted = 0; /* complement of the minimum, built MSB first */

    for (int plane = 7; plane >= 0; plane--) {
        uint64_t zeros[COLS];
        uint64_t any = 0;

        /* Find the candidates that have a 0 in this bit plane. */
        for (int col = 0; col < COLS; col++) {
            zeros[col] = candidates[col] & ~s[plane][col];
            any |= zeros[col];
        }

        if (any == 0) {
            /* Every remaining candidate has a 1 here; keep them all. */
            continue;
        }

        /* The minimum has a 0 in this plane; drop candidates that have a 1. */
        inverted |= 1u << plane;
        for (int col = 0; col < COLS; col++) {
            candidates[col] = zeros[col];
        }
    }
    return (uint8_t)~inverted;
}
clang 同样会为这段代码生成循环展开并向量化的代码。这种方法是否比前一种方法性能更好,需要通过基准测试来判断。