我有两个 TMP 版本。哪个更好,取决于数据类型,我猜:
解决方案 A:
首先,让我们为分割点找到一个好的偏移量(2 的幂看起来不错):
// Compile-time choice of the split offset for a range of `diff` elements.
// Starting from V, the candidate is doubled for as long as the doubled
// value is still smaller than diff - 1; the last candidate is the result.
template <std::ptrdiff_t diff, std::ptrdiff_t V = 2>
struct offset
{
private:
    static constexpr std::ptrdiff_t doubled = V * 2;
    static constexpr bool stop_here = (diff - 1 <= doubled);

public:
    // Naming offset<diff, doubled>::value instantiates that template even
    // when V is selected, so the chain always runs up to the 1 << 16
    // specialization below.
    static constexpr std::ptrdiff_t value =
        stop_here ? V : offset<diff, doubled>::value;
};

// Recursion terminator: candidates never grow beyond 1 << 16.
template <std::ptrdiff_t diff>
struct offset<diff, (1 << 16)>
{
    static constexpr std::ptrdiff_t value = (1 << 16);
};

// Ranges of zero, one or two elements are given the offset 0.
template <>
struct offset<0, 2>
{
    static constexpr std::ptrdiff_t value = 0;
};

template <>
struct offset<1, 2>
{
    static constexpr std::ptrdiff_t value = 0;
};

template <>
struct offset<2, 2>
{
    static constexpr std::ptrdiff_t value = 0;
};
有了这个,我们可以创建一个递归的 TMP 版本:
// Folds the `diff` elements of [begin, end) with `op`, where the element
// count is a template parameter. Ranges of more than two elements are split
// at offset<diff>::value and the two halves are folded recursively.
//
// begin, end: iterators delimiting the range; end - begin must equal diff.
// op:         binary operation used to combine two values.
// Returns the combined value of the whole range.
template <std::ptrdiff_t diff, class It, class Func>
auto binary_fold_tmp(It begin, It end, Func op)
    -> decltype(op(*begin, *end))
{
    assert(end - begin == diff);

    if (diff == 0)
    {
        assert(false);
        return 0; // This will never happen
    }
    if (diff == 1)
    {
        return *begin;
    }
    if (diff == 2)
    {
        return op(*begin, *(begin + 1));
    }

    // General case: the left part gets offset<diff>::value elements, the
    // right part gets whatever remains.
    constexpr std::ptrdiff_t left_size = offset<diff>::value;
    constexpr std::ptrdiff_t right_size = diff - left_size;

    It split{begin};
    std::advance(split, left_size);

    auto lhs = binary_fold_tmp<left_size>(begin, split, op);
    auto rhs = binary_fold_tmp<right_size>(split, end, op);
    return op(lhs, rhs);
}
这可以与这样的非 TMP 版本结合使用,例如:
// Folds the non-empty range [begin, end) with `op`, dispatching on the
// run-time length to the fixed-size binary_fold_tmp instantiations.
//
// begin, end: iterators delimiting the range; end - begin must be > 0.
// op:         binary operation used to combine two values.
// Returns the combined value of the whole range.
//
// Lengths that match an instantiated size exactly (1..8, 16, 32) are folded
// in one call. Any other length folds a fixed-size prefix and recurses on
// the tail, which is never empty.
template <class It, class Func>
auto binary_fold(It begin, It end, Func op)
    -> decltype(op(*begin, *end))
{
    const auto diff = end - begin;
    assert(diff > 0);
    switch (diff)
    {
    case 1:
        return binary_fold_tmp<1>(begin, end, op);
    case 2:
        return binary_fold_tmp<2>(begin, end, op);
    case 3:
        return binary_fold_tmp<3>(begin, end, op);
    case 4:
        return binary_fold_tmp<4>(begin, end, op);
    case 5:
        return binary_fold_tmp<5>(begin, end, op);
    case 6:
        return binary_fold_tmp<6>(begin, end, op);
    case 7:
        return binary_fold_tmp<7>(begin, end, op);
    case 8:
        return binary_fold_tmp<8>(begin, end, op);
    // Exact matches for the larger prefixes. Without these cases the default
    // branch would fold the entire range and then recurse on an empty tail.
    case 16:
        return binary_fold_tmp<16>(begin, end, op);
    case 32:
        return binary_fold_tmp<32>(begin, end, op);
    default:
        if (diff < 16)
        {
            // 9..15 elements: the tail holds 1..7 elements.
            return op(binary_fold_tmp<8>(begin, begin + 8, op),
                      binary_fold(begin + 8, end, op));
        }
        else if (diff < 32)
        {
            // 17..31 elements: the tail holds 1..15 elements.
            return op(binary_fold_tmp<16>(begin, begin + 16, op),
                      binary_fold(begin + 16, end, op));
        }
        else
        {
            // 33 or more elements: the tail holds at least one element.
            return op(binary_fold_tmp<32>(begin, begin + 32, op),
                      binary_fold(begin + 32, end, op));
        }
    }
}
解决方案 B:
这会计算成对结果,将它们存储在缓冲区中,然后使用缓冲区调用自身:
// Base case of the pairwise fold: the range holds exactly one pair, which
// is selected when the index sequence is <0>.
//
// Ending the recursion with an overload instead of a run-time `if` keeps
// the compiler from instantiating the general template for diff < 2, where
// the pair buffer would have to be a zero-length array.
template <std::ptrdiff_t diff, class It, class Func>
auto binary_fold_pairs_impl(It begin,
                            It end,
                            Func op,
                            const std::index_sequence<0>&)
    -> decltype(op(*begin, *end))
{
    static_assert(diff == 2,
                  "binary_fold_pairs_impl: a single pair requires diff == 2");
    // Go through the element type, exactly like the pair buffer of the
    // general case does.
    std::decay_t<decltype(*begin)> result = op(*begin, *(begin + 1));
    return result;
}

// Pairwise fold of `diff` elements starting at `begin`.
//
// Combines neighbouring elements (0,1), (2,3), ... with `op`, stores the
// diff / 2 results in a local buffer and folds that buffer the same way
// until a single pair is left.
//
// diff:       number of elements; must be a power of two.
// begin, end: iterators delimiting the range. Elements are reached through
//             `begin + index`; `end` only takes part in the return type.
// op:         binary operation used to combine two values.
// Is...:      the indices 0 .. diff / 2 - 1, one per pair.
// Returns the combined value of the whole range.
template <std::ptrdiff_t diff, class It, class Func, std::size_t... Is>
auto binary_fold_pairs_impl(It begin,
                            It end,
                            Func op,
                            const std::index_sequence<Is...>&)
    -> decltype(op(*begin, *end))
{
    static_assert(diff >= 4 && (diff & (diff - 1)) == 0,
                  "binary_fold_pairs_impl: diff must be a power of two");
    static_assert(sizeof...(Is) == diff / 2,
                  "binary_fold_pairs_impl: one index per pair is required");

    std::decay_t<decltype(*begin)> pairs[diff / 2] = {
        op(*(begin + 2 * Is), *(begin + 2 * Is + 1))...};

    // diff >= 4 here, so at least two partial results remain to be folded.
    return binary_fold_pairs_impl<diff / 2>(
        &pairs[0],
        &pairs[0] + diff / 2,
        op,
        std::make_index_sequence<diff / 4>{});
}
// Pairwise fold of exactly `diff` elements starting at `begin`.
//
// diff:       number of elements; must be a power of two (2, 4, 8, ...).
// begin, end: iterators delimiting the range.
// op:         binary operation used to combine two values.
// Returns the combined value of the whole range.
template <std::ptrdiff_t diff, class It, class Func>
auto binary_fold_pairs(It begin, It end, Func op) -> decltype(op(*begin, *end))
{
    // The halving recursion only ends on a single pair when diff is a power
    // of two, so reject every other size at compile time.
    static_assert(diff >= 2 && (diff & (diff - 1)) == 0,
                  "binary_fold_pairs: diff must be a power of two >= 2");
    return binary_fold_pairs_impl<diff>(
        begin, end, op, std::make_index_sequence<diff / 2>{});
}
此模板函数要求 diff 是 2 的幂。当然,您也可以将它与非模板版本结合使用:
// Folds the non-empty range [begin, end) with `op`, combining the
// fixed-size binary_fold_pairs (power-of-two sizes only) with run-time
// dispatch for every other length.
//
// begin, end: iterators delimiting the range; end - begin must be > 0.
// op:         binary operation used to combine two values.
// Returns the combined value of the whole range.
template <class It, class Func>
auto binary_fold_mix(It begin, It end, Func op) -> decltype(op(*begin, *end))
{
    const auto diff = end - begin;
    assert(diff > 0);
    switch (diff)
    {
    case 1:
        return *begin;
    case 2:
        return binary_fold_pairs<2>(begin, end, op);
    case 3:
        // One pair plus the trailing element.
        return op(binary_fold_pairs<2>(begin, begin + 2, op),
                  *(begin + 2));
    case 4:
        return binary_fold_pairs<4>(begin, end, op);
    case 5:
        // Four elements plus the trailing element.
        return op(binary_fold_pairs<4>(begin, begin + 4, op),
                  *(begin + 4));
    case 6:
        // Four elements plus one pair. The tail holds only two elements,
        // so it must use the two-element fold to stay inside the range.
        return op(binary_fold_pairs<4>(begin, begin + 4, op),
                  binary_fold_pairs<2>(begin + 4, begin + 6, op));
    case 7:
        return op(binary_fold_pairs<4>(begin, begin + 4, op),
                  binary_fold_mix(begin + 4, begin + 7, op));
    case 8:
        return binary_fold_pairs<8>(begin, end, op);
    default:
        if (diff <= 16)
        {
            // 9..16 elements: the tail holds 1..8 elements.
            return op(binary_fold_pairs<8>(begin, begin + 8, op),
                      binary_fold_mix(begin + 8, end, op));
        }
        else if (diff <= 32)
        {
            // 17..32 elements: the tail holds 1..16 elements.
            return op(binary_fold_pairs<16>(begin, begin + 16, op),
                      binary_fold_mix(begin + 16, end, op));
        }
        else
        {
            // 33 or more elements: the tail holds at least one element.
            return op(binary_fold_pairs<32>(begin, begin + 32, op),
                      binary_fold_mix(begin + 32, end, op));
        }
    }
}
我使用与 MtRoad 相同的程序进行测量。在我的机器上,差异没有 MtRoad 报告的那么大。使用 -O3 时,解决方案 A 和 B 似乎比 MtRoad 的版本稍快,但实际上,您需要使用您自己的类型和数据进行测试。
备注:我没有太严格地测试我的版本。