使用 C++11 可变参数模板在编译时快速排序答案

【问题标题】：Quick sort at compilation time using C++11 variadic templates（使用 C++11 可变参数模板在编译时快速排序）
【发布时间】：2011-11-03 12:42:16
【问题描述】：

我刚刚实现了快速排序算法，使用 C++11 可变参数模板在编译时对其进行评估。但是，当数据集太大时，我会遇到性能问题。

#include <iostream>

using namespace std;

// Compile-time list of ints; the values live only in the template arguments,
// the struct itself carries no members.
// One primary template covers every length, including the empty Seq<>. The
// former partial specialization Seq<v1, vs...> had the same empty body, so it
// only duplicated the primary definition.
template<int... vs>
struct Seq
{};


// Prepends a one-element sequence to another sequence.
// Only the <Seq<x>, Seq<xs...>> form provides a result; the primary template
// is intentionally empty, so any other argument shape has no ResultType.
template<typename HeadSeq, typename TailSeq>
struct PushFront {};

// ResultType is Seq<head, tail...>.
template<int head, int... tail>
struct PushFront<Seq<head>, Seq<tail...>> {
  using ResultType = Seq<head, tail...>;
};

// Splits a non-empty sequence into its first element and everything after it.
// The primary template is empty, so an empty Seq<> (or a non-Seq type) exposes
// neither member.
template<typename SeqT>
struct PopFront {};

//   ResultType  - Seq holding only the first value
//   RemaindType - Seq holding the values that follow it
template<int first, int... rest>
struct PopFront<Seq<first, rest...>> {
  using ResultType  = Seq<first>;
  using RemaindType = Seq<rest...>;
};

// Concatenates two sequences. Only defined when both arguments are Seq<...>;
// the primary template carries no ResultType.
template<typename LeftSeq, typename RightSeq>
struct CatSeq {};

// ResultType lists the left values followed by the right values.
template<int... left, int... right>
struct CatSeq<Seq<left...>, Seq<right...>> {
  using ResultType = Seq<left..., right...>;
};


// Adds one element (a one-value Seq) to exactly one of two sets, selected by
// the boolean:
//   true  -> element is pushed onto the front of the "true" set
//   false -> element is pushed onto the front of the "false" set
// The set that was not selected is forwarded unchanged.
template<bool intoTrueSet, typename ElemT, typename TrueSet, typename FalseSet>
struct Classify {};

template<typename ElemT, typename TrueSet, typename FalseSet>
struct Classify<true, ElemT, TrueSet, FalseSet> {
  using NewTrueClsT  = typename PushFront<ElemT, TrueSet>::ResultType;
  using NewFalseClsT = FalseSet;
};

template<typename ElemT, typename TrueSet, typename FalseSet>
struct Classify<false, ElemT, TrueSet, FalseSet> {
  using NewTrueClsT  = TrueSet;
  using NewFalseClsT = typename PushFront<ElemT, FalseSet>::ResultType;
};

// Compares two one-value sequences.
// `result` is true when the first value is greater than or equal to the second.
template<typename LhsSeq, typename RhsSeq>
struct Compare {};

template<int lhs, int rhs>
struct Compare<Seq<lhs>, Seq<rhs>> {
  static const bool result = !(lhs < rhs);
};


// Distributes every element of SeqT into one of two accumulators, relative to
// the one-value sequence AnchorT:
//   RstGESet - GESet plus each element e with e >= anchor
//   RstLSet  - LSet  plus each element e with e <  anchor
// Elements are pushed onto the FRONT of their accumulator as they are visited,
// so each result lists its new elements in the reverse of their order in SeqT.
template<typename AnchorT, typename SeqT, typename GESet, typename LSet>
struct PartitionImpl{};

// Base case: nothing left to classify, the accumulators are the answer.
// This also makes partitioning an empty sequence well-formed.
template<typename GESet, typename LSet, int anchorv>
struct PartitionImpl<Seq<anchorv>, Seq<>, GESet, LSet>{
  typedef GESet RstGESet;
  typedef LSet  RstLSet;
};

// Recursive case: classify the first element, then continue with the rest.
template<typename GESet, typename LSet, int anchorv, int v1, int...vs>
struct PartitionImpl<Seq<anchorv>, Seq<v1, vs...>, GESet, LSet>{
  // true when the current element belongs to the ">= anchor" side
  static const bool isge=Compare<Seq<v1>, Seq<anchorv>>::result;

  // accumulators after the current element has been added to its side
  typedef typename Classify<isge, Seq<v1>, GESet, LSet>::NewTrueClsT   TmpRstGESet;
  typedef typename Classify<isge, Seq<v1>, GESet, LSet>::NewFalseClsT  TmpRstLSet;

  typedef typename PartitionImpl<Seq<anchorv>, Seq<vs...>, TmpRstGESet, TmpRstLSet>::RstGESet RstGESet;
  typedef typename PartitionImpl<Seq<anchorv>, Seq<vs...>, TmpRstGESet, TmpRstLSet>::RstLSet  RstLSet;
};


// Splits a sequence of at least two values around its first value (the anchor).
// The primary template is empty, so shorter sequences expose no result.
template<typename SeqT>
struct Partition {};

// The anchor is part of the partitioned input as well; since it compares
// greater-or-equal to itself it is placed in RstGESet.
//   RstGESet - values >= anchor
//   RstLSet  - values <  anchor
template<int anchor, int second, int... rest>
struct Partition<Seq<anchor, second, rest...>> {
  using AnchorType = Seq<anchor>;
  using GESet      = Seq<>;   // initial (empty) accumulator for the >= side
  using LSet       = Seq<>;   // initial (empty) accumulator for the <  side

private:
  using Whole = Seq<anchor, second, rest...>;
  using Split = PartitionImpl<AnchorType, Whole, GESet, LSet>;

public:
  using RstGESet = typename Split::RstGESet;
  using RstLSet  = typename Split::RstLSet;
};

// Applies SortFn to the two halves of a partition and exposes the outcomes as
// TmpGESet2 / TmpLSet2.
// A half whose type is exactly the unpartitioned input (WholeT) is passed
// through untouched instead of being handed to SortFn again.
template<typename WholeT, typename UpperT, typename LowerT, template<typename> class SortFn>
struct SortSub {
  using TmpGESet2 = typename SortFn<UpperT>::ResultType;
  using TmpLSet2  = typename SortFn<LowerT>::ResultType;
};

// Upper half is identical to the input: keep it as is.
template<typename WholeT, typename LowerT, template<typename> class SortFn>
struct SortSub<WholeT, WholeT, LowerT, SortFn> {
  using TmpGESet2 = WholeT;
  using TmpLSet2  = typename SortFn<LowerT>::ResultType;
};

// Lower half is identical to the input: keep it as is.
template<typename WholeT, typename UpperT, template<typename> class SortFn>
struct SortSub<WholeT, UpperT, WholeT, SortFn> {
  using TmpGESet2 = typename SortFn<UpperT>::ResultType;
  using TmpLSet2  = WholeT;
};

template<typename T>
struct Sort;
template<>
struct Sort<Seq<>>{
  typedef Seq<> ResultType;
};
template<int v>
struct Sort< Seq<v> >{
  typedef Seq<v> ResultType;
};
template<int v1, int...vs>
struct Sort< Seq<v1, vs...> >{
  typedef Seq<v1, vs...> SrcType;
  typedef typename Partition< Seq<v1, vs...> >::RstGESet TmpGESet;
  typedef typename Partition< Seq<v1, vs...> >::RstLSet TmpLSet;

  //to by pass the case SrcType <==> TmpGESet or  SrcType <==> TmpLSet
  typedef typename SortSub<SrcType, TmpGESet, TmpLSet, Sort>::TmpGESet2  TmpGESet2;
  typedef typename SortSub<SrcType, TmpGESet, TmpLSet, Sort>::TmpLSet2   TmpLSet2;

  typedef typename CatSeq<TmpGESet2, TmpLSet2>::ResultType ResultType;
};


// Writes the values of a sequence to std::cout, each followed by one space.

// Empty sequence: nothing to write.
void dumpSeqTypeImpl(Seq<>) {}

// Exactly one value.
template<int only>
void dumpSeqTypeImpl(Seq<only>) {
  std::cout << only << " ";
}

// Leading value first, then the remaining values via the matching overload.
template<int head, int... tail>
void dumpSeqTypeImpl(Seq<head, tail...>) {
  std::cout << head << " ";
  typedef Seq<tail...> Rest;
  dumpSeqTypeImpl(Rest());
}
// Prints a sequence on its own line as "Seq type < v1 v2 ...  >" and flushes
// the stream.
template<int... values>
void dumpSeqType(Seq<values...> seq) {
  std::cout << "Seq type < ";
  dumpSeqTypeImpl(seq);
  std::cout << " >" << std::endl;
}

    //test data
#include "qsort_input.txt"

int main(){
  //Seq<>  s0;// aggregate ‘Seq<> s0’ has incomplete type and cannot be defined
  Seq<1> s1;
  Seq<1, 2> s2;

  typedef Seq<5, 5, 5> TestType_SAME;
  TestType_SAME same;
  dumpSeqType( same );
  typename Partition< TestType_SAME >::RstGESet _ts1;
  typename Partition< TestType_SAME >::RstLSet _ts2;
  dumpSeqType( _ts1 );
  dumpSeqType( _ts2 );

#if 1
  typedef Seq<4, 7, 3, 9, 1, 2, 5, 5, 19, 5> TestType;
  TestType s3;
  dumpSeqType( s3 );
  typename Partition< TestType >::RstGESet ts1;
  typename Partition< TestType >::RstLSet ts2;
  dumpSeqType( ts1 );
  dumpSeqType( ts2 );

  typename Sort<TestType>::ResultType so1;
  dumpSeqType( so1 );
#endif 

#if 1
  typedef Seq<TEST_DATA_100> TAdvanceType;
  typename Sort<TAdvanceType>::ResultType soadvance;
  dumpSeqType(soadvance);
#endif

  return 0;
}

当数据集为TEST_DATA_100时，编译需要1.7s。
当数据集为 TEST_DATA_1000 时，编译器似乎停止了......

我使用的是 gcc 4.6.0。

【问题讨论】：

嘿，有趣。将性能问题报告为 GCC 错误。 :)
我不知道该说什么（除了“接受更多答案！！”），但在 TMP 中写出这件事绝对是 +1！
Nitpick：您不需要定义，只需要声明即可，这样代码会更短。例如，写 struct Seq; 而不是 struct Seq {};

标签： c++ metaprogramming quicksort variadic-templates c++11

【解决方案1】：

您是否也查看过它的内存消耗？请注意，快速排序本身的复杂度就比线性更差，最坏情况下的运行时间非常糟糕。这还要与模板编译和实例化中某些步骤的超线性（有时是指数级的）运行时行为相乘。您可能应该绘制各种数据集的编译时间图，以观察代码的真正复杂性类别。通常使用如此大的数据集进行模板元编程是不可行的。

编辑： 出于好奇，我尝试了代码，发现直到 ~500 个元素它大致遵循公式 pow(N*log(N),1.47)*0.0004+0.6，但随后开始变得非常慢，700 个元素需要 155 秒。此外，它开始消耗大量内存（600 个元素需要 3GiB），这让我得出结论：对于 1000 个元素，它需要的内存比大多数人拥有的还多，并且需要数小时才能编译。

进一步注意，当不是每个元素都是唯一的时，代码不起作用。

【讨论】：

gcc.gnu.org/gcc-4.5/changes.html: Compilation time for code that uses templates should now scale linearly with the number of instantiations rather than quadratically, as template instantiations are now looked up using hash tables.
听起来他们对哈希表有一个固定的大小并且不会增长它，并且在 500 个元素之后会退化（这当然会导致更多的实例化）
我拿了 4.6，因为 OP 也用过。
我只能说：有趣。那么，我分享你对哈希表的假设。
很棒的发现。关于“当不是每个元素都是唯一的时，代码不起作用”。我明天修。而且由于我在 Virtual Box 中运行 linux，所以我没有跟踪性能。

【解决方案2】：

您正在使用递归元函数来构建您的快速排序。当您尝试向编译器推送 1000 个递归实例时，您究竟期望会发生什么？

仅仅因为一个函数理论上可以接受任意数量的参数并不意味着编译器实际上可以处理任意数量的参数。编译器有限制。

另外：编译时排序的意义何在？您可以离线执行此操作并将数据复制到 .cpp 文件中。或者在程序启动时运行一次std::sort。

【讨论】：

如果您要排序的是类型，那么编译时排序可能是有意义的。
这类代码的重点通常是代码准确地记录了它在做什么。您以在应用程序域中看起来“自然”且最容易维护的方式呈现数据。然后您通过 Sort 元函数对其进行处理，以便以算法所需的形式获取数据。这样的事情可能会为您节省半页关于数据是什么、数据来自何处以及如何将其转换成所需形式的注释。与需要大量注释的代码相比，我更喜欢乍一看就清晰的代码。
当然，这一切并不排除 OP 这样做可能只是为了好玩。 :)
是的，我这样做只是为了好玩。很久以前我一直想在编译中实现快速排序（当时 C++11 还只是 C++0x）。现在 C++0x 是 C++11。我可以通过使用可变参数模板来实现它。好笑。
如果 OP 要求提供编译时解决方案，就不能用“可以在运行时完成”来回答。排序也可以在纸上完成，或者外包给某家外部公司来做 :-)