【发布时间】:2014-02-25 19:07:18
【问题描述】:
我是 CPU 多线程算法的新手,我正在尝试实现标准的三层神经网络并行前馈算法。 问题在于并行版本慢了大约 10 倍...... 我认为原因是执行了太多线程。 我正在使用 4 核的 Intel i7 920,禁用超线程。 操作系统 Fedora 20,编译器 GCC 4.8.2
有什么办法可以提高性能吗?
/// Runs one forward pass through the network, splitting each hidden layer
/// into four equally sized slices that are evaluated concurrently.
///
/// Layers are processed strictly in order: every worker of a layer is joined
/// before the next layer starts, because the next layer reads the activations
/// the previous one wrote.
///
/// Thread creation is expensive compared to the arithmetic done per slice, so
/// the calling thread evaluates one slice itself instead of idling in join(),
/// and the two output neurons are computed inline rather than on two extra
/// threads. The set of parallel_sumN(size, start) calls is the same as before;
/// only the thread they run on differs.
///
/// NOTE(review): only slices [0, 4*block_size0) and [0, 4*block_size1) are
/// evaluated, so this presumably expects each hidden layer size to be exactly
/// four times its block size -- confirm against the class definition.
/// NOTE(review): if a slice evaluated on the calling thread throws while the
/// workers are still joinable, std::terminate is called; the slice functions
/// shown here perform only arithmetic, so this is not expected to happen.
template<class T, class TM>
void NeuralNetwork<T, TM>::feedForwardParallel()
{
    // Input -> first hidden layer: three workers plus the calling thread.
    {
        thread worker1(&NeuralNetwork<T, TM>::parallel_sum0, this, block_size0, block_size0);
        thread worker2(&NeuralNetwork<T, TM>::parallel_sum0, this, block_size0, 2*block_size0);
        thread worker3(&NeuralNetwork<T, TM>::parallel_sum0, this, block_size0, 3*block_size0);
        parallel_sum0(block_size0, 0);
        worker1.join();
        worker2.join();
        worker3.join();
    }

    // First hidden -> second hidden layer: same partitioning scheme.
    {
        thread worker1(&NeuralNetwork<T, TM>::parallel_sum1, this, block_size1, block_size1);
        thread worker2(&NeuralNetwork<T, TM>::parallel_sum1, this, block_size1, 2*block_size1);
        thread worker3(&NeuralNetwork<T, TM>::parallel_sum1, this, block_size1, 3*block_size1);
        parallel_sum1(block_size1, 0);
        worker1.join();
        worker2.join();
        worker3.join();
    }

    // Second hidden -> output layer: output neurons 0 and 1 are one dot
    // product each, far cheaper than starting a thread, so run them here.
    parallel_sum2(1, 0);
    parallel_sum2(1, 1);
}
/// Computes the activations of first-hidden-layer neurons in the index range
/// [start, start + size).
///
/// For each neuron i the weighted sum of all INPUT_NEURONS inputs is formed
/// from column i of weightsIH, the extra weight in row INPUT_NEURONS (added
/// without an input factor, i.e. acting as the bias term) is added, and the
/// result is passed through sigmoid() and stored in hidden1N[i].
///
/// Only hidden1N[start .. start+size-1] is written, so calls on disjoint
/// ranges do not touch the same output elements.
///
/// @param size   number of consecutive neurons to evaluate
/// @param start  index of the first neuron to evaluate
template <class T, class TM>
void NeuralNetwork<T, TM>::parallel_sum0(int size, int start)
{
    const int end = start + size;
    for (int i = start; i < end; i++)
    {
        // The accumulator must start at zero for every neuron; carrying it
        // over from the previous neuron would leak that neuron's sum into
        // this one and make the result depend on how the layer is sliced.
        T sum = 0;
        for (int j = 0; j < INPUT_NEURONS; j++)
            sum += inputN[j] * weightsIH[j][i];
        sum += weightsIH[INPUT_NEURONS][i];
        hidden1N[i] = sigmoid(sum);
    }
}
/// Computes the activations of second-hidden-layer neurons in the index range
/// [start, start + size).
///
/// For each neuron i the weighted sum of all HIDDEN_NEURONS1 activations in
/// hidden1N is formed from column i of weightsHH, the extra weight in row
/// HIDDEN_NEURONS1 (added without an input factor, i.e. acting as the bias
/// term) is added, and the result is passed through sigmoid() and stored in
/// hidden2N[i].
///
/// Reads all of hidden1N, so the first hidden layer must be fully computed
/// before this is called. Only hidden2N[start .. start+size-1] is written.
///
/// @param size   number of consecutive neurons to evaluate
/// @param start  index of the first neuron to evaluate
template <class T, class TM>
void NeuralNetwork<T, TM>::parallel_sum1(int size, int start)
{
    const int end = start + size;
    for (int i = start; i < end; i++)
    {
        // Fresh accumulator per neuron: reusing one across iterations would
        // add the previous neurons' sums into this neuron's input.
        T sum = 0.0;
        for (int j = 0; j < HIDDEN_NEURONS1; j++)
            sum += hidden1N[j] * weightsHH[j][i];
        sum += weightsHH[HIDDEN_NEURONS1][i];
        hidden2N[i] = sigmoid(sum);
    }
}
/// Computes the activations of output-layer neurons in the index range
/// [start, start + size).
///
/// For each neuron i the weighted sum of all HIDDEN_NEURONS2 activations in
/// hidden2N is formed from column i of weightsHO, the extra weight in row
/// HIDDEN_NEURONS2 (added without an input factor, i.e. acting as the bias
/// term) is added, and the result is passed through sigmoid() and stored in
/// outputN[i].
///
/// Reads all of hidden2N, so the second hidden layer must be fully computed
/// before this is called. Only outputN[start .. start+size-1] is written.
///
/// @param size   number of consecutive neurons to evaluate
/// @param start  index of the first neuron to evaluate
template <class T, class TM>
void NeuralNetwork<T, TM>::parallel_sum2(int size, int start)
{
    const int end = start + size;
    for (int i = start; i < end; i++)
    {
        // Fresh accumulator per neuron so that a call with size > 1 yields
        // the same values as separate size-1 calls.
        T sum = 0.0;
        for (int j = 0; j < HIDDEN_NEURONS2; j++)
            sum += hidden2N[j] * weightsHO[j][i];
        sum += weightsHO[HIDDEN_NEURONS2][i];
        outputN[i] = sigmoid(sum);
    }
}
/// Activation function applied to a neuron's weighted input sum.
///
/// Despite its name this is the hyperbolic tangent, not the logistic
/// function: the result is whatever tanh() returns for the argument,
/// converted to T.
///
/// @param val  weighted input sum of a neuron
/// @return     tanh(val)
template<class T, class TM>
T NeuralNetwork<T, TM>::sigmoid(T val)
{
    const T activation = tanh(val);
    return activation;
}
【问题讨论】:
-
你好审查员,他是一个新手,他的问题描述得不是那么糟糕......
-
您可以使用 std::thread::hardware_concurrency 获取硬件支持的最大并发线程数。您需要把线程“推送”到线程池中;GitHub 上的 “taskqueue” 库是一个很容易上手的选择。它基于 boost::asio,而 boost::asio 提供了实现这一目标的良好工具。
标签: c++ multithreading c++11 neural-network