【发布时间】:2017-03-24 18:12:54
【问题描述】:
我有一个奇怪的问题。 我正在使用 OpenMP 库在 C++ 中编写一个简单的异步优化算法。 我编写了代码,它运行良好,没有任何错误。
然后我尝试评估某段代码的计算时间。 在我的系统上,该部分大约需要 12 秒。
然后,我注意到如果我注释与该部分完全无关的代码行,该部分的计算时间会减少很多!它下降到大约 1 秒。
我不知道如何为您提供显示我的问题的简单代码。 我在下面附上的代码是我的原始代码,我从中删除了所有不创建时间问题的部分。 不幸的是,我无法从代码中删除其他行,因为我尝试删除的每一行都会更改我感兴趣的部分的执行时间。
我指的是这一节,它在代码的末尾:
double gradientD_time = omp_get_wtime();
compute_function_gradient_D(gradient_D, DX, K, M, N);
double gradientD_total = (omp_get_wtime()- gradientD_time);
您可能会看到这里我正在评估 compute_function_gradient_D() 函数的计算时间。如果我运行这段代码,执行大约需要 12 秒。 如果从代码中删除行,则该部分的执行时间会下降到 1 秒。 您可以尝试删除的行示例:
std::string str_1 = folder + "parameters.dat";
std::string str_2 = folder + "times.dat";
std::string str_3 = folder + "merits.dat";
std::string str_4 = folder + "values.dat";
std::string str_5 = folder + "lipx.dat";
std::string str_6 = folder + "lipd.dat";
或
throw std::exception();
或
merits[iter] = max_br_init;
这些行与我正在计算执行时间的部分完全无关...如果我删除其中一行,为什么执行时间会改变?这里发生了什么?
#include <omp.h>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <iostream>
#include <stdexcept>
#include <algorithm>
#include "mkl.h"
void compute_function_gradient_D(double *gradient_D, double *DX, int K, int M, int N) {
for (int j = 0; j < K; j++){
for (int i = 0; i < M; i++){
gradient_D[j*M+i] = 0;
for (int k = 0; k < N; k++)
gradient_D[i+M*j] += DX[i+k*M];
}
}
}
double compute_D_const(double *D, int M, int K){
double L1norm_col = 0.0, err0=0, err1 = 0.0, tol=1e-6, normx = 0.0, normy= 0.0, nrm2= 0.0;
int count = 0;
double *Dt_col = new double[K]();
double *DDtb = new double[M]();
double *Dtb = new double[K]();
for (int i = 0; i < M; i++){
Dt_col[0:K:1] = D[i:K:M];
L1norm_col = cblas_dasum(K, Dt_col, 1);
DDtb[i] = L1norm_col;
}
nrm2 = cblas_dnrm2(M, DDtb, 1);
cblas_dscal(M, 1.0/nrm2, DDtb, 1);
err1 = nrm2;
while(std::abs(err1-err0)>tol*err1 && count<20){
err0 = err1;
cblas_dgemv(CblasColMajor, CblasTrans, M, K, 1.0, D, M, DDtb, 1 , 0.0, Dtb, 1);
cblas_dgemv(CblasColMajor, CblasNoTrans, M, K, 1.0, D, M, Dtb, 1, 0.0, DDtb, 1);
normx = cblas_dnrm2(M, DDtb, 1);
normy = cblas_dnrm2(K, Dtb, 1);
err1 = normx/normy;
cblas_dscal(M, 1.0/normx, DDtb, 1);
count++;
if(count>100) break;
}
err1*= err1;
delete [] Dt_col; delete [] DDtb; delete [] Dtb;
return err1;
}
void compute_function_gradient_X(double *gradient_X, double *D, double *DX, int over_X, int fe_X, int K, int M, int kn) {
int current_index_X = 0, col = 0, row = 0;
for (int i = 0; i < (kn+over_X); i++){
gradient_X[i] = 0.0;
current_index_X = fe_X + i;
col = std::floor(current_index_X/K);
row = current_index_X - col*K;
for(int j = 0; j < M; j++)
gradient_X[i] += D[M*row+j]*DX[M*col+j];
}
}
int main (int argc, char **argv) {
srand(time(NULL));
int max_time = 15000;
int max_iter = 1;
int time_flag = 0;
int merit_flag = 0;
int iter_flag = 0;
int iter = 0;
int core_count = 0;
double merit_limit = 1e-6;
double tau_0 = 1;
int number_of_threads = 1;
int M = 0;
int K = 0;
int N = 0;
double entry = 0.0;
int kn = 0.0;
int uneven_X = 0;
int uneven_D = 0;
int k = 0;
double lambda = 1;
double constr = 1;
double warm_up = 10;
std::string data = "../../data/param.dat";
FILE *file = fopen(data.c_str(), "r");
if (file == NULL) {
std::cout << "ERROR" << std::endl;
throw std::exception();
}
fscanf(file, "%lf", &entry); M = entry; fscanf(file, "\n");
fscanf(file, "%lf", &entry); K = entry; fscanf(file, "\n");
fscanf(file, "%lf", &entry); N = entry; fscanf(file, "\n");
fscanf(file, "%lf", &entry); lambda = entry; fscanf(file, "\n");
fscanf(file, "%lf", &entry); constr = entry; fscanf(file, "\n");
fscanf(file, "%lf", &entry); warm_up = entry;
fclose(file);
double *X = new double[N*K]();
double *D = new double[M*K]();
double *S = new double[N*M]();
double *times = new double[max_iter+2*number_of_threads+1]();
double *merits = new double[max_iter+2*number_of_threads+1]();
double *values = new double[max_iter+2*number_of_threads+1]();
double *Lip_X = new double[max_iter+2*number_of_threads+1]();
double *Lip_D = new double[max_iter+2*number_of_threads+1]();
int *actual_iteration_vector = new int[number_of_threads]();
double f_value = 0.0;
for (int i = 0; i < M*N; i++)
f_value += S[i]*S[i];
double *nabla_X_init = new double[K*N]();
double max_br_init = 0.0;
double x_hat_init = 0.0, gradient_init = 0.0, parameter_init = 0.0, tauX_init = 0.0, LipD_init = 0.0;
double m_value = 9999;
int t_warm_up = warm_up*number_of_threads;
LipD_init = compute_D_const(D, M, K);
tauX_init = std::max(LipD_init, tau_0);
cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, K, N, M, -1.0, D, M, S, M, 0.0, nabla_X_init, K);
for (int i = 0; i < (K*N); i++){
gradient_init = nabla_X_init[i];
x_hat_init = X[i] - gradient_init/tauX_init;
parameter_init = lambda/tauX_init;
if (x_hat_init >= parameter_init)
x_hat_init -= parameter_init;
else {
if (x_hat_init <= -parameter_init)
x_hat_init += parameter_init;
else
x_hat_init = 0.0;
}
if(std::abs(x_hat_init-X[i]) >= max_br_init)
max_br_init = std::abs(x_hat_init-X[i]);
}
double *D_col_init = new double[M]();
double *D_hat_init = new double[M*K]();
double *max_br = new double[number_of_threads]();
std::fill(max_br, max_br+number_of_threads, -9999);
D_hat_init[0:M*K:1] = D[0:M*K:1];
double col_norm_init = 0.0;
for (int i = 0; i < K; i++){
D_col_init[0:M:1] = D[(i*M):M:1];
col_norm_init = cblas_dnrm2(M, D_col_init, 1);
if(col_norm_init > constr)
D_hat_init[(i*M):M:1] *= constr/col_norm_init;
}
for (int i = 0; i < (M*K); i++){
if(std::abs(D_hat_init[i]-D[i]) >= max_br_init)
max_br_init = std::abs(D_hat_init[i]-D[i]);
}
values[iter] = 0.5*f_value;
merits[iter] = max_br_init;
times[iter] = 0.0;
iter++;
kn = std::floor((K*N)/number_of_threads);
uneven_X = (K*N % number_of_threads);
k = std::floor(K/number_of_threads);
uneven_D = (K % number_of_threads);
delete [] nabla_X_init; delete [] D_col_init; delete [] D_hat_init;
double total = omp_get_wtime();
double init_time = omp_get_wtime() - total;
int thread_id = 0;
thread_id = omp_get_thread_num();
int over_X = 0;
int over_D = 0;
if ((uneven_X != 0) && (thread_id == (number_of_threads-1)))
over_X = uneven_X;
if ((uneven_D != 0) && (thread_id == (number_of_threads-1)))
over_D = uneven_D;
double *gradient_X = new double[kn+over_X]();
double *delta_X = new double[kn+over_X]();
double *delta_D = new double[(k+over_D)*M]();
double *D_col = new double[M]();
int fe_X = thread_id*kn;
int fe_D = thread_id*k;
double end = 0.0, LipX = 0.0, LipD = 0.0, tauX = 0.0, tauD = 0.0, X_hat = 0.0, col_norm = 0.0, max_br_local = 0.0;
double *D_hat = new double[(k+over_D)*M]();
double *times_local = new double[max_iter+1]();
double *merits_local = new double[max_iter+1]();
double *values_local = new double[max_iter+1]();
int current_index_X = 0, current_index_D = 0;
int actual_iteration = 1;
times_local[0] = times[0];
merits_local[0] = merits[0];
values_local[0] = values[0];
actual_iteration_vector[thread_id] = 1;
double start = omp_get_wtime();
double gradientX_total = 0.0;
double *gradient_D = new double[(k+over_D)*M]();
double *DX = new double[M*N]();
while (iter_flag == 0 && merit_flag == 0 && time_flag == 0){
double gradientX_time = omp_get_wtime();
compute_function_gradient_X(gradient_X, D, DX, over_X, fe_X, K, M, kn);
gradientX_total += (omp_get_wtime()-gradientX_time);
double gradientD_time = omp_get_wtime();
compute_function_gradient_D(gradient_D, DX, K, M, N);
double gradientD_total = (omp_get_wtime()- gradientD_time);
printf("Gradient D total = %f \n", gradientD_total);
iter++;
if ((omp_get_wtime() - total) >= max_time)
time_flag = 1;
if (m_value <= merit_limit)
merit_flag = 1;
if (iter >= max_iter)
iter_flag = 1;
}
end = omp_get_wtime();
#pragma omp barrier
int value = 0;
for(int i = 0; i < thread_id; i++)
value += (actual_iteration_vector[i]-1);
for (int i = 0; i < (actual_iteration_vector[thread_id]-1); i++){
times[value+1+i] = times_local[i+1];
merits[value+1+i] = merits_local[i+1];
values[value+1+i] = values_local[i+1];
}
delete [] X; delete [] D; delete [] S; delete [] times; delete [] merits; delete [] values; delete [] Lip_X;
delete [] Lip_D; delete [] actual_iteration_vector; delete [] max_br; delete [] gradient_D; delete [] DX;
delete [] gradient_X; delete [] delta_X; delete [] delta_D; delete [] D_col; delete [] D_hat;
delete [] times_local; delete [] merits_local; delete [] values_local;
std::string folder = "../results/";
std::string str_1 = folder + "parameters.dat";
std::string str_2 = folder + "times.dat";
std::string str_3 = folder + "merits.dat";
std::string str_4 = folder + "values.dat";
std::string str_5 = folder + "lipx.dat";
std::string str_6 = folder + "lipd.dat";
return 0;
}
代码的含义真的无关紧要。实际上,由于我删除了很多行,因此代码不再具有任何意义。 一开始读取了一个名为“param”的文件:它只包含六个不为零的输入值:
64
64
255025
0.125
1
1000000
为了运行代码,我使用以下 cmake 文件:
project(example)
cmake_minimum_required(VERSION 2.8)
set(CMAKE_CXX_COMPILER "icc")
set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS)
set(CMAKE_CXX_FLAGS "-qopenmp -mkl=sequential")
add_executable(example main.cpp)
其中,通过命令cmake 创建生成文件。然后我执行make,最后运行二进制文件。
【问题讨论】:
-
您能提供一个示例输入文件吗?多次迭代中观察到的时间是否一致?在讨论性能问题时使用优化标志(例如
-O3)。你永远不会使用gradient_D,编译器可以完全随机优化整个事情。将icpc用作CMAKE_CXX_COMPILER以确保链接C++ 库。 -
输入文件是一个 .dat 文件,包含:
64 64 255025 0.125 1 1000000。每个数字写在不同的行中。 -
即使使用
icpc或-o3它也不会改变任何东西。
标签: c++ performance inline compiler-optimization icc