【发布时间】:2020-11-18 23:35:05
【问题描述】:
我正在对我们的库进行性能分析,并注意到大部分时间都花在了矩阵操作上。 我想看看是否可以通过更改矩阵循环的顺序或将矩阵类定义从行主要更改为列主要来提高性能。 问题:
- 下面我测试了2个案例。测试用例 1 始终是最快的,无论我的矩阵是行还是列为主。这是为什么呢?
- 启用矢量化可将测试用例 1 提高 2 倍,这是为什么呢?
性能分析是通过非常困倦完成的。 我使用 Visual Studio 2019 – platformtoolset v142,并编译为 32 位。
我们的库定义了一个矩阵模板,其中底层是一个动态数组,其中排序是主要列(完整代码如下):
Type& operator()(int row, int col)
{
return pArr[row + col * m_rows];
}
Type operator()(int row, int col) const
{
return pArr[row + col * m_rows];
}
我们还有一个特定于双精度的矩阵类:
class DMatrix : public TMatrix<double>
{
public:
// Constructors:
DMatrix() : TMatrix<double>() { }
DMatrix(int rows, int cols) : TMatrix<double>(rows, cols, true) {}
};
我运行了 2 个测试用例,它们对随机填充的矩阵执行嵌套循环操作。测试用例 1 和 2 的区别在于内部循环的顺序。
int nrep = 10000; // Large number of calculations
int nstate = 400;
int nstep = 400;
int nsec = 3; // 100 times smaller than nstate and nstep
DMatrix value(nstate, nsec);
DMatrix Rc(nstate, 3 * nstep);
DMatrix rhs(nstate, nsec);
// Test case 1
for (int k = 0; k < nrep; k++) {
for (int n = 0; n < nstep; n++) {
int diag = 3 * n + 1;
for (int i = 1; i < nstate; i++) {
for (int j = 0; j < nsec; j++) {
value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
}
}
}
}
// Test case 2
for (int k = 0; k < nrep; k++) {
for (int n = 0; n < nstep; n++) {
int diag = 3 * n + 1;
for (int j = 0; j < nsec; j++) {
for (int i = 1; i < nstate; i++) {
value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
}
}
}
}
由于矩阵是列主要的,我希望当内部循环跟随列时,我会获得最佳性能,因为附近的元素被 CPU 缓存,但相反它正在做相反的事情。请注意,nstep 和 nstate 通常比 nsec 大 100 倍。
当我打开矢量化: Code Generation/Enable Enhanced Instruction Set 中的“Advanced Vector Extensions 2”,性能差距更大:
当我关闭矢量化并使矩阵行变为主要时:
Type& operator()(int row, int col)
{
return pArr[col + row*m_cols];
}
Type operator()(int row, int col) const
{
return pArr[col + row*m_cols];
}
使用矢量优化:
完整的代码。矩阵.h:
#ifndef __MATRIX_H
#define __MATRIX_H
#include <assert.h>
#include <iostream>
template<class Type>
class TMatrix
{
public:
TMatrix(); // Default constructor
TMatrix(int rows, int cols, bool init = false); // Constructor with dimensions + flag to default initialize or not
TMatrix(const TMatrix& mat); // Copy constructor
TMatrix& operator=(const TMatrix& mat); // Assignment operator
~TMatrix(); // Destructor
// Move constructor/assignment
TMatrix(TMatrix&& mat) noexcept;
TMatrix& operator=(TMatrix&& mat) noexcept;
// Get matrix dimensions
int no_rows() const { return m_rows; }
int no_columns() const { return m_cols; }
Type& operator()(int row, int col)
{
assert(row >= 0 && row < m_rows&& col >= 0 && col < m_cols);
return pArr[row + col * m_rows]; // elements in a column lay next to each other
//return pArr[col + row*m_cols]; // elements in a row lay next to each other
}
Type operator()(int row, int col) const
{
assert(row >= 0 && row < m_rows&& col >= 0 && col < m_cols);
return pArr[row + col * m_rows];
// return pArr[col + row*m_cols];
}
protected:
void clear();
Type* pArr;
int m_rows, m_cols;
};
//**************************************************************
// Implementation of TMatrix
//**************************************************************
// Default constructor
template<class Type>
TMatrix<Type>::TMatrix()
{
m_rows = 0;
m_cols = 0;
pArr = 0;
}
// Constructor with matrix dimensions (rows, cols)
template<class Type>
TMatrix<Type>::TMatrix(int rows, int cols, bool init)
{
pArr = 0;
m_rows = rows;
m_cols = cols;
if (m_rows > 0 && m_cols > 0)
if (init)
pArr = new Type[m_rows * m_cols]();
else
pArr = new Type[m_rows * m_cols]; // TODO: check for p = NULL (memory allocation error, which will triger a GPF)
else
{
m_rows = 0;
m_cols = 0;
}
}
// Copy constructor
template<class Type>
TMatrix<Type>::TMatrix(const TMatrix& mat)
{
pArr = 0;
m_rows = mat.m_rows;
m_cols = mat.m_cols;
if (m_rows > 0 && m_cols > 0)
{
int dim = m_rows * m_cols;
pArr = new Type[dim];
for (int i = 0; i < dim; i++)
pArr[i] = mat.pArr[i];
}
else
{
m_rows = m_cols = 0;
}
}
// Move constructors
template<class Type>
TMatrix<Type>::TMatrix(TMatrix&& mat) noexcept
{
m_rows = mat.m_rows;
m_cols = mat.m_cols;
if (m_rows > 0 && m_cols > 0)
{
pArr = mat.pArr;
}
else
{
m_rows = m_cols = 0;
pArr = 0;
}
mat.pArr = 0;
}
// Clear the matrix
template<class Type>
void TMatrix<Type>::clear()
{
delete[] pArr;
pArr = 0;
m_rows = m_cols = 0;
}
// Destructor
template<class Type>
TMatrix<Type>::~TMatrix()
{
clear();
}
// Move assignment
template<class Type>
TMatrix<Type>& TMatrix<Type>::operator=(TMatrix&& mat) noexcept
{
if (this != &mat) // Check for self assignment
{
clear();
m_rows = mat.m_rows;
m_cols = mat.m_cols;
if (m_rows > 0 && m_cols > 0)
{
pArr = mat.pArr;
}
else
{
m_rows = m_cols = 0;
}
mat.pArr = nullptr;
}
return *this;
}
// Assignment operator with check for self-assignment
template<class Type>
TMatrix<Type>& TMatrix<Type>::operator=(const TMatrix& mat)
{
if (this != &mat) // Guard against self assignment
{
clear();
m_rows = mat.m_rows;
m_cols = mat.m_cols;
if (m_rows > 0 && m_cols > 0)
{
int dim = m_rows * m_cols;
pArr = new Type[dim];
for (int i = 0; i < dim; i++)
pArr[i] = mat.pArr[i];
}
else
{
m_rows = m_cols = 0;
}
}
return *this;
}
#endif
dmatrix.h:
#ifndef __DMATRIX_H
#define __DMATRIX_H
#include "matrix.h"
class DMatrix : public TMatrix<double>
{
public:
// Constructors:
DMatrix() : TMatrix<double>() { }
DMatrix(int rows, int cols) : TMatrix<double>(rows, cols, true) {}
};
#endif
主要:
#include <iostream>
#include "dmatrix.h"
int main()
{
int nrep = 10000; // Large number of calculations
int nstate = 400;
int nstep = 400;
int nsec = 3; // 100 times smaller than nstate and nstep
DMatrix value(nstate, nsec);
DMatrix Rc(nstate, 3 * nstep);
DMatrix rhs(nstate, nsec);
// Give some random input
for (int i = 0; i < Rc.no_rows(); i++) {
for (int j = 0; j < Rc.no_columns(); j++) {
Rc(i, j) = double(std::rand()) / RAND_MAX;
}
}
for (int i = 0; i < value.no_rows(); i++) {
for (int j = 0; j < value.no_columns(); j++) {
value(i, j) = 1 + double(std::rand()) / RAND_MAX;
}
}
for (int i = 0; i < rhs.no_rows(); i++) {
for (int j = 0; j < rhs.no_columns(); j++) {
rhs(i, j) = 1 + double(std::rand()) / RAND_MAX;
}
}
// Test case 1
for (int k = 0; k < nrep; k++) {
for (int n = 0; n < nstep; n++) {
int diag = 3 * n + 1;
for (int i = 1; i < nstate; i++) {
for (int j = 0; j < nsec; j++) { // Expectation: this is fast - inner loop follows row
value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
}
}
}
}
// Test case 2
for (int k = 0; k < nrep; k++) {
for (int n = 0; n < nstep; n++) {
int diag = 3 * n + 1;
for (int j = 0; j < nsec; j++) {
for (int i = 1; i < nstate; i++) { // Expectation: this is slow - inner loop walks down column
value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
}
}
}
}
return 0;
}
提前感谢您的帮助。 最好的祝福, 内尔
【问题讨论】:
-
#define __DMATRIX_H此标识符保留给语言实现。通过定义它,您的程序将具有未定义的行为。您应该使用另一个标头保护。 -
Rc是最大的矩阵,访问它的索引独立于j。如果内部循环在j上,您可以在所有nsec迭代中使用Rc(i, diag - 1)的值。 -
@BennyK 误读了。感谢您指出。
标签: c++ arrays performance loops dynamic