【发布时间】:2019-07-17 15:43:27
【问题描述】:
我为矩阵乘法编写了下面这个刻意构造的示例,只是为了检查以不同方式声明数组会如何影响性能。令我惊讶的是,在声明时大小已知的普通数组,其性能反而不如可分配(allocatable)/指针数组。我原以为 allocatable 只是在数组太大、无法放入栈时才需要。下面是代码,以及分别使用 gfortran 和英特尔 Fortran 编译器得到的计时结果。平台为 Windows 10,两个编译器分别使用编译选项 -Ofast 和 -fast。
!> Benchmark of a hand-written matrix product c = a * b on n-by-n matrices.
!>
!> The arrays are declared with a compile-time-known shape ("plain" arrays).
!> To time the allocatable variant instead, swap the array declaration for the
!> commented-out allocatable declaration and enable the commented-out
!> allocate statement below.
!>
!> Only the triple loop that forms c is timed; initialisation and the
!> transpose of a are outside the timed region.
program matrix_multiply
  use, intrinsic :: iso_fortran_env, only: dp => real64, int64
  implicit none

  integer, parameter :: n = 1500

  real(dp) :: a(n,n), b(n,n), c(n,n), aT(n,n)   ! plain arrays
  ! real(dp), allocatable :: a(:,:), b(:,:), c(:,:), aT(:,:)   ! allocatable arrays

  integer :: i, j, k
  ! 64-bit clock variables: compilers commonly expose a finer tick through
  ! wider integer kinds than through default integers.
  integer(int64) :: ts, te, count_rate
  real(dp) :: tmp

  ! allocate ( a(n,n), b(n,n), c(n,n), aT(n,n) )

  ! Fill a and b with the same deterministic values. The first index is the
  ! innermost loop because Fortran stores arrays in column-major order.
  do j = 1, n
    do i = 1, n
      a(i,j) = 1.0_dp/n/n * (i-j) * (i+j)
      b(i,j) = a(i,j)
    end do
  end do

  ! Transpose for cache-friendliness: aT(:,i) holds row i of a, so the timed
  ! inner loop below walks both operands along their first index.
  do j = 1, n
    do i = 1, n
      aT(i,j) = a(j,i)
    end do
  end do

  ! Timed region: c(i,j) = dot product of row i of a with column j of b.
  ! The loop nest is intentionally left exactly as benchmarked.
  call system_clock(ts, count_rate)
  do i = 1, n
    do j = 1, n
      tmp = 0.0_dp
      do k = 1, n
        tmp = tmp + aT(k,i) * b(k,j)
      end do
      c(i,j) = tmp
    end do
  end do
  call system_clock(te)

  ! Print one element of c so the compiler cannot discard the computation.
  print '(4G0)', "Elapsed time: ", real(te-ts)/count_rate,', c_(n/2+1) = ', c(n/2+1,n/2+1)
end program matrix_multiply
计时结果如下:
! Intel Fortran
! -------------
Elapsed time: 1.546000, c_(n/2+1) = -143.8334 ! Plain Arrays
Elapsed time: 1.417000, c_(n/2+1) = -143.8334 ! Allocatable Arrays
! gfortran:
! -------------
Elapsed time: 1.827999, c_(n/2+1) = -143.8334 ! Plain Arrays
Elapsed time: 1.702999, c_(n/2+1) = -143.8334 ! Allocatable Arrays
我的问题是为什么会这样?可分配数组是否为编译器提供了更多保证以更好地优化?在 Fortran 中处理固定大小的数组时,一般来说最好的建议是什么?
冒着让问题变得冗长的风险,下面再给出一个示例,其中英特尔 Fortran 编译器表现出同样的行为:
!> Benchmark comparing the same element-wise update on three kinds of array:
!> a plain (fixed-shape) array, an allocatable array and a pointer array.
!>
!> Each array is filled with random numbers, then the update is applied
!> n_repeat times to its leading sz-by-sz block while the elapsed time is
!> measured. The sum of the top-left 3x3 block is printed after each run so
!> the compiler cannot discard the work.
program testArrays
  use, intrinsic :: iso_fortran_env, only: dp => real64, int64
  implicit none

  integer, parameter :: m = 1223, n = 2015
  integer, parameter :: sz = min(m, n)      ! edge of the square block updated
  integer, parameter :: n_repeat = 1000     ! update calls per timed run
  real(dp), parameter :: pi = acos(-1.0_dp)

  real(dp) :: a(m,n)
  real(dp), allocatable :: b(:,:)
  real(dp), pointer :: c(:,:)
  integer :: i
  ! 64-bit clock variables: compilers commonly expose a finer tick through
  ! wider integer kinds than through default integers.
  integer(int64) :: t0, t1, count_rate

  allocate( b(m,n), c(m,n) )
  call random_seed()
  call random_number(a)
  call random_number(b)
  call random_number(c)

  ! Plain array.
  call system_clock(t0, count_rate)
  do i = 1, n_repeat
    call doit(a, sz)
  end do
  call system_clock(t1)
  print '(4g0)', 'Time plain: ', real(t1-t0)/count_rate, ', sum 3x3 = ', sum( a(1:3,1:3) )

  ! Allocatable array, passed to the same assumed-shape routine.
  call system_clock(t0)
  do i = 1, n_repeat
    call doit(b, sz)
  end do
  call system_clock(t1)
  print '(4g0)', 'Time alloc: ', real(t1-t0)/count_rate, ', sum 3x3 = ', sum( b(1:3,1:3) )

  ! Pointer array, passed to the routine that takes a pointer dummy.
  call system_clock(t0)
  do i = 1, n_repeat
    call doitp(c, sz)
  end do
  call system_clock(t1)
  print '(4g0)', 'Time p.ptr: ', real(t1-t0)/count_rate, ', sum 3x3 = ', sum( c(1:3,1:3) )

  ! Release the explicitly allocated storage, including the pointer target.
  deallocate( b, c )

contains

  !> Replace the leading sz-by-sz block of a by sin(2*pi*a)/(a+1), in place.
  !> a  : assumed-shape array, updated in place
  !> sz : edge length of the leading square block to update
  subroutine doit(a, sz)
    real(dp), intent(inout) :: a(:,:)
    integer, intent(in) :: sz
    a(1:sz,1:sz) = sin(2*pi*a(1:sz,1:sz))/(a(1:sz,1:sz)+1)
  end subroutine doit

  !> Same update as doit, but the array is received as a pointer dummy.
  !> a  : pointer to the array; its target is updated in place
  !> sz : edge length of the leading square block to update
  subroutine doitp(a, sz)
    ! intent(in) on a pointer dummy protects the association only; the
    ! target's values may still be modified.
    real(dp), pointer, intent(in) :: a(:,:)
    integer, intent(in) :: sz
    a(1:sz,1:sz) = sin(2*pi*a(1:sz,1:sz))/(a(1:sz,1:sz)+1)
  end subroutine doitp

end program testArrays
ifort计时:
Time plain: 2.857000, sum 3x3 = -.9913536
Time alloc: 2.750000, sum 3x3 = .4471794
Time p.ptr: 2.786000, sum 3x3 = 2.036269
gfortran 的时间要长得多,但符合我的预期:
Time plain: 51.5600014, sum 3x3 = 6.2749456118192093
Time alloc: 54.0300007, sum 3x3 = 6.4144775892064283
Time p.ptr: 54.1900034, sum 3x3 = -2.1546109819149963
【问题讨论】:
标签: arrays performance fortran