Following JackOLantern's answer, I'm trying to compute a batch of 1D FFTs using cufftPlanMany.
The code below performs, nwfs=23 times, the forward and inverse 1D FFT of an n=256 complex array. Its purpose is to teach me how to handle the routine cufftPlanMany; as a second step, the nwfs arrays will be different. At the end, I check the error of each array.
Because the data are allocated as cinput_d(n,nwfs),
I call the function like this: cufftPlanMany(planmany, 1, fftsize, inembed, nwfs, 1, onembed, nwfs, 1, CUFFT_C2C, nwfs)
where:
rank = 1
fftsize = {n}: same length for each FFT
inembed = onembed = {0}: ignored
istride = ostride = nwfs: distance between two successive input/output elements
idist = odist = 1: distance between two signals
batch = nwfs: number of FFTs to perform
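For reference, here is the same call annotated argument by argument (my annotations, following the cuFFT advanced data layout). One aside, as far as I can tell: for cinput_d(n,nwfs), where each signal occupies one contiguous column, the layout would more naturally be istride = 1 with idist = n; istride = nwfs with idist = 1 describes element-interleaved signals instead. Since the test only does a forward transform followed by an inverse one, either description round-trips the data, so the check can still pass.
call cufftPlanMany(planmany,  &
     1,                       & ! rank: 1-D transforms
     fftsize,                 & ! fftsize(:) = n, length of each transform
     inembed, nwfs, 1,        & ! inembed (zeros here), istride, idist
     onembed, nwfs, 1,        & ! onembed (zeros here), ostride, odist
     CUFFT_C2C, nwfs)           ! transform type and batch count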
program fft
use cudafor
use precision_m
use cufft_m
implicit none
integer, allocatable:: kx(:)
complex(fp_kind), allocatable:: matrix(:)
complex(fp_kind), allocatable, pinned :: cinput(:,:),coutput(:,:)
complex(fp_kind), allocatable, device :: cinput_d(:,:),coutput_d(:,:)
integer:: i,j,k,n,nwfs
integer, allocatable :: fftsize(:),inembed(:),onembed(:)
type(c_ptr):: plan,planmany
real(fp_kind):: twopi=8._fp_kind*atan(1._fp_kind),h
integer::clock_start,clock_end,clock_rate,istat
real :: elapsed_time
character*1:: a
real(fp_kind):: w,x,y,z
integer:: nerrors
n=256
nwfs=23
h=twopi/real(n,fp_kind)
! allocate arrays on the host
allocate (cinput(n,nwfs),coutput(n,nwfs))
allocate (kx(n),matrix(n))
allocate (fftsize(nwfs),inembed(nwfs),onembed(nwfs))
! allocate arrays on the device
allocate (cinput_d(n,nwfs),coutput_d(n,nwfs))
fftsize(:) = n
inembed(:) = 0
onembed(:) = 0
!initialize arrays on host
kx =(/ ((i-0.5)*0.1953125, i=1,n/2), ((-n+i-0.5)*0.1953125, i=n/2+1,n) /)
matrix = (/ ... /)
!write(*,*) cinput
!copy arrays to device
do i =1,nwfs
cinput(:,i)=matrix(:)
end do
cinput_d=cinput
! Initialize the plan for complex to complex transform
if (fp_kind== singlePrecision) call cufftPlan1D(plan,n,CUFFT_C2C,1)
if (fp_kind== doublePrecision) call cufftPlan1D(plan,n,CUFFT_Z2Z,1)
if (fp_kind== doublePrecision) call cufftPlanMany(planmany, 1, fftsize, inembed, &
nwfs,1, &
onembed, &
nwfs,1, &
CUFFT_Z2Z, nwfs)
if (fp_kind== singlePrecision) call cufftPlanMany(planmany, 1, fftsize, inembed, &
nwfs,1, &
onembed, &
nwfs,1, &
CUFFT_C2C, nwfs)
!c_null_ptr fftsize,inembed,onembed
! cufftPlanMany(plan, rank, n, inembed, istride, idist, &
! onembed, ostride, odist, &
! type, batch)
!subroutine cufftPlan1d(plan, nx, type, batch)
call SYSTEM_CLOCK(COUNT_RATE=clock_rate)
istat=cudaThreadSynchronize()
call SYSTEM_CLOCK(count=clock_start)
! Forward transform out of place
call cufftExec(planmany,cinput_d,coutput_d,CUFFT_FORWARD)
!$cuf kernel do <<<*,*>>>
do i=1,n
do j=1,nwfs ! second dimension of coutput_d is nwfs, not n
coutput_d(i,j) = coutput_d(i,j)/real(n,fp_kind)
end do
end do
call cufftExec(planmany,coutput_d,coutput_d,CUFFT_INVERSE)
istat=cudaThreadSynchronize()
call SYSTEM_CLOCK(count=clock_end)
! Copy results back to host
coutput=coutput_d
do i=1,n
! write(*,'(i2,1x,2(f8.4),1x,2(f8.4),2x,e13.7)') i,cinput(i),coutput(i),abs(coutput(i)-cinput(i))
end do
nerrors=0
do i=1,n
!write(*,'(i2,5(1x,2(f8.4),1x,2(f8.4),2x,3(e13.7,2x)))') i,cinput(i,1),coutput(i,1),abs(coutput(i,1)-cinput(i,1)),abs(coutput(i,6)-cinput(i,6)),abs(coutput(i,nwfs)-cinput(i,nwfs))
do j=1,nwfs
if (abs(coutput(i,j)-cinput(i,j))>1.d-5) then
write(*,'(i3,i3,1x,e13.7,2x,4(f8.4))') i,j,abs(coutput(i,j)-cinput(i,j)),cinput(i,j),coutput(i,j)
nerrors = nerrors + 1
end if
end do
end do
elapsed_time = REAL(clock_end-clock_start)/REAL(clock_rate)
write(*,*) 'elapsed_time :',elapsed_time,clock_start,clock_end,clock_rate
if (nerrors .eq. 0) then
print *, "Test Passed"
else
print *, "Test Failed"
endif
!release memory on the host and on the device
deallocate (cinput,coutput,kx,cinput_d,coutput_d)
! Destroy the plans
call cufftDestroy(plan)
call cufftDestroy(planmany)
end program fft
Can somebody tell me why this "many-FFT" sometimes fails in double precision but never in single precision?
Single precision: "Test Passed", always!
Double precision: "Test Failed", sometimes!
Indeed, I checked the device-to-host data transfer; that doesn't seem to be it.
Thanks for any help.
Thanks to talonmies. It was the WDDM Timeout Detection & Recovery (TDR) limit: the slower double-precision run sometimes exceeded the watchdog's time limit, so the kernel was killed mid-flight.
See the link to change the TDR settings (the TdrLevel/TdrDelay values under HKLM\SYSTEM\CurrentControlSet\Control\GraphicsDrivers).
Related
I created a CUDA stream in this way:
integer(kind=cuda_stream_kind) :: stream1
istat = cudaStreamCreate(stream1)
to use it for a cuFFT plan:
err_dir = err_dir + cufftPlan2D(plan_dir1,NY,NY,CUFFT_D2Z)
err_dir = err_dir + cufftSetStream(plan_dir1,stream1)
In the routine that executes the FFT, I pass plan_dir1, and I have:
subroutine new_fft_dir(z,plan)
!$acc host_data use_device(z)
ierr = ierr + cufftExecD2Z(plan,z,z)
!$acc end host_data
!$acc parallel loop collapse(2) present(z)
do i=1,NXP2
do j=1,NY
z(i,j) = z(i,j)/NY**2
enddo
enddo
!$acc end parallel loop
I would like to set an OpenACC stream equal to the CUDA stream stream1, but using:
integer(kind=cuda_stream_kind) :: stream1
istat = cudaStreamCreate(stream1)
integer :: stream
istat = cudaStreamCreate(stream1)
acc_set_cuda_stream(stream,stream1)
I get: NVFORTRAN-S-0034-Syntax error at or near end of line (main.f90: 48)
My goal is to add the async clause to
!$acc parallel loop collapse(2) present(z) async(stream)
do i=1,NXP2
do j=1,NY
z(i,j) = z(i,j)/NY**2
enddo
enddo
!$acc end parallel loop
to have this loop and the fft on the same CUDA stream.
Could the problem be that I use integer(kind=cuda_stream_kind) instead of cudaStream_t stream?
"acc_set_cuda_stream" is a subroutine so you do need to add "call " before it. Also, variables need to be declared before executable code, hence "integer :: stream" needs to be moved up a line.
use cudafor
use openacc
integer(kind=cuda_stream_kind) :: stream1
integer :: stream
istat = cudaStreamCreate(stream1)
call acc_set_cuda_stream(stream,stream1)
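Putting the pieces together, a minimal self-contained sketch of the intended wiring might look like this (my sketch, assuming the NVHPC compiler, which provides acc_set_cuda_stream; NXP2 and NY are placeholder sizes, the cuFFT calls from the question are elided, and error checking is omitted):
program acc_stream_demo
  use cudafor
  use openacc
  implicit none
  integer, parameter :: NXP2 = 130, NY = 128
  integer(kind=cuda_stream_kind) :: stream1
  integer :: stream, istat, i, j
  double precision :: z(NXP2, NY)

  stream = 1                                ! OpenACC async queue id (arbitrary choice)
  istat  = cudaStreamCreate(stream1)        ! CUDA stream, as in the question
  call acc_set_cuda_stream(stream, stream1) ! queue 'stream' now maps onto 'stream1'
  ! ... create the cuFFT plan and call cufftSetStream(plan_dir1, stream1) here,
  !     then execute the transform inside !$acc host_data, as in the question ...

  z = 1.d0
  !$acc data copy(z)
  !$acc parallel loop collapse(2) present(z) async(stream)
  do i = 1, NXP2
     do j = 1, NY
        z(i,j) = z(i,j)/NY**2
     enddo
  enddo
  !$acc wait(stream)                        ! the loop ran on the same CUDA stream as the FFT
  !$acc end data
end program acc_stream_demo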
Consider the following CUDA kernel, which computes the mean of each row of a 2-D matrix.
using CUDA
function mean!(x, n, out)
"""out[i] = mean of row i of x"""
row_idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
for i = 1:n
@inbounds out[row_idx] += x[row_idx, i]
end
out[row_idx] /= n
return
end
using Test
nrow, ncol = 1024, 10
x = CuArray{Float64, 2}(rand(nrow, ncol))
y = CuArray{Float64, 1}(zeros(nrow))
@cuda threads=256 blocks=4 mean!(x, size(x)[2], y)
@test isapprox(y, vec(sum(x, dims=2)) ./ ncol) # test passed
Also consider the following CUDA kernel
function add!(a, b, c)
""" c = a .+ b """
i = (blockIdx().x-1) * blockDim().x + threadIdx().x
c[i] = a[i] + b[i]
return
end
a = CuArray{Float64, 1}(zeros(nrow))
b = CuArray{Float64, 1}(ones(nrow))
c = CuArray{Float64, 1}(zeros(nrow))
@cuda threads=256 blocks=4 add!(a, b, c)
@test all(c .== a .+ b) # test passed
Now, suppose I wanted to write another kernel that uses the intermediate results of mean!(). For example,
function g(x, y)
""" mean(x, dims=2) + mean(y, dims=2) """
xrow, xcol = size(x)
yrow, ycol = size(y)
mean1 = CuArray{Float64, 1}(undef, xrow)
@cuda threads=256 blocks=4 mean!(x, xcol, mean1)
mean2 = CuArray{Float64, 1}(zeros(yrow))
@cuda threads=256 blocks=4 mean!(y, ycol, mean2)
out = CuArray{Float64, 1}(zeros(yrow))
@cuda threads=256 blocks=4 add!(mean1, mean2, out)
return out
end
(Of course, g() isn't technically a kernel since it returns something.)
My question is whether g() is "correct". In particular, is g() wasting time by transferring data between the GPU/CPU?
For example, if my understanding is correct, one way g() could be optimized is by initializing mean2 the same way we initialize mean1. This is because when constructing mean2, we're actually first creating zeros(yrow) on the CPU, then passing this to the CuArray constructor to be copied to the GPU. In contrast, mean1 is constructed but uninitialized (due to the undef) and therefore avoids this extra transfer.
To summarize, how do I save/use intermediate kernel results while avoiding data transfers between the CPU/GPU as much as possible?
You can generate arrays or vectors of zeros directly on GPU!
Try:
CUDA.zeros(Float64, nrow)
Some benchmarks:
julia> @btime CUDA.zeros(Float64, 1000,1000)
12.600 μs (26 allocations: 1.22 KiB)
1000×1000 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
...
julia> @btime CuArray(zeros(1000,1000))
3.551 ms (8 allocations: 7.63 MiB)
1000×1000 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
...
I'm trying to write a program to interface cusolverSp with Fortran. Although I'm no stranger to coding CUDA in C, I'm unsure how to do it in Fortran.
The following is my code:
! Fortran Console Application
!
module cuda_cusolverSP
interface
! cudaMalloc
integer (c_int) function cudaMalloc ( buffer, size ) bind (C, name="cudaMalloc" )
use iso_c_binding
implicit none
type (c_ptr) :: buffer
integer (c_size_t), value :: size
end function cudaMalloc
! cudaMemcpy
integer (c_int) function cudaMemcpy ( dst, src, count, kind ) bind (C, name="cudaMemcpy" )
! note: cudaMemcpyHostToDevice = 1
! note: cudaMemcpyDeviceToHost = 2
use iso_c_binding
type (C_PTR), value :: dst, src
integer (c_size_t), value :: count, kind
end function cudaMemcpy
! cudaFree
integer (c_int) function cudaFree(buffer) bind(C, name="cudaFree")
use iso_c_binding
implicit none
type (C_PTR), value :: buffer
end function cudaFree
integer (c_int) function cudaMemGetInfo(fre, tot) bind(C, name="cudaMemGetInfo")
use iso_c_binding
implicit none
type(c_ptr),value :: fre
type(c_ptr),value :: tot
end function cudaMemGetInfo
integer(c_int) function cusolverSpCreate(cusolver_Hndl) bind(C,name="cusolverSpCreate")
use iso_c_binding
implicit none
type(c_ptr)::cusolver_Hndl
end function
integer(c_int) function cusolverSpDestroy(cusolver_Hndl) bind(C,name="cusolverSpDestroy")
use iso_c_binding
implicit none
type(c_ptr),value::cusolver_Hndl
end function
integer(c_int) function cusolverSpSgetrf_bufferSize(cusolver_Hndl,m,n,d_A,lda,Lwork) bind(C,name="cusolverSpSgetrf_bufferSize")
use iso_c_binding
implicit none
type(c_ptr),value::cusolver_Hndl
integer(c_int),value::m
integer(c_int),value::n
type(c_ptr),value::d_A
integer(c_int),value::lda
type(c_ptr),value::Lwork
end function
integer(c_int) function cusolverSpSgetrf(cusolver_Hndl,m,n,d_A,lda,d_WS,d_Ipiv,d_devInfo) bind(C, name="cusolverSpSgetrf")
use iso_c_binding
implicit none
type(c_ptr),value::cusolver_Hndl
integer(c_int),value::m
integer(c_int),value::n
type(c_ptr),value::d_A
integer(c_int),value::lda
type(c_ptr),value::d_WS
type(c_ptr),value::d_Ipiv
type(c_ptr),value::d_devInfo
end function
integer (c_int) function cusolverSpSgetrs(cusolver_Hndl,trans,n,nrhs,d_A,lda,d_Ipiv,d_B,ldb,d_devInfo) bind(C, name="cusolverSpSgetrs")
use iso_c_binding
implicit none
type(c_ptr),value::cusolver_Hndl
integer(c_int), value::trans
integer(c_int), value::n
integer(c_int), value::nrhs
type(c_ptr),value::d_A
integer(c_int), value::lda
type(c_ptr),value::d_Ipiv
type(c_ptr),value::d_B
integer(c_int),value::ldb
type(c_ptr),value::d_devInfo
end function
end interface
end module
program prog
use iso_c_binding
use cuda_cusolverSP
! ------ Matrix Definition & host CPU storage variables
integer(c_int) rowsA ! number of rows of A
integer(c_int) colsA ! number of columns of A
integer(c_int) nnzA ! number of nonzeros of A
integer(c_int) baseA ! base index in CSR format
! CSR(A) from I/O <--- pointers to host CPU memory
type(c_ptr) :: h_csrRowPtrA
type(c_ptr) :: h_csrColIndA
type(c_ptr) :: h_csrValA
type(c_ptr) :: h_x ! x = A \ b
type(c_ptr) :: h_b ! b = ones(m,1)
type(c_ptr) :: h_r ! r = b - A*x
type(c_ptr) :: h_Q ! <int> n
! reorder to reduce zero fill-in
! Q = symrcm(A) or Q = symamd(A)
! B = Q*A*Q^T
type(c_ptr) :: h_csrRowPtrB ! <int> n+1
type(c_ptr) :: h_csrColIndB ! <int> nnzA
type(c_ptr) :: h_csrValB ! <double> nnzA
type(c_ptr) :: h_mapBfromA ! <int> nnzA
integer size_perm
type(c_ptr) :: buffer_cpu ! working space for permutation: B = Q*A*Q^T
! -------------------- pointers to device memory
type(c_ptr) :: d_csrRowPtrA
type(c_ptr) :: d_csrColIndA
type(c_ptr) :: d_csrValA
type(c_ptr) :: d_x ! x = A \ b
type(c_ptr) :: d_b ! a copy of h_b
type(c_ptr) :: d_r ! r = b - A*x
doubleprecision tol
integer reorder
integer singularity
type(c_ptr)::cpfre,cptot
integer*8,target::free,total
integer res
integer*8 cudaMemcpyDeviceToHost, cudaMemcpyHostToDevice
integer*4 CUBLAS_OP_N, CUBLAS_OP_T
parameter (cudaMemcpyHostToDevice=1)
parameter (cudaMemcpyDeviceToHost=2)
parameter (CUBLAS_OP_N=0)
parameter (CUBLAS_OP_T=1)
! ==================================================================
rowsA = 0
colsA = 0
nnzA = 0
baseA = 0
A_size = SIZEOF(rowsA)
B_size = SIZEOF(B)
X_size = SIZEOF(X)
size_perm = 0
tol = 1.e-12
reorder = 0 ! no reordering
singularity = 0 ! -1 if A is invertible under tol.
! Step 1: Create cudense handle ---------------
cusolver_stat = cusolverSpCreate(cusolver_Hndl)
if (cusolver_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cusolverSpCreate error: ", cusolver_stat
write (*,*)
stop
end if
! Step 2: copy A and B to Device
A_mem_stat = cudaMalloc(d_A,A_size)
if (A_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMalloc 1 error: ", A_mem_stat
write (*,*)
stop
end if
B_mem_stat = cudaMalloc(d_B,B_size)
if (B_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMalloc 2 error: ", B_mem_stat
write (*,*)
stop
end if
! ---------- copy A and B to Device
A_mem_stat = cudaMemcpy(d_A,CPU_A_ptr,A_size,cudaMemcpyHostToDevice)
if (A_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMemcpy 1 error: ", A_mem_stat
write (*,*)
! stop
end if
B_mem_stat = cudaMemcpy(d_B,CPU_B_ptr,B_size,cudaMemcpyHostToDevice)
if (B_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMemcpy 2 error: ", B_mem_stat
write (*,*)
! stop
end if
! Step 3: query working space of Sgetrf (and allocate memory on device)
Lwork = 5
cusolver_stat = cusolverSpSgetrf_bufferSize(cusolver_Hndl,m,n,d_A,lda,CPU_Lwork_ptr)
if (cusolver_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " SpSgetrf_bufferSize error: ", cusolver_stat
write (*,*)
! stop
end if
write (*,*)
write (*, '(A, I12)') " Lwork: ", Lwork
write (*,*)
Workspace = 4*Lwork
WS_mem_stat = cudaMalloc(d_WS,Workspace)
if (WS_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMalloc 6 error: ", WS_mem_stat
write (*,*)
! stop
end if
! Step 4: compute LU factorization of [A]
cusolver_stat = cusolverSpSgetrf(cusolver_Hndl,m,n,d_A,lda,d_WS,d_Ipiv,d_devInfo)
if (cusolver_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cusolverSpSgetrf error: ", WS_mem_stat
write (*,*)
! stop
end if
! Step 5: compute solution vector [X] for Right hand side [B]
cusolver_stat = cusolverSpSgetrs(cusolver_Hndl,CUBLAS_OP_N,n,nrhs,d_A,lda,d_Ipiv,d_B,ldb,d_devInfo)
if (cusolver_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cusolverSpSgetrs error: ", WS_mem_stat
write (*,*)
! stop
end if
! Step 6: copy solution vector stored in [B] on device into [X] vector on host
X_mem_stat = cudaMemcpy(CPU_X_ptr,d_B,B_size,cudaMemcpyDeviceToHost)
if (X_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMemcpy 4 error: ", WS_mem_stat
write (*,*)
! stop
end if
! do i = 1, n
! print *, x(i,1)
! enddo
! step 7: free memory on device and release CPU-side resources
A_mem_Stat = cudafree(d_A)
B_mem_Stat = cudafree(d_B)
Ipiv_mem_stat = cudafree(d_Ipiv)
WS_mem_stat = cudafree(d_WS)
Lwork_mem_stat = cudafree(d_Lwork)
cusolver_stat = cusolverSpDestroy(cusolver_Hndl)
! Step 8: deallocate memory on host before exit
! deallocate(A)
! deallocate(ATest)
! deallocate(B)
! deallocate(X)
! deallocate(Ipiv)
end program prog
The current error during my build is
error S0188: Argument number # to cusolverspcreate/etc : type mismatch
which I have no idea how to fix. This program is a modification of a working cusolverDn program, which I'm sure means I've made a bunch of mistakes, as there aren't many interfacing samples I can refer to.
You have no implicit none in your main program, and cusolver_Hndl is not declared, so it is implicitly assumed to be real.
Use implicit none and declare all your variables. cusolver_Hndl should be type(c_ptr); and don't forget to set its value if it is not an output argument (the interface does not show any intent).
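Concretely, the top of the main program would then look something like this (a minimal sketch showing only the declarations relevant to the reported error; the remaining variables from the question need the same treatment):
program prog
  use iso_c_binding
  use cuda_cusolverSP
  implicit none                    ! every variable must now be declared explicitly
  type(c_ptr)    :: cusolver_Hndl  ! the handle is a C pointer, not an implicitly typed real
  integer(c_int) :: cusolver_stat
  ! ... declare rowsA, colsA, the h_*/d_* pointers, and so on here ...

  cusolver_stat = cusolverSpCreate(cusolver_Hndl)  ! now type-checks against the interface
end program prog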
I am trying to perform a reduction in CUDA Fortran; what I have done so far is shown below, performing the reduction in two steps (see the CUDA kernels below).
In the first kernel I do some simple computation, and I declare a shared array for a block of threads to store the value of abs(a - anew); once the threads are synchronized, I compute the max value of this shared array, which I store in an intermediate array of dimension gridDim%x * gridDim%y.
In the second kernel, I read this array (in a single block of threads) and try to compute its max value.
Here is the whole code:
module commons
integer, parameter :: dp=kind(1.d0)
integer, parameter :: nx=1024, ny=1024
integer, parameter :: block_dimx=16, block_dimy=32
end module commons
module kernels
use commons
contains
attributes(global) subroutine kernel_gpu_reduce(a, anew, error, nxi, nyi)
implicit none
integer, value, intent(in) :: nxi, nyi
real(dp), dimension(nxi,nyi), intent(in) :: a
real(dp), dimension(nxi,nyi), intent(inout) :: anew
real(dp), dimension(nxi/block_dimx+1,nyi/block_dimy+1), intent(inout) :: error
real(dp), shared, dimension(block_dimx,block_dimy) :: err_sh
integer :: i, j, k, tx, ty
i = (blockIdx%x - 1)*blockDim%x + threadIdx%x
j = (blockIdx%y - 1)*blockDim%y + threadIdx%y
tx = threadIdx%x
ty = threadIdx%y
if (i > 1 .and. i < nxi .and. j > 1 .and. j < nyi) then
anew(i,j) = 0.25d0*(a(i-1,j) + a(i+1,j) &
& + a(i,j-1) + a(i,j+1))
err_sh(tx,ty) = abs(anew(i,j) - a(i,j))
endif
call syncthreads()
error(blockIdx%x,blockIdx%y) = maxval(err_sh)
end subroutine kernel_gpu_reduce
attributes(global) subroutine max_reduce(local_error, error, nxi, nyi)
implicit none
integer, value, intent(in) :: nxi, nyi
real(dp), dimension(nxi,nyi), intent(in) :: local_error
real(dp), intent(out) :: error
real(dp), shared, dimension(nxi) :: shared_error
integer :: tx, i
tx = threadIdx%x
shared_error(tx) = 0.d0
if (tx >=1 .and. tx <= nxi) shared_error(tx) = maxval(local_error(tx,:))
call syncthreads()
error = maxval(shared_error)
end subroutine max_reduce
end module kernels
program laplace
use cudafor
use kernels
use commons
implicit none
real(dp), allocatable, dimension(:,:) :: a, anew
real(dp) :: error=1.d0
real(dp), device, allocatable, dimension(:,:) :: adev, adevnew
real(dp), device, allocatable, dimension(:,:) :: edev
real(dp), allocatable, dimension(:,:) :: ehost
real(dp), device :: error_dev
integer :: i
integer :: num_device, h_status, ierrSync, ierrAsync
type(dim3) :: dimGrid, dimBlock
num_device = 0
h_status = cudaSetDevice(num_device)
dimGrid = dim3(nx/block_dimx+1, ny/block_dimy+1, 1)
dimBlock = dim3(block_dimx, block_dimy, 1)
allocate(a(nx,ny), anew(nx,ny))
allocate(adev(nx,ny), adevnew(nx,ny))
allocate(edev(dimGrid%x,dimGrid%y), ehost(dimGrid%x,dimGrid%y))
do i = 1, nx
a(i,:) = 1.d0
anew(i,:) = 1.d0
enddo
adev = a
adevnew = anew
call kernel_gpu_reduce<<<dimGrid, dimBlock>>>(adev, adevnew, edev, nx, ny)
ierrSync = cudaGetLastError()
ierrAsync = cudaDeviceSynchronize()
if (ierrSync /= cudaSuccess) write(*,*) &
& 'Sync kernel error - 1st kernel:', cudaGetErrorString(ierrSync)
if (ierrAsync /= cudaSuccess) write(*,*) &
& 'Async kernel error - 1st kernel:', cudaGetErrorString(ierrAsync)
call max_reduce<<<1, dimGrid%x>>>(edev, error_dev, dimGrid%x, dimGrid%y)
ierrSync = cudaGetLastError()
ierrAsync = cudaDeviceSynchronize()
if (ierrSync /= cudaSuccess) write(*,*) &
& 'Sync kernel error - 2nd kernel:', cudaGetErrorString(ierrSync)
if (ierrAsync /= cudaSuccess) write(*,*) &
& 'Async kernel error - 2nd kernel:', cudaGetErrorString(ierrAsync)
error = error_dev
print*, 'error from kernel: ', error
ehost = edev
error = maxval(ehost)
print*, 'error from host: ', error
deallocate(a, anew, adev, adevnew, edev, ehost)
end program laplace
I first had a problem because of the kernel configuration of the second kernel (which was <<<1, dimGrid>>>); I modified the code following Robert's answer. Now I have a memory access error:
Async kernel error - 2nd kernel:
an illegal memory access was encountered
0: copyout Memcpy (host=0x666bf0, dev=0x4203e20000, size=8) FAILED: 77(an illegal memory access was encountered)
And, if I run it with cuda-memcheck:
========= Invalid __shared__ write of size 8
========= at 0x00000060 in kernels_max_reduce_
========= by thread (1,0,0) in block (0,0,0)
========= Address 0x00000008 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/libcuda.so (cuLaunchKernel + 0x2c5) [0x14ad95]
for every thread.
The code is compiled with PGI Fortran 14.9 and CUDA 6.5 on a Tesla K20 card (with CUDA capability 3.5). I compile it with:
pgfortran -Mcuda -ta=nvidia,cc35 laplace.f90 -o laplace
You can do proper cuda error checking in CUDA Fortran. You should do so in your code.
One problem is that you're trying to launch too many threads (per block) in your second kernel:
call max_reduce<<<1, dimGrid>>>(edev, error_dev, dimGrid%x, dimGrid%y)
^^^^^^^
The dimGrid parameter has previously been computed to be:
dimGrid = dim3(nx/block_dimx+1, ny/block_dimy+1, 1)
Substituting actual values, we have:
dimGrid = dim3(1024/16 + 1, 1024/32 + 1)
i.e.
dimGrid = dim3(65,33);
But you are not allowed to request 65*33 = 2145 threads per block. The maximum is either 512 or 1024 depending on what device architecture target you are compiling for.
Because of this error, your second kernel is not running at all.
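If you are unsure of the per-block limit on a given device, you can query it at runtime; here is a minimal sketch using CUDA Fortran's cudaGetDeviceProperties:
program query_limits
  use cudafor
  implicit none
  type(cudaDeviceProp) :: prop
  integer :: istat
  istat = cudaGetDeviceProperties(prop, 0)             ! properties of device 0
  print *, 'maxThreadsPerBlock =', prop%maxThreadsPerBlock
  print *, 'maxThreadsDim      =', prop%maxThreadsDim  ! per-dimension block limits
end program query_limits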
I've been working on a Fortran code which uses the cuBLAS batched LU factorization and the cuSPARSE batched tridiagonal solver as part of a BiCG iterative solver with an ADI preconditioner. I'm using a Kepler K20X with compute capability 3.5 and CUDA 5.5. I'm doing this without PGI's CUDA Fortran, so I'm writing my own interfaces:
FUNCTION cublasDgetrfBatched(handle, n, dA, ldda, dP, dInfo, nbatch) BIND(C, NAME="cublasDgetrfBatched")
USE, INTRINSIC :: ISO_C_BINDING
INTEGER(KIND(CUBLAS_STATUS_SUCCESS)) :: cublasDgetrfBatched
TYPE(C_PTR), VALUE :: handle
INTEGER(C_INT), VALUE :: n
TYPE(C_PTR), VALUE :: dA
INTEGER(C_INT), VALUE :: ldda
TYPE(C_PTR), VALUE :: dP
TYPE(C_PTR), VALUE :: dInfo
INTEGER(C_INT), VALUE :: nbatch
END FUNCTION cublasDgetrfBatched
I allocate pinned memory on the host with cudaHostAlloc, allocate the device memory for the matrices and the device array containing the device pointers to the matrices, asynchronously copy each matrix to the device, perform the operations, and then asynchronously copy the decomposed matrix and pivots back to the host to perform the back-substitution with a single right-hand side:
REAL(8), POINTER, DIMENSION(:,:,:) :: A
INTEGER, DIMENSION(:,:), POINTER :: ipiv
TYPE(C_PTR) :: cPtr_A, cPtr_ipiv
TYPE(C_PTR), ALLOCATABLE, DIMENSION(:), TARGET :: dPtr_A
TYPE(C_PTR) :: dPtr_ipiv, dPtr_A_d, dPtr_info
INTEGER(C_SIZE_T) :: sizeof_A, sizeof_ipiv
...
stat = cudaHostAlloc(cPtr_A, sizeof_A, cudaHostAllocDefault)
CALL C_F_POINTER(cPtr_A, A, (/m,m,nbatch/))
stat = cudaHostAlloc(cPtr_ipiv, sizeof_ipiv, cudaHostAllocDefault)
CALL C_F_POINTER(cPtr_ipiv, ipiv, (/m,nbatch/))
ALLOCATE(dPtr_A(nbatch))
DO ibatch=1,nbatch
stat = cudaMalloc(dPtr_A(ibatch), m*m*sizeof_double)
END DO
stat = cudaMalloc(dPtr_A_d, nbatch*sizeof_cptr)
stat = cublasSetVector(nbatch, sizeof_cptr, C_LOC(dPtr_A(1)), 1, dPtr_A_d, 1)
stat = cudaMalloc(dPtr_ipiv, m*nbatch*sizeof_cint)
stat = cudaMalloc(dPtr_info, nbatch*sizeof_cint)
...
!$OMP PARALLEL DEFAULT(shared) PRIVATE( stat, ibatch )
!$OMP DO
DO ibatch = 1,nbatch
stat = cublasSetMatrixAsync(m, m, sizeof_double, C_LOC(A(1,1,ibatch)), m, dPtr_A(ibatch), m, mystream)
END DO
!$OMP END DO
!$OMP END PARALLEL
...
stat = cublasDgetrfBatched(cublas_handle, m, dPtr_A_d, m, dPtr_ipiv, dPtr_info, nbatch)
...
stat = cublasGetMatrixAsync(m, nbatch, sizeof_cint, dPtr_ipiv, m, C_LOC(ipiv(1,1)), m, mystream)
!$OMP PARALLEL DEFAULT(shared) PRIVATE( ibatch, stat )
!$OMP DO
DO ibatch = 1,nbatch
stat = cublasGetMatrixAsync(m, m, sizeof_double, dPtr_A(ibatch), m, C_LOC(A(1,1,ibatch)), m, mystream)
END DO
!$OMP END DO
!$OMP END PARALLEL
...
!$OMP PARALLEL DEFAULT(shared) PRIVATE( ibatch, x, stat )
!$OMP DO
DO ibatch = 1,nbatch
x = rhs(:,ibatch)
CALL dgetrs( 'N', m, 1, A(1,1,ibatch), m, ipiv(1,ibatch), x(1), m, info )
rhs(:,ibatch) = x
END DO
!$OMP END DO
!$OMP END PARALLEL
...
I'd rather not have to do this last step, but the cublasDtrsmBatched routine limits the matrix size to 32, and mine are of size 80 (a batched Dtrsv would be better, but that doesn't exist). The cost of launching many individual cublasDtrsv kernels makes performing the back-substitution on the device untenable.
There are other operations which I need to perform between calls to cublasDgetrfBatched and cusparseDgtsvStridedBatch. Most of these are currently being performed on the host with OpenMP being used to parallelize the loops at the batched level. Some of the operations, like matrix-vector multiplication for each of the matrices being decomposed for example, are being computed on the device with OpenACC:
!$ACC DATA COPYIN(A) COPYIN(x) COPYOUT(Ax)
...
!$ACC KERNELS
DO ibatch = 1,nbatch
DO i = 1,m
Ax(i,ibatch) = zero
END DO
DO j = 1,m
DO i = 1,m
Ax(i,ibatch) = Ax(i,ibatch) + A(i,j,ibatch)*x(j,ibatch)
END DO
END DO
END DO
!$ACC END KERNELS
...
!$ACC END DATA
I'd like to place more of the computation on the GPU with OpenACC, but to do so I need to be able to interface the two. Something like the following:
!$ACC DATA COPYIN(A) CREATE(info,A_d) COPYOUT(ipiv)
!$ACC HOST_DATA USE_DEVICE(A)
DO ibatch = 1,nbatch
A_d(ibatch) = acc_deviceptr(A(1,1,ibatch))
END DO
!$ACC END HOST_DATA
...
!$ACC HOST_DATA USE_DEVICE(ipiv,info)
stat = cublasDgetrfBatched(cublas_handle, m, A_d, m, ipiv, info, nbatch)
!$ACC END HOST_DATA
...
!$ACC END DATA
I know the host_data construct with the use_device clause would be appropriate in most cases, but since I need to actually pass cuBLAS a device array containing the pointers to the matrices on the device, I'm not sure how to proceed.
Can anyone offer any insight?
Thanks
!! Put everything on the device
!$ACC DATA COPYIN(A) CREATE(info,A_d) COPYOUT(ipiv)
!! populate the device A_d array
!$ACC parallel loop
DO ibatch = 1,nbatch
A_d(ibatch) = A(1,1,ibatch)
END DO
!$ACC end parallel
...
!! send the device address of A_d to the device
!$ACC HOST_DATA USE_DEVICE(A_d,ipiv,info)
stat = cublasDgetrfBatched(cublas_handle, m, A_d, m, ipiv, info, nbatch)
!$ACC END HOST_DATA
...
!$ACC END DATA
or, computing the device addresses on the host with acc_deviceptr and copying the pointer array to the device:
!! Put everything but A_d on the device
!$ACC DATA COPYIN(A) CREATE(info) COPYOUT(ipiv)
!! populate the host A_d array
DO ibatch = 1,nbatch
A_d(ibatch) = acc_deviceptr( A(1,1,ibatch) )
END DO
!! copy A_d to the device
!$acc data copyin( A_d )
...
!! send the device address of A_d and others to the device
!$ACC HOST_DATA USE_DEVICE(A_d,ipiv,info)
stat = cublasDgetrfBatched(cublas_handle, m, A_d, m, ipiv, info, nbatch)
!$ACC END HOST_DATA
...
!$acc end data
!$ACC END DATA