CuSolver Sparse interface on Fortran - cuda

I'm trying to write a program to interface cusolverSp onto fortran. although I'm no stranger for coding cuda in C, I'm unsure how to get it on fortran.
The following is my code:
! Fortran Console Application
!
module cuda_cusolverSP
interface
! cudaMalloc
integer (c_int) function cudaMalloc ( buffer, size ) bind (C, name="cudaMalloc" )
use iso_c_binding
implicit none
type (c_ptr) :: buffer
integer (c_size_t), value :: size
end function cudaMalloc
! cudaMemcpy
integer (c_int) function cudaMemcpy ( dst, src, count, kind ) bind (C, name="cudaMemcpy" )
! note: cudaMemcpyHostToDevice = 1
! note: cudaMemcpyDeviceToHost = 2
use iso_c_binding
type (C_PTR), value :: dst, src
integer (c_size_t), value :: count, kind
end function cudaMemcpy
! cudaFree
integer (c_int) function cudaFree(buffer) bind(C, name="cudaFree")
use iso_c_binding
implicit none
type (C_PTR), value :: buffer
end function cudaFree
integer (c_int) function cudaMemGetInfo(fre, tot) bind(C, name="cudaMemGetInfo")
use iso_c_binding
implicit none
type(c_ptr),value :: fre
type(c_ptr),value :: tot
end function cudaMemGetInfo
integer(c_int) function cusolverSpCreate(cusolver_Hndl) bind(C,name="cusolverSpCreate")
use iso_c_binding
implicit none
type(c_ptr)::cusolver_Hndl
end function
integer(c_int) function cusolverSpDestroy(cusolver_Hndl) bind(C,name="cusolverSpDestroy")
use iso_c_binding
implicit none
type(c_ptr),value::cusolver_Hndl
end function
integer(c_int) function cusolverSpSgetrf_bufferSize(cusolver_Hndl,m,n,d_A,lda,Lwork) bind(C,name="cusolverSpSgetrf_bufferSize")
use iso_c_binding
implicit none
type(c_ptr),value::cusolver_Hndl
integer(c_int),value::m
integer(c_int),value::n
type(c_ptr),value::d_A
integer(c_int),value::lda
type(c_ptr),value::Lwork
end function
integer(c_int) function cusolverSpSgetrf(cusolver_Hndl,m,n,d_A,lda,d_WS,d_Ipiv,d_devInfo) bind(C, name="cusolverSpSgetrf")
use iso_c_binding
implicit none
type(c_ptr),value::cusolver_Hndl
integer(c_int),value::m
integer(c_int),value::n
type(c_ptr),value::d_A
integer(c_int),value::lda
type(c_ptr),value::d_WS
type(c_ptr),value::d_Ipiv
type(c_ptr),value::d_devInfo
end function
integer (c_int) function cusolverSpSgetrs(cusolver_Hndl,trans,n,nrhs,d_A,lda,d_Ipiv,d_B,ldb,d_devInfo) bind(C, name="cusolverSpSgetrs")
use iso_c_binding
implicit none
type(c_ptr),value::cusolver_Hndl
integer(c_int), value::trans
integer(c_int), value::n
integer(c_int), value::nrhs
type(c_ptr),value::d_A
integer(c_int), value::lda
type(c_ptr),value::d_Ipiv
type(c_ptr),value::d_B
integer(c_int),value::ldb
type(c_ptr),value::d_devInfo
end function
end interface
end module
program prog
use iso_c_binding
use cuda_cusolverSP
! ------ Matrix Definition & host CPU storage variables
integer(c_int) rowsA ! number of rows of A
integer(c_int) colsA ! number of columns of A
integer(c_int) nnzA ! number of nonzeros of A
integer(c_int) baseA ! base index in CSR format
! CSR(A) from I/O <--- pointers to host CPU memory
type(c_ptr) :: h_csrRowPtrA
type(c_ptr) :: h_csrColIndA(:)
type(c_ptr) :: h_csrValA(:)
type(c_ptr) :: h_x ! x = A \ b
type(c_ptr) :: h_b ! b = ones(m,1)
type(c_ptr) :: h_r ! r = b - A*x
type(c_ptr) :: h_Q ! <int> n
! reorder to reduce zero fill-in
! Q = symrcm(A) or Q = symamd(A)
! B = Q*A*Q^T
type(c_ptr) :: h_csrRowPtrB ! <int> n+1
type(c_ptr) :: h_csrColIndB ! <int> nnzA
type(c_ptr) :: h_csrValB ! <double> nnzA
type(c_ptr) :: h_mapBfromA ! <int> nnzA
integer size_perm
type(c_ptr) :: buffer_cpu ! working space for permutation: B = Q*A*Q^T
! -------------------- pointers to device memory
type(c_ptr) :: d_csrRowPtrA
type(c_ptr) :: d_csrColIndA
type(c_ptr) :: d_csrValA
type(c_ptr) :: d_x ! x = A \ b
type(c_ptr) :: d_b ! a copy of h_b
type(c_ptr) :: d_r ! r = b - A*x
doubleprecision tol
integer reorder
integer singularity
type(c_ptr)::cpfre,cptot
integer*8,target::free,total
integer res
integer*8 cudaMemcpyDeviceToHost, cudaMemcpyHostToDevice
integer*4 CUBLAS_OP_N, CUBLAS_OP_T
parameter (cudaMemcpyHostToDevice=1)
parameter (cudaMemcpyDeviceToHost=2)
parameter (CUBLAS_OP_N=0)
parameter (CUBLAS_OP_T=1)
! ==================================================================
rowsA = 0
colsA = 0
nnzA = 0
baseA = 0
A_size = SIZEOF(rowsA)
B_size = SIZEOF(B)
X_size = SIZEOF(X)
size_perm = 0
tol = 1.e-12
reorder = 0 ! no reordering
singularity = 0 ! -1 if A is invertible under tol.
! Step 1: Create cudense handle ---------------
cusolver_stat = cusolverSpCreate(cusolver_Hndl)
if (cusolver_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cusolverSpCreate error: ", cusolver_stat
write (*,*)
stop
end if
! Step 2: copy A and B to Device
A_mem_stat = cudaMalloc(d_A,A_size)
if (A_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMalloc 1 error: ", A_mem_stat
write (*,*)
stop
end if
B_mem_stat = cudaMalloc(d_B,B_size)
if (B_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMalloc 2 error: ", B_mem_stat
write (*,*)
stop
end if
! ---------- copy A and B to Device
A_mem_stat = cudaMemcpy(d_A,CPU_A_ptr,A_size,cudaMemcpyHostToDevice)
if (A_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMemcpy 1 error: ", A_mem_stat
write (*,*)
! stop
end if
B_mem_stat = cudaMemcpy(d_B,CPU_B_ptr,B_size,cudaMemcpyHostToDevice)
if (B_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMemcpy 2 error: ", B_mem_stat
write (*,*)
! stop
end if
! Step 3: query working space of Sgetrf (and allocate memory on device)
Lwork = 5
cusolver_stat = cusolverSpSgetrf_bufferSize(cusolver_Hndl,m,n,d_A,lda,CPU_Lwork_ptr)
if (cusolver_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " SpSgetrf_bufferSize error: ", cusolver_stat
write (*,*)
! stop
end if
write (*,*)
write (*, '(A, I12)') " Lwork: ", Lwork
write (*,*)
Workspace = 4*Lwork
WS_mem_stat = cudaMalloc(d_WS,Workspace)
if (WS_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMalloc 6 error: ", WS_mem_stat
write (*,*)
! stop
end if
! Step 4: compute LU factorization of [A]
cusolver_stat = cusolverSpSgetrf(cusolver_Hndl,m,n,d_A,lda,d_WS,d_Ipiv,d_devInfo)
if (cusolver_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cusolverSpSgetrf error: ", WS_mem_stat
write (*,*)
! stop
end if
! Step 5: compute solution vector [X] for Right hand side [B]
cusolver_stat = cusolverSpSgetrs(cusolver_Hndl,CUBLAS_OP_N,n,nrhs,d_A,lda,d_Ipiv,d_B,ldb,d_devInfo)
if (cusolver_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cusolverSpSgetrs error: ", WS_mem_stat
write (*,*)
! stop
end if
! Step 6: copy solution vector stored in [B] on device into [X] vector on host
X_mem_stat = cudaMemcpy(CPU_X_ptr,d_B,B_size,cudaMemcpyDeviceToHost)
if (X_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMemcpy 4 error: ", WS_mem_stat
write (*,*)
! stop
end if
! do i = 1, n
! print *, x(i,1)
! enddo
! step 7: free memory on device and release CPU-side resources
A_mem_Stat = cudafree(d_A)
B_mem_Stat = cudafree(d_B)
Ipiv_mem_stat = cudafree(d_Ipiv)
WS_mem_stat = cudafree(d_WS)
Lwork_mem_stat = cudafree(d_Lwork)
cusolver_stat = cusolverSpDestroy(cusolver_Hndl)
! Step 8: deallocate memory on host before exit
! deallocate(A)
! deallocate(ATest)
! deallocate(B)
! deallocate(X)
! deallocate(Ipiv)
end program prog
The current errors during my build is
error S0188: Argument number # to cusolverspcreate/etc : type mismatch
which I have no idea how to fix it. This program is a modification of a working cusolverDn which i'm sure means I've made a bunch of mistakes as there aren't many interfacing samples I can refer to.

You have no implicit none in your main program and cusolver_Hndl is not declared, so it is assumed to be real.
Use implicit none and declare all your variables. cusolver_Hndl should be type(ptr) and don't forget to set its value (if it is not an output argument, the interface does not show any intent).

Related

Adding a print statement in a Fortran 90 function this do not work [duplicate]

I'm trying to learn Fortran (unfortunately a necessity for my research group) - one of the tasks I set myself was to package one of the necessary functions (Associated Legendre polynomials) from the Numerical Recipes book into a fortran 03 compliant module. The original program (f77) has some error handling in the form of the following:
if(m.lt.0.or.m.gt.1.or.abs(x).gt.1)pause 'bad arguments in plgndr'
Pause seems to have been deprecated since f77 as using this line gives me a compiling error, so I tried the following:
module sha_helper
implicit none
public :: plgndr, factorial!, ylm
contains
! numerical recipes Associated Legendre Polynomials rewritten for f03
function plgndr(l,m,x) result(res_plgndr)
integer, intent(in) :: l, m
real, intent(in) :: x
real :: res_plgndr, fact, pll, pmm, pmmp1, somx2
integer :: i,ll
if (m.lt.0.or.m.gt.l.or.abs(x).gt.1) then
write (*, *) "bad arguments to plgndr, aborting", m, x
res_plgndr=-10e6 !return a ridiculous value
else
pmm = 1.
if (m.gt.0) then
somx2 = sqrt((1.-x)*(1.+x))
fact = 1.
do i = 1, m
pmm = -pmm*fact*somx2
fact = fact+2
end do
end if
if (l.eq.m) then
res_plgndr = pmm
else
pmmp1 = x*(2*m+1)*pmm
if(l.eq.m+1) then
res_plgndr = pmmp1
else
do ll = m+2, l
pll = (x*(2*ll-1)*pmmp1-(ll+m-1)*pmm)/(ll-m)
pmm = pmmp1
pmmp1 = pll
end do
res_plgndr = pll
end if
end if
end if
end function plgndr
recursive function factorial(n) result(factorial_result)
integer, intent(in) :: n
integer, parameter :: RegInt_K = selected_int_kind(20) !should be enough for the factorials I am using
integer (kind = RegInt_K) :: factorial_result
if (n <= 0) then
factorial_result = 1
else
factorial_result = n * factorial(n-1)
end if
end function factorial
! function ylm(l,m,theta,phi) result(res_ylm)
! integer, intent(in) :: l, m
! real, intent(in) :: theta, phi
! real :: res_ylm, front_block
! real, parameter :: pi = 3.1415926536
! front_block = sqrt((2*l+1)*factorial(l-abs(m))/(4*pi*))
! end function ylm
end module sha_helper
The main code after the else works, but if I execute my main program and call the function with bad values, the program freezes before executing the print statement. I know that the print statement is the problem, as commenting it out allows the function to execute normally, returning -10e6 as the value. Ideally, I would like the program to crash after giving a user readable error message, as giving bad values to the plgndr function is a fatal error for the program. The function plgndr is being used by the program sha_lmc. Currently all this does is read some arrays and then print a value of plgndr for testing (early days). The function ylm in the module sha_helper is also not finished, hence it is commented out. The code compiles using gfortran sha_helper.f03 sha_lmc.f03 -o sha_lmc, and
gfortran --version
GNU Fortran (GCC) 4.8.2
!Spherical Harmonic Bayesian Analysis testbed for Lagrangian Dynamical Monte Carlo
program sha_analysis
use sha_helper
implicit none
!Analysis Parameters
integer, parameter :: harm_order = 6
integer, parameter :: harm_array_length = (harm_order+1)**2
real, parameter :: coeff_lo = -0.1, coeff_hi = 0.1, data_err = 0.01 !for now, data_err fixed rather than heirarchical
!Monte Carlo Parameters
integer, parameter :: run = 100000, burn = 50000, thin = 100
real, parameter :: L = 1.0, e = 1.0
!Variables needed by the program
integer :: points, r, h, p, counter = 1
real, dimension(:), allocatable :: x, y, z
real, dimension(harm_array_length) :: l_index_list, m_index_list
real, dimension(:,:), allocatable :: g_matrix
!Open the file, allocate the x,y,z arrays and read the file
open(1, file = 'Average_H_M_C_PcP_boschi_1200.xyz', status = 'old')
read(1,*) points
allocate(x(points))
allocate(y(points))
allocate(z(points))
print *, "Number of Points: ", points
readloop: do r = 1, points
read(1,*) x(r), y(r), z(r)
end do readloop
!Set up the forwards model
allocate(g_matrix(harm_array_length,points))
!Generate the l and m values of spherical harmonics
hloop: do h = 0, harm_order
ploop: do p = -h,h
l_index_list(counter) = h
m_index_list(counter) = p
counter = counter + 1
end do ploop
end do hloop
print *, plgndr(1,2,0.1)
!print *, ylm(1,1,0.1,0.1)
end program sha_analysis
Your program does what is known as recursive IO - the initial call to plgndr is in the output item list of an IO statement (a print statement) [directing output to the console] - inside that function you then also attempt to execute another IO statement [that outputs to the console]. This is not permitted - see 9.11p2 and p3 of F2003 or 9.12p2 of F2008.
A solution is to separate the function invocation from the io statement in the main program, i.e.
REAL :: a_temporary
...
a_temporary = plgndr(1,2,0.1)
PRINT *, a_temporary
Other alternatives in F2008 (but not F2003 - hence the [ ] parts in the first paragraph) include directing the output from the function to a different logical unit (note that WRITE (*, ... and PRINT ... reference the same unit).
In F2008 you could also replace the WRITE statement with a STOP statement with a message (the message must be a constant - which wouldn't let you report the problematic values).
The potential for inadvertently invoking recursive IO is part of the reason that some programming styles discourage conducting IO in functions.
Try:
if (m.lt.0.or.m.gt.l.or.abs(x).gt.1) then
write (*, *) "bad arguments to plgndr, aborting", m, x
stop
else
...
end if

Constructor of derived types

I am trying to write a constructor for a derived type of an abstract one to solve this other question, but it seems that it's not working, or better, it isn't called at all.
The aim is to have a runtime polymorphism setting the correct number of legs of an animal.
These are the two modules:
animal
module animal_module
implicit none
type, abstract :: animal
private
integer, public :: nlegs = -1
contains
procedure :: legs
end type animal
contains
function legs(this) result(n)
class(animal), intent(in) :: this
integer :: n
n = this%nlegs
end function legs
cat
module cat_module
use animal_module, only : animal
implicit none
type, extends(animal) :: cat
private
contains
procedure :: setlegs => setlegs
end type cat
interface cat
module procedure init_cat
end interface cat
contains
type(cat) function init_cat(this)
class(cat), intent(inout) :: this
print *, "Cat!"
this%nlegs = -4
end function init_cat
main program
program oo
use animal_module
use cat_module
implicit none
type(cat) :: c
type(bee) :: b
character(len = 3) :: what = "cat"
class(animal), allocatable :: q
select case(what)
case("cat")
print *, "you will see a cat"
allocate(cat :: q)
q = cat() ! <----- this line does not change anything
case default
print *, "ohnoes, nothing is prepared!"
stop 1
end select
print *, "this animal has ", q%legs(), " legs."
print *, "cat animal has ", c%legs(), " legs."
end program
The constructor isn't called at all, and the number of legs still remains to -1.
The available non-default constructor for the cat type is given by the module procedure init_cat. This function you have defined like
type(cat) function init_cat(this)
class(cat), intent(inout) :: this
end function init_cat
It is a function with one argument, of class(cat). In your later reference
q = cat()
There is no specific function under the generic cat which matches that reference: the function init_cat does not accept a no-argument reference. The default structure constructor is instead used.
You must reference the generic cat in a way matching your init_cat interface to have that specific function called.
You want to change your init_cat function to look like
type(cat) function init_cat()
! print*, "Making a cat"
init_cat%nlegs = -4
end function init_cat
Then you can reference q=cat() as desired.
Note that in the original, you are attempting to "construct" a cat instance, but you aren't returning this constructed entity as the function result. Instead, you are modifying an argument (already constructed). Structure constructors are intended to be used returning such useful things.
Note also that you don't need to
allocate (cat :: q)
q = cat()
The intrinsic assignment to q already handles q's allocation.
FWIW, here is some sample code comparing three approaches (method = 1: sourced allocation, 2: polymorphic assignment, 3: mixed approach).
module animal_module
implicit none
type, abstract :: animal_t
integer :: nlegs = -1
contains
procedure :: legs !! defines a binding to some procedure
endtype
contains
function legs(this) result(n)
class(animal_t), intent(in) :: this
!! The passed variable needs to be declared as "class"
!! to use this routine as a type-bound procedure (TBP).
integer :: n
n = this % nlegs
end
end
module cat_module
use animal_module, only : animal_t
implicit none
type, extends(animal_t) :: cat_t
endtype
interface cat_t !! overloads the definition of cat_t() (as a procedure)
module procedure make_cat
end interface
contains
function make_cat() result( ret ) !! a usual function
type(cat_t) :: ret !<-- returns a concrete-type object
ret % nlegs = -4
end
end
program main
use cat_module, only: cat_t, animal_t
implicit none
integer :: method
type(cat_t) :: c
class(animal_t), allocatable :: q
print *, "How to create a cat? [method = 1,2,3]"
read *, method
select case ( method )
case ( 1 )
print *, "1: sourced allocation"
allocate( q, source = cat_t() )
!! An object created by a function "cat_t()" is used to
!! allocate "q" with the type and value taken from source=.
!! (Empirically most stable for different compilers/versions.)
case ( 2 )
print *, "2: polymorphic assignment"
q = cat_t()
!! Similar to sourced allocation. "q" is automatically allocated.
!! (Note: Old compilers may have bugs, so tests are recommended...)
case ( 3 )
print *, "3: mixed approach"
allocate( cat_t :: q )
q = cat_t()
!! First allocate "q" with a concrete type "cat_t"
!! and then assign a value obtained from cat_t().
case default ; stop "unknown method"
endselect
c = cat_t()
!! "c" is just a concrete-type variable (not "allocatable")
!! and assigned with a value obtained from cat_t().
print *, "c % legs() = ", c % legs()
print *, "q % legs() = ", q % legs()
end
--------------------------------------------------
Test
$ gfortran test.f90 # using version 8 or 9
$ echo 1 | ./a.out
How to create a cat? [method = 1,2,3]
1: sourced allocation
c % legs() = -4
q % legs() = -4
$ echo 2 | ./a.out
How to create a cat? [method = 1,2,3]
2: polymorphic assignment
c % legs() = -4
q % legs() = -4
$ echo 3 | ./a.out
How to create a cat? [method = 1,2,3]
3: mixed approach
c % legs() = -4
q % legs() = -4
--------------------------------------------------
Side notes
* It is also OK to directly use make_cat() to generate a value of cat_t:
e.g., allocate( q, source = make_cat() ) or q = make_cat().
In this case, we do not need to overload cat_t() via interface.
* Another approach is to write an "initializer" as a type-bound procedure,
and call it explicitly as q % init() (after allocating it via
allocate( cat_t :: q )). If the type contains pointer components,
this approach may be more straightforward by avoiding copy of
components (which can be problematic for pointer components).

Double precision error of cuFFT with PlanMany in Fortran

Following the (answer of JackOLantern) I'm trying to compute a batch 1D FFTs using cufftPlanMany.
The code below perform nwfs=23 times the 1D FFT forward and the 1D FFT backward of an n=256 complex array. It's to train me to handle the routine cufftPlanMany. As a second step, the nwfs arrays will be differents .At the end, I check the errors of each arrays.
Because of the data are allocate as: cinput_d(n,nwfs)
I use th function like this: cufftPlanMany(planmany, 1, fftsize, inembed, nwfs,1, onembed, nwfs,1, CUFFT_C2C, nwfs)
where :
rank = 1
fftsize = {n} same dim for each FFT
inembed = onembed = {0} ignored
istride = ostride = nwfs distance between two successive input and output
idist = odist = 1 distance between two signals
batch = nwfs number of fft to be done
program fft
use cudafor
use precision_m
use cufft_m
implicit none
integer, allocatable:: kx(:)
complex(fp_kind), allocatable:: matrix(:)
complex(fp_kind), allocatable, pinned :: cinput(:,:),coutput(:,:)
complex(fp_kind), allocatable, device :: cinput_d(:,:),coutput_d(:,:)
integer:: i,j,k,n,nwfs
integer, allocatable :: fftsize(:),inembed(:),onembed(:)
type(c_ptr):: plan,planmany
real(fp_kind):: twopi=8._fp_kind*atan(1._fp_kind),h
integer::clock_start,clock_end,clock_rate,istat
real :: elapsed_time
character*1:: a
real(fp_kind):: w,x,y,z
integer:: nerrors
n=256
nwfs=23
h=twopi/real(n,fp_kind)
! allocate arrays on the host
allocate (cinput(n,nwfs),coutput(n,nwfs))
allocate (kx(n),matrix(n))
allocate (fftsize(nwfs),inembed(nwfs),onembed(nwfs))
! allocate arrays on the device
allocate (cinput_d(n,nwfs),coutput_d(n,nwfs))
fftsize(:) = n
inembed(:) = 0
onembed(:) = 0
!initialize arrays on host
kx =(/ ((i-0.5)*0.1953125, i=1,n/2), ((-n+i-0.5)*0.1953125, i=n/2+1,n) /)
matrix = (/ ... /)
!write(*,*) cinput
!copy arrays to device
do i =1,nwfs
cinput(:,i)=matrix(:)
end do
cinput_d=cinput
! Initialize the plan for complex to complex transform
if (fp_kind== singlePrecision) call cufftPlan1D(plan,n,CUFFT_C2C,1)
if (fp_kind== doublePrecision) call cufftPlan1D(plan,n,CUFFT_Z2Z,1)
if (fp_kind== doublePrecision) call cufftPlanMany(planmany, 1, fftsize, inembed, &
nwfs,1, &
onembed, &
nwfs,1, &
CUFFT_Z2Z, nwfs)
if (fp_kind== singlePrecision) call cufftPlanMany(planmany, 1, fftsize, inembed, &
nwfs,1, &
onembed, &
nwfs,1, &
CUFFT_C2C, nwfs)
!c_null_ptr fftsize,inembed,onembed
! cufftPlanMany(plan, rank, n, inembed, istride, idist, &
! onembed, ostride, odist, &
! type, batch)
!subroutine cufftPlan1d(plan, nx, type, batch)
call SYSTEM_CLOCK(COUNT_RATE=clock_rate)
istat=cudaThreadSynchronize()
call SYSTEM_CLOCK(count=clock_start)
! Forward transform out of place
call cufftExec(planmany,cinput_d,coutput_d,CUFFT_FORWARD)
!$cuf kernel do <<<*,*>>>
do i=1,n
do j =1,n
coutput_d(i,j) = coutput_d(i,j)/real(n,fp_kind)!sqrt(twopi*real(n,fp_kind))*sqrt(2.*pi)/sqrt(real(maxn))
end do
end do
call cufftExec(planmany,coutput_d,coutput_d,CUFFT_INVERSE)
istat=cudaThreadSynchronize()
call SYSTEM_CLOCK(count=clock_end)
! Copy results back to host
coutput=coutput_d
do i=1,n
! write(*,'(i2,1x,2(f8.4),1x,2(f8.4),2x,e13.7)') i,cinput(i),coutput(i),abs(coutput(i)-cinput(i))
end do
nerrors=0
do i=1,n
!write(*,'(i2,5(1x,2(f8.4),1x,2(f8.4),2x,3(e13.7,2x)))') i,cinput(i,1),coutput(i,1),abs(coutput(i,1)-cinput(i,1)),abs(coutput(i,6)-cinput(i,6)),abs(coutput(i,nwfs)-cinput(i,nwfs))
do j=1,nwfs
if (abs(coutput(i,j)-cinput(i,j))>1.d-5) then
write(*,'(i3,i3,1x,e13.7,2x,4(f8.4))') i,j,abs(coutput(i,j)-cinput(i,j)),cinput(i,j),coutput(i,j)
nerrors = nerrors + 1
end if
end do
end do
elapsed_time = REAL(clock_end-clock_start)/REAL(clock_rate)
write(*,*) 'elapsed_time :',elapsed_time,clock_start,clock_end,clock_rate
if (nerrors .eq. 0) then
print *, "Test Passed"
else
print *, "Test Failed"
endif
!release memory on the host and on the device
deallocate (cinput,coutput,kx,cinput_d,coutput_d)
! Destroy the plans
call cufftDestroy(plan)
end program fft
Is somebody can tell me why the following "many-FFT" sometimes failed in double precision but never in single precision ?
Single precision: "Test Passed" ALWAYS !
Double precision: "Test Failed" Sometimes !
Indeed, I checked the Device to Host data transfer. That doesn't seem to be it.
Thanks for any help.
Thanks to talonmies. It was the WDDM Timeout Detection & Recovery limit.
See the link, to change the TDR

fortran cudamalloc and cublassetmatrix wrapper problem

I maybe a problem, either with "cudamalloc" or "cublassetmatrix" not appropriately called from fortran:
module cuda
interface
integer (C_INT) function cudaMallocHost(buffer, size) bind(C,name="cudaMallocHost")
use iso_c_binding
implicit none
type (C_PTR) :: buffer
integer (C_SIZE_T), value :: size
end function cudaMallocHost
integer (C_INT) function cudaFreeHost(buffer) bind(C,name="cudaFreeHost")
use iso_c_binding
implicit none
type (C_PTR), value :: buffer
end function cudaFreeHost
integer (C_INT) function cudaMalloc(buffer, size) bind(C,name="cudaMalloc")
use iso_c_binding
implicit none
type (C_PTR) :: buffer
integer (C_SIZE_T), value :: size
end function cudaMalloc
integer (C_INT) function cudaFree(buffer) bind(C,name="cudaFree")
use iso_c_binding
implicit none
type (C_PTR), value :: buffer
end function cudaFree
integer (C_INT) function cublassetmatrix(M,N,size,A_host,lda_h&
&,A_dev,lda_d) bind(C,name="cublasSetMatrix")
use iso_c_binding
implicit none
type (C_PTR) :: A_dev
type(c_ptr) :: A_host
integer (C_SIZE_T), value :: size
integer(C_Int) :: M,N,lda_h,lda_d
end function cublassetmatrix
Type(c_ptr) function cudaGetErrorString(err) bind(C,name="cudaGetErrorString")
use iso_c_binding
implicit none
integer (C_SIZE_T), value :: err
end function cudaGetErrorString
integer (C_INT) function cublasCreate(handle) bind(C,name="cublasCreate_v2")
use iso_c_binding
implicit none
Type(C_Ptr) :: handle
end function cublasCreate
end interface
end module cuda
program test
use iso_c_binding
use cuda
implicit none
integer, parameter :: fp_kind = kind(0.0)
type(C_PTR) :: cptr_A, cptr_A_D
real (fp_kind), dimension(:,:), pointer :: A=>null()
real :: time_start,time_end
integer:: i,j, res, m1
integer(c_int) :: x
type(c_ptr) :: handle
logical:: lsexit
CHARACTER(len=50), POINTER :: errchar
m1=500
res=cublasCreate(handle)
if(res/=0) Then
write(*,*) "ERROR 1 ",res;
end if
res = cudaMallocHost ( cptr_A, m1*m1*sizeof(fp_kind) )
if(res/=0) Then
write(*,*) "ERROR 2 ",res;
end if
call c_f_pointer ( cptr_A, A, (/ m1, m1 /) )
A=1._fp_kind
res = cudaMalloc ( cptr_A_D, m1*m1*sizeof(fp_kind) )
if(res/=0) Then
write(*,*) "ERROR 3 ",res;
end if
res=cublasSetMatrix (m1,m1,sizeof(fp_kind),cptr_A,m1,cptr_A_D,m1)
if(res/=0) Then
write(*,*) "ERROR 4 ",res,sizeof(fp_kind)
call c_f_pointer ( cudageterrorstring(int(res,kind=8)),&
& errchar, [ len(errchar) ] )
write(*,*) trim(adjustl(errchar))
end if
end program test
The make command is:
tmp:
ifort -O3 -o tmp $(FFLAGS) tmp.f90 -L/opt/cuda/lib64 -lcublas -lcudart
clean:
rm tmp cuda.mod
Although "cudamallochost" expects "void ** ptr" it seems to work as the fortran pointer is usable, and when put in a loop with "cudafree" does not produce memory leaks.
The code fails at "cublassetmatrix" function either with error code 7 ("too many resources") or 11 ("invalid argument").
Any idea?

Max reduce in CUDA Fortran

I am trying to perform reduction in CUDA Fortran; what I did so far is something like that, performing the reduction in two steps (see the CUDA kernels below).
In the first kernel I am doing some simple computation and I declare a shared array for a block of threads to store the value of abs(a - anew); once the threads are synchronized, I compute the max value of this shared array, that I store in an intermediate array of dimension gridDim%x * gridDim%y.
In the second kernel, I am reading this array (in a single block of threads) and try to compute the max value of it.
Here is the whole code:
module commons
integer, parameter :: dp=kind(1.d0)
integer, parameter :: nx=1024, ny=1024
integer, parameter :: block_dimx=16, block_dimy=32
end module commons
module kernels
use commons
contains
attributes(global) subroutine kernel_gpu_reduce(a, anew, error, nxi, nyi)
implicit none
integer, value, intent(in) :: nxi, nyi
real(dp), dimension(nxi,nyi), intent(in) :: a
real(dp), dimension(nxi,nyi), intent(inout) :: anew
real(dp), dimension(nxi/block_dimx+1,nyi/block_dimy+1), intent(inout) :: error
real(dp), shared, dimension(block_dimx,block_dimy) :: err_sh
integer :: i, j, k, tx, ty
i = (blockIdx%x - 1)*blockDim%x + threadIdx%x
j = (blockIdx%y - 1)*blockDim%y + threadIdx%y
tx = threadIdx%x
ty = threadIdx%y
if (i > 1 .and. i < nxi .and. j > 1 .and. j < nyi) then
anew(i,j) = 0.25d0*(a(i-1,j) + a(i+1,j) &
& + a(i,j-1) + a(i,j+1))
err_sh(tx,ty) = abs(anew(i,j) - a(i,j))
endif
call syncthreads()
error(blockIdx%x,blockIdx%y) = maxval(err_sh)
end subroutine kernel_gpu_reduce
attributes(global) subroutine max_reduce(local_error, error, nxi, nyi)
implicit none
integer, value, intent(in) :: nxi, nyi
real(dp), dimension(nxi,nyi), intent(in) :: local_error
real(dp), intent(out) :: error
real(dp), shared, dimension(nxi) :: shared_error
integer :: tx, i
tx = threadIdx%x
shared_error(tx) = 0.d0
if (tx >=1 .and. tx <= nxi) shared_error(tx) = maxval(local_error(tx,:))
call syncthreads()
error = maxval(shared_error)
end subroutine max_reduce
end module kernels
program laplace
use cudafor
use kernels
use commons
implicit none
real(dp), allocatable, dimension(:,:) :: a, anew
real(dp) :: error=1.d0
real(dp), device, allocatable, dimension(:,:) :: adev, adevnew
real(dp), device, allocatable, dimension(:,:) :: edev
real(dp), allocatable, dimension(:,:) :: ehost
real(dp), device :: error_dev
integer :: i
integer :: num_device, h_status, ierrSync, ierrAsync
type(dim3) :: dimGrid, dimBlock
num_device = 0
h_status = cudaSetDevice(num_device)
dimGrid = dim3(nx/block_dimx+1, ny/block_dimy+1, 1)
dimBlock = dim3(block_dimx, block_dimy, 1)
allocate(a(nx,ny), anew(nx,ny))
allocate(adev(nx,ny), adevnew(nx,ny))
allocate(edev(dimGrid%x,dimGrid%y), ehost(dimGrid%x,dimGrid%y))
do i = 1, nx
a(i,:) = 1.d0
anew(i,:) = 1.d0
enddo
adev = a
adevnew = anew
call kernel_gpu_reduce<<<dimGrid, dimBlock>>>(adev, adevnew, edev, nx, ny)
ierrSync = cudaGetLastError()
ierrAsync = cudaDeviceSynchronize()
if (ierrSync /= cudaSuccess) write(*,*) &
& 'Sync kernel error - 1st kernel:', cudaGetErrorString(ierrSync)
if (ierrAsync /= cudaSuccess) write(*,*) &
& 'Async kernel error - 1st kernel:', cudaGetErrorString(ierrAsync)
call max_reduce<<<1, dimGrid%x>>>(edev, error_dev, dimGrid%x, dimGrid%y)
ierrSync = cudaGetLastError()
ierrAsync = cudaDeviceSynchronize()
if (ierrSync /= cudaSuccess) write(*,*) &
& 'Sync kernel error - 2nd kernel:', cudaGetErrorString(ierrSync)
if (ierrAsync /= cudaSuccess) write(*,*) &
& 'Async kernel error - 2nd kernel:', cudaGetErrorString(ierrAsync)
error = error_dev
print*, 'error from kernel: ', error
ehost = edev
error = maxval(ehost)
print*, 'error from host: ', error
deallocate(a, anew, adev, adevnew, edev, ehost)
end program laplace
I first had a problem because of the kernel configuration of the second kernel (which was <<<1, dimGrid>>>); I modified the code following Robert's answer. Now I have a memory access error:
Async kernel error - 2nd kernel:
an illegal memory access was encountered
0: copyout Memcpy (host=0x666bf0, dev=0x4203e20000, size=8) FAILED: 77(an illegal memory access was encountered)
And, if I run it with cuda-memcheck:
========= Invalid __shared__ write of size 8
========= at 0x00000060 in kernels_max_reduce_
========= by thread (1,0,0) in block (0,0,0)
========= Address 0x00000008 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/libcuda.so (cuLaunchKernel + 0x2c5) [0x14ad95]
for every thread.
The code is compiled with PGI Fortran 14.9 and CUDA 6.5 on a Tesla K20 card (with CUDA capability 3.5). I compile it with:
pgfortran -Mcuda -ta:nvidia,cc35 laplace.f90 -o laplace
You can do proper cuda error checking in CUDA Fortran. You should do so in your code.
One problem is that you're trying to launch too many threads (per block) in your second kernel:
call max_reduce<<<1, dimGrid>>>(edev, error_dev, dimGrid%x, dimGrid%y)
^^^^^^^
The dimGrid parameter has previously been computed to be:
dimGrid = dim3(nx/block_dimx+1, ny/block_dimy+1, 1);
Substituting actual values, we have:
dimGrid = dim3(1024/16 + 1, 1024/32 +1);
i.e.
dimGrid = dim3(65,33);
But you are not allowed to request 65*33 = 2145 threads per block. The maximum is either 512 or 1024 depending on what device architecture target you are compiling for.
Because of this error, your second kernel is not running at all.