cudaMalloc an array within a structure creates an "illegal memory access" - cuda

Below is a simplified version of a problem that I am trying to solve. Both code snippets compile, but #2 throws an "illegal memory access". Basically, if an array is encapsulated in a structure, passing a pointer to that structure to cudaMalloc creates all kinds of problems -- at least the way I do it. I am pretty sure this is because the address of dum in the code below is on the host, and so is not accessible inside the kernel. The problem is, I don't know how to create a device version of dum... E.g., using cudaMalloc( (void**)&dum , sizeof(dummy) * 1 ) instead of the new dummy syntax below does not solve the problem. I think I am getting confused by the double pointer used by cudaMalloc.
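For reference, a minimal sketch of what the double pointer in cudaMalloc means for a plain array (the size here is arbitrary, purely for illustration):

// Minimal sketch: cudaMalloc writes a device address into the pointer whose
// address you pass in, which is why a pointer-to-pointer appears.
const int n = 10;                                 // arbitrary size for illustration
double *p = nullptr;                              // p itself lives on the host
cudaMalloc( (void**)&p , sizeof(double) * n );    // &p has type double**, passed as void**
// after this call, p holds the address of n doubles in device memory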
Of course it may seem silly in this example to encapsulate an array of double in a structure, but in the actual code I really do need to do this.
struct dummy
{
    double *arr;
};

void allocate( dummy *dum , int n )
{
    cudaMalloc( (double**)&(dum->arr) , sizeof(double) * n );
}

__global__ void test( double val , dummy *dum , int n )
{
    printf( "test\n" );
    for( int ii = 0 ; ii < n ; ii++ )
        dum->arr[ii] = val;
}

__global__ void test2( double val , double *arr , int n )
{
    printf( "test\n" );
    for( int ii = 0 ; ii < n ; ii++ )
        arr[ii] = val;
}

int main()
{
    int n = 10;
    dummy *dum = new dummy;

    /* CODE 1: the piece of code below works */
    double *p;
    gpu_err_chk( cudaMalloc( &p , sizeof(double) * n ) );
    test2<<< 1 , 1 >>>( 123.0 , p , n );
    gpu_err_chk( cudaDeviceSynchronize() );

    /* CODE 2: the piece of code below does not... */
    allocate( dum , n );
    test<<< 1 , 1 >>>( 123.0 , dum , n );
    gpu_err_chk( cudaDeviceSynchronize() );

    return 1;
}

After digging through some examples in previous posts by Robert, I was able to rewrite the code so that it works:
struct dummy
{
    double *arr;
};

__global__ void test( dummy *dum , int n )
{
    printf( "test\n" );
    for( int ii = 0 ; ii < n ; ii++ )
        printf( "dum->arr[%d] = %f\n" , ii , dum->arr[ii] );
}

int main()
{
    int n = 10;
    dummy *dum_d , *dum_h;

    srand( time(0) );

    dum_h = new dummy;
    dum_h->arr = new double[n];
    for( int ii = 0 ; ii < n ; ii++ ){
        dum_h->arr[ii] = double( rand() ) / RAND_MAX;
        printf( "reference data %d = %f\n" , ii , dum_h->arr[ii] );
    }

    cudaMalloc( &dum_d , sizeof(dummy) * 1 );
    cudaMemcpy( dum_d , dum_h , sizeof(dummy) * 1 , cudaMemcpyHostToDevice );

    double *tmp;
    cudaMalloc( &tmp , sizeof(double) * n );
    cudaMemcpy( &( dum_d->arr ) , &tmp , sizeof(double*) , cudaMemcpyHostToDevice ); // write the device array pointer (tmp) into the arr member of the device-side structure
    cudaMemcpy( tmp , dum_h->arr , sizeof(double) * n , cudaMemcpyHostToDevice );

    delete [] dum_h->arr;
    delete dum_h;

    test<<< 1 , 1 >>>( dum_d , n );
    gpu_err_chk( cudaDeviceSynchronize() );

    cudaFree( tmp );
    cudaFree( dum_d );

    return 1;
}
However, I am still confused why this works. Does anyone have a visual diagram of what's going on? I am getting lost with the different pointers...
Moreover, there is one thing that really blows my mind: I can free tmp right before the kernel launch and the code still works, i.e.:
cudaFree( tmp );
test<<< 1 , 1 >>>( dum_d , n );
gpu_err_chk( cudaDeviceSynchronize() );
How is this the case? In my mind (clearly wrong), the device array containing the random values is gone...
Another point of confusion is that I can't free dum_d->arr directly (cudaFree(dum_d->arr)); that throws a segmentation fault.
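For what it's worth, here is a minimal sketch (not from the original posts) of an alternative that avoids the device-side structure and the deep copy altogether: pass the structure to the kernel by value, so that the device pointer it contains is copied as part of the kernel arguments.

__global__ void test3( double val , dummy dum , int n )
{
    // dum was copied by value into the kernel's parameter space;
    // dum.arr already holds a device address, so it is safe to dereference here
    for ( int ii = 0 ; ii < n ; ii++ )
        dum.arr[ii] = val;
}

int main()
{
    int n = 10;
    dummy dum;                                      // the struct itself lives on the host
    cudaMalloc( &dum.arr , sizeof(double) * n );    // only the array is device memory
    test3<<< 1 , 1 >>>( 123.0 , dum , n );
    cudaDeviceSynchronize();
    cudaFree( dum.arr );
    return 0;
}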

Related

When I compile this code, some errors occur: 1) expression must be a modifiable lvalue 2) identifier "__synchthreads" is undefined

#include <iostream>
using namespace std ;

#define min(x,y) (x>y?x:y)
#define N 33*1024
#define ThreadPerBlock 256

//smallest multiple of threadsPerBlock that is greater than or equal to N
#define blockPerGrid min(32 , (N+ThreadPerBlock-1) / ThreadPerBlock )

__global__ void Vector_Dot_Product ( const float *V1 , const float *V2 , float *V3 )
{
    __shared__ float chache[ThreadPerBlock] ;
    float temp ;

    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x ;
    const unsigned int chacheindex = threadIdx.x ;

    while ( tid < N )
    {
        temp += V1[tid] * V2[tid] ;
        tid += blockDim.x * gridDim.x ;
    }

    chache[chacheindex] = temp ;
    __synchthreads () ;

    int i = blockDim.x / 2 ;
    while ( i!=0 )
    {
        if ( chacheindex < i )
            chache[chacheindex] += chache [chacheindex + i] ;
        __synchthreads () ;
        i/=2 ;
    }

    if ( chacheindex == 0 )
        V3[blockIdx.x] = chache [0] ;
}

int main ( int argv , char *argc )
{
    float *V1_H , *V2_H , *V3_H ;
    float *V1_D , *V2_D , *V3_D ;

    V1_H = new float [N] ;
    V2_H = new float [N] ;
    V3_H = new float [blockPerGrid] ;

    cudaMalloc ( (void **)&V1_D , N*sizeof(float)) ;
    cudaMalloc ( (void **)&V2_D , N*sizeof(float)) ;
    cudaMalloc ( (void **)&V3_D , blockPerGrid*sizeof(float)) ;

    for ( int i = 0 ; i<N ; i++ )
    {
        V1_H[i] = i ;
        V2_H[i] = i*2 ;
    }

    cudaMemcpy ( V1_D , V1_H , N*sizeof(float) , cudaMemcpyHostToDevice ) ;
    cudaMemcpy ( V2_D , V2_H , N*sizeof(float) , cudaMemcpyHostToDevice ) ;

    Vector_Dot_Product <<<blockPerGrid , ThreadPerBlock >>> (V1_D , V2_D , V3_D ) ;

    cudaMemcpy ( V3_H , V3_D , N*sizeof(float) , cudaMemcpyDeviceToHost ) ;

    cout <<"\n Vector Dot Prodcut is : " ;
    float sum = 0 ;
    for ( int i = 0 ; i<blockPerGrid ; i++ )
        sum+=V3_H[i] ;
    cout << sum << endl ;

    cudaFree ( V1_D) ;
    cudaFree ( V2_D) ;
    cudaFree ( V3_D) ;

    delete [] V1_H ;
    delete [] V2_H ;
    delete [] V3_H ;
}
Please tell me what the problem is in this code. I can't figure it out. Thanks in advance.
Regarding this:
identifier “__synchthreads” is undefined
Wherever you have this:
__synchthreads();
You should change it to this:
__syncthreads();
Regarding this:
expression must be a modifiable lvalue
Since you have defined tid as const here:
const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x ;
You are not allowed to try and change it here:
tid += blockDim.x * gridDim.x ;
So the simplest solution might be to just drop the const from the tid definition:
unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x ;
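Putting both fixes together, the relevant part of the kernel would look roughly like this (a sketch only; it also initializes temp to 0, which the original code leaves uninitialized):

    __shared__ float chache[ThreadPerBlock] ;
    float temp = 0.0f ;                                           // accumulator must start at zero

    unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x ;    // no longer const
    const unsigned int chacheindex = threadIdx.x ;

    while ( tid < N )
    {
        temp += V1[tid] * V2[tid] ;
        tid += blockDim.x * gridDim.x ;                           // now allowed: tid is modifiable
    }

    chache[chacheindex] = temp ;
    __syncthreads() ;                                             // correct spelling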

from thrust to arrayfire - gfor usage?

I am trying to replace some thrust calls with ArrayFire calls to check the performance.
I am not sure if I am using ArrayFire properly, because the results I am getting do not match at all.
So, for example, the thrust code I am using is:
cudaMalloc( (void**) &devRow, N * sizeof(float) );
...//devRow is filled

thrust::device_ptr<float> SlBegin( devRow );
for ( int i = 0; i < N; i++, SlBegin += PerSlElmts )
{
    thrust::inclusive_scan( SlBegin, SlBegin + PerSlElmts, SlBegin );
}
cudaMemcpy( theRow, devRow, N * sizeof(float), cudaMemcpyDeviceToHost );
//use theRow...

ArrayFire:

af::array SlBegin( N , devRow );
for ( int i = 0; i < N; i++, SlBegin += PerSlElmts )
{
    accum( SlBegin );
}
cudaMemcpy( theRow, devRow, N * sizeof(float), cudaMemcpyDeviceToHost );
//use theRow..
I am not sure how ArrayFire handles the copy in af::array SlBegin( N , devRow );. In thrust we have the device pointer, which wraps devRow as SlBegin, but what happens in ArrayFire?
Also, I wanted to ask about using gfor.
On the ArrayFire webpage, it states:
Do not use this function directly; see GFOR: Parallel For-Loops.
And then for GFOR:
GFOR is disabled in the current version of ArrayFire
So, we can't use gfor?
---------UPDATE---------------------------
I have a small running example which shows the different results:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include "arrayfire.h"
#include <thrust/scan.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

__global__ void Kernel( const int N , float * const devRow )
{
    int i = threadIdx.x;
    if ( i < N )
        devRow[ i ] = i;
}

int main(){
    int N = 6;
    int Slices = 2;
    int PerSlElmts = 3;

    float * theRow = (float*) malloc ( N * sizeof( float ));
    for ( int i = 0; i < N; i ++ )
        theRow[ i ] = 0;

    // raw pointer to device memory
    float * devRow;
    cudaMalloc( (void **) &devRow, N * sizeof( float ) );

    Kernel<<< 1,N >>>( N , devRow );
    cudaDeviceSynchronize();

    // wrap raw pointer with a device_ptr
    thrust::device_ptr<float> SlBegin( devRow );
    for ( int i = 0; i < Slices; i++ , SlBegin += PerSlElmts )
        thrust::inclusive_scan( SlBegin, SlBegin + PerSlElmts , SlBegin );

    cudaMemcpy( theRow, devRow, N * sizeof(float), cudaMemcpyDeviceToHost );
    for ( int i = 0; i < N; i++ )
        printf("\n Thrust accum : %f",theRow[ i ] );

    //--------------------------------------------------------------------//

    Kernel<<< 1,N >>>( N , devRow );
    cudaDeviceSynchronize();

    af::array SlBeginFire( N, devRow );
    for ( int i = 0; i < Slices; i++ , SlBeginFire += PerSlElmts )
        af::accum( SlBeginFire );
    SlBeginFire.host( theRow );
    for ( int i = 0; i < N; i++ )
        printf("\n Arrayfire accum : %f",theRow[ i ] );

    cudaFree( devRow );
    free( theRow );
    return 0;
}
It looks like you are trying to run a column-wise (0th-dim in ArrayFire) scan on a 2D array. Here is some code that you could use:
af::array SlBegin(N, devRow);
af::array result = accum(SlBegin, 0);
Here is a sample output
A [5 3 1 1]
0.7402 0.4464 0.7762
0.9210 0.6673 0.2948
0.0390 0.1099 0.7140
0.9690 0.4702 0.3585
0.9251 0.5132 0.6814
accum(A, 0) [5 3 1 1]
0.7402 0.4464 0.7762
1.6612 1.1137 1.0709
1.7002 1.2236 1.7850
2.6692 1.6938 2.1435
3.5943 2.2070 2.8249
This runs an inclusive scan on each column independently.
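Applied to the running example above, a sketch (assuming the af::array(dim0, dim1, pointer, source) constructor, and that devRow holds Slices contiguous slices of PerSlElmts elements each) might look like this:

// Sketch only: wrap the existing device buffer as a PerSlElmts x Slices array.
// afDevice tells ArrayFire that devRow is already a device pointer
// (note: ArrayFire may then take ownership of that pointer).
af::array A( PerSlElmts, Slices, devRow, afDevice );

// Inclusive scan down each column, i.e. one scan per slice,
// which mirrors the per-slice thrust::inclusive_scan loop.
af::array result = af::accum( A, 0 );

// Copy the result back to the host buffer for comparison.
result.host( theRow );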
As for gfor, it has been added to the Open Source version of ArrayFire. As this code base is still a beta, improvements and fixes are taking place very rapidly. So keep a watch on our github page.

thrust operations empty host array

I want to do some thrust operations but I am not sure exactly how.
Right now, I am getting back an array full of zeros (the h_a array).
I have:
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/generate.h>

template <typename T>
struct square
{
    __host__ __device__
    T operator()( const T& x ) const
    {
        return x * x;
    }
};

int
main(
    int argc,
    const char * argv[] )
{
    const size_t NbOfPoints = 256;

    int BlocksPerGridX = 16;
    int BlocksPerGridY = 16;
    int ThreadsPerBlockX = 16;
    int ThreadsPerBlockY = 16;

    // generate random data on the host
    thrust::host_vector<float> h_Kx ( NbOfPoints );
    thrust::generate( h_Kx.begin(), h_Kx.end(), rand );
    thrust::host_vector<float> h_Ky ( NbOfPoints );
    thrust::generate( h_Ky.begin(), h_Ky.end(), rand );

    // transfer to device
    thrust::device_vector<float> dev_Kx = h_Kx;
    thrust::device_vector<float> dev_Ky = h_Ky;

    // create arrays for holding the number of threads per block in each dimension
    int * X , * Y;
    cudaMalloc((void **) &X, ThreadsPerBlockX * BlocksPerGridX * sizeof(*X) );
    cudaMalloc((void **) &Y, ThreadsPerBlockY * BlocksPerGridY * sizeof(*Y) );

    // wrap raw pointer with a device_ptr
    thrust::device_ptr<int> dev_X ( X );
    thrust::device_ptr<int> dev_Y ( Y );

    // use device_ptr in Thrust algorithms
    thrust::fill( dev_X, dev_X + ( ThreadsPerBlockX * BlocksPerGridX ) , (int) 0 );
    thrust::fill( dev_Y, dev_Y + ( ThreadsPerBlockY * BlocksPerGridY ) , (int) 0 );

    // setup arguments
    square<float> square_op;

    // create various vectors
    thrust::device_vector<int> distX ( NbOfPoints );
    thrust::device_vector<int> distY ( NbOfPoints );
    thrust::device_vector<unsigned int> Tmp ( NbOfPoints );
    thrust::host_vector<unsigned int> h_a ( NbOfPoints );
    thrust::device_vector<unsigned int> distXSquared ( NbOfPoints );
    thrust::device_vector<unsigned int> distYSquared ( NbOfPoints );

    // compute distX = dev_Kx - dev_X and distY = dev_Ky - dev_Y
    thrust::transform( dev_Kx.begin(), dev_Kx.begin(), dev_X , distX.begin() , thrust::minus<float>() );
    thrust::transform( dev_Ky.begin(), dev_Ky.begin(), dev_Y , distY.begin() , thrust::minus<float>() );

    // square distances
    thrust::transform( distX.begin(), distX.end(), distXSquared.begin(), square_op );
    thrust::transform( distY.begin(), distY.end(), distYSquared.begin(), square_op );

    // compute Tmp = distX + distY
    thrust::transform( distXSquared.begin() ,distXSquared.begin() , distYSquared.begin() , Tmp.begin() , thrust::plus<unsigned int>() );
    thrust::copy( Tmp.begin(), Tmp.end(), h_a.begin() );

    for ( int i = 0; i < 5; i ++ )
        printf("\n temp = %u",h_a[ i ] );

    return 0;
}
UPDATE:
Apart from the edits by Robert Crovella, you must also switch these to integer types:
square<int> square_op;
thrust::transform( dev_Kx.begin(), dev_Kx.end(), dev_X , distX.begin() , thrust::minus<int>() );
thrust::transform( dev_Ky.begin(), dev_Ky.end(), dev_Y , distY.begin() , thrust::minus<int>() );
You've got several instances of doing zero-length transforms:
thrust::transform( dev_Kx.begin(), dev_Kx.begin(), dev_X , distX.begin() , thrust::minus<float>() );
thrust::transform( dev_Ky.begin(), dev_Ky.begin(), dev_Y , distY.begin() , thrust::minus<float>() );
and:
thrust::transform( distXSquared.begin() ,distXSquared.begin() , distYSquared.begin() , Tmp.begin() , thrust::plus<unsigned int>() );
Since the first two parameters to each of the above transforms are the same, the work being done is zero. Presumably you want the corresponding .end() iterators in the second position rather than .begin()
When I make those changes, I get non-zero values printed out. They are quite large, but you appear to be squaring large values, so I'm not sure what your intent is.
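For reference, a sketch of those transform calls with the .end() iterators in place (and, following the update above, integer functors to match the int/unsigned vectors):

// each transform now covers the full range [begin, end) instead of the
// empty range [begin, begin)
thrust::transform( dev_Kx.begin(), dev_Kx.end(), dev_X , distX.begin() , thrust::minus<int>() );
thrust::transform( dev_Ky.begin(), dev_Ky.end(), dev_Y , distY.begin() , thrust::minus<int>() );

// square the distances
thrust::transform( distX.begin(), distX.end(), distXSquared.begin(), square_op );
thrust::transform( distY.begin(), distY.end(), distYSquared.begin(), square_op );

// Tmp = distXSquared + distYSquared, again over the full range
thrust::transform( distXSquared.begin() , distXSquared.end() ,
                   distYSquared.begin() , Tmp.begin() ,
                   thrust::plus<unsigned int>() );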

Multi GPU performance degrade when allocated memory increases

I've tested the following on a GTX 690 GPU with 4GB RAM in Windows 7 x64, Visual C++ 10:
I've written a function that receives 2 vectors and adds them into a 3rd vector. The task is split across 2 GPU devices. I gradually increased the vector size to benchmark GPU performance. The required time increases linearly with vector size up to a certain point, and then it abruptly jumps up. When I disable either of the two GPUs, the required time stays linear up to the end of available memory. I've enclosed a diagram displaying required time versus allocated memory.
You can see the speed diagram here: Speed Comparison Diagram!
Can you tell me what is wrong?
Bests,
Ramin
This is my code:
unsigned BenchMark( unsigned VectorSize )
{
    unsigned * D[ 2 ][ 3 ] ;

    for ( int i = 0 ; i < 2 ; i++ )
    {
        cudaSetDevice( i ) ;
        for ( int j = 0 ; j < 3 ; j++ )
            cudaMalloc( & D[ i ][ j ] , VectorSize * sizeof( unsigned ) ) ;
    }

    unsigned uStartTime = clock() ;

    // TEST
    for ( int i = 0 ; i < 2 ; i++ )
    {
        cudaSetDevice( i ) ;
        AddKernel<<<VectorSize/256,256>>>(
            D[ i ][ 0 ] ,
            D[ i ][ 1 ] ,
            D[ i ][ 2 ] ,
            VectorSize ) ;
    }

    cudaDeviceSynchronize() ;
    cudaSetDevice( 0 ) ;
    cudaDeviceSynchronize() ;

    unsigned uEndTime = clock() ;

    for ( int i = 0 ; i < 2 ; i++ )
    {
        cudaSetDevice( i ) ;
        for ( int j = 0 ; j < 3 ; j++ )
            cudaFree( D[ i ][ j ] ) ;
    }

    return uEndTime - uStartTime ;
}

__global__ void AddKernel(
    const Npp32u * __restrict__ pSource1 ,
    const Npp32u * __restrict__ pSource2 ,
    Npp32u * __restrict__ pDestination ,
    unsigned uLength )
{
    unsigned x = blockIdx.x * blockDim.x + threadIdx.x ;
    if ( x < uLength )
        pDestination[ x ] = pSource1[ x ] + pSource2[ x ] ;
}
I found the answer. The problem happened because SLI was active; I disabled it and now it works smoothly.
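As a side note on the measurement itself, here is a sketch of how one could time each GPU's kernel in isolation with CUDA events (it reuses the same D[][] buffers and AddKernel as above; unlike the concurrent launches in BenchMark, this serializes the two devices, so it is only meant to show where a slowdown occurs):

float ms[ 2 ] ;
for ( int i = 0 ; i < 2 ; i++ )
{
    cudaSetDevice( i ) ;

    cudaEvent_t start , stop ;
    cudaEventCreate( & start ) ;
    cudaEventCreate( & stop ) ;

    cudaEventRecord( start ) ;
    AddKernel<<< VectorSize / 256 , 256 >>>( D[ i ][ 0 ] , D[ i ][ 1 ] , D[ i ][ 2 ] , VectorSize ) ;
    cudaEventRecord( stop ) ;

    cudaEventSynchronize( stop ) ;                        // wait for this device's kernel
    cudaEventElapsedTime( & ms[ i ] , start , stop ) ;    // GPU-side time in milliseconds

    cudaEventDestroy( start ) ;
    cudaEventDestroy( stop ) ;
}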

Learning CUDA, but currently stuck

So I've been trying to learn CUDA as of late, but am currently stuck and don't know what I'm doing wrong. I am trying to set the initial value of the opool array based on a random float between 0 and 1. If anyone could shed some light on what I did wrong it would be greatly appreciated.
Note - I omitted some code for brevity (cudaFree() & free() calls mainly). I apologize if I left any code of importance out.
__global__ void FirstLoop( int *opool, float *randomSet, int omax, int anumber )
{
    int tid_loci = threadIdx.x;
    int tid_2 = threadIdx.y;
    int bid_omax = blockIdx.x;
    int index = omax*tid_loci*2 + omax*tid_2 + bid_omax;

    float r = randomSet[ index ];

    // Commented out code is what it should be set to, but they are set to 5 or 15
    // to determine if the values are correctly being set.
    if ( r < 0.99 )
        opool[ index ] = 15; //(int)((r * 100.0) * -1.0);
    else
        opool[ index ] = 5; //(int)((r)*(float)(anumber-4)) +5;
}

int main()
{
    int loci = 10;
    int omax = 20;

    // Data stored on the host
    int *h_opool;
    float *h_randomSet;

    // Data stored on the device
    int *d_opool;
    float *d_randomSet;

    int poolSize = helpSize * omax;
    int randomSize = loci * 2 * omax * sizeof(float);

    // RESIZE ARRAYS TO NEEDED SIZE
    h_opool = (int*)malloc( poolSize );
    h_randomSet = (float*)malloc( randomSize );
    cudaMalloc( &d_opool, poolSize );
    cudaMalloc( &d_randomSet, randomSize );

    for (sim=0; sim<smax; sim++)
    {
        for (i=0; i<poolSize; i++)
            h_randomSet[i] = rndm();

        dim3 blocks(omax);
        dim3 thread(loci, 2);

        cudaMemcpy( d_randomSet, h_randomSet, randomSize, cudaMemcpyHostToDevice );
        cudaMemcpy( d_opool, h_opool, poolSize, cudaMemcpyHostToDevice );

        FirstLoop<<< blocks, thread >>>( d_opool, d_randomSet, omax, anumber );

        cudaMemcpy( h_opool, d_opool, poolSize, cudaMemcpyDeviceToHost );
        // Here is when I call printf to see the values stored in h_opool, but they are
        // completely wrong
    }
}

float rndm()
{
    int random = rand();
    return ((float)random / (float)RAND_MAX);
}
Change the following
int index = omax*tid_loci*2 + omax*tid_2 + bid_omax;
to
int index = bid_omax * tid_2 + tid_loci;
However, a block configuration of 10x2 may not be the most ideal one. Try using 32 x 1 or 16 x 2.
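More generally, a common way to compute a unique linear index for a 1D grid of 2D thread blocks looks like the sketch below. Whether this matches the intended data layout depends on how h_randomSet is filled on the host, so treat it as an illustration rather than a drop-in fix.

__global__ void FirstLoop( int *opool, float *randomSet, int omax, int anumber )
{
    // one thread per element: blocks along x, a 2D (x, y) arrangement inside each block
    int threadsPerBlock = blockDim.x * blockDim.y;
    int index = blockIdx.x * threadsPerBlock
              + threadIdx.y * blockDim.x
              + threadIdx.x;

    float r = randomSet[ index ];
    opool[ index ] = ( r < 0.99f ) ? 15 : 5;   // placeholder values from the question
}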