CUDA kernel seems not to be called

May I know why the following simple CUDA program fails on a device with compute capability 5.2?
#include <iostream>
#include <math.h>
#include <stdlib.h>
#define N 1
__global__ void vector_addition( int *a, int *b, int *c )
{
    int i = blockDim.x + blockIdx.x + threadIdx.x;
    if ( i < N )
        c[ i ] = a[ i ] + b[ i ];
}
int main()
{
    size_t bytes = N*sizeof( int );
    int *A = (int *)malloc(bytes);
    int *B = (int *)malloc(bytes);
    int *C = (int *)malloc(bytes);
    int *d_A, *d_B, *d_C;
    cudaMalloc( &d_A, bytes );
    cudaMalloc( &d_B, bytes );
    cudaMalloc( &d_C, bytes );
    for ( int i = 0; i < N; i++ ) {
        A[ i ] = 1; B[ i ] = 2; C[ i ] = 0;
    }
    cudaMemcpy( d_A, A, bytes, cudaMemcpyHostToDevice );
    cudaMemcpy( d_B, B, bytes, cudaMemcpyHostToDevice );
    int thr_per_blk = 1024;
    int blk_in_grid = ceil( float( N ) / thr_per_blk );
    vector_addition<<< blk_in_grid, thr_per_blk >>>( d_A, d_B, d_C );
    cudaMemcpy( C, d_C, bytes, cudaMemcpyDeviceToHost );
    for ( int i = 0; i < N; i++ ) {
        if ( C[ i ] != 3 ) {
            std::cout << "error\n";
        }
    }
    free( A ); free( B ); free( C );
    cudaFree( d_A ); cudaFree( d_B ); cudaFree( d_C );
    return 0;
}
The output is the error message.

This line in your kernel is not correct:
int i = blockDim.x + blockIdx.x + threadIdx.x;
That is not the proper way to generate a 1D index. It should be:
int i = blockDim.x * blockIdx.x + threadIdx.x;
With your incorrect indexing, the first thread, which should generate a 0 for a globally unique index, generates 1024+0+0 = 1024. This fails the if test in your kernel, so no threads actually do anything.
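For reference, here is a corrected version of the kernel; only the index computation changes:
__global__ void vector_addition( int *a, int *b, int *c )
{
    // block offset (blockDim.x * blockIdx.x) plus the thread's offset within the block
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if ( i < N )
        c[ i ] = a[ i ] + b[ i ];
}
With this indexing, thread 0 of block 0 computes i = 1024*0 + 0 = 0, which passes the if test and performs the addition.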

Related

from thrust to arrayfire - gfor usage?

I am trying to replace some Thrust calls with ArrayFire calls to check the performance.
I am not sure if I am using ArrayFire properly, because the results I am getting do not match at all.
So, for example, the Thrust code I am using is:
cudaMalloc( (void**) &devRow, N * sizeof(float) );
...//devRow is filled
thrust::device_ptr<float> SlBegin( devRow );
for ( int i = 0; i < N; i++, SlBegin += PerSlElmts )
{
    thrust::inclusive_scan( SlBegin, SlBegin + PerSlElmts, SlBegin );
}
cudaMemcpy( theRow, devRow, N * sizeof(float), cudaMemcpyDeviceToHost );
//use theRow...
ArrayFire:
af::array SlBegin( N, devRow );
for ( int i = 0; i < N; i++, SlBegin += PerSlElmts )
{
    accum( SlBegin );
}
cudaMemcpy( theRow, devRow, N * sizeof(float), cudaMemcpyDeviceToHost );
//use theRow..
I am not sure how ArrayFire handles the copy in af::array SlBegin( N, devRow );. In Thrust we have the device pointer which points from devRow to SlBegin, but what happens in ArrayFire?
Also, I wanted to ask about using gfor.
The ArrayFire webpage states:
Do not use this function directly; see GFOR: Parallel For-Loops.
And then for GFOR:
GFOR is disabled in the current version of ArrayFire
So, we can't use gfor?
---------UPDATE---------------------------
I have a small running example which shows the different results:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include "arrayfire.h"
#include <thrust/scan.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
__global__ void Kernel( const int N, float * const devRow )
{
    int i = threadIdx.x;
    if ( i < N )
        devRow[ i ] = i;
}
int main(){
    int N = 6;
    int Slices = 2;
    int PerSlElmts = 3;
    float * theRow = (float*) malloc ( N * sizeof( float ));
    for ( int i = 0; i < N; i++ )
        theRow[ i ] = 0;
    // raw pointer to device memory
    float * devRow;
    cudaMalloc( (void **) &devRow, N * sizeof( float ) );
    Kernel<<< 1, N >>>( N, devRow );
    cudaDeviceSynchronize();
    // wrap raw pointer with a device_ptr
    thrust::device_ptr<float> SlBegin( devRow );
    for ( int i = 0; i < Slices; i++, SlBegin += PerSlElmts )
        thrust::inclusive_scan( SlBegin, SlBegin + PerSlElmts, SlBegin );
    cudaMemcpy( theRow, devRow, N * sizeof(float), cudaMemcpyDeviceToHost );
    for ( int i = 0; i < N; i++ )
        printf("\n Thrust accum : %f", theRow[ i ] );
    //--------------------------------------------------------------------//
    Kernel<<< 1, N >>>( N, devRow );
    cudaDeviceSynchronize();
    af::array SlBeginFire( N, devRow );
    for ( int i = 0; i < Slices; i++, SlBeginFire += PerSlElmts )
        af::accum( SlBeginFire );
    SlBeginFire.host( theRow );
    for ( int i = 0; i < N; i++ )
        printf("\n Arrayfire accum : %f", theRow[ i ] );
    cudaFree( devRow );
    free( theRow );
    return 0;
}
It looks like you are trying to run a column-wise (0th-dim in ArrayFire) scan on a 2D array. Here is some code that you could use:
af::array SlBegin(N, devRow);
af::array result = accum(SlBegin, 0);
Here is a sample output:
A [5 3 1 1]
0.7402 0.4464 0.7762
0.9210 0.6673 0.2948
0.0390 0.1099 0.7140
0.9690 0.4702 0.3585
0.9251 0.5132 0.6814
accum(A, 0) [5 3 1 1]
0.7402 0.4464 0.7762
1.6612 1.1137 1.0709
1.7002 1.2236 1.7850
2.6692 1.6938 2.1435
3.5943 2.2070 2.8249
This runs an inclusive scan on each column independently.
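If the data starts out as a flat device buffer, as in the question's update, one way to get this behaviour is to reshape the buffer so that each slice becomes a column before scanning. This is a minimal sketch, assuming N = Slices * PerSlElmts and that constructing the array with afDevice hands ownership of devRow over to ArrayFire:
// Wrap the raw device pointer, reshape the flat N-element buffer into
// PerSlElmts x Slices (one slice per column), then scan every column.
af::array flat( N, devRow, afDevice );
af::array slices = af::moddims( flat, PerSlElmts, Slices );
af::array scanned = af::accum( slices, 0 ); // inclusive scan down dim 0
scanned.host( theRow );                     // copy the results back to the host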
As for gfor, it has been added to the open source version of ArrayFire. As this code base is still in beta, improvements and fixes are landing very rapidly, so keep a watch on our GitHub page.
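For completeness, once you are on a build where gfor is enabled, the per-slice scan could be expressed as a parallel loop. This is an untested sketch based on the GFOR documentation:
// Hypothetical gfor sketch: A holds one slice per column; the gfor body
// is executed for all Slices columns in parallel rather than serially.
af::array A = af::randu( PerSlElmts, Slices );
af::array B = af::constant( 0, PerSlElmts, Slices );
gfor ( af::seq i, Slices ) {
    B( af::span, i ) = af::accum( A( af::span, i ) );
}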

From given vector find max value and its index by reduction method in CUDA

I am new to CUDA. To find the max value of a vector and its index, I use CUDA.
Here is my code:
#include <cuda.h>
#include <stdio.h>
#include <time.h>
#include <iostream>
using namespace std;
#define tbp 256
#define nblocks 1
__global__ void kernel_max(int *a, int *d, int *index, int *idx)
{
    __shared__ int sdata[tbp]; //"static" shared memory
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = a[i];
    index[tid] = i;
    __syncthreads();
    for (int s = tbp/2; s >= 1; s = s/2)
    {
        if (tid < s)
        {
            if (sdata[tid] < sdata[tid + s])
            {
                sdata[tid] = sdata[tid + s];
                index[tid] = index[tid + s];
                __syncthreads();
            }
            __syncthreads();
        }
        __syncthreads();
    }
    __syncthreads();
    if (tid == 0)
    {
        d[blockIdx.x] = sdata[0];
        idx[blockIdx.x] = index[0];
    }
    __syncthreads();
}
int main()
{
    int i;
    const int N = tbp*nblocks;
    srand(time(NULL));
    int *a;
    a = (int*)malloc(N * sizeof(int));
    int *d;
    d = (int*)malloc(nblocks * sizeof(int));
    int *index;
    index = (int*)malloc(N * sizeof(int));
    int *idx;
    idx = (int*)malloc(nblocks * sizeof(int));
    int *dev_a, *dev_d, *dev_index, *dev_idx;
    cudaMalloc((void **) &dev_a, N*sizeof(int));
    cudaMalloc((void **) &dev_d, nblocks*sizeof(int));
    cudaMalloc((void **) &dev_index, N*sizeof(int));
    cudaMalloc((void **) &dev_idx, nblocks*sizeof(int));
    int mmm = 0;
    int ddd = 0;
    for (i = 0; i < N; i++)
    {
        a[i] = rand() % 100 + 5;
        index[i] = i;
        //printf("%d\n",a[i]);
        if (mmm < a[i])
        {
            mmm = a[i];
            ddd = i;
        }
    }
    cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_index, index, N*sizeof(int), cudaMemcpyHostToDevice);
    kernel_max <<< nblocks, tbp >>>(dev_a, dev_d, dev_index, dev_idx);
    cudaMemcpy(d, dev_d, nblocks*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(index, dev_index, N*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(idx, dev_idx, nblocks*sizeof(int), cudaMemcpyDeviceToHost);
    printf("cpu max = %d, gpu max = %d, cpu index: %d, gpu index: %d", mmm, d[0], ddd, idx[0]);
    printf("\n");
    if (ddd != idx[0])
    {
        cout << "index mismatch! damn!!" << endl;
    }
    else
    {
        cout << "congratulations!!" << endl;
    }
    /*
    for (i = 0; i < N; i++)
        cout << *(index+i) << endl;
    */
    cudaFree(dev_a);
    cudaFree(dev_d);
    cudaFree(dev_index);
    cudaFree(dev_idx);
    free(a);
    free(d);
    free(index);
    free(idx);
    return 0;
}
The problem is that for tbp < 128 it gets the correct result in both value and index, but when increased to 256, 512, or 1024, the result will sometimes go wrong.
Can anyone give an explanation for this situation? Thanks.
Use another loop to deal with the index, to avoid the problem where the same max value appears at different indices in this computation:
int temp = 0;
for (i = 0; i < tbp; i++)
{
    if (d[blockIdx.x] == a[i] && temp == 0)
    {
        temp = i;
    }
}
idx[0] = temp;
You need to set int temp = -1 instead of 0 (and test temp == -1) to handle the case of the maximum value being located at index 0; with a 0 sentinel, a later duplicate of the maximum would overwrite the correct index.
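Putting the answer and the correction together, a sketch of the fixed search loop (reading it as host code run after the kernel, with nblocks == 1 so d[0] holds the block maximum):
// The -1 sentinel distinguishes "no match found yet" from a maximum
// located at index 0, so the first matching index is kept and later
// duplicates of the maximum are ignored.
int temp = -1;
for (i = 0; i < tbp; i++)
{
    if (d[0] == a[i] && temp == -1)
    {
        temp = i;
    }
}
idx[0] = temp;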

CUDA Performance - Always return different values

This is my code:
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
using namespace std;
const int N = 8000;
void fillArray(int *data, int count) {
    for (int i = 0; i < count; i++)
        data[i] = rand() % 100;
}
__global__ void add(int *a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}
__global__ void subtract(int *a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] - b[tid];
    }
}
__global__ void multiply(int *a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] * b[tid];
    }
}
__global__ void divide(int *a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] / b[tid];
    }
}
__global__ void modu(int *a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] % b[tid];
    }
}
__global__ void neg(int *data, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = -data[tid];
    }
}
float duration(int *devA, int *devB, int *devC, int blocksPerGrid, int threadsPerBlock) {
    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    int hArrayC[N];
    add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    cudaMemcpy(hArrayC, devC, N*sizeof(int), cudaMemcpyDeviceToHost);
    subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    cudaMemcpy(hArrayC, devC, N*sizeof(int), cudaMemcpyDeviceToHost);
    multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    cudaMemcpy(hArrayC, devC, N*sizeof(int), cudaMemcpyDeviceToHost);
    divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    cudaMemcpy(hArrayC, devC, N*sizeof(int), cudaMemcpyDeviceToHost);
    modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    cudaMemcpy(hArrayC, devC, N*sizeof(int), cudaMemcpyDeviceToHost);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devA, devC);
    cudaMemcpy(hArrayC, devC, N*sizeof(int), cudaMemcpyDeviceToHost);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devB, devC);
    cudaMemcpy(hArrayC, devC, N*sizeof(int), cudaMemcpyDeviceToHost);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return elapsedTime;
}
int main(void) {
    int *a, *b;
    a = new int[N];
    b = new int[N];
    float dur = 0;
    int *devA, *devB, *devC;
    cudaMalloc((void**) &devA, N * sizeof(int));
    cudaMalloc((void**) &devB, N * sizeof(int));
    cudaMalloc((void**) &devC, N * sizeof(int));
    fillArray(a, N);
    fillArray(b, N);
    cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devB, b, N * sizeof(int), cudaMemcpyHostToDevice);
    dur = duration(devA, devB, devC, N, 1);
    cout << "Global memory version:\n";
    cout << "Process completed in " << dur;
    cout << " for a data set of " << N << " integers.";
    cudaFree(devA);
    cudaFree(devB);
    delete [] a;
    delete [] b;
    return 0;
}
What I want to know is the total milliseconds spent in the duration function. But the milliseconds always come back different: sometimes it is 10 ms, sometimes 0.78652, sometimes 30 milliseconds. Why? What is wrong with my code?
This may be caused by the loading/unloading of the NVIDIA drivers. Think of it as an initialization step for the GPU.
You can either set your GPU to persistence mode:
nvidia-smi -pm 1
Or you could run a dummy kernel before timing your GPU code to trigger the loading of the drivers:
__global__ void dummy()
{
    // This kernel does nothing; it is just a "warm-up"
}
// Before your cudaEventRecord etc.
dummy<<<blocksPerGrid, threadsPerBlock>>>();
Or maybe just use cudaDeviceSynchronize() (the modern name for the deprecated cudaThreadSynchronize()) before timing your kernels.
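Putting these together, a minimal sketch of the pattern, reusing the events and kernels from the question's duration function:
// Warm-up launch: absorbs the one-time driver/context initialization
// so it is not counted in the measured interval.
dummy<<<blocksPerGrid, threadsPerBlock>>>();
cudaDeviceSynchronize(); // wait for initialization to finish
cudaEventRecord(start, 0);
add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
// ... the rest of the timed kernels and copies ...
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);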

wrong results in cudaMemcpyDeviceToHost

I am having problems with my first CUDA program. In it I try to add two arrays, but when I print the results they are completely wrong.
This is my code:
#include <stdio.h>
const int N = 10;
__global__ void add(int *a, int *b, int *c) {
    int tid = threadIdx.x;
    c[tid] = a[tid] + b[tid];
}
int main(){
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;
    cudaMalloc( (void**)&dev_a, N * sizeof(int) );
    cudaMalloc( (void**)&dev_b, N * sizeof(int) );
    cudaMalloc( (void**)&dev_c, N * sizeof(int) );
    for (int i = 0; i < N; i++) {
        a[i] = -i; b[i] = i * i;
    }
    cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice );
    cudaMemcpy( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice );
    add<<<1,N>>>(dev_a, dev_b, dev_c);
    cudaMemcpy( c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost );
    for (int i = 0; i < N; i++) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }
    cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
    return 0;
}
As you can see, I try to add a and b into c, but, I don't know why, it doesn't work.
And this is the output:
0 + 0 = 0
-1 + 1 = 0
-2 + 4 = 1139031032
-3 + 9 = 32737
-4 + 16 = 0
-5 + 25 = 0
-6 + 36 = 1203890432
-7 + 49 = 32767
-8 + 64 = 6299112
-9 + 81 = 0
Finally, I apologize for my English... after learning CUDA, the next thing I will do is improve it.
There is something wrong with your machine. Please do error checking, and you will get an idea of what it is.
When I compile and run your exact code, I get the following:
$ ./t81
0 + 0 = 0
-1 + 1 = 0
-2 + 4 = 2
-3 + 9 = 6
-4 + 16 = 12
-5 + 25 = 20
-6 + 36 = 30
-7 + 49 = 42
-8 + 64 = 56
-9 + 81 = 72
$
Here is your code modified with error checking. If you compile it and run it, I'm sure you'll discover a problem with your machine:
#include <stdio.h>
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
const int N = 10;
__global__ void add(int *a, int *b, int *c) {
    int tid = threadIdx.x;
    c[tid] = a[tid] + b[tid];
}
int main(){
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;
    cudaMalloc( (void**)&dev_a, N * sizeof(int) );
    cudaMalloc( (void**)&dev_b, N * sizeof(int) );
    cudaMalloc( (void**)&dev_c, N * sizeof(int) );
    cudaCheckErrors("cudamalloc fail");
    for (int i = 0; i < N; i++) {
        a[i] = -i; b[i] = i * i;
    }
    cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice );
    cudaMemcpy( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice );
    cudaCheckErrors("cuda memcpy fail");
    add<<<1,N>>>(dev_a, dev_b, dev_c);
    cudaMemcpy( c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost );
    cudaCheckErrors("cudamemcpy or cuda kernel fail");
    for (int i = 0; i < N; i++) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }
    cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
    return 0;
}
This poster did the same thing and discovered a problem with their machine.

Cannot read out Values from Texture Memory

Hi, I'm writing a simple program to practice working with texture memory. I just want to write my data into texture memory and write it back into global memory, but I cannot read out the values. Here is the code.
#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "HelloWorld.h"
#include "linearInterpolation_kernel4.cu"
using namespace std;
using std::cout;
const int blocksize = 16;
__global__
void hello(char *a, int *b) {
    a[threadIdx.x] += b[threadIdx.x];
}
////////////////////////////////////////////////////////////////////////////////
// These are CUDA helper functions
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
    if( cudaSuccess != err) {
        printf("%s(%i) : CUDA Runtime API error %d: %s.\n", file, line, (int)err, cudaGetErrorString( err ) );
    }
}
// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
inline void __getLastCudaError( const char *errorMessage, const char *file, const int line )
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err) {
        printf("%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
    }
}
int main()
{
    int N = 40;
    float *A;
    A = (float *) malloc(N*sizeof(float));
    float *B;
    B = (float *) malloc(N*sizeof(float));
    float *result;
    result = (float *) malloc(N*sizeof(float));
    float angle = 0.8f;
    for(int i = 0; i < N; i++){
        A[i] = i;   //(float)rand();
        B[i] = i+1; //(float)rand();
    }
    ipLinearTexture2(A, B, result, angle, N);
    float result2;
    result2 = (angle)*A[4] + (1-angle)*B[4];
    printf(" A %f B %f Result %f\n", A[4], B[4], result[4]);
    cout << result2 << endl;
    return 1;
}
void ipLinearTexture2(float *A, float* B, float* result, float angle, int N)
{
    float cuTime;
    int N2 = N * 2;
    float *dev_result;
    float **AB;
    AB = (float **) malloc( N * sizeof(float *));
    if(AB)
    {
        for(int i = 0; i < N; i++)
        {
            AB[i] = (float *) malloc( 2 * sizeof(float *));
        }
    }
    for (int i = 0; i < N; i++)
    {
        AB[i][0] = A[i];
        AB[i][1] = B[i];
    }
    cudaMalloc(&dev_result, N * sizeof(float));
    unsigned int size = N2 * sizeof(float);
    //cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaArray* cu_array;
    checkCudaErrors(cudaMallocArray( &cu_array, &channelDesc, N, 2));
    cudaMemcpy2DToArray(cu_array, 0, 0, AB, N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);
    // set texture parameters
    tex2.normalized = true;
    tex2.filterMode = cudaFilterModeLinear;
    tex2.addressMode[0] = cudaAddressModeWrap; //cudaAddressModeWrap;
    tex2.addressMode[1] = cudaAddressModeWrap; //cudaAddressModeClamp;
    checkCudaErrors(cudaBindTextureToArray( tex2, cu_array, channelDesc));
    dim3 dimBlock(10, 1, 1);
    dim3 dimGrid((int)ceil((double)N*2/dimBlock.x), 1, 1);
    transformKernel4<<< 256, 256, 0 >>>( dev_result, N, 2, angle);
    checkCudaErrors(cudaMemcpy(result, dev_result, N * sizeof(float), cudaMemcpyDeviceToHost));
    cout << "==================================================" << endl;
    for (int i = 0; i < N; i++)
    {
        cout << result[i] << " on " << i << endl;
    }
    cout << "==================================================" << endl;
    checkCudaErrors(cudaUnbindTexture(tex2));
    checkCudaErrors(cudaFree(dev_result));
    checkCudaErrors(cudaFreeArray(cu_array));
}
and here is the kernel code
#ifndef _SIMPLETEXTURE_KERNEL5_H_
#define _SIMPLETEXTURE_KERNEL5_H_
// Texture references
texture<float, 2, cudaReadModeElementType> tex2;
__global__ void
transformKernel4(float* g_odata, int width, int height, float theta)
{
    unsigned int xid = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yid = blockIdx.y * blockDim.y + threadIdx.y;
    if (xid >= width || yid >= height) return;
    float dx = 1.0f / (float)width;
    float dy = 1.0f / (float)height;
    float x = ((float)xid + 0.5f) * dx;
    float y = ((float)yid + 0.5f) * dy;
    float value = tex2D(tex2, x, y);
    printf("wert %f xid %i yid %i \n", value, xid, yid);
    g_odata[yid * width + xid] = value;
}
#endif // #ifndef _SIMPLETEXTURE_KERNEL5_H_
Can somebody tell me what I am doing wrong?
I have edited it to remove the first two logical mistakes. But why am I still not able to print out my data?
It was the wrong layout of the arrays. You cannot copy a multidimensional C array built from row pointers as if it were one contiguous block; you have to use a one-dimensional array that represents the multidimensional data.
I can see 2 logical errors here.
The first one is the one pointed out by #asm.
The output should be stored by calculating a linear index from the 2D x and y indices:
outputIndex = yid * width + xid;
The second one is that the memory allocated for the cudaArray structure is internally aligned (pitched). You should consider using the cudaMemcpy2DToArray function to avoid erroneous data copying:
cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);
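As a sketch of that last point (flatAB here is a hypothetical helper buffer, not part of the original code), the data can be flattened into one contiguous host block laid out as two rows of N floats, matching the N-wide, 2-high cudaArray:
// Row 0 holds A, row 1 holds B; spitch is the byte width of one source row.
float *flatAB = (float *) malloc( 2 * N * sizeof(float) );
for (int i = 0; i < N; i++)
{
    flatAB[i]     = A[i]; // row 0
    flatAB[N + i] = B[i]; // row 1
}
cudaMemcpy2DToArray(cu_array, 0, 0, flatAB,
                    N * sizeof(float), N * sizeof(float), 2,
                    cudaMemcpyHostToDevice);
free(flatAB);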