CUDA programming problem

I'm very new to CUDA. I'm using CUDA on Ubuntu 10.04 in device emulation mode.
I wrote some code to compute the square of an array, which is the following:
#include <stdio.h>
#include <cuda.h>
__global__ void square_array(float *a, int N)
{
    int idx = blockIdx.x + threadIdx.x;
    if (idx<=N)
        a[idx] = a[idx] * a[idx];
}
int main(void)
{
    float *a_h, *a_d;
    const int N = 10;
    size_t size = N * sizeof(float);
    a_h = (float *)malloc(size);
    cudaMalloc((void **) &a_d, size);
    for (int i=0; i<N; i++) a_h[i] = (float)i;
    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
    square_array <<< 1,10>>> (a_d, N);
    cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
    // Print results
    for (int i=0; i<N; i++) printf(" %f\n", a_h[i]);
    free(a_h);
    cudaFree(a_d);
    return 0;
}
When I run this code it shows no problem and gives me the proper output.
Now my problem is that when I use <<<2,5>>> or <<<5,2>>> the result is the same. What is happening on the GPU?
All I understand is that with <<<5,2>>> I just launch the kernel with 5 blocks containing 2 threads each.
Can anyone explain how the GPU handles this, or how the launch (kernel call) is implemented?
Now my real problem is that when I call the kernel with <<<1,10>>> it is OK; it shows the perfect result.
But when I call the kernel with <<<1,5>>> the result is the following:
0.000000
1.000000
4.000000
9.000000
16.000000
5.000000
6.000000
7.000000
8.000000
9.000000
Similarly, when I reduce or increase the second parameter in the kernel call it shows a different result. For example, when I change it to <<<1,4>>> it shows the following result:
0.000000
1.000000
4.000000
9.000000
4.000000
5.000000
6.000000
7.000000
8.000000
9.000000
Why is this result coming out?
Can anybody explain the working of the kernel launch call?
What does the blockDim variable contain?
Please help me understand the concept of launching a kernel call and how it works.
I searched the programming guide but it doesn't explain this very well.

The calculation of idx in your kernel code is incorrect. If you change it to:
int idx = blockDim.x * blockIdx.x + threadIdx.x;
You might find the results a little easier to understand.
EDIT: For any given kernel launch
square_array<<<gridDim,blockDim>>>(...)
on the GPU, the built-in variable blockDim will contain the x, y, and z components of the blockDim argument passed in the host-side kernel launch. Similarly, gridDim will contain the x and y components of the gridDim argument passed in the launch.
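As an illustrative sketch (my addition, not part of the original answer), here is how the corrected index plays out for the original example: with a launch such as square_array<<<2,5>>>(a_d, 10), every element gets exactly one thread.
// Sketch: how blockDim, blockIdx and threadIdx combine into a unique global index.
// For square_array<<<2,5>>>(a_d, 10), blockDim.x == 5, so:
//   block 0: idx = 5*0 + (0..4) = 0..4
//   block 1: idx = 5*1 + (0..4) = 5..9
__global__ void square_array(float *a, int N)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x; // unique across all blocks
    if (idx < N)                                     // guard against excess threads
        a[idx] = a[idx] * a[idx];
}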

Apart from what talonmies has said, you may want to do the following for better performance in real-world applications:
if (idx < N) {
    float tmp = a[idx];
    a[idx] = tmp * tmp;
}
The temporary keeps the loaded value in a register, so a[idx] is read from global memory only once.

The way kernels are invoked in CUDA is like so:
kernel<<<numBlocks,numThreads>>>(Kernel arguments);
This means that there will be numBlocks blocks with numThreads threads running in each block. For example, if you call
kernel<<<1,5>>>(Kernel args);
then 1 block will run with 5 threads running in parallel, and if you call
kernel<<<2,5>>>(Kernel args);
then there will be 2 blocks with 5 threads running in each. Unless you alter your device code, the maximum number of elements of the array that you are "squaring" is the product numBlocks*numThreads. This explains why not all of the values in your original array were squared.
I suggest you read through the CUDA_C_Programming_Guide.pdf that comes with the CUDA toolkit.
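As a further hedged sketch (my addition, not from the original answer): once the kernel uses the corrected global index, the usual pattern is to pick a block size and round the number of blocks up so that numBlocks*numThreads covers all N elements.
// Sketch: a launch configuration that covers any N (assumes the corrected kernel above).
int threadsPerBlock = 256;                                   // a common, device-dependent choice
int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock; // round up
square_array<<<numBlocks, threadsPerBlock>>>(a_d, N);        // the if (idx < N) guard handles the excess threads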

Related

Initialize constant global array CUDA C

I have a problem! I need to initialize a constant global array in CUDA C. To initialize the array I need to use a for loop. I need to do this because I have to use this array in some kernels, and my professor told me to define it as a constant visible only on the device.
How can I do this?
I want to do something like this:
#include <stdio.h>
#include <math.h>
#define N 8
__constant__ double H[N*N];
__global__ void prodotto(double *v, double *w){
    int k=threadIdx.x+blockDim.x*blockIdx.x;
    w[k]=0;
    for(int i=0;i<N;i++) w[k]=w[k]+H[k*N+i]*v[i];
}
int main(){
    double v[8]={1, 1, 1, 1, 1, 1, 1, 1};
    double *dev_v, *dev_w, *w;
    double *host_H;
    host_H=(double*)malloc((N*N)*sizeof(double));
    cudaMalloc((void**)&dev_v,sizeof(double));
    cudaMalloc((void**)&dev_w,sizeof(double));
    for(int k=0;k<N;k++){
        host_H[2*N*k+2*k]=1/1.414;
        host_H[2*N*k+2*k+1]=1/1.414;
        host_H[(2*k+1)*N+2*k]=1/1.414;
        host_H[(2*k+1)+2*k+1]=-1/1.414;
    }
    cudaMemcpyToSymbol(H, host_H, (N*N)*sizeof(double));
    cudaMemcpy(dev_v, v, N*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_w, w, N*sizeof(double), cudaMemcpyHostToDevice);
    prodotto<<<1,N>>>(dev_v, dev_w);
    cudaMemcpy(v, dev_v, N*sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(w, dev_w, N*sizeof(double), cudaMemcpyDeviceToHost);
    for(int i=0;i<N;i++) printf("\n%f %f", v[i], w[i]);
    return 0;
}
But the output is an array of zeros... I want the output array to be filled with the product of the matrix H (here stored as an array) and the array v.
Thanks!
Something like this should work:
#define DSIZE 32
__constant__ int mydata[DSIZE];
int main(){
    ...
    int *h_mydata;
    h_mydata = new int[DSIZE];
    for (int i = 0; i < DSIZE; i++)
        h_mydata[i] = ....; // initialize however you wish
    cudaMemcpyToSymbol(mydata, h_mydata, DSIZE*sizeof(int));
    ...
}
Not difficult. You can then use the __constant__ data directly in a kernel:
__global__ void mykernel(...){
    ...
    int myval = mydata[threadIdx.x];
    ...
}
You can read about __constant__ variables in the programming guide. __constant__ variables are read-only from the perspective of device code (kernel code). But from the host, they can be read from or written to using the cudaMemcpyToSymbol/cudaMemcpyFromSymbol API.
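For example (a minimal sketch of my own, reusing the mydata symbol above), the host can read the constant data back like this:
// Sketch: copy the __constant__ array back to the host (device -> host is the default kind).
int h_check[DSIZE];
cudaMemcpyFromSymbol(h_check, mydata, DSIZE * sizeof(int));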
EDIT: Based on the code you've now posted, there were at least 2 errors:
Your allocation sizes for dev_v and dev_w were not correct.
You had no host allocation for w.
The following code seems to work correctly for me with those 2 fixes:
$ cat t579.cu
#include <stdio.h>
#include <math.h>
#define N 8
__constant__ double H[N*N];
__global__ void prodotto(double *v, double *w){
    int k=threadIdx.x+blockDim.x*blockIdx.x;
    w[k]=0;
    for(int i=0;i<N;i++) w[k]=w[k]+H[k*N+i]*v[i];
}
int main(){
    double v[N]={1, 1, 1, 1, 1, 1, 1, 1};
    double *dev_v, *dev_w, *w;
    double *host_H;
    host_H=(double*)malloc((N*N)*sizeof(double));
    w =(double*)malloc( (N)*sizeof(double));
    cudaMalloc((void**)&dev_v,N*sizeof(double));
    cudaMalloc((void**)&dev_w,N*sizeof(double));
    for(int k=0;k<N;k++){
        host_H[2*N*k+2*k]=1/1.414;
        host_H[2*N*k+2*k+1]=1/1.414;
        host_H[(2*k+1)*N+2*k]=1/1.414;
        host_H[(2*k+1)+2*k+1]=-1/1.414;
    }
    cudaMemcpyToSymbol(H, host_H, (N*N)*sizeof(double));
    cudaMemcpy(dev_v, v, N*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_w, w, N*sizeof(double), cudaMemcpyHostToDevice);
    prodotto<<<1,N>>>(dev_v, dev_w);
    cudaMemcpy(v, dev_v, N*sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(w, dev_w, N*sizeof(double), cudaMemcpyDeviceToHost);
    for(int i=0;i<N;i++) printf("\n%f %f", v[i], w[i]);
    printf("\n");
    return 0;
}
$ nvcc -arch=sm_20 -o t579 t579.cu
$ cuda-memcheck ./t579
========= CUDA-MEMCHECK
1.000000 0.000000
1.000000 -0.707214
1.000000 -0.707214
1.000000 -1.414427
1.000000 1.414427
1.000000 0.707214
1.000000 1.414427
1.000000 0.707214
========= ERROR SUMMARY: 0 errors
$
A few notes:
Any time you're having trouble with CUDA code, it's good practice to use proper CUDA error checking (see the sketch after these notes).
You can run your code with cuda-memcheck (just as I have above) to get a quick read of whether any CUDA errors are encountered.
I've not verified the numerical results or worked through the math. If it's not what you wanted, I assume you can sort it out.
I've not made any changes to your code other than what seemed sensible to me to fix the obvious errors and make the results presentable for educational purposes. Certainly there can be discussions about preferred allocation methods, printf vs. cout, and what have you. I'm focused primarily on CUDA topics in this answer.
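As a hedged sketch (my addition, not part of the original answer), the error checking mentioned in the notes above is typically done with a small macro wrapped around every CUDA runtime call:
// Sketch: minimal CUDA runtime error-checking macro (needs <stdio.h> and <stdlib.h>).
#define CUDA_CHECK(call) do {                                   \
    cudaError_t err = (call);                                   \
    if (err != cudaSuccess) {                                   \
        fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                cudaGetErrorString(err), __FILE__, __LINE__);   \
        exit(EXIT_FAILURE);                                     \
    }                                                           \
} while (0)
// Usage: CUDA_CHECK(cudaMemcpy(dev_v, v, N*sizeof(double), cudaMemcpyHostToDevice));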

CUDA kernel works very slowly

I have a problem with accelerating my CUDA kernel. It looks like the kernel runs single-threaded: every thread waits for the previous thread, so it doesn't run in parallel.
Here is my kernel (I modified the LibTom library for the CUDA kernel):
__global__ void kernel(char* BiExponent, int lines)
{
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
if(threadID<1000){
mp_int BiNumber; //various from LibTom
mp_int RNumber;
mp_int ANumber;
mp_int MNumber;
mp_int TempNumber;
mp_init_device(&RNumber);
mp_init_device(&ANumber);
mp_init_device(&MNumber);
mp_init_device(&TempNumber);
mp_init_device(&BiNumber);
mp_read_radix_device(&RNumber, "100648686727131257488671170806992645347098870006145705670894593595064198763504906829344253213869592972491529868272101131220921074193778137252965944155929765587582637231372264910012095142603377767870875822235936330880194126549443874542394830706956638044950273189050162374717380508672959124318834975983480937576",10);
mp_read_radix_device(&ANumber, "39805067790951086730573861588172121787196543962580983242598202413750011891252460890446709601730030154661775311984755147556289281733978635511703976267279217024606927800989962204783456250825578178354787716873876536014210063984216741307040544888447847197648475195752689213083224036785420625437224428658490304276",10);
mp_read_radix_device(&MNumber, "129135516335051440235803237491679224882957576030599162234748304648924718545589827797866156951847154321645009878340207570056281485244329202363518578978799475118300745910542939512857296428327440920812107991347416747733387762031164387998805210106456861835748765549471962882426089437101578019500113090139371006775",10);
mp_read_radix_device(&TempNumber, "0",10);
char* cstr = new char[YDIM];
for(int i=0; i<YDIM; i++){
cstr[i] = BiExponent[(threadID*YDIM)+i];
}
mp_read_radix_device(&BiNumber,cstr ,10);
mp_exptmod_device(&ANumber, &BiNumber, &MNumber,&TempNumber); //TEMP = (A^Bi)mod M
if(mp_cmp_device(&RNumber,&TempNumber)==MP_EQ){ // IF(TEMP==R)
printf("TRUE\n");
}
mp_clear_device(&BiNumber);
mp_clear_device(&RNumber);
mp_clear_device(&MNumber);
mp_clear_device(&ANumber);
mp_clear_device(&TempNumber);
delete [] cstr;
// printf("x = %d\n", threadID);
}}
I start the kernel on the host like this:
kernel <<< 1024, 1 >>> (dev_Bi2dChar, lines);
The operation runs for 1000 numbers in 80 s. It's very slow and I don't know whether it is a bug. :/ I need some tips on how I can accelerate the app.
You are creating 1024 blocks of 1 thread when using:
kernel <<< 1024, 1 >>>
Is this really what you want? I would suggest creating 1 block with 1024 threads instead:
kernel <<< 1, 1024 >>>.
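As a hedged aside (my addition): launch configurations that use both several blocks and many threads per block generally make better use of the GPU, since a single block runs on only one multiprocessor. For the 1000 items in this question, one option would be:
// Sketch: cover 1000 work items with multiple blocks of 256 threads each.
int items = 1000;
int threadsPerBlock = 256;
int blocks = (items + threadsPerBlock - 1) / threadsPerBlock; // 4 blocks here
kernel<<<blocks, threadsPerBlock>>>(dev_Bi2dChar, lines);     // the if(threadID<1000) check filters the excess threads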

Dot Product in CUDA using atomic operations - getting wrong results

I am trying to implement the dot product in CUDA and compare the result with what MATLAB returns. My CUDA code (based on this tutorial) is the following:
#include <stdio.h>
#define N (2048 * 8)
#define THREADS_PER_BLOCK 512
#define num_t float
// The kernel - DOT PRODUCT
__global__ void dot(num_t *a, num_t *b, num_t *c)
{
    __shared__ num_t temp[THREADS_PER_BLOCK];
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    temp[threadIdx.x] = a[index] * b[index];
    __syncthreads(); //Synchronize!
    *c = 0.00;
    // Does it need to be tid==0 that
    // undertakes this task?
    if (0 == threadIdx.x) {
        num_t sum = 0.00;
        int i;
        for (i=0; i<THREADS_PER_BLOCK; i++)
            sum += temp[i];
        atomicAdd(c, sum);
        //WRONG: *c += sum; This read-write operation must be atomic!
    }
}
// Initialize the vectors:
void init_vector(num_t *x)
{
    int i;
    for (i=0 ; i<N ; i++){
        x[i] = 0.001 * i;
    }
}
// MAIN
int main(void)
{
    num_t *a, *b, *c;
    num_t *dev_a, *dev_b, *dev_c;
    size_t size = N * sizeof(num_t);
    cudaMalloc((void**)&dev_a, size);
    cudaMalloc((void**)&dev_b, size);
    cudaMalloc((void**)&dev_c, size);
    a = (num_t*)malloc(size);
    b = (num_t*)malloc(size);
    c = (num_t*)malloc(size);
    init_vector(a);
    init_vector(b);
    cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
    dot<<<N/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(dev_a, dev_b, dev_c);
    cudaMemcpy(c, dev_c, sizeof(num_t), cudaMemcpyDeviceToHost);
    printf("a = [\n");
    int i;
    for (i=0;i<10;i++){
        printf("%g\n",a[i]);
    }
    printf("...\n");
    for (i=N-10;i<N;i++){
        printf("%g\n",a[i]);
    }
    printf("]\n\n");
    printf("a*b = %g.\n", *c);
    free(a); free(b); free(c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}
and I compile it with:
/usr/local/cuda-5.0/bin/nvcc -m64 -I/usr/local/cuda-5.0/include -gencode arch=compute_20,code=sm_20 -o multi_dot_product.o -c multi_dot_product.cu
g++ -m64 -o multi_dot_product multi_dot_product.o -L/usr/local/cuda-5.0/lib64 -lcudart
Information about my NVIDIA cards can be found at http://pastebin.com/8yTzXUuK. I tried to verify the result in MATLAB using the following simple code:
N = 2048 * 8;
a = zeros(N,1);
for i=1:N
    a(i) = 0.001*(i-1);
end
dot_product = a'*a;
But as N increases, I'm getting significantly different results (for instance, for N=2048*32 CUDA returns 6.73066e+07 while MATLAB returns 9.3823e+07, and for N=2048*64 CUDA gives 3.28033e+08 while MATLAB gives 7.5059e+08). I'm inclined to believe that the discrepancy stems from the use of float in my C code, but if I replace it with double the compiler complains that atomicAdd does not support double parameters. How should I fix this problem?
Update: Also, for high values of N (e.g. 2048*64), I noticed that the result returned by CUDA changes at every run. This does not happen if N is low (e.g. 2048*8).
At the same time I have a more fundamental question: The variable temp is an array of size THREADS_PER_BLOCK and is shared between threads in the same block. Is it also shared between blocks, or does every block operate on a different copy of this variable? Should I think of the method dot as instructions to every block? Can someone elaborate on how exactly the jobs are split and how the variables are shared in this example?
Comment this line out of your kernel:
// *c = 0.00;
And add these lines to your host code, before the kernel call (after the cudaMalloc of dev_c):
num_t h_c = 0.0f;
cudaMemcpy(dev_c, &h_c, sizeof(num_t), cudaMemcpyHostToDevice);
And I believe you'll get results that match MATLAB, more or less.
The fact that you have this line in your kernel unprotected by any synchronization is messing you up. Every thread of every block, whenever they happen to execute, is zeroing out c as you have written it.
By the way, we can do significantly better with this operation in general by using a classical parallel reduction method. A basic (not optimized) illustration is here. If you combine that method with your usage of shared memory and a single atomicAdd at the end (one atomicAdd per block) you'll have a significantly improved implementation. Although it's not a dot product, this example combines those ideas.
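For illustration (a hedged sketch of my own, not the linked example), a per-block shared-memory reduction followed by one atomicAdd per block could look like this, assuming blockDim.x is a power of two and the grid exactly covers N as in the question:
// Sketch: tree reduction in shared memory, then one atomicAdd per block.
__global__ void dot_reduced(const num_t *a, const num_t *b, num_t *c)
{
    __shared__ num_t temp[THREADS_PER_BLOCK];
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    temp[threadIdx.x] = a[index] * b[index];
    __syncthreads();
    // Halve the number of active threads each step.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (threadIdx.x < stride)
            temp[threadIdx.x] += temp[threadIdx.x + stride];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        atomicAdd(c, temp[0]); // c must still be zeroed on the host before the launch
}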
Edit: responding to a question below in the comments:
A kernel function is the set of instructions that all threads in the grid (all threads associated with a kernel launch, by definition) execute. However, it's reasonable to think of execution as being managed per threadblock, since the threads in a threadblock are executing together to a large extent. However, even within a threadblock, execution is not necessarily in perfect lockstep across all threads. Normally when we think of lockstep execution, we think of a warp, which is a group of 32 threads in a single threadblock. Therefore, since execution amongst warps within a block can be skewed, this hazard was present even for a single threadblock. However, if there were only one threadblock, we could have gotten rid of the hazard in your code using appropriate sync and control mechanisms like __syncthreads() and if (threadIdx.x == 0), etc. But these mechanisms are useless for the general case of controlling execution across multiple threadblocks. Multiple threadblocks can execute in any order. The only defined sync mechanism across an entire grid is the kernel launch itself. Therefore, to fix your issue, we had to zero out c prior to the kernel launch.

Using CUDA texture memory for 1D interpolation

I'm trying to use texture memory to solve an interpolation problem, hopefully in a faster way than using global memory. This is the very first time I have used texture memory, so I'm oversimplifying my interpolation problem to a linear interpolation one. I'm already aware there are smarter and faster ways to do linear interpolation than the one reported below.
Here is the file Kernels_Interpolation.cuh. The __device__ function linear_kernel_GPU is omitted for simplicity, but is correct.
texture<cuFloatComplex,1> data_d_texture;
__global__ void linear_interpolation_kernel_function_GPU_texture(cuComplex* result_d, float* x_in_d, float* x_out_d, int M, int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;
    cuComplex datum;
    if(j<N)
    {
        result_d[j] = make_cuComplex(0.,0.);
        for(int k=0; k<M; k++)
        {
            datum = tex1Dfetch(data_d_texture,k);
            if (fabs(x_out_d[j]-x_in_d[k])<1.) result_d[j] = cuCaddf(result_d[j],cuCmulf(make_cuComplex(linear_kernel_GPU(x_out_d[j]-x_in_d[k]),0.),datum));
        }
    }
}
Here is the Kernels_Interpolation.cu function
extern "C" void linear_interpolation_function_GPU_texture(cuComplex* result_d, cuComplex* data_d, float* x_in_d, float* x_out_d, int M, int N){
cudaBindTexture(NULL, data_d_texture, data_d, M);
dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(N/BLOCK_SIZE + (N%BLOCK_SIZE == 0 ? 0:1),1);
linear_interpolation_kernel_function_GPU_texture<<<dimGrid,dimBlock>>>(result_d, x_in_d, x_out_d, M, N);
}
Finally, in the main program, the data_d array is allocated and initialized as follows
cuComplex* data_d; cudaMalloc((void**)&data_d,sizeof(cuComplex)*M);
cudaMemcpy(data_d,data,sizeof(cuComplex)*M,cudaMemcpyHostToDevice);
The result_d array has length N.
The strange thing is that the output is correctly computed only for the first 16 locations, although N>16; the others are 0s, e.g.
result.r[0] 0.563585 result.i[0] 0.001251
result.r[1] 0.481203 result.i[1] 0.584259
result.r[2] 0.746924 result.i[2] 0.820994
result.r[3] 0.510477 result.i[3] 0.708008
result.r[4] 0.362980 result.i[4] 0.091818
result.r[5] 0.443626 result.i[5] 0.984452
result.r[6] 0.378992 result.i[6] 0.011919
result.r[7] 0.607517 result.i[7] 0.599023
result.r[8] 0.353575 result.i[8] 0.448551
result.r[9] 0.798026 result.i[9] 0.780909
result.r[10] 0.728561 result.i[10] 0.876729
result.r[11] 0.143276 result.i[11] 0.538575
result.r[12] 0.216170 result.i[12] 0.861384
result.r[13] 0.994566 result.i[13] 0.993541
result.r[14] 0.295192 result.i[14] 0.270596
result.r[15] 0.092388 result.i[15] 0.377816
result.r[16] 0.000000 result.i[16] 0.000000
result.r[17] 0.000000 result.i[17] 0.000000
result.r[18] 0.000000 result.i[18] 0.000000
result.r[19] 0.000000 result.i[19] 0.000000
The rest of the code is correct, namely, if I replace linear_interpolation_kernel_function_GPU_texture and linear_interpolation_function_GPU_texture with functions using global memory everything is fine.
I have verified that I can correctly access texture memory until a certain location (which depends on M and N), for example 64, after which it returns 0s.
I have the same problem if I replace the cuComplex texture to a float one (forcing the data to be real).
Any ideas?
I can see one logical error in the following line of your program.
cudaBindTexture(NULL, data_d_texture, data_d, M);
The last argument of cudaBindTexture takes the size of the data in bytes, but you are specifying the number of elements.
You should try the following:
cudaBindTexture(NULL, data_d_texture, data_d, M * sizeof(cuComplex));
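As a hedged aside (my addition): cudaBindTexture also returns a cudaError_t, so checking the return value, and unbinding once the texture is no longer needed, can catch sizing mistakes like this one early.
// Sketch: bind with the size in bytes, check the status, unbind when done.
cudaError_t err = cudaBindTexture(NULL, data_d_texture, data_d, M * sizeof(cuComplex));
if (err != cudaSuccess) printf("cudaBindTexture failed: %s\n", cudaGetErrorString(err));
...
cudaUnbindTexture(data_d_texture);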

cuBLAS argmin -- segfault if outputing to device memory?

In cuBLAS, cublasIsamin() gives the argmin for a single-precision array.
Here's the full function declaration: cublasStatus_t cublasIsamin(cublasHandle_t handle, int n,
const float *x, int incx, int *result)
The cuBLAS programmer guide provides this information about the cublasIsamin() parameters:
If I use host (CPU) memory for result, then cublasIsamin works properly. Here's an example:
void argmin_experiment_hostOutput(){
    float h_A[4] = {1, 2, 3, 4}; int N = 4;
    float* d_A = 0;
    CHECK_CUDART(cudaMalloc((void**)&d_A, N * sizeof(d_A[0])));
    CHECK_CUBLAS(cublasSetVector(N, sizeof(h_A[0]), h_A, 1, d_A, 1));
    cublasHandle_t handle; CHECK_CUBLAS(cublasCreate(&handle));
    int result; //host memory
    CHECK_CUBLAS(cublasIsamin(handle, N, d_A, 1, &result));
    printf("argmin = %d, min = %f \n", result, h_A[result]);
    CHECK_CUBLAS(cublasDestroy(handle));
}
However, if I use device (GPU) memory for result, then cublasIsamin segfaults. Here's an example that segfaults:
void argmin_experiment_deviceOutput(){
    float h_A[4] = {1, 2, 3, 4}; int N = 4;
    float* d_A = 0;
    CHECK_CUDART(cudaMalloc((void**)&d_A, N * sizeof(d_A[0])));
    CHECK_CUBLAS(cublasSetVector(N, sizeof(h_A[0]), h_A, 1, d_A, 1));
    cublasHandle_t handle; CHECK_CUBLAS(cublasCreate(&handle));
    int* d_result = 0;
    CHECK_CUDART(cudaMalloc((void**)&d_result, 1 * sizeof(d_result[0]))); //just enough device memory for 1 result
    CHECK_CUDART(cudaMemset(d_result, 0, 1 * sizeof(d_result[0])));
    CHECK_CUBLAS(cublasIsamin(handle, N, d_A, 1, d_result)); //SEGFAULT!
    CHECK_CUBLAS(cublasDestroy(handle));
}
The Nvidia guide says that `cublasIsamin()` can output to device memory. What am I doing wrong?
Motivation: I want to compute the argmin() of several vectors concurrently in multiple streams. Outputting to host memory requires CPU-GPU synchronization and seems to kill the multi-kernel concurrency. So, I want to output the argmin to device memory instead.
The CUBLAS V2 API does support writing scalar results to device memory. But it doesn't support this by default. As per Section 2.4 "Scalar parameters" of the documentation, you need to use cublasSetPointerMode() to make the API aware that scalar argument pointers will reside in device memory. Note this also makes these level 1 BLAS functions asynchronous, so you must ensure that the GPU has completed the kernel(s) before trying to access the result pointer.
See this answer for a complete working example.
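For illustration (a minimal sketch of my own, reusing the handle, d_A, and d_result from the question):
// Sketch: switch cuBLAS to device pointer mode so the scalar result is written to device memory.
CHECK_CUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE));
CHECK_CUBLAS(cublasIsamin(handle, N, d_A, 1, d_result)); // now asynchronous
CHECK_CUDART(cudaDeviceSynchronize());                   // make sure the result is ready before using d_result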