Dynamic 2D array using double pointer in CUDA [duplicate] - cuda

I'm new to cuda. I want to add up two 2d array into a third array.
I use following code:
cudaMallocPitch((void**)&device_a, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_b, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_c, &pitch, 2*sizeof(int),2);
now my problem is that i dont want to use these array as flattened 2-d array
all in my kernel code i want to di is use two for loop & put the result in the third array like
__global__ void add(int *dev_a ,int *dev_b,int* dec_c)
{
for i=0;i<2;i++)
{
for j=0;j<2;j++)
{
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
How i can do this in CUDA?
please tell me how to use 2-d array in this way ?
What should be the kernel call for using 2d-array ?
If possible, please explain using code samples.

The short answer is, you can't. The cudaMallocPitch()function does exactly what its name implies, it allocates pitched linear memory, where the pitch is chosen to be optimal for the GPU memory controller and texture hardware.
If you wanted to use arrays of pointers in the kernel, the kernel code would have to look like this:
__global___ void add(int *dev_a[] ,int *dev_b[], int* dec_c[])
{
for i=0;i<2;i++) {
for j=0;j<2;j++) {
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
and then you would need nested cudaMalloc calls on the host side to construct the array of pointers and copy it to device memory. For your rather trivial 2x2 example, the code to allocate a single array would look like this:
int ** h_a = (int **)malloc(2 * sizeof(int *));
cudaMalloc((void**)&h_a[0], 2*sizeof(int));
cudaMalloc((void**)&h_a[1], 2*sizeof(int));
int **d_a;
cudaMalloc((void ***)&d_a, 2 * sizeof(int *));
cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice);
Which would leave the allocated device array of pointers in d_a, and you would pass that to your kernel.
For code complexity and performance reasons, you really don't want to do that, using arrays of pointers in CUDA code is both harder and slower than the alternative using linear memory.
To show what folly using arrays of pointers is in CUDA, here is a complete working example of your sample problem which combines the two ideas above:
#include <cstdio>
__global__ void add(int * dev_a[], int * dev_b[], int * dev_c[])
{
for(int i=0;i<2;i++)
{
for(int j=0;j<2;j++)
{
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
inline void GPUassert(cudaError_t code, char * file, int line, bool Abort=true)
{
if (code != 0) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
if (Abort) exit(code);
}
}
#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }
int main(void)
{
const int aa[2][2]={{1,2},{3,4}};
const int bb[2][2]={{5,6},{7,8}};
int cc[2][2];
int ** h_a = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_a[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_a[i], &aa[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int **d_a;
GPUerrchk(cudaMalloc((void ***)&d_a, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_b = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_b[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_b[i], &bb[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int ** d_b;
GPUerrchk(cudaMalloc((void ***)&d_b, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_b, h_b, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_c = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_c[i], 2*sizeof(int)));
}
int ** d_c;
GPUerrchk(cudaMalloc((void ***)&d_c, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_c, h_c, 2*sizeof(int *), cudaMemcpyHostToDevice));
add<<<1,1>>>(d_a,d_b,d_c);
GPUerrchk(cudaPeekAtLastError());
for(int i=0; i<2;i++){
GPUerrchk(cudaMemcpy(&cc[i][0], h_c[i], 2*sizeof(int), cudaMemcpyDeviceToHost));
}
for(int i=0;i<2;i++) {
for(int j=0;j<2;j++) {
printf("(%d,%d):%d\n",i,j,cc[i][j]);
}
}
return cudaThreadExit();
}
I recommend you study it until you understand what it does, and why it is such a poor idea compared to using linear memory.

You don't need to use for loops inside the device. Try this code.
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <time.h>
#define N 800
__global__ void matrixAdd(float* A, float* B, float* C){
int i = threadIdx.x;
int j = blockIdx.x;
C[N*j+i] = A[N*j+i] + B[N*j+i];
}
int main (void) {
clock_t start = clock();
float a[N][N], b[N][N], c[N][N];
float *dev_a, *dev_b, *dev_c;
cudaMalloc((void **)&dev_a, N * N * sizeof(float));
cudaMalloc((void **)&dev_b, N * N * sizeof(float));
cudaMalloc((void **)&dev_c, N * N * sizeof(float));
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
a[i][j] = rand() % 10;
b[i][j] = rand() % 10;
}
}
cudaMemcpy(dev_a, a, N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * N * sizeof(float), cudaMemcpyHostToDevice);
matrixAdd <<<N,N>>> (dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N * N * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
printf("[%d, %d ]= %f + %f = %f\n",i,j, a[i][j], b[i][j], c[i][j]);
}
}
printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}

Related

CUDA-why it cannot printf the information in cuda code? [duplicate]

This question already has answers here:
Trouble compiling helloworld.cu
(2 answers)
Closed 3 years ago.
I am a beginner for cuda. I wrote a test code for testing GPU device. my gpu model is k80.
There are 8 gpu cards in one node.
#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define N 10000
__global__ void add(int *a, int *b, int *c)
{
int tid = blockIdx.x;
if (tid < N)
c[tid] = a[tid] + b[tid];
}
int main()
{
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
cudaMalloc((void**)&dev_a, N * sizeof(int));
cudaMalloc((void**)&dev_b, N * sizeof(int));
cudaMalloc((void**)&dev_c, N * sizeof(int));
for (int i = 0;i < N;i++)
{
a[i] = -i;
b[i] = i*i;
}
cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
add << <N, 1 >> > (dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0;i < N;i++)
{
printf("%d + %d = %d\\n", a[i], b[i], c[i]);
}
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
When i compile the code:
nvcc gputest.cu -o gputest
I got errors :
gputest.cu(38): error: identifier "printf" is undefined
1 error detected in the compilation of "/tmp/tmpxft_000059a6_00000000-4_gputest.cpp4.ii".
I think printf is a function in iostream file, but i have already included the iostream. I don't know why?
Add:
#include <stdio.h>
and it will compile is OK.
printf is a function defined in the C standard library cstdio, so inclusion of stdio.h makes sense here. Different compilers may have different behavior here, but in the case of nvcc this is generally the right way to do it.
(It's not valid to assume in all cases that inclusion of iostream will satisfy the reference here.)

invalid device ordinal on cudaMemPrefetchAsync

I'm running a toy CUDA sample on my GeForce 1080 Ti (Pascal) on windows 10 and CUDA 9.2.
Goal is to test cudaMemPrefetchAsync to the CPU, as it's supposed to work.
However, I get a CUDA error (invalid device ordinal) on this particular line.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
void fill(int* a, int val, int N) {
for (int k = 0; k < N; ++k) {
a[k] = val;
}
}
__global__ void add(int* a, int* b, int N)
{
for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += blockDim.x * gridDim.x) {
a[i] += b[i];
}
}
inline void check(cudaError_t err, const char* file, int line) {
if (err != cudaSuccess) {
::fprintf(stderr, "ERROR at %s[%d] : %s\n", file, line, cudaGetErrorString(err));
abort();
}
}
#define CUDA_CHECK(err) do { check(err, __FILE__, __LINE__); } while(0)
int main()
{
int deviceId;
CUDA_CHECK(cudaGetDevice(&deviceId));
const int N = 1024*1024*32;
int *a, *b;
CUDA_CHECK(cudaMallocManaged(&a, N * sizeof(int)));
CUDA_CHECK(cudaMallocManaged(&b, N * sizeof(int)));
CUDA_CHECK(cudaMemPrefetchAsync(a, N * sizeof(int), cudaCpuDeviceId)); // program breaks here
CUDA_CHECK(cudaMemPrefetchAsync(b, N * sizeof(int), cudaCpuDeviceId));
fill(a, 1, N);
fill(a, 2, N);
CUDA_CHECK(cudaMemPrefetchAsync(a, N * sizeof(int), deviceId));
CUDA_CHECK(cudaMemPrefetchAsync(b, N * sizeof(int), deviceId));
add<<<32, 256>>>(a, b, N);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
return 0;
}
Is that a hardware/driver/OS limitation? Can I simply ignore the error?
Is that a hardware/driver/OS limitation?
Yes, the latter. Quoting from the documentation
GPUs with SM architecture 6.x or higher (Pascal class or newer)
provide additional Unified Memory features such as on-demand page
migration and GPU memory oversubscription that are outlined throughout
this document. Note that currently these features are only supported
on Linux operating systems.
So asynchronous page migration is not supported in Windows at the moment and that it why you get an error when you try to enable it.

Using of shared memory not showing desired result

I am trying to learn the usuage of Shared memory with a view to increase the performance . here I am trying to copy the global memory to shared memory. but when I have single block(256 thread) it gives the result and with more than 1 block it gives random result.
#include <cuda.h>
#include <stdio.h>
__global__ void staticReverse(int *d, int n)
{
__shared__ int s[400];
int t = blockIdx.x * blockDim.x + threadIdx.x;
d[t] = d[t]*d[t];
s[t] =d[t];
__syncthreads();
d[t] = s[t];
}
__global__ void dynamicReverse(int *d, int n)
{
extern __shared__ int s[];
int t = threadIdx.x;
s[t] = d[t]*d[t];
__syncthreads();
d[t] = s[t];
}
int main(void)
{
const int n = 400;
int a[n], d[n];
for (int i = 0; i < n; i++)
{
a[i] = i;
}
int *d_d;
cudaMalloc(&d_d, n * sizeof(int));
// run version with static shared memory
int block_size = 256;
int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);
cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice);
staticReverse<<<n_blocks,block_size>>>(d_d, n);
cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < n; i++)
{
printf("%d\n",d[i]);
}
}
1)what does the third argument in dynamicReverse<<<n_blocks,block_size,n*sizeof(int)>>>(d_d, n);
kernal call does? does it allocates shared memory for entire block or thread.
2) if I required more than 64kb of shared memory per multiprocessor in compute capability 5.0 what I need to do?
In your static shared memory allocation code you had three issues:
The size of the statically allocated shared memory should comply with the block size, not with the size of the input array,
You should use local thread index for indexing shared memory, instead of the global one;
You had no array out of bounds checking.
The dynamic shared memory allocation code had the same issues #2 and #3 as above, plus the fact that you were indexing global memory with local thread index, instead of global. You can use the third argument to specify the size of the shared memory to be allocated. In particular, you should allocate an amount of 256 ints, i.e., related to the block size, similarly to the static shared memory allocation case.
Here is the complete working code:
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/***********************************/
/* SHARED MEMORY STATIC ALLOCATION */
/***********************************/
#include <cuda.h>
#include <stdio.h>
__global__ void staticReverse(int *d, int n)
{
__shared__ int s[256];
int t = blockIdx.x * blockDim.x + threadIdx.x;
if (t < n) {
d[t] = d[t]*d[t];
s[threadIdx.x] =d[t];
__syncthreads();
d[t] = s[threadIdx.x];
}
}
/************************************/
/* SHARED MEMORY DYNAMIC ALLOCATION */
/************************************/
__global__ void dynamicReverse(int *d, int n)
{
extern __shared__ int s[];
int t = blockIdx.x * blockDim.x + threadIdx.x;
if (t < n) {
s[threadIdx.x] = d[t]*d[t];
__syncthreads();
d[t] = s[threadIdx.x];
}
}
int main(void)
{
const int n = 400;
int* a = (int*) malloc(n*sizeof(int));
int* d = (int*) malloc(n*sizeof(int));
for (int i = 0; i < n; i++) { a[i] = i; }
int *d_d; gpuErrchk(cudaMalloc(&d_d, n * sizeof(int)));
// run version with static shared memory
int block_size = 256;
int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);
gpuErrchk(cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice));
//staticReverse<<<n_blocks,block_size>>>(d_d, n);
dynamicReverse<<<n_blocks,block_size,256*sizeof(int)>>>(d_d, n);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < n; i++) { printf("%d\n",d[i]); }
}

CUDA branch divergence doesn't make any differences

I'm trying to learn CUDA by myself, and I'm now into the issue of branch divergence. As far as I understand, this is the name given to the problem that arises when several threads in a block are said to take a branch (due to if or switch statements, for example), but others in that block don't have to take it.
In order to investigate a little bit further this phenomena and its consequences, I've written a little file with a couple of CUDA functions. One of them is supposed to take lots of time, since the threads are stopped for much more time (9999... iterations) than in the other one (in which they're only stopped for an assignation).
However, when I run the code, I'm getting very similar times. Furthermore, even measuring the time that running both of them takes I get a time similar to running only one. Did I code anything wrong, or is there a logical explanation for this?
Code:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#define ITERATIONS 9999999999999999999
#define BLOCK_SIZE 16
unsigned int hTimer;
void checkCUDAError (const char *msg)
{
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,cudaGetErrorString( err) );
getchar();
exit(EXIT_FAILURE);
}
}
__global__ void divergence(float *A, float *B){
float result = 0;
if(threadIdx.x % 2 == 0)
{
for(int i=0;i<ITERATIONS;i++){
result+=A[threadIdx.x]*A[threadIdx.x];
}
} else
for(int i=0;i<ITERATIONS;i++){
result+=A[threadIdx.x]*B[threadIdx.x];
}
}
__global__ void betterDivergence(float *A, float *B){
float result = 0;
float *aux;
//This structure should not affect performance that much
if(threadIdx.x % 2 == 0)
aux = A;
else
aux = B;
for(int i=0;i<ITERATIONS;i++){
result+=A[threadIdx.x]*aux[threadIdx.x];
}
}
// ------------------------
// MAIN function
// ------------------------
int main(int argc, char ** argv){
float* d_a;
float* d_b;
float* d_result;
float *elementsA;
float *elementsB;
elementsA = (float *)malloc(BLOCK_SIZE*sizeof(float));
elementsB = (float *)malloc(BLOCK_SIZE*sizeof(float));
//"Randomly" filling the arrays
for(int x=0;x<BLOCK_SIZE;x++){
elementsA[x] = (x%2==0)?2:1;
elementsB[x] = (x%2==0)?1:3;
}
cudaMalloc((void**) &d_a, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_b, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_result, sizeof(float));
cudaMemcpy(d_a, elementsA, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, elementsB, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
CUT_SAFE_CALL(cutCreateTimer(&hTimer));
CUT_CHECK_ERROR("cudaCreateTimer\n");
CUT_SAFE_CALL( cutResetTimer(hTimer) );
CUT_CHECK_ERROR("reset timer\n");
CUT_SAFE_CALL( cutStartTimer(hTimer) );
CUT_CHECK_ERROR("start timer\n");
float timerValue;
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(32/dimBlock.x, 32/dimBlock.y);
divergence<<<dimBlock, dimGrid>>>(d_a, d_b);
betterDivergence<<<dimBlock, dimGrid>>>(d_a, d_b);
checkCUDAError("kernel invocation");
cudaThreadSynchronize();
CUT_SAFE_CALL(cutStopTimer(hTimer));
CUT_CHECK_ERROR("stop timer\n");
timerValue = cutGetTimerValue(hTimer);
printf("kernel execution time (secs): %f s\n", timerValue);
return 0;
}
1) You have no memory writes in your __global__ code except the local variable(result). I'm not sure that cuda compiler does that, but all your code can be safely removed with no side effect(and maybe the compiler had done that).
2) All your reads from device memory in __global__ functions are from one place on each iteration. Cuda will store the value in register memory and the longest operation(memory access) will be done very fast here.
3) May be the compiler had replaced your cycles with single multiplication like `result=ITERATIONS*A[threadIdx.x]*B[threadIdx.x]
4) If all the code in your functions will be executed as you wrote it, your betterDivergence is going to be approximately 2 times faster than your another function because you have the loops in if branches in slower one and no loops in branches in faster one. But there won't be any idle time in threads among the threads that execute same loop because all threads are going to execute the body of the loop each iteration.
I suggest you to write another example where you will store the result in some device memory and then copy that memory back to host and make some more unpredictable calculations to prevent possible optimizations.
Below is shown the final, tested, right example of a code that allows to compare the performance between CUDA code with and without branch divergence:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
//#define ITERATIONS 9999999999999999999
#define ITERATIONS 999999
#define BLOCK_SIZE 16
#define WARP_SIZE 32
unsigned int hTimer;
void checkCUDAError (const char *msg)
{
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,cudaGetErrorString( err) );
getchar();
exit(EXIT_FAILURE);
}
}
__global__ void divergence(float *A, float *B){
int a = blockIdx.x*blockDim.x + threadIdx.x;
if (a >= ITERATIONS) return;
if(threadIdx.x > 2)
{
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]+1;
}
} else
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]-1;
}
}
__global__ void noDivergence(float *A, float *B){
int a = blockIdx.x*blockDim.x + threadIdx.x;
if (a >= ITERATIONS) return;
if(threadIdx.x > WARP_SIZE)
{
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]+1;
}
} else
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]-1;
}
}
// ------------------------
// MAIN function
// ------------------------
int main(int argc, char ** argv){
float* d_a;
float* d_b;
float* d_result;
float *elementsA;
float *elementsB;
elementsA = (float *)malloc(BLOCK_SIZE*sizeof(float));
elementsB = (float *)malloc(BLOCK_SIZE*sizeof(float));
//"Randomly" filling the arrays
for(int x=0;x<BLOCK_SIZE;x++){
elementsA[x] = (x%2==0)?2:1;
}
cudaMalloc((void**) &d_a, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_b, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_result, sizeof(float));
cudaMemcpy(d_a, elementsA, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, elementsB, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
CUT_SAFE_CALL(cutCreateTimer(&hTimer));
CUT_CHECK_ERROR("cudaCreateTimer\n");
CUT_SAFE_CALL( cutResetTimer(hTimer) );
CUT_CHECK_ERROR("reset timer\n");
CUT_SAFE_CALL( cutStartTimer(hTimer) );
CUT_CHECK_ERROR("start timer\n");
float timerValue;
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(128/dimBlock.x, 128/dimBlock.y);
//divergence<<<dimGrid, dimBlock>>>(d_a, d_b);
noDivergence<<<dimGrid, dimBlock>>>(d_a, d_b);
checkCUDAError("kernel invocation");
cudaThreadSynchronize();
CUT_SAFE_CALL(cutStopTimer(hTimer));
CUT_CHECK_ERROR("stop timer\n");
timerValue = cutGetTimerValue(hTimer)/1000;
printf("kernel execution time (secs): %f s\n", timerValue);
cudaMemcpy(elementsB, d_b, BLOCK_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
return 0;
}

why am I seeing a black screen when I try this code in cuda?

I'm using Win8 and Nsight in "visual studio 2010" and I installed "310.90-notebook-win8-win7-winvista-32bit-international-whql" for my Graphic card(9300m Gs).but when I try the code below,I see a black screen!and an error :"Display driver stoped responding and has recoverd"!
I know that the problem is with "cudaMemcpy",but I don't why!?
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define N 8
__global__ void kernel(int *a)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int step = x;
while(step<N){
a[step] = threadIdx.x;
step += x;
}
}
int main()
{
int a[N],i=N,j=0;
for(;j<N;j++)
a[j]=i--;
int *dev_a;
cudaMalloc( (void**)&dev_a, N * sizeof(int) );
cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
kernel<<<2,2>>>(dev_a);
cudaError_t cudaStatus = cudaMemcpy(a, dev_a,N-1 * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
//goto Error;
}
for(j=0;j<N;j++)printf("\n%d",a[j]);
int t;
scanf("%d",&t);
}
In the kernel, the thread with threadIdx.x = 0 and blockIdx.x = 0 i.e. the first thread of the first block will run indefinitely, causing the kernel to crash.
When threadIdx.x = 0 and blockIdx.x = 0 the kernel code will become:
int x = 0;
int step = 0;
while(step<N)
{
a[step] = 0;
step += 0; //This will create infinite loop
}
Also (May be its a typo), there is a logical error in the following line of your code:
cudaError_t cudaStatus = cudaMemcpy(a, dev_a,N-1 * sizeof(int), cudaMemcpyDeviceToHost);
Considering the operator precedence in C, the expression N-1 * sizeof(int) will evaluate to N-4 (if sizeof(int) is 4).