Can an array of shorts be passed into a CUDA kernel - cuda

I have written a CUDA kernel and when I copy of an array of shorts to device memory and then pass it to the kernel it doesn't work. The simplified code below expresses my issue.
KernelCaller()
{
const int size = 1;
short hostArray[size]{41};
short* devPointer;
cudaMalloc((void**)&devicePointer, size * sizeof(short));
cudaMemcpy(devPointer, hostArray, size * sizeof(short), cudaMemcpyHostToDevice);
cudaKernel<<<1,1>>>(devPointer);
}
__global__
void cudaKernel(short* arr)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
short val = arr[idx];
}
At this point the value of val is 1063714857 and what I want it to be is 41.
I assume the issue is 41 in hex is 0x29 and the value I have is 0x3F670029 so it looks like it read too many bytes cause the 0x29 is at the beginning. When I switch to an array of floats it works perfectly, but I was trying to save memory. Does CUDA not allow an array of shorts?

I have implemented your code and getting the output as expected.
Here's the code
#include<stdio.h>
__global__ void cudaKernel(short* arr)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
short val = arr[idx];
# if __CUDA_ARCH__>=200
printf("Inside kernel %d\n",val);
#endif
arr[idx] = val;
}
int main()
{
const int size = 1;
short hostArray[size]{41};
printf("Before kernel call %d\n",hostArray[0]);
short *devPointer;
cudaMalloc((void**)&devPointer, size * sizeof(short));
cudaMemcpy(devPointer, hostArray, size * sizeof(short), cudaMemcpyHostToDevice);
cudaKernel<<<1,1>>>(devPointer);
cudaMemcpy(hostArray, devPointer, size * sizeof(short), cudaMemcpyDeviceToHost);
printf("After kernel call %d\n",hostArray[0]);
cudaFree(devPointer);
return 0;
}
And the output is
Before kernel call 41
Inside kernel 41
After kernel call 41
So, yes we can pass array of shorts into a CUDA kernel.

Related

Cuda C threads synchronization with printf or other functions

I have a problem with threads' id during the block executes.
I would like to have sentence like :"My temporary string is printed via GPU!" as you see (on the attached photo ealier) the sentence has been displayed wrongly and I don't know how to fix it.
Code:
__global__ void Print(const char* const __string, const size_t* const loop_repeat)
{
int id_x = threadIdx.x + blockIdx.x * blockDim.x;
while (id_x < static_cast<int>(*loop_repeat))
{
printf("%c", __string[id_x]);
__syncthreads();
id_x += blockDim.x * gridDim.x;
}
}
int main()
{
const char* my_string = "My temporary string is printed via GPU!";
size_t temp{};
temp = Get_String_Length(my_string); //get the string length
//GPU MEMORY ALLOCATION
size_t* my_string_length{};
cudaMalloc((void**)&my_string_length, sizeof(size_t));
//COPY VALUE FROM CPU(RAM) TO GPU
cudaMemcpy(my_string_length, &temp, sizeof(size_t), HostToDevice);
char* string_GPU{};
cudaMalloc((void**)&string_GPU, (temp) * sizeof(char));
//COPY VALUE FROM CPU(RAM) TO GPU
cudaMemcpy(string_GPU, my_string, (temp) * sizeof(char), HostToDevice);
dim3 grid_size(1);
dim3 block_size((temp));
Print <<< grid_size, temp >>> (string_GPU, my_string_length);
cudaError_t final_error = cudaDeviceSynchronize(); //for synchronization e.g Hello_World then printf
if (final_error == cudaSuccess)
{
printf("%cKernel executed successfully with code: %d !%\n", NEW_LINE, final_error);
}
else
{
printf("%cKernel executed with code error: %d !\n", NEW_LINE, final_error);
}
cudaFree(my_string_length);
cudaFree(string_GPU);
return 0;
}
I will be grateful for any help given.
The main issue here is that you are expecting that the thread or warp execution order has some predictable order. Actually, it does not. Your usage of __syncthreads() doesn't fix or address this issue.
If you want the warps to execute in a predictable order (not recommended) you would need to impose that order yourself. Here is an example that demonstrates that for this very simple code. It is not extensible without modification to larger strings, and this method will completely break down if you introduce more than 1 threadblock.
$ cat t1543.cu
#include <stdio.h>
#include <stdlib.h>
__global__ void Print(const char* const __string, const size_t* const loop_repeat)
{
int id_x = threadIdx.x + blockIdx.x * blockDim.x;
int warp_ID = threadIdx.x>>5;
while (id_x < static_cast<int>(*loop_repeat))
{
if (warp_ID == 0)
printf("%c", __string[id_x]);
__syncthreads();
if (warp_ID == 1)
printf("%c", __string[id_x]);
__syncthreads();
id_x += blockDim.x * gridDim.x;
}
}
int main()
{
const char* my_string = "My temporary string is printed via GPU!";
size_t temp;
temp = 40; //get the string length
//GPU MEMORY ALLOCATION
size_t* my_string_length;
cudaMalloc((void**)&my_string_length, sizeof(size_t));
//COPY VALUE FROM CPU(RAM) TO GPU
cudaMemcpy(my_string_length, &temp, sizeof(size_t), cudaMemcpyHostToDevice);
char* string_GPU;
cudaMalloc((void**)&string_GPU, (temp) * sizeof(char));
//COPY VALUE FROM CPU(RAM) TO GPU
cudaMemcpy(string_GPU, my_string, (temp) * sizeof(char), cudaMemcpyHostToDevice);
dim3 grid_size(1);
dim3 block_size((temp));
Print <<< grid_size, temp >>> (string_GPU, my_string_length);
cudaError_t final_error = cudaDeviceSynchronize(); //for synchronization e.g Hello_World then printf
if (final_error == cudaSuccess)
{
printf("\nKernel executed successfully with code: %d !%\n", final_error);
}
else
{
printf("\nKernel executed with code error: %d !\n", final_error);
}
cudaFree(my_string_length);
cudaFree(string_GPU);
return 0;
}
$ nvcc -o t1543 t1543.cu
$ cuda-memcheck ./t1543
========= CUDA-MEMCHECK
My temporary string is printed via GPU!
Kernel executed successfully with code: 0 !%
========= ERROR SUMMARY: 0 errors
$
Note that I'm not suggesting the above is good coding style. It's provided for understanding of the issue. Even this code is relying on the idea that the threads within a warp will call the printf function in a predictable order, which is not guaranteed by the CUDA programming model. So the code is really still broken.
This happened because The multiprocessor creates, manages, schedules, and executes threads in groups of 32 parallel threads called warps as you can see in CUDA Programming Guide, so the first 32 threads covers "My temporary string is printed v" and the remaining part covers "ia GPU!". It seems that the kernel put the latter wrap before the first one in execution order.

Optimizing memory access for complex numbers

I have a kernel that operates on complex numbers, and I am loading the values like this:
thrust::complex<float> x = X[tIdx];
where X is in global memory. When I profile this kernel with nvvp, I find that it is memory bandwidth-limited and the profiler suggests that I improve the memory access pattern:
Global Load L2 Transactions/Access=8, Ideal Transactions/Access=4
The disassembly confirms that this line is indeed split into two 32-bit loads, producing a strided access pattern:
LDG.E R9, [R16];
LDG.E R11, [R16+0x4];
How can I get this to compile into a single 64-bit load?
Potential solutions
I realize this is pretty closely related to this earlier question but the proposed solutions (change the global memory layout or use shared memory) seem less ideal than a 64-bit load.
The NVidia developer blog suggests reinterpret_cast to a vector data type such as float2, but I'm a little hazy about how this fits in with pointer aliasing rules.
I must also confess that this is somewhat of a theoretical question. For this particular kernel, I'm limited by the device memory bandwidth, so halving the # of L2 transactions shouldn't significantly improve the overall performance. But I anticipate working with more complex numbers in my future, and if there's a simple solution then I'd like to start using it now.
The basic problem here is that the compiler seems to need explicit alignment specifications for a type before it will generate vector load and store instructions. Consider the following trivial example:
class __align__(8) cplx0
{
public:
__device__ __host__ cplx0(float _re, float _img) : re(_re), img(_img) {};
float re, img;
};
class cplx1
{
public:
__device__ __host__ cplx1(float _re, float _img) : re(_re), img(_img) {};
float re, img;
};
template<typename T>
__global__ void memsetkernel(T* out, const T val, int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll 8
for(; tid < N; tid += stride) out[tid] = val;
}
template<typename T>
__global__ void memcpykernel(const T* __restrict__ in, T* __restrict__ out, int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll 8
for(; tid < N; tid += stride) out[tid] = in[tid];
}
template<typename T>
void memcpy(const T* in, T* out, int Nitems)
{
int nthreads = 1024;
int nblocks = 13 * 2; // GTX 970 with 13 SM
memcpykernel<T><<<nblocks, nthreads>>>(in, out, Nitems);
cudaDeviceSynchronize();
}
template<typename T>
void memset(T* in, const T value, int Nitems)
{
int nthreads = 1024;
int nblocks = 13 * 2; // GTX 970 with 13 SM
memsetkernel<T><<<nblocks, nthreads>>>(in, value, Nitems);
cudaDeviceSynchronize();
}
int main(void)
{
const int Nitems = 1 << 24;
typedef cplx0 fcomplex0;
typedef cplx1 fcomplex1;
{
fcomplex0* in;
fcomplex0* out;
cudaMalloc((void **)&in, Nitems * sizeof(fcomplex0));
cudaMalloc((void **)&out, Nitems * sizeof(fcomplex1));
for(int i=0; i<10; i++) {
memset<fcomplex0>(in, fcomplex0(1.0f,1.0f), Nitems);
memcpy<fcomplex0>(in, out, Nitems);
}
cudaFree(in);
cudaFree(out);
}
{
fcomplex1* in;
fcomplex1* out;
cudaMalloc((void **)&in, Nitems * sizeof(fcomplex1));
cudaMalloc((void **)&out, Nitems * sizeof(fcomplex1));
for(int i=0; i<10; i++) {
memset<fcomplex1>(in, fcomplex1(1.0f,1.0f), Nitems);
memcpy<fcomplex1>(in, out, Nitems);
cudaDeviceSynchronize();
}
cudaFree(in);
cudaFree(out);
}
cudaDeviceReset();
return 0;
}
Here we has two home-baked complex types, one with explicit alignment specifications, and one without. Otherwise they are identical. Putting them through a naïve mempcy and memset kernels in this test harness allows us to inspect the code generation behaviour of the toolchain for each type and benchmark the performance.
Firstly, the code. For cplx0 class, which has explicit 8-byte alignment, the compiler emits vectorized loads and stores in both kernels:
memcpykernel
ld.global.nc.v2.f32 {%f5, %f6}, [%rd17];
st.global.v2.f32 [%rd18], {%f5, %f6};
memsetkernel
st.global.v2.f32 [%rd11], {%f1, %f2};
whereas for the cplx1 case, it does not:
memcpykernel
ld.global.nc.f32 %f1, [%rd16];
ld.global.nc.f32 %f2, [%rd16+4];
st.global.f32 [%rd15+4], %f2;
st.global.f32 [%rd15], %f1;
memsetkernel
st.global.f32 [%rd11+4], %f2;
st.global.f32 [%rd11], %f1;
Looking at performance, there is a non-trivial difference in performance for the memset case (CUDA 8 release toolkit, GTX 970 with Linux 367.48 driver):
$ nvprof ./complex_types
==29074== NVPROF is profiling process 29074, command: ./complex_types
==29074== Profiling application: ./complex_types
==29074== Profiling result:
Time(%) Time Calls Avg Min Max Name
33.04% 19.264ms 10 1.9264ms 1.9238ms 1.9303ms void memcpykernel<cplx1>(cplx1 const *, cplx1*, int)
32.72% 19.080ms 10 1.9080ms 1.9055ms 1.9106ms void memcpykernel<cplx0>(cplx0 const *, cplx0*, int)
19.15% 11.165ms 10 1.1165ms 1.1120ms 1.1217ms void memsetkernel<cplx1>(cplx1*, cplx1, int)
15.09% 8.7985ms 10 879.85us 877.67us 884.13us void memsetkernel<cplx0>(cplx0*, cplx0, int)
The Thrust templated complex type does not have an explicit alignment definition (although it potentially could via specialization, although that would somewhat defeat the purpose). So your only choice here is to either make your own version of the Thrust type with explicit alignment, or use another complex type which does (like the cuComplex type which CUBLAS and CUFFT use).

Passing variables from parent kernel to child kernel in dynamic parallelism in cuda

I am trying to use dynamic parallelism in cuda. I am in a situation such that parent kernel has a variable that needs to be passed to child for further computation. I have gone through the resources in web
here
and it mentions that local variables cannot be passed to the child kernal and has mentioned the ways to pass variables and I have tried to pass the pass the variable as
#include <stdio.h>
#include <cuda.h>
__global__ void square(float *a, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(N==10)
{
a[idx] = a[idx] * a[idx];
}
}
// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int n=N; // this value of n can be changed locally and need to be passed
printf("%d\n",n);
cudaMalloc((void **) &n, sizeof(int));
square <<< 1, N >>> (arr, n);
}
// main routine that executes on the host
int main(void)
{
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 10; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
first <<< 1, 1 >>> (a_d, N);
//cudaThreadSynchronize();
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
}
and the value of parent to child kernel is not passed . how can I pass the value of local variable. Is there any way to do so?
This operation is not appropriate:
int n=N; // this value of n can be changed locally and need to be passed
cudaMalloc((void **) &n, sizeof(int)); // illegal
It is not appropriate in host code, nor in device code. n is an int variable. You are not supposed to assign a pointer to it. When you attempt to do so in a 64-bit environment, you are attempting to write a 64-bit pointer on top of a 32-bit int quantity. It will not work.
It's not clear why you would need it anyway. n is an integer parameter presumably specifying the size of your arr array of float. You don't need to allocate anything on top of it.
If you had run this code with cuda-memcheck, you could easily discover that error. You can also do proper cuda error checking in device code in exactly the same fashion as you do it in host code.
When I comment out that cudaMalloc line in the first kernel, your code runs correctly for me.

cudaMalloc global array cause seg fault

I found some difficulty when I try to access a global array from function that's executed from device:
float globTemp[3][3] = "some value in here";
__device__ float* globTemp_d;
__global__ void compute(int *a, int w)
{
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int i = y*w+x;
if(x<3 && y<3)
a[i] = 1+globTemp_d[i];
}
int hostFunc(){
float *a_d;
cudaMalloc((void**)&a_d, 3*3*sizeof(int));
cudaMalloc((void**)&globTemp_d, 3*3*sizeof(int));
cudaMemcpy(globTemp_d,globTemp, 3*3*sizeof(float), cudaMemcpyHostToDevice);
compute<<<1,1>>>(a_d,3);
cudaMemcpy(a,a_d, 3*3*sizeof(float), cudaMemcpyDeviceToHost);
}
However, I get seg fault when i try to access globTemp_d[i]. Am I doing something wrong in here?
There are a variety of problems with your code:
Your grid is a 1D grid of 1D threadblocks (in fact you are launching a single block of 1 thread) but your kernel is written as if it were expecting a 2D threadblock structure (using .x and .y built-in variables). A single thread won't get the work done certainly, and a 1D threadblock won't work with your kernel code.
__device__ variables are not accessed with cudaMalloc and cudaMemcpy. We use a different set of API calls like cudaMemcpyToSymbol.
You're not doing any cuda error checking which is always recommended when you're having difficulty. You should do cuda error checking on both API calls and kernel calls.
You're mixing float variables (a_d ) with int variables in the kernel parameters (int *a) so I don't think this code would compile without at least a warning. And that can lead to strange behavior of course if you ignore it.
This is the closest I could come to your code while fixing all the errors:
#include <stdio.h>
__device__ float* globTemp_d;
__global__ void compute(float *a, int w)
{
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int i = (y*w)+x;
if((x<3) && (y<3))
a[i] = 1.0f+globTemp_d[i];
}
int main(){
float *a_d, *d_globTemp;
float globTemp[3][3] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f};
float a[(3*3)];
dim3 threads(3,3);
dim3 blocks(1);
cudaMalloc((void**)&a_d, 3*3*sizeof(float));
cudaMalloc((void**)&d_globTemp, 3*3*sizeof(float));
cudaMemcpy(d_globTemp,globTemp, 3*3*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(globTemp_d, &d_globTemp, sizeof(float *));
compute<<<blocks,threads>>>(a_d,3);
cudaMemcpy(a,a_d, 3*3*sizeof(float), cudaMemcpyDeviceToHost);
printf("results:\n");
for (int i = 0; i<(3*3); i++)
printf("a[%d] = %f\n", i, a[i]);
return 0;
}
This code can be simplified by dispensing with the __device__ variable and just passing d_globTemp as a parameter to the kernel, and using it in place of references to globTemp_d. However I did not make that simplification.

CUDA 1-D array not getting updated

this is my first attempt at a CUDA program. This is what it's supposed to do:
Receive 1D Pixel array from host memory
Each Pixel is processed by one thread: it is thread-safe because only "val" is read and only "newval" is updated. Wait for sync.
Each Pixel is processed by one thread: copy "newval" to "val."
Write this array back to host memory.
Repeat 2-4 for several different frames.
What happens, however, is that only a couple of variables, out of about 32000, in the new arrays seem to have decent values at all; the rest are zero.
I've removed the calculations for brevity.
__global__ void kernel(Pixel *array, float dt)
{
const unsigned int tid = threadIdx.x;
Pixel *point = array + tid;
//DO A BUNCH OF CALCULATIONS ON PIXEL KIND OF LIKE THIS
point->newval = point->val + foo;
}
__global__ void copykernel(Pixel *array)
{
const unsigned int tid = threadIdx.x;
Pixel *point = array + tid;
//COPY THE NEWVALS OVER TO THE OLD VALS IN PREPARATION FOR THE NEXT FRAME
point->val = point->newval;
}
extern "C" bool runIt(const int argc, const char **argv, Pixel *inarray, Pixel **outarrays, int arraysize, int numframes, float dt)
{
int memsize = arraysize*sizeof(Pixel);
int i=0;
Pixel *array;
cudaMalloc((void **) &array, memsize);
cudaMemcpy(array, inarray, memsize, cudaMemcpyHostToDevice);
int numthreads = arraysize;
dim3 grid(1,1,1);
dim3 threads(numthreads,1,1);
for(i=0;i<numframes;i++)
{
kernel<<<grid, threads>>>((Pixel *) array, dt);
cudaThreadSynchronize();
copykernel<<<grid, threads>>>((Pixel *) array);
cudaThreadSynchronize();
cudaMemcpy(array, outarrays[i], memsize, cudaMemcpyDeviceToHost);
}
cudaFree(array);
return true;
}
I have a suspicion that I'm setting up the parameters for the device incorrectly, or else I'm getting one of the device-specific keywords wrong or forgetting a crucial step. Does anything jump out at you?
I don't think you can run that many threads, and if you can, its not a good idea. Try setting the number of threads to 256 (16x16 for 2D), then choosing gridsize based on your input size.
dim3 threads(256,1,1);
dim3 grid(arraysize/threads.x,1,1); //Careful of integer division, this is just for example
Also your second copy is incorrect. You need to switch array and out_arrays
cudaMemcpy(outarrays[i], array, memsize, cudaMemcpyDeviceToHost);