How to copy a device variable to host in CUDA [duplicate] - cuda

I have problem with copying data from GPU to CPU. At the beginning I create variable in GPU space:
__device__ float gpu_array;
In this GPU function, I want to copy the data from od.fS[gi] (value = 0,43...) to gpu_array:
__global__ void Collide(.....){
....
//Streaming
od.fS[gi] = fi_S;
od.fN[gi] = fi_N;
od.fE[gi] = fi_E;
od.fW[gi] = fi_W;
gpu_array = od.fC[gi];
}
End here is function to copy data from GPU to CPU:
void showData(){
cudaDeviceSynchronize();
float* temp_array = NULL;
cudaMemcpyFromSymbol((void**)&temp_array, "gpu_array", sizeof(temp_array), 0, cudaMemcpyDeviceToHost);
float* cpu_array = (float*)malloc(sizeof(float));
cudaMemcpy(cpu_array, temp_array, sizeof(float), cudaMemcpyDeviceToHost);
printf("h_array: %f\n", *cpu_array);
}
In finally copies but wrong values (0.00000, -0.00000 etc.). What am I doing wrong?

There are multiple problems with your use of cudaMemcpyFromSymbol:
float* temp_array = NULL;
cudaMemcpyFromSymbol((void**)&temp_array, "gpu_array", sizeof(temp_array), 0, cudaMemcpyDeviceToHost);
you haven't allocated storage for data pointed to by temp_array
we don't use quotes around the device symbol name anymore
the destination pointer should be just a pointer, not the address of a pointer
the remaining code around cpu_array is completely unnecessary
Correct usage would be something like this:
void showData(){
cudaDeviceSynchronize();
float* temp_array = (float *)malloc(sizeof(float));
cudaMemcpyFromSymbol(temp_array, gpu_array, sizeof(float));
printf("h_array: %f\n", *temp_array);
}
Note that in this example, gpu_array is a scalar quantity, and so we could do something similar with temp_array:
void showData(){
cudaDeviceSynchronize();
float temp_array;
cudaMemcpyFromSymbol(&temp_array, gpu_array, sizeof(float));
printf("h_array: %f\n", temp_array);
}

Related

Segmentation fault in CUDA

I receive Segmentation fault (core dumped) when i run this code.
I know the cudaMalloc is the problem, but I have no idea how to solve it. I just started learning some CUDA programming and im not familiar with it. I'm working on wsl if it matters.
#include<stdio.h>
#define SIZE 20
__global__ void VectorAdd(int *a,int *b,int *c,int n){
int i = threadIdx.x;
if(i<n)
c[i]=a[i]+b[i];
}
int main(){
int *a,*b,*c;
cudaMalloc((void**)&a,SIZE *sizeof(int));
cudaMalloc((void**)&b, SIZE *sizeof(int));
cudaMalloc((void**)&c,SIZE *sizeof(int));
for (int i=1;i<SIZE;i++){
a[i]=i;
b[i]=i;
c[i]=0;
}
VectorAdd<<<1, SIZE>>>(a,b,c,SIZE);
cudaDeviceSynchronize();
for(int i=1;i<SIZE;i++){
printf("%d \n",c[i]);
}
cudaFree(a);
cudaFree(b);
cudaFree(c);
return 0;
}
As the comments already suggested, you have to initialize values for arrays a and b on the host, copy them to device array, and once computation is completed you have to copy data from c back to the host.
#include<stdio.h>
#define SIZE 20
__global__ void VectorAdd(int *a,int *b,int *c,int n){
int i = threadIdx.x;
if(i<n)
c[i]=a[i]+b[i];
}
int main(){
int *a,*b,*c;
int *h_a, *h_b, *h_c; /*declare pointers to host arrays*/
cudaMalloc((void**)&a,SIZE *sizeof(int));
cudaMalloc((void**)&b, SIZE *sizeof(int));
cudaMalloc((void**)&c,SIZE *sizeof(int));
/* allocate memory for host arrays */
h_a = new int[SIZE];
h_b = new int[SIZE];
h_c = new int[SIZE];
/* initialize values on host arrays */
for (int i = 0; i < SIZE; i++){
h_a[i]=i;
h_b[i]=i;
}
/*copy data from host to device */
cudaMemcpy(a, h_a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(b, h_b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
VectorAdd<<<1, SIZE>>>(a,b,c,SIZE);
// cudaDeviceSynchronize(); /* this is not needed because cudaMemcpy implies sync. */
/*copy results from device to host*/
cudaMemcpy(h_c, c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
for(int i = 0; i < SIZE; i++){
printf("%d \n",h_c[i]);
}
cudaFree(a);
cudaFree(b);
cudaFree(c);
/* free host memory */
delete [] h_a;
delete [] h_b;
delete [] h_c;
return 0;
}
Notes
For some reason you start iterating from position 1 instead 0 in you for loops! If this is wrong by accident I fixed it!
cudaMemcpy always performs synchronization between host and device. So, cudaDeviceSynchronize() is not necessary after kernel invocation.
To avoid explicit handling of separated host and device data, you could use cudaMallocManaged instead of cudaMalloc.

Copying data from GPU to CPU - CUDA

I have problem with copying data from GPU to CPU. At the beginning I create variable in GPU space:
__device__ float gpu_array;
In this GPU function, I want to copy the data from od.fS[gi] (value = 0,43...) to gpu_array:
__global__ void Collide(.....){
....
//Streaming
od.fS[gi] = fi_S;
od.fN[gi] = fi_N;
od.fE[gi] = fi_E;
od.fW[gi] = fi_W;
gpu_array = od.fC[gi];
}
End here is function to copy data from GPU to CPU:
void showData(){
cudaDeviceSynchronize();
float* temp_array = NULL;
cudaMemcpyFromSymbol((void**)&temp_array, "gpu_array", sizeof(temp_array), 0, cudaMemcpyDeviceToHost);
float* cpu_array = (float*)malloc(sizeof(float));
cudaMemcpy(cpu_array, temp_array, sizeof(float), cudaMemcpyDeviceToHost);
printf("h_array: %f\n", *cpu_array);
}
In finally copies but wrong values (0.00000, -0.00000 etc.). What am I doing wrong?
There are multiple problems with your use of cudaMemcpyFromSymbol:
float* temp_array = NULL;
cudaMemcpyFromSymbol((void**)&temp_array, "gpu_array", sizeof(temp_array), 0, cudaMemcpyDeviceToHost);
you haven't allocated storage for data pointed to by temp_array
we don't use quotes around the device symbol name anymore
the destination pointer should be just a pointer, not the address of a pointer
the remaining code around cpu_array is completely unnecessary
Correct usage would be something like this:
void showData(){
cudaDeviceSynchronize();
float* temp_array = (float *)malloc(sizeof(float));
cudaMemcpyFromSymbol(temp_array, gpu_array, sizeof(float));
printf("h_array: %f\n", *temp_array);
}
Note that in this example, gpu_array is a scalar quantity, and so we could do something similar with temp_array:
void showData(){
cudaDeviceSynchronize();
float temp_array;
cudaMemcpyFromSymbol(&temp_array, gpu_array, sizeof(float));
printf("h_array: %f\n", temp_array);
}

Passing variables from parent kernel to child kernel in dynamic parallelism in cuda

I am trying to use dynamic parallelism in cuda. I am in a situation such that parent kernel has a variable that needs to be passed to child for further computation. I have gone through the resources in web
here
and it mentions that local variables cannot be passed to the child kernal and has mentioned the ways to pass variables and I have tried to pass the pass the variable as
#include <stdio.h>
#include <cuda.h>
__global__ void square(float *a, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(N==10)
{
a[idx] = a[idx] * a[idx];
}
}
// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int n=N; // this value of n can be changed locally and need to be passed
printf("%d\n",n);
cudaMalloc((void **) &n, sizeof(int));
square <<< 1, N >>> (arr, n);
}
// main routine that executes on the host
int main(void)
{
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 10; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
first <<< 1, 1 >>> (a_d, N);
//cudaThreadSynchronize();
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
}
and the value of parent to child kernel is not passed . how can I pass the value of local variable. Is there any way to do so?
This operation is not appropriate:
int n=N; // this value of n can be changed locally and need to be passed
cudaMalloc((void **) &n, sizeof(int)); // illegal
It is not appropriate in host code, nor in device code. n is an int variable. You are not supposed to assign a pointer to it. When you attempt to do so in a 64-bit environment, you are attempting to write a 64-bit pointer on top of a 32-bit int quantity. It will not work.
It's not clear why you would need it anyway. n is an integer parameter presumably specifying the size of your arr array of float. You don't need to allocate anything on top of it.
If you had run this code with cuda-memcheck, you could easily discover that error. You can also do proper cuda error checking in device code in exactly the same fashion as you do it in host code.
When I comment out that cudaMalloc line in the first kernel, your code runs correctly for me.

CUDA branch divergence doesn't make any differences

I'm trying to learn CUDA by myself, and I'm now into the issue of branch divergence. As far as I understand, this is the name given to the problem that arises when several threads in a block are said to take a branch (due to if or switch statements, for example), but others in that block don't have to take it.
In order to investigate a little bit further this phenomena and its consequences, I've written a little file with a couple of CUDA functions. One of them is supposed to take lots of time, since the threads are stopped for much more time (9999... iterations) than in the other one (in which they're only stopped for an assignation).
However, when I run the code, I'm getting very similar times. Furthermore, even measuring the time that running both of them takes I get a time similar to running only one. Did I code anything wrong, or is there a logical explanation for this?
Code:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#define ITERATIONS 9999999999999999999
#define BLOCK_SIZE 16
unsigned int hTimer;
void checkCUDAError (const char *msg)
{
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,cudaGetErrorString( err) );
getchar();
exit(EXIT_FAILURE);
}
}
__global__ void divergence(float *A, float *B){
float result = 0;
if(threadIdx.x % 2 == 0)
{
for(int i=0;i<ITERATIONS;i++){
result+=A[threadIdx.x]*A[threadIdx.x];
}
} else
for(int i=0;i<ITERATIONS;i++){
result+=A[threadIdx.x]*B[threadIdx.x];
}
}
__global__ void betterDivergence(float *A, float *B){
float result = 0;
float *aux;
//This structure should not affect performance that much
if(threadIdx.x % 2 == 0)
aux = A;
else
aux = B;
for(int i=0;i<ITERATIONS;i++){
result+=A[threadIdx.x]*aux[threadIdx.x];
}
}
// ------------------------
// MAIN function
// ------------------------
int main(int argc, char ** argv){
float* d_a;
float* d_b;
float* d_result;
float *elementsA;
float *elementsB;
elementsA = (float *)malloc(BLOCK_SIZE*sizeof(float));
elementsB = (float *)malloc(BLOCK_SIZE*sizeof(float));
//"Randomly" filling the arrays
for(int x=0;x<BLOCK_SIZE;x++){
elementsA[x] = (x%2==0)?2:1;
elementsB[x] = (x%2==0)?1:3;
}
cudaMalloc((void**) &d_a, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_b, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_result, sizeof(float));
cudaMemcpy(d_a, elementsA, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, elementsB, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
CUT_SAFE_CALL(cutCreateTimer(&hTimer));
CUT_CHECK_ERROR("cudaCreateTimer\n");
CUT_SAFE_CALL( cutResetTimer(hTimer) );
CUT_CHECK_ERROR("reset timer\n");
CUT_SAFE_CALL( cutStartTimer(hTimer) );
CUT_CHECK_ERROR("start timer\n");
float timerValue;
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(32/dimBlock.x, 32/dimBlock.y);
divergence<<<dimBlock, dimGrid>>>(d_a, d_b);
betterDivergence<<<dimBlock, dimGrid>>>(d_a, d_b);
checkCUDAError("kernel invocation");
cudaThreadSynchronize();
CUT_SAFE_CALL(cutStopTimer(hTimer));
CUT_CHECK_ERROR("stop timer\n");
timerValue = cutGetTimerValue(hTimer);
printf("kernel execution time (secs): %f s\n", timerValue);
return 0;
}
1) You have no memory writes in your __global__ code except the local variable(result). I'm not sure that cuda compiler does that, but all your code can be safely removed with no side effect(and maybe the compiler had done that).
2) All your reads from device memory in __global__ functions are from one place on each iteration. Cuda will store the value in register memory and the longest operation(memory access) will be done very fast here.
3) May be the compiler had replaced your cycles with single multiplication like `result=ITERATIONS*A[threadIdx.x]*B[threadIdx.x]
4) If all the code in your functions will be executed as you wrote it, your betterDivergence is going to be approximately 2 times faster than your another function because you have the loops in if branches in slower one and no loops in branches in faster one. But there won't be any idle time in threads among the threads that execute same loop because all threads are going to execute the body of the loop each iteration.
I suggest you to write another example where you will store the result in some device memory and then copy that memory back to host and make some more unpredictable calculations to prevent possible optimizations.
Below is shown the final, tested, right example of a code that allows to compare the performance between CUDA code with and without branch divergence:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
//#define ITERATIONS 9999999999999999999
#define ITERATIONS 999999
#define BLOCK_SIZE 16
#define WARP_SIZE 32
unsigned int hTimer;
void checkCUDAError (const char *msg)
{
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,cudaGetErrorString( err) );
getchar();
exit(EXIT_FAILURE);
}
}
__global__ void divergence(float *A, float *B){
int a = blockIdx.x*blockDim.x + threadIdx.x;
if (a >= ITERATIONS) return;
if(threadIdx.x > 2)
{
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]+1;
}
} else
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]-1;
}
}
__global__ void noDivergence(float *A, float *B){
int a = blockIdx.x*blockDim.x + threadIdx.x;
if (a >= ITERATIONS) return;
if(threadIdx.x > WARP_SIZE)
{
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]+1;
}
} else
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]-1;
}
}
// ------------------------
// MAIN function
// ------------------------
int main(int argc, char ** argv){
float* d_a;
float* d_b;
float* d_result;
float *elementsA;
float *elementsB;
elementsA = (float *)malloc(BLOCK_SIZE*sizeof(float));
elementsB = (float *)malloc(BLOCK_SIZE*sizeof(float));
//"Randomly" filling the arrays
for(int x=0;x<BLOCK_SIZE;x++){
elementsA[x] = (x%2==0)?2:1;
}
cudaMalloc((void**) &d_a, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_b, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_result, sizeof(float));
cudaMemcpy(d_a, elementsA, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, elementsB, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
CUT_SAFE_CALL(cutCreateTimer(&hTimer));
CUT_CHECK_ERROR("cudaCreateTimer\n");
CUT_SAFE_CALL( cutResetTimer(hTimer) );
CUT_CHECK_ERROR("reset timer\n");
CUT_SAFE_CALL( cutStartTimer(hTimer) );
CUT_CHECK_ERROR("start timer\n");
float timerValue;
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(128/dimBlock.x, 128/dimBlock.y);
//divergence<<<dimGrid, dimBlock>>>(d_a, d_b);
noDivergence<<<dimGrid, dimBlock>>>(d_a, d_b);
checkCUDAError("kernel invocation");
cudaThreadSynchronize();
CUT_SAFE_CALL(cutStopTimer(hTimer));
CUT_CHECK_ERROR("stop timer\n");
timerValue = cutGetTimerValue(hTimer)/1000;
printf("kernel execution time (secs): %f s\n", timerValue);
cudaMemcpy(elementsB, d_b, BLOCK_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
return 0;
}

Simple constant memory example not working

Please help me why below not working. In my comp, it just prints 0 when I try to print the value of var.
#include<stdio.h>
__constant__ float pivot;
__global__ void kernel(float *set){
*set = pivot;
}
void main(){
float c[] = {1,3,4};
cudaError_t err = cudaMemcpyToSymbol(pivot,&c[2], sizeof(float));
/*
float test;
cudaMemcpyFromSymbol(&test,pivot,sizeof(float));
printf("the value of test is %f",test);
*/
if(err!=0){
printf("some error\n");
getchar();
}
float *st;
cudaMalloc((void**)&st, sizeof(float));
kernel<<<1,1>>>(st);
float *var = (float*)malloc(sizeof(float));
cudaMemcpy(var, st, sizeof(float),cudaMemcpyDeviceToHost);
printf("the value of st is %f",var);
getchar();
}
var is a float *, so when you call printf("... %f, var);, you are actually sending the address of the memory allocated by (float*)malloc(sizeof(float)).
P.s. I can't edit the post, but shouldn't the tags be c and printf, as the problem/question has very little to do with nvidia or cuda (apart from the fact that the two are referenced in the code).