Copying data from GPU to CPU - CUDA

I have a problem with copying data from the GPU to the CPU. At the beginning, I create a variable in GPU space:
__device__ float gpu_array;
In this GPU function, I want to copy the value from od.fC[gi] (value = 0.43...) into gpu_array:
__global__ void Collide(.....){
    ....
    // Streaming
    od.fS[gi] = fi_S;
    od.fN[gi] = fi_N;
    od.fE[gi] = fi_E;
    od.fW[gi] = fi_W;
    gpu_array = od.fC[gi];
}
And here is the function that copies the data from GPU to CPU:
void showData(){
    cudaDeviceSynchronize();
    float* temp_array = NULL;
    cudaMemcpyFromSymbol((void**)&temp_array, "gpu_array", sizeof(temp_array), 0, cudaMemcpyDeviceToHost);
    float* cpu_array = (float*)malloc(sizeof(float));
    cudaMemcpy(cpu_array, temp_array, sizeof(float), cudaMemcpyDeviceToHost);
    printf("h_array: %f\n", *cpu_array);
}
In the end it does copy something, but the values are wrong (0.00000, -0.00000, etc.). What am I doing wrong?

There are multiple problems with your use of cudaMemcpyFromSymbol:
float* temp_array = NULL;
cudaMemcpyFromSymbol((void**)&temp_array, "gpu_array", sizeof(temp_array), 0, cudaMemcpyDeviceToHost);
- you haven't allocated storage for the data pointed to by temp_array
- the device symbol name is no longer passed as a quoted string; since CUDA 5.0 you pass the symbol itself
- the destination argument should be a plain pointer to the destination storage, not the address of a pointer
- the remaining code around cpu_array is completely unnecessary
Correct usage would be something like this:
void showData(){
    cudaDeviceSynchronize();
    float* temp_array = (float *)malloc(sizeof(float));
    cudaMemcpyFromSymbol(temp_array, gpu_array, sizeof(float));
    printf("h_array: %f\n", *temp_array);
}
Note that in this example, gpu_array is a scalar quantity, so we could make temp_array a plain scalar as well:
void showData(){
    cudaDeviceSynchronize();
    float temp_array;
    cudaMemcpyFromSymbol(&temp_array, gpu_array, sizeof(float));
    printf("h_array: %f\n", temp_array);
}
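For completeness, here is a minimal, self-contained sketch of the whole round trip with error checking; the kernel name writeValue and the stored value 0.43f are placeholders I chose, not part of the original code:

#include <cstdio>

__device__ float gpu_array;

// hypothetical stand-in for the original Collide kernel
__global__ void writeValue(){
    gpu_array = 0.43f;
}

int main(){
    writeValue<<<1, 1>>>();

    // cudaMemcpyFromSymbol is blocking and runs in the default stream,
    // so it implicitly waits for the kernel above to finish
    float host_value = 0.0f;
    cudaError_t err = cudaMemcpyFromSymbol(&host_value, gpu_array, sizeof(float));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpyFromSymbol failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("h_array: %f\n", host_value);  // expected: 0.430000
    return 0;
}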

Related

How to copy a device variable to host in CUDA [duplicate]

Segmentation fault in CUDA

I receive Segmentation fault (core dumped) when I run this code.
I know the cudaMalloc is the problem, but I have no idea how to solve it. I just started learning CUDA programming and I'm not familiar with it. I'm working on WSL, if that matters.
#include <stdio.h>
#define SIZE 20

__global__ void VectorAdd(int *a, int *b, int *c, int n){
    int i = threadIdx.x;
    if(i < n)
        c[i] = a[i] + b[i];
}

int main(){
    int *a, *b, *c;
    cudaMalloc((void**)&a, SIZE * sizeof(int));
    cudaMalloc((void**)&b, SIZE * sizeof(int));
    cudaMalloc((void**)&c, SIZE * sizeof(int));
    for (int i = 1; i < SIZE; i++){
        a[i] = i;
        b[i] = i;
        c[i] = 0;
    }
    VectorAdd<<<1, SIZE>>>(a, b, c, SIZE);
    cudaDeviceSynchronize();
    for(int i = 1; i < SIZE; i++){
        printf("%d \n", c[i]);
    }
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 0;
}
As the comments already suggested, you have to initialize the values of arrays a and b on the host, copy them to the device arrays, and once the computation is complete, copy the data from c back to the host.
#include <stdio.h>
#define SIZE 20

__global__ void VectorAdd(int *a, int *b, int *c, int n){
    int i = threadIdx.x;
    if(i < n)
        c[i] = a[i] + b[i];
}

int main(){
    int *a, *b, *c;
    int *h_a, *h_b, *h_c; /* declare pointers to host arrays */
    cudaMalloc((void**)&a, SIZE * sizeof(int));
    cudaMalloc((void**)&b, SIZE * sizeof(int));
    cudaMalloc((void**)&c, SIZE * sizeof(int));
    /* allocate memory for host arrays */
    h_a = new int[SIZE];
    h_b = new int[SIZE];
    h_c = new int[SIZE];
    /* initialize values on host arrays */
    for (int i = 0; i < SIZE; i++){
        h_a[i] = i;
        h_b[i] = i;
    }
    /* copy data from host to device */
    cudaMemcpy(a, h_a, SIZE * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(b, h_b, SIZE * sizeof(int), cudaMemcpyHostToDevice);
    VectorAdd<<<1, SIZE>>>(a, b, c, SIZE);
    // cudaDeviceSynchronize(); /* not needed: the cudaMemcpy below synchronizes */
    /* copy results from device to host */
    cudaMemcpy(h_c, c, SIZE * sizeof(int), cudaMemcpyDeviceToHost);
    for(int i = 0; i < SIZE; i++){
        printf("%d \n", h_c[i]);
    }
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    /* free host memory */
    delete [] h_a;
    delete [] h_b;
    delete [] h_c;
    return 0;
}
Notes
- For some reason you start iterating from position 1 instead of 0 in your for loops. I assumed this was accidental and fixed it.
- cudaMemcpy is a blocking call that, in the default stream, synchronizes with the preceding kernel launch, so cudaDeviceSynchronize() is not necessary after the kernel invocation here.
- To avoid explicit handling of separate host and device data, you could use cudaMallocManaged instead of cudaMalloc, as sketched below.
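A minimal sketch of the managed-memory variant (my illustration, following the documented cudaMallocManaged semantics): a single allocation is visible to both host and device, so the explicit cudaMemcpy calls disappear:

#include <stdio.h>
#define SIZE 20

__global__ void VectorAdd(int *a, int *b, int *c, int n){
    int i = threadIdx.x;
    if(i < n)
        c[i] = a[i] + b[i];
}

int main(){
    int *a, *b, *c;
    /* managed memory is accessible from both host and device */
    cudaMallocManaged(&a, SIZE * sizeof(int));
    cudaMallocManaged(&b, SIZE * sizeof(int));
    cudaMallocManaged(&c, SIZE * sizeof(int));
    for (int i = 0; i < SIZE; i++){
        a[i] = i;
        b[i] = i;
    }
    VectorAdd<<<1, SIZE>>>(a, b, c, SIZE);
    /* here the synchronization IS required: the host must not
       read c before the kernel has finished writing it */
    cudaDeviceSynchronize();
    for (int i = 0; i < SIZE; i++){
        printf("%d \n", c[i]);
    }
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 0;
}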

CUDA device runtime api cudaMemsetAsync doesn't work

I am trying to call cudaMemsetAsync from a kernel (so-called "dynamic parallelism"), but no matter what value I use, it always sets the memory to 0.
Here is my test code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_device_runtime_api.h"
#include <stdio.h>
const int size = 5;
__global__ void kernel(int *c)
{
cudaMemsetAsync(c, 0x7FFFFFFF, size * 4, NULL);
}
int main()
{
cudaError_t cudaStatus;
int c[size] = { 12, 12, 12, 12, 12 };
int *dev_c = 0;
cudaStatus = cudaSetDevice(0);
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
cudaStatus = cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
kernel <<< 1, 1 >>>(dev_c);
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_c);
cudaStatus = cudaDeviceReset();
printf("%d\n", cudaStatus);
printf("{%d,%d,%d,%d,%d}\n", c[0], c[1], c[2], c[3], c[4]);
return 0;
}
And when I run it, I get output like this:
>nvcc -run kernel.cu -gencode=arch=compute_35,code=\"sm_35,compute_35\" -rdc=true -lcudadevrt
kernel.cu
Creating library a.lib and object a.exp
0
{0,0,0,0,0}
When I call the memory set, I use the value 0x7FFFFFFF, so I'm expecting non-zero numbers, but it always shows zero.
Is this a bug, or did I do something wrong? I'm using CUDA 8.0.
I can confirm this appears not to work in CUDA 8 on the systems I tested it with.
If you want a single thread to perform the operation, you can use memset directly in device code (it, like memcpy, has been supported forever). The compiler will emit a byte-sized loop inline within your kernel, and the operation will be performed in full by each thread that executes it.
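For illustration, a minimal sketch of that in-kernel approach (the kernel name and launch are mine, not from the question):

__global__ void memset_inline(int *c, size_t nbytes)
{
    // plain C memset, compiled inline as a byte-wise loop; every thread
    // that reaches this line performs the complete set on its own
    memset(c, 0xff, nbytes);
}

Launched as memset_inline<<<1, 1>>>(dev_c, size * 4), a single thread sets all the bytes.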
If you want a dynamic parallelism style memset operation, then the easiest thing is to make your own. A trivial (and very, very lightly tested) implementation in the code you posted might look like this:
#include <cstring>
#include <cstdio>

const int size = 5;

__global__ void myMemset_kernel(void* p, unsigned char val, size_t sz)
{
    size_t tid = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned char* _p = (unsigned char*)p;
    // grid-stride loop: the grid as a whole covers every byte
    for(; tid < sz; tid += blockDim.x * gridDim.x) {
        _p[tid] = val;
    }
}

__device__ void myMemset(void* p, unsigned int val, size_t sz, cudaStream_t s=NULL)
{
    const dim3 blocksz(256,1,1);
    size_t nblocks = (sz + blocksz.x - 1) / blocksz.x;
    unsigned char charval = val & 0xff;  // only the least significant byte is used
    myMemset_kernel<<< dim3(nblocks,1,1), blocksz, 0, s >>>(p, charval, sz);
}

__global__ void kernel(int *c)
{
    cudaStream_t s;
    cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
    myMemset(c, 0x7FFFFFFF, size * 4, s);
    cudaDeviceSynchronize();
}

int main()
{
    int c[size];
    int *dev_c;
    memset(&c[0], 0xffffff0c, size * sizeof(int));  // host memset also uses only the low byte (0x0c)
    printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);
    cudaMalloc((void**)&dev_c, size * sizeof(int));
    cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
    kernel <<< 1, 1 >>>(dev_c);
    cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_c);
    printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);
    return 0;
}
which compiles and does this:
$ nvcc -rdc=true -arch=sm_52 -o memset memset.cu -lcudadevrt
$ ./memset
{0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c}
{ffffffff,ffffffff,ffffffff,ffffffff,ffffffff}
A final point -- note the values above. In your code, it is not possible to use cudaMemsetAsync to apply a value of 0x7FFFFFFF. Although the value argument is an unsigned integer, cudaMemset and its relatives work like regular memset and set byte values: only the least significant byte of the 32-bit argument is used. If your objective is to set 32-bit values, then you will need to make your own version of memset for that purpose anyway.
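A minimal sketch of such a 32-bit fill (my own illustration, not part of the original answer): each thread stores whole words instead of bytes:

__global__ void myMemset32_kernel(unsigned int* p, unsigned int val, size_t nwords)
{
    size_t tid = threadIdx.x + blockDim.x * (size_t)blockIdx.x;
    // grid-stride loop over 32-bit words rather than bytes
    for (; tid < nwords; tid += blockDim.x * gridDim.x) {
        p[tid] = val;
    }
}

Launched as, say, myMemset32_kernel<<<nblocks, 256>>>(ptr, 0x7FFFFFFF, nwords), this stores the full 32-bit pattern rather than a repeated byte.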

CUDA branch divergence doesn't make any differences

I'm trying to learn CUDA by myself, and I've now run into the issue of branch divergence. As far as I understand, this is the name given to the problem that arises when several threads in a block take a branch (due to if or switch statements, for example) while other threads in that block don't have to take it.
In order to investigate this phenomenon and its consequences a little further, I've written a small file with a couple of CUDA functions. One of them is supposed to take a lot of time, since its threads are kept busy for much longer (9999... iterations) than those in the other one (where they only perform an assignment).
However, when I run the code, I'm getting very similar times. Furthermore, even measuring the time that running both of them takes, I get a time similar to running only one. Did I code something wrong, or is there a logical explanation for this?
Code:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>

#define ITERATIONS 9999999999999999999
#define BLOCK_SIZE 16

unsigned int hTimer;

void checkCUDAError (const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        getchar();
        exit(EXIT_FAILURE);
    }
}

__global__ void divergence(float *A, float *B){
    float result = 0;
    if(threadIdx.x % 2 == 0)
    {
        for(int i = 0; i < ITERATIONS; i++){
            result += A[threadIdx.x] * A[threadIdx.x];
        }
    } else
        for(int i = 0; i < ITERATIONS; i++){
            result += A[threadIdx.x] * B[threadIdx.x];
        }
}

__global__ void betterDivergence(float *A, float *B){
    float result = 0;
    float *aux;
    // This structure should not affect performance that much
    if(threadIdx.x % 2 == 0)
        aux = A;
    else
        aux = B;
    for(int i = 0; i < ITERATIONS; i++){
        result += A[threadIdx.x] * aux[threadIdx.x];
    }
}

// ------------------------
// MAIN function
// ------------------------
int main(int argc, char ** argv){
    float* d_a;
    float* d_b;
    float* d_result;
    float *elementsA;
    float *elementsB;
    elementsA = (float *)malloc(BLOCK_SIZE * sizeof(float));
    elementsB = (float *)malloc(BLOCK_SIZE * sizeof(float));
    // "Randomly" filling the arrays
    for(int x = 0; x < BLOCK_SIZE; x++){
        elementsA[x] = (x % 2 == 0) ? 2 : 1;
        elementsB[x] = (x % 2 == 0) ? 1 : 3;
    }
    cudaMalloc((void**) &d_a, BLOCK_SIZE * sizeof(float));
    cudaMalloc((void**) &d_b, BLOCK_SIZE * sizeof(float));
    cudaMalloc((void**) &d_result, sizeof(float));
    cudaMemcpy(d_a, elementsA, BLOCK_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, elementsB, BLOCK_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    CUT_SAFE_CALL(cutCreateTimer(&hTimer));
    CUT_CHECK_ERROR("cudaCreateTimer\n");
    CUT_SAFE_CALL( cutResetTimer(hTimer) );
    CUT_CHECK_ERROR("reset timer\n");
    CUT_SAFE_CALL( cutStartTimer(hTimer) );
    CUT_CHECK_ERROR("start timer\n");
    float timerValue;
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(32 / dimBlock.x, 32 / dimBlock.y);
    divergence<<<dimBlock, dimGrid>>>(d_a, d_b);
    betterDivergence<<<dimBlock, dimGrid>>>(d_a, d_b);
    checkCUDAError("kernel invocation");
    cudaThreadSynchronize();
    CUT_SAFE_CALL(cutStopTimer(hTimer));
    CUT_CHECK_ERROR("stop timer\n");
    timerValue = cutGetTimerValue(hTimer);
    printf("kernel execution time (secs): %f s\n", timerValue);
    return 0;
}
1) You have no memory writes in your __global__ code except to the local variable (result). I'm not sure the CUDA compiler does this, but all of your code could safely be removed with no side effects, and the compiler may well have done exactly that.
2) All of your reads from device memory in the __global__ functions load from the same location on every iteration. CUDA will keep that value in a register, so the longest operation (the memory access) is only paid once and the loop then runs very fast.
3) The compiler may have replaced your loops with a single multiplication, like result = ITERATIONS * A[threadIdx.x] * B[threadIdx.x].
4) If all of the code in your functions were executed exactly as written, your betterDivergence would be approximately 2 times faster than the other function, because the slower one has the loops inside the if branches while the faster one has no loops inside its branches. But there won't be any idle time among the threads that execute the same loop, because all threads execute the loop body on every iteration.
I suggest you write another example where you store the result in device memory, copy that memory back to the host, and perform some less predictable calculations, to prevent possible optimizations.
Below is the final, tested, correct example of code that allows comparing the performance of CUDA code with and without branch divergence:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>

//#define ITERATIONS 9999999999999999999
#define ITERATIONS 999999
#define BLOCK_SIZE 16
#define WARP_SIZE 32

unsigned int hTimer;

void checkCUDAError (const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        getchar();
        exit(EXIT_FAILURE);
    }
}

__global__ void divergence(float *A, float *B){
    int a = blockIdx.x * blockDim.x + threadIdx.x;
    if (a >= ITERATIONS) return;
    if(threadIdx.x > 2)
    {
        for(int i = 0; i < ITERATIONS; i++){
            B[a] = A[a] + 1;
        }
    } else
        for(int i = 0; i < ITERATIONS; i++){
            B[a] = A[a] - 1;
        }
}

__global__ void noDivergence(float *A, float *B){
    int a = blockIdx.x * blockDim.x + threadIdx.x;
    if (a >= ITERATIONS) return;
    if(threadIdx.x > WARP_SIZE)
    {
        for(int i = 0; i < ITERATIONS; i++){
            B[a] = A[a] + 1;
        }
    } else
        for(int i = 0; i < ITERATIONS; i++){
            B[a] = A[a] - 1;
        }
}

// ------------------------
// MAIN function
// ------------------------
int main(int argc, char ** argv){
    float* d_a;
    float* d_b;
    float* d_result;
    float *elementsA;
    float *elementsB;
    elementsA = (float *)malloc(BLOCK_SIZE * sizeof(float));
    elementsB = (float *)malloc(BLOCK_SIZE * sizeof(float));
    // "Randomly" filling the arrays
    for(int x = 0; x < BLOCK_SIZE; x++){
        elementsA[x] = (x % 2 == 0) ? 2 : 1;
    }
    cudaMalloc((void**) &d_a, BLOCK_SIZE * sizeof(float));
    cudaMalloc((void**) &d_b, BLOCK_SIZE * sizeof(float));
    cudaMalloc((void**) &d_result, sizeof(float));
    cudaMemcpy(d_a, elementsA, BLOCK_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, elementsB, BLOCK_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    CUT_SAFE_CALL(cutCreateTimer(&hTimer));
    CUT_CHECK_ERROR("cudaCreateTimer\n");
    CUT_SAFE_CALL( cutResetTimer(hTimer) );
    CUT_CHECK_ERROR("reset timer\n");
    CUT_SAFE_CALL( cutStartTimer(hTimer) );
    CUT_CHECK_ERROR("start timer\n");
    float timerValue;
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(128 / dimBlock.x, 128 / dimBlock.y);
    //divergence<<<dimGrid, dimBlock>>>(d_a, d_b);
    noDivergence<<<dimGrid, dimBlock>>>(d_a, d_b);
    checkCUDAError("kernel invocation");
    cudaThreadSynchronize();
    CUT_SAFE_CALL(cutStopTimer(hTimer));
    CUT_CHECK_ERROR("stop timer\n");
    timerValue = cutGetTimerValue(hTimer) / 1000;
    printf("kernel execution time (secs): %f s\n", timerValue);
    cudaMemcpy(elementsB, d_b, BLOCK_SIZE * sizeof(float), cudaMemcpyDeviceToHost);
    return 0;
}
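As an aside, the cutil.h timing macros (CUT_SAFE_CALL, cutCreateTimer and friends) come from the old SDK samples and have long been deprecated; a sketch of the same measurement using standard CUDA events (my substitution, not part of the original answer) would look like this:

cudaEvent_t start, stop;
float ms = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
noDivergence<<<dimGrid, dimBlock>>>(d_a, d_b);
cudaEventRecord(stop);

// block until the stop event has been reached, then read the elapsed time
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);
printf("kernel execution time (secs): %f s\n", ms / 1000.0f);

cudaEventDestroy(start);
cudaEventDestroy(stop);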

Simple constant memory example not working

Please help me figure out why the code below is not working. On my machine, it just prints 0 when I try to print the value of var.
#include <stdio.h>

__constant__ float pivot;

__global__ void kernel(float *set){
    *set = pivot;
}

void main(){
    float c[] = {1, 3, 4};
    cudaError_t err = cudaMemcpyToSymbol(pivot, &c[2], sizeof(float));
    /*
    float test;
    cudaMemcpyFromSymbol(&test, pivot, sizeof(float));
    printf("the value of test is %f", test);
    */
    if(err != 0){
        printf("some error\n");
        getchar();
    }
    float *st;
    cudaMalloc((void**)&st, sizeof(float));
    kernel<<<1,1>>>(st);
    float *var = (float*)malloc(sizeof(float));
    cudaMemcpy(var, st, sizeof(float), cudaMemcpyDeviceToHost);
    printf("the value of st is %f", var);
    getchar();
}
var is a float *, so when you call printf("... %f", var);, you are actually passing the address of the memory allocated by (float*)malloc(sizeof(float)), not the value stored there. Dereference the pointer instead: printf("the value of st is %f", *var);.
P.S. I can't edit the post, but shouldn't the tags be c and printf, since the problem/question has very little to do with NVIDIA or CUDA (apart from the fact that the two are referenced in the code)?
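For reference, here is a corrected sketch of the posted example with the dereference fixed (plus int main and basic cleanup; the rest is unchanged from the question):

#include <stdio.h>

__constant__ float pivot;

__global__ void kernel(float *set){
    *set = pivot;
}

int main(){
    float c[] = {1, 3, 4};
    cudaError_t err = cudaMemcpyToSymbol(pivot, &c[2], sizeof(float));
    if(err != cudaSuccess){
        printf("some error\n");
        return 1;
    }
    float *st;
    cudaMalloc((void**)&st, sizeof(float));
    kernel<<<1,1>>>(st);
    float *var = (float*)malloc(sizeof(float));
    cudaMemcpy(var, st, sizeof(float), cudaMemcpyDeviceToHost);
    printf("the value of st is %f\n", *var); /* dereference: print the value, not the pointer */
    cudaFree(st);
    free(var);
    return 0;
}

This should print 4.000000, since c[2] is the value copied into the constant pivot.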