CUDA - Copy device data to host?

I have a device variable in which I allocate and fill an array on the device, but I have a problem getting the data back to the host: cudaMemcpy() returns a cudaErrorInvalidValue error. How can I do it?
PS: The code is just an example. I know that in this particular case I could use cudaMalloc because I know the size of the array, but in my REAL code the size of the array is computed on the device and the memory needs to be allocated there immediately.
PS2: I found a similar problem, but I still don't know how to solve it - copy data which is allocated in device from device to host
PS3: I have updated the code, but it still doesn't work:
PS4: I have also tried to run this code on a notebook with an Nvidia GT 520MX (latest driver) and it doesn't work either :(
Thanks
#include <cuda.h>
#include <stdio.h>

#define N 400

__device__ int* d_array;

__global__ void allocDeviceMemory()
{
    d_array = new int[N];
    for (int i = 0; i < N; i++)
        d_array[i] = 123;
}

int main()
{
    allocDeviceMemory<<<1, 1>>>();
    cudaDeviceSynchronize();

    int* d_a = NULL;
    cudaMemcpyFromSymbol((void**)&d_a, "d_array", sizeof(d_a), 0, cudaMemcpyDeviceToHost);
    printf("gpu address: %p\n", (void*)d_a);

    int* h_array = (int*)malloc(N*sizeof(int));
    cudaError_t errr = cudaMemcpy(h_array, d_a, N*sizeof(int), cudaMemcpyDeviceToHost);
    printf("h_array: %d, %d\n", h_array[0], errr);

    getchar();
    return 0;
}

You need to synchronize (cudaDeviceSynchronize()) after launching the kernel that allocates the memory.
Can you also check the return value of the sync and all other CUDA API calls?
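A minimal error-checking sketch might look like the following (the checkCuda macro name is just an illustration, not part of the original code):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper: print and abort on any CUDA runtime error.
#define checkCuda(call)                                               \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage:
//   allocDeviceMemory<<<1, 1>>>();
//   checkCuda(cudaGetLastError());        // launch errors
//   checkCuda(cudaDeviceSynchronize());   // errors during kernel execution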

I have tested your code and there is no error here. I am running CUDA 4.0.

Related

CUDA Seg Fault for Int Device to Host Copy

Why does the following simple program (24 lines) lead to a segmentation fault at the shrinked_size_host int variable?
#include <stdio.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>

__global__ void cuda_set(int* device_var) {
    *device_var = 12;
    printf("Set device variable to: %d\n", *device_var);
}

int main() {
    printf("Hello world CPU\n");

    int* shrinked_size_device;
    cudaMalloc((void**)&shrinked_size_device, sizeof(int));
    cudaDeviceSynchronize();

    cudaMemset(shrinked_size_device, 0, sizeof(int));
    cudaDeviceSynchronize();

    cuda_set<<<1,1>>>(shrinked_size_device);
    cudaDeviceSynchronize();

    int* shrinked_size_host = 0;
    cudaMemcpy(shrinked_size_host, shrinked_size_device, sizeof(int), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();

    printf("shrinked_size_host=%d\n", *shrinked_size_host);
    return 0;
}
That's the output produced from the program:
Hello world CPU
Set device variable to: 12
Segmentation fault (core dumped)
Not sure why there is a segmentation fault.
I figured out the answer to this question.
Memory for shrinked_size_host has to be allocated, so do either:
Heap allocation: use malloc or new int to allocate a single integer. Remember to free/delete the allocated memory at the end.
Stack allocation: use int shrinked_size_host[1];
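For example, a minimal sketch of the stack-allocation fix, assuming the rest of the program stays unchanged:

    // Stack-allocate the destination so cudaMemcpy writes into valid host memory.
    int shrinked_size_host[1] = {0};
    cudaMemcpy(shrinked_size_host, shrinked_size_device, sizeof(int), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    printf("shrinked_size_host=%d\n", shrinked_size_host[0]);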

cuda unified memory and pointer aliasing

I am refreshing my memory of CUDA, especially unified memory (my last real CUDA development was 3 years ago); I am a bit rusty.
The problem:
I am creating a task from a container that uses unified memory. However, I get a crash. After a few days of investigation
I am able to say where the crash happens (in the copy constructor), but not why, because all the pointers are allocated correctly.
I do not think I am contradicting the NVIDIA post (https://devblogs.nvidia.com/parallelforall/unified-memory-in-cuda-6/)
about C++ and unified memory.
#include <cuda.h>
#include <cstdio>

template<class T>
struct container {
    container(int size = 1) { cudaMallocManaged(&p, size*sizeof(T)); }
    ~container() { cudaFree(p); }
    __device__ __host__ T& operator[](int i) { return p[i]; }
    T* p;
};

struct task {
    int* a;
};

__global__ void kernel_gpu(task& t, container<task>& v) {
    printf(" gpu value task %i, should be 2 \n", *(t.a)); // this works
    task tmp(v[0]); // BUG
    printf(" gpu value task from vector %i, should be 1 \n", *(tmp.a));
}

void kernel_cpu(task& t, container<task>& v) {
    printf(" cpu value task %i, should be 2 \n", *(t.a)); // this works
    task tmp(v[0]);
    printf(" cpu value task from vector %i, should be 1 \n", *(tmp.a));
}

int main(int argc, const char* argv[]) {
    int* p1;
    int* p2;
    cudaMallocManaged(&p1, sizeof(int));
    cudaMallocManaged(&p2, sizeof(int));
    *p1 = 1;
    *p2 = 2;

    task t1, t2;
    t1.a = p1;
    t2.a = p2;

    container<task> c(2);
    c[0] = t1;
    c[1] = t2;

    // gpu does not work
    kernel_gpu<<<1,1>>>(c[1], c);
    cudaDeviceSynchronize();

    // cpu should work, no concurrent access
    kernel_cpu(c[1], c);

    printf("job done!\n");
    cudaFree(p1);
    cudaFree(p2);
    return 0;
}
Objectively, I can pass an object as an argument when its memory has been allocated properly. However, it looks like it is not possible to use a second degree of indirection (here the container).
I am making a conceptual mistake, but I do not see where.
Best,
Timocafe
My machine: CUDA 7.5, gcc 4.8.2, Tesla K20m
Although the memory was allocated as Unified Memory, the container itself is declared in host code and allocated in host memory: container<task> c(2);. You cannot pass it by reference to device code, and dereferencing it in a kernel will very likely result in an illegal memory access.
You may want to use cuda-memcheck to identify such issues.
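One possible workaround (a sketch, assuming only the element array needs to be visible on the device) is to pass the container's managed pointer c.p to the kernel instead of the host-resident container object:

// Sketch: the kernel takes the managed element pointer rather than container<task>&.
__global__ void kernel_gpu(task& t, task* v) {
    printf(" gpu value task %i, should be 2 \n", *(t.a));
    task tmp(v[0]);   // v points to managed memory, so this copy is legal on the device
    printf(" gpu value task from vector %i, should be 1 \n", *(tmp.a));
}

// Launch with the underlying managed pointer:
//   kernel_gpu<<<1,1>>>(c[1], c.p);
//   cudaDeviceSynchronize();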

Why is this not copying from device to host in Cuda?

I'm working through the examples of the "CUDA by Example" book. The following code doesn't give me an answer and doesn't work as it should. Where's the mistake?
I will appreciate your help and answers.
I get an output which reads:
Calculation done on GPU yields the answer: &d
Press enter to stop
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
using namespace std;
__global__ void add_integers_cuda(int a, int b, int *c)
{
*c = a + b;
}
int main(void)
{
int c;
int *dev_ptr;
cudaMalloc((void **)&dev_ptr, sizeof(int)); //allocate sizeof(int) bytes of contiguous memory in the gpu device and return the address of first byte to dev_ptr.
// call the kernel
add_integers_cuda <<<1,1>>>(2,7,dev_ptr);
cudaMemcpy(&c, dev_ptr, sizeof(int), cudaMemcpyDeviceToHost);
printf("Calculation done on GPU yields the answer: &d\n",c );
cudaFree(dev_ptr);
printf("Press enter to stop.");
cin.ignore(255, '\n');
return 0;
}
"
&d is not a correct printf format specifier here:
printf("Calculation done on GPU yields the answer: &d\n",c );
You won't get the output you are expecting.
You should use %d instead:
printf("Calculation done on GPU yields the answer: %d\n",c );
This particular issue has nothing to do with CUDA of course.
You may also want to run CUDA codes with cuda-memcheck and/or use proper CUDA error checking if you are just learning and having trouble. Neither of those would have pointed out the above error, however.

cudaEventElapsedTime not expected behaviour

I'm trying to compute the total time taken on the GPU to compute something. I'm using cudaEventRecord and cudaEventElapsedTime to determine this, but I'm seeing unexpected behaviour, or at least unexpected for me :) I wrote this example to understand what's happening and I'm still confused.
In the example below I was expecting to report the same time for the three iterations but the result is:
2.80342
1003
2005.6
Which means that the total time is including the CPU sleep time.
Am I doing something wrong? If not, is it possible to do what I want?
#include <iostream>
#include <thread>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

__global__ void kernel_test(int *a, int N) {
    for (int i = threadIdx.x; i < N; i += N) {
        if (i < N)
            a[i] = 1;
    }
}

int main(int argc, char ** argv) {
    cudaEvent_t start[3], stop[3];
    for (int i = 0; i < 3; i++) {
        cudaEventCreate(&start[i]);
        cudaEventCreate(&stop[i]);
    }

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    const int N = 1024 * 1024;
    int *h_a = (int*)malloc(N * sizeof(int));
    int *a = 0;
    cudaMalloc((void**)&a, N * sizeof(int));

    for (int i = 0; i < 3; i++) {
        cudaEventRecord(start[i], stream);
        cudaMemcpyAsync(a, h_a, N * sizeof(int), cudaMemcpyHostToDevice, stream);
        kernel_test<<<1, 1024, 0, stream>>>(a, N);
        cudaMemcpyAsync(h_a, a, N * sizeof(int), cudaMemcpyDeviceToHost, stream);
        cudaEventRecord(stop[i], stream);

        std::this_thread::sleep_for(std::chrono::seconds(i));

        cudaEventSynchronize(stop[i]);
        float milliseconds = 0;
        cudaEventElapsedTime(&milliseconds, start[i], stop[i]);
        std::cout << milliseconds << std::endl;
    }
    return 0;
}
I attach the nsight result to verify the behaviour of my example.
Windows 8.1
Geforce GTX 780 Ti
Nvidia drivers: 358.50
EDIT:
Added code to be complete
Attached the Nsight result
Added OS and driver info
If you're running the program on Windows using the WDDM driver model (in contrast to TCC with Tesla cards, or Linux), this may be the issue:
With WDDM, kernels are not executed immediately after invocation but are instead enqueued to a command buffer. Once the buffer is full it gets flushed and the enqueued commands are actually executed. Another way to force the command buffer to be flushed is to synchronize explicitly.
Now what happens is that you wait before the command buffer is actually flushed...
Edit
Also see https://devtalk.nvidia.com/default/topic/548639/is-wddm-causing-this-/ for the problem and how cudaEventQuery(0) may help.
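A minimal sketch of how that workaround might look in the loop from the question (assuming the WDDM flushing behaviour described above):

    cudaEventRecord(stop[i], stream);
    // Non-blocking query of the freshly recorded event; it is commonly reported
    // to flush the WDDM command buffer, so the GPU starts working while the CPU sleeps.
    cudaEventQuery(stop[i]);

    std::this_thread::sleep_for(std::chrono::seconds(i));

    cudaEventSynchronize(stop[i]);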

Modulus computation of an array of cufftComplex data type in CUDA

I made a DLL in Visual C++ to compute the modulus of an array of complex numbers in CUDA. The array is of type cufftComplex. I then called the DLL from LabVIEW to check the accuracy of the result, and I'm receiving an incorrect result. Could anyone tell me what is wrong with the following code, please? I think there should be something wrong with my kernel function (the way I am retrieving the cufftComplex data may be incorrect).
#include <math.h>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>

extern "C" __declspec(dllexport) void Modulus(cufftComplex *digits, float *result);

__global__ void ModulusComputation(cufftComplex *a, int N, float *temp)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx < N)
    {
        temp[idx] = sqrt((a[idx].x * a[idx].x) + (a[idx].y * a[idx].y));
    }
}

void Modulus(cufftComplex *digits, float *result)
{
    #define N 1024
    cufftComplex *d_data;
    float *temp;
    size_t size = sizeof(cufftComplex)*N;

    cudaMalloc((void**)&d_data, size);
    cudaMalloc((void**)&temp, sizeof(float)*N);
    cudaMemcpy(d_data, digits, size, cudaMemcpyHostToDevice);

    int blockSize = 16;
    int nBlocks = N/blockSize;
    if (N % blockSize != 0)
        nBlocks++;

    ModulusComputation<<<nBlocks, blockSize>>>(d_data, N, temp);

    cudaMemcpy(result, temp, size, cudaMemcpyDeviceToHost);

    cudaFree(d_data);
    cudaFree(temp);
}
In the final cudaMemcpy in your code, you have:
cudaMemcpy(result, temp, size, cudaMemcpyDeviceToHost);
It should be:
cudaMemcpy(result, temp, sizeof(float)*N, cudaMemcpyDeviceToHost);
If you had included error checking for your CUDA calls, you would have seen this call (as originally written) throw an error.
There are other comments that could be made. For example, your block size (16) should ideally be an integral multiple of 32, but this does not prevent proper operation.
After the kernel call, when copying back the result, you are using size as the memory size. The third argument of cudaMemcpy should be N * sizeof(float).
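Putting both remarks together, a sketch of the corrected launch and final copy might look like this (the error check is just an illustration, not part of the original DLL):

    // Block size a multiple of 32, grid size rounded up,
    // and the final copy sized in floats rather than cufftComplex.
    int blockSize = 256;
    int nBlocks = (N + blockSize - 1) / blockSize;

    ModulusComputation<<<nBlocks, blockSize>>>(d_data, N, temp);

    cudaError_t err = cudaMemcpy(result, temp, N * sizeof(float), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));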