How to properly malloc struct A with cuda?
/* Inner element: a pointer to an int array plus the number of ints it
   refers to (the host example below allocates arraySize ints for it). */
struct B
{
int* pointerToInt; /* start of the int array */
int arraySize; /* number of ints behind pointerToInt */
};
/* Outer container: a pointer to an array of B plus the number of B
   elements in it (10 in the host example below). */
struct A
{
B* pointerToB; /* start of the B array */
int arraySize; /* number of B elements behind pointerToB */
};
If allocating on host memory, we can think of doing:
/* Build the whole nested structure in host memory.
   malloc results are cast because CUDA sources are compiled as C++. */
struct A* h_A;
h_A = (struct A*)malloc(sizeof(struct A));
h_A->arraySize = 10;
h_A->pointerToB = (struct B*)malloc(h_A->arraySize * sizeof(struct B));
for (int i = 0; i < h_A->arraySize; i++) {
    /* Work through a pointer to the array element: copying the element by
       value would update only a temporary, leaving pointerToB[i]
       uninitialized and leaking the inner allocation. */
    struct B* h_B = &(h_A->pointerToB)[i];
    h_B->arraySize = i + 5;
    h_B->pointerToInt = (int*)malloc(h_B->arraySize * sizeof(int));
}
If we try to do similar stuff with cudaMalloc:
/* Naive port of the host version: after cudaMalloc, d_A holds a device address. */
struct A* d_A;
cudaMalloc(&d_A, sizeof(struct A));
d_A->arraySize = 10; /*** error: host code dereferences the device pointer d_A ***/
cudaMalloc(&(d_A->pointerToB), 10 * sizeof(struct B)); /*** error: d_A-> is again a host-side dereference of device memory ***/
...
we'll encounter a segmentation fault, because we are trying to dereference d_A, which points to device memory. Device memory cannot be accessed from host code with the dereferencing operator.
One possible solution is to allocate the device memory for the struct B array inside your device code. You can call malloc and free in device code to manage device memory dynamically. See section B.20, "Dynamic Global Memory Allocation and Operations", of the CUDA Programming Guide.
Flattening your 2D array into 1D array might be a better solution.
Related
I receive "Segmentation fault (core dumped)" when I run this code.
I know that cudaMalloc is the problem, but I have no idea how to solve it. I have just started learning CUDA programming and I'm not familiar with it. I'm working on WSL, if it matters.
#include<stdio.h>
#define SIZE 20
// Element-wise sum c = a + b over the first n entries. Each thread handles the
// element selected by threadIdx.x alone, so only one block's worth of elements
// can be addressed.
__global__ void VectorAdd(int *a,int *b,int *c,int n){
    const int tid = threadIdx.x;
    if (tid >= n) {
        return;
    }
    c[tid] = a[tid] + b[tid];
}
// Question code (crashes): a, b and c are device allocations made with
// cudaMalloc, yet both loops below read and write them directly from the host.
int main(){
int *a,*b,*c;
cudaMalloc((void**)&a,SIZE *sizeof(int));
cudaMalloc((void**)&b, SIZE *sizeof(int));
cudaMalloc((void**)&c,SIZE *sizeof(int));
// Starts at 1, so element 0 is never initialized.
for (int i=1;i<SIZE;i++){
a[i]=i; // host-side write through a device pointer
b[i]=i;
c[i]=0;
}
VectorAdd<<<1, SIZE>>>(a,b,c,SIZE);
cudaDeviceSynchronize();
for(int i=1;i<SIZE;i++){
printf("%d \n",c[i]); // host-side read through a device pointer
}
cudaFree(a);
cudaFree(b);
cudaFree(c);
return 0;
}
As the comments already suggested, you have to initialize values for arrays a and b on the host, copy them to device array, and once computation is completed you have to copy data from c back to the host.
#include<stdio.h>
#define SIZE 20
// Adds a and b element by element into c for indices below n.
// The index is threadIdx.x only; blockIdx is not consulted.
__global__ void VectorAdd(int *a,int *b,int *c,int n){
    const int lane = threadIdx.x;
    const bool inRange = lane < n;
    if (inRange) {
        c[lane] = b[lane] + a[lane];
    }
}
// Leave main() with exit status 1 and a diagnostic when a CUDA runtime call
// fails. Without this, a failed allocation or copy would go unnoticed and the
// program would print garbage.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(err_));                   \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Adds two SIZE-element vectors on the GPU and prints the SIZE sums.
// Returns 0 on success, 1 if any CUDA call fails (buffers are then left for
// process teardown to reclaim).
int main(){
    int *a,*b,*c;            /* device arrays */
    int *h_a, *h_b, *h_c;    /* declare pointers to host arrays */
    const size_t bytes = SIZE * sizeof(int);

    CUDA_CHECK(cudaMalloc((void**)&a, bytes));
    CUDA_CHECK(cudaMalloc((void**)&b, bytes));
    CUDA_CHECK(cudaMalloc((void**)&c, bytes));

    /* allocate memory for host arrays */
    h_a = new int[SIZE];
    h_b = new int[SIZE];
    h_c = new int[SIZE];

    /* initialize values on host arrays */
    for (int i = 0; i < SIZE; i++){
        h_a[i]=i;
        h_b[i]=i;
    }

    /* copy data from host to device */
    CUDA_CHECK(cudaMemcpy(a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b, h_b, bytes, cudaMemcpyHostToDevice));

    VectorAdd<<<1, SIZE>>>(a,b,c,SIZE);
    /* a launch does not return a status; query it explicitly */
    CUDA_CHECK(cudaGetLastError());
    // cudaDeviceSynchronize(); /* this is not needed because cudaMemcpy implies sync. */

    /* copy results from device to host; errors raised while the kernel ran
       are reported by this call */
    CUDA_CHECK(cudaMemcpy(h_c, c, bytes, cudaMemcpyDeviceToHost));

    for(int i = 0; i < SIZE; i++){
        printf("%d \n",h_c[i]);
    }

    CUDA_CHECK(cudaFree(a));
    CUDA_CHECK(cudaFree(b));
    CUDA_CHECK(cudaFree(c));

    /* free host memory */
    delete [] h_a;
    delete [] h_b;
    delete [] h_c;
    return 0;
}
Notes
For some reason you start iterating from position 1 instead of 0 in your for loops. Assuming this was an accident, I fixed it.
cudaMemcpy always performs synchronization between host and device. So, cudaDeviceSynchronize() is not necessary after kernel invocation.
To avoid explicit handling of separated host and device data, you could use cudaMallocManaged instead of cudaMalloc.
I have written a CUDA kernel, and when I copy an array of shorts to device memory and then pass it to the kernel, it doesn't work. The simplified code below illustrates my issue.
// Copies a one-element array of shorts to the device and launches cudaKernel
// on it with a single thread. The device buffer is not freed here.
void KernelCaller()
{
    const int size = 1;
    short hostArray[size]{41};
    short* devPointer;
    // Allocate through devPointer, the same pointer that is then copied to and
    // handed to the kernel (the snippet previously named an undeclared
    // variable here).
    cudaMalloc((void**)&devPointer, size * sizeof(short));
    cudaMemcpy(devPointer, hostArray, size * sizeof(short), cudaMemcpyHostToDevice);
    cudaKernel<<<1,1>>>(devPointer);
}
// Loads this thread's element of arr into the local val; nothing is written back.
__global__ void cudaKernel(short* arr)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    short val = arr[gid];
}
At this point the value of val is 1063714857 and what I want it to be is 41.
I assume the issue is 41 in hex is 0x29 and the value I have is 0x3F670029 so it looks like it read too many bytes cause the 0x29 is at the beginning. When I switch to an array of floats it works perfectly, but I was trying to save memory. Does CUDA not allow an array of shorts?
I have implemented your code and I am getting the expected output.
Here's the code
#include<stdio.h>
// Reads this thread's element of arr, prints it from device code when
// __CUDA_ARCH__ is at least 200, and stores the same value back.
__global__ void cudaKernel(short* arr)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    const short val = arr[gid];
#if __CUDA_ARCH__ >= 200
    printf("Inside kernel %d\n", val);
#endif
    arr[gid] = val;
}
// Sends a single short (41) to the device, runs cudaKernel on it with one
// thread, copies it back and prints the value before and after.
int main()
{
    const int size = 1;
    short hostArray[size]{41};
    const size_t bytes = size * sizeof(short);

    printf("Before kernel call %d\n",hostArray[0]);

    short *d_arr;
    cudaMalloc((void**)&d_arr, bytes);
    cudaMemcpy(d_arr, hostArray, bytes, cudaMemcpyHostToDevice);

    cudaKernel<<<1,1>>>(d_arr);

    cudaMemcpy(hostArray, d_arr, bytes, cudaMemcpyDeviceToHost);
    printf("After kernel call %d\n",hostArray[0]);

    cudaFree(d_arr);
    return 0;
}
And the output is
Before kernel call 41
Inside kernel 41
After kernel call 41
So, yes we can pass array of shorts into a CUDA kernel.
I am trying to use dynamic parallelism in CUDA. The parent kernel has a variable that needs to be passed to the child kernel for further computation. I have gone through the resources on the web
here
and it mentions that local variables cannot be passed to the child kernel, and it describes the ways to pass variables. I have tried to pass the variable as follows:
#include <stdio.h>
#include <cuda.h>
// Squares this thread's element of a in place, but only when N equals 10;
// for any other N the kernel does nothing.
__global__ void square(float *a, int N)
{
    if (N != 10) {
        return;
    }
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    a[gid] *= a[gid];
}
// Parent kernel that executes on the CUDA device: prints N and launches the
// child kernel square from device code with N threads.
__global__ void first(float *arr, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x; // computed but not used below
int n=N; // this value of n can be changed locally and need to be passed
printf("%d\n",n);
// NOTE(review): &n is the address of an int, not of a pointer variable, so
// this call asks cudaMalloc to store a pointer on top of n.
cudaMalloc((void **) &n, sizeof(int));
square <<< 1, N >>> (arr, n);
}
// Host entry point: fills a 10-element float array with 0..9, runs the parent
// kernel on a device copy of it, then prints each index with the value copied
// back from the device.
int main(void)
{
    const int N = 10;                        // Number of elements in arrays
    const size_t size = N * sizeof(float);   // Byte size of each array

    float *a_h = (float *)malloc(size);      // Host array
    float *a_d;                              // Device array
    cudaMalloc((void **) &a_d, size);

    // Initialize host array and copy it to CUDA device
    for (int k = 0; k < N; ++k) {
        a_h[k] = (float)k;
    }
    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

    // Do calculation on device:
    first <<< 1, 1 >>> (a_d, N);
    //cudaThreadSynchronize();

    // Retrieve result from device and store it in host array
    cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost);

    // Print results
    for (int k = 0; k < N; ++k) {
        printf("%d %f\n", k, a_h[k]);
    }

    // Cleanup
    free(a_h);
    cudaFree(a_d);
}
but the value is not passed from the parent kernel to the child kernel. How can I pass the value of a local variable? Is there any way to do so?
This operation is not appropriate:
int n=N; // this value of n can be changed locally and need to be passed
cudaMalloc((void **) &n, sizeof(int)); // illegal: &n is the address of an int, not of a pointer variable
It is not appropriate in host code, nor in device code. n is an int variable. You are not supposed to assign a pointer to it. When you attempt to do so in a 64-bit environment, you are attempting to write a 64-bit pointer on top of a 32-bit int quantity. It will not work.
It's not clear why you would need it anyway. n is an integer parameter presumably specifying the size of your arr array of float. You don't need to allocate anything on top of it.
If you had run this code with cuda-memcheck, you could easily discover that error. You can also do proper cuda error checking in device code in exactly the same fashion as you do it in host code.
When I comment out that cudaMalloc line in the first kernel, your code runs correctly for me.
I am trying to build a CUDA program to do ray tracing, and I have the code below:
// First attempt from the question: fills a ViewPlane on the host, copies it
// to a device allocation, then allocates the background colour and the sphere
// with cudaMalloc and assigns to them directly from host code.
void build_world(World *w, RGBAColor* buffer){
// Host-side staging copy of the view plane.
w->vp = (ViewPlane*) malloc(sizeof(ViewPlane));
w->vp->hres = 512;
w->vp->vres = 512;
w->vp->buffer = buffer;
w->vp->s = 1;
ViewPlane *viewplane;
cudaMalloc(&viewplane,sizeof(ViewPlane)); //return cudaSuccess but pointer still NULL
cudaMemcpy(viewplane,w->vp,sizeof(ViewPlane),cudaMemcpyHostToDevice);
free(w->vp);
// From here on w->vp holds the address returned by cudaMalloc.
w->vp = viewplane;
cudaMalloc(&(w->background_color),sizeof(RGBAColor)); //return cudaSuccess but pointer still NULL
// NOTE(review): host code writes through a pointer obtained from cudaMalloc.
*(w->background_color) = black; //Memory access error
cudaMalloc(&(w->sphere),sizeof(Sphere)); //return cudaSuccess but pointer still NULL
// NOTE(review): same pattern, w->sphere was just filled in by cudaMalloc.
w->sphere->center = Point3D(0.0,0.0,0.0);
w->sphere->radius = 300;
}
World *w is a static global pointer, and it is in the global memory.
My problem is that I cannot allocate device memory: most of the time the "cudaMalloc" calls do not work.
i do what #RobertCrovella suggested in comment, like this:
// Second attempt from the question: every member is allocated with cudaMalloc
// (now with error checking) and then assigned to directly from host code.
void build_world(World *w, RGBAColor* buffer){
checkCudaErrors( cudaMalloc(&(w->vp),sizeof(ViewPlane)));
getLastCudaError("viewplane allocate failed");
// NOTE(review): w->vp was just filled in by cudaMalloc; the assignments below
// dereference it from host code.
w->vp->hres = 512; //memory access errors occurs here
w->vp->vres = 512;
w->vp->buffer = buffer;
w->vp->s = 1;
checkCudaErrors( cudaMalloc(&(w->background_color),sizeof(RGBAColor)));
getLastCudaError("background allocate failed");
*(w->background_color) = black;
checkCudaErrors( cudaMalloc(&(w->sphere),sizeof(Sphere)));
getLastCudaError("sphere allocate failed");
w->sphere->center = Point3D(0.0,0.0,0.0);
w->sphere->radius = 300;
}
and it works once...the cudaMalloc API still returns "cudaSuccess" when it's not.
here is the definitions of structure:
typedef float3 Point3D;
typedef uchar4 RGBAColor;
// A sphere given by its centre point and radius.
struct Sphere{
Point3D center; // Point3D is a typedef for float3
float radius;
};
// View plane settings plus a pointer to RGBAColor data.
struct ViewPlane{
public:
int hres; // build_world sets this to 512
int vres; // build_world sets this to 512
float s; // build_world sets this to 1; meaning not shown here (TODO confirm)
//float gamma;
//float inv_gamma;
RGBAColor *buffer; // supplied by the caller of build_world
};
// Top-level scene object; in this version every member is a pointer.
struct World{
public:
ViewPlane *vp;
RGBAColor *background_color;
Sphere *sphere;
};
after considering the issues that #RobertCrovella mentions in the answer below, here is the third version of build_world:
// Third version of World: background_color is now stored by value, while vp
// and sphere remain pointers.
struct World{
public:
ViewPlane *vp;
RGBAColor background_color;
Sphere *sphere;
};
// Third attempt from the question: builds host staging copies (h_world, h_vp,
// h_sphere), allocates device memory for the view plane, the sphere and the
// World itself, and copies each staging object to its device counterpart.
void build_world(World *w, RGBAColor* buffer){
World *h_world;
h_world = (World*)malloc(sizeof(World));
ViewPlane *h_vp = (ViewPlane*)malloc(sizeof(ViewPlane));
h_vp->hres = 512;
h_vp->vres = 512;
// NOTE(review): buffer is stored as-is; whether it is a host or a device
// address depends on the caller (not shown).
h_vp->buffer = buffer;
h_vp->s = 1;
// h_world->vp receives a device address, which is later copied into the
// device-side World as a plain pointer value.
checkCudaErrors( cudaMalloc(&(h_world->vp),sizeof(ViewPlane)));
getLastCudaError("viewplane allocate failed");
checkCudaErrors( cudaMemcpy(h_world->vp,h_vp,sizeof(ViewPlane),cudaMemcpyHostToDevice));
getLastCudaError("viewplane memory copy failed");
h_world->background_color = black;
Sphere *h_sphere = (Sphere*)malloc(sizeof(Sphere));
h_sphere->center = Point3D(0.0,0.0,0.0);
h_sphere->radius = 300;
checkCudaErrors( cudaMalloc(&(h_world->sphere),sizeof(Sphere)));
getLastCudaError("sphere allocate failed");
checkCudaErrors( cudaMemcpy(h_world->sphere,h_sphere,sizeof(Sphere),cudaMemcpyHostToDevice));
getLastCudaError("sphere memory copy failed");
// w is a by-value parameter: this call overwrites only the local copy, so
// the caller's pointer is not updated.
checkCudaErrors( cudaMalloc( &w , sizeof(World)));
getLastCudaError( "world allocate failed" );
checkCudaErrors( cudaMemcpy(w,h_world,sizeof(World),cudaMemcpyHostToDevice));
getLastCudaError("world memory copy failed");
free(h_world);free(h_vp);free(h_sphere);
}
This time, none of the cudaMemcpy calls seem to work: at the end of this function, the values in h_vp and h_sphere are good; h_world->vp and h_world->sphere do point to areas of device memory, but those areas contain wrong values; and w does not have the correct value, since all the pointers it contains are 0x00000000...
This question has officially become "a mess" because you have posted two substantially different versions of build_world which differ in important ways, apart from just the error checking I asked you to add. I will try and address some issues as I see them, however my understanding is clouded by the confusion in your posting.
If the pointer *w that you are passing to build_world is already a device pointer (i.e. allocated with cudaMalloc), which seems to be what you are saying, then none of this will work. Creating data structures on the device, which also contain pointers to other data structures that are also on the device, is a somewhat non-intuitive process. You cannot pass to cudaMalloc a pointer that already lives on the device (i.e. is already part of a region created with cudaMalloc). Instead it's necessary to create a parallel set of pointers on the host, cudaMalloc these pointers individually, then copy the pointer values to the appropriate regions in the device data structure, using cudaMemcpy. To see another example of what I am referring to, take a look here.
You cannot dereference device pointers in host code. For example:
w->vp->hres = 512;
If w or w->vp is a pointer set up with cudaMalloc, then the above operation is invalid. Instead it's necessary to create a parallel data structure on the host, set the values there, then cudaMemcpy from host to device:
h_vp->hres = 512;
cudaMemcpy(d_vp, h_vp, sizeof(vp_struct), cudaMemcpyHostToDevice);
Note that in this simplified description I'm glossing over the issue I mentioned in the first point above.
If you are calling build_world over and over again, you need to make sure that you are properly using cudaFree if you are passing the same *w pointer.
EDIT: In response to the additional posting of the 3rd version of build_world I elected to create a sample code which should have the remaining issues fixed:
#include <stdio.h>
#include <vector_functions.h>
#define black make_uchar4(4,3,2,1)
#define white make_uchar4(0,1,2,3)
// Reads (and clears) the most recent CUDA runtime error. If one is pending,
// prints msg together with the CUDA error string and the source location,
// then terminates the process with exit status 1.
// The local is named err_ because identifiers containing a double underscore
// are reserved for the implementation.
#define cudaCheckErrors(msg) \
do { \
cudaError_t err_ = cudaGetLastError(); \
if (err_ != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(err_), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef float3 Point3D;
typedef uchar4 RGBAColor;
// A sphere given by its centre point and radius; build_world fills one on the
// host and copies it to the device.
struct Sphere{
Point3D center; // Point3D is a typedef for float3
float radius;
};
// View plane settings plus a pointer to colour data.
struct ViewPlane{
public:
int hres; // build_world sets this to 512
int vres; // build_world sets this to 512
float s; // build_world sets this to 1; meaning not shown here (TODO confirm)
//float gamma;
//float inv_gamma;
RGBAColor *buffer; // build_world stores a cudaMalloc'd address here
};
// Top-level scene object. vp and sphere are filled with addresses returned by
// cudaMalloc in build_world; background_color is stored by value.
struct World{
public:
ViewPlane *vp;
RGBAColor background_color;
Sphere *sphere;
};
// Debug kernel: prints one field reached through each member of the World
// passed in, including the colour buffer behind the view plane.
__global__ void my_kernel(World *w){
    const ViewPlane *plane = w->vp;
    const Sphere *ball = w->sphere;
    printf("w->vp->hres = %d\n", plane->hres);
    printf("w->background_color.y = %d\n", w->background_color.y);
    printf("w->sphere->radius = %f\n", ball->radius);
    printf("w->vp->buffer->y = %d\n", plane->buffer->y);
}
// Builds a World in device memory and returns its device address through *w.
//
//   w      - out-parameter; receives the cudaMalloc'd address of the World.
//   buffer - host pointer to one RGBAColor, copied into a fresh device buffer
//            that the device-side ViewPlane points to.
//
// Each nested object is staged on the host, its pointer members are filled with
// device addresses, and it is then copied to the device, innermost first.
// After every CUDA call cudaCheckErrors terminates the process on failure.
// None of the device allocations is freed here.
void build_world(World **w, RGBAColor* buffer){
    // Host-side staging copies. Automatic storage replaces the unchecked
    // malloc() calls: no null dereference if an allocation fails, and
    // nothing to free on the way out.
    World h_world;
    ViewPlane h_vp;
    Sphere h_sphere;

    h_vp.hres = 512;
    h_vp.vres = 512;
    h_vp.s = 1.0f;
    // The colour buffer must itself live on the device before the ViewPlane
    // that points to it is copied over.
    cudaMalloc((void **)&(h_vp.buffer), sizeof(RGBAColor));
    cudaCheckErrors("viewplane RGBAColor allocate failed");
    cudaMemcpy(h_vp.buffer, buffer, sizeof(RGBAColor), cudaMemcpyHostToDevice);
    cudaCheckErrors("viewplane RGBAColor copy failed");

    // h_world.vp receives a device address; it is only copied as a value, never
    // dereferenced on the host.
    cudaMalloc((void **)&(h_world.vp), sizeof(ViewPlane));
    cudaCheckErrors("viewplane allocate failed");
    cudaMemcpy(h_world.vp, &h_vp, sizeof(ViewPlane), cudaMemcpyHostToDevice);
    cudaCheckErrors("viewplane memory copy failed");

    h_world.background_color = black;

    h_sphere.center = (Point3D) make_float3(0.0f, 0.0f, 0.0f);
    h_sphere.radius = 300.0f;
    cudaMalloc((void **)&(h_world.sphere), sizeof(Sphere));
    cudaCheckErrors("sphere allocate failed");
    cudaMemcpy(h_world.sphere, &h_sphere, sizeof(Sphere), cudaMemcpyHostToDevice);
    cudaCheckErrors("sphere memory copy failed");

    // Finally the World itself; writing through w updates the caller's pointer.
    cudaMalloc((void **)w, sizeof(World));
    cudaCheckErrors( "world allocate failed" );
    cudaMemcpy(*w, &h_world, sizeof(World), cudaMemcpyHostToDevice);
    cudaCheckErrors("world memory copy failed");
}
// Builds the device-side World from a single host colour, runs the debug
// kernel on it with one thread, waits for it and reports any CUDA error.
int main(){
    RGBAColor hostColor = white;
    World *d_world;

    build_world(&d_world, &hostColor);

    my_kernel<<<1,1>>>(d_world);
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");

    return 0;
}
You can compile this code with nvcc -arch=sm_20 -o t98 t98.cu
When I compile and run this code, I get no errors and the following output:
$ ./t98
w->vp->hres = 512
w->background_color.y = 3
w->sphere->radius = 300.000000
w->vp->buffer->y = 1
$
we have the following struct defined
// One purchase-order record. The stray `typedef` (it declared no alias name)
// is dropped; in C++ the struct name is already a type name, and the member
// order is unchanged.
struct PurchaseOrder
{
    char* Value1;      // pointer to character data
    double Value2;
    double* Value3;    // pointer to an array of doubles
    int Value3Length;  // number of doubles behind Value3

    // Device-side accessors. The getters are const-qualified so they can also
    // be called through a const PurchaseOrder.
    __device__ int GetValue3Length() const { return Value3Length; }
    __device__ double GetValue3(int i) const { return Value3[i]; }
    // Writes value into Value3[i]; no bounds check is performed.
    __device__ void SetValue3(int i, double value) { Value3[i] = value; }
};
The PurchaseOrder data (array of structs) are marshalled from C# application into the following C dll function
// Prints every incoming purchase order, copies the array of structs to the
// device, runs MonteCarloKernel on it, copies the array back and frees the
// device copy. Always returns 0.
int RunMonteCarlo(PurchaseOrder *hostPurchaseOrders, int length) {
PurchaseOrder *devPurchaseOrders;
// display the results
for (int i = 0; i < length; i++)
{
//printf("\n\nAddress: %u",hostPurchaseOrders+i);
printf("\n\nIndex: %d", i);
printf("\nValue1: %s",(hostPurchaseOrders+i)->Value1);
printf("\nValue2: %f",(hostPurchaseOrders+i)->Value2);
for(int j = 0; j < (hostPurchaseOrders+i)->Value3Length; j++)
{
printf("\nValue3[%d]: %fl", j, (hostPurchaseOrders+i)->Value3[j]);
}
}
// allocate the memory on the GPU
HANDLE_ERROR( cudaMalloc( (void**)&devPurchaseOrders, length * sizeof(PurchaseOrder) ) );
// copy the array 'PurchaseOrder' to the GPU
// NOTE(review): this copies the struct bytes only; the Value1 and Value3
// members still hold the addresses they had on the host.
HANDLE_ERROR( cudaMemcpy( devPurchaseOrders, hostPurchaseOrders, length * sizeof(PurchaseOrder), cudaMemcpyHostToDevice ) );
// Run the kernel code
MonteCarloKernel<<<60,32>>>( devPurchaseOrders, length);
// copy the array 'PurchaseOrders' back from the GPU to the CPU
HANDLE_ERROR( cudaMemcpy(hostPurchaseOrders, devPurchaseOrders, length * sizeof(PurchaseOrder), cudaMemcpyDeviceToHost ) );
// free the memory allocated on the GPU
HANDLE_ERROR( cudaFree( devPurchaseOrders ) );
return 0;
}
// Visits purchaseorders[i] for i = global thread index, advancing by the total
// number of launched threads until i reaches length (a grid-stride loop).
__global__ void MonteCarloKernel(PurchaseOrder *purchaseorders, long length) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
while (i < length)
{
// NOTE(review): SetAAUS is not a member of the PurchaseOrder struct shown
// with this snippet; presumably it was dropped when the code was simplified.
purchaseorders[i].SetAAUS(1.11);
for (int j=0; j < purchaseorders[i].GetValue3Length(); j++)
{
//purchaseorders[i].SetValue3(j,1.0);
}
i += stride;
}
}
The data are marshalled correctly as verified by the printf code at the beginning.
However, the Value3 (array of double) seems not copied into the device memory as the line purchaseorders[i].SetValue3(j,1.0) in the kernel crashes the application.
What should I do to solve it?
When the application crashes, the console window just closes. What debugging technique could I use to get some meaningful messages?
Value1 and Value3 are pointers. In hostPurchaseOrders, which refers to host data, these pointers are pointing to locations in host memory.
When you allocate device memory for devPurchaseOrders using cudaMalloc, the memory is only allocated for the structures and the pointers inside them. When you copy hostPurchaseOrders to devPurchaseOrders, you only copied the memory addresses in Value1 and Value3 fields. Since, they are pointing to some location in host memory, this location cannot be accessed successfully from the device.
The host memory pointers cannot be directly copied to device memory like you did above. You will need to manually allocate a device location for each host pointer, copy the values there from host to device and then set this location in the Value1 and Value3 pointers of the device structure.
This is extremely messy. Consider restructuring your host data, so that you can copy from host to device in a simple manner.