We have the following struct defined:
typedef struct PurchaseOrder
{
    char* Value1;
    double Value2;
    double* Value3;
    int Value3Length;
    __device__ int GetValue3Length() { return Value3Length; }
    __device__ double GetValue3(int i) { return Value3[i]; }
    __device__ void SetValue3(int i, double value) { Value3[i] = value; }
} PurchaseOrder;
The PurchaseOrder data (an array of structs) is marshalled from the C# application into the following C DLL function:
int RunMonteCarlo(PurchaseOrder *hostPurchaseOrders, int length) {
    PurchaseOrder *devPurchaseOrders;

    // display the results
    for (int i = 0; i < length; i++)
    {
        //printf("\n\nAddress: %p", hostPurchaseOrders+i);
        printf("\n\nIndex: %d", i);
        printf("\nValue1: %s", (hostPurchaseOrders+i)->Value1);
        printf("\nValue2: %f", (hostPurchaseOrders+i)->Value2);
        for (int j = 0; j < (hostPurchaseOrders+i)->Value3Length; j++)
        {
            printf("\nValue3[%d]: %f", j, (hostPurchaseOrders+i)->Value3[j]);
        }
    }

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&devPurchaseOrders, length * sizeof(PurchaseOrder) ) );

    // copy the array 'PurchaseOrder' to the GPU
    HANDLE_ERROR( cudaMemcpy( devPurchaseOrders, hostPurchaseOrders, length * sizeof(PurchaseOrder), cudaMemcpyHostToDevice ) );

    // run the kernel
    MonteCarloKernel<<<60,32>>>( devPurchaseOrders, length );

    // copy the array 'PurchaseOrders' back from the GPU to the CPU
    HANDLE_ERROR( cudaMemcpy( hostPurchaseOrders, devPurchaseOrders, length * sizeof(PurchaseOrder), cudaMemcpyDeviceToHost ) );

    // free the memory allocated on the GPU
    HANDLE_ERROR( cudaFree( devPurchaseOrders ) );
    return 0;
}
__global__ void MonteCarloKernel(PurchaseOrder *purchaseorders, int length) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    while (i < length)
    {
        purchaseorders[i].Value2 = 1.11;
        for (int j = 0; j < purchaseorders[i].GetValue3Length(); j++)
        {
            //purchaseorders[i].SetValue3(j, 1.0);
        }
        i += stride;
    }
}
The data is marshalled correctly, as verified by the printf code at the beginning.
However, Value3 (the array of doubles) does not appear to be copied into device memory, since the line purchaseorders[i].SetValue3(j,1.0) in the kernel crashes the application.
What should I do to solve this?
When the application crashes, the console window just closes. What debugging technique could I use to get some meaningful messages?
Value1 and Value3 are pointers. In hostPurchaseOrders, which refers to host data, these pointers point to locations in host memory.
When you allocate device memory for devPurchaseOrders using cudaMalloc, memory is allocated only for the structures themselves, including the pointer fields inside them. When you then copy hostPurchaseOrders to devPurchaseOrders, you copy only the addresses stored in the Value1 and Value3 fields. Since those addresses refer to locations in host memory, they cannot be dereferenced from the device.
Host memory pointers cannot simply be copied into device memory the way you did above. You need to manually allocate a device location for each host pointer, copy the pointed-to values from host to device, and then store that device location in the Value1 and Value3 fields of the corresponding device structure.
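For illustration, here is a minimal sketch of that manual deep copy for Value3 (Value1 would need the same treatment); it assumes devPurchaseOrders has already been allocated and the structs copied over as in your RunMonteCarlo, and error checking is omitted:

for (int i = 0; i < length; i++)
{
    double *devValue3;
    int n = hostPurchaseOrders[i].Value3Length;
    // allocate a device buffer for this order's Value3 data
    cudaMalloc((void**)&devValue3, n * sizeof(double));
    // copy the host Value3 contents into it
    cudaMemcpy(devValue3, hostPurchaseOrders[i].Value3, n * sizeof(double), cudaMemcpyHostToDevice);
    // patch the Value3 field of the i-th device struct to hold the device address
    cudaMemcpy(&devPurchaseOrders[i].Value3, &devValue3, sizeof(double*), cudaMemcpyHostToDevice);
}

Copying results back is just as fiddly: you have to keep the device addresses around and copy each Value3 buffer back individually.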
This is extremely messy. Consider restructuring your host data so that you can copy from host to device in a single straightforward step.
Related
I receive "Segmentation fault (core dumped)" when I run this code.
I know the cudaMalloc is the problem, but I have no idea how to solve it. I just started learning CUDA programming and I'm not familiar with it. I'm working on WSL, if that matters.
#include <stdio.h>
#define SIZE 20

__global__ void VectorAdd(int *a, int *b, int *c, int n) {
    int i = threadIdx.x;
    if (i < n)
        c[i] = a[i] + b[i];
}

int main() {
    int *a, *b, *c;
    cudaMalloc((void**)&a, SIZE * sizeof(int));
    cudaMalloc((void**)&b, SIZE * sizeof(int));
    cudaMalloc((void**)&c, SIZE * sizeof(int));
    for (int i = 1; i < SIZE; i++) {
        a[i] = i;
        b[i] = i;
        c[i] = 0;
    }
    VectorAdd<<<1, SIZE>>>(a, b, c, SIZE);
    cudaDeviceSynchronize();
    for (int i = 1; i < SIZE; i++) {
        printf("%d \n", c[i]);
    }
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 0;
}
As the comments already suggested, you have to initialize the values of arrays a and b on the host, copy them to the device arrays, and once the computation is complete, copy the data from c back to the host.
#include <stdio.h>
#define SIZE 20

__global__ void VectorAdd(int *a, int *b, int *c, int n) {
    int i = threadIdx.x;
    if (i < n)
        c[i] = a[i] + b[i];
}

int main() {
    int *a, *b, *c;
    int *h_a, *h_b, *h_c; /* declare pointers to host arrays */
    cudaMalloc((void**)&a, SIZE * sizeof(int));
    cudaMalloc((void**)&b, SIZE * sizeof(int));
    cudaMalloc((void**)&c, SIZE * sizeof(int));

    /* allocate memory for host arrays */
    h_a = new int[SIZE];
    h_b = new int[SIZE];
    h_c = new int[SIZE];

    /* initialize values on host arrays */
    for (int i = 0; i < SIZE; i++) {
        h_a[i] = i;
        h_b[i] = i;
    }

    /* copy data from host to device */
    cudaMemcpy(a, h_a, SIZE * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(b, h_b, SIZE * sizeof(int), cudaMemcpyHostToDevice);

    VectorAdd<<<1, SIZE>>>(a, b, c, SIZE);
    // cudaDeviceSynchronize(); /* this is not needed because cudaMemcpy implies sync. */

    /* copy results from device to host */
    cudaMemcpy(h_c, c, SIZE * sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < SIZE; i++) {
        printf("%d \n", h_c[i]);
    }

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);

    /* free host memory */
    delete [] h_a;
    delete [] h_b;
    delete [] h_c;
    return 0;
}
Notes
For some reason you start iterating from position 1 instead of 0 in your for loops! If this was an accident, I fixed it.
cudaMemcpy always performs synchronization between host and device, so cudaDeviceSynchronize() is not necessary after the kernel invocation.
To avoid explicit handling of separate host and device copies of the data, you could use cudaMallocManaged instead of cudaMalloc, as sketched below.
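For illustration, a minimal managed-memory version of the code above might look like this (a sketch only; note that with no cudaMemcpy to imply synchronization, the cudaDeviceSynchronize() after the kernel becomes mandatory before the host reads c):

#include <stdio.h>
#define SIZE 20

__global__ void VectorAdd(int *a, int *b, int *c, int n) {
    int i = threadIdx.x;
    if (i < n)
        c[i] = a[i] + b[i];
}

int main() {
    int *a, *b, *c;
    /* managed allocations are accessible from both host and device */
    cudaMallocManaged(&a, SIZE * sizeof(int));
    cudaMallocManaged(&b, SIZE * sizeof(int));
    cudaMallocManaged(&c, SIZE * sizeof(int));
    for (int i = 0; i < SIZE; i++) {
        a[i] = i;
        b[i] = i;
    }
    VectorAdd<<<1, SIZE>>>(a, b, c, SIZE);
    cudaDeviceSynchronize(); /* required before the host touches c */
    for (int i = 0; i < SIZE; i++)
        printf("%d \n", c[i]);
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 0;
}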
I have a problem with thread IDs while a block executes.
I would like to get a sentence like "My temporary string is printed via GPU!", but as you can see (in the photo attached earlier) the sentence is displayed in the wrong order, and I don't know how to fix it.
Code:
__global__ void Print(const char* const __string, const size_t* const loop_repeat)
{
    int id_x = threadIdx.x + blockIdx.x * blockDim.x;
    while (id_x < static_cast<int>(*loop_repeat))
    {
        printf("%c", __string[id_x]);
        __syncthreads();
        id_x += blockDim.x * gridDim.x;
    }
}
int main()
{
    const char* my_string = "My temporary string is printed via GPU!";
    size_t temp{};
    temp = Get_String_Length(my_string); //get the string length

    //GPU MEMORY ALLOCATION
    size_t* my_string_length{};
    cudaMalloc((void**)&my_string_length, sizeof(size_t));

    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(my_string_length, &temp, sizeof(size_t), HostToDevice);

    char* string_GPU{};
    cudaMalloc((void**)&string_GPU, temp * sizeof(char));

    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(string_GPU, my_string, temp * sizeof(char), HostToDevice);

    dim3 grid_size(1);
    dim3 block_size(temp);
    Print <<< grid_size, temp >>> (string_GPU, my_string_length);
    cudaError_t final_error = cudaDeviceSynchronize(); //synchronize so the kernel's printf output appears before the host printf

    if (final_error == cudaSuccess)
    {
        printf("%cKernel executed successfully with code: %d !\n", NEW_LINE, final_error);
    }
    else
    {
        printf("%cKernel executed with error code: %d !\n", NEW_LINE, final_error);
    }

    cudaFree(my_string_length);
    cudaFree(string_GPU);
    return 0;
}
I will be grateful for any help given.
The main issue here is that you are expecting the thread or warp execution order to be predictable. Actually, it is not, and your usage of __syncthreads() doesn't fix or address this issue.
If you want the warps to execute in a predictable order (not recommended), you would need to impose that order yourself. Here is an example that demonstrates that for this very simple code. It does not extend to larger strings without modification, and this method will completely break down if you introduce more than one threadblock.
$ cat t1543.cu
#include <stdio.h>
#include <stdlib.h>

__global__ void Print(const char* const __string, const size_t* const loop_repeat)
{
    int id_x = threadIdx.x + blockIdx.x * blockDim.x;
    int warp_ID = threadIdx.x >> 5;
    while (id_x < static_cast<int>(*loop_repeat))
    {
        if (warp_ID == 0)
            printf("%c", __string[id_x]);
        __syncthreads();
        if (warp_ID == 1)
            printf("%c", __string[id_x]);
        __syncthreads();
        id_x += blockDim.x * gridDim.x;
    }
}

int main()
{
    const char* my_string = "My temporary string is printed via GPU!";
    size_t temp;
    temp = 40; //the string length, hard-coded here

    //GPU MEMORY ALLOCATION
    size_t* my_string_length;
    cudaMalloc((void**)&my_string_length, sizeof(size_t));

    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(my_string_length, &temp, sizeof(size_t), cudaMemcpyHostToDevice);

    char* string_GPU;
    cudaMalloc((void**)&string_GPU, temp * sizeof(char));

    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(string_GPU, my_string, temp * sizeof(char), cudaMemcpyHostToDevice);

    dim3 grid_size(1);
    dim3 block_size(temp);
    Print <<< grid_size, temp >>> (string_GPU, my_string_length);
    cudaError_t final_error = cudaDeviceSynchronize(); //synchronize so the kernel's printf output appears before the host printf

    if (final_error == cudaSuccess)
    {
        printf("\nKernel executed successfully with code: %d !\n", final_error);
    }
    else
    {
        printf("\nKernel executed with error code: %d !\n", final_error);
    }

    cudaFree(my_string_length);
    cudaFree(string_GPU);
    return 0;
}
$ nvcc -o t1543 t1543.cu
$ cuda-memcheck ./t1543
========= CUDA-MEMCHECK
My temporary string is printed via GPU!
Kernel executed successfully with code: 0 !
========= ERROR SUMMARY: 0 errors
$
Note that I'm not suggesting the above is good coding style. It's provided to help understand the issue. Even this code relies on the idea that the threads within a warp will call the printf function in a predictable order, which is not guaranteed by the CUDA programming model. So the code is really still broken.
This happened because the multiprocessor creates, manages, schedules, and executes threads in groups of 32 parallel threads called warps, as described in the CUDA Programming Guide. So the first 32 threads cover "My temporary string is printed v" and the remaining threads cover "ia GPU!". It seems that the kernel executed the latter warp before the first one.
How to properly malloc struct A with CUDA?
struct B
{
    int* pointerToInt;
    int arraySize;
};

struct A
{
    B* pointerToB;
    int arraySize;
};
If allocating in host memory, we might write something like this:
struct A* h_A;
h_A = malloc(sizeof(struct A));
h_A->arraySize = 10;
h_A->pointerToB = malloc(10 * sizeof(struct B));
for (int i = 0; i < 10; i++) {
    struct B* h_B = &(h_A->pointerToB)[i];   /* take a pointer, not a copy */
    h_B->arraySize = i + 5;
    h_B->pointerToInt = malloc((i + 5) * sizeof(int));
}
If we try to do similar stuff with cudaMalloc:
struct A* d_A;
cudaMalloc(&d_A, sizeof(struct A));
d_A->arraySize = 10;                                    /*** error ***/
cudaMalloc(&(d_A->pointerToB), 10 * sizeof(struct B));  /*** error ***/
...
we'll encounter a segmentation fault, because we are trying to dereference d_A, which is allocated in device memory. We can't access device memory from host code using the dereference operator.
One possible solution is to allocate the device memory for struct B inside your device code. You can use malloc and free in device code to allocate device memory dynamically; see section B.20, Dynamic Global Memory Allocation and Operations, in the CUDA Programming Guide.
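As an illustration of that approach (my own sketch, not from the Guide), a one-thread setup kernel can build the nested arrays on the device. Note that memory obtained from in-kernel malloc can only be freed from device code and cannot be used as the source or destination of a host-issued cudaMemcpy:

__global__ void initA(struct A *d_A)
{
    d_A->arraySize = 10;
    d_A->pointerToB = (struct B*)malloc(10 * sizeof(struct B));  /* device-heap allocation */
    for (int i = 0; i < 10; i++) {
        d_A->pointerToB[i].arraySize = i + 5;
        d_A->pointerToB[i].pointerToInt = (int*)malloc((i + 5) * sizeof(int));
    }
}

/* host side, after cudaMalloc(&d_A, sizeof(struct A)): */
initA<<<1, 1>>>(d_A);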
Flattening your 2D array into a 1D array might be a better solution, for instance:
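A sketch of the flattened layout, reusing the sizes from the host example above: keep all the ints in one contiguous buffer plus an offsets array, so that sub-array i occupies data[offsets[i]] through data[offsets[i+1]] - 1 and a couple of plain cudaMemcpy calls move everything:

/* build an offsets table on the host; sizes match the example above */
int offsets[10 + 1];
int total = 0;
for (int i = 0; i < 10; i++) {
    offsets[i] = total;
    total += i + 5;
}
offsets[10] = total;

/* one flat data array and one offsets array on the device */
int *d_data, *d_offsets;
cudaMalloc((void**)&d_data, total * sizeof(int));
cudaMalloc((void**)&d_offsets, (10 + 1) * sizeof(int));
cudaMemcpy(d_offsets, offsets, (10 + 1) * sizeof(int), cudaMemcpyHostToDevice);
/* a single cudaMemcpy of a host int buffer of length 'total' into d_data
   replaces all of the per-struct deep copies */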
I am trying to use dynamic parallelism in CUDA. I am in a situation where the parent kernel has a variable that needs to be passed to the child kernel for further computation. I have gone through the resources on the web here, which mention that local variables cannot be passed to the child kernel and describe ways to pass variables, and I have tried to pass the variable as follows:
#include <stdio.h>
#include <cuda.h>

__global__ void square(float *a, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (N == 10)
    {
        a[idx] = a[idx] * a[idx];
    }
}

// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int n = N; // this value of n can be changed locally and need to be passed
    printf("%d\n", n);
    cudaMalloc((void **) &n, sizeof(int));
    square <<< 1, N >>> (arr, n);
}

// main routine that executes on the host
int main(void)
{
    float *a_h, *a_d;  // pointers to host & device arrays
    const int N = 10;  // number of elements in the arrays
    size_t size = N * sizeof(float);
    a_h = (float *)malloc(size);      // allocate array on host
    cudaMalloc((void **) &a_d, size); // allocate array on device

    // initialize host array and copy it to the CUDA device
    for (int i = 0; i < N; i++) a_h[i] = (float)i;
    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

    // do calculation on device
    first <<< 1, 1 >>> (a_d, N);
    //cudaThreadSynchronize();

    // retrieve result from device and store it in host array
    cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);

    // print results
    for (int i = 0; i < N; i++) printf("%d %f\n", i, a_h[i]);

    // cleanup
    free(a_h); cudaFree(a_d);
}
but the value is not passed from the parent kernel to the child. How can I pass the value of a local variable? Is there any way to do so?
This operation is not appropriate:
int n=N; // this value of n can be changed locally and need to be passed
cudaMalloc((void **) &n, sizeof(int)); // illegal
It is not appropriate in host code, nor in device code. n is an int variable. You are not supposed to assign a pointer to it. When you attempt to do so in a 64-bit environment, you are attempting to write a 64-bit pointer on top of a 32-bit int quantity. It will not work.
It's not clear why you would need it anyway. n is an integer parameter presumably specifying the size of your arr array of float. You don't need to allocate anything on top of it.
If you had run this code with cuda-memcheck, you could easily have discovered that error. You can also do proper CUDA error checking in device code in exactly the same fashion as you do it in host code.
When I comment out that cudaMalloc line in the first kernel, your code runs correctly for me.
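For reference, a minimal sketch of the first kernel with the illegal line removed; a scalar such as n can simply be passed to the child kernel by value:

__global__ void first(float *arr, int N)
{
    int n = N;                    // may be modified locally before the launch
    printf("%d\n", n);
    square <<< 1, N >>> (arr, n); // scalars are passed to the child by value; no allocation needed
}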
This is my first attempt at a CUDA program. This is what it's supposed to do:
1. Receive a 1D Pixel array from host memory.
2. Process each Pixel with one thread: it is thread-safe because only "val" is read and only "newval" is updated. Wait for sync.
3. With one thread per Pixel, copy "newval" to "val".
4. Write this array back to host memory.
5. Repeat steps 2-4 for several different frames.
What happens, however, is that only a couple of entries, out of about 32000, in the new arrays have decent values at all; the rest are zero.
I've removed the calculations for brevity.
__global__ void kernel(Pixel *array, float dt)
{
    const unsigned int tid = threadIdx.x;
    Pixel *point = array + tid;
    //DO A BUNCH OF CALCULATIONS ON PIXEL KIND OF LIKE THIS
    point->newval = point->val + foo;
}

__global__ void copykernel(Pixel *array)
{
    const unsigned int tid = threadIdx.x;
    Pixel *point = array + tid;
    //COPY THE NEWVALS OVER TO THE OLD VALS IN PREPARATION FOR THE NEXT FRAME
    point->val = point->newval;
}
extern "C" bool runIt(const int argc, const char **argv, Pixel *inarray, Pixel **outarrays, int arraysize, int numframes, float dt)
{
int memsize = arraysize*sizeof(Pixel);
int i=0;
Pixel *array;
cudaMalloc((void **) &array, memsize);
cudaMemcpy(array, inarray, memsize, cudaMemcpyHostToDevice);
int numthreads = arraysize;
dim3 grid(1,1,1);
dim3 threads(numthreads,1,1);
for(i=0;i<numframes;i++)
{
kernel<<<grid, threads>>>((Pixel *) array, dt);
cudaThreadSynchronize();
copykernel<<<grid, threads>>>((Pixel *) array);
cudaThreadSynchronize();
cudaMemcpy(array, outarrays[i], memsize, cudaMemcpyDeviceToHost);
}
cudaFree(array);
return true;
}
I suspect that I'm setting up the kernel launch parameters incorrectly, or else I'm getting one of the device-specific keywords wrong or forgetting a crucial step. Does anything jump out at you?
I don't think you can run that many threads in a single block, and even if you can, it's not a good idea. Try setting the number of threads per block to 256 (16x16 for 2D), then choosing the grid size based on your input size.
dim3 threads(256,1,1);
dim3 grid(arraysize/threads.x,1,1); //Careful of integer division, this is just for example
Also, your second copy is incorrect. You need to swap array and outarrays:
cudaMemcpy(outarrays[i], array, memsize, cudaMemcpyDeviceToHost);
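To deal with the integer-division caveat above, the usual pattern (a sketch, with arraysize passed in as a kernel parameter) is to round the grid size up and bounds-check inside the kernel:

dim3 threads(256, 1, 1);
dim3 grid((arraysize + threads.x - 1) / threads.x, 1, 1); // round up so a partial block covers the tail

// inside each kernel:
// const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
// if (tid < arraysize) { /* process pixel tid */ }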