CUDA program giving garbage value - cuda

I really do not understand why the output for the below code is not a and b.
#include<cutil.h>
#include<iostream>
__global__ void p(unsigned char **a){
unsigned char temp[2];
temp[0] = 'a';
temp[1] = 'b';
a[0] = temp;
}
void main(){
unsigned char **a ;
cudaMalloc((void**)&a, sizeof(unsigned char*));
p<<<1,1>>>(a);
unsigned char **c;
unsigned char b[2];
cudaMemcpy(c, a, sizeof(unsigned char *), cudaMemcpyDeviceToHost);
cudaMemcpy(b, c[0], 2*sizeof(unsigned char), cudaMemcpyDeviceToHost);
for( int i=0 ; i < 2; i++){
printf("%c\n", b[i]);
}
getchar();
}
what is wrong with my logic?

Let's leave out CUDA for now. Let's just make a function that writes data to a user-provided array. The user passes the array via a pointer:
void fill_me_up(int * dst)
{
// We sure hope that `dst` points to a large enough area of memory!
dst[0] = 28;
dst[1] = 75;
}
Now, what you're doing with the local variable doesn't make sense, because you want to use the address of a local variable, which becomes invalid after you leave the function scope. The next best thing you could do is memcpy(), or some equivalent C++ algorithm:
void fill_me_up_again(int * dst)
{
int temp[] = { 28, 75 };
memcpy((void *)dst, (const void *)temp, sizeof(temp));
}
OK, now on to calling that function: We first must provide the target memory, and then pass a pointer:
int main()
{
int my_memory[2]; // here's our memory -- automatic local storage
fill_me_up(my_memory); // OK, array decays to pointer-to-beginning
fill_me_up(&my_memory[0]); // A bit more explicit
int * your_memory = malloc(sizeof(int) * 2); // more memory, this time dynamic
fill_me_up_again(your_memory);
/* ... */
free(your_memory);
}
(In C++ you would probably have uses new int[2] and delete your_memory instead, but by using C malloc() the connection to CUDA hopefully becomes clear.)
When you're moving fill_me_up to the CUDA device, you have to give it a device pointer rather than a host pointer, so you have to set that one up first and afterwards copy the results back out, but that's about the only change.

Related

Zero padding on the fly with cuFFT

I have a float array and want to FFT from this with an amount of data and padding by zero padding to 2^N. I also want to overlap the data by a selectable factor.
So far I have a cuda kernel with which I create another array in which I store the overlapped and padded data. Afterwards a cufftPlanMany is executed.
By the two factors, the amount of data becomes very large and it is in principle only copies of the original data and zeros with which I waste my entire memory bandwidth.
I could not find anything if cuFFT supports zero padding or if I have a possibility to create custom scripts.
(Nvidia Quadro P5000, C++14, Kubuntu)
Update
I have written a callback function which is called when loading the data into the FFT. Unfortunately this is still a little bit slower than my previous solution with a kernel which prepares the data in another array and then calls the FFT.
I need an average of 2.4ms for the example with the given values.
My hope was that if I process the data on the fly, my memory bandwidth will not limit me anymore. Unfortunately this does not look like that at the moment.
Does anyone have an idea how I can speed this up even more?
// Don't forget to include cufft_static(not cufft), culibos and set flag -dc
#include <stdio.h>
#include <cstdint>
#include <unistd.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <math.h>
typedef struct fft_CB_LD_callerInfo{
uint16_t rgLen;
uint16_t rgDataLen;
uint16_t overlapFactor;
};
static __device__ cufftReal myOwnCallback(void *dataIn,
size_t offset,
void *callerInfo,
void *sharedPtr) {
const fft_CB_LD_callerInfo *fftInfo = (fft_CB_LD_callerInfo*)callerInfo;
int idx_rg = offset/fftInfo->rgLen;
int idx_realRg = idx_rg/fftInfo->overlapFactor;
int idx_posInRg = offset-(size_t)idx_rg*fftInfo->rgLen;
if(idx_posInRg < fftInfo->rgDataLen){
const size_t idx_data = idx_posInRg
+ idx_realRg*fftInfo->rgDataLen
+ idx_rg - (idx_realRg*fftInfo->overlapFactor)*fftInfo->rgDataLen/fftInfo->overlapFactor;
return ((cufftReal*)dataIn)[idx_data];
}
else{
return 0.0f;
}
}
__device__ cufftCallbackLoadR myOwnCallbackPtr = myOwnCallback;
int main(){
// Data
float *dataHost;
float *data;
cufftComplex *spectrum;
cufftComplex *spectrumHost;
unsigned int rgDataLen = 400;
unsigned int rgLen = 2048;
unsigned int overlap = 8;
int peakPosHost[] = {0};
int *peakPos;
unsigned int rgCountClean = 52*16*4;
unsigned int rgCount = rgCountClean*overlap-(overlap-1);
int peakCountHost = 1;
int *peakCount;
// for FFT
cudaStream_t stream;
cufftHandle plan;
cufftResult result;
int fftRank = 1; // --- 1D FFTs
int fftIRide = 1, fftORide = 1; // --- Distance between two successive input/output elements
int fftInembed[] = { 0 }; // --- Input size with pitch (ignored for 1D transforms)
int fftOnembed[] = { 0 }; // --- Output size with pitch (ignored for 1D transforms)
int fftEachLen[] = { (int)rgLen }; // --- Size of the Fourier transform
int fftIDist = rgLen;
int fftODist = rgLen/2+1; // --- Distance between batches
// for Custom callback
cufftCallbackLoadR hostCopyOfCallbackPtr;
size_t worksize;
fft_CB_LD_callerInfo *fftInfo;
fft_CB_LD_callerInfo *fftInfoHost;
// Allocate host memory
dataHost = new float[rgDataLen*rgCountClean*peakCountHost];
spectrumHost = new cufftComplex[fftODist*rgCount];
fftInfoHost = new fft_CB_LD_callerInfo;
// create array with example data
for(int k=0; k<rgDataLen;k++){
for(int i=0; i<rgCountClean; i++){
dataHost[i*rgDataLen + k] = sin((2+i*4)*M_PI*k/rgDataLen);
}
}
fftInfoHost->overlapFactor = overlap;
fftInfoHost->rgDataLen = rgDataLen;
fftInfoHost->rgLen = rgLen;
// allocate device memory
cudaMalloc((void **)&data, sizeof(float) * rgDataLen*rgCountClean*peakCountHost);
cudaMalloc((void **)&peakPos, sizeof(int) * peakCountHost);
cudaMalloc((void **)&peakCount, sizeof(int));
cudaMalloc((void **)&spectrum, sizeof(cufftComplex)*fftODist*rgCount);
cudaMalloc((void **)&fftInfo, sizeof(fft_CB_LD_callerInfo));
// copy date from host to device
cudaMemcpy(data, dataHost, sizeof(float)*rgDataLen*rgCountClean*peakCountHost, cudaMemcpyHostToDevice);
cudaMemcpy(peakPos, peakPosHost, sizeof(int)*peakCountHost, cudaMemcpyHostToDevice);
cudaMemcpy(peakCount, &peakCountHost, sizeof(peakCountHost), cudaMemcpyHostToDevice);
cudaMemcpy(fftInfo, fftInfoHost, sizeof(fft_CB_LD_callerInfo), cudaMemcpyHostToDevice);
// get device pointer to custom callback function
cudaError_t error = cudaMemcpyFromSymbol(&hostCopyOfCallbackPtr, myOwnCallbackPtr, sizeof(hostCopyOfCallbackPtr));
if(error != 0) printf("cudaMemcpyFromSymbol faild with %d!\n", (int)error);
// Create a plan of FFTs to fast execute there later
cufftCreate(&plan);
result = cufftMakePlanMany(plan, fftRank, fftEachLen, fftInembed, fftIRide, fftIDist, fftOnembed, fftORide, fftODist, CUFFT_R2C, rgCount, &worksize);
if(result != CUFFT_SUCCESS) printf("cufftMakePlanMany failed with %d!\n", (int)result);
result = cufftXtSetCallback(plan, (void**)&hostCopyOfCallbackPtr, CUFFT_CB_LD_REAL, (void**)&fftInfo);
if(result != CUFFT_SUCCESS) printf("cufftXtSetCallback failed with %d!\n", (int)result);
// ----- Begin test area ---------------------------------------------------
if(cufftExecR2C(plan, data, spectrum) != CUFFT_SUCCESS)
printf("cufftExecR2C is failed!\n");
// ----- End test area ---------------------------------------------------
return 0;
}

cudaMemcpy error from Device to Host

I am returning a two-dimensional structure after computation on a kernel, from device to host.
HANDLE_ERROR(cudaMemcpy(Pixel,Pixel_gpu,img_wd*img_ht*sizeof(pixel),cudaMemcpyDeviceToHost));
Pixel is declared on host, Pixel_gpu is allocated on device, as below:
**Pixel_gpu;
HANDLE_ERROR(cudaMalloc(&Pixel_gpu,img_wd*img_ht*sizeof(pixel)));
pixel **Pixel = (pixel**)malloc((img_ht)*sizeof(pixel*));
for(int i=0;i<(img_ht);i++)
Pixel[i]=(pixel*)malloc((img_wd)*sizeof(pixel));
Using this I end up getting illegal memory access error.
Trying a similar memory alignment for result, doesn't help either.
pixel *Pixel_res = (pixel*)malloc(img_wd*img_ht*sizeof(pixel));
HANDLE_ERROR(cudaMemcpy(Pixel_res,Pixel_gpu,img_wd*img_ht*sizeof(pixel),cudaMemcpyDeviceToHost));
Kernel launching:
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
int thread_block=sqrt(prop.maxThreadsPerBlock);
dim3 DimGrid(ceil(img_wd/thread_block),ceil(img_ht/thread_block),1);
dim3 DimBlock(sqrt(prop.maxThreadsPerBlock),sqrt(prop.maxThreadsPerBlock),1);
//allocating gpu memory
pixel **Pixel_tmp_gpu, **Pixel_gpu;
HANDLE_ERROR(cudaMalloc(&Pixel_tmp_gpu,img_wd*img_ht*sizeof(pixel)));
HANDLE_ERROR(cudaMalloc(&Pixel_gpu,img_wd*img_ht*sizeof(pixel)));
float **kernel0_gpu, **kernel1_gpu;
HANDLE_ERROR(cudaMalloc(&kernel0_gpu,k*1*sizeof(float)));
HANDLE_ERROR(cudaMalloc(&kernel1_gpu,1*k*sizeof(float)));
cout<<"memory allocated"<<endl;
//copying needed data
HANDLE_ERROR(cudaMemcpy(Pixel_tmp_gpu,Pixel_tmp,img_wd*img_ht*sizeof(pixel),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(Pixel_gpu,Pixel,img_wd*img_ht*sizeof(pixel),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(kernel0_gpu,kernel0,k*1*sizeof(float),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(kernel1_gpu,kernel1,1*k*sizeof(float),cudaMemcpyHostToDevice));
cout<<"memory transfers done"<<endl;
vertical_conv<<<DimGrid,DimBlock>>>(Pixel_gpu, Pixel_tmp_gpu,img_wd, img_ht,kernel0_gpu,k);
time_t vertical_convolution=time(NULL);
cout<<" vertical_convolution time: "<<double(vertical_convolution - reading_file)<<"sec"<<endl;
horizontal_conv<<<DimGrid,DimBlock>>>(Pixel_tmp_gpu, Pixel_gpu, img_wd, img_ht, kernel1_gpu, k);
time_t horizontal_convolution=time(NULL);
cout<<" horizontal convolution time:" <<double(horizontal_convolution-vertical_convolution)<<" sec"<<endl;
pixel *Pixel_res = (pixel*)malloc(img_wd*img_ht*sizeof(pixel));
HANDLE_ERROR(cudaMemcpy(Pixel_res,Pixel_gpu,img_wd*img_ht*sizeof(pixel),cudaMemcpyDeviceToHost));
The functions used:
struct pixel //to store RGB values
{
unsigned char r;
unsigned char g;
unsigned char b;
};
static void HandleError( cudaError_t err, const char *file, int line ) {
if (err != cudaSuccess) {
cout<<cudaGetErrorString(err)<<" in "<< file <<" at line "<< line<<endl;
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__device__ void padding(pixel** Pixel_val, int x_coord, int y_coord, int img_width, int img_height, pixel Px) //padding the image,depending on pixel coordinates, can be replaced by reflect for better result //currently zero padding
{
if(x_coord<img_width && y_coord<img_height && x_coord>=0 && y_coord>=0)
Px=Pixel_val[y_coord][x_coord];
}
The vertical convolution:
__global__ void vertical_conv(pixel** Pixel_in, pixel** Pixel_out,int img_wd, int img_ht, float** kernel, int k)
{
float tmp_r, tmp_g, tmp_b;
pixel pix_val;
pix_val.r=0;pix_val.g=0;pix_val.b=0;
int row=blockIdx.y*blockDim.y + threadIdx.y;
int col = blockIdx.x*blockDim.x + threadIdx.x;
if(row<img_ht && col<img_wd){
tmp_r=0, tmp_g=0, tmp_b=0;
for(int l=0;l<k;l++)
{
padding(Pixel_in, col, row+l-(k-1)/2, img_wd, img_ht, pix_val);
tmp_r+=pix_val.r * kernel[l][0];
tmp_b+=pix_val.b * kernel[l][0];
tmp_g+=pix_val.g * kernel[l][0];
}
Pixel_out[row][col].r=tmp_r;
Pixel_out[row][col].g=tmp_g;
Pixel_out[row][col].b=tmp_b;
}
}
The horizontal convolution:
__global__ void horizontal_conv(pixel** Pixel_in, pixel** Pixel_out, int img_wd, int img_ht, float** kernel, int k)
{
float tmp_r, tmp_b, tmp_g;
pixel pix_val;
pix_val.r=0;pix_val.g=0;pix_val.b=0;
//horizontal convolution
int row=blockIdx.y*blockDim.y + threadIdx.y;
int col = blockIdx.x*blockDim.x + threadIdx.x;
tmp_r=0, tmp_g=0, tmp_b=0;
if(row<img_ht && col<img_wd)
{
for(int l=0; l<k;l++)
{
padding(Pixel_in, col+l-(k-1)/2, row, img_wd, img_ht, pix_val);
tmp_r+=pix_val.r * kernel[0][l];
tmp_g+=pix_val.g * kernel[0][l];
tmp_b+=pix_val.b * kernel[0][l];
}
Pixel_out[row][col].r=tmp_r;
Pixel_out[row][col].g=tmp_g;
Pixel_out[row][col].b=tmp_b;
}
}
Can someone help me know what could be wrong here?
The Pixel_gpu is be one contiguous memory block, consisting of w*h elements of type pixel. Its size is
sizeOfDeviceMemory = img_wd * img_ht * sizeof(pixel)
On contrast to that, Pixel on the CPU side is an "array of pointers": The Pixel pointer points to h elements of type pixel*. Its size is
sizeOfHostMemory = img_ht * sizeof(pixel*)
Clearly, these sizes are different, and trying to write sizeOfDeviceMemory bytes to this pointer causes an illegal access.
Usually, you should allocate your memory on the host as one contiguous block as well:
pixel* Pixel = (pixel*)malloc(img_wd * img_ht * sizeof(pixel));
Then you can copy the memory to this pointer using the cudaMemcpy call that you already have.
If having a pixel* on the host is not OK for you, and you urgently need a pixel** (for example, to pass it to some other function), then you can create an "array of pointers" like you had before, but not allocate new memory for each row, but instead, let each pointer point to one "row" of the single, contiguous pixel block.

thrust::device_vector in constant memory

I have a float array that needs to be referenced many times on the device, so I believe the best place to store it is in __ constant __ memory (using this reference). The array (or vector) will need to be written once at run-time when initializing, but read by multiple different functions many millions of times, so constant copying to the kernel each function call seems like A Bad Idea.
const int n = 32;
__constant__ float dev_x[n]; //the array in question
struct struct_max : public thrust::unary_function<float,float> {
float C;
struct_max(float _C) : C(_C) {}
__host__ __device__ float operator()(const float& x) const { return fmax(x,C);}
};
void foo(const thrust::host_vector<float> &, const float &);
int main() {
thrust::host_vector<float> x(n);
//magic happens populate x
cudaMemcpyToSymbol(dev_x,x.data(),n*sizeof(float));
foo(x,0.0);
return(0);
}
void foo(const thrust::host_vector<float> &input_host_x, const float &x0) {
thrust::device_vector<float> dev_sol(n);
thrust::host_vector<float> host_sol(n);
//this method works fine, but the memory transfer is unacceptable
thrust::device_vector<float> input_dev_vec(n);
input_dev_vec = input_host_x; //I want to avoid this
thrust::transform(input_dev_vec.begin(),input_dev_vec.end(),dev_sol.begin(),struct_max(x0));
host_sol = dev_sol; //this memory transfer for debugging
//this method compiles fine, but crashes at runtime
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(dev_x);
thrust::transform(dev_ptr,dev_ptr+n,dev_sol.begin(),struct_max(x0));
host_sol = dev_sol; //this line crashes
}
I tried adding a global thrust::device_vector dev_x(n), but that also crashed at run-time, and would be in __ global __ memory rather than __ constant__ memory
This can all be made to work if I just discard the thrust library, but is there a way to use the thrust library with globals and device constant memory?
Good question! You can't cast a __constant__ array as if it's a regular device pointer.
I will answer your question (after the line below), but first: this is a bad use of __constant__, and it isn't really what you want. The constant cache in CUDA is optimized for uniform access across threads in a warp. That means all threads in the warp access the same location at the same time. If each thread of the warp accesses a different constant memory location, then the accesses get serialized. So your access pattern, where consecutive threads access consecutive memory locations, will be 32 times slower than a uniform access. You should really just use device memory. If you need to write the data once, but read it many times, then just use a device_vector: initialize it once, and then read it many times.
To do what you asked, you can use a thrust::counting_iterator as the input to thrust::transform to generate a range of indices into your __constant__ array. Then your functor's operator() takes an int index operand rather than a float value operand, and does the lookup into constant memory.
(Note that this means your functor is now __device__ code only. You could easily overload the operator to take a float and call it differently on host data if you need portability.)
I modified your example to initialize the data and print the result to verify that it is correct.
#include <stdio.h>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
const int n = 32;
__constant__ float dev_x[n]; //the array in question
struct struct_max : public thrust::unary_function<float,float> {
float C;
struct_max(float _C) : C(_C) {}
// only works as a device function
__device__ float operator()(const int& i) const {
// use index into constant array
return fmax(dev_x[i],C);
}
};
void foo(const thrust::host_vector<float> &input_host_x, const float &x0) {
thrust::device_vector<float> dev_sol(n);
thrust::host_vector<float> host_sol(n);
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(dev_x);
thrust::transform(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(n),
dev_sol.begin(),
struct_max(x0));
host_sol = dev_sol; //this line crashes
for (int i = 0; i < n; i++)
printf("%f\n", host_sol[i]);
}
int main() {
thrust::host_vector<float> x(n);
//magic happens populate x
for (int i = 0; i < n; i++) x[i] = rand() / (float)RAND_MAX;
cudaMemcpyToSymbol(dev_x,x.data(),n*sizeof(float));
foo(x, 0.5);
return(0);
}

CUDA 1-D array not getting updated

this is my first attempt at a CUDA program. This is what it's supposed to do:
Receive 1D Pixel array from host memory
Each Pixel is processed by one thread: it is thread-safe because only "val" is read and only "newval" is updated. Wait for sync.
Each Pixel is processed by one thread: copy "newval" to "val."
Write this array back to host memory.
Repeat 2-4 for several different frames.
What happens, however, is that only a couple of variables, out of about 32000, in the new arrays seem to have decent values at all; the rest are zero.
I've removed the calculations for brevity.
__global__ void kernel(Pixel *array, float dt)
{
const unsigned int tid = threadIdx.x;
Pixel *point = array + tid;
//DO A BUNCH OF CALCULATIONS ON PIXEL KIND OF LIKE THIS
point->newval = point->val + foo;
}
__global__ void copykernel(Pixel *array)
{
const unsigned int tid = threadIdx.x;
Pixel *point = array + tid;
//COPY THE NEWVALS OVER TO THE OLD VALS IN PREPARATION FOR THE NEXT FRAME
point->val = point->newval;
}
extern "C" bool runIt(const int argc, const char **argv, Pixel *inarray, Pixel **outarrays, int arraysize, int numframes, float dt)
{
int memsize = arraysize*sizeof(Pixel);
int i=0;
Pixel *array;
cudaMalloc((void **) &array, memsize);
cudaMemcpy(array, inarray, memsize, cudaMemcpyHostToDevice);
int numthreads = arraysize;
dim3 grid(1,1,1);
dim3 threads(numthreads,1,1);
for(i=0;i<numframes;i++)
{
kernel<<<grid, threads>>>((Pixel *) array, dt);
cudaThreadSynchronize();
copykernel<<<grid, threads>>>((Pixel *) array);
cudaThreadSynchronize();
cudaMemcpy(array, outarrays[i], memsize, cudaMemcpyDeviceToHost);
}
cudaFree(array);
return true;
}
I have a suspicion that I'm setting up the parameters for the device incorrectly, or else I'm getting one of the device-specific keywords wrong or forgetting a crucial step. Does anything jump out at you?
I don't think you can run that many threads, and if you can, its not a good idea. Try setting the number of threads to 256 (16x16 for 2D), then choosing gridsize based on your input size.
dim3 threads(256,1,1);
dim3 grid(arraysize/threads.x,1,1); //Careful of integer division, this is just for example
Also your second copy is incorrect. You need to switch array and out_arrays
cudaMemcpy(outarrays[i], array, memsize, cudaMemcpyDeviceToHost);

CUDA: Host memory pointers not copied to device memory

we have the following struct defined
typedef struct PurchaseOrder
{
char* Value1;
double Value2;
double* Value3;
int Value3Length;
__device__ int GetValue3Length() { return Value3Length; }
__device__ double GetValue3(int i) { return Value3[i]; }
__device__ void SetValue3(int i, double value) { Value3[i] = value; }
};
The PurchaseOrder data (array of structs) are marshalled from C# application into the following C dll function
int RunMonteCarlo(PurchaseOrder *hostPurchaseOrders, int length) {
PurchaseOrder *devPurchaseOrders;
// display the results
for (int i = 0; i < length; i++)
{
//printf("\n\nAddress: %u",hostPurchaseOrders+i);
printf("\n\nIndex: %d", i);
printf("\nValue1: %s",(hostPurchaseOrders+i)->Value1);
printf("\nValue2: %f",(hostPurchaseOrders+i)->Value2);
for(int j = 0; j < (hostPurchaseOrders+i)->Value3Length; j++)
{
printf("\nValue3[%d]: %fl", j, (hostPurchaseOrders+i)->Value3[j]);
}
}
// allocate the memory on the GPU
HANDLE_ERROR( cudaMalloc( (void**)&devPurchaseOrders, length * sizeof(PurchaseOrder) ) );
// copy the array 'PurchaseOrder' to the GPU
HANDLE_ERROR( cudaMemcpy( devPurchaseOrders, hostPurchaseOrders, length * sizeof(PurchaseOrder), cudaMemcpyHostToDevice ) );
// Run the kernel code
MonteCarloKernel<<<60,32>>>( devPurchaseOrders, length);
// copy the array 'PurchaseOrders' back from the GPU to the CPU
HANDLE_ERROR( cudaMemcpy(hostPurchaseOrders, devPurchaseOrders, length * sizeof(PurchaseOrder), cudaMemcpyDeviceToHost ) );
// free the memory allocated on the GPU
HANDLE_ERROR( cudaFree( devPurchaseOrders ) );
return 0;
}
__global__ void MonteCarloKernel(PurchaseOrder *purchaseorders, long length) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
while (i < length)
{
purchaseorders[i].SetAAUS(1.11);
for (int j=0; j < purchaseorders[i].GetValue3Length(); j++)
{
//purchaseorders[i].SetValue3(j,1.0);
}
i += stride;
}
}
The data are marshalled correctly as verified by the printf code at the beginning.
However, the Value3 (array of double) seems not copied into the device memory as the line purchaseorders[i].SetValue3(j,1.0) in the kernel crashes the application.
What should I do to solve it out?
When the application crashes, the console windows just closed. What debug technique I could use to get some meaningful messages?
Value1 and Value3 are pointers. In hostPurchaseOrders, which refers to host data, these pointers are pointing to locations in host memory.
When you allocate device memory for devPurchaseOrders using cudaMalloc, the memory is only allocated for the structures and the pointers inside them. When you copy hostPurchaseOrders to devPurchaseOrders, you only copied the memory addresses in Value1 and Value3 fields. Since, they are pointing to some location in host memory, this location cannot be accessed successfully from the device.
The host memory pointers cannot be directly copied to device memory like you did above. You will need to manually allocate a device location for each host pointer, copy the values there from host to device and then set this location in the Value1 and Value3 pointers of the device structure.
This is extremely messy. Consider restructuring your host data, so that you can copy from host to device in a simple manner.