Access violation writing location 0x00000003 - deep-learning

So, I'm writing a DQL Neural network. When I run the code through the debugger, it throws this exception
Exception thrown at 0x00D05DFB in Deep Q-learning Neural
Network.exe: 0xC0000005: Access violation writing location 0x00000003.
Here is the relevant code:
The neuron structure:
typedef struct neuron_t {
// not entirely relevant to this problem, but it's nicer to have the full picture
float activation;
float* outputWeights;
float bias;
float z;
float dactv;
float* dw;
float dbias;
float dz;
}Neuron;
The layer structure:
typedef struct layer_t {
int numberOfNeurons;
Neuron* neu;
}Layer;
the main function:
Layer* testTarget = NULL;
Layer* test = NULL;
int numberOfLayers = 3;
int* neuronInlayer[] = { 2,3,4 };
test = createPredictionArchitecture(test, testTarget, numberOfLayers, neuronInlayer); /* I plan to turn this into merely
createPredictionArchitecture(test, testTarget, numberOfLayers, neuronInlayer) once this problem
is fixed.*/
The relevant bit of the createArchitecture function:
int createArchitecture(Layer* predictionNetwork, Layer* targetNetwork, int numberOfLayers, int* neuronInEachLayer) {
predictionNetwork = (Layer*)malloc(numberOfLayers * sizeof(Layer)); if (predictionNetwork == NULL) { fprintf(stderr, "Failed to allocate memory to 'predictionNetwork' in line %d\n ", __LINE__); exit(1); }
else {
printf("Memory successfully allocated to 'predictionNetwork'\n");
}
targetNetwork = (Layer*)malloc(numberOfLayers * sizeof(Layer)); if (targetNetwork == NULL) { fprintf(stderr, "Failed to allocate memory to 'predictionNetwork' in line %d\n ", __LINE__); exit(1); }
else {
printf("Memory successfully allocated to 'targetNetwork'\n");
}
Layer** targetNW = &targetNetwork;
...
}
The exception occurs when I do this:
*targetNW[i] = createLayer(neuronInEachLayer[i]);
Why does this happen, and how should I fix it?
And also, do say if you need to see more of the code.

Well, I figured out the problem; I wasn't assigning any memory to targetNW.

Related

Unified memory and struct with arrays

I have a big Struct of Arrays of Structs on CUDA, that is constant and read only for my application. A quite simplified example would be
struct Graph{
Node * nodes;
int nNode;
}
struct Node{
int* pos;
int nPos;
}
My kernels would need to navigate this graph and query it. As you know, copying this struct to GPU memory with cudaMalloc and cudaMemcpy is just lots of code, that unified memory is supposed to remove the need of.
In my code, I generated the graph in CPU and then, for testing, I designed the following kernel
__global__ void testKernel(const Graph graph,int * d_res){
d_res[0]=graph.nNode;
};
being called as:
// using malloc for testing to make sure I know what I am doing
int * d_res,* h_res;
cudaMalloc((void **)&d_res,sizeof(int));
h_res=(int*)malloc(sizeof(int));
testKernel<<<1,1>>>(graph,d_res);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaMemcpy(h_res,d_res,sizeof(int),cudaMemcpyDeviceToHost));
with the error checks from here.
When I use the testKernel as is shown, it works fine, but if I change the kernel to:
__global__ void testKernel(const Graph graph,int * d_res){
d_res[0]=graph.nodes[0].nPos;
};
I get illegal memory access errors.
Is this because the unified memory does not handle this type of data correctly?
Is there a way to make sure I can avoid writing all the explicit copies to GPU memory?
Full MCVE:
#include <algorithm>
#include <cuda_runtime_api.h>
#include <cuda.h>
typedef struct node{
int* pos;
int nPos;
}Node;
typedef struct Graph{
Node * nodes;
int nNode;
}Graph;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void testKernel(const Graph graph, int * d_res){
d_res[0] = graph.nNode;
// d_res[0]=graph.nodes[0].nPos; // Not working
};
int main(void){
// fake data, this comes from another process
Graph graph;
graph.nodes = (Node*)malloc(2*sizeof(Node));
graph.nNode = 2;
for (int i = 0; i < 2; i++){
// They can have different sizes in the original code
graph.nodes[i].pos = (int*)malloc(3 * sizeof(int));
graph.nodes[i].pos[0] = 0;
graph.nodes[i].pos[1] = 1;
graph.nodes[i].pos[2] = 2;
graph.nodes[i].nPos = 3;
}
printf("%d\n", graph.nNode); // Change to the kernel variable for comparison
int * d_res, *h_res;
cudaMalloc((void **)&d_res, sizeof(int));
h_res = (int*)malloc(sizeof(int));
testKernel << <1, 1 >> >(graph, d_res);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));
printf("%d", h_res[0]);
return 0;
}
Your code isn't using CUDA unified memory. UM is not "automatic" in any way. It requires specific programming steps to take advantage of it and it has specific system requirements.
All of this is covered in the UM section of the programming guide.
Is there a way to make sure I can avoid writing all the explicit copies to GPU memory?
Proper use of UM should allow this. Here is a fully worked example. The only thing I have done is mechanically convert your malloc operations in host code to equivalent cudaMallocManaged operations.
$ cat t1389.cu
#include <algorithm>
#include <stdio.h>
typedef struct node{
int* pos;
int nPos;
}Node;
typedef struct Graph{
Node * nodes;
int nNode;
}Graph;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void testKernel(const Graph graph, int * d_res){
d_res[0] = graph.nNode;
d_res[0]=graph.nodes[0].nPos; // Not working
};
int main(void){
// fake data, this comes from another process
Graph graph;
cudaMallocManaged(&(graph.nodes), 2*sizeof(Node));
graph.nNode = 2;
for (int i = 0; i < 2; i++){
// They can have different sizes in the original code
cudaMallocManaged(&(graph.nodes[i].pos), 3 * sizeof(int));
graph.nodes[i].pos[0] = 0;
graph.nodes[i].pos[1] = 1;
graph.nodes[i].pos[2] = 2;
graph.nodes[i].nPos = 3;
}
printf("%d\n", graph.nNode); // Change to the kernel variable for comparison
int * d_res, *h_res;
cudaMalloc((void **)&d_res, sizeof(int));
h_res = (int*)malloc(sizeof(int));
testKernel << <1, 1 >> >(graph, d_res);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));
printf("%d", h_res[0]);
return 0;
}
$ nvcc t1389.cu -o t1389
$ cuda-memcheck ./t1389
========= CUDA-MEMCHECK
2
3========= ERROR SUMMARY: 0 errors
$
UM has a number of system requirements that are documented. I'm not going to try to recite them all here. Primarily you need a cc3.0 or higher GPU. Your MCVE did not include any standard error checking, and I didn't try to add it. But if you still have problems with this code, be sure to use proper CUDA error checking and run it with cuda-memcheck.
If your entire data structure, including embedded pointers, is allocated using ordinary host allocators, and you have no control over that, then you won't be able to use it directly in a UM regime, without doing some sort of involved copying. The exception here would be on an IBM Power9 system as mentioned in section K.1.6 of the above linked programming guide section.
Before attempting to use a host allocator (e.g. malloc) with UM, you should first test the pageableMemoryAccessUsesHostPageTables property, as mentioned in that section.
That property currently won't be set on any system except a properly configured IBM Power9 system. No x86 system currently has this property set/available.

How to call cudaMalloc from a separate function?

I'm learning cuda and try to write a function that allocate memory on the device in a similar way to that on the host. For example:
//host
float* allocate1D_float(int size)
{
float* array = (float*)malloc(size* sizeof(float));
if (array==NULL)
{
printf("\n Error allocating memory 1\n");
free(array);
exit(EXIT_FAILURE);
}
return array;
}
float *h_A = allocate1D_float(numElements);
//device
float* alloc_cuda1D_float(int numElements)
{
float *d_array = NULL;
size_t size = numElements * sizeof(float);
cudaError_t err = cudaSuccess;
err = cudaMalloc((void **)&d_array, size);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
return d_array;
}
float *d_A = alloc_cuda1D_float(int numElements);
However, nvcc keep saying
error: type name is not allowed
error: expected a ")"
for device function while the host function is ok. Hope you can help me to figure out the issue.
Thanks.
Regarding "type name is not allowed":
You did this correctly:
float *h_A = allocate1D_float(numElements);
But this is wrong:
float *d_A = alloc_cuda1D_float(int numElements);
^^^
This int shouldn't be here
So remove the int right in front of numElements
This of course has nothing to do with CUDA. Your host function call would have given a similar error if you attempted to put int where it doesn't belong in that call.

passing 2 parameters of undefined type to a constructor (of 2 expected)

I have a constructor where as first 2 parameteres I would like to pass:
ID3D11ShaderResourceView* as a steady downloaded texture OR
const CHAR* as a filename to downlowd this texture and assign it to a class ID3D11ShaderResourceView* member (with following release on demand), but I can not understand the way I should do it correctly. it looks this way:
class {button
public
button() {};
button(data1 (or texture or filname), data2 (or texture or filname), rest data....);
...
~button();
}
so I tried:
templates but failed, may be cause of knowledge lack, templates
define one type while I need a choice of 2. Varradic templates, or I didnt get them right but they mean undetermined amount of variables when I need to differ only 2 first.
Unions but it had conflict with class variable set - said could not match const char [amount] with const char* and unions do not work with std::string.
tried void* with typeid.name() but it always showed me "void *"
I don't want to overload constructors, becuase this will create 4+ of them barely differing one from another. Do you think boost::variant helps me in this case? Is there any smooth and effective way to build that kind of constructor? My c++ knowledge is on beginning level, sorry if its a duplicate topic, read all it suggested to me while creating it but didn't seem to find out anything closely similar, thanks:)
Update:
Applied boost::any, got next results:
class button : public frame {
public:
button() {};
button(boost::any _texture,
boost::any _hover_texture,
...
};
if (_texture.type() == typeid(ID3D11ShaderResourceView*)) texture = boost::any_cast<ID3D11ShaderResourceView*>(_texture);
if (_texture.type() == typeid(const char*))
{
if ( FAILED(D3DX11CreateShaderResourceViewFromFile(gvDevice,boost::any_cast<const char*>(_texture), NULL, NULL, &texture, NULL )) )
mboxout( "loading texture failed.", "UI texture load error", true );
};
if (_hover_texture.type() == typeid(ID3D11ShaderResourceView*)) hover_texture = boost::any_cast<ID3D11ShaderResourceView*>(_hover_texture);
if (_hover_texture.type() == typeid(const char*))
{
if ( FAILED(D3DX11CreateShaderResourceViewFromFile(gvDevice, boost::any_cast<const char*>(_hover_texture), NULL, NULL, &hover_texture, NULL )) )
mboxout( "loading texture failed.", "UI texture load error", true );
};
Is it the only possible decision because this one seems akward for me? Thanks :)
As always, when you have combinatoric explosion/tedious repetition, refactor your code into reusable units.
In this case, your constructor could be just
template <typename T1, typename T2>
button(T1 const& texture, T2 const& hover_texture)
: _texture(load(texture)),
_hover_texture(load(hover_texture))
{
};
And all the loading logic would be inside... you guess it the load function. A sample implementation of that:
static ID3D11ShaderResourceView* load(ID3D11ShaderResourceView* v){
return v; // just return the resources passed in
}
static ID3D11ShaderResourceView* load(char const* fname) {
ID3D11ShaderResourceView* p = NULL;
if (FAILED(D3DX11CreateShaderResourceViewFromFile(gvDevice, fname, NULL, NULL, &p, NULL)))
throw std::runtime_error(std::string("loading texture failed (") + fname + ")");
return p;
}
Note: while we were at it we separated UI from business logic. You do not want to display messageboxes from inside constructors. Ever. You just want to notify the caller of the problem and the caller decides what to do (use another resource, try a different path, retry a download, write a warning message to the log, shut down etc.)
Full Demo
Live On Coliru
#include <iostream>
#include <stdexcept>
///////////////////////////////////////////////////////
// mocking ID3D*
struct ID3D11ShaderResourceView;
enum ERROR_CODE { ERR_OK };
#define FAILED(e) (ERR_OK != (e))
static int gvDevice = 42;
ERROR_CODE D3DX11CreateShaderResourceViewFromFile(int, char const* fname, void*, void*, ID3D11ShaderResourceView**, void*) {
std::cout << "Loaded from " << fname << "\n";
return ERR_OK;
}
//
///////////////////////////////////////////////////////
struct frame{ virtual ~frame() = default; };
class button : public frame {
public:
button() {};
template <typename T1, typename T2>
button(T1 const& texture, T2 const& hover_texture)
: _texture(load(texture)),
_hover_texture(load(hover_texture))
{
};
private:
// TODO Rule-Of-Three constructor/destructorl
// SUGGEST: Rule-Of-Zero using shared pointers instead
ID3D11ShaderResourceView* _texture;
ID3D11ShaderResourceView* _hover_texture;
static ID3D11ShaderResourceView* load(ID3D11ShaderResourceView* v) { return v; }
static ID3D11ShaderResourceView* load(char const* fname) {
ID3D11ShaderResourceView* p = NULL;
if (FAILED(D3DX11CreateShaderResourceViewFromFile(gvDevice, fname, NULL, NULL, &p, NULL)))
throw std::runtime_error(std::string("loading texture failed (") + fname + ")");
return p;
}
};
#include <cassert>
int main() {
ID3D11ShaderResourceView* default_texture = NULL;
assert(!FAILED( D3DX11CreateShaderResourceViewFromFile(gvDevice, "default_texture.bin", NULL, NULL, &default_texture, NULL)));
try {
button button1("t1.bin", "hover1.bin");
button button2(default_texture, "hover2.bin");
button button3("t3.bin", default_texture);
button button4(default_texture, default_texture);
} catch(std::exception const& e) {
std::cout << "Oops: " << e.what() << "\n";
}
}
Prints:
Loaded from default_texture.bin
Loaded from t1.bin
Loaded from hover1.bin
Loaded from hover2.bin
Loaded from t3.bin
There's still a lot to be improved (see the comments, e.g.) but this is a start.

CUDA pinned memory flushing from the device

CUDA 5, device capabilities 3.5, VS 2012, 64bit Win 2012 Server.
There is no shared memory access between threads, every thread is standalone.
I am using pinned memory with zero-copy. From the host, I can only read the pinned memory the device has written, only when I issue a cudaDeviceSynchronize on the host.
I want to be able to:
Flush into the pinned memory as soon as the device has updated it.
Not block the device thread (maybe by copying asynchronously)
I tried calling __threadfence_system and __threadfence after each device write, but that didn't flush.
Below is a full sample CUDA code that demonstrates my question:
#include <conio.h>
#include <cstdio>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
__global__ void Kernel(volatile float* hResult)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
printf("Kernel %u: Before Writing in Kernel\n", tid);
hResult[tid] = tid + 1;
__threadfence_system();
// expecting that the data is getting flushed to host here!
printf("Kernel %u: After Writing in Kernel\n", tid);
// time waster for-loop (sleep)
for (int timeWater = 0; timeWater < 100000000; timeWater++);
}
void main()
{
size_t blocks = 2;
volatile float* hResult;
cudaHostAlloc((void**)&hResult,blocks*sizeof(float),cudaHostAllocMapped);
Kernel<<<1,blocks>>>(hResult);
int filledElementsCounter = 0;
// naiive thread implementation that can be impelemted using
// another host thread
while (filledElementsCounter < blocks)
{
// blocks until the value changes, this moves sequentially
// while threads have no order (fine for this sample).
while(hResult[filledElementsCounter] == 0);
printf("%f\n", hResult[filledElementsCounter]);;
filledElementsCounter++;
}
cudaFreeHost((void *)hResult);
system("pause");
}
Currently this sample will wait indefinitely as nothing is being read from the device unless I issue cudaDeviceSynchronize. The sample below works, but it is NOT what I want as it defeats the purpose of async copying:
void main()
{
size_t blocks = 2;
volatile float* hResult;
cudaHostAlloc((void**)&hResult, blocks*sizeof(float), cudaHostAllocMapped);
Kernel<<<1,blocks>>>(hResult);
cudaError_t error = cudaDeviceSynchronize();
if (error != cudaSuccess) { throw; }
for(int i = 0; i < blocks; i++)
{
printf("%f\n", hResult[i]);
}
cudaFreeHost((void *)hResult);
system("pause");
}
I played with your code on a Centos 6.2 with CUDA 5.5 and a Tesla M2090 and can conclude this:
The problem that it does not work on your system must be a driver issue and I suggest that you get the TCC drivers.
I attached my code that runs fine and does what you want. The values appear on the host side before the kernel ends. As you can see I added some compute code to prevent the for loop to be removed due to compiler optimizations. I added a stream and a callback that get executed after all work in the stream is finished. The program outputs 1 2 and for a long time does nothing until stream finished... is printed to the console.
#include <iostream>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define SEC_CUDA_CALL(val) checkCall ( (val), #val, __FILE__, __LINE__ )
bool checkCall(cudaError_t result, char const* const func, const char *const file, int const line)
{
if (result != cudaSuccess)
{
std::cout << "CUDA (runtime api) error: " << func << " failed! " << cudaGetErrorString(result) << " (" << result << ") " << file << ":" << line << std::endl;
}
return result != cudaSuccess;
}
class Callback
{
public:
static void CUDART_CB dispatch(cudaStream_t stream, cudaError_t status, void *userData);
private:
void call();
};
void CUDART_CB Callback::dispatch(cudaStream_t stream, cudaError_t status, void *userData)
{
Callback* cb = (Callback*) userData;
cb->call();
}
void Callback::call()
{
std::cout << "stream finished..." << std::endl;
}
__global__ void Kernel(volatile float* hResult)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
hResult[tid] = tid + 1;
__threadfence_system();
float A = 0;
for (int timeWater = 0; timeWater < 100000000; timeWater++)
{
A = sin(cos(log(hResult[0] * hResult[1]))) + A;
A = sqrt(A);
}
}
int main(int argc, char* argv[])
{
size_t blocks = 2;
volatile float* hResult;
SEC_CUDA_CALL(cudaHostAlloc((void**)&hResult,blocks*sizeof(float),cudaHostAllocMapped));
cudaStream_t stream;
SEC_CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
Callback obj;
Kernel<<<1,blocks,NULL,stream>>>(hResult);
SEC_CUDA_CALL(cudaStreamAddCallback(stream, Callback::dispatch, &obj, 0));
int filledElementsCounter = 0;
while (filledElementsCounter < blocks)
{
while(hResult[filledElementsCounter] == 0);
std::cout << hResult[filledElementsCounter] << std::endl;
filledElementsCounter++;
}
SEC_CUDA_CALL(cudaStreamDestroy(stream));
SEC_CUDA_CALL(cudaFreeHost((void *)hResult));
}
No call returned an error and cuda-memcheck didn't find any problems. This works as intended. You should really try the TCC driver.
You cannot pass the host pointer directly to the kernel. If you allocate host memory using cudaHostAlloc with cudaHostAllocMapped flag, then first you have to retrieve the device pointer of the mapped host memory before you can use it in the kernel. Use cudaHostGetDevicePointer to get the device pointer of mapped host memory.
float* hResult, *dResult;
cudaHostAlloc((void**)&hResult, blocks*sizeof(float), cudaHostAllocMapped);
cudaHostGetDevicePointer(&dResult,hResult);
Kernel<<<1,blocks>>>(dResult);
Calling __threadfence_system() will ensure that the write is visible to the system before proceeding, but your CPU will be caching the h_result variable and hence you're just spinning on the old value in an infinite loop. Try marking h_result as volatile.

cudaMalloc does not work when trying to create a custom struct type

i am tring to build a cuda program to do ray-tracing, and i have some code below:
void build_world(World *w, RGBAColor* buffer){
w->vp = (ViewPlane*) malloc(sizeof(ViewPlane));
w->vp->hres = 512;
w->vp->vres = 512;
w->vp->buffer = buffer;
w->vp->s = 1;
ViewPlane *viewplane;
cudaMalloc(&viewplane,sizeof(ViewPlane)); //return cudaSuccess but pointer still NULL
cudaMemcpy(viewplane,w->vp,sizeof(ViewPlane),cudaMemcpyHostToDevice);
free(w->vp);
w->vp = viewplane;
cudaMalloc(&(w->background_color),sizeof(RGBAColor)); //return cudaSuccess but pointer still NULL
*(w->background_color) = black; //Memory access error
cudaMalloc(&(w->sphere),sizeof(Sphere)); //return cudaSuccess but pointer still NULL
w->sphere->center = Point3D(0.0,0.0,0.0);
w->sphere->radius = 300;
}
World *w is a static global pointer, and it is in the global memory.
My problem is that i can not allocate memory in device memory, all "cudaMalloc" calls do not work for most of the time.
i do what #RobertCrovella suggested in comment, like this:
void build_world(World *w, RGBAColor* buffer){
checkCudaErrors( cudaMalloc(&(w->vp),sizeof(ViewPlane)));
getLastCudaError("viewplane allocate failed");
w->vp->hres = 512; //memory access errors occurs here
w->vp->vres = 512;
w->vp->buffer = buffer;
w->vp->s = 1;
checkCudaErrors( cudaMalloc(&(w->background_color),sizeof(RGBAColor)));
getLastCudaError("background allocate failed");
*(w->background_color) = black;
checkCudaErrors( cudaMalloc(&(w->sphere),sizeof(Sphere)));
getLastCudaError("sphere allocate failed");
w->sphere->center = Point3D(0.0,0.0,0.0);
w->sphere->radius = 300;
}
and it works once...the cudaMalloc API still returns "cudaSuccess" when it's not.
here is the definitions of structure:
typedef float3 Point3D;
typedef uchar4 RGBAColor;
struct Sphere{
Point3D center;
float radius;
};
struct ViewPlane{
public:
int hres;
int vres;
float s;
//float gamma;
//float inv_gamma;
RGBAColor *buffer;
};
struct World{
public:
ViewPlane *vp;
RGBAColor *background_color;
Sphere *sphere;
};
after considering the issues that #RobertCrovella mentions in the answer below, here is the third version of build_world:
struct World{
public:
ViewPlane *vp;
RGBAColor background_color;
Sphere *sphere;
};
void build_world(World *w, RGBAColor* buffer){
World *h_world;
h_world = (World*)malloc(sizeof(World));
ViewPlane *h_vp = (ViewPlane*)malloc(sizeof(ViewPlane));
h_vp->hres = 512;
h_vp->vres = 512;
h_vp->buffer = buffer;
h_vp->s = 1;
checkCudaErrors( cudaMalloc(&(h_world->vp),sizeof(ViewPlane)));
getLastCudaError("viewplane allocate failed");
checkCudaErrors( cudaMemcpy(h_world->vp,h_vp,sizeof(ViewPlane),cudaMemcpyHostToDevice));
getLastCudaError("viewplane memory copy failed");
h_world->background_color = black;
Sphere *h_sphere = (Sphere*)malloc(sizeof(Sphere));
h_sphere->center = Point3D(0.0,0.0,0.0);
h_sphere->radius = 300;
checkCudaErrors( cudaMalloc(&(h_world->sphere),sizeof(Sphere)));
getLastCudaError("sphere allocate failed");
checkCudaErrors( cudaMemcpy(h_world->sphere,h_sphere,sizeof(Sphere),cudaMemcpyHostToDevice));
getLastCudaError("sphere memory copy failed");
checkCudaErrors( cudaMalloc( &w , sizeof(World)));
getLastCudaError( "world allocate failed" );
checkCudaErrors( cudaMemcpy(w,h_world,sizeof(World),cudaMemcpyHostToDevice));
getLastCudaError("world memory copy failed");
free(h_world);free(h_vp);free(h_sphere);
}
this time, all cudaMemcpy calls don't work: when running to the end of this function, the value of h_vp and h_sphere is good; h_world->vp and h_world->sphere do point to an area of device momery but contains wrong value;w does not have correct value, all pointer it contains is 0x00000000...
This question has officially become "a mess" because you have posted two substantially different versions of build_world which differ in important ways, apart from just the error checking I asked you to add. I will try and address some issues as I see them, however my understanding is clouded by the confusion in your posting.
If the pointer *w that you are passing to build_world is already a device pointer (i.e. allocated with cudaMalloc) which seems to be what you are saying, then none of this will work. Creating data structures on the device, which also contain pointers to other data structures that are also on the device, is a somewhat non-intuitive process. You cannot pass a pointer to cudaMalloc that already lives on the device (i.e. is already part of a region created with cudaMalloc. Instead it's necessary to create a parallel set of pointers on the host, cudaMalloc these pointers individually, then copy the pointer values to the appropriate regions in the device data structure, using cudaMemcpy. To see another example of what I am referring to, take a look here.
You cannot dereference device pointers in host code. For example:
w->vp->hres = 512;
If w or w->vp is a pointer set up with cudaMalloc, then the above operation is invalid. Instead it's necessary to create a parallel data structure on the host, set the values there, then cudaMemcpy from host to device:
h_vp->hres = 512;
cudaMemcpy(d_vp, h_vp, sizeof(vp_struct), cudaMemcpyHostToDevice);
Note that in this simplified description I'm glossing over the issue I mentioned in the first point above.
If you are calling build_world over and over again, you need to make sure that you are properly using cudaFree if you are passing the same *w pointer.
EDIT: In response to the additional posting of the 3rd version of build_world I elected to create a sample code which should have the remaining issues fixed:
#include <stdio.h>
#include <vector_functions.h>
#define black make_uchar4(4,3,2,1)
#define white make_uchar4(0,1,2,3)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef float3 Point3D;
typedef uchar4 RGBAColor;
struct Sphere{
Point3D center;
float radius;
};
struct ViewPlane{
public:
int hres;
int vres;
float s;
//float gamma;
//float inv_gamma;
RGBAColor *buffer;
};
struct World{
public:
ViewPlane *vp;
RGBAColor background_color;
Sphere *sphere;
};
__global__ void my_kernel(World *w){
printf("w->vp->hres = %d\n", w->vp->hres);
printf("w->background_color.y = %d\n", w->background_color.y);
printf("w->sphere->radius = %f\n", w->sphere->radius);
printf("w->vp->buffer->y = %d\n", w->vp->buffer->y);
}
void build_world(World **w, RGBAColor* buffer){
World *h_world;
h_world = (World*)malloc(sizeof(World));
ViewPlane *h_vp = (ViewPlane*)malloc(sizeof(ViewPlane));
h_vp->hres = 512;
h_vp->vres = 512;
h_vp->s = 1;
cudaMalloc((void **)&(h_vp->buffer), sizeof(RGBAColor));
cudaCheckErrors("viewplane RGBAColor allocate failed");
cudaMemcpy(h_vp->buffer, buffer, sizeof(RGBAColor), cudaMemcpyHostToDevice);
cudaCheckErrors("viewplane RGBAColor copy failed");
cudaMalloc((void **)&(h_world->vp),sizeof(ViewPlane));
cudaCheckErrors("viewplane allocate failed");
cudaMemcpy(h_world->vp,h_vp,sizeof(ViewPlane),cudaMemcpyHostToDevice);
cudaCheckErrors("viewplane memory copy failed");
h_world->background_color = black;
Sphere *h_sphere = (Sphere*)malloc(sizeof(Sphere));
h_sphere->center = (Point3D) make_float3(0.0,0.0,0.0);
h_sphere->radius = 300;
cudaMalloc((void **)&(h_world->sphere),sizeof(Sphere));
cudaCheckErrors("sphere allocate failed");
cudaMemcpy(h_world->sphere,h_sphere,sizeof(Sphere),cudaMemcpyHostToDevice);
cudaCheckErrors("sphere memory copy failed");
cudaMalloc((void **)w , sizeof(World));
cudaCheckErrors( "world allocate failed" );
cudaMemcpy(*w,h_world,sizeof(World),cudaMemcpyHostToDevice);
cudaCheckErrors("world memory copy failed");
free(h_world);free(h_vp);free(h_sphere);
}
int main(){
World *d_w;
RGBAColor my_buffer = white;
build_world(&d_w, &my_buffer);
my_kernel<<<1,1>>>(d_w);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
return 0;
}
You can compile this code with nvcc -arch=sm_20 -o t98 t98.cu
When I compile and run this code, I get no errors and the following output:
$ ./t98
w->vp->hres = 512
w->background_color.y = 3
w->sphere->radius = 300.000000
w->vp->buffer->y = 1
$