copy data which is allocated in device from device to host - cuda

I have a pointer that is dynamically allocated on the device. How can I copy the data it points to from the device to the host?
#include <stdio.h>
/* Evaluates a CUDA runtime call exactly once; on failure prints the file,
 * line and CUDA error string to stderr and terminates the process.
 * Wrapped in do/while(0) so it behaves as a single statement. */
#define cudaSafeCall(call) \
    do { \
        cudaError_t safeCallErr_ = (call); \
        if (cudaSuccess != safeCallErr_) { \
            fprintf(stderr, "%s(%i) : %s.\n", __FILE__, __LINE__, \
                    cudaGetErrorString(safeCallErr_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
/* Reads (and clears) the last CUDA error; if one is pending, prints the file,
 * line, the caller-supplied message, the numeric code and the error string
 * to stderr and terminates the process. */
#define cudaCheckErr(errorMessage) \
    do { \
        cudaError_t checkErr_ = cudaGetLastError(); \
        if (cudaSuccess != checkErr_) { \
            fprintf(stderr, "%s(%i) : %s : (code %d) %s.\n", __FILE__, __LINE__, \
                    (errorMessage), (int)checkErr_, cudaGetErrorString(checkErr_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
/* One record per kernel thread: a pointer to a buffer plus a scalar tag. */
struct num{
int *a; /* in this program the kernel stores a device-side malloc() buffer of 10 ints here */
int b; /* in this program the kernel stores the writing thread's index here */
};
/* Device-global pointer to the record array; main() writes the cudaMalloc'ed
 * address into it with cudaMemcpyToSymbol. */
__device__ struct num *gun;
/* Host driver from the question: allocates 10 records on the device, runs the
 * kernel, then tries to copy the records and their per-record buffers back to
 * the host. The NOTE(review) comments mark the problems; the code itself is
 * left as posted. */
int main()
{
int i;
char c[100]; /* NOTE(review): never initialised, yet passed as the %s message to cudaCheckErr below */
struct num *dun,*cun;
cudaSafeCall(cudaSetDevice(1));
cun=(struct num*)malloc(10*sizeof(struct num));
cudaSafeCall(cudaMalloc(&dun,10*sizeof(struct num)));
/* Publish the device array's address through the __device__ pointer gun. */
cudaSafeCall(cudaMemcpyToSymbol(gun,&dun,sizeof(struct num*)));
__global__ void kernel();
kernel<<<1,10>>>();
cudaSafeCall(cudaDeviceSynchronize());
cudaCheckErr(c);
cudaSafeCall(cudaMemcpyFromSymbol(&dun,gun,sizeof(struct num*)));
/* Copies the 10 records, including the device pointers stored in .a, into cun. */
cudaSafeCall(cudaMemcpy(cun,dun,10*sizeof(struct num),cudaMemcpyDeviceToHost));
/* NOTE(review): csu is not declared anywhere in the visible code. */
for(i=0;i<10;i++) cudaSafeCall(cudaMalloc(&csu[i].a,10*sizeof(int)));
cudaSafeCall(cudaGetSymbolAddress((void**)csu[0].a,(void**)gun[0].a));
/* NOTE(review): this overwrites the device pointers that were just copied into cun[i].a. */
for(i=0;i<10;i++) cun[i].a=(int*)malloc(10*sizeof(int));
/* NOTE(review): dun is a device pointer, so evaluating dun[i].a on the host
 * dereferences device memory; this is the reported segmentation fault. */
for(i=0;i<10;i++) cudaSafeCall(cudaMemcpy(cun[i].a,dun[i].a,10*sizeof(int),cudaMemcpyDeviceToHost));
printf("%d ",cun[8].b);
printf("%d ",cun[8].a[8]);
cudaSafeCall(cudaFree(dun));
free(cun);
}
/*
 * Fills gun[threadIdx.x]: stores the thread index in .b and a freshly
 * allocated 10-int buffer in .a holding tid+0 .. tid+9.
 *
 * Indexes with threadIdx.x only, so it expects a single block with one thread
 * per record of the array gun points to (this program launches <<<1,10>>>).
 * If the device-side allocation fails, .a is left NULL and nothing is written
 * through it.
 */
__global__ void kernel()
{
    const int tid = threadIdx.x;
    struct num *rec = &gun[tid];

    rec->b = tid;
    /* this is dynamically allocated in device. */
    int *buf = (int *)malloc(10 * sizeof(int));
    rec->a = buf;
    if (buf == NULL) {
        /* Device malloc can fail; do not dereference a NULL buffer. */
        return;
    }
    for (int i = 0; i < 10; i++)
        buf[i] = tid + i;
}
This program always fails with a "segmentation fault" at
cudaSafeCall(cudaMemcpy(cun[i].a,dun[i].a,10*sizeof(int),cudaMemcpyDeviceToHost))
Why? And what can I do to copy this data from device to host?

The problem you have is that you are trying to use device pointer indirection in host code, which is illegal. In your example
cudaMemcpy(cun[i].a,dun[i].a,10*sizeof(int),cudaMemcpyDeviceToHost)
dun contains a device pointer, so dun[i].a implies indirection of dun[i] to read the value of a. That is not a valid host memory address and so a seg fault results. You have actually already copied the pointers to the heap memory your kernel allocated when you do this:
cudaMemcpy(cun,dun,10*sizeof(struct num),cudaMemcpyDeviceToHost);
so following that code with
/* dun[i].a cannot be read on the host, but cun already holds a host copy of
 * every record, so the device pointers are taken from there. */
int **a_h = (int **)malloc(10 * sizeof(int *)); // to hold heap pointers
for (i = 0; i < 10; i++) {
    a_h[i] = cun[i].a; // save heap pointer
    cun[i].a = (int *)malloc(10 * sizeof(int));
    // NOTE(review): the CUDA C Programming Guide states that memory obtained
    // from device-side malloc() cannot be used in runtime API calls such as
    // cudaMemcpy -- verify this copy on your toolkit.
    cudaMemcpy(cun[i].a, a_h[i], 10 * sizeof(int), cudaMemcpyDeviceToHost); // copy heap to host
}
should safely copy the heap memory you allocated back to the host.

Related

Is there a function in the cublas that can apply the sigmoid function with a vector?

As the title says, I want to apply a function element-wise to a vector. Is there any function in the CUBLAS library that does that?
I am not aware of a suitable CUBLAS function that can assist in the task. However, you can easily write your own code that applies the sigmoid function, or any other single-argument function for that matter, element-wise to a vector. Note that such code would be memory-bound rather than compute-bound in most circumstances. See the CUDA program below for a worked example, in particular sigmoid_kernel(). The output of the program should look something like this:
source[0]= 0.0000000000000000e+000 source[99999]= 9.9999000000000005e-001
result[0]= 5.0000000000000000e-001 result[99999]= 7.3105661250612963e-001
.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define DEFAULT_LEN 100000
// Macro to catch CUDA errors in CUDA runtime calls.
// Evaluates `call` exactly once; on failure prints file, line and the CUDA
// error string to stderr and exits.
#define CUDA_SAFE_CALL(call) \
do { \
    cudaError_t safeCallErr_ = (call); \
    if (cudaSuccess != safeCallErr_) { \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(safeCallErr_) ); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
// Macro to catch CUDA errors in kernel launches.
// Place it directly after a <<<...>>> launch: it first reports launch-time
// errors, then blocks until the device is idle and reports execution errors.
#define CHECK_LAUNCH_ERROR() \
do { \
    /* Check synchronous errors, i.e. pre-launch */ \
    cudaError_t launchErr_ = cudaGetLastError(); \
    if (cudaSuccess != launchErr_) { \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(launchErr_) ); \
        exit(EXIT_FAILURE); \
    } \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
    /* cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize */ \
    launchErr_ = cudaDeviceSynchronize(); \
    if (cudaSuccess != launchErr_) { \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(launchErr_) ); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
// Logistic function 1 / (1 + e^(-a)), evaluated in double precision.
__device__ __forceinline__ double sigmoid (double a)
{
    const double decay = exp (-a);
    const double denom = 1.0 + decay;
    return 1.0 / denom;
}
// Applies sigmoid() element-wise: dst[i] = sigmoid(src[i]) for 0 <= i < len.
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global index
// and advances by the total thread count, so any grid size covers all of len.
// src and dst are declared __restrict__, so they must not alias.
// A non-positive len makes the kernel a no-op.
__global__ void sigmoid_kernel (const double * __restrict__ src,
                                double * __restrict__ dst, int len)
{
    // Index in size_t: with int, `i += stride` could overflow for len close
    // to INT_MAX before the loop condition is re-evaluated.
    const size_t count  = (len > 0) ? (size_t)len : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    const size_t first  = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
    for (size_t i = first; i < count; i += stride) {
        dst[i] = sigmoid (src[i]);
    }
}
/* Worked example: builds DEFAULT_LEN doubles on the host, applies
 * sigmoid_kernel to them on the device, copies the result back and prints the
 * first and last element of both the input and the output. */
int main (void)
{
    double *source, *result;
    double *d_a = 0, *d_b = 0;
    int len = DEFAULT_LEN;
    /* Grid-size cap kept from the original code (presumably chosen to stay
       below the 65535 grid-dimension limit of older devices -- TODO confirm) */
    const int maxThreadBlocks = 65520;

    /* Allocate memory on host */
    source = (double *)malloc (len * sizeof (source[0]));
    if (!source) return EXIT_FAILURE;
    result = (double *)malloc (len * sizeof (result[0]));
    if (!result) {
        free (source); /* do not leak the first buffer on the error path */
        return EXIT_FAILURE;
    }

    /* create source data */
    for (int i = 0; i < len; i++) source [i] = i * 1e-5;

    /* spot check of source data */
    printf ("source[0]=% 23.16e source[%d]=% 23.16e\n",
            source[0], len-1, source[len-1]);

    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * len));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * len));

    /* Push source data to device */
    CUDA_SAFE_CALL (cudaMemcpy (d_a, source, sizeof(d_a[0]) * len,
                                cudaMemcpyHostToDevice));

    /* Compute execution configuration: one thread per element, rounded up to
       whole blocks; the kernel's strided loop covers any capped remainder */
    dim3 dimBlock(256);
    int threadBlocks = (len + (dimBlock.x - 1)) / dimBlock.x;
    if (threadBlocks > maxThreadBlocks) threadBlocks = maxThreadBlocks;
    dim3 dimGrid(threadBlocks);

    sigmoid_kernel<<<dimGrid,dimBlock>>>(d_a, d_b, len);
    CHECK_LAUNCH_ERROR();

    /* retrieve results from device */
    CUDA_SAFE_CALL (cudaMemcpy (result, d_b, sizeof (result[0]) * len,
                                cudaMemcpyDeviceToHost));

    /* spot check of results */
    printf ("result[0]=% 23.16e result[%d]=% 23.16e\n",
            result[0], len-1, result[len-1]);

    /* free memory on host and device */
    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));
    free (result);
    free (source);
    return EXIT_SUCCESS;
}

Is it possible to use thrust::device_ptr on a mapped array?

I am trying to use the thrust::copy_if function on mapped memory. However, I get a runtime error whose cause I have not been able to find, so before spending a lot of time debugging, I would like confirmation that it is actually allowed to pass a pointer to a mapped memory location to the thrust::device_ptr wrapper.
Here is an example of what I mean:
int size=1024;
int* v_locked;        // host address of the mapped (pinned) buffer
int* v_device;        // device-side alias of the same buffer
int* stencil_device;  // ordinary device allocation holding the stencil
device_ptr<int> v_wrapper;
device_ptr<int> v_wrapper_end;
device_ptr<int> stencil_wrapper;
// Fixed: the original line had an unbalanced closing parenthesis.
cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped);
// NOTE(review): left as posted -- &v_locked is the address of the host pointer
// variable rather than the mapped buffer itself; this is the call the answer
// below identifies as set up incorrectly.
cudaHostGetDevicePointer(&v_device, &v_locked, 0);
cudaMalloc((void**)&stencil_device, size*sizeof(int));
/*
kernel assigning stencil_device elements ...
*/
v_wrapper = device_pointer_cast(v_device);
stencil_wrapper = device_pointer_cast(stencil_device);
v_wrapper_end = copy_if(make_counting_iterator<int>(0), make_counting_iterator<int>(size), stencil_wrapper, v_wrapper, _1 == 1);
Is this a correct usage of mapped memory with thrust library?
Thank you.
Yes, it is possible.
I believe there were several problems with your code.
You don't appear to be doing any proper CUDA error checking. If you were, you would have detected that although your calls to cudaHostGetDevicePointer seem to compile correctly, they were not set up correctly.
As mentioned above, your calls to cudaHostGetDevicePointer() were not set up correctly. The second pointer argument is passed as a single pointer (*), not a double pointer (**). Refer to the documentation. This call as written would throw a CUDA runtime error which you can trap.
Prior to your cudaHostAlloc calls, you should use the cudaSetDeviceFlags(cudaDeviceMapHost); call to enable this feature.
Here is a sample code which seems to work correctly for me, and has the above problems fixed:
$ cat t281.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/copy.h>
// Reads (and clears) the last CUDA error; if one is pending, prints `msg`, the
// CUDA error string and the file/line to stderr, then exits with status 1.
// The local is not named __err because identifiers containing a double
// underscore are reserved for the implementation.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t checkErr_ = cudaGetLastError(); \
        if (checkErr_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(checkErr_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
// Unary predicate: true when its argument compares equal to 1.
// Callable from host and device code; passed to copy_if as the stencil test.
template<typename T>
struct is_one : thrust::unary_function<T, bool>
{
    // const-qualified so the predicate can also be invoked through a const
    // object or a temporary.
    __host__ __device__
    bool operator()(const T &x) const
    {
        return (x==1);
    }
};
// Demonstrates thrust::copy_if reading from mapped (zero-copy) host memory:
// two pinned buffers are allocated with cudaHostAllocMapped, their device
// aliases are wrapped in thrust::device_ptr, the elements of v whose stencil
// entry equals 1 are copied to a device_vector, and the first (up to) 10
// results are printed.
int main(){
    const int size = 1024;
    int *v_locked = NULL;
    int *v_device = NULL;
    int *stencil_locked = NULL;
    int *stencil_device = NULL;

    // Mapped allocations are enabled through this device flag before any are made.
    cudaSetDeviceFlags(cudaDeviceMapHost);
    cudaCheckErrors("cudaSetDeviceFlags");
    cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped);
    cudaCheckErrors("cudaHostAlloc 1");
    // The second argument is the host buffer itself, not the address of the pointer.
    cudaHostGetDevicePointer(&v_device, v_locked, 0);
    cudaCheckErrors("cudaHostGetDevicePointer 1");
    cudaHostAlloc((void**)&stencil_locked, size*sizeof(int), cudaHostAllocMapped);
    cudaCheckErrors("cudaHostAlloc 2");
    cudaHostGetDevicePointer(&stencil_device, stencil_locked, 0);
    cudaCheckErrors("cudaHostGetDevicePointer 2");

    for (int i = 0; i < size; i++){
        v_locked[i] = i;
        stencil_locked[i] = i%2;
    }

    {
        // Scoped so the device_vector is destroyed before the pinned buffers are freed.
        thrust::device_ptr<int> v_wrapper = thrust::device_pointer_cast(v_device);
        thrust::device_ptr<int> stencil_wrapper = thrust::device_pointer_cast(stencil_device);
        thrust::device_ptr<int> v_wrapper_end = v_wrapper + size;
        thrust::device_vector<int> result(size);
        thrust::device_vector<int>::iterator result_end =
            thrust::copy_if(v_wrapper, v_wrapper_end, stencil_wrapper, result.begin(), is_one<int>());
        int result_size = result_end - result.begin();
        thrust::host_vector<int> h_result(result_size);
        thrust::copy_n(result.begin(), result_size, h_result.begin());
        // Never print more elements than were actually selected.
        const int shown = (result_size < 10) ? result_size : 10;
        thrust::copy_n(h_result.begin(), shown, std::ostream_iterator<int>(std::cout, " "));
        std::cout << std::endl;
    }

    // Release the pinned host allocations.
    cudaFreeHost(v_locked);
    cudaCheckErrors("cudaFreeHost 1");
    cudaFreeHost(stencil_locked);
    cudaCheckErrors("cudaFreeHost 2");
    return 0;
}
$ nvcc -arch=sm_20 -o t281 t281.cu
$ ./t281
1 3 5 7 9 11 13 15 17 19
$

Changing from for loop to multithreading in kernel

I'm currently working on interpolation of a grid and having some problems regarding multithreading. The code is supposed to read a map represented by a 2D matrix, and then interpolate it to increase the number of points by a factor of 100. When using for loops in the kernel, it works great.
Before interpolation: http://bildr.no/view/OWV1UDRO
After interpolation: http://bildr.no/view/eTlmNmpo
When I tried to replace the for loops with threads, it produced some weird results. Instead of numbers, it filled the resulting matrix with -1.#QNAN
Here's my working code with for loops in the kernel
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <fstream>
#include "cuda.h"
using namespace std;
// Host-side copy of the input map; main() fills it from Map.txt.
float Z[41][41];
// Macro to catch CUDA errors in CUDA runtime calls.
// Evaluates `call` exactly once; on failure prints file, line and the CUDA
// error string to stderr and exits.
#define CUDA_SAFE_CALL(call) \
do { \
    cudaError_t safeCallErr_ = (call); \
    if (cudaSuccess != safeCallErr_) { \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(safeCallErr_) ); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
// Macro to catch CUDA errors in kernel launches.
// Place it directly after a <<<...>>> launch: it first reports launch-time
// errors, then blocks until the device is idle and reports execution errors.
#define CHECK_LAUNCH_ERROR() \
do { \
    /* Check synchronous errors, i.e. pre-launch */ \
    cudaError_t launchErr_ = cudaGetLastError(); \
    if (cudaSuccess != launchErr_) { \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(launchErr_) ); \
        exit(EXIT_FAILURE); \
    } \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
    /* cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize */ \
    launchErr_ = cudaDeviceSynchronize(); \
    if (cudaSuccess != launchErr_) { \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(launchErr_) ); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
// 2D texture reference over float elements, read without type conversion.
// main() sets its filter mode and binds the map to it before the launch.
texture<float, 2, cudaReadModeElementType> tex;
// Serial reference kernel (main() launches it as <<<1,1>>>): walks every
// output sample with two nested loops and fills f from the texture.
//   m, n  - map dimensions as passed by main()
//   f     - output buffer; the loops write indices 0 .. (n*k)*(m*k)-1
//   numberOfInterpolationsPerSquare - its square root, truncated to int,
//           is the per-axis factor k
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare)
{
int k = sqrt(numberOfInterpolationsPerSquare);
// The loop counters are float, so j/k and i/k below are floating-point
// divisions that yield fractional texture coordinates.
for (float i=0; i<n*k; i++)
{
for (float j=0; j<m*k; j++)
{
// Output is row-major with m*k samples per row; +0.5f is added to both texture coordinates.
f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
}
}
}
// Reads an n x m map from Map.txt, uploads it to pitched device memory, binds
// it to the texture `tex` with linear filtering, runs the interpolation kernel
// and writes the result to Bilinear.txt, one row of samples per text line.
// Returns EXIT_FAILURE when a file cannot be opened or read, when a host
// allocation fails, or when the texture binding reports a non-zero offset.
int main (void)
{
    // Start timer
    clock_t tStart = clock();

    // Size of map
    const int n = 41;
    const int m = 41;
    float numberOfInterpolationsPerSquare = 100;
    // Per-axis factor, truncated to int exactly as the kernel does.
    const int k = (int)sqrt(numberOfInterpolationsPerSquare);
    // Samples per output row; the kernel's row stride is m*k.
    const int rowLength = m * k;
    // Integral element count, matching the index range the kernel writes.
    const size_t numberOfElements = (size_t)(n * k) * (size_t)rowLength;
    size_t pitch, tex_ofs;
    float *f = 0;
    float *r = 0;
    float *map_d = 0;

    // Build read-Streams
    ifstream map;
    //Create and open a txt file for MATLAB
    ofstream file;

    // Open data
    map.open("Map.txt", ios_base::in);
    if (!map.is_open()) {
        fprintf (stderr, "Could not open Map.txt for reading.\n");
        return EXIT_FAILURE;
    }
    file.open("Bilinear.txt");
    if (!file.is_open()) {
        fprintf (stderr, "Could not open Bilinear.txt for writing.\n");
        return EXIT_FAILURE;
    }

    // Store the map in a 2D array
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<m; j++)
        {
            map >> Z[i][j];
        }
    }
    if (map.fail()) {
        fprintf (stderr, "Map.txt does not contain %d x %d readable values.\n", n, m);
        return EXIT_FAILURE;
    }

    // Allocate memory on host and device
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&map_d,&pitch,n*sizeof(*map_d),m));
    CUDA_SAFE_CALL(cudaMalloc((void**)&f, numberOfElements*sizeof(float)));
    r = (float*)malloc(numberOfElements*sizeof(float));
    if (!r) {
        fprintf (stderr, "Host allocation of the result buffer failed.\n");
        return EXIT_FAILURE;
    }

    // Copy map from host to device
    CUDA_SAFE_CALL(cudaMemcpy2D(map_d, pitch, Z, n*sizeof(Z[0][0]), n*sizeof(Z[0][0]),m,cudaMemcpyHostToDevice));

    // Set texture mode to bilinear interpolation
    tex.normalized = false;
    tex.filterMode = cudaFilterModeLinear;

    // Bind the map to texture
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, map_d, &tex.channelDesc, n, m, pitch));

    // Checking for offset
    if (tex_ofs !=0) {
        printf ("tex_ofs = %zu\n", tex_ofs);
        return EXIT_FAILURE;
    }

    // Launch Kernel
    kernel <<< 1,1 >>> (m, n, f, numberOfInterpolationsPerSquare);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());

    // Copy result from device to host (now error-checked like every other call)
    CUDA_SAFE_CALL (cudaMemcpy(r, f, numberOfElements*sizeof(float), cudaMemcpyDeviceToHost));

    // Write results to file, starting a new line after every rowLength samples
    for (size_t h=0; h<numberOfElements; h++)
    {
        if (h > 0 && h % rowLength == 0)
        {
            file << endl;
        }
        file << r[h] << " ";
    }

    // Free memory
    CUDA_SAFE_CALL (cudaUnbindTexture (tex));
    CUDA_SAFE_CALL (cudaFree (map_d));
    CUDA_SAFE_CALL (cudaFree (f));
    free( r );

    // Print out execution time
    printf("Time taken: %.3fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
    return EXIT_SUCCESS;
}
Here's the kernel with multithreading, which doesn't work
// Threaded variant from the question: each thread handles one output sample
// (i from the x launch dimension, j from the y launch dimension) instead of
// looping. Left as posted; the NOTE marks the reported problem.
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare)
{
int k = sqrt(numberOfInterpolationsPerSquare);
// Global 2D thread coordinates; j only varies if the launch is two-dimensional.
int i= blockIdx.x * blockDim.x + threadIdx.x;
int j= blockIdx.y * blockDim.y + threadIdx.y;
// Threads that fall outside the (n*k) x (m*k) output do nothing.
if(i>=n*k || j>=m*k)
return;
// NOTE(review): i, j and k are all int here, so j/k and i/k are integer
// divisions, unlike the float loop counters of the serial kernel.
f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
}
Does anyone know why the multithread version doesn't work?
Regards
Sondre
In the second kernel, i and j are int instead of float. So j/k and i/k in tex2D will result in integer division. Declare k as float to avoid integer division.
Initially, the kernel was launched with the following configuration:
//Find number of blocks
int nthreads = 1024;
int blocksize = 512;
// NOTE(review): the block count is derived from nthreads (1024) while the
// launch below uses blocksize (512) threads per block.
int nblocks = ceil( (n*m*numberOfInterpolationsPerSquare) / nthreads);
// Launch Kernel
// NOTE(review): both launch arguments are scalars, i.e. a 1D grid of 1D
// blocks, while the kernel derives j from the y dimension.
kernel <<< nblocks,blocksize >>> (m, n, f, numberOfInterpolationsPerSquare);
The problem with the above code is that it would launch a 1D grid of 1D blocks, but inside the kernel, 2D indexing is used. A 2D grid/block configuration is required for the kernel to work correctly. From the looks of the kernel code, following grid/block configuration should work:
// 2D launch: 16x16-thread blocks, with enough blocks along x to cover n*k
// threads and along y to cover m*k threads (rounded up to whole blocks).
float k = sqrt(numberOfInterpolationsPerSquare);
const dim3 dimBlock(16,16);
const int threads_x = (int)ceil(n * k);
const int threads_y = (int)ceil(m * k);
const unsigned int blocks_x = (threads_x + dimBlock.x - 1)/dimBlock.x;
const unsigned int blocks_y = (threads_y + dimBlock.y - 1)/dimBlock.y;
dim3 dimGrid(blocks_x, blocks_y);
kernel<<<dimGrid,dimBlock>>>(m, n, f, numberOfInterpolationsPerSquare);

cudaMalloc does not work when trying to create a custom struct type

I am trying to build a CUDA program to do ray tracing, and I have some code below:
// First version from the question: fills a ViewPlane on the host, copies it to
// a device allocation, then allocates the remaining World members with
// cudaMalloc. Left as posted; NOTE(review) comments mark the host-side
// accesses through pointers that cudaMalloc has set.
void build_world(World *w, RGBAColor* buffer){
w->vp = (ViewPlane*) malloc(sizeof(ViewPlane));
w->vp->hres = 512;
w->vp->vres = 512;
w->vp->buffer = buffer;
w->vp->s = 1;
ViewPlane *viewplane;
cudaMalloc(&viewplane,sizeof(ViewPlane)); //return cudaSuccess but pointer still NULL
cudaMemcpy(viewplane,w->vp,sizeof(ViewPlane),cudaMemcpyHostToDevice);
free(w->vp);
w->vp = viewplane;
cudaMalloc(&(w->background_color),sizeof(RGBAColor)); //return cudaSuccess but pointer still NULL
// NOTE(review): w->background_color was just set by cudaMalloc, so this store
// dereferences a device pointer in host code.
*(w->background_color) = black; //Memory access error
cudaMalloc(&(w->sphere),sizeof(Sphere)); //return cudaSuccess but pointer still NULL
// NOTE(review): same problem -- w->sphere is a device pointer being dereferenced on the host.
w->sphere->center = Point3D(0.0,0.0,0.0);
w->sphere->radius = 300;
}
World *w is a static global pointer, and it is in the global memory.
My problem is that i can not allocate memory in device memory, all "cudaMalloc" calls do not work for most of the time.
i do what #RobertCrovella suggested in comment, like this:
// Second version from the question: adds error checking around each
// cudaMalloc but assigns the fields directly through the pointers cudaMalloc
// returned. Left as posted; NOTE(review) comments mark those accesses.
void build_world(World *w, RGBAColor* buffer){
checkCudaErrors( cudaMalloc(&(w->vp),sizeof(ViewPlane)));
getLastCudaError("viewplane allocate failed");
// NOTE(review): w->vp was set by cudaMalloc, so the following stores
// dereference a device pointer in host code.
w->vp->hres = 512; //memory access errors occurs here
w->vp->vres = 512;
w->vp->buffer = buffer;
w->vp->s = 1;
checkCudaErrors( cudaMalloc(&(w->background_color),sizeof(RGBAColor)));
getLastCudaError("background allocate failed");
// NOTE(review): host-side store through a device pointer.
*(w->background_color) = black;
checkCudaErrors( cudaMalloc(&(w->sphere),sizeof(Sphere)));
getLastCudaError("sphere allocate failed");
// NOTE(review): host-side stores through a device pointer.
w->sphere->center = Point3D(0.0,0.0,0.0);
w->sphere->radius = 300;
}
and it works once...the cudaMalloc API still returns "cudaSuccess" when it's not.
here is the definitions of structure:
// Scene types used by build_world.
typedef float3 Point3D;   // 3-component float vector
typedef uchar4 RGBAColor; // 4-component unsigned char vector
// Sphere described by its center point and radius.
struct Sphere{
Point3D center;
float radius;
};
// View plane parameters; build_world sets hres and vres to 512 and s to 1.
struct ViewPlane{
public:
int hres;
int vres;
float s;
//float gamma;
//float inv_gamma;
RGBAColor *buffer; // pixel buffer pointer, supplied by the caller of build_world
};
// Top-level scene: every member is a pointer in this version.
struct World{
public:
ViewPlane *vp;
RGBAColor *background_color;
Sphere *sphere;
};
after considering the issues that #RobertCrovella mentions in the answer below, here is the third version of build_world:
// Third version of World: background_color is now stored by value.
struct World{
public:
ViewPlane *vp;
RGBAColor background_color;
Sphere *sphere;
};
// Third version from the question: stages World, ViewPlane and Sphere on the
// host, allocates device copies of the members, then allocates and fills a
// device World. Left as posted; NOTE(review) comments mark remaining problems.
void build_world(World *w, RGBAColor* buffer){
World *h_world;
h_world = (World*)malloc(sizeof(World));
ViewPlane *h_vp = (ViewPlane*)malloc(sizeof(ViewPlane));
h_vp->hres = 512;
h_vp->vres = 512;
// NOTE(review): stores the caller's pointer unchanged; whether it is a host or
// a device address is not visible here -- confirm against the caller.
h_vp->buffer = buffer;
h_vp->s = 1;
checkCudaErrors( cudaMalloc(&(h_world->vp),sizeof(ViewPlane)));
getLastCudaError("viewplane allocate failed");
checkCudaErrors( cudaMemcpy(h_world->vp,h_vp,sizeof(ViewPlane),cudaMemcpyHostToDevice));
getLastCudaError("viewplane memory copy failed");
h_world->background_color = black;
Sphere *h_sphere = (Sphere*)malloc(sizeof(Sphere));
h_sphere->center = Point3D(0.0,0.0,0.0);
h_sphere->radius = 300;
checkCudaErrors( cudaMalloc(&(h_world->sphere),sizeof(Sphere)));
getLastCudaError("sphere allocate failed");
checkCudaErrors( cudaMemcpy(h_world->sphere,h_sphere,sizeof(Sphere),cudaMemcpyHostToDevice));
getLastCudaError("sphere memory copy failed");
// NOTE(review): w is passed by value, so this cudaMalloc only changes the
// local copy; the caller's pointer is not updated.
checkCudaErrors( cudaMalloc( &w , sizeof(World)));
getLastCudaError( "world allocate failed" );
checkCudaErrors( cudaMemcpy(w,h_world,sizeof(World),cudaMemcpyHostToDevice));
getLastCudaError("world memory copy failed");
free(h_world);free(h_vp);free(h_sphere);
}
This time, none of the cudaMemcpy calls work: at the end of this function, the values of h_vp and h_sphere are good; h_world->vp and h_world->sphere do point to areas of device memory but contain wrong values; w does not have a correct value, and all the pointers it contains are 0x00000000...
This question has officially become "a mess" because you have posted two substantially different versions of build_world which differ in important ways, apart from just the error checking I asked you to add. I will try and address some issues as I see them, however my understanding is clouded by the confusion in your posting.
If the pointer *w that you are passing to build_world is already a device pointer (i.e. allocated with cudaMalloc), which seems to be what you are saying, then none of this will work. Creating data structures on the device, which also contain pointers to other data structures that are also on the device, is a somewhat non-intuitive process. You cannot pass a pointer to cudaMalloc that already lives on the device (i.e. is already part of a region created with cudaMalloc). Instead it's necessary to create a parallel set of pointers on the host, cudaMalloc these pointers individually, then copy the pointer values to the appropriate regions in the device data structure, using cudaMemcpy. To see another example of what I am referring to, take a look here.
You cannot dereference device pointers in host code. For example:
w->vp->hres = 512;
If w or w->vp is a pointer set up with cudaMalloc, then the above operation is invalid. Instead it's necessary to create a parallel data structure on the host, set the values there, then cudaMemcpy from host to device:
h_vp->hres = 512;
cudaMemcpy(d_vp, h_vp, sizeof(vp_struct), cudaMemcpyHostToDevice);
Note that in this simplified description I'm glossing over the issue I mentioned in the first point above.
If you are calling build_world over and over again, you need to make sure that you are properly using cudaFree if you are passing the same *w pointer.
EDIT: In response to the additional posting of the 3rd version of build_world I elected to create a sample code which should have the remaining issues fixed:
#include <stdio.h>
#include <vector_functions.h>
// Two distinguishable test colors, so the kernel output shows which one arrived.
#define black make_uchar4(4,3,2,1)
#define white make_uchar4(0,1,2,3)
// Reads (and clears) the last CUDA error; if one is pending, prints `msg`, the
// CUDA error string and the file/line to stderr, then exits with status 1.
// The local is not named __err because identifiers containing a double
// underscore are reserved for the implementation.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t checkErr_ = cudaGetLastError(); \
        if (checkErr_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(checkErr_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
// Scene types used by build_world and my_kernel.
typedef float3 Point3D;   // 3-component float vector
typedef uchar4 RGBAColor; // 4-component unsigned char vector
// Sphere described by its center point and radius.
struct Sphere{
Point3D center;
float radius;
};
// View plane parameters; build_world sets hres and vres to 512 and s to 1.
struct ViewPlane{
public:
int hres;
int vres;
float s;
//float gamma;
//float inv_gamma;
RGBAColor *buffer; // build_world points this at a device allocation
};
// Top-level scene: vp and sphere point to device allocations made by
// build_world; background_color is stored by value.
struct World{
public:
ViewPlane *vp;
RGBAColor background_color;
Sphere *sphere;
};
// Prints one field from each part of the device-resident World so the host
// can confirm that every nested pointer was set up correctly.
// Every thread prints, so it is launched as <<<1,1>>> in this program.
__global__ void my_kernel(World *w){
    const ViewPlane *plane = w->vp;
    const Sphere *ball = w->sphere;

    printf("w->vp->hres = %d\n", plane->hres);
    printf("w->background_color.y = %d\n", w->background_color.y);
    printf("w->sphere->radius = %f\n", ball->radius);
    printf("w->vp->buffer->y = %d\n", plane->buffer->y);
}
// Builds a World in device memory and returns its device address through *w.
//
//   w      - out: receives the device pointer to the new World
//   buffer - host pointer to one RGBAColor; its value is copied into a new
//            device allocation that the device ViewPlane's buffer points to
//
// The nested objects are staged in host-side structs, each nested pointer is
// replaced by a device allocation, and only then is the enclosing struct
// copied to the device. The staging structs are stack objects, so no host
// allocation can fail and nothing needs to be freed on return.
// Each CUDA call is followed by cudaCheckErrors, which exits on failure.
void build_world(World **w, RGBAColor* buffer){
    World h_world;
    ViewPlane h_vp;
    Sphere h_sphere;

    // View plane: scalar fields first, then a device copy of the color buffer.
    h_vp.hres = 512;
    h_vp.vres = 512;
    h_vp.s = 1;
    cudaMalloc((void **)&(h_vp.buffer), sizeof(RGBAColor));
    cudaCheckErrors("viewplane RGBAColor allocate failed");
    cudaMemcpy(h_vp.buffer, buffer, sizeof(RGBAColor), cudaMemcpyHostToDevice);
    cudaCheckErrors("viewplane RGBAColor copy failed");

    // Device copy of the view plane; its address goes into the staged World.
    cudaMalloc((void **)&(h_world.vp),sizeof(ViewPlane));
    cudaCheckErrors("viewplane allocate failed");
    cudaMemcpy(h_world.vp,&h_vp,sizeof(ViewPlane),cudaMemcpyHostToDevice);
    cudaCheckErrors("viewplane memory copy failed");

    h_world.background_color = black;

    // Sphere: staged on the host, then copied to its own device allocation.
    h_sphere.center = (Point3D) make_float3(0.0,0.0,0.0);
    h_sphere.radius = 300;
    cudaMalloc((void **)&(h_world.sphere),sizeof(Sphere));
    cudaCheckErrors("sphere allocate failed");
    cudaMemcpy(h_world.sphere,&h_sphere,sizeof(Sphere),cudaMemcpyHostToDevice);
    cudaCheckErrors("sphere memory copy failed");

    // Finally the World itself, now holding only device pointers.
    cudaMalloc((void **)w , sizeof(World));
    cudaCheckErrors( "world allocate failed" );
    cudaMemcpy(*w,&h_world,sizeof(World),cudaMemcpyHostToDevice);
    cudaCheckErrors("world memory copy failed");
}
// Builds the device-side World from a single white pixel and launches one
// thread that prints a field from each nested object.
int main(){
    RGBAColor my_buffer = white;
    World *d_w;

    build_world(&d_w, &my_buffer);

    my_kernel<<<1,1>>>(d_w);
    // Wait for the kernel to finish, then report any pending error.
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");

    return 0;
}
You can compile this code with nvcc -arch=sm_20 -o t98 t98.cu
When I compile and run this code, I get no errors and the following output:
$ ./t98
w->vp->hres = 512
w->background_color.y = 3
w->sphere->radius = 300.000000
w->vp->buffer->y = 1
$

Copy symbol address to symbol

I'd like to set a symbol pointer to a symbol address, from the host. Something like:
__device__ float * symbolPtr; // address of symbol[3]
__device__ float symbol[5];
cudaGetSymbolAddress( &symbolPtr, symbol[3] );
I've only used this to get a device address to the host. But there is no mention that it can't set a symbol variable.
Here is another approach:
cudaMemcpytoSymbol(&symbolPtr, &symbol[3], sizeof(void*), 0, cudaMemcpyDeviceToDevice);
I doubt either of these will work. Is there a simpler way to do this?
This method worked for me. There may be a better way.
#include <stdio.h>
// N: number of elements in the device array `symbol`.
// Q: index of the element that symbolPtr is made to point at.
#define N 5
#define Q 3
// Reads (and clears) the last CUDA error; if one is pending, prints `msg`, the
// CUDA error string and the file/line to stderr, then exits with status 1.
// The local is not named __err because identifiers containing a double
// underscore are reserved for the implementation.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t checkErr_ = cudaGetLastError(); \
        if (checkErr_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(checkErr_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
// Device-global pointer; main() sets it to the address of symbol[Q].
__device__ float *symbolPtr;
// Device-global array that the pointer refers into.
__device__ float symbol[N];
// Fills symbol[i] with i, then prints the element symbolPtr points to.
// Every thread does the same work, so it is launched as <<<1,1>>> in this program.
__global__ void mykernel(){
    int idx = 0;
    while (idx < N) {
        symbol[idx] = (float) idx;
        idx++;
    }
    printf("symbol[%d] = %f\n", Q, *symbolPtr);
}
// Makes the device pointer symbolPtr refer to symbol[Q]: looks up the device
// address of `symbol`, offsets it by Q elements on the host, and writes the
// resulting pointer value into symbolPtr before launching the kernel.
int main(){
    float *elemAddr;

    // Device address of symbol[0].
    cudaGetSymbolAddress((void **)&elemAddr, symbol);
    cudaCheckErrors("cudaGetSymbolAddress fail");

    // Pointer arithmetic only; the device address is never dereferenced on the host.
    elemAddr = elemAddr + Q;

    // Copy the pointer value itself (not the float it points to) into symbolPtr.
    cudaMemcpyToSymbol(symbolPtr, &elemAddr, sizeof(float *));
    cudaCheckErrors("cudaMemcpyToSymbol fail");

    mykernel<<<1,1>>>();
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");

    return 0;
}
A static approach would be:
__device__ float symbol[5];
__device__ float * symbolPtr= &symbol[3];