cuda - cudaInvalidValue error when cudaMemcpyToSymbol - cuda

I'm trying to copy an array of doubles to a chunk of constant memory on the device, but I get a cudaInvalidValue error.
Here's my code.
// Host-side table of double-precision samples.
// kCIELEN must stay equal to the number of initializers in CIE_X
// (19 rows of 5 values = 95); the array bound itself is left implicit.
#define kCIELEN 95 // length of CIE_X[]
const double CIE_X[] = {
1.299000e-04, 2.321000e-04, 4.149000e-04, 7.416000e-04, 1.368000e-03,
2.236000e-03, 4.243000e-03, 7.650000e-03, 1.431000e-02, 2.319000e-02,
4.351000e-02, 7.763000e-02, 1.343800e-01, 2.147700e-01, 2.839000e-01,
3.285000e-01, 3.482800e-01, 3.480600e-01, 3.362000e-01, 3.187000e-01,
2.908000e-01, 2.511000e-01, 1.953600e-01, 1.421000e-01, 9.564000e-02,
5.795001e-02, 3.201000e-02, 1.470000e-02, 4.900000e-03, 2.400000e-03,
9.300000e-03, 2.910000e-02, 6.327000e-02, 1.096000e-01, 1.655000e-01,
2.257499e-01, 2.904000e-01, 3.597000e-01, 4.334499e-01, 5.120501e-01,
5.945000e-01, 6.784000e-01, 7.621000e-01, 8.425000e-01, 9.163000e-01,
9.786000e-01, 1.026300e+00, 1.056700e+00, 1.062200e+00, 1.045600e+00,
1.002600e+00, 9.384000e-01, 8.544499e-01, 7.514000e-01, 6.424000e-01,
5.419000e-01, 4.479000e-01, 3.608000e-01, 2.835000e-01, 2.187000e-01,
1.649000e-01, 1.212000e-01, 8.740000e-02, 6.360000e-02, 4.677000e-02,
3.290000e-02, 2.270000e-02, 1.584000e-02, 1.135916e-02, 8.110916e-03,
5.790346e-03, 4.106457e-03, 2.899327e-03, 2.049190e-03, 1.439971e-03,
9.999493e-04, 6.900786e-04, 4.760213e-04, 3.323011e-04, 2.348261e-04,
1.661505e-04, 1.174130e-04, 8.307527e-05, 5.870652e-05, 4.150994e-05,
2.935326e-05, 2.067383e-05, 1.455977e-05, 1.025398e-05, 7.221456e-06,
5.085868e-06, 3.581652e-06, 2.522525e-06, 1.776509e-06, 1.251141e-06
};
// Declares a single pointer (not an array) in constant memory.
__constant__ double *dev_CIE_X;
// NOTE(review): host code passes the address of the __constant__ symbol itself to cudaMalloc.
cudaStatus = cudaMalloc((void**)&dev_CIE_X, kCIELEN * sizeof(double));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
// The call below does not work, so a heap pointer `ciex` is tried instead.
// cudaStatus = cudaMemcpyToSymbol(dev_CIE_X, &CIE_X, kCIELEN * sizeof(double));
// Heap copy of the host table.
double *ciex = new double[kCIELEN];
for (int i = 0; i < kCIELEN; i++) {
ciex[i] = CIE_X[i];
}
// NOTE(review): &ciex is the address of the host pointer variable, not of the
// kCIELEN doubles it points to, and the byte count (kCIELEN * sizeof(double))
// is larger than the declared symbol, which is one pointer.
cudaStatus = cudaMemcpyToSymbol(dev_CIE_X, &ciex, kCIELEN * sizeof(double));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
According to the link, cudaInvalidValue means that one or more of the parameters passed to the API call is not within an acceptable range of values.
But I think I am passing two valid pointers and the size of the copied item is correct. What is the problem then?

This:
__constant__ double *dev_CIE_X;
cudaStatus = cudaMalloc((void**)&dev_CIE_X, kCIELEN * sizeof(double));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
is illegal. __constant__ memory cannot be dynamically allocated and device symbols cannot be directly manipulated from the host in this manner. You could do this:
// Only this pointer lives in constant memory; the array it points to is in global memory.
__constant__ double *dev_CIE_X;
// Host-side copy of the device address, used to allocate and to fill the array.
double *h_dev_CIE_X;
cudaMalloc((void**)&h_dev_CIE_X, kCIELEN * sizeof(double));
// Publish the device address to the constant symbol. The payload is one pointer,
// so the byte count is the size of the pointer, not the size of a double.
cudaMemcpyToSymbol(dev_CIE_X, &h_dev_CIE_X, sizeof(h_dev_CIE_X));
// populate CIE_X
cudaMemcpy(h_dev_CIE_X, &CIE_X[0], kCIELEN * sizeof(double), cudaMemcpyHostToDevice);
But then you need to understand that the array is stored in global memory and only the pointer value is stored in constant memory.
What you probably want is something like this:
// Fixed-size array placed directly in constant memory.
__constant__ double dev_CIE_X[kCIELEN];
// Fill the constant array from the host table CIE_X.
cudaMemcpyToSymbol(dev_CIE_X, CIE_X, sizeof(double) * kCIELEN);
i.e. statically declare the array in constant memory and then copy the host data to that array.

Related

Values of array after cudaMemcpy do not change, any idea why? [duplicate]

Thank you very much for reading my threads.
I am doing CUDA work, but I keep getting error code 77 (cudaErrorIllegalAddress) from cudaDeviceSynchronize() and have no idea why. I searched for both the error code and the function but, surprisingly, only a few results showed up.
I basically sum up all pixels of images. To make my questions have as much reference as it can, I am showing all my CUDA code here:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "thorcalgpu.h"
#include <stdio.h>
#include "math.h"
#include <vector>
#include <algorithm>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>
#include <iostream>
using namespace std;
// Returns the next rand() value scaled by RAND_MAX, i.e. a float in [0, 1].
float random_float(void)
{
    const float draw = static_cast<float>(rand());
    return draw / RAND_MAX;
}
// Per-block sum reduction: each block adds up its slice of `input` (n elements
// in total) and writes one partial sum to per_block_results[blockIdx.x].
// Launch requirements that follow from the code below:
//  - only the x dimension of grid and block is used, and an element is read only
//    by the thread whose global index equals its position, so
//    gridDim.x * blockDim.x must be >= n for every element to be counted;
//  - sdata is a dynamically sized shared array indexed by threadIdx.x, so the
//    launch must supply at least blockDim.x * sizeof(unsigned long long) bytes
//    of dynamic shared memory;
//  - `offset` starts at blockDim.x / 2 and is halved each step, so blockDim.x
//    should be a power of two; otherwise some sdata entries are never added in.
__global__ void reduceSum(unsigned short *input,
unsigned long long *per_block_results,
const int n)
{
extern __shared__ unsigned long long sdata[];
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
// load input into __shared__ memory
// threads past the end of the input contribute 0
unsigned short x = 0;
if(i < n)
{
x = input[i];
}
sdata[threadIdx.x] = x;
// every thread's value must be in shared memory before any thread reads a neighbour's
__syncthreads();
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
{
if(threadIdx.x < offset)
{
// add a partial sum upstream to our own
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
// wait until all threads in the block have
// updated their partial sums
// (the barrier is outside the `if`, so every thread in the block reaches it)
__syncthreads();
}
// thread 0 writes the final result
if(threadIdx.x == 0)
{
per_block_results[blockIdx.x] = sdata[0];
}
}
// Computes and prints the mean pixel value of each image named in `filelist`.
// For every file: load it with LoadTIFF, copy N*N unsigned short pixels to the GPU,
// run reduceSum to get one partial sum per block, copy the partial sums back, add
// them up on the host and print the total divided by N*N. Accumulated timings are
// printed after the loop.
// Parameters:
//   mean     - not read or written anywhere in this function.
//   N        - image side length; every image is treated as N*N pixels.
//   filelist - paths handed to LoadTIFF, one image per entry.
// Returns the value of cudaStatus at the time the cleanup code is reached.
//template <class T>
cudaError_t gpuWrapper(float *mean, int N, vector<string> filelist)
{
int size = N*N;
unsigned long long* dev_sum = 0;
unsigned short* dev_img = 0;
cudaError_t cudaStatus;
const int block_size = 512;
// ceil(size / block_size): one block per 512 pixels, plus one for a partial tail
const int num_blocks = (size/block_size) + ((size%block_size) ? 1 : 0);
int L = filelist.size();
// Choose which GPU to run on, change this on a multi-GPU system.
double totalgpuinittime = 0;
StartCounter(7);
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate device buffers: one image (size pixels) and one partial sum per block.
cudaStatus = cudaMalloc((void**)&dev_img, size * sizeof(unsigned short));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_sum, num_blocks*sizeof(unsigned long long));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
totalgpuinittime = GetCounter(7);
// NOTE(review): the `goto Error` statements above jump over the initialized
// declarations that follow (sum, psum, the timing totals); standard C++ does not
// allow that, and the Error path uses `sum` - confirm how this is being compiled.
unsigned short* img;
unsigned short* pimg;
unsigned long long* sum = new unsigned long long[num_blocks];
unsigned long long* psum = sum;
cout<<endl;
cout << "gpu looping starts, and in progress ..." << endl;
StartCounter(6);
double totalfileiotime = 0;
double totalh2dcpytime = 0;
double totalkerneltime = 0;
double totald2hcpytime = 0;
double totalcpusumtime = 0;
double totalloopingtime = 0;
for (int k = 0; k < L; k++)
{
StartCounter(1);
// NOTE(review): assumes LoadTIFF returns at least `size` unsigned shorts - confirm.
img = (unsigned short*)LoadTIFF(filelist[k].c_str());
totalfileiotime += GetCounter(1);
// psum is advanced while summing below, so rewind it for each image
psum = sum;
pimg = img;
float gpumean = 0;
memset(psum, 0, sizeof(unsigned long long)*num_blocks);
StartCounter(2);
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_img, pimg, size * sizeof(unsigned short), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_sum, psum, num_blocks*sizeof(unsigned long long), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
totalh2dcpytime += GetCounter(2);
StartCounter(3);
//reduceSum<<<num_blocks,block_size,num_blocks * sizeof(unsigned long long)>>>(dev_img, dev_sum, size);
//reduceSum<<<num_blocks,block_size,block_size * sizeof(unsigned short)>>>(dev_img, dev_sum, size);
// NOTE(review): no dynamic shared-memory size is passed here, while reduceSum
// declares `extern __shared__ unsigned long long sdata[]` and indexes it per thread.
reduceSum<<<num_blocks,block_size>>>(dev_img, dev_sum, size);
// NOTE(review): the counter is read right after the launch and before
// cudaDeviceSynchronize(); presumably this does not include kernel execution time - confirm.
totalkerneltime += GetCounter(3);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "reduction Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
// !!!!!! following is where the code 77 error occurs!!!!!!!
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
StartCounter(4);
cudaStatus = cudaMemcpy(psum, dev_sum, num_blocks * sizeof(unsigned long long ), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
totald2hcpytime += GetCounter(4);
StartCounter(5);
// Add the per-block partial sums on the host.
// NOTE(review): the total is accumulated in a float, so large sums lose precision.
for (int i = 0; i < num_blocks; i++)
{
gpumean += *psum;
psum++;
}
gpumean /= N*N;
totalcpusumtime += GetCounter(5);
// NOTE(review): scalar delete on the LoadTIFF buffer - confirm how LoadTIFF allocates it.
delete img;
img = NULL;
cout<<gpumean<<endl;
}
// R converts a total in seconds into microseconds per frame.
// NOTE(review): S/F is an integer division, and F is 0 when filelist is empty.
int S = 1e+6;
int F = filelist.size();
float R = S/F;
totalloopingtime = GetCounter(6);
cout<<"gpu looping ends."<<endl<<endl;
cout<< "analysis:"<<endl;
cout<<"gpu initialization time: "<<totalgpuinittime<<" sec"<<endl<<endl;
cout<<"file I/O time: "<<endl;
cout<<" total "<<totalfileiotime<<" sec | average "<<totalfileiotime*R<<" usec/frame"<<endl<<endl;
cout<<"host-to-device copy time: "<<endl;
cout<<" total "<<totalh2dcpytime<<" sec | average "<<totalh2dcpytime*R<<" usec/frame"<<endl<<endl;
cout<<"pure gpu kerneling time: "<<endl;
cout<<" total "<<totalkerneltime<<" sec | average "<<totalkerneltime*R<<" usec/frame"<<endl<<endl;
cout<<"device-to-host copy time: "<<endl;
cout<<" total "<<totald2hcpytime<<" sec | average "<<totald2hcpytime*R<<" usec/frame"<<endl<<endl;
/*cout<<"cpu summing time: "<<endl;
cout<<" total: "<<totalcpusumtime<<" sec | average: "<<totalcpusumtime*R<<" usec/frame"<<endl<<endl;;*/
/*cout <<"gpu looping time: " << endl;
cout<<" total: "<<totalloopingtime<<" sec | average: "<<totalloopingtime*R<<" usec/frame"<<endl;*/
// Cleanup shared by the success path and every failure path.
Error:
cudaFree(dev_sum);
cudaFree(dev_img);
// NOTE(review): `sum` was allocated with new[] but is released with scalar delete.
delete sum;
sum = NULL;
return cudaStatus;
}
// Host entry point: runs gpuWrapper over `filelist`, then resets the device and
// prints how long the reset took. Failures are reported on stderr only.
void kernel(float* &mean, int N, vector<string> filelist)
{
    // Run the whole load / reduce / report pipeline.
    const cudaError_t wrapperStatus = gpuWrapper(mean, N, filelist);
    if (cudaSuccess != wrapperStatus)
        fprintf(stderr, "gpuWapper failed!");

    // cudaDeviceReset must be called before exiting so that profiling and tracing
    // tools such as Nsight and Visual Profiler show complete traces.
    StartCounter(8);
    const cudaError_t resetStatus = cudaDeviceReset();
    if (cudaSuccess != resetStatus)
        fprintf(stderr, "cudaDeviceReset failed!");

    cout<<"gpu reset time: "<<GetCounter(8)<<" sec"<<endl<<endl;
}
I have allocated enough, and equal, memory for both the host and device buffers. Any comments are appreciated.
While this may not be the only source of error in the code, you are not allocating any dynamic shared memory for the reduction kernel, leading to the illegal addressing error you see. The correct kernel launch should be something like
size_t shm_size = block_size * sizeof(unsigned long long);
reduceSum<<<num_blocks,block_size,shm_size>>>(dev_img, dev_sum, size);
This allocates the equivalent of one unsigned long long for each thread running in the reduction kernel, which (by my very cursory reading of your code) should make the shared memory array sdata the correct size for the kernel to run without out-of-bounds access to that array.

How to call cudaMalloc from a separate function?

I'm learning cuda and try to write a function that allocate memory on the device in a similar way to that on the host. For example:
//host
// Allocates an uninitialised host array of `size` floats.
// Returns a non-NULL pointer that the caller releases with free().
// On failure (including a negative `size`) prints a message and terminates the
// process with EXIT_FAILURE.
float* allocate1D_float(int size)
{
    float* array = NULL;
    // A negative count would wrap to a huge size_t in the multiplication, so it is
    // rejected up front. malloc(0) may legitimately return NULL, so at least one
    // element is always requested to keep "NULL means failure" unambiguous.
    if (size >= 0)
    {
        const size_t count = (size > 0) ? (size_t)size : 1;
        array = (float*)malloc(count * sizeof(float));
    }
    if (array == NULL)
    {
        printf("\n Error allocating memory 1\n");
        exit(EXIT_FAILURE);
    }
    return array;
}
float *h_A = allocate1D_float(numElements);
//device
// Allocates device memory for `numElements` floats and returns the device pointer.
// On failure prints the CUDA error string to stderr and terminates the process
// with EXIT_FAILURE.
float* alloc_cuda1D_float(int numElements)
{
    const size_t bytes = numElements * sizeof(float);
    float *devicePtr = NULL;
    const cudaError_t status = cudaMalloc((void **)&devicePtr, bytes);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector (error code %s)!\n", cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
    return devicePtr;
}
float *d_A = alloc_cuda1D_float(int numElements);
However, nvcc keep saying
error: type name is not allowed
error: expected a ")"
for device function while the host function is ok. Hope you can help me to figure out the issue.
Thanks.
Regarding "type name is not allowed":
You did this correctly:
float *h_A = allocate1D_float(numElements);
But this is wrong:
float *d_A = alloc_cuda1D_float(int numElements);
^^^
This int shouldn't be here
So remove the int right in front of numElements
This of course has nothing to do with CUDA. Your host function call would have given a similar error if you attempted to put int where it doesn't belong in that call.

CUDA - atomicAdd only adds up to 16777216

I have the following, easily reproducible problem, when running the following kernel, which does nothing except atomicAdds of floats:
#define OUT_ITERATIONS 20000000
#define BLOCKS 12
#define THREADS 192
// Performs OUT_ITERATIONS atomic additions of 1.0f to *result in total, spread
// across all threads of a 1D launch by a grid-stride loop.
// The accumulator is a single float shared by every thread.
__global__ void testKernel(float* result) {
    const float bias = 1.0f;
    // Stride by the actual launch size so the total number of additions is
    // OUT_ITERATIONS for any 1D configuration, not only <<<BLOCKS, THREADS>>>.
    const int stride = gridDim.x * blockDim.x;
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < OUT_ITERATIONS; i += stride) {
        atomicAdd(result, bias);
    }
}
The kernel is supposed to increment the result OUT_ITERATIONS times, that is 20M. I call the kernel with this standard code:
int main() {
cudaError_t cudaStatus;
float* result;
float* dev_result;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
result = new float;
cudaStatus = cudaMalloc((void**)&dev_result, sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaStatus = cudaMemset(dev_result, 0, sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemset failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
testKernel<<<BLOCKS, THREADS>>>(dev_result);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
cudaStatus = cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
printf("Result: %f\n", *result);
However, the result printed at the end is 16777216.0, which is incidentally 0x1000000 in hex. The problem does not occur if OUT_ITERATIONS < 16777216, that is, if I change it to 16777000 for example, sure enough the output is 16777000.0!
System: NVidia-Titan, CUDA 5.5, Windows7
This issue is due to the limited precision of the type float.
float has only 24 bits of binary precision. If you add two numbers where one is more than 2^24-1 times larger than the other, the result will be exactly the same as the larger one.
When you add a big number like 16777216.0 (= 2^24) to a tiny number like 1.0, you lose precision and the result is still 16777216.0. The same thing happens in a standard C program:
float a=16777216.0f;
float b=1.0f;
printf("%f\n",a+b);
You could replace float with double or int to solve this problem.
Please refer to cuda doc for the implementation of the double version of atomicAdd()
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
20M does not fit within the available integer precision in a float.
A float quantity does not have 32 bits of mantissa (you discovered how many mantissa bits there are with your observation of "incidentally 0x1000000 in hex"), so it cannot represent all integers in the same way that a int or unsigned int can.
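16777216 (2^24) is the largest integer N for which every integer from 0 to N can be stored exactly in a float; above it only some integers are representable (16777217, for example, is not).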
Limit your storage range to what will fit in float, or else use some other representation, such as unsigned int or double if you want to reliably store 20M as an integer.
This isn't really a CUDA issue. You'd have similar difficulty trying to store large integers in a float in host code.

cudaMalloc does not work when trying to create a custom struct type

i am tring to build a cuda program to do ray-tracing, and i have some code below:
// First attempt at building the World: mixes host allocations, device
// allocations and direct host-side writes through the resulting pointers.
void build_world(World *w, RGBAColor* buffer){
// Host-side staging ViewPlane, filled in and then copied to the device.
w->vp = (ViewPlane*) malloc(sizeof(ViewPlane));
w->vp->hres = 512;
w->vp->vres = 512;
w->vp->buffer = buffer;
w->vp->s = 1;
ViewPlane *viewplane;
cudaMalloc(&viewplane,sizeof(ViewPlane)); //return cudaSuccess but pointer still NULL
cudaMemcpy(viewplane,w->vp,sizeof(ViewPlane),cudaMemcpyHostToDevice);
free(w->vp);
// From here on w->vp holds the address produced by cudaMalloc.
w->vp = viewplane;
cudaMalloc(&(w->background_color),sizeof(RGBAColor)); //return cudaSuccess but pointer still NULL
// NOTE(review): host code dereferences the pointer that cudaMalloc just filled in.
*(w->background_color) = black; //Memory access error
cudaMalloc(&(w->sphere),sizeof(Sphere)); //return cudaSuccess but pointer still NULL
// NOTE(review): same pattern - host-side writes through a cudaMalloc'd pointer.
w->sphere->center = Point3D(0.0,0.0,0.0);
w->sphere->radius = 300;
}
World *w is a static global pointer, and it is in the global memory.
My problem is that i can not allocate memory in device memory, all "cudaMalloc" calls do not work for most of the time.
I did what @RobertCrovella suggested in the comments, like this:
// Second attempt: same structure as before, with error checking added around
// each cudaMalloc.
void build_world(World *w, RGBAColor* buffer){
checkCudaErrors( cudaMalloc(&(w->vp),sizeof(ViewPlane)));
getLastCudaError("viewplane allocate failed");
// NOTE(review): w->vp now holds a cudaMalloc'd address; the assignments below
// go through it from host code.
w->vp->hres = 512; //memory access errors occurs here
w->vp->vres = 512;
w->vp->buffer = buffer;
w->vp->s = 1;
checkCudaErrors( cudaMalloc(&(w->background_color),sizeof(RGBAColor)));
getLastCudaError("background allocate failed");
// NOTE(review): host-side write through a cudaMalloc'd pointer.
*(w->background_color) = black;
checkCudaErrors( cudaMalloc(&(w->sphere),sizeof(Sphere)));
getLastCudaError("sphere allocate failed");
// NOTE(review): host-side writes through a cudaMalloc'd pointer.
w->sphere->center = Point3D(0.0,0.0,0.0);
w->sphere->radius = 300;
}
and it works once...the cudaMalloc API still returns "cudaSuccess" when it's not.
here is the definitions of structure:
typedef float3 Point3D;
typedef uchar4 RGBAColor;
struct Sphere{
Point3D center;
float radius;
};
struct ViewPlane{
public:
int hres;
int vres;
float s;
//float gamma;
//float inv_gamma;
RGBAColor *buffer;
};
struct World{
public:
ViewPlane *vp;
RGBAColor *background_color;
Sphere *sphere;
};
After considering the issues that @RobertCrovella mentions in the answer below, here is the third version of build_world:
struct World{
public:
ViewPlane *vp;
RGBAColor background_color;
Sphere *sphere;
};
// Third attempt: builds host-side staging copies (h_world, h_vp, h_sphere),
// allocates the device objects and copies each staging struct to the device.
void build_world(World *w, RGBAColor* buffer){
World *h_world;
h_world = (World*)malloc(sizeof(World));
ViewPlane *h_vp = (ViewPlane*)malloc(sizeof(ViewPlane));
h_vp->hres = 512;
h_vp->vres = 512;
// The caller's `buffer` pointer value is stored as-is in the ViewPlane that is copied to the device.
h_vp->buffer = buffer;
h_vp->s = 1;
// h_world->vp receives a device address; h_world itself stays on the host.
checkCudaErrors( cudaMalloc(&(h_world->vp),sizeof(ViewPlane)));
getLastCudaError("viewplane allocate failed");
checkCudaErrors( cudaMemcpy(h_world->vp,h_vp,sizeof(ViewPlane),cudaMemcpyHostToDevice));
getLastCudaError("viewplane memory copy failed");
h_world->background_color = black;
Sphere *h_sphere = (Sphere*)malloc(sizeof(Sphere));
h_sphere->center = Point3D(0.0,0.0,0.0);
h_sphere->radius = 300;
checkCudaErrors( cudaMalloc(&(h_world->sphere),sizeof(Sphere)));
getLastCudaError("sphere allocate failed");
checkCudaErrors( cudaMemcpy(h_world->sphere,h_sphere,sizeof(Sphere),cudaMemcpyHostToDevice));
getLastCudaError("sphere memory copy failed");
// NOTE(review): `w` is a by-value parameter, so this cudaMalloc only updates the
// local copy of the pointer; the caller's pointer is not changed by this function.
checkCudaErrors( cudaMalloc( &w , sizeof(World)));
getLastCudaError( "world allocate failed" );
checkCudaErrors( cudaMemcpy(w,h_world,sizeof(World),cudaMemcpyHostToDevice));
getLastCudaError("world memory copy failed");
free(h_world);free(h_vp);free(h_sphere);
}
This time, none of the cudaMemcpy calls seem to work: at the end of this function the values in h_vp and h_sphere are good; h_world->vp and h_world->sphere do point to an area of device memory, but it contains wrong values; and w does not have the correct value - all the pointers it contains are 0x00000000...
This question has officially become "a mess" because you have posted two substantially different versions of build_world which differ in important ways, apart from just the error checking I asked you to add. I will try and address some issues as I see them, however my understanding is clouded by the confusion in your posting.
If the pointer *w that you are passing to build_world is already a device pointer (i.e. allocated with cudaMalloc), which seems to be what you are saying, then none of this will work. Creating data structures on the device, which also contain pointers to other data structures that are also on the device, is a somewhat non-intuitive process. You cannot pass a pointer to cudaMalloc that already lives on the device (i.e. is already part of a region created with cudaMalloc). Instead it's necessary to create a parallel set of pointers on the host, cudaMalloc these pointers individually, then copy the pointer values to the appropriate regions in the device data structure, using cudaMemcpy. To see another example of what I am referring to, take a look here.
You cannot dereference device pointers in host code. For example:
w->vp->hres = 512;
If w or w->vp is a pointer set up with cudaMalloc, then the above operation is invalid. Instead it's necessary to create a parallel data structure on the host, set the values there, then cudaMemcpy from host to device:
h_vp->hres = 512;
cudaMemcpy(d_vp, h_vp, sizeof(vp_struct), cudaMemcpyHostToDevice);
Note that in this simplified description I'm glossing over the issue I mentioned in the first point above.
If you are calling build_world over and over again, you need to make sure that you are properly using cudaFree if you are passing the same *w pointer.
EDIT: In response to the additional posting of the 3rd version of build_world I elected to create a sample code which should have the remaining issues fixed:
#include <stdio.h>
#include <vector_functions.h>
#define black make_uchar4(4,3,2,1)
#define white make_uchar4(0,1,2,3)
// Checks the CUDA runtime's last-error state. On error it prints `msg`, the CUDA
// error string and the source location to stderr, then exits with status 1.
// Wrapped in do { } while (0) so the macro behaves as a single statement.
// The local is named err_ because identifiers containing a double underscore
// are reserved for the implementation.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t err_ = cudaGetLastError(); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
typedef float3 Point3D;
typedef uchar4 RGBAColor;
// Sphere described by a centre point and a radius.
struct Sphere{
Point3D center; // Point3D is a typedef of float3 in this file
float radius;
};
// View-plane parameters plus a pointer to its colour buffer.
// NOTE(review): hres/vres presumably are the horizontal/vertical resolution and
// s a scale factor - confirm against the renderer.
struct ViewPlane{
    int hres;
    int vres;
    float s;
    //float gamma;
    //float inv_gamma;
    RGBAColor *buffer;
};
// Top-level scene description: a view plane and a sphere held by pointer, and
// the background colour stored by value.
struct World{
    ViewPlane *vp;
    RGBAColor background_color;
    Sphere *sphere;
};
// Debug kernel: prints one field reached through each pointer of the World, to
// show that every nested pointer can be dereferenced in device code.
__global__ void my_kernel(World *w){
    const ViewPlane *plane = w->vp;
    printf("w->vp->hres = %d\n", plane->hres);
    printf("w->background_color.y = %d\n", w->background_color.y);
    printf("w->sphere->radius = %f\n", w->sphere->radius);
    printf("w->vp->buffer->y = %d\n", plane->buffer->y);
}
// Builds a World in device memory and stores its device address in *w.
//   w      - out parameter; receives the pointer returned by cudaMalloc for the World.
//   buffer - host pointer to one RGBAColor; exactly one element is copied to the device.
// Each nested object is staged in a host-side struct, given its own device
// allocation, and copied over; the staging struct that points to it then carries
// the device address. Staging structs use automatic storage, so there is no
// unchecked malloc result to dereference and nothing to free afterwards.
// Any CUDA failure terminates the process through cudaCheckErrors.
void build_world(World **w, RGBAColor* buffer){
    // View plane: its colour buffer must exist on the device before the
    // ViewPlane struct that points to it is copied.
    ViewPlane h_vp;
    h_vp.hres = 512;
    h_vp.vres = 512;
    h_vp.s = 1;
    h_vp.buffer = NULL;
    cudaMalloc((void **)&(h_vp.buffer), sizeof(RGBAColor));
    cudaCheckErrors("viewplane RGBAColor allocate failed");
    cudaMemcpy(h_vp.buffer, buffer, sizeof(RGBAColor), cudaMemcpyHostToDevice);
    cudaCheckErrors("viewplane RGBAColor copy failed");

    World h_world;
    h_world.vp = NULL;
    h_world.sphere = NULL;
    cudaMalloc((void **)&(h_world.vp), sizeof(ViewPlane));
    cudaCheckErrors("viewplane allocate failed");
    cudaMemcpy(h_world.vp, &h_vp, sizeof(ViewPlane), cudaMemcpyHostToDevice);
    cudaCheckErrors("viewplane memory copy failed");
    h_world.background_color = black;

    Sphere h_sphere;
    h_sphere.center = make_float3(0.0f, 0.0f, 0.0f);
    h_sphere.radius = 300;
    cudaMalloc((void **)&(h_world.sphere), sizeof(Sphere));
    cudaCheckErrors("sphere allocate failed");
    cudaMemcpy(h_world.sphere, &h_sphere, sizeof(Sphere), cudaMemcpyHostToDevice);
    cudaCheckErrors("sphere memory copy failed");

    // The World itself goes last, once it holds only device addresses.
    cudaMalloc((void **)w, sizeof(World));
    cudaCheckErrors("world allocate failed");
    cudaMemcpy(*w, &h_world, sizeof(World), cudaMemcpyHostToDevice);
    cudaCheckErrors("world memory copy failed");
}
// Builds the device-side World from a single host colour, launches one thread
// to print its contents, then waits for the kernel and checks for errors.
int main(){
    RGBAColor host_color = white;
    World *device_world;
    build_world(&device_world, &host_color);
    my_kernel<<<1,1>>>(device_world);
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");
    return 0;
}
You can compile this code with nvcc -arch=sm_20 -o t98 t98.cu
When I compile and run this code, I get no errors and the following output:
$ ./t98
w->vp->hres = 512
w->background_color.y = 3
w->sphere->radius = 300.000000
w->vp->buffer->y = 1
$

Cuda __syncthreads undefined. Without it-> random results

I am new to CUDA and I have a problem. I want to synchronize my threads, so I tried to use __syncthreads(). The problem is that Visual Studio 2010 says: identifier __syncthreads() is undefined... I am using CUDA 4.2, by the way. So I decided to use cudaDeviceSynchronize() instead and call it from the host. My code is something like the following (I am only including the important parts):
// Adds 1 to avg[0] and 2 to avg[1].
// The body does not depend on the thread or block index, so every thread of the
// launch performs a plain (non-atomic) read-modify-write on the same two locations.
__global__ void sum( float avg[]){
avg[0]+=1;
avg[1]+=2;
}
int main(){
float avg[2];
float *devAvg;
cudaError_t cudaStatus;
size_t size=sizeof(unsigned char)*2;
cudaStatus = cudaMalloc((void**)&devAvg, size2);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc 2 failed!");
return -1;
}
avg[0]=0;
avg[1]=0;
cudaStatus = cudaMemcpy(devAvg,avg, size, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
return -1;
}
dim3 nblocks(40,40);
dim3 nthreads(20,20);
sum<<<nblocks,nthreads,msBytes>>>(devAvg);
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
}
cudaStatus = cudaMemcpy(avg,devAvg,size,cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy Device to Host failed!");
return -1;}
cout<<"avg[0]="avg[0]<<" avg[1]="<<avg[1]<<endl;
cudaFree devAvg;
return 0;
}
I thought that the results should be
avg[0]=640.000 avg[1]=1.280.000
but not only are my results different (this could be an overflow problem), they are also not stable. For example, for three different executions the results are:
avg[0]=3041 avg[1]=6604
avg[0]=3015 avg[1]=6578
avg[0]=3047 avg[1]=6600
So what I am doing wrong here?Is it a synchronization problem?And why I cannot use __syncthreads()
Or is it the problem of race conditions?
Additionally for the __syncthreads() problem it comes with any code that I write. Even the simplest one:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <Windows.h>
// Kernel that executes on the CUDA device
// Squares a[idx] in place for every global index idx < N.
// Only the x dimension of the grid and block is used.
__global__ void square_array(float *a, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// guard: the grid may contain more threads than there are elements
if (idx<N) a[idx] = a[idx] * a[idx];
// The barrier sits outside the bounds check, so every thread of the block
// reaches it; no shared memory is used in this kernel.
__syncthreads();
}
// main routine that executes on the host
// Prints a diagnostic and returns false when a CUDA runtime call failed.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// main routine that executes on the host: squares the numbers 0..N-1 on the
// device and prints them. Returns 0 on success and 1 if any allocation, copy
// or the kernel launch failed.
int main(void)
{
    const int N = 10;                     // Number of elements in arrays
    const size_t size = N * sizeof(float);
    float *a_h = (float *)malloc(size);   // Host array
    float *a_d = NULL;                    // Device array
    int rc = 1;                           // Becomes 0 only when every step succeeded

    if (a_h == NULL) {
        fprintf(stderr, "host malloc failed\n");
        return 1;
    }

    if (cudaOk(cudaMalloc((void **) &a_d, size), "cudaMalloc")) {
        // Initialize host array and copy it to CUDA device
        for (int i=0; i<N; i++) a_h[i] = (float)i;
        if (cudaOk(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)")) {
            // Do calculation on device: enough blocks of block_size threads to cover N
            const int block_size = 4;
            const int n_blocks = (N + block_size - 1) / block_size;
            square_array <<< n_blocks, block_size >>> (a_d, N);
            // A bad launch configuration is reported by cudaGetLastError(); errors
            // raised while the kernel runs surface in the blocking copy that follows.
            if (cudaOk(cudaGetLastError(), "square_array launch") &&
                cudaOk(cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)")) {
                // Print results
                for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
                rc = 0;
            }
        }
    }

    // Cleanup (cudaFree accepts a null pointer)
    free(a_h); cudaFree(a_d);
    return rc;
}
It is saying this: Error: identifier "__syncthreads()" is undefined
The funny part is that even with the sample codes that comes with the 4.2 CUDA SDK the same thing happens... Maybe is something more general wrong because there are more functions in the SDK samples that are considered undefined.
All of your blocks of threads are writing to the same two locations. The only way to make this work properly is to use atomic operations. Otherwise the results of threads reading the location, adding to it and writing the result back to the location "simultaneously" is undefined.
If you rewrite your kernel as follows:
// Every thread of the launch adds 1 to avg[0] and 2 to avg[1]; atomicAdd makes
// each read-modify-write indivisible so that no update is lost.
__global__ void sum( float avg[]){
    float *total0 = &avg[0];
    float *total1 = &avg[1];
    atomicAdd(total0, 1.0f);
    atomicAdd(total1, 2.0f);
}
It should resolve the issue you are seeing.
To answer the question about __syncthreads(), I would need to see the exact code that caused the compiler error. If you post that, I'll update my answer. There shouldn't be a problem with inserting a __syncthreads() call in this kernel, although it won't fix the problem you are seeing.
You may wish to review the atomic operations section of the C programming guide.
Note that using atomics generally will cause your code to run slower, so they should be used carefully. However for this learning exercise it should sort out the issue for you.
also note that the code you posted doesn't compile cleanly, there are a number of missing definitions, and a variety of other issues with your code. But since you are posting results, I assume you have some version of this working, even though you haven't posted it. Therefore I haven't identified every issue with the code that you have posted.
Here is code that is similar to yours with all of the various coding issues fixed, and it seems to work for me:
#include <stdio.h>
#include <iostream>
#define msBytes 0
// Every thread of the launch adds 1 to avg[0] and 2 to avg[1]; atomicAdd makes
// each read-modify-write indivisible so that no update is lost.
__global__ void sum( float avg[]){
    float *total0 = &avg[0];
    float *total1 = &avg[1];
    atomicAdd(total0, 1.0f);
    atomicAdd(total1, 2.0f);
}
// Zeroes a two-element device array, launches `sum` on a 40x40 grid of 20x20
// blocks (640000 threads), copies the totals back and prints them.
// Returns 0 on success and -1 if any CUDA call fails. The device buffer is
// released on every path after it has been allocated.
int main(){
    float avg[2] = {0.0f, 0.0f};
    float *devAvg = NULL;
    const size_t size = sizeof(float)*2;
    bool ok = true;

    cudaError_t cudaStatus = cudaMalloc((void**)&devAvg, size);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc 2 failed!");
        return -1;
    }

    cudaStatus = cudaMemcpy(devAvg, avg, size, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        ok = false;
    }

    if (ok) {
        dim3 nblocks(40,40);
        dim3 nthreads(20,20);
        sum<<<nblocks,nthreads,msBytes>>>(devAvg);
        // A bad launch configuration is only reported through cudaGetLastError().
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "sum launch failed: %s\n", cudaGetErrorString(cudaStatus));
            ok = false;
        }
    }

    if (ok) {
        // Errors raised while the kernel runs surface here.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
            ok = false;
        }
    }

    if (ok) {
        cudaStatus = cudaMemcpy(avg, devAvg, size, cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy Device to Host failed!");
            ok = false;
        }
    }

    if (ok) {
        std::cout<<"avg[0]="<<avg[0]<<" avg[1]="<<avg[1]<<std::endl;
    }

    cudaFree(devAvg);
    return ok ? 0 : -1;
}
I get the following output when I run it:
avg[0]=640000 avg[1]=1.28e+06
Also note that for atomicAdd to be usable on float, it's necessary to have a compute capability 2.0 or better device (and to pass the compiler switch e.g. -arch=sm_20 to compile for that kind of device). If you have an earlier device (compute capability 1.x) then you can create a similar program defining avg[] as int instead of float. Or if you prefer, you can create your own atomicAdd __device__ function that is usable on a cc 1.x device as suggested here in the section beginning with "Note however that any atomic operation can be implemented based on atomicCAS() (Compare And Swap).".