Weird CUDA error in a very simple test [duplicate] - cuda

I am new to CUDA. When I multiply 1024x1024 matrices, launching the kernel with
multiplyKernel<<<dim3(32, 32, 1), dim3(32, 32, 1)>>>(dev_c, dev_a, dev_b, size);
everything works fine. But when I multiply 2048x2048 matrices, with a grid of
dim3(64, 64, 1)
I get this error:
cudaDeviceSynchronize returned error code 4 after launching addKernel!
unspecified launch failure
From tinkering with the code, I think the error is in this statement:
result += a[row * size + ind] * b[col + size * ind];
specifically in the part
b[col + size * ind]
If I take that out, I don't get a kernel launch error (just the wrong answer, obviously). I cannot figure out what's wrong. Any suggestions would be most appreciated.
I am using Visual Studio 2013. I have tried the debugger, but it does not help me find the error.
This seems to be a similar problem:
cudaDeviceSynchronize returned error code 4 after launching
Many thanks. Here is the code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size);
__global__ void multiplyKernel(int *c, const int *a, const int *b, unsigned int size)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= size || col >= size) return; // out-of-range guard
// target field in 1-D
int z = row * size + col;
int result = 0;
for (int ind = 0; ind < size ; ++ind) {
result += a[row * size + ind] * b[col + size * ind];
}
c[z] = result;
}
int main(){
const int sizeMatrix = 2048;
int* a = new int[sizeMatrix * sizeMatrix];
int* b = new int[sizeMatrix * sizeMatrix];
int* c = new int[sizeMatrix * sizeMatrix];
for (int i = 0; i < sizeMatrix * sizeMatrix; i++) {
a[i] = rand() % 2;
b[i] = rand() % 2;
}
cudaError_t cudaStatus = multiplyWithCuda(c, a, b, sizeMatrix);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a ;
int *dev_b;
int *dev_c;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
fprintf(stdout, "device set");
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for the three matrices (two input, one output).
cudaStatus = cudaMalloc((void**)&dev_c, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for c allocated \n");
cudaStatus = cudaMalloc((void**)&dev_a, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for a allocated \n");
cudaStatus = cudaMalloc((void**)&dev_b, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for b allocated \n");
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
fprintf(stdout, "cudaMemcpy a done \n");
cudaStatus = cudaMemcpy(dev_b, b, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
fprintf(stdout, "cudaMemcpy b done\n");
fprintf(stdout, "about to launch kernel \n");
// Launch a kernel on the GPU with one thread for each element.
multiplyKernel<<<dim3(64, 64, 1), dim3(32, 32, 1)>>>(dev_c, dev_a, dev_b, size);
fprintf(stdout, "kernel launched\n");
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
fprintf(stderr, " %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}

On Windows, I right-clicked the Nsight Monitor icon in the system tray and chose Options > General, which shows the WDDM TDR delay. It was set to 2 (seconds); I increased it to 10. Then I ran my program again, and it worked fine.
This was according to Robert's link (see above):
http://http.developer.nvidia.com/NsightVisualStudio/2.2/Documentation/UserGuide/HTML/Content/Timeout_Detection_Recovery.htm
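A way to sanity-check the TDR hypothesis before changing the setting (my own sketch, not part of the original answer) is to time the working 1024x1024 case with CUDA events and extrapolate; the 2048x2048 case does roughly 8x the work, so anything near 250 ms here predicts blowing through the default 2-second limit:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
// the working configuration: 1024x1024, size == 1024
multiplyKernel<<<dim3(32, 32, 1), dim3(32, 32, 1)>>>(dev_c, dev_a, dev_b, size);
cudaEventRecord(stop);
cudaEventSynchronize(stop);   // waits for the kernel; also surfaces execution errors
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
fprintf(stdout, "1024 case kernel time: %.1f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);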

Related

memory location error: thrust::stable_sort when using big array and user-defined comparison operator

I'm running this code to sort a big array of IPs using thrust::stable_sort and a user-defined operator to compare the IPs.
The code works for arrays of fewer than 50000 IPs, but I get a memory error for bigger arrays.
Here is the code I used:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <stdio.h>
#include <time.h>
#include <device_functions.h>
template<typename T>
struct vector_less
{
typedef T first_argument_type;
typedef T second_argument_type;
typedef bool result_type;
__host__ __device__ bool operator()(const T &lhs, const T &rhs) const {
if (lhs[0] == rhs[0])
if (lhs[1] == rhs[1])
if (lhs[2] == rhs[2])
return lhs[3] < rhs[3];
else
return lhs[2] < rhs[2];
else
return lhs[1] < rhs[1];
else
return lhs[0] < rhs[0];
}
};
__global__ void prepare_ips_list(unsigned char ** dev_sorted_Ips, unsigned char * ip_b1, unsigned char * ip_b2, unsigned char * ip_b3, unsigned char * ip_b4, unsigned int searchedIpsSize)
{
int thread = threadIdx.x + blockIdx.x * blockDim.x;
if (thread < searchedIpsSize)
{
dev_sorted_Ips[thread] = new unsigned char[4];
dev_sorted_Ips[thread][0] = ip_b1[thread];
dev_sorted_Ips[thread][1] = ip_b2[thread];
dev_sorted_Ips[thread][2] = ip_b3[thread];
dev_sorted_Ips[thread][3] = ip_b4[thread];
}
}
int main()
{
const int size = 1000000;
unsigned char * ip_b1 = new unsigned char[size];
unsigned char * ip_b2 = new unsigned char[size];
unsigned char * ip_b3 = new unsigned char[size];
unsigned char * ip_b4 = new unsigned char[size];
unsigned char * dev_ip_b1;
unsigned char * dev_ip_b2;
unsigned char * dev_ip_b3;
unsigned char * dev_ip_b4;
unsigned char ** dev_sortedIps;
for (int i = 0; i < size; i++)
{
ip_b1[i] = rand() % 240;
ip_b2[i] = rand() % 240;
ip_b3[i] = rand() % 240;
ip_b4[i] = rand() % 240;
}
cudaError_t cudaStatus;
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_ip_b1, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_ip_b1, ip_b1, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_ip_b2, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_ip_b2, ip_b2, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_ip_b3, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_ip_b3, ip_b3, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_ip_b4, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_ip_b4, ip_b4, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_sortedIps, size * sizeof(unsigned char *));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
int resetThreads = size;
int resetBlocks = 1;
if (size > 1024)
{
resetThreads = 1024;
resetBlocks = size / 1024;
if (size % 1024 > 0)
resetBlocks++;
}
prepare_ips_list<<<resetBlocks, resetThreads>>>(dev_sortedIps, dev_ip_b1, dev_ip_b2, dev_ip_b3, dev_ip_b4, size);
thrust::device_ptr<unsigned char *> sorted_list_ptr1(dev_sortedIps);
thrust::stable_sort(sorted_list_ptr1, sorted_list_ptr1 + size, vector_less<unsigned char *>());
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching !\n", cudaStatus);
goto Error;
}
return 0;
Error:
cudaFree(dev_ip_b1);
cudaFree(dev_ip_b2);
cudaFree(dev_ip_b3);
cudaFree(dev_ip_b4);
cudaFree(dev_sortedIps);
}
The error I get is:
Microsoft C++ exception: thrust::system::system_error at memory location
How can I solve this problem with big arrays?
Should I use another technique to achieve this sorting, such as dividing the array, sorting the parts, and then merging?
The proximal issue is that in-kernel malloc and new are limited in the size of the device heap that they have available to allocate from. This limit can be raised. Please read the documentation.
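For example (a sketch; the 128 MB figure is just an illustrative value, and the default heap is typically 8 MB), the limit can be raised like this:
// must run before any kernel that uses in-kernel new/malloc is launched
cudaError_t err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128ULL * 1024 * 1024);
if (err != cudaSuccess)
    fprintf(stderr, "cudaDeviceSetLimit failed!\n");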
A few other suggestions:
You're not doing any error checking after your kernel (before the first thrust call). If you did, you would discover that your kernel is what is failing, and that thrust is merely reporting the error for you. Avoid the confusion: do rigorous, proper CUDA error checking any time you are having trouble with a CUDA code.
As a good practice, it's not a bad idea, at least for debugging purposes, to test any pointer returned by new or malloc for NULL. This is how the API informs you that an allocation failure occurred.
The code below demonstrates a possible workaround for the proximal issue, by adjusting the device heap for the input size. It also demonstrates possible ways to address the other two suggestions:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <device_functions.h>
#include <assert.h>
template<typename T>
struct vector_less
{
typedef T first_argument_type;
typedef T second_argument_type;
typedef bool result_type;
__host__ __device__ bool operator()(const T &lhs, const T &rhs) const {
if (lhs[0] == rhs[0])
if (lhs[1] == rhs[1])
if (lhs[2] == rhs[2])
return lhs[3] < rhs[3];
else
return lhs[2] < rhs[2];
else
return lhs[1] < rhs[1];
else
return lhs[0] < rhs[0];
}
};
__global__ void prepare_ips_list(unsigned char ** dev_sorted_Ips, unsigned char * ip_b1, unsigned char * ip_b2, unsigned char * ip_b3, unsigned char * ip_b4, unsigned int searchedIpsSize)
{
int thread = threadIdx.x + blockIdx.x * blockDim.x;
if (thread < searchedIpsSize)
{
dev_sorted_Ips[thread] = new unsigned char[4];
if (dev_sorted_Ips[thread] == NULL) assert(0);
dev_sorted_Ips[thread][0] = ip_b1[thread];
dev_sorted_Ips[thread][1] = ip_b2[thread];
dev_sorted_Ips[thread][2] = ip_b3[thread];
dev_sorted_Ips[thread][3] = ip_b4[thread];
}
}
int main(int argc, char *argv[])
{
int size = 50000;
if (argc > 1) size = atoi(argv[1]);
int chunks = size/50000 + 1;
cudaError_t cudaStatus;
cudaStatus = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 8000000 * chunks);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "set device heap limit failed!");
}
unsigned char * ip_b1 = new unsigned char[size];
unsigned char * ip_b2 = new unsigned char[size];
unsigned char * ip_b3 = new unsigned char[size];
unsigned char * ip_b4 = new unsigned char[size];
unsigned char * dev_ip_b1;
unsigned char * dev_ip_b2;
unsigned char * dev_ip_b3;
unsigned char * dev_ip_b4;
unsigned char ** dev_sortedIps;
for (int i = 0; i < size; i++)
{
ip_b1[i] = rand() % 240;
ip_b2[i] = rand() % 240;
ip_b3[i] = rand() % 240;
ip_b4[i] = rand() % 240;
}
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
}
cudaStatus = cudaMalloc((void**)&dev_ip_b1, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_ip_b1, ip_b1, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaStatus = cudaMalloc((void**)&dev_ip_b2, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_ip_b2, ip_b2, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaStatus = cudaMalloc((void**)&dev_ip_b3, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_ip_b3, ip_b3, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaStatus = cudaMalloc((void**)&dev_ip_b4, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_ip_b4, ip_b4, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaStatus = cudaMalloc((void**)&dev_sortedIps, size * sizeof(unsigned char *));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
int resetThreads = size;
int resetBlocks = 1;
if (size > 1024)
{
resetThreads = 1024;
resetBlocks = size / 1024;
if (size % 1024 > 0)
resetBlocks++;
}
prepare_ips_list<<<resetBlocks, resetThreads>>>(dev_sortedIps, dev_ip_b1, dev_ip_b2, dev_ip_b3, dev_ip_b4, size);
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess){
printf(" kernel fail\n");
exit(0);}
thrust::device_ptr<unsigned char *> sorted_list_ptr1(dev_sortedIps);
thrust::stable_sort(sorted_list_ptr1, sorted_list_ptr1 + size, vector_less<unsigned char *>());
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(cudaStatus));
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching !\n", cudaStatus);
}
return 0;
}
Note that you can test various sizes by passing the desired size as a command line parameter. I tested up to 1000000, it seemed to work fine. Eventually, for a large enough problem size, you will run out of memory on your GPU. You don't indicate what GPU you have.
I've removed the goto statements, because I am working on linux (apparently you've switched back to windows). I would suggest you come up with a different error handling process than using goto, if for no other reason than that it causes difficulty with thrust constructs.
Also note that in-kernel new or malloc is kind of "slow". You could probably speed this up for larger sizes substantially by doing your necessary allocation up-front, with a single cudaMalloc call of the appropriate size. Unfortunately this is complicated by your use of the double-pointer array dev_sorted_Ips. I would suggest that you instead flatten that to a single pointer array, allocate the necessary size once via cudaMalloc, and do the necessary array indexing in your kernel to make it work. If you profile this code, you'll discover that the vast majority of execution time for longer cases (e.g. size = 1000000) is consumed by your prepare_ips_list kernel, not the sorting operation. So the focus of your efforts for performance improvement should begin there.
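To illustrate the flattening idea, here is a minimal sketch of my own (names like pack_ips and dev_keys are hypothetical, not the OP's code) that packs each IP's four octets into a single unsigned int key, so one up-front cudaMalloc replaces all the in-kernel new calls; sorting the packed keys with thrust's default comparator then gives exactly the lexicographic order that vector_less implements:
__global__ void pack_ips(unsigned int *dev_keys, const unsigned char *ip_b1,
                         const unsigned char *ip_b2, const unsigned char *ip_b3,
                         const unsigned char *ip_b4, unsigned int n)
{
    int thread = threadIdx.x + blockIdx.x * blockDim.x;
    if (thread < n)
        // most-significant octet first preserves lexicographic order
        dev_keys[thread] = ((unsigned int)ip_b1[thread] << 24) |
                           ((unsigned int)ip_b2[thread] << 16) |
                           ((unsigned int)ip_b3[thread] <<  8) |
                            (unsigned int)ip_b4[thread];
}
// host side: one allocation, one kernel, one sort
// unsigned int *dev_keys;
// cudaMalloc((void**)&dev_keys, size * sizeof(unsigned int));
// pack_ips<<<resetBlocks, resetThreads>>>(dev_keys, dev_ip_b1, dev_ip_b2, dev_ip_b3, dev_ip_b4, size);
// thrust::device_ptr<unsigned int> kp(dev_keys);
// thrust::stable_sort(kp, kp + size);
Sorting plain 4-byte keys should also allow thrust to take its fast radix-sort path, which a user-defined comparator rules out.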

Values of array after cudaMemcpy do not change, any idea why? [duplicate]

Thank you very much for reading my thread.
I am doing CUDA work, but I keep getting cudaDeviceSynchronize() error code 77: cudaErrorIllegalAddress, without any idea why. I searched for both the code and the function and, surprisingly, only a few results showed up. Very strange.
I basically sum up all the pixels of images. To give my question as much context as possible, I am showing all my CUDA code here:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "thorcalgpu.h"
#include <stdio.h>
#include "math.h"
#include <vector>
#include <algorithm>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>
#include <iostream>
using namespace std;
float random_float(void)
{
return static_cast<float>(rand()) / RAND_MAX;
}
__global__ void reduceSum(unsigned short *input,
unsigned long long *per_block_results,
const int n)
{
extern __shared__ unsigned long long sdata[];
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
// load input into __shared__ memory
unsigned short x = 0;
if(i < n)
{
x = input[i];
}
sdata[threadIdx.x] = x;
__syncthreads();
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
{
if(threadIdx.x < offset)
{
// add a partial sum upstream to our own
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
// wait until all threads in the block have
// updated their partial sums
__syncthreads();
}
// thread 0 writes the final result
if(threadIdx.x == 0)
{
per_block_results[blockIdx.x] = sdata[0];
}
}
// Helper function for using CUDA to add vectors in parallel.
//template <class T>
cudaError_t gpuWrapper(float *mean, int N, vector<string> filelist)
{
int size = N*N;
unsigned long long* dev_sum = 0;
unsigned short* dev_img = 0;
cudaError_t cudaStatus;
const int block_size = 512;
const int num_blocks = (size/block_size) + ((size%block_size) ? 1 : 0);
int L = filelist.size();
// Choose which GPU to run on, change this on a multi-GPU system.
double totalgpuinittime = 0;
StartCounter(7);
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_img, size * sizeof(unsigned short));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_sum, num_blocks*sizeof(unsigned long long));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
totalgpuinittime = GetCounter(7);
unsigned short* img;
unsigned short* pimg;
unsigned long long* sum = new unsigned long long[num_blocks];
unsigned long long* psum = sum;
cout<<endl;
cout << "gpu looping starts, and in progress ..." << endl;
StartCounter(6);
double totalfileiotime = 0;
double totalh2dcpytime = 0;
double totalkerneltime = 0;
double totald2hcpytime = 0;
double totalcpusumtime = 0;
double totalloopingtime = 0;
for (int k = 0; k < L; k++)
{
StartCounter(1);
img = (unsigned short*)LoadTIFF(filelist[k].c_str());
totalfileiotime += GetCounter(1);
psum = sum;
pimg = img;
float gpumean = 0;
memset(psum, 0, sizeof(unsigned long long)*num_blocks);
StartCounter(2);
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_img, pimg, size * sizeof(unsigned short), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_sum, psum, num_blocks*sizeof(unsigned long long), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
totalh2dcpytime += GetCounter(2);
StartCounter(3);
//reduceSum<<<num_blocks,block_size,num_blocks * sizeof(unsigned long long)>>>(dev_img, dev_sum, size);
//reduceSum<<<num_blocks,block_size,block_size * sizeof(unsigned short)>>>(dev_img, dev_sum, size);
reduceSum<<<num_blocks,block_size>>>(dev_img, dev_sum, size);
totalkerneltime += GetCounter(3);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "reduction Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
// !!!!!! following is where the code 77 error occurs!!!!!!!
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
StartCounter(4);
cudaStatus = cudaMemcpy(psum, dev_sum, num_blocks * sizeof(unsigned long long ), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
totald2hcpytime += GetCounter(4);
StartCounter(5);
for (int i = 0; i < num_blocks; i++)
{
gpumean += *psum;
psum++;
}
gpumean /= N*N;
totalcpusumtime += GetCounter(5);
delete img;
img = NULL;
cout<<gpumean<<endl;
}
int S = 1e+6;
int F = filelist.size();
float R = S/F;
totalloopingtime = GetCounter(6);
cout<<"gpu looping ends."<<endl<<endl;
cout<< "analysis:"<<endl;
cout<<"gpu initialization time: "<<totalgpuinittime<<" sec"<<endl<<endl;
cout<<"file I/O time: "<<endl;
cout<<" total "<<totalfileiotime<<" sec | average "<<totalfileiotime*R<<" usec/frame"<<endl<<endl;
cout<<"host-to-device copy time: "<<endl;
cout<<" total "<<totalh2dcpytime<<" sec | average "<<totalh2dcpytime*R<<" usec/frame"<<endl<<endl;
cout<<"pure gpu kerneling time: "<<endl;
cout<<" total "<<totalkerneltime<<" sec | average "<<totalkerneltime*R<<" usec/frame"<<endl<<endl;
cout<<"device-to-host copy time: "<<endl;
cout<<" total "<<totald2hcpytime<<" sec | average "<<totald2hcpytime*R<<" usec/frame"<<endl<<endl;
/*cout<<"cpu summing time: "<<endl;
cout<<" total: "<<totalcpusumtime<<" sec | average: "<<totalcpusumtime*R<<" usec/frame"<<endl<<endl;;*/
/*cout <<"gpu looping time: " << endl;
cout<<" total: "<<totalloopingtime<<" sec | average: "<<totalloopingtime*R<<" usec/frame"<<endl;*/
Error:
cudaFree(dev_sum);
cudaFree(dev_img);
delete [] sum;
sum = NULL;
return cudaStatus;
}
void kernel(float* &mean, int N, vector<string> filelist)
{
// wrapper and kernel
cudaError_t cudaStatus = gpuWrapper(mean, N, filelist);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "gpuWapper failed!");
}
// printf("mean is: %f\n", mean);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
StartCounter(8);
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceReset failed!");
}
cout<<"gpu reset time: "<<GetCounter(8)<<" sec"<<endl<<endl;
//return *mean;
}
I have allocated enough, and equivalent, memory space on both the host and the device. Any comments are appreciated.
While this may not be the only source of error in the code, you are not allocating any dynamic shared memory for the reduction kernel, leading to the illegal addressing error you see. The correct kernel launch should be something like
size_t shm_size = block_size * sizeof(unsigned long long);
reduceSum<<<num_blocks,block_size,shm_size>>>(dev_img, dev_sum, size);
This allocates the equivalent of one unsigned long long for each thread running in the reduction kernel, which (by my very cursory reading of your code) should make the shared memory array sdata the correct size for the kernel to run without out-of-bounds access to that array.
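As an aside (my suggestion, beyond the original answer): since block_size is a compile-time constant (512) in this code, a statically sized shared array would sidestep the launch-parameter issue entirely, at the cost of flexibility:
// inside reduceSum, replacing the extern declaration; no third <<<>>>
// argument is needed, but the size must match the launch's block size
__shared__ unsigned long long sdata[512];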

How to call cudaMalloc from a separate function?

I'm learning CUDA and trying to write a function that allocates memory on the device in a way similar to malloc on the host. For example:
//host
float* allocate1D_float(int size)
{
float* array = (float*)malloc(size* sizeof(float));
if (array==NULL)
{
printf("\n Error allocating memory 1\n");
free(array);
exit(EXIT_FAILURE);
}
return array;
}
float *h_A = allocate1D_float(numElements);
//device
float* alloc_cuda1D_float(int numElements)
{
float *d_array = NULL;
size_t size = numElements * sizeof(float);
cudaError_t err = cudaSuccess;
err = cudaMalloc((void **)&d_array, size);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
return d_array;
}
float *d_A = alloc_cuda1D_float(int numElements);
However, nvcc keeps saying
error: type name is not allowed
error: expected a ")"
for the device function, while the host function compiles fine. I hope you can help me figure out the issue.
Thanks.
Regarding "type name is not allowed":
You did this correctly:
float *h_A = allocate1D_float(numElements);
But this is wrong:
float *d_A = alloc_cuda1D_float(int numElements);
^^^
This int shouldn't be here
So remove the int right in front of numElements
This of course has nothing to do with CUDA. Your host function call would have given a similar error if you attempted to put int where it doesn't belong in that call.
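For completeness, the corrected call, together with a typical copy into the freshly allocated device buffer:
float *h_A = allocate1D_float(numElements);
float *d_A = alloc_cuda1D_float(numElements);   // no type name in the call
cudaMemcpy(d_A, h_A, numElements * sizeof(float), cudaMemcpyHostToDevice);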

CUDA - atomicAdd only adds up to 16777216

I have the following, easily reproducible problem when running the kernel below, which does nothing except atomicAdds of floats:
#define OUT_ITERATIONS 20000000
#define BLOCKS 12
#define THREADS 192
__global__ void testKernel(float* result) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
float bias = 1.0f;
int n = 1;
while (i < OUT_ITERATIONS) {
atomicAdd(result, bias);
i += BLOCKS * THREADS;
}
}
The kernel is supposed to increment the result OUT_ITERATIONS times, that is, 20M. I call the kernel with this standard code:
int main() {
cudaError_t cudaStatus;
float* result;
float* dev_result;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
result = new float;
cudaStatus = cudaMalloc((void**)&dev_result, sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaStatus = cudaMemset(dev_result, 0, sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemset failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
testKernel<<<BLOCKS, THREADS>>>(dev_result);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
cudaStatus = cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
printf("Result: %f\n", *result);
However, the result printed at the end is 16777216.0, which is incidentally 0x1000000 in hex. The problem does not occur if OUT_ITERATIONS < 16777216; if I change it to 16777000, for example, sure enough the output is 16777000.0!
System: NVidia-Titan, CUDA 5.5, Windows7
This issue is due to the limited precision of the float type.
A float has only 24 bits of binary precision. If you add two numbers where one is more than 2^24 - 1 times larger than the other, the result will be exactly the same as the larger one.
When you add a big number like 16777216.0 (= 2^24) to a tiny number like 1.0, you lose precision and the result is still 16777216.0. The same situation happens in a standard C program:
float a=16777216.0f;
float b=1.0f;
printf("%f\n",a+b);
You could replace float with double or int to solve this problem.
Please refer to the CUDA documentation for the implementation of the double version of atomicAdd():
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
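For reference, the implementation given in that section of the programming guide (needed on devices without native double-precision atomicAdd support) is:
__device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        // reinterpret the bits, add, and swap back in only if unchanged
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}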
20M does not fit within the available integer precision in a float.
A float quantity does not have 32 bits of mantissa (you discovered how many mantissa bits there are with your observation of "incidentally 0x1000000 in hex"), so it cannot represent all integers in the same way that an int or unsigned int can.
16777216 is the largest integer that can be reliably stored in a float.
Limit your storage range to what will fit in a float, or else use some other representation, such as unsigned int or double, if you want to reliably store 20M as an integer.
This isn't really a CUDA issue. You'd have similar difficulty trying to store large integers in a float in host code.
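To make the integer alternative concrete, here is a minimal sketch (my variant of the question's kernel; atomicAdd has a native unsigned int overload) that counts all 20M increments exactly:
__global__ void testKernelUint(unsigned int* result) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    while (i < OUT_ITERATIONS) {
        atomicAdd(result, 1u);   // exact integer accumulation, no rounding
        i += BLOCKS * THREADS;
    }
}
// host side: allocate/memset an unsigned int instead of a float and copy it
// back the same way; the printed result is then exactly 20000000.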

I lost data after __syncthreads() in cuda

I am trying to find the maximum of an array. I took help from CUDA Maximum Reduction Algorithm Not Working and made some modifications of my own. However, I am running it on 16 data points, and I find that the kernel's shared memory holds only the first 4 values; the rest are lost. I put in two cuPrintf calls: the 1st shows the data is there in shared memory, but the 2nd, placed just after __syncthreads(), shows 0 from thread id 4 onwards. Please help.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "cuPrintf.cu"
#include "cuPrintf.cuh"
__device__ float MaxOf2(float a, float b)
{
if(a > b) return a;
else return b;
}
__global__ void findMax(int size,float *array_device , float *outPut)
{
extern __shared__ float sdata[];
int tid = threadIdx.x;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if(i< size)
{
sdata[tid] = array_device[i];
cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);
__threadfence();
}
__syncthreads();
if(tid<size)
cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);
for ( int s=blockDim.x/2; s>0; s=s>>1)//s=blockDim.x/2
{
if (tid < s)
{
sdata[tid]= MaxOf2(sdata[tid],sdata[tid+s]);
}
__syncthreads();
}
if (tid == 0) outPut[blockIdx.x] = sdata[0];
}
int main()
{
long double M = pow(2,20);
long double N = 2;
int noThreadsPerBlock = 512 ;
printf("\n Provide the array Size N.(array will be of size N * 2^20 ) :-");
scanf("%Lf",&N);
long int size = 16;
int numOfBlock = (int)size /noThreadsPerBlock + 1;
printf("\n num of blocks==%ld",numOfBlock);
float *array_device , *outPut;
float array_host[]={221,100,2,340,47,36,500,1,33,4460,5,6,7,8,9,11};
cudaMalloc((void **)&array_device, size*sizeof(float));
cudaMalloc((void **)&outPut, size*sizeof(float));
cudaError_t error0 = cudaGetLastError();
printf("\n 0CUDA error: %s\n", cudaGetErrorString(error0));
printf("size===%ld",size);
cudaMemcpy(array_device, array_host, size*sizeof(float), cudaMemcpyHostToDevice);
cudaError_t error1 = cudaGetLastError();
printf("\n1CUDA error: %s\n", cudaGetErrorString(error1));
while(size>1 )
{
cudaPrintfInit();
findMax<<<numOfBlock, noThreadsPerBlock>>>(size, array_device, outPut);
cudaPrintfDisplay(stdout, true);
cudaPrintfEnd();
cudaError_t error2 = cudaGetLastError();
printf(" 2CUDA error: %s\n", cudaGetErrorString(error2));
cudaMemcpy(array_device, outPut, size*sizeof(float), cudaMemcpyDeviceToDevice);
size = numOfBlock;
printf("\n ****size==%ld\n",size);
numOfBlock = (int)size /noThreadsPerBlock + 1;
}
cudaMemcpy(array_host, outPut, size*sizeof(float), cudaMemcpyDeviceToHost);
cudaError_t error3 = cudaGetLastError();
printf("\n3CUDA error: %s\n", cudaGetErrorString(error3));
for(int i=0;i<size;i++)
printf("\n index==%d ;data=%f ",i,array_host[i]);
return 0;
}
I'm posting my comment as an answer, as requested.
Firstly, you haven't specified the dynamic size of the shared memory in the kernel launch. It should look something like:
findMax<<<numOfBlock, noThreadsPerBlock, sizeof(float) * noThreadsPerBlock>>>
Secondly, what was the concept behind the condition if (tid < size) on the second cuPrintf? Providing the output of the program would also help.
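A further point (my addition, in the spirit of the reduceSum kernel quoted earlier on this page): threads with i >= size never write sdata[tid], so the reduction tree reads uninitialized shared memory. A sketch of the fix is to have every thread initialize its slot with a neutral element before the loop (assumes <float.h> for FLT_MAX):
// every thread writes its slot; out-of-range threads store a value
// that can never win a max-reduction
sdata[tid] = (i < size) ? array_device[i] : -FLT_MAX;
__syncthreads();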