Achieved Occupancy column is not shown is Nsight Profiling result - cuda

I have faced a problem that is very weird to me. I can not see the achieved occupancy column in Nsight Performance Analysis output. I am using Geforce 920M GPU, NVIDIA driver of version 425.31, Nsight version of 6.0.0.18296 and visual studio 2017. The Nsight's version is compatible with driver's.
Can anyone help me out? I have quite no idea that why this happens.
I use Nsight performance analysis with CUDA trace checked as bellow:
I also used Visual Profiler but the achieved occupancy could not be seen there, too.
And the GPU examination gives out an error:
Note that as talonmies mentioned the error above was due to not running profiler in administrator mode. And solved but achieved occupancy is still not shown.
And here is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <math.h>
#include <iostream>
#define MAX_HISTORGRAM_NUMBER 10000
#define ARRAY_SIZE 102400000
#define CHUNK_SIZE 100
#define THREAD_COUNT 8
#define SCALER 80
cudaError_t histogramWithCuda(int *a, unsigned long long int *c);
__global__ void histogramKernelSingle(unsigned long long int *c, int *a)
{
unsigned long long int worker = blockIdx.x*blockDim.x + threadIdx.x;
unsigned long long int start = worker * CHUNK_SIZE;
unsigned long long int end = start + CHUNK_SIZE;
for (int ex = 0; ex < SCALER; ex++)
for (long long int i = start; i < end; i++)
{
if (i < ARRAY_SIZE)
atomicAdd(&c[a[i]], 1);
else
{
break;
}
}
}
int main()
{
int* a = (int*)malloc(sizeof(int)*ARRAY_SIZE);
unsigned long long int* c = (unsigned long long int*)malloc(sizeof(unsigned long long int)*MAX_HISTORGRAM_NUMBER);
for (unsigned long long i = 0; i < ARRAY_SIZE;i++)
a[i] = rand() % MAX_HISTORGRAM_NUMBER;
for (unsigned long long i = 0; i < MAX_HISTORGRAM_NUMBER; i++)
c[i] = 0;
// Add vectors in parallel.
double start_time = omp_get_wtime();
cudaError_t cudaStatus=histogramWithCuda(a,c);
double end_time = omp_get_wtime();
std::cout << end_time - start_time;
// =
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
unsigned long long int R = 0;
for (int i = 0; i < MAX_HISTORGRAM_NUMBER; i++)
{
R += c[i];
//printf("%d ", c[i]);
}
printf("\nCORRECT:%ld ", R/(SCALER));
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t histogramWithCuda(int *a, unsigned long long int *c)
{
int *dev_a = 0;
unsigned long long int *dev_c = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, MAX_HISTORGRAM_NUMBER * sizeof(unsigned long long int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, ARRAY_SIZE * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
//// BLOCK CALCULATOR HERE
////BLOCK CALCULATOR HERE
histogramKernelSingle << < ARRAY_SIZE / (THREAD_COUNT*CHUNK_SIZE), THREAD_COUNT>> > (dev_c, dev_a);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, MAX_HISTORGRAM_NUMBER * sizeof(unsigned long long int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
return cudaStatus;
}
Thanks in advance.

Achieved Occupancy is only captured in the Profile Activity. The Trace Activity does not support capturing GPU performance counters. Achieved Occupancy is sm__active_warps_sum / sm__actice_cycles_sum / SM__MAX_WARPS * 100.
Nsight Visual Studio Edition
The Trace Activity cannot collect Achieved Occupancy. Run the command Nsight | Start Performance Analysis ... and in the Activity window select Profile CUDA Application (not Trace Application). The default Profile CUDA Application contains the experiment Achieved Occupancy.
NVIDIA Visual Profiler
In NVVP ensure that you are collecting GPU performance counters. The default activity will collect the timeline but will not collect GPU events.
Run | Generate Timeline will not collect Achieved Occupancy
Run | Analyze Application will collect Achieved Occupancy
If you continue to have issues then you may have an issue with permissions on the system. Please try collecting another set of performance counters using Nsight Profile CUDA Application or NVVP | Collect Metrics and Events...

Related

Weird CUDA error in a very simple test [duplicate]

I am new to CUDA. When I multiply the 1024x1024 matrix, and launch a kernel with:
multiplyKernel << <dim3(32,32, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);
But when I multiply a 2048 x 2048 matrix, with
dim3(64,64,1)
I get this error:
cudaDeviceSynchronize returned error code 4 after launching addKernel!
unspecified launch failure
From tinkering with the code, I think that the error is in this statement
result += a[row * size + ind] * b[col + size * ind];
in the part
b[col+size*ind]
If I take that out, I don't get a kernel launch error (just the wrong answer, obviously). I cannot figure out what's wrong. Any suggestions would be most appreciated.
I am using Visual Studio 2013. I am using the debugger, but this does not help me find the error.
This seems to be a similar problem:
cudaDeviceSynchronize returned error code 4 after launching
many thanks, here is the code:
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size);
__global__ void multiplyKernel(int *c, const int *a, const int *b, unsigned int size)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row > size || col > size) return;
// target field in 1-D
int z = row * size + col;
int result = 0;
for (int ind = 0; ind < size ; ++ind) {
result += a[row * size + ind] * b[col + size * ind];
}
c[z] = result;
}
int main(){
const int sizeMatrix = 2048;
int* a = new int[sizeMatrix * sizeMatrix];
int* b = new int[sizeMatrix * sizeMatrix];
int* c = new int[sizeMatrix * sizeMatrix];
for (int i = 0; i < sizeMatrix * sizeMatrix; i++) {
a[i] = rand() % 2;
b[i] = rand() % 2;
}
cudaError_t cudaStatus = multiplyWithCuda(c, a, b, sizeMatrix);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a ;
int *dev_b;
int *dev_c;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
fprintf(stdout, "device set");
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for c allocated \n");
cudaStatus = cudaMalloc((void**)&dev_a, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for a allocated \n");
cudaStatus = cudaMalloc((void**)&dev_b, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for b allocated \n");
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
fprintf(stdout, "cudaMemcpy a done \n");
cudaStatus = cudaMemcpy(dev_b, b, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
fprintf(stdout, "cudaMemcpy b done\n");
fprintf(stdout, "about to launch kernel \n");
// Launch a kernel on the GPU with one thread for each element.
multiplyKernel << <dim3(64,64, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);
fprintf(stdout, "kernel launched\n");
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
; fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
fprintf(stderr, " %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
On Windows, I right clicked the NSight monitor icon in the system tray. There I chose Options>General. We see WDDM TDR delay. It was at 2, and I increased it to 10. Then, I ran my program again, and it worked fine.
This was according to Robert's link (see above)
http://http.developer.nvidia.com/NsightVisualStudio/2.2/Documentation/UserGuide/HTML/Content/Timeout_Detection_Recovery.htm

Values of array after cudaMemcpy do not change, any idea why? [duplicate]

Thank you very much for reading my threads.
I am doing CUDA work, but keep getting cudaDeviceSynchronize() error code 77: cudaErrorIllegalAddress, without any idea why. I did the search for both the code and the function, surprisingly , only a few records showed up. Very strange.
I basically sum up all pixels of images. To make my questions have as much reference as it can, I am showing all my CUDA code here:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "thorcalgpu.h"
#include <stdio.h>
#include "math.h"
#include <vector>
#include <algorithm>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>
#include <iostream>
using namespace std;
float random_float(void)
{
return static_cast<float>(rand()) / RAND_MAX;
}
__global__ void reduceSum(unsigned short *input,
unsigned long long *per_block_results,
const int n)
{
extern __shared__ unsigned long long sdata[];
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
// load input into __shared__ memory
unsigned short x = 0;
if(i < n)
{
x = input[i];
}
sdata[threadIdx.x] = x;
__syncthreads();
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
{
if(threadIdx.x < offset)
{
// add a partial sum upstream to our own
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
// wait until all threads in the block have
// updated their partial sums
__syncthreads();
}
// thread 0 writes the final result
if(threadIdx.x == 0)
{
per_block_results[blockIdx.x] = sdata[0];
}
}
// Helper function for using CUDA to add vectors in parallel.
//template <class T>
cudaError_t gpuWrapper(float *mean, int N, vector<string> filelist)
{
int size = N*N;
unsigned long long* dev_sum = 0;
unsigned short* dev_img = 0;
cudaError_t cudaStatus;
const int block_size = 512;
const int num_blocks = (size/block_size) + ((size%block_size) ? 1 : 0);
int L = filelist.size();
// Choose which GPU to run on, change this on a multi-GPU system.
double totalgpuinittime = 0;
StartCounter(7);
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_img, size * sizeof(unsigned short));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_sum, num_blocks*sizeof(unsigned long long));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
totalgpuinittime = GetCounter(7);
unsigned short* img;
unsigned short* pimg;
unsigned long long* sum = new unsigned long long[num_blocks];
unsigned long long* psum = sum;
cout<<endl;
cout << "gpu looping starts, and in progress ..." << endl;
StartCounter(6);
double totalfileiotime = 0;
double totalh2dcpytime = 0;
double totalkerneltime = 0;
double totald2hcpytime = 0;
double totalcpusumtime = 0;
double totalloopingtime = 0;
for (int k = 0; k < L; k++)
{
StartCounter(1);
img = (unsigned short*)LoadTIFF(filelist[k].c_str());
totalfileiotime += GetCounter(1);
psum = sum;
pimg = img;
float gpumean = 0;
memset(psum, 0, sizeof(unsigned long long)*num_blocks);
StartCounter(2);
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_img, pimg, size * sizeof(unsigned short), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_sum, psum, num_blocks*sizeof(unsigned long long), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
totalh2dcpytime += GetCounter(2);
StartCounter(3);
//reduceSum<<<num_blocks,block_size,num_blocks * sizeof(unsigned long long)>>>(dev_img, dev_sum, size);
//reduceSum<<<num_blocks,block_size,block_size * sizeof(unsigned short)>>>(dev_img, dev_sum, size);
reduceSum<<<num_blocks,block_size>>>(dev_img, dev_sum, size);
totalkerneltime += GetCounter(3);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "reduction Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
// !!!!!! following is where the code 77 error occurs!!!!!!!
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
StartCounter(4);
cudaStatus = cudaMemcpy(psum, dev_sum, num_blocks * sizeof(unsigned long long ), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
totald2hcpytime += GetCounter(4);
StartCounter(5);
for (int i = 0; i < num_blocks; i++)
{
gpumean += *psum;
psum++;
}
gpumean /= N*N;
totalcpusumtime += GetCounter(5);
delete img;
img = NULL;
cout<<gpumean<<endl;
}
int S = 1e+6;
int F = filelist.size();
float R = S/F;
totalloopingtime = GetCounter(6);
cout<<"gpu looping ends."<<endl<<endl;
cout<< "analysis:"<<endl;
cout<<"gpu initialization time: "<<totalgpuinittime<<" sec"<<endl<<endl;
cout<<"file I/O time: "<<endl;
cout<<" total "<<totalfileiotime<<" sec | average "<<totalfileiotime*R<<" usec/frame"<<endl<<endl;
cout<<"host-to-device copy time: "<<endl;
cout<<" total "<<totalh2dcpytime<<" sec | average "<<totalh2dcpytime*R<<" usec/frame"<<endl<<endl;
cout<<"pure gpu kerneling time: "<<endl;
cout<<" total "<<totalkerneltime<<" sec | average "<<totalkerneltime*R<<" usec/frame"<<endl<<endl;
cout<<"device-to-host copy time: "<<endl;
cout<<" total "<<totald2hcpytime<<" sec | average "<<totald2hcpytime*R<<" usec/frame"<<endl<<endl;
/*cout<<"cpu summing time: "<<endl;
cout<<" total: "<<totalcpusumtime<<" sec | average: "<<totalcpusumtime*R<<" usec/frame"<<endl<<endl;;*/
/*cout <<"gpu looping time: " << endl;
cout<<" total: "<<totalloopingtime<<" sec | average: "<<totalloopingtime*R<<" usec/frame"<<endl;*/
Error:
cudaFree(dev_sum);
cudaFree(dev_img);
delete sum;
sum = NULL;
return cudaStatus;
}
void kernel(float* &mean, int N, vector<string> filelist)
{
// wrapper and kernel
cudaError_t cudaStatus = gpuWrapper(mean, N, filelist);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "gpuWapper failed!");
}
// printf("mean is: %f\n", mean);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
StartCounter(8);
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceReset failed!");
}
cout<<"gpu reset time: "<<GetCounter(8)<<" sec"<<endl<<endl;
//return *mean;
}
I have assigned enough and equivalent memory space for both host and device memory. Any comments is appreciated.
While this may not be the only source of error in the code, you are not allocating any dynamic shared memory for the reduction kernel, leading to the illegal addressing error you see. The correct kernel launch should be something like
size_t shm_size = block_size * sizeof(unsigned long long);
reduceSum<<<num_blocks,block_size,shm_size>>>(dev_img, dev_sum, size);
This allocates the equivalent of one unsigned long long for each thread running in the reduction kernel, which (by my very cursory reading of your code) should make the shared memory array sdata the correct size for the kernel to run without out-of-bounds access to that array.

Cannot run CUDA kernel : too many resources requested for launch

Take a look on my self written cuda kernel. I had a big kernel but it returned me error message. Then I simplified it and found that it fails on one loop. I simplified this loop and found that if I use int value or constant value to fill data[threadIdx.x] in loop it works fine. But if I use double type value it returns an error.
Advice: if you are not correctly coping your data from host to device, you can get "warning: Cuda API error detected: cudaLaunch returned (0x7)" message when you use Nsight or you can get segmentation fault error when you run your app from terminal
__global__ void sumSeries(double* dSum,int* totalThreadNumber){
volatile __shared__ double data[768];
double var=0;
data[threadIdx.x]=0;
for ( int i = 10 ; i < 20 ;++i){
var=i;
data[threadIdx.x] += (var)/(var*var+1);
__syncthreads();
}
}
Why it does not work?
int main() {
int threadsPerBlock=768;
int blockCount=8;
int *hostThreadNumber=new int ;
*hostThreadNumber=threadsPerBlock*blockCount;
int* deviceThreadNumber=NULL;
double* deviceSum=NULL;
double* hostSum=(double*)malloc(blockCount);
cudaError_t cuerr=cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
if (cuerr != cudaSuccess){
std::cout<<"Cant SetCacheConfig: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr=cudaMalloc(&deviceSum,blockCount*sizeof(double));//размер дабла*число блоков
if (cuerr != cudaSuccess){
std::cout<<"Cant allocate memory for deviceSum: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr=cudaMalloc(&deviceThreadNumber,sizeof(int));
if (cuerr != cudaSuccess){
std::cout<<"Cant allocate memory for deviceThreadNumber: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr = cudaMemcpy(deviceSum,hostSum,blockCount*sizeof(double),cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess){
std::cout<<"Can not copy hostSum to device: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr = cudaMemcpy(deviceThreadNumber,hostThreadNumber,sizeof(int),cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess){
std::cout<<"Can not copy hostThreadNumber to device: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
sumSeries<<<dim3(blockCount),dim3(threadsPerBlock)>>>(deviceSum,deviceThreadNumber);
cuerr=cudaGetLastError();
if (cuerr != cudaSuccess){
std::cout<<"Cuda kernel error: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr= cudaDeviceSynchronize();
if (cuerr != cudaSuccess){
std::cout<<"Can not synchronize cuda kernel : "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr= cudaMemcpy(hostSum,deviceSum,blockCount*sizeof(double),cudaMemcpyDeviceToHost);
if (cuerr != cudaSuccess){
std::cout<<"Can not copy data to host: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cudaFree(deviceSum);
cudaFree(deviceThreadNumber);
return 0;
}
You just allocated 8 bytes memory for hostSum
double* hostSum=(double*)malloc(blockCount)
That's wrong if I assumed that you want to allocate blockCount * sizeof(double) bytes for it, because you allocate this amount of memory for deviceSum and uses it for the memory copy between host and device.
cuerr = cudaMalloc(&deviceSum,blockCount*sizeof(double));
cuerr = cudaMemcpy(deviceSum,hostSum,blockCount*sizeof(double),cudaMemcpyHostToDevice);
cuerr= cudaMemcpy(hostSum,deviceSum,blockCount*sizeof(double),cudaMemcpyDeviceToHost);

CUDA - atomicAdd only adds up to 16777216

I have the following, easily reproducible problem, when running the following kernel, which does nothing except atomicAdds of floats:
#define OUT_ITERATIONS 20000000
#define BLOCKS 12
#define THREADS 192
__global__ void testKernel(float* result) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
float bias = 1.0f;
int n = 1;
while (i < OUT_ITERATIONS) {
atomicAdd(result, bias);
i += BLOCKS * THREADS;
}
}
The kernel is supposed to increment the result OUT_ITERATIONS times, that is 20M. I call the kernel with this standard code:
int main() {
cudaError_t cudaStatus;
float* result;
float* dev_result;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
result = new float;
cudaStatus = cudaMalloc((void**)&dev_result, sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaStatus = cudaMemset(dev_result, 0, sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemset failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
testKernel<<<BLOCKS, THREADS>>>(dev_result);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
cudaStatus = cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
printf("Result: %f\n", *result);
However, the result printed at the end is 16777216.0, which is incidentally 0x1000000 in hex. The problem does not occur if OUT_ITERATIONS < 16777216, that is, if I change it to 16777000 for example, sure enough the output is 16777000.0!
System: NVidia-Titan, CUDA 5.5, Windows7
This issue is due to the limited precision of the type float.
float has only 24bit binary precison. If you add 2 numbers where one is more than 2^24-1 times larger than the other, the result will be exactly the same as the larger one.
When you add a big number like 16777216.0(=2^24) with a tiny number like 1.0, you will lost some precison and the result will still be 16777216.0. The same situations happens in a standard C propgram
float a=16777216.0f;
float b=1.0f;
printf("%f\n",a+b);
You could replace float with double or int to solve this problem.
Please refer to cuda doc for the implementation of the double version of atomicAdd()
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
20M does not fit within the available integer precision in a float.
A float quantity does not have 32 bits of mantissa (you discovered how many mantissa bits there are with your observation of "incidentally 0x1000000 in hex"), so it cannot represent all integers in the same way that a int or unsigned int can.
16777216 is the largest integer that can be reliably stored in a float.
Limit your storage range to what will fit in float, or else use some other representation, such as unsigned int or double if you want to reliably store 20M as an integer.
This isn't really a CUDA issue. You'd have similar difficulty trying to store large integers in a float in host code.

Cuda __syncthreads undefined. Without it-> random results

I am new with cuda and I have a problem. I want to put a synchronization to my threads so I tried to use syncthreads. The problem is that Visual Studio 2010 says: idetifier __syncthreads() is undefined... I am using cuda 4.2 by the way. So I decided to use cudaDeviceSynchronize() instead and call it from host. My code is something like the above (i send to you only the important parts):
__global__ void sum( float avg[]){
avg[0]+=1;
avg[1]+=2;
}
int main(){
float avg[2];
float *devAvg;
cudaError_t cudaStatus;
size_t size=sizeof(unsigned char)*2;
cudaStatus = cudaMalloc((void**)&devAvg, size2);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc 2 failed!");
return -1;
}
avg[0]=0;
avg[1]=0;
cudaStatus = cudaMemcpy(devAvg,avg, size, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
return -1;
}
dim3 nblocks(40,40);
dim3 nthreads(20,20);
sum<<<nblocks,nthreads,msBytes>>>(devAvg);
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
}
cudaStatus = cudaMemcpy(avg,devAvg,size,cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy Device to Host failed!");
return -1;}
cout<<"avg[0]="avg[0]<<" avg[1]="<<avg[1]<<endl;
cudaFree devAvg;
return 0;
}
I thought that the results should be
avg[0]=640.000 avg[1]=1.280.000
but not only my results are different(this could be an overflow problem) but they does not be stable. For example for three different executions the results are:
avg[0]=3041 avg[1]=6604
avg[0]=3015 avg[1]=6578
avg[0]=3047 avg[1]=6600
So what I am doing wrong here?Is it a synchronization problem?And why I cannot use __syncthreads()
Or is it the problem of race conditions?
Additionally for the __syncthreads() problem it comes with any code that I write. Even the simplest one:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <Windows.h>
// Kernel that executes on the CUDA device
__global__ void square_array(float *a, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<N) a[idx] = a[idx] * a[idx];
__syncthreads();
}
// main routine that executes on the host
int main(void)
{
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 10; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 4;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
square_array <<< n_blocks, block_size >>> (a_d, N);
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
return 0;
}
It is saying this: Error: identifier "__syncthreads()" is undefined
The funny part is that even with the sample codes that comes with the 4.2 CUDA SDK the same thing happens... Maybe is something more general wrong because there are more functions in the SDK samples that are considered undefined.
All of your blocks of threads are writing to the same two locations. The only way to make this work properly is to use atomic operations. Otherwise the results of threads reading the location, adding to it and writing the result back to the location "simultaneously" is undefined.
If you rewrite your kernel as follows:
__global__ void sum( float avg[]){
atomicAdd(&(avg[0]),1);
atomicAdd(&(avg[1]),2);
}
It should resolve the issue you are seeing.
To answer the question about __syncthreads(), I would need to see the exact code that caused the compiler error. If you post that, I'll update my answer. There shouldn't be a problem with inserting a __syncthreads() call in this kernel, although it won't fix the problem you are seeing.
You may wish to review the atomic operations section of the C programming guide.
Note that using atomics generally will cause your code to run slower, so they should be used carefully. However for this learning exercise it should sort out the issue for you.
also note that the code you posted doesn't compile cleanly, there are a number of missing definitions, and a variety of other issues with your code. But since you are posting results, I assume you have some version of this working, even though you haven't posted it. Therefore I haven't identified every issue with the code that you have posted.
Here is code that is similar to yours with all of the various coding issues fixed, and it seems to work for me:
#include <stdio.h>
#include <iostream>
#define msBytes 0
__global__ void sum( float avg[]){
atomicAdd(&(avg[0]),1);
atomicAdd(&(avg[1]),2);
}
int main(){
float avg[2];
float *devAvg;
cudaError_t cudaStatus;
size_t size=sizeof(float)*2;
cudaStatus = cudaMalloc((void**)&devAvg, size);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc 2 failed!");
return -1;
}
avg[0]=0;
avg[1]=0;
cudaStatus = cudaMemcpy(devAvg,avg, size, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
return -1;
}
dim3 nblocks(40,40);
dim3 nthreads(20,20);
sum<<<nblocks,nthreads,msBytes>>>(devAvg);
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
}
cudaStatus = cudaMemcpy(avg,devAvg,size,cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy Device to Host failed!");
return -1;}
std::cout<<"avg[0]="<<avg[0]<<" avg[1]="<<avg[1]<<std::endl;
cudaFree(devAvg);
return 0;
}
I get the following output when I run it:
avg[0]=640000 avg[1]=1.28e+06
Also note that for atomicAdd to be usable on float, it's necessary to have a compute capability 2.0 or better device (and to pass the compiler switch e.g. -arch=sm_20 to compile for that kind of device). If you have an earlier device (compute capability 1.x) then you can create a similar program defining avg[] as int instead of float. Or if you prefer, you can create your own atomicAdd __ device__ function that is usable on a cc 1.x device as suggested here in the section beginning with "Note however that any atomic operation can be implemented based on atomicCAS() (Compare And Swap). ".