Checking currently residing entities in GPU memory - cuda

What would be the easiest way of checking which entities (and their sizes) allocated with cudaMalloc() currently reside on a GPU device? I want to find a memory leak inside a function: if it is called just once and the program exits, there is no memory leak (checked via cuda-memcheck), but if it is called multiple times the memory footprint gets bigger and bigger.
The Nsight Visual Profiler seems too complex for what I am asking, and cuda-memcheck finds no leak!

There is no way to do this with the CUDA APIs. If you did want to do this, you would need to build your own instrumentation that wraps the CUDA memory allocation/deallocation API calls your code makes. The simplest implementation could look something like this:
#include <iostream>
#include <vector>
#include <algorithm>

typedef std::pair<void*, size_t> mrecord;

struct mymemory
{
    std::vector<mrecord> mstack;

    mymemory() {};
    cudaError_t cudaMalloc(void** p, size_t sz);
    cudaError_t cudaFree(void* p);
    void print_stack();
};

cudaError_t mymemory::cudaMalloc(void** p, size_t sz)
{
    cudaError_t ret = ::cudaMalloc(p, sz);
    if (ret == cudaSuccess) {
        mstack.push_back(mrecord(*p,sz));
    }
    return ret;
};

cudaError_t mymemory::cudaFree(void* p)
{
    cudaError_t ret = ::cudaFree(p);
    if (ret == cudaSuccess) {
        auto rit = std::find_if( mstack.begin(), mstack.end(),
                                 [&](const mrecord& r){ return r.first == p; } );
        if (rit != mstack.end()) {
            mstack.erase(rit);
        }
    }
    return ret;
};

void mymemory::print_stack()
{
    auto it = mstack.begin();
    for(; it != mstack.end(); ++it) {
        mrecord rec = *it;
        std::cout << rec.first << " : " << rec.second << std::endl;
    }
}

int main(void)
{
    const int nallocs = 10;
    void* pointers[nallocs];

    mymemory mdebug;
    for(int i=0; i<nallocs; ++i) {
        mdebug.cudaMalloc(&pointers[i], 4<<i);
    }
    std::cout << "After Allocation" << std::endl;
    mdebug.print_stack();

    mdebug.cudaFree(pointers[1]);
    mdebug.cudaFree(pointers[7]);
    mdebug.cudaFree(pointers[8]);
    mdebug.cudaFree(0);

    std::cout << "After Deallocation" << std::endl;
    mdebug.print_stack();

    return 0;
}
[Warning: only very lightly tested; requires C++11 compiler support]
which would do this:
~/SO$ nvcc -std=c++11 -g -arch=sm_52 instrumentation.cu
~/SO$ ./a.out
After Allocation
0x705e40000 : 4
0x705e40200 : 8
0x705e40400 : 16
0x705e40600 : 32
0x705e40800 : 64
0x705e40a00 : 128
0x705e40c00 : 256
0x705e40e00 : 512
0x705e41000 : 1024
0x705f40000 : 2048
After Deallocation
0x705e40000 : 4
0x705e40400 : 16
0x705e40600 : 32
0x705e40800 : 64
0x705e40a00 : 128
0x705e40c00 : 256
0x705f40000 : 2048
This might be enough to understand which memory allocations are leaking. But be aware that memory management on the GPU isn't as predictable as you might believe it to be, and you need to be careful when diagnosing a memory leak just on the basis of the amount of free memory which the device reports at any given instant. See this question for some more details.
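If all you need is a coarse check that the footprint grows between calls, cudaMemGetInfo lets you snapshot the free/total device memory around the suspect function. A minimal sketch (the reported numbers include allocator and context overhead, so treat deltas as approximate):

#include <cstdio>
#include <cuda_runtime.h>

// Print free/total device memory with a tag, e.g. before and after the suspect function.
void report_device_memory(const char* tag)
{
    size_t free_bytes = 0, total_bytes = 0;
    if (cudaMemGetInfo(&free_bytes, &total_bytes) == cudaSuccess) {
        std::printf("%s: %zu bytes free of %zu total\n", tag, free_bytes, total_bytes);
    }
}

Calling report_device_memory("before") and report_device_memory("after") around each invocation of the leaky function will show whether the free pool shrinks on every call.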

Related

CUDA: Invalid Device Pointer error when reallocating memory

In the following code I am simply calling a function foo twice, serially, from main. The function simply allocates device memory and then increments this pointer. Then it exits and goes back to main.
The first time foo is called, memory is correctly allocated. But, as you can see in the output, when I call foo again the CUDA memory allocation fails with the error invalid device pointer.
I tried using cudaThreadSynchronize() between the two foo calls, but it didn't help. Why is the memory allocation failing?
Actually the error is caused by
matrixd += 3;
because if I don't do this increment, the error disappears.
But why, even though I am using cudaFree()?
Kindly help me understand this.
My output is here:
Calling foo for the first time
Allocation of matrixd passed:
I came back to main safely :-)
I am going back to foo again :-)
Allocation of matrixd failed, the reason is: invalid device pointer
My main() is here
#include <stdio.h>
#include <cstdlib>   // malloc(), free()
#include <iostream>  // cout, stream
#include <math.h>
#include <ctime>     // time(), clock()
#include <bitset>

bool foo( );

/***************************************
Main method.
****************************************/
int main()
{
    // Perform one warm-up pass and validate
    std::cout << "Calling foo for the first time" << std::endl;
    foo();
    std::cout << "I came back to main safely :-) " << std::endl;
    std::cout << "I am going back to foo again :-) " << std::endl;
    foo( );
    getchar();
    return 0;
}
Definition of foo() is in this file :
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
#include <iostream>

bool foo( )
{
    // Error return value
    cudaError_t status;
    // Number of bytes in the matrix.
    int bytes = 9 * sizeof(float);
    // Pointers to the device arrays
    float *matrixd = NULL;

    // Allocate memory on the device to store matrix
    cudaMalloc((void**) &matrixd, bytes);
    status = cudaGetLastError(); // To check the error
    if (status != cudaSuccess) {
        std::cout << "Allocation of matrixd failed, the reason is: "
                  << cudaGetErrorString(status) << std::endl;
        cudaFree(matrixd); // Free call for memory
        return false;
    }
    std::cout << "Allocation of matrixd passed: " << std::endl;

    ////// Increment address
    for (int i=0; i<3; i++){
        matrixd += 3;
    }

    // Free device memory
    cudaFree(matrixd);
    return true;
}
Update
With better error checking. Also, I am incrementing the device pointer only once. This time I get the following output:
Calling foo for the first time
Allocation of matrixd passed:
Increamented the pointer and going to free cuda memory:
GPUassert: invalid device pointer C:/Users/user/Desktop/Gauss/Gauss/GaussianEliminationGPU.cu 44
Line number 44 is the cudaFree() call. Why is it still failing?
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// GPU function for the direct Gauss-Jordan method.
bool foo( )
{
    // Error return value
    cudaError_t status;
    // Number of bytes in the matrix.
    int bytes = 9 * sizeof(float);
    // Pointers to the device arrays
    float *matrixd = NULL;

    // Allocate memory on the device to store each matrix
    gpuErrchk( cudaMalloc((void**) &matrixd, bytes));
    //cudaMemset(outputMatrixd, 0, bytes);
    std::cout << "Allocation of matrixd passed: " << std::endl;

    ////// Increment address
    matrixd += 1;
    std::cout << "Increamented the pointer and going to free cuda memory: " << std::endl;

    // Free device memory
    gpuErrchk( cudaFree(matrixd));
    return true;
}
The real problem is in this code:
for (int i=0; i<3; i++){
    matrixd += 3;
}
// Free device memory
cudaFree(matrixd);
You never allocated matrixd+9, so passing it to cudaFree is illegal and produces an invalid device pointer error. This error is being propagated to the next time you perform error checking, which is after the subsequent call to cudaMalloc. If you read the documentation for any of these API calls you will note that there is a warning that they can return errors from prior GPU operations. This is what is happening in this case.
Error checking in the CUDA runtime API can be subtle to do correctly. There is a robust, ready recipe for how to do it here. I suggest you use it.
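For reference, a sketch of foo() that does any pointer arithmetic on a separate cursor, so that cudaFree receives exactly the pointer cudaMalloc returned (it reuses the gpuErrchk macro from above; the cursor is purely illustrative):

bool foo()
{
    const int bytes = 9 * sizeof(float);
    float* matrixd = NULL;
    gpuErrchk(cudaMalloc((void**)&matrixd, bytes));

    // Walk through the buffer with a copy of the pointer if you need to...
    float* cursor = matrixd;
    cursor += 3;

    // ...but always free the original allocation pointer.
    gpuErrchk(cudaFree(matrixd));
    return true;
}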

Counting occurrences of specific events in CUDA kernels

Problem
I am trying to find the best way to count how many times my program ends up in some specific branches of my CUDA kernels. The idea is that some events should nearly never happen, but since the data processed by the GPU is given by a numerical optimization solver, there may be some situations where ill-defined cases become more common. Thus, I want to be able to track/monitor these phenomena over multiple simulations to make some global statistics later.
Possible idea
The most straightforward way to do this may be to use a structure dedicated to monitoring such occurrences. Then, when entering a monitored branch, we increment the associated counter using atomicAdd. At the end of the simulation, we copy the counters back to the host and store them for some future statistics processing.
In my case, the cost of using atomicAdd should not be that important since I should not be entering those branches very often, but I may want to monitor some of the more common branches later on, so what would be a better approach then? Since this is just for monitoring, I do not want the overhead to be too significant.
I guess I could also have one monitoring structure per block and do a sum at the end, since it should not use much global memory anyway (1 unsigned int per monitored branch).
Code example
#include <iostream>
#include <cstdlib>   // rand(), srand()
#include <cstring>   // memset()
#include <time.h>
#include <cuda.h>
#include <stdio.h>

#define CUDA_CHECK_ERROR() __cuda_check_errors(__FILE__, __LINE__)
#define CUDA_SAFE_CALL(err) __cuda_safe_call(err, __FILE__, __LINE__)

inline void __cuda_check_errors(const char *filename, const int line_number)
{
    cudaError err = cudaDeviceSynchronize();
    if(err != cudaSuccess)
    {
        printf("CUDA error %i at %s:%i: %s\n",
               err, filename, line_number, cudaGetErrorString(err));
        exit(-1);
    }
}

inline void __cuda_safe_call(cudaError err, const char *filename, const int line_number)
{
    if (err != cudaSuccess)
    {
        printf("CUDA error %i at %s:%i: %s\n",
               err, filename, line_number, cudaGetErrorString(err));
        exit(-1);
    }
}

struct Stats
{
    unsigned int even;
};

__global__ void test_kernel(int* A, int* B, Stats* stats)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int res = A[tid] + (int)tid;
    if (res%2 == 0)
        atomicAdd(&(stats->even), 1);
    B[tid] = res;
}

int get_random_int(int min, int max)
{
    return min + (rand() % (int)(max - min + 1));
}

void print_array(int* ar, unsigned int n)
{
    for (unsigned int i = 0; i < n; ++i)
        std::cout << ar[i] << " ";
    std::cout << std::endl;
}

void print_stats(Stats* s)
{
    std::cout << "even: " << s->even << std::endl;
}

int main()
{
    // vector size
    const unsigned int N = 10;

    // device vectors
    int *d_A, *d_B;
    Stats *d_stats;

    // host vectors
    int *h_A, *h_B;
    Stats *h_stats;

    // allocate device memory
    CUDA_SAFE_CALL(cudaMalloc(&d_A, N * sizeof(int)));
    CUDA_SAFE_CALL(cudaMalloc(&d_B, N * sizeof(int)));
    CUDA_SAFE_CALL(cudaMalloc(&d_stats, sizeof(Stats)));

    // allocate host memory
    h_A = new int[N];
    h_B = new int[N];
    h_stats = new Stats;

    // initialize host data
    srand(time(NULL));
    for (unsigned int i = 0; i < N; ++i)
    {
        h_A[i] = get_random_int(0,10);
        h_B[i] = 0;
    }
    memset(h_stats, 0, sizeof(Stats));

    // copy data to the device
    CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, N * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_stats, h_stats, sizeof(Stats), cudaMemcpyHostToDevice));

    // launch kernel
    dim3 grid_size, block_size;
    grid_size.x = N;
    test_kernel<<<grid_size, block_size>>>(d_A, d_B, d_stats);

    // copy result back to host
    CUDA_SAFE_CALL(cudaMemcpy(h_B, d_B, N * sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaMemcpy(h_stats, d_stats, sizeof(Stats), cudaMemcpyDeviceToHost));

    print_array(h_B, N);
    print_stats(h_stats);

    // free device memory
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_stats));

    // free host memory
    delete [] h_A;
    delete [] h_B;
    delete h_stats;
}
Hardware/software information
The solution I am looking for should work for CC >= 2.0 devices and CUDA >= 5.0.
Using atomicAdd is one possibility, and I would probably go that route. If you do not use the return value of the atomicAdd function call, the compiler will emit a reduction operation such as RED.E.ADD. Reduction is very fast as long as there are not many conflicts happening (I actually use it sometimes even if I do not need the operation to be atomic, because it can be quicker than loading a value from global memory, doing an arithmetic operation and saving it back to global memory).
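If a monitored branch later becomes hot, the per-block variant suggested in the question is one way to reduce contention on the global counter: accumulate in shared memory and issue a single global atomicAdd per block. A rough sketch of such a kernel, based on test_kernel above (illustrative only; assumes a 1D block and a single monitored counter):

__global__ void test_kernel_blockwise(int* A, int* B, Stats* stats)
{
    __shared__ unsigned int block_even;          // per-block counter
    if (threadIdx.x == 0) block_even = 0;
    __syncthreads();

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int res = A[tid] + (int)tid;
    if (res % 2 == 0)
        atomicAdd(&block_even, 1);               // shared-memory atomic, low contention
    B[tid] = res;

    __syncthreads();
    if (threadIdx.x == 0)
        atomicAdd(&(stats->even), block_even);   // one global atomic per block
}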
The second option you have is to use a profiler counter and use the profiler to analyze the result. Please see Profiler Counter Function for more details.

CUDA pinned memory flushing from the device

CUDA 5, device capabilities 3.5, VS 2012, 64bit Win 2012 Server.
There is no shared memory access between threads, every thread is standalone.
I am using pinned memory with zero-copy. From the host, I can only read the pinned memory the device has written after I issue a cudaDeviceSynchronize on the host.
I want to be able to:
Flush into the pinned memory as soon as the device has updated it.
Not block the device thread (maybe by copying asynchronously)
I tried calling __threadfence_system and __threadfence after each device write, but that didn't flush.
Below is a full sample CUDA code that demonstrates my question:
#include <conio.h>
#include <cstdio>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    printf("Kernel %u: Before Writing in Kernel\n", tid);
    hResult[tid] = tid + 1;
    __threadfence_system();
    // expecting that the data is getting flushed to host here!
    printf("Kernel %u: After Writing in Kernel\n", tid);
    // time waster for-loop (sleep)
    for (int timeWater = 0; timeWater < 100000000; timeWater++);
}

void main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks*sizeof(float), cudaHostAllocMapped);
    Kernel<<<1,blocks>>>(hResult);

    int filledElementsCounter = 0;
    // naive thread implementation that could be implemented using
    // another host thread
    while (filledElementsCounter < blocks)
    {
        // blocks until the value changes, this moves sequentially
        // while threads have no order (fine for this sample).
        while(hResult[filledElementsCounter] == 0);
        printf("%f\n", hResult[filledElementsCounter]);
        filledElementsCounter++;
    }
    cudaFreeHost((void *)hResult);
    system("pause");
}
Currently this sample will wait indefinitely as nothing is being read from the device unless I issue cudaDeviceSynchronize. The sample below works, but it is NOT what I want as it defeats the purpose of async copying:
void main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks*sizeof(float), cudaHostAllocMapped);
    Kernel<<<1,blocks>>>(hResult);

    cudaError_t error = cudaDeviceSynchronize();
    if (error != cudaSuccess) { throw; }

    for(int i = 0; i < blocks; i++)
    {
        printf("%f\n", hResult[i]);
    }
    cudaFreeHost((void *)hResult);
    system("pause");
}
I played with your code on CentOS 6.2 with CUDA 5.5 and a Tesla M2090 and can conclude this:
The problem that it does not work on your system must be a driver issue, and I suggest that you get the TCC drivers.
I attached my code, which runs fine and does what you want. The values appear on the host side before the kernel ends. As you can see, I added some compute code to prevent the for loop from being removed by compiler optimizations. I added a stream and a callback that gets executed after all work in the stream is finished. The program outputs 1 2 and then does nothing for a long time, until stream finished... is printed to the console.
#include <iostream>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define SEC_CUDA_CALL(val) checkCall ( (val), #val, __FILE__, __LINE__ )

bool checkCall(cudaError_t result, char const* const func, const char *const file, int const line)
{
    if (result != cudaSuccess)
    {
        std::cout << "CUDA (runtime api) error: " << func << " failed! "
                  << cudaGetErrorString(result) << " (" << result << ") "
                  << file << ":" << line << std::endl;
    }
    return result != cudaSuccess;
}

class Callback
{
public:
    static void CUDART_CB dispatch(cudaStream_t stream, cudaError_t status, void *userData);
private:
    void call();
};

void CUDART_CB Callback::dispatch(cudaStream_t stream, cudaError_t status, void *userData)
{
    Callback* cb = (Callback*) userData;
    cb->call();
}

void Callback::call()
{
    std::cout << "stream finished..." << std::endl;
}

__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    hResult[tid] = tid + 1;
    __threadfence_system();
    float A = 0;
    for (int timeWater = 0; timeWater < 100000000; timeWater++)
    {
        A = sin(cos(log(hResult[0] * hResult[1]))) + A;
        A = sqrt(A);
    }
}

int main(int argc, char* argv[])
{
    size_t blocks = 2;
    volatile float* hResult;
    SEC_CUDA_CALL(cudaHostAlloc((void**)&hResult,blocks*sizeof(float),cudaHostAllocMapped));

    cudaStream_t stream;
    SEC_CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    Callback obj;

    Kernel<<<1,blocks,NULL,stream>>>(hResult);
    SEC_CUDA_CALL(cudaStreamAddCallback(stream, Callback::dispatch, &obj, 0));

    int filledElementsCounter = 0;
    while (filledElementsCounter < blocks)
    {
        while(hResult[filledElementsCounter] == 0);
        std::cout << hResult[filledElementsCounter] << std::endl;
        filledElementsCounter++;
    }

    SEC_CUDA_CALL(cudaStreamDestroy(stream));
    SEC_CUDA_CALL(cudaFreeHost((void *)hResult));
}
No call returned an error and cuda-memcheck didn't find any problems. This works as intended. You should really try the TCC driver.
You cannot pass the host pointer directly to the kernel. If you allocate host memory using cudaHostAlloc with the cudaHostAllocMapped flag, then you first have to retrieve the device pointer of the mapped host memory before you can use it in the kernel. Use cudaHostGetDevicePointer to get the device pointer of mapped host memory.
float* hResult, *dResult;
cudaHostAlloc((void**)&hResult, blocks*sizeof(float), cudaHostAllocMapped);
cudaHostGetDevicePointer((void**)&dResult, hResult, 0);
Kernel<<<1,blocks>>>(dResult);
Calling __threadfence_system() will ensure that the write is visible to the system before proceeding, but your CPU will be caching the hResult variable and hence you're just spinning on the old value in an infinite loop. Try marking hResult as volatile.
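Putting the two suggestions together, a minimal host-side sketch that reuses the Kernel and blocks from the question (error checking omitted; cudaSetDeviceFlags is only needed on platforms where mapped pinned memory is not enabled by default, and it must be called before the CUDA context is created):

volatile float* hResult = NULL;
float* dResult = NULL;

cudaSetDeviceFlags(cudaDeviceMapHost);       // enable mapped pinned memory
cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
cudaHostGetDevicePointer((void**)&dResult, (void*)hResult, 0);

Kernel<<<1, blocks>>>(dResult);              // the device writes through the mapped pointer

// hResult is volatile, so each iteration re-reads memory rather than a cached register value.
while (hResult[0] == 0) { /* spin until the device's write becomes visible */ }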

CUDA __syncthreads undefined. Without it -> random results

I am new to CUDA and I have a problem. I want to add synchronization to my threads, so I tried to use __syncthreads. The problem is that Visual Studio 2010 says: identifier __syncthreads() is undefined... I am using CUDA 4.2, by the way. So I decided to use cudaDeviceSynchronize() instead and call it from the host. My code is something like the below (I am sending you only the important parts):
__global__ void sum( float avg[]){
    avg[0]+=1;
    avg[1]+=2;
}

int main(){
    float avg[2];
    float *devAvg;
    cudaError_t cudaStatus;
    size_t size=sizeof(unsigned char)*2;

    cudaStatus = cudaMalloc((void**)&devAvg, size2);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc 2 failed!");
        return -1;
    }

    avg[0]=0;
    avg[1]=0;
    cudaStatus = cudaMemcpy(devAvg,avg, size, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        return -1;
    }

    dim3 nblocks(40,40);
    dim3 nthreads(20,20);
    sum<<<nblocks,nthreads,msBytes>>>(devAvg);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    }

    cudaStatus = cudaMemcpy(avg,devAvg,size,cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy Device to Host failed!");
        return -1;
    }

    cout<<"avg[0]="avg[0]<<" avg[1]="<<avg[1]<<endl;

    cudaFree devAvg;
    return 0;
}
I thought that the results should be
avg[0]=640,000 avg[1]=1,280,000
but not only are my results different (this could be an overflow problem), they are also not stable. For example, for three different executions the results are:
avg[0]=3041 avg[1]=6604
avg[0]=3015 avg[1]=6578
avg[0]=3047 avg[1]=6600
So what am I doing wrong here? Is it a synchronization problem? And why can I not use __syncthreads()?
Or is it a problem of race conditions?
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <Windows.h>
// Kernel that executes on the CUDA device
__global__ void square_array(float *a, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<N) a[idx] = a[idx] * a[idx];
__syncthreads();
}
// main routine that executes on the host
int main(void)
{
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 10; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 4;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
square_array <<< n_blocks, block_size >>> (a_d, N);
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
return 0;
}
It is saying this: Error: identifier "__syncthreads()" is undefined
The funny part is that the same thing happens even with the sample codes that come with the CUDA 4.2 SDK... Maybe something more general is wrong, because there are more functions in the SDK samples that are considered undefined.
All of your blocks of threads are writing to the same two locations. The only way to make this work properly is to use atomic operations. Otherwise the results of threads reading the location, adding to it and writing the result back to the location "simultaneously" are undefined.
If you rewrite your kernel as follows:
__global__ void sum( float avg[]){
    atomicAdd(&(avg[0]),1);
    atomicAdd(&(avg[1]),2);
}
It should resolve the issue you are seeing.
To answer the question about __syncthreads(), I would need to see the exact code that caused the compiler error. If you post that, I'll update my answer. There shouldn't be a problem with inserting a __syncthreads() call in this kernel, although it won't fix the problem you are seeing.
You may wish to review the atomic operations section of the C programming guide.
Note that using atomics generally will cause your code to run slower, so they should be used carefully. However for this learning exercise it should sort out the issue for you.
Also note that the code you posted doesn't compile cleanly; there are a number of missing definitions and a variety of other issues with it. But since you are posting results, I assume you have some version of this working, even though you haven't posted it. Therefore I haven't identified every issue with the code that you have posted.
Here is code that is similar to yours with all of the various coding issues fixed, and it seems to work for me:
#include <stdio.h>
#include <iostream>

#define msBytes 0

__global__ void sum( float avg[]){
    atomicAdd(&(avg[0]),1);
    atomicAdd(&(avg[1]),2);
}

int main(){
    float avg[2];
    float *devAvg;
    cudaError_t cudaStatus;
    size_t size=sizeof(float)*2;

    cudaStatus = cudaMalloc((void**)&devAvg, size);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc 2 failed!");
        return -1;
    }

    avg[0]=0;
    avg[1]=0;
    cudaStatus = cudaMemcpy(devAvg,avg, size, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        return -1;
    }

    dim3 nblocks(40,40);
    dim3 nthreads(20,20);
    sum<<<nblocks,nthreads,msBytes>>>(devAvg);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    }

    cudaStatus = cudaMemcpy(avg,devAvg,size,cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy Device to Host failed!");
        return -1;
    }

    std::cout<<"avg[0]="<<avg[0]<<" avg[1]="<<avg[1]<<std::endl;

    cudaFree(devAvg);
    return 0;
}
I get the following output when I run it:
avg[0]=640000 avg[1]=1.28e+06
Also note that for atomicAdd to be usable on float, it's necessary to have a compute capability 2.0 or better device (and to pass the compiler switch, e.g. -arch=sm_20, to compile for that kind of device). If you have an earlier device (compute capability 1.x) then you can create a similar program defining avg[] as int instead of float. Or, if you prefer, you can create your own atomicAdd __device__ function that is usable on a cc 1.x device, as suggested here in the section beginning with "Note however that any atomic operation can be implemented based on atomicCAS() (Compare And Swap).".
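For completeness, here is a sketch of such a user-defined float atomic add built on atomicCAS, following the pattern described in that section of the programming guide (the name atomicAddFloat is mine):

// Float atomic add for devices without native float atomicAdd (cc 1.x).
__device__ float atomicAddFloat(float* address, float val)
{
    int* address_as_int = (int*)address;
    int old = *address_as_int, assumed;
    do {
        assumed = old;
        // Attempt to swap in the new value; retry if another thread changed it first.
        old = atomicCAS(address_as_int, assumed,
                        __float_as_int(val + __int_as_float(assumed)));
    } while (assumed != old);
    return __int_as_float(old);
}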

Contiguous Memory Allocation on GPU

Does cudaMalloc allocate contiguous chunks of memory (i.e., physical bytes next to each other)?
I have a piece of CUDA code that simply copies 128 bytes from global device memory to shared memory, using 32 threads. I am trying to find a way to guarantee that this transfer can be completed in one memory transaction of 128 bytes. If cudaMalloc allocates contiguous memory blocks, then it can easily be done.
Following is the code:
#include <iostream>
using namespace std;

#define SIZE 32        // SIZE of the array to store in shared memory
#define NUMTHREADS 32

__global__ void copy(uint* memPointer){
    extern __shared__ uint bits[];
    int tid = threadIdx.x;
    bits[tid] = memPointer[tid];
}

int main(){
    uint inputData[SIZE];
    uint* storedData;
    for(int i=0; i<SIZE; i++){
        inputData[i] = i;
    }

    cudaError_t e1 = cudaMalloc((void**) &storedData, sizeof(uint)*SIZE);
    if(e1 == cudaSuccess){
        cudaError_t e3 = cudaMemcpy(storedData, inputData, sizeof(uint)*SIZE, cudaMemcpyHostToDevice);
        if(e3 == cudaSuccess){
            copy<<<1, NUMTHREADS, SIZE*4>>>(storedData);
            cudaError_t e6 = cudaFree(storedData);
            if(e6 == cudaSuccess){
            }
            else{
                cout << "Error freeing memory storedData" << e6 << endl;
            }
        }
        else{
            cout << "Failed to copy" << " " << e3 << endl;
        }
    }
    else{
        cout << "Failed to allocate memory" << " " << e1 << endl;
    }
    return 0;
}
Yes, cudaMalloc allocates contiguous chunks of memory. The "Matrix Transpose" example in the SDK (http://developer.nvidia.com/cuda-cc-sdk-code-samples) has a kernel called "copySharedMem" that does almost exactly what you're describing.
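As a side note, pointers returned by cudaMalloc are also generously aligned (the programming guide says to at least 256 bytes), so 32 consecutive uint loads starting at the base of the allocation fall within a single 128-byte segment. A quick host-side sanity check, as a sketch:

#include <cstdio>
#include <cstdint>
#include <cuda_runtime.h>

int main()
{
    unsigned int* storedData = NULL;
    if (cudaMalloc((void**)&storedData, 32 * sizeof(unsigned int)) == cudaSuccess) {
        // 128-byte alignment means a warp reading storedData[0..31] touches exactly
        // one 128-byte segment, i.e. the copy can be serviced by a single transaction.
        std::printf("base address %p, 128-byte aligned: %s\n", (void*)storedData,
                    ((uintptr_t)storedData % 128 == 0) ? "yes" : "no");
        cudaFree(storedData);
    }
    return 0;
}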