CUDA atomic function usage with volatile shared memory

I have a CUDA kernel that needs to use an atomic function on volatile shared integer memory. However, when I try to declare the shared memory as volatile and use it in an atomic function, I get an error message.
Below is some minimalist code that reproduces the error. Please note that the following kernel does nothing and horribly abuses any reason you would ever have for declaring shared memory as volatile (or even using shared memory at all), but it does reproduce the error.
The code uses atomic functions on shared memory, so, to run it, you probably need to compile for compute capability 1.2 or higher (in Visual Studio 2010, right-click on your project, go to "Properties -> Configuration Properties -> CUDA C/C++ -> Device", and enter "compute_12,sm_12" in the "Code Generation" line). The code should otherwise compile as is.
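Equivalently, building from the command line with something like the following should work (the file name here is just an example, not from the original post):
nvcc -gencode arch=compute_12,code=sm_12 -o atomic_volatile atomic_volatile.cu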
#include <cstdlib>
#include <cuda_runtime.h>
static int const X_THRDS_PER_BLK = 32;
static int const Y_THRDS_PER_BLK = 8;
__global__ void KernelWithSharedMemoryAndAtomicFunction(int * d_array, int numTotX, int numTotY)
{
    __shared__ int s_blk[Y_THRDS_PER_BLK][X_THRDS_PER_BLK]; // compiles
    //volatile __shared__ int s_blk[Y_THRDS_PER_BLK][X_THRDS_PER_BLK]; // will not compile
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int mx = blockIdx.x*blockDim.x + threadIdx.x;
    int my = blockIdx.y*blockDim.y + threadIdx.y;
    int mi = my*numTotX + mx;
    if (mx < numTotX && my < numTotY)
    {
        s_blk[ty][tx] = d_array[mi];
        __syncthreads();
        atomicMin(&s_blk[ty][tx], 4); // will compile with volatile shared memory only if this line is commented out
        __syncthreads();
        d_array[mi] = s_blk[ty][tx];
    }
}
int main(void)
{
    // Declare and initialize some array on host
    int const NUM_TOT_X = 4*X_THRDS_PER_BLK;
    int const NUM_TOT_Y = 6*Y_THRDS_PER_BLK;
    int * h_array = (int *)malloc(NUM_TOT_X*NUM_TOT_Y*sizeof(int));
    for (int i = 0; i < NUM_TOT_X*NUM_TOT_Y; ++i) h_array[i] = i;
    // Copy array to device
    int * d_array;
    cudaMalloc((void **)&d_array, NUM_TOT_X*NUM_TOT_Y*sizeof(int));
    cudaMemcpy(d_array, h_array, NUM_TOT_X*NUM_TOT_Y*sizeof(int), cudaMemcpyHostToDevice);
    // Declare block and thread variables
    dim3 thdsPerBlk;
    dim3 blks;
    thdsPerBlk.x = X_THRDS_PER_BLK;
    thdsPerBlk.y = Y_THRDS_PER_BLK;
    thdsPerBlk.z = 1;
    blks.x = (NUM_TOT_X + X_THRDS_PER_BLK - 1)/X_THRDS_PER_BLK;
    blks.y = (NUM_TOT_Y + Y_THRDS_PER_BLK - 1)/Y_THRDS_PER_BLK;
    blks.z = 1;
    // Run kernel
    KernelWithSharedMemoryAndAtomicFunction<<<blks, thdsPerBlk>>>(d_array, NUM_TOT_X, NUM_TOT_Y);
    // Cleanup
    free(h_array);
    cudaFree(d_array);
    return 0;
}
Anyway, if you comment out the "s_blk" declaration towards the top of the kernel and uncomment the commented-out declaration immediately following it, then you should get the following error:
error : no instance of overloaded function "atomicMin" matches the argument list
I do not understand why declaring the shared memory as volatile would affect its type, as (I think) this error message is indicating, nor why it cannot be used with atomic operations.
Can anyone please provide any insight?
Thanks,
Aaron

Just replace
atomicMin(&s_blk[ty][tx], 4);
with
atomicMin((int *)&s_blk[ty][tx], 4);
The cast turns &s_blk[ty][tx] from a volatile int * into the int * that atomicMin() expects. None of the atomicMin overloads takes a volatile-qualified pointer, and a volatile int * will not implicitly convert to int *, which is why the compiler reports that no instance matches the argument list.
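For reference, here is the kernel from the question with that one-line change applied (a sketch only; casting away the volatile qualifier only changes which overload is selected, not the atomicity of the update):
__global__ void KernelWithSharedMemoryAndAtomicFunction(int * d_array, int numTotX, int numTotY)
{
    volatile __shared__ int s_blk[Y_THRDS_PER_BLK][X_THRDS_PER_BLK];
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int mx = blockIdx.x*blockDim.x + threadIdx.x;
    int my = blockIdx.y*blockDim.y + threadIdx.y;
    int mi = my*numTotX + mx;
    if (mx < numTotX && my < numTotY)
    {
        s_blk[ty][tx] = d_array[mi];
        __syncthreads();
        atomicMin((int *)&s_blk[ty][tx], 4); // cast drops the volatile qualifier so the int* overload matches
        __syncthreads();
        d_array[mi] = s_blk[ty][tx];
    }
}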

Related

Difficulty using atomicMin to find minimum value in a matrix

I'm having trouble using atomicMin to find the minimum value in a matrix in CUDA. I'm sure it has something to do with the parameters I'm passing to the atomicMin function. The findMin function is the one to focus on; the popMatrix function just populates the matrix.
#include <stdio.h>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#define SIZE 4
__global__ void popMatrix(unsigned *matrix) {
    unsigned id, num;
    curandState_t state;
    id = threadIdx.x * blockDim.x + threadIdx.y;
    // Populate matrix with random numbers
    curand_init(id, 0, 0, &state);
    num = curand(&state)%100;
    matrix[id] = num;
}
__global__ void findMin(unsigned *matrix, unsigned *temp) {
    unsigned id;
    id = threadIdx.x * blockDim.y + threadIdx.y;
    atomicMin(temp, matrix[id]);
    printf("old: %d, new: %d", matrix[id], temp);
}
int main() {
    dim3 block(SIZE, SIZE, 1);
    unsigned *arr, *harr, *temp;
    cudaMalloc(&arr, SIZE*SIZE*sizeof(unsigned));
    popMatrix<<<1,block>>>(arr);
    // Print matrix of random numbers to see if min number was picked right
    cudaMemcpy(harr, arr, SIZE*SIZE*sizeof(unsigned), cudaMemcpyDeviceToHost);
    for (unsigned i = 0; i < SIZE; i++) {
        for (unsigned j = 0; j < SIZE; j++) {
            printf("%d ", harr[i*SIZE+j]);
        }
        printf("\n");
    }
    temp = harr[0];
    findMin<<<1, block>>>(harr);
    return 0;
}
harr is not allocated. You should allocate it on the host side, for example with malloc, before calling cudaMemcpy. As a result, the printed values you see are garbage. It is quite surprising that the program did not segfault on your machine.
Moreover, when you call the kernel findMin at the end, its argument harr (which, judging by its name, is supposed to be a host pointer) should instead be a device pointer for the atomic operation to be performed correctly. As a result, the current kernel call is invalid.
As pointed out by @RobertCrovella, a cudaDeviceSynchronize() call is missing at the end. Moreover, you need to free your memory using cudaFree (and free for the host buffer).
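For illustration, here is a minimal sketch of a main() along those lines, with harr allocated on the host, a device allocation for temp, device pointers passed to findMin, and a copy back of the result. The variable hmin and the seeding of temp with the first matrix element are assumptions for this sketch, not part of the original post:
int main() {
    dim3 block(SIZE, SIZE, 1);
    unsigned *arr, *temp, *harr;
    unsigned hmin;
    harr = (unsigned *)malloc(SIZE*SIZE*sizeof(unsigned));    // host buffer must be allocated
    cudaMalloc(&arr, SIZE*SIZE*sizeof(unsigned));
    cudaMalloc(&temp, sizeof(unsigned));                      // device location for the atomic minimum
    popMatrix<<<1, block>>>(arr);
    cudaMemcpy(harr, arr, SIZE*SIZE*sizeof(unsigned), cudaMemcpyDeviceToHost);
    // (matrix printing omitted for brevity)
    cudaMemcpy(temp, &harr[0], sizeof(unsigned), cudaMemcpyHostToDevice); // seed with the first element
    findMin<<<1, block>>>(arr, temp);                         // both arguments are device pointers
    cudaDeviceSynchronize();
    cudaMemcpy(&hmin, temp, sizeof(unsigned), cudaMemcpyDeviceToHost);
    printf("min: %u\n", hmin);
    cudaFree(arr);
    cudaFree(temp);
    free(harr);
    return 0;
}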

What is the proper way to use stride in CUDA to do multi-block reduction?

Hello everyone, I'm trying to use the grid-stride method and atomic functions to do a multi-block reduction.
I know that the usual way to do this is to launch two kernels or to use the last-block method as described in this note (or this tutorial).
However, I thought this could also be done by combining a grid-stride loop with an atomic add.
As I tested it, it worked very well, until for some numbers it gives the wrong answer (which is very weird).
I have tested some values of n and found that I get a wrong answer for n = 1234565, 1234566, 1234567.
This is my whole code, which sums n ones, so the answer should be n.
Any help or comment is appreciated.
#include <iostream>
#include <algorithm> // std::fill_n
#include <cstdio>
__global__ void stride_sum(const double* input,
                           const int size,
                           double* sumOut){
    extern __shared__ double sm[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockDim.x * blockIdx.x + tid;
    //doing grid loop using stride method.
    for(unsigned int s=i;
        s<size;
        s+=blockDim.x*gridDim.x){
        sm[tid] = input[i];
        __syncthreads();
        //doing parallel reduction.
        for(unsigned int ss = blockDim.x/2;ss>0;ss>>=1){
            if(tid<ss && tid+ss<size) sm[tid] += sm[tid+ss];
            __syncthreads();
        }
        //atomically add results to sumOut.
        if(tid==0) atomicAdd(sumOut, sm[0]);
    }
}
int main(){
    unsigned int n = 1234567;
    int blockSize = 4;
    int nBlocks = (n + blockSize - 1) / blockSize;
    int sharedMemory = sizeof(double)*blockSize;
    double *data, *sum;
    cudaMallocManaged(&data, sizeof(double)*n);
    cudaMallocManaged(&sum, sizeof(double));
    std::fill_n(data,n,1.);
    std::fill_n(sum,1,0.);
    stride_sum<<<nBlocks, blockSize, sharedMemory>>>(data,n,sum);
    cudaDeviceSynchronize();
    printf("res: %10.f \n",sum[0]);
    cudaFree(data);
    cudaFree(sum);
    return 0;
}
You have gotten quite a lot wrong in your implementation. This will work:
__global__ void stride_sum(const double* input,
                           const int size,
                           double* sumOut)
{
    extern __shared__ volatile double sm[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockDim.x * blockIdx.x + tid;
    //doing grid loop using stride method.
    double val = 0.;
    for(unsigned int s=i; s<size; s+=blockDim.x*gridDim.x){
        val += input[s]; // accumulate the strided elements in a register
    }
    // Load partial sum to shared memory
    sm[tid] = val;
    __syncthreads();
    //doing parallel reduction.
    for(unsigned int ss = blockDim.x/2;ss>0;ss>>=1){
        if(tid<ss && tid+ss<size) sm[tid] += sm[tid+ss];
        __syncthreads();
    }
    //atomically add results to sumOut.
    if(tid==0) atomicAdd(sumOut, sm[0]);
}
[Never compiled and run, use at your own risk]
In short -- do the grid-strided summation into a per-thread register, then a single shared memory reduction, then a single atomic update. Your implementation has undefined behaviour in a few places, especially the conditionally executed __syncthreads calls and the use of uninitialized shared memory when some threads fall out of the summation loop.
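If it helps, here is a minimal host-side sketch for launching the fixed kernel, assuming the same managed allocations for data and sum as in the question's main; the grid and block sizes here are just illustrative choices, and note that atomicAdd on double requires a device of compute capability 6.0 or higher:
int blockSize = 256;                                  // a more typical block size than 4
int nBlocks   = 64;                                   // the grid-stride loop lets the grid be smaller than n
size_t smem   = blockSize * sizeof(double);
stride_sum<<<nBlocks, blockSize, smem>>>(data, n, sum);
cudaDeviceSynchronize();
printf("res: %f (expected %u)\n", sum[0], n);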

Pointer arithmetic with shared memory

I don't understand what exactly happens in the following lines:
unsigned char *membershipChanged = (unsigned char *)sharedMemory; and
float *clusters = (float *)(sharedMemory + blockDim.x);
I assume that in the first line sharedMemory is effectively renamed to membershipChanged, but why would you add blockDim.x to the sharedMemory pointer? Where does this address point?
sharedMemory was created with extern __shared__ char sharedMemory[];
I found the code in a CUDA kmeans implementation.
__global__ void find_nearest_cluster(int numCoords,
                                     int numObjs,
                                     int numClusters,
                                     float *objects,        // [numCoords][numObjs]
                                     float *deviceClusters, // [numCoords][numClusters]
                                     int *membership,       // [numObjs]
                                     int *intermediates)
{
    extern __shared__ char sharedMemory[];
    // The type chosen for membershipChanged must be large enough to support
    // reductions! There are blockDim.x elements, one for each thread in the
    // block.
    unsigned char *membershipChanged = (unsigned char *)sharedMemory;
    float *clusters = (float *)(sharedMemory + blockDim.x);
    membershipChanged[threadIdx.x] = 0;
    // BEWARE: We can overrun our shared memory here if there are too many
    // clusters or too many coordinates!
    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
        for (int j = 0; j < numCoords; j++) {
            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
        }
    }
.....
sharedMemory + blockDim.x points blockDim.x bytes away from the base of the shared memory region.
The reason you might do something like this is to suballocate in shared memory. The launch site of the kernel which includes find_nearest_cluster dynamically allocates some amount of shared storage for the kernel. The code implies that two logically different arrays reside in the shared storage pointed to by sharedMemory -- membershipChanged, and clusters. The pointer arithmetic is simply a means to get a pointer to the second array.
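To make the suballocation concrete, the launch site has to reserve enough dynamic shared memory for both arrays. Here is a minimal sketch of that sizing; the launch configuration names (numBlocks, threadsPerBlock) are assumptions, not taken from the kmeans code, and it assumes the block size is a multiple of 4 so the float array stays aligned:
// blockDim.x bytes for membershipChanged, followed by the cached cluster centres.
size_t smemSize = threadsPerBlock * sizeof(unsigned char)
                + numClusters * numCoords * sizeof(float);
find_nearest_cluster<<<numBlocks, threadsPerBlock, smemSize>>>(
    numCoords, numObjs, numClusters, objects, deviceClusters, membership, intermediates);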

CUDA: Fermi (Tesla M2090) generating CUDA_EXCEPTION_10 without reason

I have a small piece of code which runs perfectly on Nvidia's older architecture (Tesla T10 processor) but not on Fermi (Tesla M2090).
I learned that Fermi behaves slightly differently, because of which unsafe code might appear to work correctly on older architectures while Fermi catches the bug.
But I don't know how to resolve it.
Here is my code:
__global__ void exec (int *arr_ptr, int size, int *result) {
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    *result = arr_ptr[-2];
}
void run(int *arr_dev, int size, int *result) {
    cudaStream_t stream = 0;
    int *arr_ptr = arr_dev + 5;
    dim3 threads(1,1,1);
    dim3 grid (1,1);
    exec<<<grid, threads, 0, stream>>>(arr_ptr, size, result);
}
Since I am accessing arr_ptr[-2], Fermi throws CUDA_EXCEPTION_10, Device Illegal Address. But it is not illegal; the address should be valid.
Can anyone help me with this?
My driver code is:
int main(){
    int *arr;
    int *arr_dev = NULL;
    int result = 1;
    arr = (int*)malloc(10*sizeof(int));
    for(int i = 0; i < 10; i++)
        arr[i] = i;
    if(arr_dev == NULL)
    {
        cudaMalloc((void**)&arr_dev, 10);
        cudaMemcpy(arr_dev, arr, 10*sizeof(int), cudaMemcpyHostToDevice);
    }
    run(arr_dev, 10, &result);
    printf("%d \n", result);
    return 0;
}
Fermi cards have much better memory protection on the device and will detect out-of-bounds conditions which appear to "work" on older cards. Use cuda-memcheck (or the cuda-memcheck mode in cuda-gdb) to get a better handle on what is going wrong.
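For example, from the command line (assuming the executable is called app):
cuda-memcheck ./app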
EDIT:
This is the culprit:
cudaMalloc((void**)&arr_dev, 10);
which should be
cudaMalloc((void**)&arr_dev, 10*sizeof(int));
With only 10 bytes allocated, this code
int *arr_ptr = arr_dev + 5;
ends up passing a device pointer which is out of bounds.

global vs shared memory in CUDA

I have two CUDA kernels that compute similar things. One uses global memory (myfun is a device function that reads a lot from global memory and does the computation). The second kernel transfers that chunk of data from global memory to shared memory so the data can be shared among the different threads of a block. My kernel that uses global memory is much faster than the one with shared memory. What are the possible reasons?
loadArray just copies a small part of d_x to m.
__global__ void mykernel(float *d_x, float *d_y, int *d_z, float *d_u, int N, int K, int D)
{
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    int index = 0;
    float max_s = 1e+37F;
    if (tid < N)
    {
        for (int i = 0; i < K; i++)
        {
            float s = myfun(&d_x[i*D], d_y, tid);
            if (s > max_s)
            {
                max_s = s;
                index = i;
            }
        }
        d_z[tid] = index;
        d_u[tid] = max_s;
    }
}
Using shared memory:
__global__ void mykernel(float *d_x, float *d_y, int *d_z, float *d_u, int N, int K)
{
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    int index = 0;
    float max_s = 1e+37F;
    extern __shared__ float m[];
    if( threadIdx.x == 0 )
        loadArray( m, d_x );
    __syncthreads();
    if (tid < N)
    {
        for (int i = 0; i < K; i++)
        {
            float s = myfun(m, d_y, tid);
            if (s > max_s)
            {
                max_s = s;
                index = i;
            }
        }
        d_z[tid] = index;
        d_u[tid] = max_s;
    }
}
The problem is that only the first thread in each block is reading from global memory into shared memory. This is much slower than letting all threads read from global memory simultaneously.
Using shared memory is an advantage when a single thread needs to access neighbouring elements from global memory, but that doesn't appear to be the case here.
IMO, if you have Parallel Nsight installed on, say, a Windows machine and conduct a trace of the executions, you might get more insight. Alternatively, run cudaprof on your app to try to figure out where the possible latencies are.
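For illustration, here is a minimal sketch of a cooperative load to replace the single-thread loadArray call, in which every thread of the block helps copy the chunk of d_x into shared memory. The length parameter len is an assumption, since loadArray is not shown:
extern __shared__ float m[];
// Each thread copies a strided subset of d_x, so the whole block loads
// cooperatively and the global memory reads are coalesced.
for (int j = threadIdx.x; j < len; j += blockDim.x)
    m[j] = d_x[j];
__syncthreads();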