What is the proper way to use stride in cuda to do multiblock reduction? - cuda

Hello everyone, I'm trying to use the grid-stride method and atomic functions to do a multi-block reduction.
I know that the usual way to do this is to launch two kernels or to use the last-block method, as described in this note (or this tutorial).
However, I thought this could also be done by combining a grid-stride loop with atomic operations.
When I tested it, it worked very well...
until, for some input sizes, it gave the wrong answer (which is very weird).
I have tested several values of n and found that I get the wrong answer for n = 1234565, 1234566, and 1234567.
This is my whole code of doing n sum of 1. So the answer should be n.
Any help or comment is appreciated.
#include <algorithm>
#include <cstdio>
#include<iostream>
// Question's original attempt: on every pass of the outer loop each thread stages
// one value in shared memory, the block tree-reduces it, and thread 0 atomically
// adds the block total to *sumOut.
// NOTE(review): the answer below reports undefined behaviour in this version --
// the __syncthreads() calls sit inside a loop that not every thread of a block
// enters, and threads that skip the loop never write their sm[] slot.
__global__ void stride_sum(const double* input,
const int size,
double* sumOut){
// Dynamically sized shared buffer; main() passes its byte size at launch.
extern __shared__ double sm[];
unsigned int tid = threadIdx.x;
unsigned int i = blockDim.x * blockIdx.x + tid;
//doing grid loop using stride method.
for(unsigned int s=i;
s<size;
s+=blockDim.x*gridDim.x){
// NOTE(review): indexes with i (the thread's starting offset), not the loop variable s.
sm[tid] = input[i];
__syncthreads();
//doing parallel reduction.
for(unsigned int ss = blockDim.x/2;ss>0;ss>>=1){
// NOTE(review): tid+ss is a shared-memory index, yet it is compared against size.
if(tid<ss && tid+ss<size) sm[tid] += sm[tid+ss];
__syncthreads();
}
//atomically add results to sumOut.
if(tid==0) atomicAdd(sumOut, sm[0]);
}
}
// Host driver for stride_sum: fills a managed array with n ones, launches one
// thread per element (blockSize threads per block, one double of dynamic shared
// memory per thread) and prints the accumulated sum, which should equal n.
// Returns 0 on success and 1 if any CUDA call reports an error.
int main(){
    unsigned int n = 1234567;
    int blockSize = 4;
    int nBlocks = (n + blockSize - 1) / blockSize;   // ceil(n / blockSize)
    int sharedMemory = sizeof(double)*blockSize;
    double *data = 0, *sum = 0;

    cudaError_t err = cudaMallocManaged(&data, sizeof(double)*n);
    if(err == cudaSuccess) err = cudaMallocManaged(&sum, sizeof(double));
    if(err == cudaSuccess){
        std::fill_n(data,n,1.);
        std::fill_n(sum,1,0.);
        stride_sum<<<nBlocks, blockSize, sharedMemory>>>(data,n,sum);
        err = cudaGetLastError();            // launch-configuration errors
    }
    // The host must not read sum[0] before the kernel has finished.
    if(err == cudaSuccess) err = cudaDeviceSynchronize();

    if(err == cudaSuccess)
        printf("res: %10.f \n",sum[0]);      // the '%' was missing, so the value was never printed
    else
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(data);
    cudaFree(sum);
    return err == cudaSuccess ? 0 : 1;
}

You have gotten quite a lot wrong in your implementation. This will work:
// Adds the sum of input[0..size) to *sumOut.
//
// Launch: 1-D grid of 1-D blocks, any grid size and any block size; the dynamic
// shared-memory size must be blockDim.x * sizeof(double). *sumOut must be
// initialised (normally to 0) before the launch. A non-positive size adds 0.
// atomicAdd on double requires compute capability 6.0 or newer.
//
// Each thread first accumulates a private partial sum over a grid-stride loop,
// the block then reduces the partials once in shared memory, and thread 0
// issues a single atomic update per block. Every thread reaches every
// __syncthreads(), whatever the relation between size and the launch shape.
__global__ void stride_sum(const double* input,
                           const int size,
                           double* sumOut)
{
    extern __shared__ volatile double sm[];
    const unsigned int tid = threadIdx.x;
    const size_t stride = (size_t)blockDim.x * gridDim.x;
    const size_t count = size > 0 ? (size_t)size : 0;

    // Grid-stride accumulation: index with the loop variable s, not the start offset.
    double val = 0.;
    for(size_t s = (size_t)blockDim.x * blockIdx.x + tid; s < count; s += stride){
        val += input[s];
    }

    // Publish the partial sum; threads that had nothing to add publish 0.
    sm[tid] = val;
    __syncthreads();

    // Tree reduction. Start from half of the next power of two >= blockDim.x and
    // bound the partner index by blockDim.x so that block sizes that are not a
    // power of two do not lose elements.
    unsigned int ss = 1;
    while(ss < blockDim.x) ss <<= 1;
    for(ss >>= 1; ss > 0; ss >>= 1){
        if(tid < ss && tid + ss < blockDim.x) sm[tid] = sm[tid] + sm[tid + ss];
        __syncthreads();
    }

    // One atomic update per block.
    if(tid == 0) atomicAdd(sumOut, sm[0]);
}
[Never compiled or run; use at your own risk]
In short -- do the grid strided summation, then a single shared memory reduction, then a single atomic update. Your implementation has undefined behaviour in a few places, especially the conditionally executed __syncthreads calls and using uninitialized shared memory when some threads fall out of the summation loop.

Related

cudaMallocManaged for 2D and 3D array

If one wants to copy arrays from the host to the device, one normally calls cudaMalloc and cudaMemcpy. To lessen the hassle, one can instead call cudaMallocManaged, which needs neither of those two calls, and life has never been simpler.
The code looks like this(more or less)
// Converts Celsius readings to Kelvin, one element per thread:
// kelvin[i] = celsius[i] + 273.15 for every i < N.
// Expects a 1-D launch providing at least N threads in total; surplus threads
// do nothing. N is not defined in this snippet -- presumably a constant declared
// elsewhere in the file.
__global__ void convert(float kelvin[], float celsius[]) // array parameters can be passed to a kernel
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        kelvin[i] = celsius[i] + 273.15f; // float literal keeps the addition in single precision
}
// Host driver for convert: allocates both arrays as managed memory, launches a
// single block of N threads and waits for the kernel before using the output.
int main()
{
    // cudaMallocManaged provides the allocation itself. A preceding malloc would
    // only be leaked, because the call overwrites the pointer it is given.
    float *celsius = 0;
    float *kelvin = 0;
    cudaMallocManaged(&celsius, N*sizeof(float));
    cudaMallocManaged(&kelvin, N*sizeof(float));
    // init celsius here
    dim3 blocksPerGrid(1,1,1); //use only one block
    dim3 threadsPerBlock(N,1,1); //use N threads in the block
    convert<<<blocksPerGrid, threadsPerBlock>>>(kelvin,celsius);
    // The launch is asynchronous; wait before the host touches kelvin.
    cudaDeviceSynchronize();
    //Doing stuff with the output here
    cudaFree(celsius);
    cudaFree(kelvin);
    return 0;
}
The previous example seems clear to me. But, how to do cudaMallocManaged for 2D and 3D array? I've been trying
// Element-wise matrix addition, C = A + B, for N x N matrices.
// Each thread handles the entry whose first index comes from the x dimension of
// the launch and whose second index comes from the y dimension; threads that
// fall outside the N x N range exit without touching memory.
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;
    if (row >= N || col >= N)
        return;
    C[row][col] = A[row][col] + B[row][col];
}
// Question's non-compiling attempt at calling MatAdd with managed 2D arrays; the
// two compiler errors quoted below it come from this function.
int main()
{ // I think 2D arrays can be passed as pointer to pointers
// NOTE(review): each malloc result is overwritten by the cudaMallocManaged call
// on the same variable a few lines further down.
float **A = (float **)malloc(N*N*sizeof(float));
float **B = (float **)malloc(N*N*sizeof(float));
float **C = (float **)malloc(N*N*sizeof(float));
cudaMallocManaged(&A, N*N*sizeof(float));
cudaMallocManaged(&B, N*N*sizeof(float));
cudaMallocManaged(&C, N*N*sizeof(float));
// NOTE(review): these are assignments to existing pointers written with array
// initializer syntax -- presumably the source of "too many initializer values".
A[N][N]={{1,0,0},{0,1,0},{0,0,1}};
B[N][N]={{1,0,0},{0,1,0},{0,0,1}};
dim3 threadsPerBlock(16, 16);
// NOTE(review): integer division; with N = 3 (see the float (*)[3] in the
// compiler message) both quotients are 0.
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
// A, B and C are float** here, while MatAdd's parameters are pointers to rows
// of N floats; this is the type mismatch the second compiler error reports.
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
//outputs and all
}
But, It shows the following error
matrix_add.cu(22): error: too many initializer values
matrix_add.cu(25): error: argument of type "float **" is incompatible with parameter of type "float (*)[3]"
Your help is highly appreciated.
You got a lot wrong in your attempt, so much that it was faster to write a working version than list out all the individual problems in the code in your question. So here is a working version of what it appears you were trying to do:
#include <algorithm>
#include <iostream>
// Side length of the square matrices handled below.
const int N = 3;

// Computes C = A + B element by element for N x N matrices stored as rows of N
// floats. The x dimension of the launch selects the row and the y dimension the
// column; threads outside the matrix do nothing.
__global__ void MatAdd(float A[][N], float B[][N], float C[][N])
{
    const int r = blockIdx.x * blockDim.x + threadIdx.x;
    const int c = blockIdx.y * blockDim.y + threadIdx.y;
    const bool inside = (r < N) && (c < N);
    if (inside) {
        float* out_row = C[r];
        out_row[c] = A[r][c] + B[r][c];
    }
}
// Host driver for MatAdd: allocates three N x N matrices as flat managed
// buffers, fills A and B with the identity matrix, launches one 16 x 16 block
// and prints C row by row. Returns 0 on success and 1 if a CUDA call fails.
int main()
{
    const size_t bytes = N * N * sizeof(float);
    float *A = 0, *B = 0, *C = 0;
    if (cudaMallocManaged(&A, bytes) != cudaSuccess ||
        cudaMallocManaged(&B, bytes) != cudaSuccess ||
        cudaMallocManaged(&C, bytes) != cudaSuccess) {
        std::cerr << "cudaMallocManaged failed" << std::endl;
        cudaFree(A); cudaFree(B); cudaFree(C);   // freeing a null pointer is a no-op
        return 1;
    }

    // Arrays can only be brace-initialised at their definition, so the values
    // live in local arrays and are then copied into the managed buffers.
    const float A_vals[N][N]={{1,0,0},{0,1,0},{0,0,1}};
    const float B_vals[N][N]={{1,0,0},{0,1,0},{0,0,1}};
    std::copy(&A_vals[0][0], &A_vals[0][0] + N*N, A);
    std::copy(&B_vals[0][0], &B_vals[0][0] + N*N, B);

    // View of the flat buffer as rows of N floats, which enables [i][j] access.
    float (*C_vals)[N] = reinterpret_cast<float (*)[N]>(C);

    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(1, 1);
    MatAdd<<<numBlocks, threadsPerBlock>>>( reinterpret_cast<float (*)[N]>(A),
                                            reinterpret_cast<float (*)[N]>(B),
                                            C_vals );
    cudaError_t status = cudaGetLastError();                     // launch errors
    if (status == cudaSuccess) status = cudaDeviceSynchronize(); // execution errors
    if (status != cudaSuccess) {
        std::cerr << "MatAdd failed: " << cudaGetErrorString(status) << std::endl;
        cudaFree(A); cudaFree(B); cudaFree(C);
        return 1;
    }

    for(int i=0; i<N; i++) {
        for(int j=0; j<N; j++) {
            std::cout << C_vals[i][j] << " ";
        }
        std::cout << std::endl;
    }

    cudaFree(A);
    cudaFree(B);
    cudaFree(C);
    return 0;
}
Some important points:
Managed memory allocation replaces standard host memory allocation and produces memory which is directly accessible on both the host and the device.
All arrays decay to a pointer when passed as arguments to a function by value. That decay is not recursive. See here for more details.
You can (and will need to) cast in order to use the [][] access syntax on linear memory allocated dynamically at runtime (this applies to malloc, new, or any of the CUDA host memory allocation APIs. See here for more details).
Initialization syntax and assignment syntax for arrays are not interchangeable.
All I can suggest is that you study it thoroughly until you understand how it works.

Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D

I'm new in CUDA, I appreciate your help and hope you can help me.
I need to store multiple elements of a 2D array into a vector, and then work with the vector, but my code does not work well, when I debug, I find a mistake in allocating the 2D array in the device with cudaMallocPitch and copying to that array with cudaMemcpy2D. This is my code:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cmath>
#define maxThreads 96
// Copies one matrix element per output slot: dev_vector[k] receives the float at
// flat offset mSize * k + N of dev_matrix, for every k < N.
// dev_matrix is indexed as a densely packed array of floats; no row pitch is
// taken into account here. Threads step through k by the total number of
// launched threads, so any 1-D launch shape covers all N slots.
__global__ void extract(int mSize, float* dev_vector, float* dev_matrix, int N)
{
    const int step = blockDim.x * gridDim.x;
    for (int k = threadIdx.x + blockIdx.x * blockDim.x; k < N; k += step)
        dev_vector[k] = dev_matrix[mSize * k + N];
}
// Question's host driver: builds an mSize x mSize host matrix, copies it into a
// pitched device allocation, runs extract and prints the resulting vector.
// The answers that follow diagnose several problems in this function; the
// NOTE(review) comments mark the statements involved.
int main()
{
//CPU variables
int mSize = 5;
float* matrix;
int N = 4; // Vector size
int i,j;
float* vector;
int blocks, threads;
float* dev_matrix;
float* dev_vector;
// ceil(N / maxThreads) blocks, then ceil(N / blocks) threads per block.
blocks = 1+((N-1)/maxThreads);
threads = 1+((N-1)/blocks);
// NOTE(review): the answer recommends size_t for these quantities.
unsigned long int pitch;
// Both memsize_* values are byte counts.
unsigned long int memsize_vector = N*sizeof(float);
unsigned long int memsize_matrix = mSize*sizeof(float);
// NOTE(review): byte counts are used as element counts in both allocations.
matrix = new float[memsize_matrix*memsize_matrix];
vector = new float[memsize_vector];
//Create 2D array
for(i=0; i<mSize; i++)
for(j=0; j<mSize; j++)
{
matrix[i+mSize*j] = ((i+1)+(j+1));
}
printf("\n");
for (i=0; i<mSize; i++){
for(j=0; j<mSize; j++){
printf("% 1.5f ", matrix[i+mSize*j]);
}
printf("\n");
}
printf("\n");
// Pitched allocation: mSize rows of memsize_matrix bytes; the row stride chosen
// by the runtime is returned in pitch.
cudaMallocPitch((void **)&dev_matrix, &pitch, memsize_matrix, mSize);
cudaMalloc((void **)&dev_vector, memsize_vector);
// Host rows are memsize_matrix bytes apart, device rows are pitch bytes apart.
cudaMemcpy2D(dev_matrix, pitch, matrix, memsize_matrix, memsize_matrix, mSize,
cudaMemcpyHostToDevice);
// NOTE(review): pitch is not passed to the kernel.
extract<<<blocks,threads>>>(mSize, dev_vector, dev_matrix, N);
cudaDeviceSynchronize();
cudaMemcpy(vector, dev_vector, memsize_vector, cudaMemcpyDeviceToHost);
printf("Vector values are:\n");
for(i=0; i<N; i++)
printf(" % 1.5f ", vector[i]);
printf("\n");
cudaFree(dev_matrix);
cudaFree(dev_vector);
// NOTE(review): matrix and vector are never released with delete[].
}
There are lots of problems in this code, including but not limited to using array sizes in bytes and word sizes interchangeably in several places in code, using incorrect types (note that size_t exists for a very good reason) , potential truncation and type casting problems, and more.
But the core problem is the addressing of pitched memory inside the kernel, to which you are never even passing the pitch value. Reading the documentation for cudaMallocPitch will give you the correct method for addressing pitched memory inside a kernel. Your kernel might then look like this:
// Gathers column N of a pitched 2-D float array: dev_vector[idx] receives the
// element at row idx, column N, for every idx < N.
//
// mpitch is the row stride in BYTES, as returned by cudaMallocPitch. The row
// base is therefore computed with char* arithmetic, and the column offset is
// applied only after casting back to float*, so that it counts floats rather
// than bytes. Threads step through idx by the total number of launched threads,
// so any 1-D launch shape works.
__global__ void extract(size_t mpitch, float* dev_vector, float* dev_matrix, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    while(idx<N)
    {
        const float* row = (const float*)((const char*)dev_matrix + idx * mpitch);
        dev_vector[idx] = row[N];
        idx += stride;
    }
}
[disclaimer: never compiled or tested, use at own risk].
You will have to fix then all the problems in the host code to reflect whatever kernel changes you make.
Thanks to all, Alex I had not seen that, and fix it, thanks.
talonmies, thank you, my code works, with your suggestions. thanks a lot, finally this my kernel:
// Gathers one column of a pitched 2-D float array: dev_vector[idx] receives the
// element at row idx, column `columns`, for every idx < N.
//
// pitch is the row stride in BYTES (as returned by cudaMallocPitch), while
// columns is an element index. The row base is therefore computed with char*
// arithmetic and the column is applied after casting back to float*. Adding
// columns to the char* instead would offset by bytes and read the wrong,
// possibly misaligned, address.
__global__ void sumreduct(size_t pitch, float* dev_vector, float* dev_matrix, int columns, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    while(idx<N)
    {
        const float* row = (const float*)((const char*)dev_matrix + idx * pitch);
        dev_vector[idx] = row[columns];
        idx += stride;
    }
}
About "size_t", I was using "Unsigned int" because Nsight show me the next warning:
Type 'size_t' could not be resolved
Thanks
Did you really mean to declare a source matrix of length [memsizeMatrix*memsizeMatrix] ?
This will allocate 400 floats, or 1600 bytes. This means your source-pitch is off, and the Memcpy2D call is failing.
I'm assuming you meant to say
matrix = new float[mSize*mSize];

reduction for sum of vector when size is not power of 2?

For the classical reduction algorithm on GPU, it works perfectly if the size of vector is the power of 2. What if it is not the case? At some point we will have to find the sum of odd number of element. What is the best way to deal with that?
You can compute the sum of a matrix that doesn't have a size of a power of two. Look at the example :
#include <math.h>
#define N 1022 //total size

// Per-block sum: block b writes the sum of its slice of A[0..N) to C[b].
//
// Launch: 1-D grid of 1-D blocks with at most 1024 threads per block, and
// gridDim.x * blockDim.x >= N so that every element is covered. Neither N nor
// the block size has to be a power of two. C must hold gridDim.x ints.
__global__ void sum(int *A, int *C)
{
    // A static shared array needs a compile-time size; 1024 is the largest block
    // size a launch can request, so it fits every valid configuration.
    __shared__ int temp[1024];
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    const int local_idx = threadIdx.x;
    const int block_threads = (int)blockDim.x;

    // Threads past the end of A contribute 0 instead of reading out of bounds.
    temp[local_idx] = (idx < N) ? A[idx] : 0;
    __syncthreads();

    // Start from half of the next power of two >= blockDim.x. Starting from
    // blockDim.x / 2 silently drops elements when the block size is odd at
    // some level of the tree.
    int i = 1;
    while (i < block_threads)
        i <<= 1;
    for (i >>= 1; i != 0; i >>= 1)
    {
        if (local_idx < i && local_idx + i < block_threads)
            temp[local_idx] += temp[local_idx + i];
        __syncthreads();
    }

    if (local_idx == 0)
        C[blockIdx.x] = temp[0];
}
Set
int i = 1024;
instead of
int i=ceil(blockDim.x/2);
i.e. the initial value of i should be a power of two (2^k for some integer k) that is greater than N.

Shared memory mutex with CUDA - adding to a list of items

My problem is the following: I have an image in which I detect some points of interest using the GPU. The detection is a heavyweight test in terms of processing, however only about 1 in 25 points pass the test on average. The final stage of the algorithm is to build up a list of the points. On the CPU this would be implemented as:
forall pixels x,y
{
if(test_this_pixel(x,y))
vector_of_coordinates.push_back(Vec2(x,y));
}
On the GPU I have each CUDA block processing 16x16 pixels. The problem is that I need to do something special to eventually have a single consolidated list of points in global memory. At the moment I am trying to generate a local list of points in shared memory per block which eventually will be written to global memory. I am trying to avoid sending anything back to the CPU because there are more CUDA stages after this.
I was expecting that I could use atomic operations to implement the push_back function on shared memory. However I am unable to get this working. There are two issues. The first annoying issue is that I am constantly running into the following compiler crash: "nvcc error : 'ptxas' died with status 0xC0000005 (ACCESS_VIOLATION)" when using atomic operations. It is hit or miss whether I can compile something. Does anyone know what causes this?
The following kernel will reproduce the error:
// Minimal kernel the question gives as a reproducer for the ptxas crash.
// None of the four parameters is used.
__global__ void gpu_kernel(int w, int h, RtmPoint *pPoints, int *pCounts)
{
// One counter per block; it is never assigned before the atomic below.
__shared__ unsigned int test;
atomicInc(&test, 1000);
}
Secondly, my code which includes a mutex lock on shared memory hangs the GPU and I dont understand why:
// Spin lock acquire: retries atomicCAS(pmutex, 0, 1) until the value it read
// back was 0, i.e. until this thread is the one that changed *pmutex from 0 to 1.
// NOTE(review): the question reports that the kernel using this lock hangs --
// presumably threads of one warp spin here while the holder cannot advance; confirm.
__device__ void lock(unsigned int *pmutex)
{
while(atomicCAS(pmutex, 0, 1) != 0);
}
// Spin lock release: atomically stores 0 into *pmutex so that a pending
// atomicCAS in lock() can succeed.
__device__ void unlock(unsigned int *pmutex)
{
atomicExch(pmutex, 0);
}
// Question's attempt at a per-block point list: every pixel that passes
// some_test_on_pixel is appended to a shared array under a shared-memory mutex,
// then the block writes its count to pCounts[blockid] and its points to a
// 64-entry slot of pPoints.
// NOTE(review): the question reports that this kernel hangs the GPU unless the
// lock/unlock lines are commented out.
__global__ void gpu_kernel_non_max_suppress(int w, int h, RtmPoint *pPoints, int *pCounts)
{
// Per-block staging list, its length, and the mutex guarding both.
__shared__ RtmPoint localPoints[64];
__shared__ int localCount;
__shared__ unsigned int mutex;
// Pixel coordinates handled by this thread.
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
// Flattened thread index within the block and block index within the grid.
int threadid = threadIdx.y * blockDim.x + threadIdx.x;
int blockid = blockIdx.y * gridDim.x + blockIdx.x;
// One thread resets the shared state; the barrier publishes it to the block.
if(threadid==0)
{
localCount = 0;
mutex = 0;
}
__syncthreads();
if(x<w && y<h)
{
if(some_test_on_pixel(x,y))
{
RtmPoint point;
point.x = x;
point.y = y;
// this is a local push_back operation
lock(&mutex);
if(localCount<64) // we should never get >64 points per block
localPoints[localCount++] = point;
unlock(&mutex);
}
}
// All appends must be complete before the list is copied out.
__syncthreads();
if(threadid==0)
pCounts[blockid] = localCount;
// The first localCount threads each copy one point to the block's slot.
if(threadid<localCount)
pPoints[blockid * 64 + threadid] = localPoints[threadid];
}
In the example code at this site, the author manages to successfully use atomic operations on shared memory, so I am confused as to why my case does not function. If I comment out the lock and unlock lines, the code runs ok, but obviously incorrectly adding to the list.
I would appreciate some advice about why this problem is happening and also perhaps if there is a better solution to achieving the goal, since I am concerned anyway about the performance issues with using atomic operations or mutex locks.
I suggest using prefix-sum to implement that part to increase parallelism. To do that you need to use a shared array. Basically prefix-sum will turn an array (1,1,0,1) into (0,1,2,2) with a total of 3, i.e., it will calculate an in-place running exclusive sum so that you'll get per-thread write indices.
__shared__ uint8_t vector[NUMTHREADS];
....
bool emit = (x<w && y<h);
emit = emit && some_test_on_pixel(x,y);
__syncthreads();
scan(emit, vector);
if (emit) {
pPoints[blockid * 64 + vector[TID]] = point;
}
prefix-sum example:
// Block-wide inclusive prefix sum over one value per thread.
//
// Thread tid contributes `mark`; on return output[tid] holds
// mark(0) + ... + mark(tid), and the function returns output[numWorkers-1],
// the block total. A thread that needs an exclusive index can subtract its own
// mark from output[tid].
//
// Must be called by all numWorkers threads of the block, since it contains
// barriers, and output must be an array of at least numWorkers elements that
// all of them can access. numWorkers and uint32 are not defined in this
// snippet -- presumably a compile-time block size and a 32-bit unsigned typedef
// from the surrounding project.
//
// The scan ping-pongs between `values` and `output`: in every round GET_INP is
// the buffer written in the previous round and GET_OUT the other one.
template <typename T>
__device__ uint32 scan(T mark, T *output) {
#define GET_OUT (pout?output:values)
#define GET_INP (pin?output:values)
    __shared__ T values[numWorkers];
    int pout=0, pin=1;
    int tid = threadIdx.x;
    values[tid] = mark;
    // __syncthreads() is the CUDA block barrier; the snippet mixed it with an
    // undeclared syncthreads().
    __syncthreads();
    for( int offset=1; offset < numWorkers; offset *= 2) {
        pout = 1 - pout; pin = 1 - pout;
        __syncthreads();
        if ( tid >= offset) {
            GET_OUT[tid] = (GET_INP[tid-offset]) +( GET_INP[tid]);
        }
        else {
            GET_OUT[tid] = GET_INP[tid];
        }
        __syncthreads();
    }
    // If the final round wrote into `values`, copy the result to the caller's buffer.
    if(!pout)
        output[tid] =values[tid];
    __syncthreads();
    return output[numWorkers-1];
#undef GET_OUT
#undef GET_INP
}
Based on recommendations here, I include the code that I used in the end. It uses 16x16 pixel blocks. Note that I am now writing the data out in one global array without breaking it up. I used the global atomicAdd function to compute a base address for each set of results. Since this only gets called once per block, I did not find too much of a slow down, while I gained a lot more convenience by doing this. I'm also avoiding shared buffers for the input and output of prefix_sum. GlobalCount is set to zero prior to the kernel call.
#define BLOCK_THREADS 256

// Block-wide exclusive prefix sum of one int per thread.
// Thread `threadid` supplies `data` and receives the sum of the values supplied
// by threads 0 .. threadid-1 (0 for thread 0). Every one of the BLOCK_THREADS
// threads of the block must call it, because it contains barriers.
__device__ int prefixsum(int threadid, int data)
{
    // Two halves of BLOCK_THREADS ints each, used as ping-pong buffers.
    __shared__ int temp[BLOCK_THREADS*2];

    // Load the input shifted right by one slot: slot 0 becomes 0 and the last
    // thread's value is dropped, which makes the result exclusive.
    if (threadid != BLOCK_THREADS-1)
        temp[threadid+1] = data;
    else
        temp[0] = 0;
    __syncthreads();

    int latest = 0;   // half that holds the most recent partial sums
    for (int offset = 1; offset < BLOCK_THREADS; offset *= 2)
    {
        const int* src = temp + latest * BLOCK_THREADS;
        latest = 1 - latest;
        int* dst = temp + latest * BLOCK_THREADS;

        int value = src[threadid];
        if (threadid >= offset)
            value += src[threadid - offset];
        dst[threadid] = value;
        __syncthreads();
    }
    return temp[latest * BLOCK_THREADS + threadid];
}
// Tests one pixel per thread and appends the passing pixels of each block to a
// single global list.
// A block-wide exclusive prefix sum gives every passing thread its position
// within the block. The last thread then knows the block total, caps it at 64,
// and reserves that many consecutive entries of pPoints through one atomicAdd on
// *pGlobalCount. Threads whose position is 64 or more discard their point.
__global__ void gpu_kernel(int w, int h, RtmPoint *pPoints, int *pGlobalCount)
{
    __shared__ int write_base;   // first pPoints entry reserved for this block

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int threadid = threadIdx.y * blockDim.x + threadIdx.x;

    // 1 when the pixel lies inside the image and passes the test, else 0.
    int valid = 0;
    if (x<w && y<h && test_pixel(x,y))
        valid = 1;

    const int index = prefixsum(threadid, valid);

    if (threadid == BLOCK_THREADS-1)
    {
        // Exclusive sum of the last thread plus its own flag is the block total.
        int total = index + valid;
        if (total > 64)
            total = 64; // global output buffer is limited to 64 points per block
        write_base = atomicAdd(pGlobalCount, total); // get a location to write them out
    }
    __syncthreads(); // ensure write_base is valid for all threads

    if (valid)
    {
        RtmPoint point;
        point.x = x;
        point.y = y;
        if (index < 64)
            pPoints[write_base + index] = point;
    }
}

Pointer arithmetic with shared memory

I don't understand what exactly happens in the following lines:
unsigned char *membershipChanged = (unsigned char *)sharedMemory; and
float *clusters = (float *)(sharedMemory + blockDim.x);
I assume that in #1 sharedMemory is effectively renamed into membershipChanged, but why would you add the blockDim to the sharedMemorypointer. Where does this address point?
sharedMemory was created with extern __shared__ char sharedMemory[];
The code I found in a CUDA kmeans implementation.
void find_nearest_cluster(int numCoords,
int numObjs,
int numClusters,
float *objects, // [numCoords][numObjs]
float *deviceClusters, // [numCoords][numClusters]
int *membership, // [numObjs]
int *intermediates)
{
extern __shared__ char sharedMemory[];
// The type chosen for membershipChanged must be large enough to support
// reductions! There are blockDim.x elements, one for each thread in the
// block.
unsigned char *membershipChanged = (unsigned char *)sharedMemory;
float *clusters = (float *)(sharedMemory + blockDim.x);
membershipChanged[threadIdx.x] = 0;
// BEWARE: We can overrun our shared memory here if there are too many
// clusters or too many coordinates!
for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
for (int j = 0; j < numCoords; j++) {
clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
}
}
.....
sharedMemory + blockDim.x points blockDim.x bytes away from the base of the shared memory region.
The reason you might do something like this is to suballocate in shared memory. The launch site of the kernel which includes find_nearest_cluster dynamically allocates some amount of shared storage for the kernel. The code implies that two logically different arrays reside in the shared storage pointed to by sharedMemory -- membershipChanged, and clusters. The pointer arithmetic is simply a means to get a pointer to the second array.