CUDA: Writing to Global memory slow when in if statement - cuda

I have two kernels:
// Baseline variant: every thread stores -1 into its (i, j, k) slot of
// finalOutputPtr unconditionally; the threshold test further down guards an
// empty body, so nothing observable depends on the peak data.
// Thread mapping: i, j and k are the global x, y and z thread indices.
// heatMapPtr, mapIdxPtr, TOTAL_BODY_PARTS, heatmapWidth and heatmapHeight are
// not used in this body.
// NOTE(review): there is no bounds check on i, j or k -- presumably the launch
// grid matches the buffer extents exactly; confirm at the call site.
template <typename T>
__global__ void bpcKernel(T* finalOutputPtr, const T* heatMapPtr, const T* peaksPtrA, const T* peaksPtrB, const unsigned int* bodyPartPairsPtr, const unsigned int* mapIdxPtr, const int POSE_MAX_PEOPLE, const int TOTAL_BODY_PARTS, const int heatmapWidth, const int heatmapHeight)
{
const auto i = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto j = (blockIdx.y * blockDim.y) + threadIdx.y;
const auto k = (blockIdx.z * blockDim.z) + threadIdx.z;
// Each peak record is 3 values wide; the part index for pair i comes from
// bodyPartPairsPtr[2*i] (side A) and bodyPartPairsPtr[2*i + 1] (side B).
const T* bodyPartA = peaksPtrA + (bodyPartPairsPtr[i*2]*POSE_MAX_PEOPLE*3 + j*3);
const T* bodyPartB = peaksPtrB + (bodyPartPairsPtr[i*2 + 1]*POSE_MAX_PEOPLE*3 + k*3);
// Unconditional store to the flattened [i][j][k] output slot.
finalOutputPtr[i*POSE_MAX_PEOPLE*POSE_MAX_PEOPLE + j*POSE_MAX_PEOPLE + k] = -1;
// Third value of each record compared against 0.05 (presumably a confidence
// score -- TODO confirm). The branch body is intentionally empty here.
if(bodyPartA[2] >= 0.05 && bodyPartB[2] >= 0.05){
//finalOutputPtr[i*POSE_MAX_PEOPLE*POSE_MAX_PEOPLE + j*POSE_MAX_PEOPLE + k] = -1;
}
}
This one computes an if statement, but all threads write to the finalOutputPtr
// Guarded variant: a thread stores -1 into its (i, j, k) slot of finalOutputPtr
// only when the third value of both selected peak records is >= 0.05.
// The store therefore depends on two loads from bodyPartPairsPtr and on one
// load through each of the derived bodyPartA / bodyPartB pointers.
// Thread mapping: i, j and k are the global x, y and z thread indices.
// heatMapPtr, mapIdxPtr, TOTAL_BODY_PARTS, heatmapWidth and heatmapHeight are
// not used in this body.
// NOTE(review): there is no bounds check on i, j or k -- presumably the launch
// grid matches the buffer extents exactly; confirm at the call site.
template <typename T>
__global__ void bpcKernel(T* finalOutputPtr, const T* heatMapPtr, const T* peaksPtrA, const T* peaksPtrB, const unsigned int* bodyPartPairsPtr, const unsigned int* mapIdxPtr, const int POSE_MAX_PEOPLE, const int TOTAL_BODY_PARTS, const int heatmapWidth, const int heatmapHeight)
{
const auto i = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto j = (blockIdx.y * blockDim.y) + threadIdx.y;
const auto k = (blockIdx.z * blockDim.z) + threadIdx.z;
// Each peak record is 3 values wide; the part index for pair i comes from
// bodyPartPairsPtr[2*i] (side A) and bodyPartPairsPtr[2*i + 1] (side B).
const T* bodyPartA = peaksPtrA + (bodyPartPairsPtr[i*2]*POSE_MAX_PEOPLE*3 + j*3);
const T* bodyPartB = peaksPtrB + (bodyPartPairsPtr[i*2 + 1]*POSE_MAX_PEOPLE*3 + k*3);
//finalOutputPtr[i*POSE_MAX_PEOPLE*POSE_MAX_PEOPLE + j*POSE_MAX_PEOPLE + k] = -1;
// 0.05 is a double literal, so the comparison is carried out after the usual
// arithmetic conversions between T and double.
if(bodyPartA[2] >= 0.05 && bodyPartB[2] >= 0.05){
// Conditional store to the flattened [i][j][k] output slot.
finalOutputPtr[i*POSE_MAX_PEOPLE*POSE_MAX_PEOPLE + j*POSE_MAX_PEOPLE + k] = -1;
}
}
This kernel does the same operation, but only writes when those two conditions are satisfied.
But for some reason, the 2nd kernel takes 6 ms longer to compute. It's almost 4 times slower. Why is this the case?

Albeit the difference in code may seem minor, the two kernels you have here perform some very different computations if you think about it. The first kernel just uniformly fills a buffer with -1 (the compiler can and will just optimize away the loads from bodyPartPairsPtr since no observable behavior depends on their result). The second kernel loads two unsigned int from memory which are then used as an offset to load two further values, depending on which it will write or not write a -1 to the buffer. So while the first kernel just performs a single, potentially perfectly coalesced, store, the second kernel performs four loads and a dependent store. And that is ignoring details such as that it will also need two additional constant memory loads to fetch the additional kernel parameters, which are not used in the first kernel. From that perspective, it should be no surprise that the second kernel is slower; it simply produces a lot more memory transfer.
As always with performance questions, there is only one way to find the answer: profiling. But if you go ahead and profile your kernel, I would expect you to find it limited by memory transfer. And most likely you will see exactly an about 4× difference in memory transfer which will explain your results…

Related

CUDA in-situ memory race issue for algorithms such as convolution or morphological dilation

I wrote a dilation kernel in CUDA and it works well when my input and my output images are different buffers, but I am facing what I understand to be a memory race issue when I call my kernel in an in-situ case, i.e. the input and the output buffers point to the same memory location.
I tried :
a. using cooperative groups,
b. using a mutex and an atomic addition, as suggested in this paper and in several sources on the web,
c. using a lock-free inter-block synchronization, the synchronization proposed in this same paper.
All my attempts failed because :
a. did not work because my input buffer is a const pointer and I have a compilation error when I have to cast it into a void* parameter (which makes sense), so I could not go further.
b. did not work because I faced a weird behaviour: I have 16x16 blocks, each with 32x32 threads. Synchronizing the blocks should increase the mutex to 256, but the program blocks after 48 atomic additions.
c. did not work because there seems to be no inter-block synchronization, although the code I took directly from the paper looks correct to me. I could slightly reduce the race effect by adding some __syncthreads() calls.
This is the dilation function ;
// Dilation kernel: each thread stages part of a padded input tile into dynamic
// shared memory, waits at a barrier, then writes to pOutBuf[idx] the maximum of
// the shared-tile values selected by the structuring-element offsets.
//
// Parameters (as used in this body):
//   imgSizeInfo        expanded by SPLIT_SIZES_FROM_STRUCT; the body then uses
//                      sizeX, sizeY, sizeZ, sizeC and sizeT.
//   syncArrayIn/Out    flag arrays handed to __gpu_sync2 in the in-situ case.
//   localSizeX/Y       extents of the shared tile (row pitch is localSizeX).
//   borderPolicyType,
//   outOfImageValue    forwarded to copyDataToSharedMemory2d.
//   seInfo             provides _paddingX/_paddingY, _nbOffsets and the
//                      _pOffsetsX/_pOffsetsY arrays.
//   pInBuf, pOutBuf    input and output images; they may alias (in-situ).
//
// The dynamic shared-memory size is supplied by the launch; this body indexes
// it as T elements with pitch localSizeX.
// NOTE(review): in the in-situ case correctness relies on __gpu_sync2 really
// holding back every block until all blocks have finished reading pInBuf.
template <typename T>
__global__ void GenericDilate2dImg_knl(const ImageSizeInfo imgSizeInfo,
volatile int* syncArrayIn, volatile int* syncArrayOut,
const unsigned long localSizeX, const unsigned long localSizeY,
const int borderPolicyType, const T outOfImageValue,
const struct StructuringElementInfo seInfo,
const T* pInBuf, T* pOutBuf)
{
// Extract sizeX, sizeY, etc. from imgSizeInfo
SPLIT_SIZES_FROM_STRUCT(imgSizeInfo)
// Declare the shared buffer pSharedBuf (raw bytes reinterpreted as T)
extern __shared__ char pSharedMem[];
T* pSharedBuf = reinterpret_cast<T*>(pSharedMem);
// Global pixel coordinates and plane index of this thread
const unsigned long x = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned long y = blockDim.y * blockIdx.y + threadIdx.y;
const unsigned long planIdx = blockDim.z * blockIdx.z + threadIdx.z;
const unsigned long nbPlans = sizeZ * sizeC * sizeT;
// Flattened index of this thread's output pixel
const unsigned long idx = x + y * sizeX + planIdx * sizeX*sizeY;
// Copy the input image data into shared memory.
// The guard compares each index against the launched grid extent, so it holds
// for every launched thread: all threads take part in the copy.
if (x < blockDim.x * gridDim.x && y < blockDim.y * gridDim.y && planIdx < blockDim.z * gridDim.z) {
copyDataToSharedMemory2d(pInBuf, sizeX, sizeY, planIdx,
localSizeX, localSizeY,
seInfo._paddingX, seInfo._paddingY,
borderPolicyType, outOfImageValue,
pSharedBuf);
}
// Wait to ensure that the copy is terminated.
// The branch condition is the same for every thread of the launch.
if (pInBuf == pOutBuf) {
// Grid synchronization for in-situ case
//__gpu_sync(gridDim.x * gridDim.y); // Use a mutex
__gpu_sync2(1, syncArrayIn, syncArrayOut); // Use a lock-free barrier
}
else
// The input and output buffers point to different data
// -> we simply need to synchronize the threads inside the block
__syncthreads();
// Compute the maximum for pixels inside the image
if (x < sizeX && y < sizeY && planIdx < nbPlans) {
// The running maximum starts at 0, so the stored result is never below 0.
T vMax = 0;
for (unsigned int curCoefIdx = 0; curCoefIdx < seInfo._nbOffsets; ++curCoefIdx) {
// Position inside the padded shared tile for this structuring-element offset
const unsigned int sx = threadIdx.x + seInfo._paddingX + seInfo._pOffsetsX[curCoefIdx];
const unsigned int sy = threadIdx.y + seInfo._paddingY + seInfo._pOffsetsY[curCoefIdx];
const unsigned long sidx = sx + sy * localSizeX;
const T curVal = pSharedBuf[sidx];
vMax = (vMax > curVal ? vMax : curVal);
}
// Store the maximum (no rounding is performed here)
pOutBuf[idx] = vMax;
}
}
My function to copy from global to shared memory is :
// Fills the block's shared tile (localSizeX x localSizeY, row pitch localSizeX)
// from pInBuf. The tile is swept in steps of the block extent, so each thread
// may load several elements. The global coordinate of tile element (0, 0) is
// the block origin shifted by (-paddingX, -paddingY); every read goes through
// getPixel2d, which receives borderPolicyType and outOfImageValue.
// No barrier is issued here; the caller synchronizes before reading the tile.
template <typename T>
__device__ void copyDataToSharedMemory2d(const T* pInBuf,
    const unsigned long sizeX, const unsigned long sizeY, const unsigned long planIdx,
    const unsigned long localSizeX, const unsigned long localSizeY,
    const int paddingX, const int paddingY,
    const int borderPolicyType, const T outOfImageValue,
    T* pSharedBuf)
{
    // Global position of this thread, and its position inside the block.
    const int globalX = blockDim.x * blockIdx.x + threadIdx.x;
    const int globalY = blockDim.y * blockIdx.y + threadIdx.y;
    const int inBlockX = threadIdx.x;
    const int inBlockY = threadIdx.y;

    for (int stepY = 0; stepY < localSizeY; stepY += blockDim.y) {
        const int tileY = inBlockY + stepY;
        // Rows past the end of the tile contribute nothing.
        if (!(tileY < localSizeY)) {
            continue;
        }
        const int srcY = globalY + stepY - paddingY;

        for (int stepX = 0; stepX < localSizeX; stepX += blockDim.x) {
            const int tileX = inBlockX + stepX;
            if (!(tileX < localSizeX)) {
                continue;
            }
            const int srcX = globalX + stepX - paddingX;
            const int tileIdx = tileX + tileY * localSizeX;
            pSharedBuf[tileIdx] = getPixel2d(pInBuf, sizeX, sizeY, srcX, srcY, planIdx,
                                             borderPolicyType, outOfImageValue);
        }
    }
}
Where getPixel2d allows me to manage the data out of the image:
// Applies the border policy to one out-of-range coordinate.
// Returns false for policy 0 (the caller must yield the constant value).
// Otherwise returns true; `coord` is clamped for policy 1, mirrored for
// policy 2, and left unchanged for any other policy value.
__device__ __forceinline__ bool resolveBorderCoord2d(const int borderPolicyType,
                                                     const unsigned long size,
                                                     int& coord)
{
    const int c = coord;
    if (borderPolicyType == 0) {
        return false;
    }
    if (borderPolicyType == 1) {
        // Replicate the nearest border pixel.
        if (c < 0)
            coord = 0;
        else
            coord = size - 1;
    }
    else if (borderPolicyType == 2) {
        // Mirror around the border.
        if (c < 0)
            coord = -(c + 1);
        else
            coord = size - ((c - size) + 1);
    }
    return true;
}

// Reads pixel (x, y) of plane z from pInBuf (row pitch sizeX, plane pitch
// sizeX * sizeY). A coordinate outside [0, size) is handled per axis by
// borderPolicyType: 0 -> return outOfImageValue, 1 -> clamp to the border,
// 2 -> mirror. x is resolved before y.
template <typename T>
__device__
T getPixel2d(const T* pInBuf,
    const unsigned long sizeX, const unsigned long sizeY,
    const int x, const int y, const int z,
    const int borderPolicyType, const T outOfImageValue)
{
    int col = x;
    if (x < 0 || x >= sizeX) {
        if (!resolveBorderCoord2d(borderPolicyType, sizeX, col)) {
            return outOfImageValue;
        }
    }

    int row = y;
    if (y < 0 || y >= sizeY) {
        if (!resolveBorderCoord2d(borderPolicyType, sizeY, row)) {
            return outOfImageValue;
        }
    }

    return pInBuf[col + row * sizeX + z * sizeX * sizeY];
}
and now, here are my inter-block synchronization functions :
// Using a mutex
// Global arrival counter shared by all blocks. It is only ever incremented in
// this snippet; nothing here resets it between kernel launches.
__device__ volatile int g_mutex;
// Counter-based inter-block barrier: one thread per block increments g_mutex
// and then spins until the counter reaches goalVal; the rest of the block waits
// at __syncthreads().
// The spin loop only ends once at least goalVal increments have happened, i.e.
// once that many blocks have actually started executing this function.
__device__ void __gpu_sync(int goalVal) {
//thread ID in a block (uses threadIdx.x and threadIdx.y only)
int tid_in_block = threadIdx.x * blockDim.y + threadIdx.y;
// only thread 0 is used for synchronization
if (tid_in_block == 0) {
atomicAdd((int*)&g_mutex, 1);
// Debug trace; g_mutex is re-read here, so the printed value may already
// include increments made by other blocks.
printf("[%d] %d Vs %d\n", blockIdx.x * gridDim.y + blockIdx.y, g_mutex, goalVal);
//only when all blocks add 1 to g_mutex
//will g_mutex equal to goalVal
while (g_mutex </*!=*/ goalVal) {
;//Do nothing here
}
}
// Hold the remaining threads of the block until thread 0 leaves the spin loop.
__syncthreads();
}
// Lock-free barrier
// Flag-array inter-block barrier.
//   1. Thread 0 of every block sets Arrayin[bid] = goalVal (arrival).
//   2. The block with bid == 1 acts as collector: its thread t (for
//      t < nBlockNum) spins until Arrayin[t] == goalVal, then, after a block
//      barrier, sets Arrayout[t] = goalVal (release).
//   3. Thread 0 of every block spins until Arrayout[bid] == goalVal.
// Consequences visible from this code:
//   - the collector needs at least nBlockNum threads, otherwise entries beyond
//     its thread count are never checked nor released;
//   - a block with bid == 1 must exist and run, otherwise nothing writes
//     Arrayout;
//   - every block must actually be executing for step 2 to complete;
//   - no memory fence is issued besides __syncthreads().
// NOTE(review): Arrayin/Arrayout are presumably cleared (or goalVal changed)
// between uses -- nothing here resets them; confirm at the call site.
__device__ void __gpu_sync2(int goalVal, volatile int* Arrayin, volatile int* Arrayout) {
// thread ID in a block (uses threadIdx.x and threadIdx.y only)
int tid_in_blk = threadIdx.x * blockDim.y + threadIdx.y;
// number of blocks and block ID, computed from the x and y grid dimensions only
int nBlockNum = gridDim.x * gridDim.y;
int bid = blockIdx.x * gridDim.y + blockIdx.y;
// only thread 0 is used for synchronization
if (tid_in_blk == 0) {
Arrayin[bid] = goalVal;
}
// Collector block: wait for every arrival flag, then publish every release flag
if (bid == 1) {
if (tid_in_blk < nBlockNum) {
while (Arrayin[tid_in_blk] != goalVal) {
;//Do nothing here
}
}
// All arrival flags have been observed by the collector's threads
__syncthreads();
if (tid_in_blk < nBlockNum) {
Arrayout[tid_in_blk] = goalVal;
}
}
// Wait for this block's release flag
if (tid_in_blk == 0) {
while (Arrayout[bid] != goalVal) {
;//Do nothing here
}
}
// Hold the remaining threads of the block until thread 0 has been released
__syncthreads();
}
The image I get for in-situ calculation is :
I used an 11x15 structuring element, and the size of the shared buffer is (nbThreadsPerBlock+2*paddingX) * (nbThreadsPerBlock+2*paddingY). The wrong result (shown by the arrows) appears at the top of some blocks, but always at the same location and with the same values. I'd expect a more random result for a memory race effect...
Is there a better approach to manage in-situ calculation or any reason that would prevent the grid synchronization to work?
EDIT
The size of the image I used is 510x509 and I run my code on a NVidia Quadro RTX 5000.
I would normally suggest minimal reproducible example for a question like this, as well as an indication of the GPU you are running on, but we can probably proceed without that. In short, what you are trying to do will not work reliably, as you've already discovered.
You have chosen a thread strategy of assigning one thread in your grid per output point:
pOutBuf[idx] = vMax;
which is sensible and fine. I imagine based on this:
I have 16x16 blocks, each with 32x32 threads.
that your input images are 512x512 (16x32 threads in each direction, one thread per output point).
And as you've already stated, you have 256 blocks (each of 1024 threads) in your grid. Furthermore, for the in-situ case, we can simplify your kernel to the following pseudo-code:
__global__ void GenericDilate2dImg_knl(...){
read_in_image();
grid_wide_sync();
write_out_image();
}
For such a methodology to work, then, the read_in_image() step must be able to read the entire image, before any writing occurs. However your methodology will not work in the general case, and evidently not on your specific GPU, either. In order to read in the entire image as per above, we must have every threadblock in the grid simultaneously resident on the SMs in your GPU. All 256 blocks need to be deposited, and running on an SM. But the GPU provides no inherent guarantees of such a thing. If your GPU has, for example 24 SMs in it, each of which can hold a maximum of 2048 threads, then your GPU would have a "running" or "instantaneous" capacity of 24*2048 threads, or 48 of your threadblocks. There would not be enough room for all 256 threadblocks to be running. Not only does your algorithm depend on that, but all 3 of your grid sync methods depend on that notion as well.
The fact that your 2nd grid sync method stops after 48 "atomic additions" suggested the example numbers above to me. It's a plausible proximal explanation for why that method may have failed that way: your GPU only allowed 48 of your threadblocks to be resident, and the other 208 threadblocks were waiting in the wings, not yet deposited on any SM, and therefore not allowing any of their threads to run. Those threads in those 208 threadblocks need to run to pick up the relevant input data, as well as to satisfy the requirements of the grid-wide sync. But they are not running, because they are waiting for room to open up on a SM. And room never opens up on a SM, because the full SMs have threadblocks that are waiting at the grid sync point. So you have deadlock.
This problem is not easily solvable in the general case. Any grid sync mechanism, including cooperative groups, has an inherent requirement that all threadblocks be actually simultaneously schedulable on your particular GPU. Therefore in the general case, where we don't know the data set size or the GPU we will be running on, the problem is quite difficult.
One possible approach is to divide your input data set into regions, and have your kernel process a region at a time. This may require multiple grid syncs, one to handle the in/out division in each region, and one to handle the progression of the kernel as it steps through regions. You would also have to handle the region edges carefully.
Another possible approach, if you know the specifics of the data set size and the GPU you are running on, is just to make sure you are running on a GPU "large enough" to handle the data set size. For example, an A100 GPU could probably have as many as 216 blocks simultaneously resident, so for that case you could handle a somewhat smaller image size, perhaps 14x32=448 in both the height and width dimensions.
Given that these approaches for in-place or in-situ work for this particular example require considerable complexity, I personally would be strongly motivated to use the methodology where output is different than input. That approach will likely run noticeably quicker as well. A grid wide sync is not a "free" construct from a performance perspective.

Udacity parallel programming, unspecified launch failure cudaGetLastError()

I am trying to complete homework #2 for the Udacity parallel programming course. I have run into a CUDA error that I just can't get around. The error is encountered when I launch a kernel that is meant to separate an image in the format "RGBRGBRGB" into three separate arrays of "RRR", "GGG" and "BBB". Since the error "unspecified launch failure" does not give me anything specific to go on, I am not sure how to troubleshoot my issue.
Here is the "main" function called to start the entire process. I left out the rest after the error is encountered so that I don't post the rest of my work for someone to find later.
void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_inputImageRGBA, uchar4* const d_outputImageRGBA, const size_t numRows, const size_t numCols,
unsigned char *d_redBlurred,
unsigned char *d_greenBlurred,
unsigned char *d_blueBlurred,
const int filterWidth)
{
// Maximum number of threads per block = 512; do this
// to keep this compatable with CUDa 5 and lower
// MAX > threadsX * threadsY * threadsZ
int MAXTHREADSx = 16;
int MAXTHREADSy = 16; // 16 x 16 x 1 = 512
// We want to fill the blocks so we don't waste this blocks threads
// I wonder if blocks can intermix in a physical core?
// Either way this method makes things "clean"; one thread per px
int nBlockX = numCols / MAXTHREADSx + 1;
int nBlockY = numRows / MAXTHREADSy + 1;
const dim3 blockSize(MAXTHREADSx, MAXTHREADSy, 1);
const dim3 gridSize(nBlockX, nBlockY, 1);
separateChannels<<<gridSize, blockSize>>>(
h_inputImageRGBA,
numRows,
numCols,
d_red,
d_green,
d_blue);
// Call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
// launching your kernel to make sure that you didn't make any mistakes.
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
And here is the function separateChannels
// Splits a uchar4 image into three single-channel images: the .x, .y and .z
// components of each pixel go to redChannel, greenChannel and blueChannel at
// the same row-major position. The fourth component is not copied.
// One thread per pixel on a 2D launch; threads that fall outside the
// numCols x numRows image do nothing.
__global__
void separateChannels(const uchar4* const inputImageRGBA,
                      int numRows,
                      int numCols,
                      unsigned char* const redChannel,
                      unsigned char* const greenChannel,
                      unsigned char* const blueChannel)
{
    const int px = blockIdx.x * blockDim.x + threadIdx.x;
    const int py = blockIdx.y * blockDim.y + threadIdx.y;

    if (px < numCols && py < numRows) {
        const int flat = py * numCols + px;
        const uchar4 pixel = inputImageRGBA[flat];

        redChannel[flat]   = pixel.x;
        greenChannel[flat] = pixel.y;
        blueChannel[flat]  = pixel.z;
    }
}
I think I put in anything necessary, please let me know if not.
Without seeing the rest of the code I cannot tell for sure, but I believe you are sending pointer to host memory as a parameter to cuda kernel - not a good thing to do. In kernel launch you are sending in a h_inputImageRGBA while I believe you want to send in a d_inputImageRGBA.
Typically h_ prefix stands for host memory while d_ represents device.

Is 1D texture memory access faster than 1D global memory access?

I am measuring the difference between the standard and 1Dtexture access to memory. To do so I have created two kernels
// Copies `size` floats from the linear memory bound to `texreference` into
// doarray, one element per thread, reading through tex1Dfetch.
// Launch: 1D grid with at least `size` threads in total. The grid is normally
// rounded up to a whole number of blocks, so threads with index >= size must
// not touch memory -- hence the bounds check.
__global__ void texture1D(float* doarray,int size)
{
    // global index of this thread
    const int index = blockIdx.x*blockDim.x+threadIdx.x;
    if (index < size)
    {
        // fetch global memory through the texture reference
        doarray[index]=tex1Dfetch(texreference,index);
    }
}
// Copies `size` floats from diarray to doarray, one element per thread, using
// plain global-memory loads.
// Launch: 1D grid with at least `size` threads in total. The grid is normally
// rounded up to a whole number of blocks, so threads with index >= size must
// not touch memory -- hence the bounds check.
__global__ void standard1D(float* diarray, float* doarray, int size)
{
    // global index of this thread
    const int index = blockIdx.x*blockDim.x+threadIdx.x;
    if (index < size)
    {
        // direct global-memory read (no texture path involved)
        doarray[index]= diarray[index];
    }
}
Then, I call each kernel, measuring the time it takes:
//copy array from host to device memory
checkCuda( cudaMemcpy(diarray,harray,sizeof(float)*size,cudaMemcpyHostToDevice) );
// integer ceil-division: a float-based ceil loses precision for large sizes
const int numBlocks = (size + threadSize - 1) / threadSize;
checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );
//bind texture reference with linear memory
// (done BEFORE the start event so that only the kernel is timed)
checkCuda( cudaBindTexture(0,texreference,diarray,sizeof(float)*size) );
checkCuda( cudaEventRecord(startEvent, 0) );
//execute device kernel
texture1D<<<numBlocks,threadSize>>>(doarray,size);
checkCuda( cudaGetLastError() );   // surfaces launch-configuration errors
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
//unbind texture reference to free resource (outside the timed region)
checkCuda( cudaUnbindTexture(texreference) );
//copy result array from device to host memory
checkCuda( cudaMemcpy(horray,doarray,sizeof(float)*size,cudaMemcpyDeviceToHost) );
//check result
checkResutl(horray, harray, size);
cudaEvent_t startEvent2, stopEvent2;
checkCuda( cudaEventCreate(&startEvent2) );
checkCuda( cudaEventCreate(&stopEvent2) );
checkCuda( cudaEventRecord(startEvent2, 0) );
standard1D<<<numBlocks,threadSize>>>(diarray,doarray,size);
checkCuda( cudaGetLastError() );   // surfaces launch-configuration errors
checkCuda( cudaEventRecord(stopEvent2, 0) );
checkCuda( cudaEventSynchronize(stopEvent2) );
//copy back to CPU
checkCuda( cudaMemcpy(horray,doarray,sizeof(float)*size,cudaMemcpyDeviceToHost) );
and print results:
// Elapsed times of the two measured regions; cudaEventElapsedTime reports
// milliseconds.
float time,time2;
checkCuda( cudaEventElapsedTime(&time, startEvent, stopEvent) );
checkCuda( cudaEventElapsedTime(&time2, startEvent2, stopEvent2) );
// bytes * 1e-6 / ms == (bytes / 1e9) / (ms / 1e3), i.e. GB/s.
// NOTE(review): `bytes` is defined outside this snippet -- confirm whether it
// counts the read traffic only or read + write traffic.
printf("Texture bandwidth (GB/s): %f\n",bytes * 1e-6 / time);
printf("Standard bandwidth (GB/s): %f\n",bytes * 1e-6 / time2);
It turns out that, no matter the size of the array I allocate (size), the standard bandwidth is always much higher.
Is that how it is supposed to be, or am I screwing it up at some point?
My understanding of Texture memory access was that it can speed up global memory access.
I have made a comparison between global memory and texture memory (used for caching purposes only, and not for filtering) for the interpolation of a 1D complex valued function.
The kernels I'm comparing are the 4, 2 using global memory and 2 using texture memory. They are distinguished according to the way complex values are accessed (1 float2 or 2 floats) and are reported below. I will post somewhere the full Visual Studio 2010 in case someone like to make some criticisms or perform his own testing.
// Linear interpolation over data stored as interleaved float pairs, read from
// global memory. Thread j produces result_d[j]; the parity of j selects which
// member of the pair is interpolated, and the sample position comes from
// x_out_d[j/2] shifted by M/2 (integer division).
// Launch: 1D grid covering at least N threads.
__global__ void linear_interpolation_kernel_function_GPU(float* __restrict__ result_d, const float* __restrict__ data_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= N)
        return;

    const float pos  = x_out_d[tid/2]+M/2;
    const int   base = __float2int_rz(pos);          // truncated sample index
    const float frac = pos - __int2float_rz(base);   // interpolation weight
    const int   part = tid & 1;

    const float lo = data_d[2*base+part];
    const float hi = data_d[2*base+2+part];

    result_d[tid] = frac * hi + (-lo * frac + lo);
}
// Linear interpolation over float2 samples read from global memory. Thread j
// produces result_d[j] (both components) from data_d[k] and data_d[k+1], where
// k is the truncated value of x_out_d[j] + M/2 (integer division).
// Launch: 1D grid covering at least N threads.
__global__ void linear_interpolation_kernel_function_GPU_alternative(float2* __restrict__ result_d, const float2* __restrict__ data_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= N)
        return;

    const float pos  = x_out_d[tid]+M/2;
    const int   base = __float2int_rz(pos);          // truncated sample index
    const float frac = pos - __int2float_rz(base);   // interpolation weight

    const float2 lo = data_d[base];
    const float2 hi = data_d[base+1];

    result_d[tid].x = frac * hi.x + (-lo.x * frac + lo.x);
    result_d[tid].y = frac * hi.y + (-lo.y * frac + lo.y);
}
// Linear interpolation over float2 samples fetched through the texture
// reference data_d_texture. Thread j produces result_d[j] (both components)
// from texels k and k+1, where k is the truncated value of
// x_out_d[j] + M/2 (integer division).
// Launch: 1D grid covering at least N threads.
__global__ void linear_interpolation_kernel_function_GPU_texture(float2* __restrict__ result_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= N)
        return;

    const float pos  = x_out_d[tid]+M/2;
    const int   base = __float2int_rz(pos);          // truncated sample index
    const float frac = pos - __int2float_rz(base);   // interpolation weight

    const float2 lo = tex1Dfetch(data_d_texture,base);
    const float2 hi = tex1Dfetch(data_d_texture,base+1);

    result_d[tid].x = frac * hi.x + (-lo.x * frac + lo.x);
    result_d[tid].y = frac * hi.y + (-lo.y * frac + lo.y);
}
// Linear interpolation over interleaved float pairs fetched through the texture
// reference data_d_texture2. Thread j produces result_d[j]; the parity of j
// selects which member of the pair is interpolated, and the sample position
// comes from x_out_d[j/2] shifted by M/4 (integer division).
// Launch: 1D grid covering at least N threads.
__global__ void linear_interpolation_kernel_function_GPU_texture_alternative(float* __restrict__ result_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= N)
        return;

    const float pos  = x_out_d[tid/2]+M/4;
    const int   base = __float2int_rz(pos);          // truncated sample index
    const float frac = pos - __int2float_rz(base);   // interpolation weight
    const int   part = tid & 1;

    const float lo = tex1Dfetch(data_d_texture2,2*base+part);
    const float hi = tex1Dfetch(data_d_texture2,2*base+2+part);

    result_d[tid] = frac * hi + (-lo * frac + lo);
}
I have considered 4 different GPUs, namely, GeForce GT540M (cc 2.1), Tesla C2050 (cc 2.0), Kepler K20c (cc 3.5) and GT210 (cc 1.2). The results are reported in the figures below. As it can be seen, using textures as cache with older compute capabilities improves over the use of global memory, while the two solutions are pretty equivalent for the newest architecture.
Of course, this example is not exhaustive and there may be in practice other cases when the former or the latter should be preferred for particular applications.
p.s. The processing times are in [ms] and not in [s] as indicated in the figure labels.

Using CUDA Shared Memory to Improve Global Access Patterns

I have the following kernel to get the magnitude of a bunch of vectors:
// Euclidean length of n packed 3-component vectors: thread i reads
// in[3*i .. 3*i+2] and writes the length to out[i].
// Launch: 1D grid covering at least n threads; surplus threads exit early.
__global__ void norm_v1(double *in, double *out, int n)
{
    const uint vecIdx = blockIdx.x * blockDim.x + threadIdx.x;
    if (vecIdx >= n)
        return;

    const double x = in[3*vecIdx];
    const double y = in[3*vecIdx+1];
    const double z = in[3*vecIdx+2];
    out[vecIdx] = sqrt(x*x + y*y + z*z);
}
However due to the packing of in as [x0,y0,z0,...,xn,yn,zn] it performs poorly with the profiler indicating a 32% global load efficiency. Repacking the data as [x0, x1, ..., xn, y0, y1, ..., yn, z0, z1, ..., zn] improves things greatly (with the offsets for x, y, and z changing accordingly). Runtime is down and efficiency is up to 100%.
However, this packing is simply not practical for my application. I therefore wish to investigate the use of shared memory. My idea is for each thread in a block to copy three values (blockDim.x apart) from global memory -- yielding coalesced access. Under the assumption of a maximum blockDim.x = 256 I came up with:
// Size of the shared staging tile in vectors; the kernel below assumes
// blockDim.x does not exceed this value.
#define BLOCKDIM 256
// Shared-memory variant of norm_v1: each thread stages three doubles, spaced
// BLOCKDIM apart, into invec; after the barrier thread t reads the packed
// vector invec[3*t .. 3*t+2] and writes its length to out[i].
// Points visible from the indexing below:
//   - the global offset of the staged loads is i + k*BLOCKDIM, whereas the
//     vectors of block b start at in[3 * b * blockDim.x]; the two only agree
//     for blockIdx.x == 0;
//   - the staged loads are not guarded by n, so threads of the last block can
//     read past the end of `in`.
__global__ void norm_v2(double *in, double *out, int n)
{
__shared__ double invec[3*BLOCKDIM];
const uint i = blockIdx.x * blockDim.x + threadIdx.x;
// Stage three strided elements per thread (executed by every thread)
invec[0*BLOCKDIM + threadIdx.x] = in[0*BLOCKDIM+i];
invec[1*BLOCKDIM + threadIdx.x] = in[1*BLOCKDIM+i];
invec[2*BLOCKDIM + threadIdx.x] = in[2*BLOCKDIM+i];
// Make the staged tile visible to the whole block before it is read
__syncthreads();
if (i < n)
{
// Read this thread's vector from the tile in packed (x, y, z) order
double x = invec[3*threadIdx.x];
double y = invec[3*threadIdx.x+1];
double z = invec[3*threadIdx.x+2];
out[i] = sqrt(x*x + y*y + z*z);
}
}
However this is clearly deficient when n % blockDim.x != 0, requires knowing the maximum blockDim in advance and generates incorrect results for out[i > 255] when tested with an n = 1024. How should I best remedy this?
I think this can solve the out[i > 255] problem:
// Stage this block's 3*blockDim.x doubles into shared memory with coalesced
// loads: the block's vectors start at in[3*blockStart], and thread t loads the
// elements at offsets t, blockDim.x + t and 2*blockDim.x + t from there.
// Requires blockDim.x <= BLOCKDIM and, as written (no bounds checks), n being a
// multiple of blockDim.x.
__shared__ double shIn[3*BLOCKDIM];
const uint blockStart = blockIdx.x * blockDim.x;
shIn[0*blockDim.x+threadIdx.x] = in[ blockStart*3 + 0*blockDim.x + threadIdx.x];
shIn[1*blockDim.x+threadIdx.x] = in[ blockStart*3 + 1*blockDim.x + threadIdx.x];
shIn[2*blockDim.x+threadIdx.x] = in[ blockStart*3 + 2*blockDim.x + threadIdx.x];
// Make the staged tile visible to the whole block before it is read
__syncthreads();
// Thread t now reads its own packed (x, y, z) vector from the tile
double x = shIn[3*threadIdx.x];
double y = shIn[3*threadIdx.x+1];
double z = shIn[3*threadIdx.x+2];
out[blockStart+threadIdx.x] = sqrt(x*x + y*y + z*z);
As for n % blockDim.x != 0 I would suggest padding the input/output arrays with 0 to match the requirement.
If you dislike the BLOCKDIM macro - explore using extern __shared__ shArr[] and then passing 3rd parameter to kernel configuration:
norm_v2<<<gridSize,blockSize,dynShMem>>>(...)
the dynShMem is the dynamic shared memory usage (in bytes). This is extra shared memory pool with its size specified at run-time, where all extern __shared__ variables will be initially assigned to.
What GPU are you using? Fermi or Kepler might help your original code with their L1 caching.
If you don't want to pad your in array, or you end up doing similar trick somewhere else, you may want to consider implementing a device-side memcopy, something like this:
// Block-cooperative copy of numElements objects of type T from source to
// destination, moved as int-sized words with a stride of blockDim.x.
// Must be called by every thread of the block: it ends with __syncthreads(),
// so the copied data is visible to the whole block on return.
// Preconditions: sizeof(T) is a multiple of sizeof(int); both pointers are
// suitably aligned for int access; the block is one-dimensional (only
// threadIdx.x and blockDim.x are used).
template <typename T>
__device__ void memCopy(T* destination, T* source, size_t numElements) {
    // number of int-sized words to move
    const size_t totalSize = numElements*sizeof(T)/sizeof(int);
    int* intDest = (int*)destination;
    int* intSrc = (int*)source;
    // consecutive threads touch consecutive words
    for (size_t i = threadIdx.x; i < totalSize; i += blockDim.x) {
        intDest[i] = intSrc[i];
    }
    __syncthreads();
}
It basically treats any array as an array of int-s and copy the data from one location to another. You may want to replace the underlying int type with double-s or long long int if you work with 64-bit types only.
Then you can replace the copying lines with:
// numElements counts doubles: 3 per vector, for the vectors this block owns
memCopy(invec, in+blockStart*3, 3*min(blockDim.x, n-blockStart));

Shared memory mutex with CUDA - adding to a list of items

My problem is the following: I have an image in which I detect some points of interest using the GPU. The detection is a heavyweight test in terms of processing, however only about 1 in 25 points pass the test on average. The final stage of the algorithm is to build up a list of the points. On the CPU this would be implemented as:
forall pixels x,y
{
if(test_this_pixel(x,y))
vector_of_coordinates.push_back(Vec2(x,y));
}
On the GPU I have each CUDA block processing 16x16 pixels. The problem is that I need to do something special to eventually have a single consolidated list of points in global memory. At the moment I am trying to generate a local list of points in shared memory per block which eventually will be written to global memory. I am trying to avoid sending anything back to the CPU because there are more CUDA stages after this.
I was expecting that I could use atomic operations to implement the push_back function on shared memory. However I am unable to get this working. There are two issues. The first annoying issue is that I am constantly running into the following compiler crash: "nvcc error : 'ptxas' died with status 0xC0000005 (ACCESS_VIOLATION)" when using atomic operations. It is hit or miss whether I can compile something. Does anyone know what causes this?
The following kernel will reproduce the error:
// Minimal reproducer: a single atomicInc on a shared-memory counter.
// None of the kernel parameters are used, and `test` is not initialized before
// the atomic operation.
__global__ void gpu_kernel(int w, int h, RtmPoint *pPoints, int *pCounts)
{
__shared__ unsigned int test;
// atomicInc wraps the counter back to 0 once it reaches the limit (1000 here)
atomicInc(&test, 1000);
}
Secondly, my code which includes a mutex lock on shared memory hangs the GPU and I dont understand why:
// Spin lock acquire: loops until atomicCAS swaps *pmutex from 0 to 1.
// A thread that fails the compare-and-swap keeps retrying, with no back-off,
// until some other thread stores 0 into *pmutex.
__device__ void lock(unsigned int *pmutex)
{
while(atomicCAS(pmutex, 0, 1) != 0);
}
// Spin lock release: atomically stores 0 into *pmutex.
// The previous value is discarded, so ownership is not checked.
__device__ void unlock(unsigned int *pmutex)
{
atomicExch(pmutex, 0);
}
// Per-block point collection guarded by a shared-memory spin lock.
// Every thread whose pixel passes some_test_on_pixel appends its coordinates to
// a shared list of at most 64 entries; after a barrier the block writes the
// list length to pCounts[blockid] and the entries to pPoints[blockid*64 + n].
// Launch: 2D blocks on a 2D grid; x/y are global pixel coordinates.
// NOTE(review): threads of the same block contend for `mutex` inside a
// data-dependent branch while the winner still has to reach unlock(); whether
// that makes progress depends on how the hardware schedules diverged threads.
__global__ void gpu_kernel_non_max_suppress(int w, int h, RtmPoint *pPoints, int *pCounts)
{
__shared__ RtmPoint localPoints[64];
__shared__ int localCount;
__shared__ unsigned int mutex;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
// Linear thread index inside the block and linear block index inside the grid
int threadid = threadIdx.y * blockDim.x + threadIdx.x;
int blockid = blockIdx.y * gridDim.x + blockIdx.x;
// One thread initializes the shared state for the whole block
if(threadid==0)
{
localCount = 0;
mutex = 0;
}
__syncthreads();
if(x<w && y<h)
{
if(some_test_on_pixel(x,y))
{
RtmPoint point;
point.x = x;
point.y = y;
// this is a local push_back operation
lock(&mutex);
if(localCount<64) // we should never get >64 points per block
localPoints[localCount++] = point;
unlock(&mutex);
}
}
// Every append must be finished before the list is copied out
__syncthreads();
if(threadid==0)
pCounts[blockid] = localCount;
// The first localCount threads each copy one entry to the block's 64-entry
// slice of pPoints
if(threadid<localCount)
pPoints[blockid * 64 + threadid] = localPoints[threadid];
}
In the example code at this site, the author manages to successfully use atomic operations on shared memory, so I am confused as to why my case does not function. If I comment out the lock and unlock lines, the code runs ok, but obviously incorrectly adding to the list.
I would appreciate some advice about why this problem is happening and also perhaps if there is a better solution to achieving the goal, since I am concerned anyway about the performance issues with using atomic operations or mutex locks.
I suggest using a prefix sum to implement that part, which increases parallelism. To do that you need a shared array. Basically, an exclusive prefix sum turns the array (1,1,0,1) into (0,1,2,2), with a total of 3; i.e., it calculates a running exclusive sum so that each emitting thread gets its own write index.
__shared__ uint8_t vector[NUMTHREADS];
....
bool emit = (x<w && y<h);
emit = emit && some_test_on_pixel(x,y);
__syncthreads();
scan(emit, vector);
if (emit) {
pPoints[blockid * 64 + vector[TID]] = point;
}
prefix-sum example:
// Block-wide inclusive prefix sum over one value per thread.
// On return, output[t] holds mark(0) + ... + mark(t) for every thread t of the
// block, and the function returns the block total, output[numWorkers-1].
// The partial sums ping-pong between the internal shared buffer and `output`,
// the step width doubling each round.
// Preconditions: called by all numWorkers threads of a one-dimensional block
// (only threadIdx.x is used); `output` has numWorkers elements and is visible
// to every thread of the block (e.g. shared memory).
// Note: the result is inclusive; an exclusive index for thread t is
// output[t] - mark.
template <typename T>
__device__ uint32 scan(T mark, T *output) {
    __shared__ T values[numWorkers];
    const int tid = threadIdx.x;

    T* src = values;   // buffer read in the current round
    T* dst = output;   // buffer written in the current round

    values[tid] = mark;
    __syncthreads();

    for (int offset = 1; offset < numWorkers; offset *= 2) {
        if (tid >= offset) {
            dst[tid] = src[tid-offset] + src[tid];
        }
        else {
            dst[tid] = src[tid];
        }
        // all writes of this round must land before the buffers swap roles
        __syncthreads();
        T* const written = dst;
        dst = src;
        src = written;
    }

    // `src` is the buffer written last; copy it over if it is the internal one
    if (src == values)
        output[tid] = values[tid];
    __syncthreads();
    return output[numWorkers-1];
}
Based on recommendations here, I include the code that I used in the end. It uses 16x16 pixel blocks. Note that I am now writing the data out in one global array without breaking it up. I used the global atomicAdd function to compute a base address for each set of results. Since this only gets called once per block, I did not find too much of a slow down, while I gained a lot more convenience by doing this. I'm also avoiding shared buffers for the input and output of prefix_sum. GlobalCount is set to zero prior to the kernel call.
#define BLOCK_THREADS 256
// Block-wide exclusive prefix sum of one int per thread.
// Returns, for thread `threadid`, the sum of `data` over all threads with a
// smaller id (thread 0 gets 0). The value of the last thread is not part of
// any returned sum.
// Must be called by all BLOCK_THREADS threads of the block with ids
// 0 .. BLOCK_THREADS-1: it contains block-wide barriers.
__device__ int prefixsum(int threadid, int data)
{
    __shared__ int temp[BLOCK_THREADS*2];
    int* readBuf  = temp;                  // partial sums of the previous round
    int* writeBuf = temp + BLOCK_THREADS;  // partial sums of the current round

    // Seed with the input shifted right by one slot; slot 0 becomes 0.
    if (threadid == BLOCK_THREADS-1)
        readBuf[0] = 0;
    else
        readBuf[threadid+1] = data;
    __syncthreads();

    for (int stride = 1; stride < BLOCK_THREADS; stride *= 2)
    {
        int partial = readBuf[threadid];
        if (threadid >= stride)
            partial += readBuf[threadid - stride];
        writeBuf[threadid] = partial;
        __syncthreads();

        // the buffer just written becomes the input of the next round
        int* const justWritten = writeBuf;
        writeBuf = readBuf;
        readBuf = justWritten;
    }
    return readBuf[threadid];
}
// Compacts the coordinates of all pixels that pass test_pixel into pPoints.
// Each block ranks its passing pixels with prefixsum, reserves a contiguous
// range in pPoints with a single atomicAdd on *pGlobalCount, and writes at most
// 64 points into that range.
// Launch: 2D blocks holding exactly BLOCK_THREADS threads on a 2D grid.
// *pGlobalCount is the running number of reserved output slots.
__global__ void gpu_kernel(int w, int h, RtmPoint *pPoints, int *pGlobalCount)
{
    __shared__ int write_base;

    const int px  = blockIdx.x * blockDim.x + threadIdx.x;
    const int py  = blockIdx.y * blockDim.y + threadIdx.y;
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;

    // test_pixel is only evaluated for pixels inside the image
    const int valid = (px < w && py < h && test_pixel(px, py)) ? 1 : 0;

    // rank of this thread among the block's passing pixels
    const int slot = prefixsum(tid, valid);

    if (tid == BLOCK_THREADS-1)
    {
        // the last thread knows the block total: its rank plus its own flag
        const int found = slot + valid;
        // global output buffer is limited to 64 points per block
        write_base = atomicAdd(pGlobalCount, found > 64 ? 64 : found);
    }
    __syncthreads(); // ensure write_base is valid for all threads

    if (valid)
    {
        RtmPoint point;
        point.x = px;
        point.y = py;
        if (slot < 64)
            pPoints[write_base + slot] = point;
    }
}