Questions about compute shader DispatchThreadID? - directcompute

Texture2D<float> InputTex : register( t0 );
RWTexture2D<float> OutputTex : register( u0 );

// Group size
#define size_x 20
#define size_y 20

// Declare one thread for each texel of the input texture.
[numthreads(size_x, size_y, 1)]
void CSMAIN( uint3 DispatchThreadID : SV_DispatchThreadID )
{
    // Third component of Load() is the mip level.
    int3 texturelocation = int3( 0, 0, 0 );
    texturelocation.x = DispatchThreadID.x;
    texturelocation.y = DispatchThreadID.y;

    float Value = InputTex.Load( texturelocation );
    OutputTex[DispatchThreadID.xy] = 2.0f * Value;
}
First question:
In this code, do DispatchThreadID.x and DispatchThreadID.y have the same value, like thread 17 of x and thread 17 of y?
Second question:
Can I write this?
OutputTex[ texturelocation ] = 2.0f * Value;
A yes or no answer is sufficient, but if no, please provide a short explanation why.

Related

NVIDIA CUDA YUV (NV12) to RGB conversion algorithm breakdown

I am trying to modify the original YUV->RGB kernel provided in the sample code of the NVIDIA Video SDK, and I need help understanding some of its parts.
Here is the kernel code:
template<class YuvUnitx2, class Rgb, class RgbIntx2>
__global__ static void YuvToRgbKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pRgb, int nRgbPitch, int nWidth, int nHeight) {
    int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
    int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
    if (x + 1 >= nWidth || y + 1 >= nHeight) {
        return;
    }

    uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch;
    uint8_t* pDst = pRgb + x * sizeof(Rgb) + y * nRgbPitch;

    YuvUnitx2 l0 = *(YuvUnitx2*)pSrc;
    YuvUnitx2 l1 = *(YuvUnitx2*)(pSrc + nYuvPitch);
    YuvUnitx2 ch = *(YuvUnitx2*)(pSrc + (nHeight - y / 2) * nYuvPitch);

    // YuvToRgbForPixel - returns rgba encoded in uint32_t (.d)
    *(RgbIntx2*)pDst = RgbIntx2{
        YuvToRgbForPixel<Rgb>(l0.x, ch.x, ch.y).d,
        YuvToRgbForPixel<Rgb>(l0.y, ch.x, ch.y).d,
    };
    *(RgbIntx2*)(pDst + nRgbPitch) = RgbIntx2{
        YuvToRgbForPixel<Rgb>(l1.x, ch.x, ch.y).d,
        YuvToRgbForPixel<Rgb>(l1.y, ch.x, ch.y).d,
    };
}
Here are my basic assumptions; some of them may be wrong:
1. NV12 has two planes: one for luma (Y) and one for the interleaved chroma (Cb/Cr).
2. The kernel writes 4 pixels (a 2x2 block) at a time.
If assumption 2 is correct, the question is: why are the same chroma (ch) values used for all 4 pixels? And if I am wrong about 2, please explain what exactly happens here.
The chroma planes in NV12 or NV21 are subsampled by a factor of 2 in both dimensions.
For every 2x2 block of output pixels there are 4 luma (Y) samples but only 1 Cb and 1 Cr sample, so all four pixels share the same chroma pair.
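To make that concrete, here is a small sketch of my own (not from the SDK sample; the helper name and the assumption of an 8-bit frame are mine) of how the samples for one 2x2 block of an NV12 frame are typically addressed: the luma plane occupies the first nHeight rows, and the interleaved Cb/Cr pairs start at row nHeight, one pair per 2x2 block.
#include <cstdint>
// Hypothetical helper: fetch the four Y samples and the shared Cb/Cr pair
// for the 2x2 block whose top-left pixel is (x, y), with x and y even.
// Assumes an 8-bit NV12 frame where luma and chroma share the same pitch.
__device__ void LoadNv12Block(const uint8_t* pYuv, int nPitch, int nHeight,
                              int x, int y,
                              uint8_t yOut[4], uint8_t* cbOut, uint8_t* crOut)
{
    const uint8_t* lumaRow0 = pYuv + y * nPitch;
    const uint8_t* lumaRow1 = lumaRow0 + nPitch;
    yOut[0] = lumaRow0[x];     yOut[1] = lumaRow0[x + 1];
    yOut[2] = lumaRow1[x];     yOut[3] = lumaRow1[x + 1];

    // One interleaved CbCr pair serves the whole 2x2 block:
    // chroma row y/2 (stored starting at row nHeight), chroma column x/2.
    const uint8_t* chromaRow = pYuv + (nHeight + y / 2) * nPitch;
    *cbOut = chromaRow[x];         // Cb (x is even, so x == (x/2)*2)
    *crOut = chromaRow[x + 1];     // Cr
}
This is exactly why the kernel above can reuse the single ch value for all four YuvToRgbForPixel calls.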

Rearranging an array in CUDA

I have the following problem that I want to implement on CUDA:
I want to read an array (say "flag[20]"), and based on a certain condition, write indices of this array to another array (say "pindex[]")
Simple code implementation in C can be:
int N = 20;
int flag[N];
int pindex[N];

for (int i = 0; i < N; i++)
    flag[i] = -1;
for (int i = 0; i < N; i += 2)
    flag[i] = 0;
for (int i = 0; i < N; i++)
    pindex[i] = 0;

// operation: count # of times flag != -1 and write those indices in a different array
int pcount1 = 0;
for (int i = 0; i < N; i++)
{
    if (flag[i] != -1)
    {
        pindex[pcount1] = i;
        ++pcount1;
    }
}
How will I implement this in CUDA?
I can use atomicAdd() to count the total number of times my condition is satisfied, but how do I write the indices into a different array? For example, I tried the following:
__global__ void kernel_tryatomic(int N, int* pcount, int* flag, int* pindex)
{
    int tId = threadIdx.x;
    int n = (blockIdx.x*2 + blockIdx.y)*BlockSize + tId;
    if (n > N-1) return;

    if (flag[n] != -1)
    {
        atomicAdd(pcount, 1);
        atomicExch(&pindex[*pcount], n);
        //pindex[*pcount] = n;
    }
}
This code calculates "pcount" correctly, but does not update "pindex" array.
I need help to do this operation on GPUs.
Thanks
Since your condition (flag) is conceptually binary, you can use a binary prefix sum (thoroughly explained here) to determine the position at which each thread with a positive flag should write.
For example, if N is 20, with the help of the __device__ functions below:
__device__ int lanemask_lt(int lane) {
    return (1 << lane) - 1;
}

__device__ int warp_prefix_sum(int lane, int p) {
    const int mask = lanemask_lt( lane );
    int b = __ballot( p );
    return __popc( b & mask );
}
your __global__ function can simply be written like below:
__global__ void kernel_scan(int N, int* pcount, int* flag, int* pindex)
{
    int tId = threadIdx.x;
    if (tId >= N)
        return;

    int threadFlag = ( flag[tId] == -1 ) ? 0 : 1;
    int position_to_write = warp_prefix_sum( tId & (warpSize-1), threadFlag );
    if ( threadFlag )
        pindex[ position_to_write ] = tId;
}
If N is bigger than the warp size (32), you can use intra-block binary prefix sum that is explained in the provided link.
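For arbitrary N, one common pattern (a sketch of my own, not part of the answer above) is warp-aggregated compaction: each warp counts its flagged lanes with a ballot and reserves a contiguous range in pindex with a single atomicAdd. This version uses the modern *_sync warp intrinsics; unlike the sequential CPU loop, it does not guarantee that the collected indices come out in ascending order, and *pcount must be zeroed (e.g. with cudaMemset) before the launch.
// Sketch only: index compaction with warp-aggregated atomics.
// Assumes blockDim.x is a multiple of 32 so every lane of each warp is live;
// out-of-range threads simply contribute a 0 flag.
__global__ void kernel_compact(int N, int* pcount, const int* flag, int* pindex)
{
    int n = blockIdx.x * blockDim.x + threadIdx.x;
    int p = (n < N && flag[n] != -1) ? 1 : 0;

    unsigned ballot = __ballot_sync(0xFFFFFFFFu, p);          // one bit per flagged lane
    int lane        = threadIdx.x & (warpSize - 1);
    int rank        = __popc(ballot & ((1u << lane) - 1));    // flagged lanes before this one
    int warpTotal   = __popc(ballot);                         // flagged lanes in the warp

    int base = 0;
    if (lane == 0 && warpTotal > 0)
        base = atomicAdd(pcount, warpTotal);                  // one reservation per warp
    base = __shfl_sync(0xFFFFFFFFu, base, 0);                 // broadcast the base offset

    if (p)
        pindex[base + rank] = n;
}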

CUDA kernel only works with 1D thread index

There is a weird problem. I have the following code. When I call the first function it does not give the correct result. However, when I call function2 (the second function) it works fine. This is very weird to me. Does anyone have any idea what the problem is? Thanks!!!
__global__ void function(int w, class<double> C, float *result) {
    int r = threadIdx.x + blockIdx.x * blockDim.x;
    int c = threadIdx.y + blockIdx.y * blockDim.y;
    int half_w = w / 2;
    if (r < w && c < w) {
        double dis = sqrt((double)(r - half_w) * (r - half_w) + (double)(c - half_w) * (c - half_w));
        result[c * w + r] = (float)C.getVal(dis);
    }
}

__global__ void function2(int w, class<double> C, float *result) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int half_w = w / 2;
    int r = tid / w;
    int c = tid % w;
    if (r < w && c < w) {
        double dis = sqrt((double)(r - half_w) * (r - half_w) + (double)(c - half_w) * (c - half_w));
        result[c * w + r] = (float)C.getVal(dis);
    }
}
UPDATE:
I use function and function2 to draw an image. The pixel value is based on the distance between the image center and the current pixel position; based on that distance, C.getVal() calculates the value for the pixel. So in the kernel I just make every thread calculate the distance and the corresponding pixel value. The correct result is the one produced by the CPU version, which I compare against. function just gives some random-looking values, some very large and some very small. When I changed result[c * w + r] = (float)C.getVal(dis) to result[c * w + r] = 1.0f, the generated image did not seem to change.
The image size is W x W. To launch function I set
dim3 grid_dim(w / 64 + 1, w / 64 + 1);
dim3 block_dim(64, 64);
function<<<grid_dim, block_dim>>>(W, C, cu_img);
To launch function2
function2<<<W / 128 + 1, 128>>>(W, C, cu_img)
Fixed:
I found the problem: I had assigned too many threads to one block. The maximum number of threads per block is 1024 on my device. In fact, when I ran cuda-memcheck, I could see that the first kernel (function, launched with 64 x 64 = 4096 threads per block) never actually launched.
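For what it's worth, this kind of failure is easy to catch at the call site. Below is a minimal, self-contained sketch (mine, not from the original post; the fill kernel and sizes are placeholders) that keeps the 2D block within the 1024-threads-per-block limit and checks cudaGetLastError() right after the launch, which is where an invalid configuration such as 64 x 64 = 4096 threads per block gets reported.
#include <cstdio>
#include <cuda_runtime.h>

// Placeholder kernel: writes a constant into a w x w image.
__global__ void fill(int w, float* result)
{
    int r = threadIdx.x + blockIdx.x * blockDim.x;
    int c = threadIdx.y + blockIdx.y * blockDim.y;
    if (r < w && c < w)
        result[c * w + r] = 1.0f;
}

int main()
{
    const int w = 512;
    float* d_img = nullptr;
    cudaMalloc(&d_img, w * w * sizeof(float));

    dim3 block_dim(32, 32);                       // 32 * 32 = 1024 threads: at the limit, but valid
    dim3 grid_dim((w + block_dim.x - 1) / block_dim.x,
                  (w + block_dim.y - 1) / block_dim.y);
    fill<<<grid_dim, block_dim>>>(w, d_img);

    cudaError_t err = cudaGetLastError();         // invalid launch configurations show up here
    if (err != cudaSuccess)
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));

    cudaDeviceSynchronize();
    cudaFree(d_img);
    return 0;
}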

Learning CUDA, but currently stuck

So I've been trying to learn CUDA as of late, but am currently stuck and don't know what I'm doing wrong. I am trying to set the initial value of the opool array based on a random float between 0 and 1. If anyone could shed some light on what I did wrong it would be greatly appreciated.
Note - I omitted some code for brevity (cudaFree() & free() calls mainly). I apologize if I left any code of importance out.
__global__ void FirstLoop( int *opool, float *randomSet, int omax, int anumber )
{
    int tid_loci = threadIdx.x;
    int tid_2 = threadIdx.y;
    int bid_omax = blockIdx.x;
    int index = omax*tid_loci*2 + omax*tid_2 + bid_omax;

    float r = randomSet[ index ];

    // Commented out code is what it should be set to, but they are set to 5 or 15
    // to determine if the values are correctly being set.
    if ( r < 0.99 )
        opool[ index ] = 15; //(int)((r * 100.0) * -1.0);
    else
        opool[ index ] = 5;  //(int)((r)*(float)(anumber-4)) +5;
}
int main()
{
    int loci = 10;
    int omax = 20;

    // Data stored on the host
    int *h_opool;
    float *h_randomSet;

    // Data stored on the device
    int *d_opool;
    float *d_randomSet;

    int poolSize = helpSize * omax;
    int randomSize = loci * 2 * omax * sizeof(float);

    // RESIZE ARRAYS TO NEEDED SIZE
    h_opool = (int*)malloc( poolSize );
    h_randomSet = (float*)malloc( randomSize );
    cudaMalloc( &d_opool, poolSize );
    cudaMalloc( &d_randomSet, randomSize );

    for (sim=0; sim<smax; sim++)
    {
        for (i=0; i<poolSize; i++)
            h_randomSet[i] = rndm();

        dim3 blocks(omax);
        dim3 thread(loci, 2);

        cudaMemcpy( d_randomSet, h_randomSet, randomSize, cudaMemcpyHostToDevice );
        cudaMemcpy( d_opool, h_opool, poolSize, cudaMemcpyHostToDevice );

        FirstLoop<<< blocks, thread >>>(d_opool, d_randomSet, omax, anumber );

        cudaMemcpy( h_opool, d_opool, poolSize, cudaMemcpyDeviceToHost );
        // Here is when I call printf to see the values stored in h_opool, but they are
        // completely wrong
    }
}

float rndm()
{
    int random = rand();
    return ((float)random / (float)RAND_MAX);
}
Change the following
int index = omax*tid_loci*2 + omax*tid_2 + bid_omax;
to
int index = bid_omax * tid_2 + tid_loci;
However, a block configuration of 10 x 2 may not be the most ideal one. Try using 32 x 1 or 16 x 2.
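As an aside, here is a sketch of my own (not the formula from the answer above) of one common way to build a unique global index for an omax-block, (loci, 2)-thread launch: linearize the thread within its block and offset by the block index. The body reuses the commented-out formulas from the question.
// Sketch: each of the omax * loci * 2 array elements is touched exactly once.
__global__ void FirstLoopLinear(int *opool, const float *randomSet,
                                int omax, int anumber)
{
    int threadsPerBlock = blockDim.x * blockDim.y;          // loci * 2
    int localId = threadIdx.y * blockDim.x + threadIdx.x;   // 0 .. threadsPerBlock-1
    int index   = blockIdx.x * threadsPerBlock + localId;   // distinct for every thread

    float r = randomSet[index];
    if (r < 0.99f)
        opool[index] = (int)((r * 100.0f) * -1.0f);
    else
        opool[index] = (int)(r * (float)(anumber - 4)) + 5;
}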

CUDA Warp Synchronization Problem

In generalizing a kernel that shifts the values of a 2D array one space to the right (wrapping around the row boundaries), I have come across a warp synchronization problem. The full code is included below.
The code is meant to work for an arbitrary array width, array height, number of thread blocks, and number of threads per block. When choosing a thread size of 33 (i.e. one more thread than a full warp), the 33rd thread doesn't synchronize when __syncthreads() is called. This causes problems with the output data. The problem only occurs when there is more than one warp and the width of the array is greater than the number of threads (e.g. width=35 with 34 threads).
The following is a downsized example of what happens (in reality the array would need to have more elements for the kernel to produce the error).
Initial array:
0 1 2 3 4
5 6 7 8 9
Expected Result:
4 0 1 2 3
9 5 6 7 8
Kernel Produces:
4 0 1 2 3
8 5 6 7 8
The first row is done correctly (for each block, if there is more than one), with all subsequent rows having the second-to-last value repeated. I have tested this on two different cards (8600GT and GTX280) and get the same results. I would like to know whether this is just a bug in my kernel, or a problem that can't be fixed by adjusting my code.
The full source file is included below.
Thank you.
#include <cstdio>
#include <cstdlib>
// A method to ensure all reads use the same logical layout.
inline __device__ __host__ int loc(int x, int y, int width)
{
    return y*width + x;
}

// Kernel to shift all items in a 2D array one position to the right (wrapping around rows)
__global__ void shiftRight ( int* globalArray, int width, int height)
{
    int temp1=0;        // temporary swap variables
    int temp2=0;
    int blockRange=0;   // the number of rows that a single block will shift

    if (height%gridDim.x==0)    // logic to account for awkward array sizes
        blockRange = height/gridDim.x;
    else
        blockRange = (1+height/gridDim.x);

    int yStart = blockIdx.x*blockRange;
    int yEnd = yStart+blockRange;   // the end condition for the y-loop
    yEnd = min(height,yEnd);        // make sure that the array doesn't go out of bounds

    for (int y = yStart; y < yEnd ; ++y)
    {
        // do the first read so the swap variables are loaded for the x-loop
        temp1 = globalArray[loc(threadIdx.x,y,width)];
        // Each block shifts an entire row by itself, even if there are more columns than threads
        for (int threadXOffset = threadIdx.x ; threadXOffset < width ; threadXOffset+=blockDim.x)
        {
            // blockDim.x is added so that we store the next round of values
            // this has to be done now, because the next operation will
            // overwrite one of these values
            temp2 = globalArray[loc((threadXOffset + blockDim.x)%width,y,width)];
            __syncthreads();    // sync before the write to ensure all the values have been read
            globalArray[loc((threadXOffset +1)%width,y,width)] = temp1;
            __syncthreads();    // sync after the write to ensure all the values have been written
            temp1 = temp2;      // swap the storage variables.
        }
        if (threadIdx.x == 0 && y == 0)
            globalArray[loc(12,2,width)]=globalArray[67];
    }
}
int main (int argc, char* argv[])
{
    // set the parameters to be used
    int width = 34;
    int height = 3;
    int threadsPerBlock=33;
    int numBlocks = 1;

    int memSizeInBytes = width*height*sizeof(int);

    // create the host data and assign each element of the array to equal its index
    int* hostData = (int*) malloc (memSizeInBytes);
    for (int y = 0 ; y < height ; ++y)
        for (int x = 0 ; x < width ; ++x)
            hostData [loc(x,y,width)] = loc(x,y,width);

    // create and allocate the device pointers
    int* deviceData;
    cudaMalloc ( &deviceData ,memSizeInBytes);
    cudaMemset ( deviceData,0,memSizeInBytes);
    cudaMemcpy ( deviceData, hostData, memSizeInBytes, cudaMemcpyHostToDevice);
    cudaThreadSynchronize();

    // launch the kernel
    shiftRight<<<numBlocks,threadsPerBlock>>> (deviceData, width, height);
    cudaThreadSynchronize();

    // copy the device data to a host array
    int* hostDeviceOutput = (int*) malloc (memSizeInBytes);
    cudaMemcpy (hostDeviceOutput, deviceData, memSizeInBytes, cudaMemcpyDeviceToHost);
    cudaFree (deviceData);

    // Print out the expected/desired device output
    printf("---- Expected Device Output ----\n");
    printf(" | ");
    for (int x = 0 ; x < width ; ++x)
        printf("%4d ",x);
    printf("\n---|-");
    for (int x = 0 ; x < width ; ++x)
        printf("-----");
    for (int y = 0 ; y < height ; ++y)
    {
        printf("\n%2d | ",y);
        for (int x = 0 ; x < width ; ++x)
            printf("%4d ",hostData[loc((x-1+width)%width,y,width)]);
    }
    printf("\n\n");

    printf("---- Actual Device Output ----\n");
    printf(" | ");
    for (int x = 0 ; x < width ; ++x)
        printf("%4d ",x);
    printf("\n---|-");
    for (int x = 0 ; x < width ; ++x)
        printf("-----");
    for (int y = 0 ; y < height ; ++y)
    {
        printf("\n%2d | ",y);
        for (int x = 0 ; x < width ; ++x)
            printf("%4d ",hostDeviceOutput[loc(x,y,width)]);
    }
    printf("\n\n");
}
Because not all threads execute the same number of loop iterations, synchronization is a problem! All threads should hit the same __syncthreads() calls every time. For example, with width=35 and 34 threads, thread 0 runs the inner x-loop twice (offsets 0 and 34) while thread 1 runs it only once (offset 1), so thread 0's second pair of __syncthreads() calls is never matched by the other threads.
I would suggest transforming your innermost for loop into something like this:
for (int blockXOffset=0; blockXOffset < width; blockXOffset+=blockDim.x) {
    int threadXOffset=blockXOffset+threadIdx.x;
    bool isActive=(threadXOffset < width);
    if (isActive) temp2 = globalArray[loc((threadXOffset + blockDim.x)%width,y,width)];
    __syncthreads();
    if (isActive) globalArray[loc((threadXOffset +1)%width,y,width)] = temp1;
    __syncthreads();
    temp1 = temp2;
}
From the Programming Guide:
__syncthreads() is allowed in conditional code but only if the conditional evaluates identically across the entire thread block, otherwise the code execution is likely to hang or produce unintended side effects.
In my original kernel, not all threads were executing the same number of loop iterations, so the synchronization didn't happen as intended.