How to pass a struct containing matrices in CUDA

As the title says, I'm trying to pass a struct containing 4 matrices to a CUDA kernel. The problem is that I get no errors, but the program goes nuts whenever I try to execute it: all of the values returned are 0 and the clock value overflows.
Here's what I've made so far:
#define ROWS 700
#define COLS 1244
struct sobel {
    int Gradient[ROWS][COLS];
    int Image_input[ROWS][COLS];
    int G_x[ROWS][COLS];
    int G_y[ROWS][COLS];
};
__global__ void sobel(struct sobel* data)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int XLENGTH = ROWS;
    int YLENGTH = COLS;
    if ((x < XLENGTH) && (y < YLENGTH))
    {
        if (x == 0 || x == XLENGTH - 1 || y == 0 || y == YLENGTH - 1)
        {
            data->G_x[x][y] = data->G_y[x][y] = data->Gradient[x][y] = 0;
        }
        else
        {
            data->G_x[x][y] = data->Image_input[x + 1][y - 1]
                + 2 * data->Image_input[x + 1][y]
                + data->Image_input[x + 1][y + 1]
                - data->Image_input[x - 1][y - 1]
                - 2 * data->Image_input[x - 1][y]
                - data->Image_input[x - 1][y + 1];
            data->G_y[x][y] = data->Image_input[x - 1][y + 1]
                + 2 * data->Image_input[x][y + 1]
                + data->Image_input[x + 1][y + 1]
                - data->Image_input[x - 1][y - 1]
                - 2 * data->Image_input[x][y - 1]
                - data->Image_input[x + 1][y - 1];
            data->Gradient[x][y] = abs(data->G_x[x][y]) + abs(data->G_y[x][y]);
            if (data->Gradient[x][y] > 255) {
                data->Gradient[x][y] = 255;
            }
        }
    }
}
int main() {
    struct sobel* data = (struct sobel*)calloc(sizeof(*data), 1);
    struct sobel* dev_data;
    cudaMalloc((void**)&dev_data, sizeof(*data));
    cudaMemcpy(dev_data, data, sizeof(data), cudaMemcpyHostToDevice);
    dim3 blocksize(16, 16);
    dim3 gridsize;
    gridsize.x = (ROWS + blocksize.x - 1) / blocksize.x;
    gridsize.y = (COLS + blocksize.y - 1) / blocksize.y;
    sobel <<< gridsize, blocksize >>> (dev_data);
    cudaMemcpy(data, dev_data, sizeof(data), cudaMemcpyDeviceToHost);
    free(data);
    cudaFree(dev_data);
    return 0;
}
Do I also have to allocate device memory for each one of the matrices?
Any advice would be appreciated.
Edit: I switched a couple of things here, but the program seems to ignore the nested else statement and all the values returned are 0.

There are (at least) two errors in your code.
You have not allocated a correct size for the device struct:
cudaMalloc((void**)&dev_data, sizeof(data));
Just like in your calloc call, that should be sizeof(*data), not sizeof(data). (Both cudaMemcpy calls should probably be updated to use this size as well.)
You need a proper thread check in your kernel code, something like this:
if ((x < XLENGTH) && (y < YLENGTH)) {  // add this line
    if (x == 0 || x == XLENGTH - 1 || y == 0 || y == YLENGTH - 1)
    {
        data->G_x[x][y] = data->G_y[x][y] = data->Gradient[x][y] = 0;
Without that, your next if-test may allow out-of-bounds threads to participate in the zeroing operation. For example, any thread where x == 0 will pass that if-test, but it may still have an out-of-bounds y value.
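For reference, here is a minimal sketch of what the host code could look like with those two fixes applied (same struct and kernel as above; adding error checks on the CUDA calls is also a good idea):
struct sobel* data = (struct sobel*)calloc(1, sizeof(*data));
struct sobel* dev_data;
cudaMalloc((void**)&dev_data, sizeof(*data));                       // size of the struct, not of the pointer
cudaMemcpy(dev_data, data, sizeof(*data), cudaMemcpyHostToDevice);  // same size here
dim3 blocksize(16, 16);
dim3 gridsize((ROWS + blocksize.x - 1) / blocksize.x,
              (COLS + blocksize.y - 1) / blocksize.y);
sobel<<<gridsize, blocksize>>>(dev_data);
cudaMemcpy(data, dev_data, sizeof(*data), cudaMemcpyDeviceToHost);  // and here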

Related

Box filter in CUDA using Google Colab

I have to implement a box filter on the GPU with CUDA, and I'm doing it on Google Colab. The code runs without any errors, but my resulting image is all black.
This is my blurring function:
__global__ void apply_box_blur(int height, int width, unsigned char* buffer, unsigned char* out) {
    int i, j;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < 2 || col < 2 || row >= height - 3 || col >= width - 3) return;
    float v = 1.0 / 9.0;
    float kernel[3][3] = { {v,v,v},
                           {v,v,v},
                           {v,v,v} };
    float sum0 = 0.0;
    float sum1 = 0.0;
    float sum2 = 0.0;
    for (i = -1; i <= 1; i++)
    {
        for (j = -1; j <= 1; j++)
        {
            // matrix multiplication with kernel with every color plane
            sum0 = sum0 + (float)kernel[i + 1][j + 1] * buffer[((row + i) * width + (col + j)) * 3 + 0];
            sum1 = sum1 + (float)kernel[i + 1][j + 1] * buffer[((row + i) * width + (col + j)) * 3 + 1];
            sum2 = sum2 + (float)kernel[i + 1][j + 1] * buffer[((row + i) * width + (col + j)) * 3 + 2];
        }
    }
    out[(row * width + col) * 3 + 0] = (unsigned char)sum0;
    out[(row * width + col) * 3 + 1] = (unsigned char)sum1;
    out[(row * width + col) * 3 + 2] = (unsigned char)sum2;
}
And my main function:
// device copies
unsigned char* d_buffer;
unsigned char* d_out;
// allocate space for device copies
cudaMalloc((void**)&d_buffer, size * 3 * sizeof(unsigned char));
cudaMalloc((void**)&d_out, size * 3 * sizeof(unsigned char));
// Copy inputs to device
cudaMemcpy(d_buffer, buffer, size * 3 * sizeof(unsigned char), cudaMemcpyHostToDevice);
// perform the Box blur and store the resulting pixels in the output buffer
dim3 block(16, 16);
dim3 grid(width / 16, height / 16);
apply_box_blur <<<grid, block>>> (height, width, d_buffer, d_out);
cudaMemcpy(out, d_out, size * 3 * sizeof(unsigned char), cudaMemcpyDeviceToHost);
Am I doing something wrong with the block and grid sizes? Or is there something wrong with my blurring function? Is it maybe a Google Colab issue?
Found the issue.
The block and grid sizes should've been this:
dim3 blockSize(16, 16, 1);
dim3 gridSize((size*3)/blockSize.x, (size*3)/blockSize.y, 1);
Also my Google Colab wasn't connected to a GPU.
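Independent of the fix above, a common way to size a 2D launch so that every pixel of a width x height image is covered (relying on the kernel's own bounds check for the edges) is ceiling division; a sketch:
dim3 block(16, 16, 1);
dim3 grid((width + block.x - 1) / block.x,
          (height + block.y - 1) / block.y,
          1);
apply_box_blur<<<grid, block>>>(height, width, d_buffer, d_out);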

NVIDIA CUDA YUV (NV12) to RGB conversion algorithm breakdown

I am trying to modify the original YUV->RGB kernel provided in the sample code of the NVIDIA Video SDK, and I need help understanding some of its parts.
Here is the kernel code:
template<class YuvUnitx2, class Rgb, class RgbIntx2>
__global__ static void YuvToRgbKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pRgb, int nRgbPitch, int nWidth, int nHeight) {
    int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
    int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
    if (x + 1 >= nWidth || y + 1 >= nHeight) {
        return;
    }
    uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch;
    uint8_t* pDst = pRgb + x * sizeof(Rgb) + y * nRgbPitch;
    YuvUnitx2 l0 = *(YuvUnitx2*)pSrc;
    YuvUnitx2 l1 = *(YuvUnitx2*)(pSrc + nYuvPitch);
    YuvUnitx2 ch = *(YuvUnitx2*)(pSrc + (nHeight - y / 2) * nYuvPitch);
    // YuvToRgbForPixel - returns rgba encoded in uint32_t (.d)
    *(RgbIntx2*)pDst = RgbIntx2{
        YuvToRgbForPixel<Rgb>(l0.x, ch.x, ch.y).d,
        YuvToRgbForPixel<Rgb>(l0.y, ch.x, ch.y).d,
    };
    *(RgbIntx2*)(pDst + nRgbPitch) = RgbIntx2{
        YuvToRgbForPixel<Rgb>(l1.x, ch.x, ch.y).d,
        YuvToRgbForPixel<Rgb>(l1.y, ch.x, ch.y).d,
    };
}
Here are my basic assumptions; some of them are possibly wrong:
1. NV12 has two planes: plane 1 for luma and plane 2 for interleaved chroma.
2. The kernel tries to write 4 pixels at a time.
If assumption 2 is correct, the question is why the same chroma (ch) values are used for all 4 pixels. And if I am wrong on 2, please explain what exactly happens here.
The chroma planes in NV12 or NV21 are subsampled by a factor of 2 in each dimension.
For every 2x2 macro pixel in the output there are 4 luma (Y) samples but only 1 Cb and 1 Cr element.
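That is why the same chroma (ch) value is used for all four pixels: one CbCr pair serves the whole 2x2 block of luma samples. The interleaved CbCr plane starts right after the nHeight rows of luma, which is exactly what the pSrc + (nHeight - y / 2) * nYuvPitch expression reaches. A sketch of the equivalent index math for 8-bit NV12 (illustrative only, not the SDK code):
uint8_t* pLuma = pYuv + y * nYuvPitch + x;                  // top-left Y sample of the 2x2 block
uint8_t* pUV   = pYuv + nHeight * nYuvPitch;                // start of the interleaved CbCr plane
uint8_t* pPair = pUV + (y / 2) * nYuvPitch + (x / 2) * 2;   // pPair[0] = Cb, pPair[1] = Cr, shared by all 4 pixels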

How to avoid thread divergence in this CUDA kernel?

For the CUDA kernel function below, I get the branch divergence shown; how can I optimize it?
int gx = threadIdx.x + blockDim.x * blockIdx.x;
val = g_data[gx];
if (gx % 4 == 0)
    val = op1(val);
else if (gx % 4 == 1)
    val = op2(val);
else if (gx % 4 == 2)
    val = op3(val);
else if (gx % 4 == 3)
    val = op4(val);
g_data[gx] = val;
If I were programming in CUDA, I certainly wouldn't do any of this. However, to answer your question:
how to avoid thread divergence in this CUDA kernel?
You could do something like this:
int gx = threadIdx.x + blockDim.x * blockIdx.x;
val = g_data[gx];
int gx_bit_0 = gx & 1;
int gx_bit_1 = (gx & 2) >> 1;
val = (1-gx_bit_1)*(1-gx_bit_0)*op1(val) + (1-gx_bit_1)*(gx_bit_0)*op2(val) + (gx_bit_1)*(1-gx_bit_0)*op3(val) + (gx_bit_1)*(gx_bit_0)*op4(val);
g_data[gx] = val;
Here is a full test case:
$ cat t1914.cu
#include <iostream>
__device__ float op1(float val) { return val + 1.0f;}
__device__ float op2(float val) { return val + 2.0f;}
__device__ float op3(float val) { return val + 3.0f;}
__device__ float op4(float val) { return val + 4.0f;}
__global__ void k(float *g_data){
    int gx = threadIdx.x + blockDim.x * blockIdx.x;
    float val = g_data[gx];
    int gx_bit_0 = gx & 1;
    int gx_bit_1 = (gx & 2) >> 1;
    val = (1-gx_bit_1)*(1-gx_bit_0)*op1(val) + (1-gx_bit_1)*(gx_bit_0)*op2(val) + (gx_bit_1)*(1-gx_bit_0)*op3(val) + (gx_bit_1)*(gx_bit_0)*op4(val);
    g_data[gx] = val;
}
const int N = 32;
int main(){
    float *data;
    cudaMallocManaged(&data, N*sizeof(float));
    for (int i = 0; i < N; i++) data[i] = 1.0f;
    k<<<1,N>>>(data);
    cudaDeviceSynchronize();
    for (int i = 0; i < N; i++) std::cout << data[i] << std::endl;
}
$ nvcc -o t1914 t1914.cu
$ compute-sanitizer ./t1914
========= COMPUTE-SANITIZER
2
3
4
5
2
3
4
5
2
3
4
5
2
3
4
5
2
3
4
5
2
3
4
5
2
3
4
5
2
3
4
5
========= ERROR SUMMARY: 0 errors
$
Solution by changing the work per thread
The best solution with the existing data layout is to let every thread compute 4 consecutive values. It's better to have fewer threads that can work properly than have more that can't.
float* g_data;
int gx = threadIdx.x + blockDim.x * blockIdx.x;
g_data[4 * gx] = op1(g_data[4 * gx]);
g_data[4 * gx + 1] = op2(g_data[4 * gx + 1]);
g_data[4 * gx + 2] = op3(g_data[4 * gx + 2]);
g_data[4 * gx + 3] = op4(g_data[4 * gx + 3]);
If the size of g_data is not a multiple of 4, put an if around the index operations. If it is always a multiple of 4 and properly aligned, load and store 4 values as a float4 for better performance.
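A sketch of that float4 variant, assuming the element count is a multiple of 4 and g_data is 16-byte aligned (as allocations from cudaMalloc are):
int gx = threadIdx.x + blockDim.x * blockIdx.x;
float4* g_data4 = reinterpret_cast<float4*>(g_data);
float4 v = g_data4[gx];   // one 16-byte load instead of four 4-byte loads
v.x = op1(v.x);
v.y = op2(v.y);
v.z = op3(v.z);
v.w = op4(v.w);
g_data4[gx] = v;          // one 16-byte store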
Solution by reordering the work
As all my talk about float4 may have suggested, your input data appears to be some form of 2D structure where one in every four elements shares a similar function. Maybe it is an array of structs or an array of vectors; in other words, a matrix.
For the purpose of explaining what I mean, I consider it an Nx4 matrix. If you transpose this into a 4xN matrix and apply a kernel to it, most of your problems disappear, because entries for which the same operation has to be done are then placed next to each other in memory, which makes writing an efficient kernel easier. Something like this:
float* g_data;
int rows_in_g;
int gx = threadIdx.x + blockDim.x * blockIdx.x;
int gy = threadIdx.y;
float& own_g = g_data[gx + rows_in_g * gy];
switch (gy) {
    case 0: own_g = op1(own_g); break;
    case 1: own_g = op2(own_g); break;
    case 2: own_g = op3(own_g); break;
    case 3: own_g = op4(own_g); break;
    default: break;
}
Start this as a 2D kernel with blocksize x=32, y=4 and gridsize x=N/32, y=1.
Now your kernel is still divergent, but all threads within a warp will execute the same case and access consecutive floats in memory. That's the best you can achieve. Of course this all depends on whether you can change the data layout.
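For completeness, a launch sketch for that layout (assuming N is a multiple of 32; k2d is a hypothetical kernel that wraps the switch above and takes g_data and rows_in_g as parameters):
dim3 block(32, 4);   // 32 consecutive columns per warp, one row of the 4xN matrix per y index
dim3 grid(N / 32, 1);
k2d<<<grid, block>>>(g_data, rows_in_g);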

How do I sum an array on GPU with CUDA?

I am trying to use the GPU to sum an array with the following code:
__global__ void sum_array(int* a, uint n) {
    uint idx = threadIdx.x + blockIdx.x * blockDim.x;
    for (int s = 1; s < n; s *= 2) {
        uint i1 = s * 2 * idx;
        uint i2 = s * (2 * idx + 1);
        if (i2 < n) {
            a[i1] += a[i2];
        }
        __syncthreads();
    }
}
For the test I generated my array as [0, 1, 2 ... 99], so the result should be 4950. When I set the block as [1024, 1, 1] and the grid as [1, 1], everything works fine: the value of a[0] contains the correct result after the calculation. But if I set block=[4, 1, 1] and grid=[25, 1], I get the wrong result 4754 (though from time to time the function produces the correct result). It looks like the threads in different blocks are not synced properly. How can I fix my code to make it work correctly with multiple blocks? I am going to sum long arrays that are longer than the number of threads I can use, so I need a solution for many blocks (gridDim.x > 1).
I found this solution:
__global__ void sum_array(int* a, uint n) {
    uint tid = threadIdx.x;
    uint offset = 2 * blockIdx.x * blockDim.x;
    for (uint s = 1; s <= blockDim.x; s *= 2) {
        if (tid % s == 0) {
            uint idx = 2 * tid + offset;
            if (idx + s < n) {
                atomicAdd(a + idx, a[idx + s]);
            }
        }
        __syncthreads();
    }
    if ((offset != 0) && (tid == 0)) {
        atomicAdd(a, a[offset]);
    }
}
In short, I applied a similar algorithm as in the question, but to each block separately (not to the whole array). After that, I needed to add the result from each block into a[0]. I also replaced my plain addition with atomicAdd to ensure correct accumulation across blocks at the end.
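For completeness, a host-side launch sketch for this kernel, assuming d_a already holds the n input values on the device: since each thread folds two elements, a block of blockDim.x threads covers 2 * blockDim.x entries.
const unsigned int n = 100;
const unsigned int threads = 4;                               // any block size works
const unsigned int blocks = (n + 2 * threads - 1) / (2 * threads);
sum_array<<<blocks, threads>>>(d_a, n);
cudaDeviceSynchronize();
// d_a[0] now holds the total; copy it back to the host with cudaMemcpy if needed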

What did I miss while converting from CUDA to OpenCL? Or why is my kernel returning different output than serial code

This is my code for the multiplication of a sparse matrix in compressed column format:
__kernel void mykernel(__global int* colvector,
                       __global int* val,
                       __global int* result,
                       __global int* index,
                       __global int* rowptr,
                       __global int* sync)
{
    __local int vals[1000];
    for (int i = 0; i < 4; i++)
    {
        result[i] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    barrier(CLK_GLOBAL_MEM_FENCE);
    const int items_per_row = 32;                               // total threads working in a row
    const int thread_id = get_global_id(0) + get_local_id(0);   // total threads in the program
    const int warpid = thread_id / items_per_row;               // warp id is actual row
    int lane = thread_id & (items_per_row - 1);                 // thread id within the warp
    int row = warpid;
    if (row < 4)
    {
        int sum = 0;
        int row_start = rowptr[row];
        int row_end = rowptr[row + 1];
        vals[get_global_id(0)] = 0;
        barrier(CLK_LOCAL_MEM_FENCE);
        barrier(CLK_GLOBAL_MEM_FENCE);
        for (int i = row_start + lane; i < row_end; i += items_per_row)
        {
            vals[get_local_id(0)] += val[i] * colvector[index[i]];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        barrier(CLK_GLOBAL_MEM_FENCE);
        if (lane < 16) vals[get_local_id(0)] += vals[get_local_id(0) + 16];
        if (lane < 8)  vals[get_local_id(0)] += vals[get_local_id(0) + 8];
        if (lane < 4)  vals[get_local_id(0)] += vals[get_local_id(0) + 4];
        if (lane < 2)  vals[get_local_id(0)] += vals[get_local_id(0) + 2];
        if (lane < 1)  vals[get_local_id(0)] += vals[get_local_id(0) + 1];
        barrier(CLK_LOCAL_MEM_FENCE);
        barrier(CLK_GLOBAL_MEM_FENCE);
        if (lane == 0)
        {
            result[row] += vals[get_local_id(0)];
        }
    }
}
The above OpenCL code was converted from the CUDA code given below:
spmv_csr_vector_kernel(const int num_rows,
                       const int* ptr,
                       const int* indices,
                       const float* data,
                       const float* x,
                       float* y)
{
    __shared__ float vals[];
    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;  // global thread index
    int warp_id = thread_id / 32;                            // global warp index
    int lane = thread_id & (32 - 1);                         // thread index within the warp
    // one warp per row
    int row = warp_id;
    if (row < num_rows)
    {
        int row_start = ptr[row];
        int row_end = ptr[row + 1];
        // compute running sum per thread
        vals[threadIdx.x] = 0;
        for (int jj = row_start + lane; jj < row_end; jj += 32)
        {
            vals[threadIdx.x] += data[jj] * x[indices[jj]];
        }
        // parallel reduction in shared memory
        if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16];
        if (lane < 8)  vals[threadIdx.x] += vals[threadIdx.x + 8];
        if (lane < 4)  vals[threadIdx.x] += vals[threadIdx.x + 4];
        if (lane < 2)  vals[threadIdx.x] += vals[threadIdx.x + 2];
        if (lane < 1)  vals[threadIdx.x] += vals[threadIdx.x + 1];
        // first thread writes the result
        if (lane == 0)
        {
            y[row] += vals[threadIdx.x];
        }
    }
}
The CUDA code is correct, but my OpenCL kernel is not returning the correct output. I have been trying for a week now with no solution. Does anybody know what mistake I am making?
I can see at least one mistake: thread_id is not computed the same way in the two kernels. blockDim.x * blockIdx.x + threadIdx.x in CUDA corresponds to get_global_id(0) in OpenCL, not to get_global_id(0) + get_local_id(0). Also, get_local_id(0) corresponds to threadIdx.x.
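Applied to the kernel above, the affected lines would look roughly like this (a sketch, keeping everything else unchanged):
const int thread_id = get_global_id(0);         // == blockDim.x * blockIdx.x + threadIdx.x in CUDA
const int warpid = thread_id / items_per_row;
int lane = thread_id & (items_per_row - 1);
// ...and index the local buffer consistently with get_local_id(0) (== threadIdx.x),
// e.g. vals[get_local_id(0)] = 0; rather than vals[get_global_id(0)] = 0;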
Try using Swan; it might help you understand your problem. You can find an article about it here.