Box filter in CUDA using Google Colab

I have to implement a box filter on the GPU with CUDA, and I'm doing it on Google Colab. The code runs without any errors, but my resulting image is all black.
This is my blurring function:
__global__ void apply_box_blur(int height, int width, unsigned char* buffer, unsigned char* out) {
    int i, j;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < 2 || col < 2 || row >= height - 3 || col >= width - 3) return;

    float v = 1.0 / 9.0;
    float kernel[3][3] = { {v, v, v},
                           {v, v, v},
                           {v, v, v} };
    float sum0 = 0.0;
    float sum1 = 0.0;
    float sum2 = 0.0;
    for (i = -1; i <= 1; i++)
    {
        for (j = -1; j <= 1; j++)
        {
            // convolve the kernel with each color plane
            sum0 = sum0 + (float)kernel[i + 1][j + 1] * buffer[((row + i) * width + (col + j)) * 3 + 0];
            sum1 = sum1 + (float)kernel[i + 1][j + 1] * buffer[((row + i) * width + (col + j)) * 3 + 1];
            sum2 = sum2 + (float)kernel[i + 1][j + 1] * buffer[((row + i) * width + (col + j)) * 3 + 2];
        }
    }
    out[(row * width + col) * 3 + 0] = (unsigned char)sum0;
    out[(row * width + col) * 3 + 1] = (unsigned char)sum1;
    out[(row * width + col) * 3 + 2] = (unsigned char)sum2;
}
And my main function:
// device copies
unsigned char* d_buffer;
unsigned char* d_out;
// allocate space for device copies
cudaMalloc((void**)&d_buffer, size * 3 * sizeof(unsigned char));
cudaMalloc((void**)&d_out, size * 3 * sizeof(unsigned char));
// Copy inputs to device
cudaMemcpy(d_buffer, buffer, size * 3 * sizeof(unsigned char), cudaMemcpyHostToDevice);
// perform the Box blur and store the resulting pixels in the output buffer
dim3 block(16, 16);
dim3 grid(width / 16, height / 16);
apply_box_blur <<<grid, block>>> (height, width, d_buffer, d_out);
cudaMemcpy(out, d_out, size * 3 * sizeof(unsigned char), cudaMemcpyDeviceToHost);
Am I doing something wrong with the block and grid sizes? Or is there something wrong with my blurring function? Is it maybe a Google Colab issue?

Found the issue.
The block and grid sizes should've been this:
dim3 blockSize(16, 16, 1);
dim3 gridSize((size*3)/blockSize.x, (size*3)/blockSize.y, 1);
Also, my Google Colab session wasn't connected to a GPU runtime.
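One way to catch both kinds of problems early (no visible GPU on the Colab runtime, and silent kernel-launch failures) is to check the CUDA error status around every API call and launch. Below is a minimal, illustrative sketch of such checking; the CHECK_CUDA macro name and the surrounding code are my own, not part of the original program:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Minimal, illustrative error checking (names are mine, not from the original code).
#define CHECK_CUDA(call)                                            \
    do {                                                            \
        cudaError_t err = (call);                                   \
        if (err != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",            \
                    cudaGetErrorString(err), __FILE__, __LINE__);   \
            exit(1);                                                \
        }                                                           \
    } while (0)

int main() {
    int deviceCount = 0;
    // Fails immediately on a CPU-only Colab runtime, instead of silently producing a black image.
    CHECK_CUDA(cudaGetDeviceCount(&deviceCount));
    printf("Visible CUDA devices: %d\n", deviceCount);

    // After a kernel launch, the launch error must be queried explicitly, e.g.:
    //   apply_box_blur<<<grid, block>>>(height, width, d_buffer, d_out);
    //   CHECK_CUDA(cudaGetLastError());
    //   CHECK_CUDA(cudaDeviceSynchronize());
    return 0;
}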

Related

Shared Memory slows down the blurring operation compared to the one without shared memory

When I use shared memory in my Gaussian blur kernel, the execution time is slower than without shared memory. The code is below. Could you help me resolve this issue?
The execution time with shared memory is 0.27 ms, whereas without shared memory it is 0.18 ms.
In addition, the number of inactive threads is almost twice as high as in the version without shared memory.
__global__
void gaus_xdirection_shared(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;

    __shared__ float columns[1024];
    int b_row = threadIdx.y;
    int b_col = threadIdx.x;
    int b_width = blockDim.x;

    if(row < height && col < width){
        int p = row * width + col;
        //Load starts
        int b_p = b_row * b_width + b_col; // block pixel = b_p
        columns[b_p] = in_channel[p];
        __syncthreads();
        //Load ends

        float p_val = 0.0f;
        for(int i = -k; i < k+1; ++i){
            if((b_col + i) > -1 && (b_col + i) < b_width){
                p_val += gaussian_kernel[k + i] * columns[b_p + i];
            }
            else{
                if((col + i) > -1 && (col + i) < width){
                    p_val += gaussian_kernel[k + i] * in_channel[p + i];
                }
            }
        }
        output_channel[p] = p_val;
    }
}
The blurring kernel that does not use shared memory is as follows:
__global__
void gaus_xdirection(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;

    if(row < height && col < width){
        int p = row * width + col;
        float p_val = 0.0f;
        for(int i = -k; i < k+1; ++i){
            if((col + i) > -1 && (col + i) < width){
                p_val += gaussian_kernel[k + i] * in_channel[p + i];
            }
        }
        output_channel[p] = p_val;
    }
}
The problem is you are making ineffective use of shared memory. Replacing a few of the global loads with shared loads is not going to be sufficient. As a result, your else clause:
else{
    if((col + i) > -1 && (col + i) < width){
        p_val += gaussian_kernel[k + i] * in_channel[p + i];
    }
}
is getting invoked too many times, and is drowning out any benefit of shared usage in the if clause.
Instead you want to arrange a shared memory tile in such a way that all the data can be retrieved from shared memory, after it is properly loaded.
The following is an example of how it could be done (in gaus_xdirection_shared_i):
$ cat t145.cu
__global__
void gaus_xdirection_shared(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    __shared__ float columns[1024];
    int b_row = threadIdx.y;
    int b_col = threadIdx.x;
    int b_width = blockDim.x;

    if(row < height && col < width){
        int p = row * width + col;
        //Load starts
        int b_p = b_row * b_width + b_col; // block pixel = b_p
        columns[b_p] = in_channel[p];
        __syncthreads();
        //Load ends

        float p_val = 0.0f;
        for(int i = -k; i < k+1; ++i){
            if((b_col + i) > -1 && (b_col + i) < b_width){
                p_val += gaussian_kernel[k + i] * columns[b_p + i];
            }
            else{
                if((col + i) > -1 && (col + i) < width){
                    p_val += gaussian_kernel[k + i] * in_channel[p + i];
                }
            }
        }
        output_channel[p] = p_val;
    }
}

__global__
void gaus_xdirection_shared_i(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    extern __shared__ float columns[];
    int b_row = threadIdx.y;
    int b_col = threadIdx.x;
    int b_width = blockDim.x + 2*k;
    int p = row * width + col;

    //Load starts
    int b_p = b_row * b_width + b_col + k; // block pixel = b_p
    float temp;
    if (row < height && col < width)
        temp = in_channel[p];
    else
        temp = 0;
    columns[b_p] = temp;
    if (threadIdx.x < k){
        // handle left edge/border
        if (((p-k) >= row*width) && ((p-k) < width*height)) temp = in_channel[p-k];
        else temp = 0;
        columns[b_p-k] = temp;
        // handle right edge/border
        if (((p+blockDim.x) < (row+1)*width) && (row < height))
            temp = in_channel[p+blockDim.x];
        else
            temp = 0;
        columns[b_p+blockDim.x] = temp;
    }
    __syncthreads();
    //Load ends

    temp = 0.0f;
    for(int i = -k; i < k+1; ++i)
        temp += gaussian_kernel[k+i] * columns[b_p + i];
    if (row < height && col < width)
        output_channel[p] = temp;
}

__global__
void gaus_xdirection(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    if (row < height && col < width){
        int p = row * width + col;
        float p_val = 0.0f;
        for(int i = -k; i < k+1; ++i){
            if ((col + i) > -1 && (col + i) < width){
                p_val += gaussian_kernel[k + i] * in_channel[p + i];
            }
        }
        output_channel[p] = p_val;
    }
}

int main(){
    float *in_channel;
    float *output_channel;
    float *gaussian_kernel;
    int width, height, k;
    int th = 32;
    width = 1024;
    height = 1024;
    k = 7;
    cudaMalloc(&in_channel, width*height*sizeof(float));
    cudaMalloc(&output_channel, width*height*sizeof(float));
    cudaMalloc(&gaussian_kernel, (2*k+1)*sizeof(float));
    dim3 b(th, th);
    dim3 g((width+b.x-1)/b.x, (height+b.y-1)/b.y);
    gaus_xdirection_shared<<<g,b>>>(in_channel, output_channel, gaussian_kernel, width, height, k);
    gaus_xdirection_shared_i<<<g,b,th*(th+2*k)*sizeof(float)>>>(in_channel, output_channel, gaussian_kernel, width, height, k);
    gaus_xdirection<<<g,b>>>(in_channel, output_channel, gaussian_kernel, width, height, k);
    cudaDeviceSynchronize();
}
$ nvcc -o t145 t145.cu
$ cuda-memcheck ./t145
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t145
==27500== NVPROF is profiling process 27500, command: ./t145
==27500== Profiling application: ./t145
==27500== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 44.53% 1.0205ms 1 1.0205ms 1.0205ms 1.0205ms gaus_xdirection_shared(float*, float*, float*, int, int, int)
33.35% 764.46us 1 764.46us 764.46us 764.46us gaus_xdirection(float*, float*, float*, int, int, int)
22.12% 506.95us 1 506.95us 506.95us 506.95us gaus_xdirection_shared_i(float*, float*, float*, int, int, int)
API calls: 97.88% 141.58ms 3 47.192ms 115.32us 141.22ms cudaMalloc
1.58% 2.2808ms 1 2.2808ms 2.2808ms 2.2808ms cudaDeviceSynchronize
0.36% 514.21us 202 2.5450us 165ns 118.09us cuDeviceGetAttribute
0.10% 146.33us 2 73.166us 52.335us 93.998us cuDeviceTotalMem
0.04% 58.346us 2 29.173us 26.147us 32.199us cuDeviceGetName
0.03% 50.393us 3 16.797us 6.9170us 34.369us cudaLaunchKernel
0.01% 9.5440us 2 4.7720us 1.8600us 7.6840us cuDeviceGetPCIBusId
0.00% 1.3980us 3 466ns 279ns 801ns cuDeviceGetCount
0.00% 1.3100us 4 327ns 186ns 712ns cuDeviceGet
0.00% 564ns 2 282ns 237ns 327ns cuDeviceGetUuid
$
I have not carefully tested the above code and it may contain defects, but it should give you an idea of how to structure a larger shared memory tile; it runs without runtime errors and appears to be faster.

CUDA index blockDim.y is always 1

I'm trying to solve the 2D Laplace equation with shared memory, but one strange thing is that the blockDim.y value is always 1. Could someone help me?
host code
checkCudaErrors(cudaMalloc((void**)&d_A, h*h * sizeof(float)));
checkCudaErrors(cudaMalloc((void**)&d_out, h*h * sizeof(float)));
checkCudaErrors(cudaMemcpy(d_A, A, h*h * sizeof(float), cudaMemcpyHostToDevice));
dim3 blockSize = (BLOCK_SIZE, BLOCK_SIZE);
dim3 gridSize = ((h+BLOCK_SIZE-1)/BLOCK_SIZE, (h + BLOCK_SIZE - 1) / BLOCK_SIZE);
LaplaceDifference << <gridSize, blockSize >> > (d_A, h, d_out);
checkCudaErrors(cudaMemcpy(B, d_out, h*h * sizeof(float), cudaMemcpyDeviceToHost));
kernel code
int idx = blockIdx.x*blockDim.x + threadIdx.x;
int idy = blockIdx.y*blockDim.y + threadIdx.y;

__shared__ float A_ds[BLOCK_SIZE + 2][BLOCK_SIZE + 2];
int n = 1;

//Load data in shared memory
int halo_index_left = (blockIdx.x - 1)*blockDim.x + threadIdx.x;
int halo_index_right = (blockIdx.x + 1)*blockDim.x + threadIdx.x;
int halo_index_up = (blockIdx.y - 1)*blockDim.y + threadIdx.y;
int halo_index_down = (blockIdx.y + 1)*blockDim.y + threadIdx.y;

A_ds[n + threadIdx.y][n + threadIdx.x] = A[idy * h + idx];

if (threadIdx.x >= blockDim.x - n) {
    A_ds[threadIdx.y + n][threadIdx.x - (blockDim.x - n)] = (halo_index_left < 0) ? 0 : A[idy*h + halo_index_left];
}
if (threadIdx.x < n) {
    A_ds[threadIdx.y + n][blockDim.x + n + threadIdx.x] = (halo_index_right >= h) ? 0 : A[idy*h + halo_index_right];
}
if (threadIdx.y >= blockDim.y - n) {
    A_ds[threadIdx.y - (blockDim.y - n)][threadIdx.x + n] = (halo_index_up < 0) ? 0 : A[halo_index_up*h + idx];
}
if (threadIdx.y < n) {
    A_ds[blockDim.y + n + threadIdx.y][threadIdx.x + n] = (halo_index_down >= h) ? 0 : A[halo_index_down*h + idx];
}
__syncthreads();

P[idy*h + idx] = 0.25*(A_ds[threadIdx.y + n - 1][threadIdx.x + n] + A_ds[threadIdx.y + n + 1][threadIdx.x + n] + A_ds[threadIdx.y + n][threadIdx.x + n - 1] + A_ds[threadIdx.y + n][threadIdx.x + n + 1]);
(I spent quite some time looking for a dupe, but could not find it.)
A dim3 variable is a particular data type defined in the CUDA header file vector_types.h.
It provides several constructors. Here are a couple valid uses of constructors for this variable:
dim3 grid(gx, gy, gz);
dim3 grid = dim3(gx, gy, gz);
What you have shown:
dim3 blockSize = (BLOCK_SIZE, BLOCK_SIZE);
won't work the way you expect.
Since there is no dim3 usage on the right hand side of the equal sign, the compiler will use some other method to process what is there. It is not a syntax error, because both the use of parentheses and the comma are legal in this form, from a C++ language perspective.
Hopefully you understand how parentheses work in C++. I won't describe the comma operator in detail here; you can read about it in a C++ reference. The net effect is that the compiler evaluates both expressions (the one on the left of the comma and the one on the right) and takes the value of the right-hand expression as the value of the overall expression. So this:
(BLOCK_SIZE, BLOCK_SIZE)
becomes this:
BLOCK_SIZE
which is quite obviously a scalar quantity, not multi-dimensional.
When you assign a scalar to a dim3 variable:
dim3 blockSize = BLOCK_SIZE;
You end up with a dim3 variable that has these dimensions:
(BLOCK_SIZE, 1, 1)
One method to fix what you have is as follows:
dim3 blockSize = dim3(BLOCK_SIZE, BLOCK_SIZE);
^^^^
This line:
dim3 blockSize = (BLOCK_SIZE, BLOCK_SIZE);
initializes a 1D block size. What you want is:
dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
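To see the difference concretely, here is a small host-only sketch of my own (compiled as a .cu file with nvcc so that dim3 is available) that prints the dimensions produced by both forms, assuming BLOCK_SIZE is 16:
#include <cstdio>
#include <cuda_runtime.h>   // brings in dim3 via vector_types.h

#define BLOCK_SIZE 16

int main() {
    dim3 wrong = (BLOCK_SIZE, BLOCK_SIZE);  // comma operator: same as dim3 wrong = BLOCK_SIZE;
    dim3 right(BLOCK_SIZE, BLOCK_SIZE);     // proper dim3 constructor

    printf("wrong: (%u, %u, %u)\n", wrong.x, wrong.y, wrong.z);  // prints (16, 1, 1)
    printf("right: (%u, %u, %u)\n", right.x, right.y, right.z);  // prints (16, 16, 1)
    return 0;
}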

Calculating indices for nested loops in CUDA

I'm trying to learn CUDA and I'm a bit confused about calculating thread indices. Let's say I have this loop I'm trying to parallelize:
...
for(int x = 0; x < DIM_x; x++){
    for(int y = 0; y < DIM_y; y++){
        for(int dx = 0; dx < psize; dx++){
            array[y*DIM_x + x + dx] += 1;
        }
    }
}
In PyCUDA, I set:
block = (8, 8, 8)
grid = (96, 96, 16)
Most of the examples I've seen for parallelizing loops calculate thread indices like this:
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int dx = blockIdx.z * blockDim.z + threadIdx.z;
if (x >= DIM_x || y >= DIM_y || dx >= psize)
    return;

atomicAdd(&array[y*DIM_x + x + dx], 1);
DIM_x = 580, DIM_y = 550, psize = 50
However, if I print x, I see that multiple threads with the same thread Id are created, and the final result is wrong.
Instead, if I use this (3D grid of 3D blocks):
int blockId = blockIdx.x + blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int x = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x) + threadIdx.x;
It fixes the multiple same thread Ids problem for x, but I'm not sure how I'd parallelize y and dx.
If anyone could help me understand where I'm going wrong, and show me the right way to parallelize the loops, I'd really appreciate it.
However, if I print x, I see that multiple threads with the same thread ID are created, and the final result is wrong.
It would be normal for you to see multiple threads with the same x thread ID in a multi-dimensional grid, as it would also be normal to observe many iterations of the loops in your host code with the same x value. If the result is wrong, it has nothing to do with any of the code you have shown, viz:
#include <vector>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <assert.h>

void host(int* array, int DIM_x, int DIM_y, int psize)
{
    for(int x = 0; x < DIM_x; x++){
        for(int y = 0; y < DIM_y; y++){
            for(int dx = 0; dx < psize; dx++){
                array[y*DIM_x + x + dx] += 1;
            }
        }
    }
}

__global__
void kernel(int* array, int DIM_x, int DIM_y, int psize)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int dx = blockIdx.z * blockDim.z + threadIdx.z;
    if (x >= DIM_x || y >= DIM_y || dx >= psize)
        return;
    atomicAdd(&array[y*DIM_x + x + dx], 1);
}

int main()
{
    dim3 block(8, 8, 8);
    dim3 grid(96, 96, 16);
    int DIM_x = 580, DIM_y = 550, psize = 50;

    std::vector<int> array_h(DIM_x * DIM_y * psize, 0);
    std::vector<int> array_hd(DIM_x * DIM_y * psize, 0);
    thrust::device_vector<int> array_d(DIM_x * DIM_y * psize, 0);

    kernel<<<grid, block>>>(thrust::raw_pointer_cast(array_d.data()), DIM_x, DIM_y, psize);
    host(&array_h[0], DIM_x, DIM_y, psize);
    thrust::copy(array_d.begin(), array_d.end(), array_hd.begin());
    cudaDeviceSynchronize();

    for(int i=0; i<DIM_x * DIM_y * psize; i++) {
        assert( array_h[i] == array_hd[i] );
    }
    return 0;
}
which when compiled and run
$ nvcc -arch=sm_52 -std=c++11 -o looploop loop_the_loop.cu
$ cuda-memcheck ./looploop
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
emits no errors and passes the check of all elements against the host code in your question.
If you are getting incorrect results, it is likely that you have a problem with initialization of the device memory before running the kernel. Otherwise I fail to see how incorrect results could be emitted by the code you have shown.
In general, performing a large number of atomic memory transactions, as your code does, is not the optimal way to perform computation on the GPU. Avoiding atomics would probably require exploiting a priori information about the structure of the problem (such as a graph decomposition or a precise description of the write patterns of the problem).
In a 3D grid with 3D blocks, the thread ID is:
unsigned long blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
unsigned long threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
That is not the x you computed; your x is only the x index within that 3D grid, which many threads share.
There is a nice cheatsheet in this blog
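For completeness, here is a minimal kernel sketch of my own (names are illustrative) that computes both the per-axis indices and the globally unique linear thread ID from the formula above, to show how the two relate:
#include <cstdio>

__global__ void show_indices(int DIM_x, int DIM_y, int psize)
{
    // Per-axis indices: many threads share the same x, but each (x, y, dx) triple is unique.
    int x  = blockIdx.x * blockDim.x + threadIdx.x;
    int y  = blockIdx.y * blockDim.y + threadIdx.y;
    int dx = blockIdx.z * blockDim.z + threadIdx.z;

    // Globally unique linear thread ID (the formula quoted above).
    unsigned long blockId = blockIdx.x
                          + blockIdx.y * gridDim.x
                          + gridDim.x * gridDim.y * blockIdx.z;
    unsigned long threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
                           + threadIdx.z * (blockDim.x * blockDim.y)
                           + threadIdx.y * blockDim.x
                           + threadIdx.x;

    // Print one example mapping so the kernel stays cheap to run.
    if (x < DIM_x && y < DIM_y && dx < psize && threadId == 12345)
        printf("threadId %lu maps to x=%d y=%d dx=%d\n", threadId, x, y, dx);
}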

CUDA 2d convolution boundary incorrect

I implemented a naive CUDA 2D convolution and cannot get the boundary values correct. The errors occur on the top and left borders, within half a filter width; for example, with a 7x7 filter the errors are in the top 3 rows and left 3 columns (compared to the C result). Can someone help me resolve this bug? Your help is much appreciated!
Attached are my CUDA and C code:
#define ISIZE 32 //input image size ISIZE*ISIZE
#define MASK_RADIUS 3
#define MASK_WIDTH (2 * MASK_RADIUS + 1)
const int FILTER_SIZE = MASK_WIDTH * MASK_WIDTH * sizeof(float);
__device__ __constant__ float d_filter[FILTER_SIZE];

__global__ void convolution2D_cuda(float* d_Result, float* d_Data, int dataH, int dataW)
{
    // global mem address for this thread
    const int gLoc = threadIdx.x + blockIdx.x * blockDim.x +
                     (threadIdx.y + blockIdx.y * blockDim.y) * dataW;
    float sum = 0;
    float value = 0;

    for (int i = -MASK_RADIUS; i <= MASK_RADIUS; i++) //row wise
    {
        for (int j = -MASK_RADIUS; j <= MASK_RADIUS; j++) //col wise
        {
            // check row
            if ( (blockIdx.x == 0) && ((threadIdx.x + j) < 0) ) //left apron
                value = 0;
            else if ( blockIdx.x == (gridDim.x - 1) && (threadIdx.x + j) > (blockDim.x - 1) ) //right apron
                value = 0;
            else {
                // check col
                if ( blockIdx.y == 0 && (threadIdx.y + i) < 0 ) //top apron
                    value = 0;
                else if ( blockIdx.y == (gridDim.y - 1) && (threadIdx.y + i) > (blockDim.y - 1) ) //bottom apron
                    value = 0;
                else // load data
                    value = d_Data[gLoc + i * dataW + j];
            }
            //2d array case: non-separable filter
            sum += value * d_filter[ (MASK_RADIUS - i) * MASK_WIDTH + (MASK_RADIUS - j) ];
        }
    }
    d_Result[gLoc] = sum;
}
//c code
void convolution2D_cpu(float* result, float* input, float* filter, int dataW, int dataH, int k_Width, int k_Height, int radiusY, int radiusX)
{
    int y, x, ky, kx;
    for (y = 0; y < dataH; y++) { //row
        for (x = 0; x < dataW; x++) {
            result[y*dataW + x] = 0;
            float sum = 0;
            for (ky = -radiusY; ky <= radiusY; ky++) {
                for (kx = -radiusX; kx <= radiusX; kx++) {
                    int dy = y + ky;
                    int dx = x + kx;
                    if (dy >= 0 && dy < dataH) //left & upper borders
                        if (dx >= 0 && dx < dataW) //right & lower borders
                            sum += input[dy*dataW + dx] * filter[(radiusY - ky)*k_Width + (radiusX - kx)];
                }
            }
            result[y*dataW + x] = sum;
        }
    }
}
Part of the main() code is :
dim3 blocks(16, 16);
dim3 grids(width/16, height/16);
checkCudaErrors( cudaMalloc( (void **)&d_data, data_size ));
checkCudaErrors( cudaMalloc( (void **)&d_result, data_size ));
checkCudaErrors( cudaMemcpy(d_data, indata, data_size, cudaMemcpyHostToDevice) );
checkCudaErrors( cudaThreadSynchronize() );
convolution2D_cuda<<<grids, blocks>>>(d_result, d_data, width, height);
checkCudaErrors( cudaThreadSynchronize() );
checkCudaErrors( cudaMemcpy(output, d_result, data_size, cudaMemcpyDeviceToHost) );
checkCudaErrors( cudaThreadSynchronize() );
//check with result of CPU
convolution2D_cpu(c_result, indata, filter, width, height, len, len, MASK_RADIUS, MASK_RADIUS);
I got to the bottom of this mystery. The error is in the thread index calculation: threadIdx is unsigned, so nvcc treats (threadIdx.x + j) as an unsigned int. For example, if j is -1, the sum wraps around to 4294967295 (0xffffffff) and the boundary check is incorrect.
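To illustrate the wrap-around on the host side, here is a tiny sketch of my own (tid stands in for threadIdx.x); casting the unsigned index to int before mixing it with negative offsets restores the intended comparison:
#include <cstdio>

int main() {
    unsigned int tid = 0;   // stands in for threadIdx.x
    int j = -1;

    if (tid + j < 0)        // tid + j wraps to 4294967295u, so this is never true
        printf("negative\n");
    else
        printf("not negative: %u\n", tid + j);   // prints 4294967295

    if ((int)tid + j < 0)   // casting to int first gives the intended result
        printf("negative after cast\n");
    return 0;
}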

Getting wrong results from CUDA matrix multiplication kernel [duplicate]

I am new to CUDA. I have a kernel for matrix multiplication. It seems all right to me, but it fails in some cases. Please help me find where the problem is.
__global__ void matrixMultiply(float * A, float * B, float * C,
                               int numARows, int numAColumns,
                               int numBRows, int numBColumns,
                               int numCRows, int numCColumns)
{
    //## Insert code to implement matrix multiplication here
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    if (numAColumns != numBRows) return;
    if ((Row < numARows) && (Col < numBColumns)){
        float Cvalue = 0;
        for (int k = 0; k < numAColumns; ++k)
            Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col];
        C[Row*numCColumns + Col] = Cvalue;
        __syncthreads();
    }
}
I am invoking the kernel as follows.
int BLOCKX = (int)(ceil((numCRows / 8.0)));
int BLOCKY = (int)(ceil((numCColumns / 8.0)));
printf("Number of blocks: %d\t%d\n", BLOCKX, BLOCKY);
dim3 DimGrid(BLOCKX, BLOCKY);
dim3 DimBlock(8 , 8, 1);
Your code will deadlock in the block below:
if ((Row < numARows) && (Col < numBColumns)){
    float Cvalue = 0;
    for (int k = 0; k < numAColumns; ++k)
        Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col];
    C[Row*numCColumns + Col] = Cvalue;
    __syncthreads();
}
Consider a block in which the condition is satisfied for some threads but not for others. In that case, this will deadlock. Put __syncthreads() outside the if condition.
Also, replace dim3 DimGrid(BLOCKX, BLOCKY); with dim3 DimGrid(BLOCKY, BLOCKX);, since grid.x should cover the columns of C and grid.y the rows, matching how Col and Row are computed in the kernel. That should fix it.
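Putting the two fixes together, a minimal sketch of the corrected launch might look like this (keeping the question's variable names; dA, dB, dC stand for the device pointers, which the question does not show, and the __syncthreads() call can simply be dropped since the kernel uses no shared memory):
// Grid sized so that x covers the columns of C and y covers the rows,
// matching Col = blockIdx.x*blockDim.x + threadIdx.x and Row = blockIdx.y*blockDim.y + threadIdx.y.
int BLOCKX = (int)(ceil(numCRows / 8.0));     // blocks needed along the row dimension
int BLOCKY = (int)(ceil(numCColumns / 8.0));  // blocks needed along the column dimension

dim3 DimBlock(8, 8, 1);
dim3 DimGrid(BLOCKY, BLOCKX, 1);              // x -> columns, y -> rows

matrixMultiply<<<DimGrid, DimBlock>>>(dA, dB, dC,
                                      numARows, numAColumns,
                                      numBRows, numBColumns,
                                      numCRows, numCColumns);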