Shared memory slows down the blurring operation compared to the version without shared memory - CUDA

When I use shared memory in my Gaussian blur kernel, the execution time is slower than without shared memory. The code is below. Could you help me resolve this issue?
The execution time with shared memory is 0.27 ms, whereas the execution time without shared memory is 0.18 ms.
In addition, the number of inactive threads is almost twice that of the version without shared memory.
__global__
void gaus_xdirection_shared(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;

    __shared__ float columns[1024];

    int b_row = threadIdx.y;
    int b_col = threadIdx.x;
    int b_width = blockDim.x;

    if(row < height && col < width){
        int p = row * width + col;
        //Load starts
        int b_p = b_row * b_width + b_col; // block pixel = b_p
        columns[b_p] = in_channel[p];
        __syncthreads();
        //Load ends

        float p_val = 0.0f;
        for(int i = -k; i < k+1; ++i){
            if((b_col + i) > -1 && (b_col + i) < b_width){
                p_val += gaussian_kernel[k + i] * columns[b_p + i];
            }
            else{
                if((col + i) > -1 && (col + i) < width){
                    p_val += gaussian_kernel[k + i] * in_channel[p + i];
                }
            }
        }
        output_channel[p] = p_val;
    }
}
The blurring kernel that does not use shared memory is the following:
__global__
void gaus_xdirection(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    if(row < height && col < width){
        int p = row * width + col;
        float p_val = 0.0f;
        for(int i = -k; i < k+1; ++i){
            if((col + i) > -1 && (col + i) < width){
                p_val += gaussian_kernel[k + i] * in_channel[p + i];
            }
        }
        output_channel[p] = p_val;
    }
}

The problem is that you are making ineffective use of shared memory. Replacing only a few of the global loads with shared loads is not going to be sufficient. As a result, your else clause:
    else{
        if((col + i) > -1 && (col + i) < width){
            p_val += gaussian_kernel[k + i] * in_channel[p + i];
        }
    }
is getting invoked too many times (with a 32-wide block and k = 7, the 7 leftmost and 7 rightmost columns of every block fall back to global loads for at least some filter taps), and is drowning out any benefit of the shared loads in the if clause.
Instead you want to arrange a shared memory tile in such a way that all the data can be retrieved from shared memory, after it is properly loaded.
The following is an example of how it could be done (in gaus_xdirection_shared_i):
$ cat t145.cu
__global__
void gaus_xdirection_shared(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;

    __shared__ float columns[1024];

    int b_row = threadIdx.y;
    int b_col = threadIdx.x;
    int b_width = blockDim.x;

    if(row < height && col < width){
        int p = row * width + col;
        //Load starts
        int b_p = b_row * b_width + b_col; // block pixel = b_p
        columns[b_p] = in_channel[p];
        __syncthreads();
        //Load ends

        float p_val = 0.0f;
        for(int i = -k; i < k+1; ++i){
            if((b_col + i) > -1 && (b_col + i) < b_width){
                p_val += gaussian_kernel[k + i] * columns[b_p + i];
            }
            else{
                if((col + i) > -1 && (col + i) < width){
                    p_val += gaussian_kernel[k + i] * in_channel[p + i];
                }
            }
        }
        output_channel[p] = p_val;
    }
}
__global__
void gaus_xdirection_shared_i(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;

    extern __shared__ float columns[];

    int b_row = threadIdx.y;
    int b_col = threadIdx.x;
    int b_width = blockDim.x + 2*k;

    int p = row * width + col;
    //Load starts
    int b_p = b_row * b_width + b_col + k; // block pixel = b_p
    float temp;
    if( row < height && col < width)
        temp = in_channel[p];
    else
        temp = 0;
    columns[b_p] = temp;
    if (threadIdx.x < k){
        // handle left edge/border
        if (((p-k) >= row*width) && ((p-k) < width*height)) temp = in_channel[p-k];
        else temp = 0;
        columns[b_p-k] = temp;
        // handle right edge/border
        if (((p+blockDim.x) < (row+1)*width) && (row < height))
            temp = in_channel[p+blockDim.x];
        else
            temp = 0;
        columns[b_p+blockDim.x] = temp;
    }
    __syncthreads();
    //Load ends

    temp = 0.0f;
    for(int i = -k; i < k+1; ++i)
        temp += gaussian_kernel[k+i] * columns[b_p + i];
    if( row < height && col < width)
        output_channel[p] = temp;
}
__global__
void gaus_xdirection(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    if( row < height && col < width){
        int p = row * width + col;
        float p_val = 0.0f;
        for(int i = -k; i < k+1; ++i){
            if( (col + i) > -1 && (col + i) < width){
                p_val += gaussian_kernel[k + i] * in_channel[p + i];
            }
        }
        output_channel[p] = p_val;
    }
}
int main(){
    float *in_channel;
    float *output_channel;
    float *gaussian_kernel;
    int width, height, k;
    int th = 32;
    width = 1024;
    height = 1024;
    k = 7;
    cudaMalloc(&in_channel, width*height*sizeof(float));
    cudaMalloc(&output_channel, width*height*sizeof(float));
    cudaMalloc(&gaussian_kernel, (2*k+1)*sizeof(float));
    dim3 b(th, th);
    dim3 g((width+b.x-1)/b.x, (height+b.y-1)/b.y);
    gaus_xdirection_shared<<<g,b>>>(in_channel, output_channel, gaussian_kernel, width, height, k);
    gaus_xdirection_shared_i<<<g,b,th*(th+2*k)*sizeof(float)>>>(in_channel, output_channel, gaussian_kernel, width, height, k);
    gaus_xdirection<<<g,b>>>(in_channel, output_channel, gaussian_kernel, width, height, k);
    cudaDeviceSynchronize();
}
$ nvcc -o t145 t145.cu
$ cuda-memcheck ./t145
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t145
==27500== NVPROF is profiling process 27500, command: ./t145
==27500== Profiling application: ./t145
==27500== Profiling result:
            Type  Time(%)      Time  Calls       Avg       Min       Max  Name
 GPU activities:   44.53%  1.0205ms      1  1.0205ms  1.0205ms  1.0205ms  gaus_xdirection_shared(float*, float*, float*, int, int, int)
                   33.35%  764.46us      1  764.46us  764.46us  764.46us  gaus_xdirection(float*, float*, float*, int, int, int)
                   22.12%  506.95us      1  506.95us  506.95us  506.95us  gaus_xdirection_shared_i(float*, float*, float*, int, int, int)
      API calls:   97.88%  141.58ms      3  47.192ms  115.32us  141.22ms  cudaMalloc
                    1.58%  2.2808ms      1  2.2808ms  2.2808ms  2.2808ms  cudaDeviceSynchronize
                    0.36%  514.21us    202  2.5450us     165ns  118.09us  cuDeviceGetAttribute
                    0.10%  146.33us      2  73.166us  52.335us  93.998us  cuDeviceTotalMem
                    0.04%  58.346us      2  29.173us  26.147us  32.199us  cuDeviceGetName
                    0.03%  50.393us      3  16.797us  6.9170us  34.369us  cudaLaunchKernel
                    0.01%  9.5440us      2  4.7720us  1.8600us  7.6840us  cuDeviceGetPCIBusId
                    0.00%  1.3980us      3     466ns     279ns     801ns  cuDeviceGetCount
                    0.00%  1.3100us      4     327ns     186ns     712ns  cuDeviceGet
                    0.00%     564ns      2     282ns     237ns     327ns  cuDeviceGetUuid
$
I have not carefully tested the above code; it may contain defects. But it should give you an idea of how to structure a larger shared memory tile, and it seems to run without runtime errors and to be faster.
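For reference, the dynamic shared memory size passed at launch (the third kernel launch parameter) has to match the padded tile that gaus_xdirection_shared_i indexes: blockDim.y rows, each blockDim.x + 2*k floats wide. A sketch of that sizing arithmetic, restating the launch from the code above (the shmem_bytes name is mine):
    // padded tile: th rows of (th + 2*k) floats, i.e. a k-wide halo on each side of a row
    // with th = 32 and k = 7: 32 * (32 + 14) * sizeof(float) = 5888 bytes per block
    size_t shmem_bytes = th * (th + 2*k) * sizeof(float);
    gaus_xdirection_shared_i<<<g, b, shmem_bytes>>>(in_channel, output_channel,
                                                    gaussian_kernel, width, height, k);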

Related

What is a correct way to implement memcpy inside a CUDA kernel?

I am implementing a PDE solver (Lax-Friedrichs) in CUDA that I previously wrote in C. Please find the C code below:
void solve(int M, double u[M+3][M+3], double unp1[M+3][M+3], double params[3]){
    int i;
    int j;
    int n;
    for (n=0; n<params[0]; n++){
        for (i=0; i<M+2; i++)
            for(j=0; j<M+2; j++){
                unp1[i][j] = 0.25*(u[i+1][j] + u[i-1][j] + u[i][j+1] + u[i][j-1])
                             - params[1]*(u[i+1][j] - u[i-1][j])
                             - params[2]*(u[i][j+1] - u[i][j-1]);
            }
        memcpy(u, unp1, pow(M+3,2)*sizeof(double));
        /*Periodic Boundary Conditions*/
        for (i=0; i<M+2; i++){
            u[0][i] = u[N+1][i];
            u[i][0] = u[i][N+1];
            u[N+2][i] = u[1][i];
            u[i][N+2] = u[i][1];
        }
    }
}
And it works fine. But when I try to implement it in CUDA, I do not get the same data. Unfortunately I cannot pinpoint the exact problem, since I am a total beginner at parallel programming. I think it might have to do with the u[i*(N+3) + j] = unp1[i*(N+3) + j] in the solver, since I cannot really perform a memcpy inside the kernel, but that assignment doesn't change anything and I don't know how to proceed. I took a look at this previous answer, but it unfortunately couldn't help solve my problem. Here is the solver I am trying to code in CUDA:
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <iostream>
#include <algorithm>

/*Configuration of the grid*/
const int N = 100; //Number of nodes
const double xmin = -1.0;
const double ymin = -1.0;
const double xmax = 1.0;
const double ymax = 1.0;
const double tmax = 0.5;

/*Configuration of the simulation physics*/
const double dx = (xmax - xmin)/N;
const double dy = (ymax - ymin)/N;
const double dt = 0.009;
const double vx = 1.0;
const double vy = 1.0;

__global__ void initializeDomain(double *x, double *y){
    /*Initializes the grid of size (N+3)x(N+3) to better accommodate Boundary Conditions*/
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int j=index; j<N+3; j+=stride){
        x[j] = xmin + (j-1)*dx;
        y[j] = ymin + (j-1)*dy;
    }
}
__global__ void initializeU(double *x, double *y, double *u0){
    double sigma_x = 2.0;
    double sigma_y = 6.0;
    int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    int stride_x = blockDim.x * gridDim.x;
    int index_y = blockIdx.y * blockDim.y + threadIdx.y;
    int stride_y = blockDim.y * gridDim.y;
    for (int i = index_x; i < N+3; i += stride_x)
        for (int j = index_y; j < N+3; j += stride_y){
            u0[i*(N+3) + j] = exp(-200*(pow(x[i],2)/(2*pow(sigma_x,2)) + pow(y[j],2)/(2*pow(sigma_y,2))));
            u0[i*(N+3) + j] *= 1/(2*M_PI*sigma_x*sigma_y);
            //u[i*(N+3) + j] = u0[i*(N+3) + j];
            //unp1[i*(N+3) + j] = u0[i*(N+3) + j];
        }
}

void initializeParams(double params[3]){
    params[0] = round(tmax/dt);
    params[1] = vx*dt/(2*dx);
    params[2] = vy*dt/(2*dy);
}
__global__ void solve(double *u, double *unp1, double params[3]){
    int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    int stride_x = blockDim.x * gridDim.x;
    int index_y = blockIdx.y * blockDim.y + threadIdx.y;
    int stride_y = blockDim.y * gridDim.y;
    for (int i = index_x; i < N+2; i += stride_x)
        for (int j = index_y; j < N+2; j += stride_y){
            unp1[i*(N+3) + j] = 0.25*(u[(i+1)*(N+3) + j] + u[(i-1)*(N+3) + j] + u[i*(N+3) + (j+1)] + u[i*(N+3) + (j-1)])
                                - params[1]*(u[(i+1)*(N+3) + j] - u[(i-1)*(N+3) + j])
                                - params[2]*(u[i*(N+3) + (j+1)] - u[i*(N+3) + (j-1)]);
        }
}

__global__ void bc(double *u){
    int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    int stride_x = blockDim.x * gridDim.x;
    /*BCs are also set in parallel*/
    for (int i = index_x; i < N+2; i += stride_x){
        u[0*(N+3) + i] = u[(N+1)*(N+3) + i];
        u[i*(N+3) + 0] = u[i*(N+3) + (N+1)];
        u[(N+2)*(N+3) + i] = u[1*(N+3) + i];
        u[i*(N+3) + (N+2)] = u[i*(N+3) + 1];
    }
}
int main(){
    int i;
    int j;
    double *x = (double *)malloc((N+3)*sizeof(double));
    double *y = (double *)malloc((N+3)*sizeof(double));
    double *d_x, *d_y;
    cudaMalloc(&d_x, (N+3)*sizeof(double));
    cudaMalloc(&d_y, (N+3)*sizeof(double));

    initializeDomain<<<1, 1>>>(d_x, d_y);
    cudaDeviceSynchronize();

    cudaMemcpy(x, d_x, (N+3)*sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(y, d_y, (N+3)*sizeof(double), cudaMemcpyDeviceToHost);

    FILE *fout1 = fopen("data_x.csv", "w");
    FILE *fout2 = fopen("data_y.csv", "w");
    for (i=0; i<N+3; i++){
        if (i==N+2){
            fprintf(fout1, "%.5f", x[i]);
            fprintf(fout2, "%.5f", y[i]);
        }
        else{
            fprintf(fout1, "%.5f, ", x[i]);
            fprintf(fout2, "%.5f, ", y[i]);
        }
    }

    dim3 Block2D(1,1);
    dim3 ThreadsPerBlock(1,1);

    double *d_u0;
    double *u0 = (double *)malloc((N+3)*(N+3)*sizeof(double));
    cudaMalloc(&d_u0, (N+3)*(N+3)*sizeof(double));
    initializeU<<<Block2D, ThreadsPerBlock>>>(d_x, d_y, d_u0);
    cudaDeviceSynchronize();
    cudaMemcpy(u0, d_u0, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToHost);

    /*Initialize parameters*/
    double params[3];
    initializeParams(params);

    /*Allocate memory for u and unp1 on device for the solver*/
    double *d_u, *d_unp1;
    cudaMalloc(&d_u, (N+3)*(N+3)*sizeof(double));
    cudaMalloc(&d_unp1, (N+3)*(N+3)*sizeof(double));
    cudaMemcpy(d_u, d_u0, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToDevice);
    cudaMemcpy(d_unp1, d_u0, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToDevice);

    /*Solve*/
    for (int n=0; n<params[0]; n++){
        solve<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
        double *temp = d_u;
        d_u = d_unp1;
        d_unp1 = temp;
        bc<<<1,1>>>(d_u);
        cudaDeviceSynchronize();
    }

    /*Copy results to host*/
    double *u = (double *)malloc((N+3)*(N+3)*sizeof(double));
    cudaMemcpy(u, d_u, (N+3)*(N+3)*sizeof(double), cudaMemcpyDeviceToHost);
    FILE *fu = fopen("data_u.csv", "w");
    for (i=0; i<N+3; i++){
        for(j=0; j<N+3; j++)
            if (j==N+2)
                fprintf(fu, "%.5f", u[i*(N+3) + j]);
            else
                fprintf(fu, "%.5f, ", u[i*(N+3) + j]);
        fprintf(fu, "\n");
    }
    fclose(fu);
    free(x);
    free(y);
    free(u0);
    free(u);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_u0);
    cudaFree(d_u);
    cudaFree(d_unp1);
    return 0;
}
Unfortunately I keep having the same issue: the data I get is all 0.0000.
One thing that is tripping you up is that your original algorithm has an ordering that is required for correctness:

1. Update unp1 from u
2. Copy unp1 to u
3. Enforce the boundary conditions
4. Repeat

Your algorithm requires that step 1 be completed entirely before step 2 begins, and likewise for step 2 before step 3. Your CUDA realization (putting steps 1 and 3, or steps 1, 2, and 3, in a single kernel) does not preserve or guarantee that ordering. CUDA threads can execute in any order. If you apply that rigorously to your code (for example, imagine that the thread with index 0 executes completely before any other thread begins; that would be valid CUDA execution), then you will see that your kernel design does not preserve the required ordering.
So do something like this:
Create a solve kernel that is just the first step:
__global__ void solve(double *u, double *unp1, double params[3]){
    int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    int stride_x = blockDim.x * gridDim.x;
    int index_y = blockIdx.y * blockDim.y + threadIdx.y;
    int stride_y = blockDim.y * gridDim.y;
    for (int i = index_x; i < N+2; i += stride_x)
        for (int j = index_y; j < N+2; j += stride_y){
            unp1[i*(N+3) + j] = 0.25*(u[(i+1)*(N+3) + j] + u[(i-1)*(N+3) + j] + u[i*(N+3) + (j+1)] + u[i*(N+3) + (j-1)])
                                - params[1]*(u[(i+1)*(N+3) + j] - u[(i-1)*(N+3) + j])
                                - params[2]*(u[i*(N+3) + (j+1)] - u[i*(N+3) + (j-1)]);
        }
}
Don't bother with the memcpy operation; the better way to do that is swapping pointers (in host code).
Create a separate kernel to enforce the boundary:
__global__ void bc(double *u, double *unp1, double params[3]){
    int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    int stride_x = blockDim.x * gridDim.x;
    int index_y = blockIdx.y * blockDim.y + threadIdx.y;
    int stride_y = blockDim.y * gridDim.y;
    /*BCs are also set in parallel*/
    for (int i = index_x; i < N+2; i += stride_x){
        u[0*(N+3) + i] = u[(N+1)*(N+3) + i];
        u[i*(N+3) + 0] = u[i*(N+3) + (N+1)];
        u[(N+2)*(N+3) + i] = u[1*(N+3) + i];
        u[i*(N+3) + (N+2)] = u[i*(N+3) + 1];
    }
}
Modify your host code to call these kernels in sequence, with the pointer swap in-between:
/*Solve*/
for(int n = 0; n<params[0]; n++){
    solve<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
    double *temp = d_u;
    d_u = d_unp1;
    d_unp1 = temp;
    bc<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params);
    cudaDeviceSynchronize();
}
(coded in browser, not tested)
This will enforce the ordering that your algorithm requires.
NOTE: As identified in the comments below, the solve kernel as depicted above (and in the OP's original post, and in their posted CPU code version) has indexing errors, at least associated with the i-1 and j-1 indexing patterns. These should be fixed, otherwise the code is broken. Fixing them requires some decision about what to do for the edge cases, on which the OP provides no guidance, so I have left that code as-is.
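One additional caveat, beyond what is discussed above: params is a host stack array, and passing it to these kernels as double params[3] passes a host pointer, which the device cannot safely dereference. A sketch of one workaround (the parameter names c1 and c2 are mine, untested): pass the coefficients by value instead:

__global__ void solve(double *u, double *unp1, double c1, double c2){
    // grid-stride loops as before; the i-1/j-1 indexing issues noted above still apply
    int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    int stride_x = blockDim.x * gridDim.x;
    int index_y = blockIdx.y * blockDim.y + threadIdx.y;
    int stride_y = blockDim.y * gridDim.y;
    for (int i = index_x; i < N+2; i += stride_x)
        for (int j = index_y; j < N+2; j += stride_y){
            unp1[i*(N+3) + j] = 0.25*(u[(i+1)*(N+3) + j] + u[(i-1)*(N+3) + j] + u[i*(N+3) + (j+1)] + u[i*(N+3) + (j-1)])
                                - c1*(u[(i+1)*(N+3) + j] - u[(i-1)*(N+3) + j])
                                - c2*(u[i*(N+3) + (j+1)] - u[i*(N+3) + (j-1)]);
        }
}

// host side: params itself stays on the host, only the two scalars are passed
solve<<<Block2D, ThreadsPerBlock>>>(d_u, d_unp1, params[1], params[2]);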

How can I do image convolution in CUDA?

I have a question about image convolution in CUDA. When I test it with a small matrix (16*16), everything is OK. But with a larger matrix, the result changes every time I run it.
I think the problem is the two for loops in the kernel.
__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv,
                                         int img_width, const int img_height,
                                         const int kernel_width, const int kernel_height)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    float sum = 0;
    for ( int j = 0; j < kernel_height; j++ )
    {
        for ( int i = 0; i < kernel_width; i++ )
        {
            int dX = x + i - kernel_width / 2;
            int dY = y + j - kernel_height / 2;
            if ( dX < 0 )
                dX = 0;
            if ( dX >= img_width )
                dX = img_width - 1;
            if ( dY < 0 )
                dY = 0;
            if ( dY >= img_height )
                dY = img_height - 1;
            const int idMat = j * kernel_width + i;
            const int idPixel = dY * img_width + dX;
            sum += (float)input[idPixel] * kernelConv[idMat];
        }
    }
    const int idOut = y * img_width + x;
    out[idOut] = abs(sum);
}
void image_convolution(float * input, float* output, int img_height, int img_width)
{
    int kernel_height = 3;
    int kernel_width = 3;
    float kernel[] = { 0,    -0.25, 0,
                       -0.25, 1,    -0.25,
                       0,    -0.25, 0 };
    float * mask = new float[kernel_height*kernel_width];
    for (int i = 0; i < kernel_height*kernel_width; i++)
    {
        mask[i] = kernel[i];
    }

    float * d_input, * d_output, * d_kernel;
    cudaMalloc(&d_input, img_width*img_height*sizeof(float));
    cudaMalloc(&d_output, img_width*img_height*sizeof(float));
    cudaMalloc(&d_kernel, kernel_height*kernel_width*sizeof(float));
    cudaMemcpy(d_input, input, img_width*img_height*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_kernel, mask, kernel_height*kernel_width*sizeof(float), cudaMemcpyHostToDevice);

    dim3 blocksize(16,16);
    dim3 gridsize;
    gridsize.x = (img_width + blocksize.x - 1)/blocksize.x;
    gridsize.y = (img_height + blocksize.y - 1)/blocksize.y;

    image_convolution_kernel<<<gridsize,blocksize>>>(d_input, d_output, d_kernel, img_width, img_height, kernel_width, kernel_height);
    cudaMemcpy(output, d_output, img_width*img_height*sizeof(float), cudaMemcpyDeviceToHost);

    for (int i=0; i < img_width*img_height; i++)
    {
        printf("%d, ", (int)output[i]);
    }
    printf("\n\n");
}
Here is my result. I tested it with a 24*24 image, ran it twice, and also wrote a simple function to compare the outputs.
And here is the result when I compare the outputs: there are 32 differences, at indices 240, 241, and so on.
You have made a fairly common error in your program. When you create a grid of threads like this:
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
you are intentionally creating (usually) extra threads in each dimension, so as to fully cover the problem space (i.e. image size). There is nothing wrong with this.
However, it means we will be launching extra threads, which are outside the valid image dimension. We must ensure that these threads do nothing. The usual approach is to add a thread check to the kernel, so that threads outside the valid image dimensions do nothing. Here's a modified kernel and fully worked example showing that change:
$ cat t1219.cu
#include <iostream>
#include <cstdlib>

const int iw = 1025;
const int ih = 1025;
const int rng = 10;

__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv,
                                         int img_width, const int img_height,
                                         const int kernel_width, const int kernel_height)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if ((x < img_width) && (y < img_height)){ // thread check
        float sum = 0;
        for ( int j = 0; j < kernel_height; j++ )
        {
            for ( int i = 0; i < kernel_width; i++ )
            {
                int dX = x + i - kernel_width / 2;
                int dY = y + j - kernel_height / 2;
                if ( dX < 0 )
                    dX = 0;
                if ( dX >= img_width )
                    dX = img_width - 1;
                if ( dY < 0 )
                    dY = 0;
                if ( dY >= img_height )
                    dY = img_height - 1;
                const int idMat = j * kernel_width + i;
                const int idPixel = dY * img_width + dX;
                sum += (float)input[idPixel] * kernelConv[idMat];
            }
        }
        const int idOut = y * img_width + x;
        out[idOut] = abs(sum);
    }
}
void image_convolution(float * input, float* output, int img_height, int img_width)
{
    int kernel_height = 3;
    int kernel_width = 3;
    float kernel[] = { 0,    -0.25, 0,
                       -0.25, 1,    -0.25,
                       0,    -0.25, 0 };
    float * mask = new float[kernel_height*kernel_width];
    for (int i = 0; i < kernel_height*kernel_width; i++)
    {
        mask[i] = kernel[i];
    }
    float * d_input, * d_output, * d_kernel;
    cudaMalloc(&d_input, img_width*img_height*sizeof(float));
    cudaMalloc(&d_output, img_width*img_height*sizeof(float));
    cudaMalloc(&d_kernel, kernel_height*kernel_width*sizeof(float));
    cudaMemcpy(d_input, input, img_width*img_height*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_kernel, mask, kernel_height*kernel_width*sizeof(float), cudaMemcpyHostToDevice);
    dim3 blocksize(16,16);
    dim3 gridsize;
    gridsize.x = (img_width + blocksize.x - 1)/blocksize.x;
    gridsize.y = (img_height + blocksize.y - 1)/blocksize.y;
    image_convolution_kernel<<<gridsize,blocksize>>>(d_input, d_output, d_kernel, img_width, img_height, kernel_width, kernel_height);
    cudaMemcpy(output, d_output, img_width*img_height*sizeof(float), cudaMemcpyDeviceToHost);
}
int main(){
    float *in, *out;
    int is = ih*iw;
    in = new float[is];
    out = new float[is];
    for (int i = 0; i < is; i++) {in[i] = rand()%rng; out[i] = -1;}
    image_convolution(in, out, ih, iw);
    for (int iy = 1; iy < ih-1; iy++)
        for (int ix = 1; ix < iw-1; ix++){
            float temp = abs(-0.25 * (in[iy*iw + ix -1] + in[iy*iw + ix +1] + in[(iy-1)*iw + ix] + in[(iy+1)*iw + ix]) + in[iy*iw+ix]);
            if (out[iy*iw+ix] != temp) {std::cout << "mismatch x: " << ix << " y: " << iy << " was: " << out[iy*iw+ix] << " should be: " << temp << std::endl; return 1;}
        }
    return 0;
}
$ nvcc -o t1219 t1219.cu
$ cuda-memcheck ./t1219
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
For image dimensions which are exact multiples of the block size (16,16) (which was true for my previous test case) this problem won't show up -- the code will happen to work correctly. For all other cases, we need such a thread check. Here, for example, the 1025x1025 image launches a 65x65 grid of 16x16 blocks, i.e. a 1040x1040 thread array, so without the check the 15 extra threads in each dimension would compute and write out-of-range pixels.

CUDA 2d convolution boundary incorrect

I implemented a naive CUDA 2D convolution and cannot get the boundary values correct. The errors occur on the top and left borders, in a band half the filter width wide. For example, if my filter is 7x7, the errors reside in the top 3 rows and left 3 columns of pixels (compared to the C result). Can someone help me resolve this bug? Your help is very much appreciated!
Attached are my CUDA code and C code:
#define ISIZE 32 //input image size ISIZE*ISIZE
#define MASK_RADIUS 3
#define MASK_WIDTH (2 * MASK_RADIUS + 1)
const int FILTER_SIZE = MASK_WIDTH * MASK_WIDTH * sizeof(float);
__device__ __constant__ float d_filter[FILTER_SIZE];

__global__ void convolution2D_cuda(float* d_Result, float* d_Data, int dataH, int dataW)
{
    // global mem address for this thread
    const int gLoc = threadIdx.x + blockIdx.x * blockDim.x +
                     (threadIdx.y + blockIdx.y * blockDim.y) * dataW;
    float sum = 0;
    float value = 0;
    for(int i = -MASK_RADIUS; i <= MASK_RADIUS; i++) //row wise
    {
        for (int j = -MASK_RADIUS; j <= MASK_RADIUS; j++) //col wise
        {
            // check row
            if ( (blockIdx.x == 0) && ((threadIdx.x + j) < 0) ) //left apron
                value = 0;
            else if ( blockIdx.x == (gridDim.x -1) && (threadIdx.x + j) > (blockDim.x-1) ) //right apron
                value = 0;
            else {
                // check col
                if ( blockIdx.y == 0 && (threadIdx.y + i) < 0) //top apron
                    value = 0;
                else if ( blockIdx.y == (gridDim.y-1) && (threadIdx.y + i) > (blockDim.y-1) ) //bottom apron
                    value = 0;
                else // load data
                    value = d_Data[gLoc + i * dataW + j];
            }
            //2d array case: non-separable filter
            sum += value * d_filter[ (MASK_RADIUS - i) * MASK_WIDTH + (MASK_RADIUS - j) ];
        }
    }
    d_Result[gLoc] = sum;
}
//c code
void convolution2D_cpu(float* result, float* input, float* filter, int dataW, int dataH, int k_Width, int k_Height, int radiusY, int radiusX)
{
    int y, x, ky, kx;
    for (y = 0; y < dataH; y++) { //row
        for (x = 0; x < dataW; x++) {
            result[y*dataW + x] = 0;
            float sum = 0;
            for(ky = -radiusY; ky <= radiusY; ky++) {
                for(kx = -radiusX; kx <= radiusX; kx++) {
                    int dy = y + ky;
                    int dx = x + kx;
                    if (dy >= 0 && dy < dataH) //left & upper borders
                        if (dx >= 0 && dx < dataW) //right & lower borders
                            sum += input[dy*dataW + dx] * filter[(radiusY-ky)*k_Width + (radiusX - kx)];
                }
            }
            result[y*dataW + x] = sum;
        }
    }
}
Part of the main() code is:
dim3 blocks(16, 16);
dim3 grids(width/16, height/16);
checkCudaErrors( cudaMalloc( (void **)&d_data, data_size ));
checkCudaErrors( cudaMalloc( (void **)&d_result, data_size ));
checkCudaErrors( cudaMemcpy(d_data, indata, data_size, cudaMemcpyHostToDevice) );
checkCudaErrors( cudaThreadSynchronize() );
convolution2D_cuda<<<grids, blocks>>>(d_result, d_data, width, height);
checkCudaErrors( cudaThreadSynchronize() );
checkCudaErrors( cudaMemcpy(output, d_result, data_size, cudaMemcpyDeviceToHost) );
checkCudaErrors( cudaThreadSynchronize() );
//check with result of CPU
convolution2D_cpu(c_result, indata, filter, width, height, len, len, MASK_RADIUS, MASK_RADIUS);
I got to the bottom of this mystery. The error happens in the thread index calculation: threadIdx is a uint, so nvcc evaluates (threadIdx.x + j) as unsigned int. For example, if j is -1, the sum wraps around to 4294967295 (0xffffffff) and the boundary check is incorrect.
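In other words, force signed arithmetic in those comparisons. A sketch of the fixed apron checks (the tx/ty names are mine, untested):

// inside the i/j loops: do the apron tests in signed int so negative offsets stay negative
int tx = (int)threadIdx.x + j;
int ty = (int)threadIdx.y + i;
if ( (blockIdx.x == 0) && (tx < 0) ) //left apron
    value = 0;
else if ( blockIdx.x == (gridDim.x - 1) && tx > (int)(blockDim.x - 1) ) //right apron: cast the RHS too, so the comparison stays signed
    value = 0;
else {
    if ( blockIdx.y == 0 && (ty < 0) ) //top apron
        value = 0;
    else if ( blockIdx.y == (gridDim.y - 1) && ty > (int)(blockDim.y - 1) ) //bottom apron
        value = 0;
    else // load data
        value = d_Data[gLoc + i * dataW + j];
}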

Getting wrong results from CUDA matrix multiplication kernel [duplicate]

This question already has answers here:
Multiply Rectangular Matrices in CUDA
(5 answers)
Closed 7 years ago.
I am new to CUDA. I have a kernel to do matrix multiplication. It seems all right to me, but it fails in some cases. Please help me find where the problem is.
__global__ void matrixMultiply(float * A, float * B, float * C,
                               int numARows, int numAColumns,
                               int numBRows, int numBColumns,
                               int numCRows, int numCColumns)
{
    //## Insert code to implement matrix multiplication here
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    if (numAColumns != numBRows) return;
    if ((Row < numARows) && (Col < numBColumns)){
        float Cvalue = 0;
        for (int k = 0; k < numAColumns; ++k)
            Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col];
        C[Row*numCColumns + Col] = Cvalue;
        __syncthreads();
    }
}
I am invoking the kernel as follows.
int BLOCKX = (int)(ceil((numCRows / 8.0)));
int BLOCKY = (int)(ceil((numCColumns / 8.0)));
printf("Number of blocks: %d\t%d\n", BLOCKX, BLOCKY);
dim3 DimGrid(BLOCKX, BLOCKY);
dim3 DimBlock(8 , 8, 1);
Your code will deadlock in the section below:
if ((Row < numARows) && (Col < numBColumns)){
    float Cvalue = 0;
    for (int k = 0; k < numAColumns; ++k)
        Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col];
    C[Row*numCColumns + Col] = Cvalue;
    __syncthreads();
}
Consider a block in which the condition is satisfied for some threads but not for others. In that case, this will deadlock. Put __syncthreads() outside the if condition.
Also replace dim3 DimGrid(BLOCKX, BLOCKY); with dim3 DimGrid(BLOCKY, BLOCKX); since BLOCKX was computed from the row count and BLOCKY from the column count, while the grid's .x dimension must cover columns and its .y dimension must cover rows. That should fix it.

What did I miss while converting from CUDA to OpenCL? Or why is my kernel returning different output than the serial code?

This is my code for multiplication of a sparse matrix in compressed column format
__kernel void mykernel(__global int* colvector,
                       __global int* val,
                       __global int* result,
                       __global int* index,
                       __global int* rowptr,
                       __global int* sync)
{
    __local int vals[1000];
    for(int i=0; i<4; i++)
    {
        result[i] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    barrier(CLK_GLOBAL_MEM_FENCE);

    const int items_per_row = 32; //total threads working in a row
    const int thread_id = get_global_id(0) + get_local_id(0); //total threads in the program
    const int warpid = thread_id/items_per_row; //warp id is actual row
    int lane = thread_id & (items_per_row-1); //thread id within the warp
    int row = warpid;

    if(row < 4)
    {
        int sum = 0;
        int row_start = rowptr[row];
        int row_end = rowptr[row+1];
        vals[get_global_id(0)] = 0;
        barrier(CLK_LOCAL_MEM_FENCE);
        barrier(CLK_GLOBAL_MEM_FENCE);
        for (int i = row_start+lane; i < row_end; i += items_per_row)
        {
            vals[get_local_id(0)] += val[i]*colvector[index[i]];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        barrier(CLK_GLOBAL_MEM_FENCE);
        if (lane < 16) vals[get_local_id(0)] += vals[get_local_id(0) + 16];
        if (lane < 8)  vals[get_local_id(0)] += vals[get_local_id(0) + 8];
        if (lane < 4)  vals[get_local_id(0)] += vals[get_local_id(0) + 4];
        if (lane < 2)  vals[get_local_id(0)] += vals[get_local_id(0) + 2];
        if (lane < 1)  vals[get_local_id(0)] += vals[get_local_id(0) + 1];
        barrier(CLK_LOCAL_MEM_FENCE);
        barrier(CLK_GLOBAL_MEM_FENCE);
        if(lane == 0)
        {
            result[row] += vals[get_local_id(0)];
        }
    }
}
The above OpenCL code was converted from the CUDA code given below:
__global__ void spmv_csr_vector_kernel(const int num_rows,
                                       const int * ptr,
                                       const int * indices,
                                       const float * data,
                                       const float * x,
                                       float * y)
{
    extern __shared__ float vals[];
    int thread_id = blockDim.x * blockIdx.x + threadIdx.x; // global thread index
    int warp_id = thread_id / 32;                          // global warp index
    int lane = thread_id & (32 - 1);                       // thread index within the warp

    // one warp per row
    int row = warp_id;
    if (row < num_rows)
    {
        int row_start = ptr[row];
        int row_end = ptr[row+1];

        // compute running sum per thread
        vals[threadIdx.x] = 0;
        for(int jj = row_start + lane; jj < row_end; jj += 32)
        {
            vals[threadIdx.x] += data[jj] * x[indices[jj]];
        }

        // parallel reduction in shared memory
        if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16];
        if (lane < 8)  vals[threadIdx.x] += vals[threadIdx.x + 8];
        if (lane < 4)  vals[threadIdx.x] += vals[threadIdx.x + 4];
        if (lane < 2)  vals[threadIdx.x] += vals[threadIdx.x + 2];
        if (lane < 1)  vals[threadIdx.x] += vals[threadIdx.x + 1];

        // first thread writes the result
        if (lane == 0)
        {
            y[row] += vals[threadIdx.x];
        }
    }
}
The CUDA code is correct, but my OpenCL kernel is not returning the correct output. I have been trying for a week now with no solution. Does anybody know what mistake I am making?
I can see at least one mistake: thread_id is not computed the same way in the two versions. blockDim.x * blockIdx.x + threadIdx.x in CUDA corresponds to get_global_id(0) in OpenCL, not to get_global_id(0) + get_local_id(0). Also, get_local_id(0) corresponds to threadIdx.x.
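For reference, the usual mapping between the two indexing models (along dimension 0/x) is:

// CUDA                                      OpenCL
// threadIdx.x                               get_local_id(0)
// blockIdx.x                                get_group_id(0)
// blockDim.x                                get_local_size(0)
// blockDim.x * blockIdx.x + threadIdx.x     get_global_id(0)
// gridDim.x * blockDim.x                    get_global_size(0)

With that mapping, thread_id in your OpenCL kernel should simply be get_global_id(0).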
Try using Swan; it might help you understand your problem. You can find an article about it here.