How can I convolve an image in CUDA? - cuda

I have a question about image convolution in CUDA. When I test it with a small matrix (16*16), everything is OK. But with a larger matrix, the result changes every time I run it.
I think the problem is the two for loops inside the kernel.
// Convolves a row-major float image with a kernel_width x kernel_height mask.
// Each thread derives one (x, y) output position from its block/thread index,
// accumulates the weighted neighbourhood, and stores the absolute value of the sum.
// Taps that fall outside the image are clamped to the nearest edge pixel.
// NOTE(review): x and y are never compared against img_width / img_height, so a
// thread whose (x, y) lies beyond the image still executes the final store and
// writes out[y * img_width + x] at a position that is not its own pixel.
__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv,
int img_width, const int img_height,
const int kernel_width, const int kernel_height )
{
// Output coordinates handled by this thread.
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
float sum = 0;
for ( int j = 0; j < kernel_height; j++ )
{
for ( int i = 0; i < kernel_width; i++ )
{
// Source coordinates of this tap, centred on (x, y).
int dX = x + i - kernel_width / 2;
int dY = y + j - kernel_height / 2;
// Clamp the tap to the image rectangle.
if ( dX < 0 )
dX = 0;
if ( dX >= img_width )
dX = img_width - 1;
if ( dY < 0 )
dY = 0;
if ( dY >= img_height )
dY = img_height - 1;
const int idMat = j * kernel_width + i;
const int idPixel = dY * img_width + dX;
sum += (float)input[idPixel] * kernelConv[idMat];
}
}
// Unconditional store: see the NOTE(review) in the header comment.
const int idOut = y * img_width + x;
out[idOut] = abs(sum);
}
// Host wrapper: uploads `input` (img_width * img_height floats, row-major) and a
// fixed 3x3 mask to the device, launches image_convolution_kernel with 16x16
// blocks on a grid rounded up to cover the image, copies the result into
// `output`, and prints every output value truncated to int.
// All device allocations are released before returning; the mask is uploaded
// straight from the stack array, so no heap copy is needed.
void image_convolution(float * input,float* output, int img_height, int img_width)
{
    const int kernel_height = 3;
    const int kernel_width = 3;
    const float kernel[] = {  0.0f,  -0.25f,  0.0f,
                             -0.25f,  1.0f,  -0.25f,
                              0.0f,  -0.25f,  0.0f };

    // Widen before multiplying so the byte count cannot overflow int.
    const size_t img_bytes = (size_t)img_width * img_height * sizeof(float);
    const size_t kernel_bytes = sizeof(kernel);

    float *d_input, *d_output, *d_kernel;
    cudaMalloc(&d_input, img_bytes);
    cudaMalloc(&d_output, img_bytes);
    cudaMalloc(&d_kernel, kernel_bytes);
    cudaMemcpy(d_input, input, img_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_kernel, kernel, kernel_bytes, cudaMemcpyHostToDevice);

    // Ceil-divide so partial tiles at the right/bottom edges are still covered.
    dim3 blocksize(16,16);
    dim3 gridsize;
    gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
    gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
    image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height);

    // Blocking copy: also waits for the kernel to finish.
    cudaMemcpy(output, d_output, img_bytes, cudaMemcpyDeviceToHost);

    cudaFree(d_input);
    cudaFree(d_output);
    cudaFree(d_kernel);

    for (int i=0; i < img_width*img_height; i++)
    {
        printf("%d, ",(int)output[i]);
    }
    printf("\n\n");
}
Here is my result. I tested it with a 24*24 image, ran it twice, and also wrote a simple function to compare the two outputs.
When I compare the outputs, there are 32 differences, at indices 240, 241, ...

You have made a fairly common error in your program. When you create a grid of threads like this:
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
you are intentionally creating (usually) extra threads in each dimension, so as to fully cover the problem space (i.e. image size). There is nothing wrong with this.
However, it means we will be launching extra threads, which are outside the valid image dimension. We must ensure that these threads do nothing. The usual approach is to add a thread check to the kernel, so that threads outside the valid image dimensions do nothing. Here's a modified kernel and fully worked example showing that change:
$ cat t1219.cu
#include <iostream>
#include <cstdlib>
const int iw = 1025;
const int ih = 1025;
const int rng = 10;
// Convolves a row-major float image with a kernel_width x kernel_height mask and
// stores the absolute value of each weighted sum in `out`.
// One thread produces one output pixel; threads whose coordinates fall outside
// the image (the rounded-up part of the grid) exit without touching memory.
// Taps that would read outside the image are clamped to the nearest edge pixel.
__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv,
                                         int img_width, const int img_height,
                                         const int kernel_width, const int kernel_height )
{
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;

    // Thread check: nothing to do for threads beyond the image.
    if (col >= img_width || row >= img_height)
        return;

    const int half_w = kernel_width / 2;
    const int half_h = kernel_height / 2;

    float acc = 0;
    for (int ky = 0; ky < kernel_height; ++ky)
    {
        // Clamp the source row into [0, img_height - 1].
        const int src_row = min(max(row + ky - half_h, 0), img_height - 1);
        for (int kx = 0; kx < kernel_width; ++kx)
        {
            // Clamp the source column into [0, img_width - 1].
            const int src_col = min(max(col + kx - half_w, 0), img_width - 1);
            acc += input[src_row * img_width + src_col] * kernelConv[ky * kernel_width + kx];
        }
    }

    out[row * img_width + col] = fabsf(acc);
}
// Host wrapper: uploads `input` (img_width * img_height floats, row-major) and a
// fixed 3x3 mask to the device, launches image_convolution_kernel with 16x16
// blocks on a grid rounded up to cover the image, and copies the result into
// `output`.
// All device allocations are released before returning; the mask is uploaded
// straight from the stack array, so no heap copy is needed.
void image_convolution(float * input,float* output, int img_height, int img_width)
{
    const int kernel_height = 3;
    const int kernel_width = 3;
    const float kernel[] = {  0.0f,  -0.25f,  0.0f,
                             -0.25f,  1.0f,  -0.25f,
                              0.0f,  -0.25f,  0.0f };

    // Widen before multiplying so the byte count cannot overflow int.
    const size_t img_bytes = (size_t)img_width * img_height * sizeof(float);
    const size_t kernel_bytes = sizeof(kernel);

    float *d_input, *d_output, *d_kernel;
    cudaMalloc(&d_input, img_bytes);
    cudaMalloc(&d_output, img_bytes);
    cudaMalloc(&d_kernel, kernel_bytes);
    cudaMemcpy(d_input, input, img_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_kernel, kernel, kernel_bytes, cudaMemcpyHostToDevice);

    // Ceil-divide so partial tiles at the right/bottom edges are still covered.
    dim3 blocksize(16,16);
    dim3 gridsize;
    gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
    gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
    image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height);

    // Blocking copy: also waits for the kernel to finish.
    cudaMemcpy(output, d_output, img_bytes, cudaMemcpyDeviceToHost);

    cudaFree(d_input);
    cudaFree(d_output);
    cudaFree(d_kernel);
}
// Test driver: fills an iw x ih image with pseudo-random values in [0, rng),
// runs image_convolution, and checks every interior pixel against the same
// 3x3 stencil evaluated on the host. Returns 1 on the first mismatch, 0 otherwise.
int main(){
    const int count = ih*iw;
    float *in = new float[count];
    float *out = new float[count];
    for (int n = 0; n < count; n++) {
        in[n] = rand()%rng;
        out[n] = -1;   // sentinel, overwritten by the convolution
    }

    image_convolution(in,out, ih, iw);

    // Border pixels are skipped: only rows/columns 1 .. size-2 are compared.
    for (int iy = 1; iy < ih-1; iy++) {
        for (int ix = 1; ix < iw-1; ix++) {
            const int c = iy*iw + ix;
            float temp = abs(-0.25 * (in[c - 1] + in[c + 1] + in[c - iw] + in[c + iw]) + in[c]);
            if (out[c] != temp) {
                std::cout << "mismatch x: " << ix << " y: " << iy << " was: " << out[c] << " should be: " << temp << std::endl;
                return 1;
            }
        }
    }
    return 0;
}
$ nvcc -o t1219 t1219.cu
$ cuda-memcheck ./t1219
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
For image dimensions which are exact multiples of the block size (16,16) (which was true for my previous test case) this problem won't show up -- the code will work correctly. For all other test cases, we need such a thread check.

Related

Shared Memory slows down the blurring operation compared to the one without shared memory

When I use shared memory on gaussian blur kernel, the execution time is slower than the one without shared memory. The code is as the following. Could you help me to resolve this issue?
The execution time for shared memory is 0.27 ms however, the execution time for the one without shared memory is 0.18 ms.
In addition to them the number of inactive threads is almost two times more than the one without shared memory.
// Horizontal (x-direction) 1D filter with 2*k+1 taps over a row-major image.
// Each in-range thread first stores its own pixel in a block-wide shared array,
// then sums its taps: neighbours whose column lies inside this block's columns
// are read from shared memory, all others fall back to global memory.
// NOTE(review): __syncthreads() is inside the `if`, so it is only reached by
// threads that pass the row/col test.
// NOTE(review): a neighbour with (b_col + i) inside the block but (col + i) >=
// width is read from a shared slot that no thread has written.
__global__
void gaus_xdirection_shared(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = blockDim.x * blockIdx.x + threadIdx.x;
// One slot per thread of the block; 1024 floats are declared statically.
__shared__ float columns[1024];
int b_row = threadIdx.y;
int b_col = threadIdx.x;
int b_width = blockDim.x;
if(row < height && col < width){
int p = row * width + col;
//Load starts
int b_p = b_row * b_width + b_col; // block pixel = b_p
columns[b_p] = in_channel[p];
__syncthreads();
//Load ends
float p_val = 0.0f;
for(int i = -k; i < k+1; ++i){
// Tap lies in a column owned by this block: read from shared memory.
if((b_col + i) > -1 && (b_col + i) < b_width){
p_val += gaussian_kernel[k + i] * columns[b_p + i];
}
else{
// Tap lies outside the block: read from global memory if still inside the row.
if((col + i) > -1 && (col + i) < width){
p_val += gaussian_kernel[k + i] * in_channel[p + i];
}
}
}
output_channel[p] = p_val;
}
}
The blurring kernel in which shared memory is not used is as the following
// Horizontal (x-direction) 1D filter with 2*k+1 taps over a row-major image,
// reading everything from global memory. One thread per output pixel; taps that
// would fall outside the current row are skipped rather than clamped.
__global__
void gaus_xdirection(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    if (row >= height || col >= width)
        return;

    const int center = row * width + col;

    // Restrict the tap offsets so that col + offset stays within [0, width - 1].
    const int first = max(-k, -col);
    const int last = min(k, width - 1 - col);

    float acc = 0.0f;
    for (int offset = first; offset <= last; ++offset)
        acc += gaussian_kernel[k + offset] * in_channel[center + offset];

    output_channel[center] = acc;
}
The problem is you are making ineffective use of shared memory. Replacing a few of the global loads with shared loads is not going to be sufficient. As a result, your else clause:
else{
if((col + i) > -1 && (col + i) < width){
p_val += gaussian_kernel[k + i] * in_channel[p + i];
}
is getting invoked too many times, and is drowning out any benefit of shared usage in the if clause.
Instead you want to arrange a shared memory tile in such a way that all the data can be retrieved from shared memory, after it is properly loaded.
The following is an example of how it could be done (in gaus_xdirection_shared_i):
$ cat t145.cu
// Horizontal (x-direction) 1D filter with 2*k+1 taps over a row-major image.
// Each in-range thread stages its own pixel in a block-wide shared array; taps
// whose column is owned by this block are then read from shared memory, all
// other in-image taps fall back to global memory. Taps outside the row are skipped.
//
// Launch layout: 2D blocks with blockDim.x * blockDim.y <= 1024 (size of the
// static shared array), grid rounded up to cover width x height.
//
// The barrier is executed by every thread of the block, including threads that
// lie beyond the image edge, and the image-bounds test is applied before the
// shared-memory lookup so that slots no thread wrote are never read.
__global__
void gaus_xdirection_shared(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    __shared__ float columns[1024];

    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int b_col = threadIdx.x;
    const int b_width = blockDim.x;
    const int b_p = threadIdx.y * b_width + b_col;   // this thread's slot in the shared array
    const int p = row * width + col;                 // this thread's pixel in the image
    const bool in_image = (row < height) && (col < width);

    // Load phase: only threads that own a real pixel write their slot.
    if (in_image)
        columns[b_p] = in_channel[p];

    // Must be reached by all threads of the block, so it sits outside any
    // data-dependent branch.
    __syncthreads();

    if (!in_image)
        return;

    float p_val = 0.0f;
    for (int i = -k; i <= k; ++i) {
        // Skip taps that leave the row; this also excludes shared slots that
        // belong to columns beyond the image and were never written.
        if ((col + i) < 0 || (col + i) >= width)
            continue;

        if ((b_col + i) >= 0 && (b_col + i) < b_width)
            p_val += gaussian_kernel[k + i] * columns[b_p + i];
        else
            p_val += gaussian_kernel[k + i] * in_channel[p + i];
    }
    output_channel[p] = p_val;
}
// Horizontal (x-direction) 1D filter with 2*k+1 taps that serves every tap from
// shared memory. Each block stages, for each of its rows, a strip of
// blockDim.x + 2*k pixels (the block's own columns plus a k-wide halo on both
// sides); positions outside the image are stored as 0 so they contribute
// nothing to the sum.
//
// Launch requirements:
//   * 2D blocks, grid rounded up to cover width x height;
//   * dynamic shared memory of blockDim.y * (blockDim.x + 2*k) * sizeof(float).
//
// The strip is filled with a block-stride loop over its columns, so the halo is
// fully loaded for any k, not only when k <= blockDim.x.
__global__
void gaus_xdirection_shared_i(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    extern __shared__ float columns[];

    const int tx = threadIdx.x;
    const int block_w = blockDim.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    const int col = block_w * blockIdx.x + tx;

    const int tile_width = block_w + 2 * k;                  // own columns + both halos
    float *tile_row = columns + threadIdx.y * tile_width;    // this thread-row's strip
    const int first_col = (int)(block_w * blockIdx.x) - k;   // image column of tile_row[0]

    //Load starts
    // Consecutive threads read consecutive image columns on every pass.
    for (int t = tx; t < tile_width; t += block_w) {
        const int src_col = first_col + t;
        const bool inside = (row < height) && (src_col >= 0) && (src_col < width);
        tile_row[t] = inside ? in_channel[row * width + src_col] : 0.0f;
    }
    // Reached by every thread of the block: no thread returns before this point.
    __syncthreads();
    //Load ends

    if (row < height && col < width) {
        float acc = 0.0f;
        // tile_row[tx + k] is this thread's own pixel; offsets -k..k stay inside the strip.
        for (int i = -k; i <= k; ++i)
            acc += gaussian_kernel[k + i] * tile_row[tx + k + i];
        output_channel[row * width + col] = acc;
    }
}
// Global-memory baseline: horizontal 1D filter with 2*k+1 taps over a row-major
// image. One thread per output pixel; source columns outside the row are skipped.
__global__
void gaus_xdirection(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const bool in_image = (row < height) && (col < width);

    if (in_image) {
        const float *row_ptr = in_channel + row * width;

        // Walk the source columns directly, clipped to the row.
        const int src_begin = max(col - k, 0);
        const int src_end = min(col + k, width - 1);

        float total = 0.0f;
        for (int src = src_begin; src <= src_end; ++src) {
            const int tap = k + (src - col);   // weight index for this source column
            total += gaussian_kernel[tap] * row_ptr[src];
        }
        output_channel[row * width + col] = total;
    }
}
// Benchmark driver: allocates a 1024x1024 image and a 15-tap filter on the
// device, zero-fills the inputs so the kernels never read uninitialised memory,
// launches the three filter variants once each, and waits for completion.
// Returns 0 on success and 1 if any CUDA call or launch reported an error.
int main(){
    float *in_channel = 0;
    float *output_channel = 0;
    float *gaussian_kernel = 0;
    const int th = 32;
    const int width = 1024;
    const int height = 1024;
    const int k = 7;
    const size_t img_bytes = (size_t)width * height * sizeof(float);
    const size_t taps_bytes = (2 * k + 1) * sizeof(float);

    cudaError_t status = cudaMalloc(&in_channel, img_bytes);
    if (status == cudaSuccess) status = cudaMalloc(&output_channel, img_bytes);
    if (status == cudaSuccess) status = cudaMalloc(&gaussian_kernel, taps_bytes);
    if (status == cudaSuccess) status = cudaMemset(in_channel, 0, img_bytes);
    if (status == cudaSuccess) status = cudaMemset(gaussian_kernel, 0, taps_bytes);

    if (status == cudaSuccess) {
        dim3 b(th, th);
        dim3 g((width+b.x-1)/b.x,(height+b.y-1)/b.y);
        gaus_xdirection_shared<<<g,b>>>(in_channel, output_channel, gaussian_kernel, width, height,k);
        // Dynamic shared memory: one (th + 2*k)-wide strip per block row.
        gaus_xdirection_shared_i<<<g,b,th*(th+2*k)*sizeof(float)>>>(in_channel, output_channel, gaussian_kernel, width, height,k);
        gaus_xdirection<<<g,b>>>(in_channel, output_channel, gaussian_kernel, width, height,k);
        status = cudaGetLastError();                                    // launch-configuration errors
        if (status == cudaSuccess) status = cudaDeviceSynchronize();    // execution errors
    }

    // cudaFree on a null pointer is a no-op, so this is safe after a partial failure.
    cudaFree(in_channel);
    cudaFree(output_channel);
    cudaFree(gaussian_kernel);
    return status == cudaSuccess ? 0 : 1;
}
$ nvcc -o t145 t145.cu
$ cuda-memcheck ./t145
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t145
==27500== NVPROF is profiling process 27500, command: ./t145
==27500== Profiling application: ./t145
==27500== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 44.53% 1.0205ms 1 1.0205ms 1.0205ms 1.0205ms gaus_xdirection_shared(float*, float*, float*, int, int, int)
33.35% 764.46us 1 764.46us 764.46us 764.46us gaus_xdirection(float*, float*, float*, int, int, int)
22.12% 506.95us 1 506.95us 506.95us 506.95us gaus_xdirection_shared_i(float*, float*, float*, int, int, int)
API calls: 97.88% 141.58ms 3 47.192ms 115.32us 141.22ms cudaMalloc
1.58% 2.2808ms 1 2.2808ms 2.2808ms 2.2808ms cudaDeviceSynchronize
0.36% 514.21us 202 2.5450us 165ns 118.09us cuDeviceGetAttribute
0.10% 146.33us 2 73.166us 52.335us 93.998us cuDeviceTotalMem
0.04% 58.346us 2 29.173us 26.147us 32.199us cuDeviceGetName
0.03% 50.393us 3 16.797us 6.9170us 34.369us cudaLaunchKernel
0.01% 9.5440us 2 4.7720us 1.8600us 7.6840us cuDeviceGetPCIBusId
0.00% 1.3980us 3 466ns 279ns 801ns cuDeviceGetCount
0.00% 1.3100us 4 327ns 186ns 712ns cuDeviceGet
0.00% 564ns 2 282ns 237ns 327ns cuDeviceGetUuid
$
I have not carefully tested the above code, it may contain defects. But it should give you an idea of how to structure a larger shared memory tile, and it seems to run without runtime error, and it seems to be faster.

My CUDA kernel code is not working

I try to make a small code to generate numbers and return the result in array but once I run this code it's not working, I have tried to use Nsight debugger to understand where is my problem but it freezes and closes immediately.
Could you help me please to understand where is the problem in this code?
// Fills the digit arrays p and q from the launch coordinates and copies p into
// one row of `out`.
//   * p/q[0 .. 2*i)        : copied from PF_tmp / QF_tmp
//   * p[2*i], p[2*i+1]     : two-digit split of x = blockIdx.x % 100
//   * q[2*i], q[2*i+1]     : two-digit split of y = blockIdx.y
//   * p[2*i+2], q[2*i+2]   : m[0], m[1]
//   * p[2*i+3], p[2*i+4]   : two-digit split of w = blockIdx.x / 100
//   * q[2*i+3], q[2*i+4]   : two-digit split of z = threadIdx.x
//   * p/q[5 .. 5 + 2*i)    : copied from PL_tmp / QL_tmp
// A value v is split as (v / 10, v % 10) only when v > 10; otherwise it is
// stored as (0, v). Threads with global id < 10 then copy p[0..4] to out[id].
// Parameters n and N are not used.
// NOTE(review): every thread of the launch writes the same p and q arrays.
__global__ void mykernel( int* PF_tmp, int* PL_tmp, int* QF_tmp, int* QL_tmp,
                          int m[2], int p[5], int q[5], int i, int* n,
                          int out[10][5], int N)
{
    const int id = blockDim.x * blockIdx.x + threadIdx.x;
    const int bx = blockIdx.x;
    const int w = bx / 100;
    const int x = bx % 100;
    const int y = blockIdx.y;
    const int z = threadIdx.x;
    const int base = i * 2;   // first slot after the copied prefix

    // Prefix from PF_tmp / QF_tmp (empty when i <= 0).
    for (int k = 0; k < base; k++)
    {
        p[k] = PF_tmp[k];
        q[k] = QF_tmp[k];
    }

    // X digits
    p[base]     = (x > 10) ? (x - (x % 10)) / 10 : 0;
    p[base + 1] = (x > 10) ? x % 10 : x;

    // Y digits
    q[base]     = (y > 10) ? (y - (y % 10)) / 10 : 0;
    q[base + 1] = (y > 10) ? y % 10 : y;

    // m
    p[base + 2] = m[0];
    q[base + 2] = m[1];

    // W digits
    p[base + 3] = (w > 10) ? (w - (w % 10)) / 10 : 0;
    p[base + 4] = (w > 10) ? w % 10 : w;

    // Z digits
    q[base + 3] = (z > 10) ? (z - (z % 10)) / 10 : 0;
    q[base + 4] = (z > 10) ? z % 10 : z;

    // Suffix from PL_tmp / QL_tmp, starting at slot 5 (empty when i <= 0).
    for (int k = 0; k < base; k++)
    {
        p[5 + k] = PL_tmp[k];
        q[5 + k] = QL_tmp[k];
    }

    if (id < 10)
    {
        for (int k = 0; k < 5; k++)
            out[id][k] = p[k];
    }
}
// Host driver for mykernel: declares the buffers, calls cudaMalloc for three of
// them and launches the kernel on a 10000 x 100 grid of 100-thread blocks.
// NOTE(review): p, q and hst_out are declared as host arrays, yet their addresses
// are cast to void** and handed to cudaMalloc as if they were pointer variables.
// NOTE(review): the sizes passed to cudaMalloc (5, 5, 50) are byte counts with no
// sizeof(int) factor.
// NOTE(review): m is a host stack array passed directly as a kernel argument.
int main()
{
// Holds the return code of each cudaMalloc; it is assigned but never inspected.
cudaError err;
dim3 blocks(10000, 100);
dim3 threads(100);
int m[2] = {4,5};
int hst_out[10][5];
int p[5];
int q[5];
err = cudaMalloc((void **)&p, 5);
err = cudaMalloc((void **)&q, 5);
err = cudaMalloc((void **)&hst_out, 50);
// No error query or synchronisation follows the launch, and nothing is copied back.
mykernel<<<blocks, threads>>>(NULL, NULL, NULL, NULL, m, p, q, 0, NULL, hst_out, 100000000);
return 0;
}
The error is fairly obvious, and it is purely a C programming issue.
when you declare
int m[2] = {4,5};
int hst_out[10][5];
int p[5];
int q[5];
Here hst_out, p and q are arrays, not pointers, but later they are used as if they were pointers:
err = cudaMalloc((void **)&p, 5);
err = cudaMalloc((void **)&q, 5);
err = cudaMalloc((void **)&hst_out, 50);
So you should have declared them as pointers in the first place, e.g.,
int *p;
and used it as this way:
err = cudaMalloc((void **)&p, 5*sizeof(int));
Also notice that the size you requested is just 5 bytes, whereas I request 5*sizeof(int) bytes.
For more example see:
http://cuda-programming.blogspot.sg/2013/03/how-to-avoid-uses-of-cudamalloc-in.html

CUDA in-place transpose doesn't complete transpose total matrix [closed]

Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 7 years ago.
Improve this question
I've written the CUDA code below. It's supposed to transpose a matrix using tiling blocks, and the code works when using small values, but when using, for example:
TILE = 32, matrix 128 x 128, it doesn't complete the transpose, it stops after 96. In host this is my dimension thread/block
dim3 dimGrid((nEven + TILE_DIM - 1) / TILE_DIM, (nEven + TILE_DIM - 1) / TILE_DIM);
dim3 dimBlock(TILE_DIM, TILE_DIM);
where I let the threads number == to tile block number,
the global code is simple and it should theoretically work:
// In-place tiled transpose attempt: each block copies one TILE_DIM x TILE_DIM
// tile of idata into shared memory and then writes it to the mirrored position
// in the same array.
// nEven and TILE_DIM are defined outside this snippet; presumably nEven is the
// matrix dimension - TODO confirm.
// NOTE(review): the store targets elements of idata that belong to a different
// tile; nothing here orders that store against the block that still has to
// read that tile.
__global__ void transposeMain( int *idata)
{
__shared__ int tile2[TILE_DIM][TILE_DIM];
int yyy = blockIdx.y * TILE_DIM ; // col values (0,32,64,96)
int xxx = blockIdx.x * TILE_DIM ; // row values (0,32,64,96)
// The condition depends only on blockIdx, so all threads of a block take the same path.
if (xxx < nEven && yyy < nEven)
{
// Read element (row = threadIdx.x + xxx, col = threadIdx.y + yyy) into the tile.
tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];
__syncthreads();
// Write the same tile entry to the transposed position (row and col swapped).
idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];
}
}
Any idea what might be the problem?
The problem is you are trying to do an in-place transpose.
CUDA device code execution is broken up into threadblocks. Threadblocks (groups of threads) can execute in any order, and do not all (typically) execute at the same time. So when you read a tile in here:
tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];
That is OK. But when you write the tile:
idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];
You are frequently over-writing data (in some other tile in the original matrix) which you haven't read yet (because the threadblock responsible for reading that tile hasn't even begun to execute yet). Once you overwrite it like this, it's lost.
The solution (for square matrix transpose) has several aspects to it:
Each threadblock must first read 2 tiles. These 2 tiles from the input data will be swapped.
Then each threadblock can write those two tiles.
The tiles along the main diagonal need special casing.
since most threadblocks are handling 2 tiles, only threadblocks on or on one side of the main diagonal need do any work.
You haven't shown a complete MCVE (which is expected when you have questions like this), and your code has other issues such as the potential for uncoalesced access (lower performance) so I'm not going to try to "fix" your code.
Instead, here's a fully worked example, lifted from here:
$ cat t469.cu
#include <stdio.h>
#include <cublas_v2.h>
#include <time.h>
#include <sys/time.h>
#define uS_PER_SEC 1000000
#define uS_PER_mS 1000
#define N 4096
#define M 4096
#define TILE_DIM 32
#define BLOCK_ROWS 8
// Out-of-place tiled transpose through shared memory.
//
// Launch layout: blocks of (TILE_DIM, BLOCK_ROWS) threads on a grid of
// (columns / TILE_DIM, rows / TILE_DIM) blocks; both matrix dimensions must be
// exact multiples of TILE_DIM because there are no bounds checks. Each thread
// handles TILE_DIM / BLOCK_ROWS elements of its tile.
//
// idata is read as a row-major matrix with gridDim.x * TILE_DIM columns and
// odata is written as a row-major matrix with gridDim.y * TILE_DIM columns, so
// the kernel is also correct for non-square inputs (for square inputs the two
// widths coincide).
__global__ void transposeCoalesced(float *odata, const float *idata)
{
    // +1 column of padding so that reading the tile column-wise does not hit a
    // single shared-memory bank.
    __shared__ float tile[TILE_DIM][TILE_DIM+1];

    const int in_width = gridDim.x * TILE_DIM;    // columns of idata
    const int out_width = gridDim.y * TILE_DIM;   // columns of odata (= rows of idata)

    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
        tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*in_width + x];

    __syncthreads();

    x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
    y = blockIdx.x * TILE_DIM + threadIdx.y;
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
        odata[(y+j)*out_width + x] = tile[threadIdx.x][threadIdx.y + j];
}
// In-place tiled transpose of a square matrix whose side is gridDim.x * TILE_DIM.
//
// Launch layout: blocks of (TILE_DIM, BLOCK_ROWS) threads on a square grid.
//   * blockIdx.y >  blockIdx.x : the block loads its own tile and the mirrored
//     tile, then writes each one transposed into the other's place;
//   * blockIdx.y == blockIdx.x : the block transposes its single diagonal tile;
//   * blockIdx.y <  blockIdx.x : the block does nothing.
__global__ void iptransposeCoalesced(float *data)
{
    __shared__ float tile_s[TILE_DIM][TILE_DIM+1];
    __shared__ float tile_d[TILE_DIM][TILE_DIM+1];

    // Whole block leaves together, so the barrier below is never partially reached.
    if (blockIdx.y < blockIdx.x)
        return;

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int width = gridDim.x * TILE_DIM;
    const int x = blockIdx.x * TILE_DIM + tx;    // this block's own tile
    const int y = blockIdx.y * TILE_DIM + ty;
    const int dx = blockIdx.y * TILE_DIM + tx;   // mirrored tile
    const int dy = blockIdx.x * TILE_DIM + ty;
    const bool on_diagonal = (blockIdx.y == blockIdx.x);

    // Load phase.
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
        tile_s[ty+j][tx] = data[(y+j)*width + x];
    if (!on_diagonal) {
        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
            tile_d[ty+j][tx] = data[(dy+j)*width + dx];
    }

    __syncthreads();

    // Store phase: shared tiles are read with swapped indices.
    if (on_diagonal) {
        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
            data[(y+j)*width + x] = tile_s[tx][ty + j];
        return;
    }
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
        data[(dy+j)*width + dx] = tile_s[tx][ty + j];
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
        data[(y+j)*width + x] = tile_d[tx][ty + j];
}
// Checks that mat_t is the transpose of mat.
// mat is read as n rows of m elements, mat_t as m rows of n elements.
// Returns 1 when every mat[i][j] equals mat_t[j][i], 0 otherwise.
int validate(const float *mat, const float *mat_t, int n, int m){
    for (int r = 0; r < n; ++r) {
        const float *src_row = mat + r * m;
        for (int c = 0; c < m; ++c) {
            // The first mismatch already decides the answer.
            if (src_row[c] != mat_t[c * n + r])
                return 0;
        }
    }
    return 1;
}
// Benchmark driver comparing four ways of transposing an N x M float matrix:
// a host loop, cublasSgeam, the out-of-place kernel and the in-place kernel.
// Each result is checked with validate(); the program prints one timing per
// variant and returns 1 as soon as a check fails.
// NOTE(review): matrix, matrixT and h_matrixT are allocated with malloc and not
// freed in this function.
int main(){
timeval t1, t2;
float *matrix = (float *) malloc (N * M * sizeof(float));
// Every element of row i holds the value i.
for (int i = 0; i < N; i ++)
for (int j = 0; j < M; j++)
matrix[(i*M) + j] = i;
// Starting the timer
gettimeofday(&t1, NULL);
// --- Variant 1: host transpose (the timed region includes the malloc) ---
float *matrixT = (float *) malloc (N * M * sizeof(float));
for (int i = 0; i < N; i++)
for (int j = 0; j < M; j++)
matrixT[(j*N)+i] = matrix[(i*M)+j]; // matrix is obviously filled
//Ending the timer
gettimeofday(&t2, NULL);
if (!validate(matrix, matrixT, N, M)) {printf("fail!\n"); return 1;}
// Elapsed time in milliseconds.
float et1 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
printf("CPU time = %fms\n", et1);
float *h_matrixT , *d_matrixT , *d_matrix;
h_matrixT = (float *) (malloc (N * M * sizeof(float)));
cudaMalloc((void **)&d_matrixT , N * M * sizeof(float));
cudaMalloc((void**)&d_matrix , N * M * sizeof(float));
cudaMemcpy(d_matrix , matrix , N * M * sizeof(float) , cudaMemcpyHostToDevice);
//Starting the timer
gettimeofday(&t1, NULL);
// --- Variant 2: cublasSgeam with alpha = 1, beta = 0 and the first operand transposed ---
const float alpha = 1.0;
const float beta = 0.0;
cublasHandle_t handle;
//gettimeofday(&t1, NULL);
cublasCreate(&handle);
// t1 is overwritten here, so handle creation is excluded from the measurement.
gettimeofday(&t1, NULL);
cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, &alpha, d_matrix, M, &beta, d_matrix, N, d_matrixT, N);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
cublasDestroy(handle);
//Ending the timer
float et2 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
printf("GPU Sgeam time = %fms\n", et2);
cudaMemcpy(h_matrixT , d_matrixT , N * M * sizeof(float) , cudaMemcpyDeviceToHost);
if (!validate(matrix, h_matrixT, N, M)) {printf("fail!\n"); return 1;}
// Clear both result buffers so the next variant cannot pass on stale data.
cudaMemset(d_matrixT,0, N*M*sizeof(float));
memset(h_matrixT, 0, N*M*sizeof(float));
// --- Variant 3: out-of-place kernel ---
dim3 threads(TILE_DIM, BLOCK_ROWS);
dim3 blocks(N/TILE_DIM, M/TILE_DIM);
gettimeofday(&t1, NULL);
transposeCoalesced<<<blocks, threads >>>(d_matrixT, d_matrix);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
cudaMemcpy(h_matrixT , d_matrixT , N * M * sizeof(float) , cudaMemcpyDeviceToHost);
if (!validate(matrix, h_matrixT, N, M)) {printf("fail!\n"); return 1;}
float et3 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
printf("GPU kernel time = %fms\n", et3);
memset(h_matrixT, 0, N*M*sizeof(float));
// --- Variant 4: in-place kernel; the result is left in d_matrix itself ---
gettimeofday(&t1, NULL);
iptransposeCoalesced<<<blocks, threads >>>(d_matrix);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
cudaMemcpy(h_matrixT , d_matrix , N * M * sizeof(float) , cudaMemcpyDeviceToHost);
if (!validate(matrix, h_matrixT, N, M)) {printf("fail!\n"); return 1;}
float et4 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
printf("GPU in-place kernel time = %fms\n", et4);
cudaFree(d_matrix);
cudaFree(d_matrixT);
return 0;
}
$ nvcc -arch=sm_20 -o t469 t469.cu -lcublas
$ ./t469
CPU time = 450.095001ms
GPU Sgeam time = 1.937000ms
GPU kernel time = 1.694000ms
GPU in-place kernel time = 1.839000ms
$
Note that this compares several different approaches to matrix transpose.
If you study the iptransposeCoalesced you will see that it is adhering to the 4 specific aspects I outlined above.
It is fishy to use __syncthreads(); in the if statement in CUDA. Try to move it outside this block by simple:
if (xxx < nEven && yyy < nEven)
{
tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];
}
__syncthreads();
if (xxx < nEven && yyy < nEven)
{
idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];
}

CUDA 2d convolution boundary incorrect

I implemented a CUDA 2D convolution code with naive way and cannot get the boundary value correct. The error happens on the top and left borders with half-of-filter wide. For example, if my filter is 7x7, the error reside in top 3 pixels and left 3 pixels (compared to C result). Can some one help me to resolve this bug? Your help is very appreciated!
Attached is my cuda code and c code:
#define ISIZE 32//input image size ISIZE*ISIZE
#define MASK_RADIUS 3
#define MASK_WIDTH (2 * MASK_RADIUS + 1)
// Size of the filter in BYTES (tap count times sizeof(float)).
const int FILTER_SIZE = MASK_WIDTH * MASK_WIDTH * sizeof(float);
// Filter taps in constant memory: one float per tap. The element count is the
// tap count, not FILTER_SIZE, which already includes the sizeof(float) factor;
// the array therefore occupies exactly FILTER_SIZE bytes.
__device__ __constant__ float d_filter[MASK_WIDTH * MASK_WIDTH];
// Naive 2D convolution: one thread per output pixel, filter taps read from the
// constant array d_filter with flipped indices, taps outside the image meant to
// contribute 0.
// NOTE(review): threadIdx.x / threadIdx.y are unsigned, so (threadIdx.x + j) and
// (threadIdx.y + i) are evaluated as unsigned; the "< 0" tests for the left and
// top aprons can therefore never be true.
// NOTE(review): there is no check of the pixel position against dataW / dataH;
// presumably the grid is expected to match the image exactly - TODO confirm.
__global__ void convolution2D_cuda(float* d_Result, float* d_Data, int dataH, int dataW)
{
// global mem address for this thread
const int gLoc = threadIdx.x + blockIdx.x * blockDim.x +
(threadIdx.y + blockIdx.y * blockDim.y) * dataW;
float sum = 0;
float value = 0;
for(int i = -MASK_RADIUS; i <= MASK_RADIUS; i++) //row wise
{
for (int j = -MASK_RADIUS; j <= MASK_RADIUS; j++) //col wise
{
// check row
// Apron tests are only applied in the first/last block of each grid dimension.
if ( (blockIdx.x == 0) && ((threadIdx.x + j) < 0) ) //left apron
value = 0;
else if ( blockIdx.x == (gridDim.x -1) && (threadIdx.x + j) > (blockDim.x-1) ) //right apron
value = 0;
else {
// check col
if ( blockIdx.y == 0 && (threadIdx.y + i) < 0) //top apron
value = 0;
else if ( blockIdx.y == (gridDim.y-1) && (threadIdx.y + i) > (blockDim.y-1) ) //bottom apron
value = 0;
else // load data
value = d_Data[gLoc + i * dataW + j];
}
//2d array case: non-separable filter
// Offset (i, j) is paired with the filter entry at (MASK_RADIUS - i, MASK_RADIUS - j).
sum += value * d_filter[ (MASK_RADIUS - i) * MASK_WIDTH + (MASK_RADIUS - j) ];
}
}
d_Result[gLoc] = sum;
}
//c code
// Reference 2D convolution on the host.
//   result, input : row-major images of dataW x dataH floats
//   filter        : row-major filter with k_Width columns
//   radiusY/X     : half-extent of the filter in rows / columns
// For each pixel, the neighbour at offset (ky, kx) is weighted by the filter
// entry at (radiusY - ky, radiusX - kx); neighbours outside the image are skipped.
// k_Height is accepted but not used.
void convolution2D_cpu(float* result, float* input, float* filter, int dataW, int dataH, int k_Width, int k_Height, int radiusY, int radiusX)
{
    for (int row = 0; row < dataH; ++row) {
        for (int col = 0; col < dataW; ++col) {
            const int dst = row * dataW + col;
            result[dst] = 0;   // cleared before accumulation, as a separate store

            float acc = 0;
            for (int ky = -radiusY; ky <= radiusY; ++ky) {
                const int srcRow = row + ky;
                if (srcRow < 0 || srcRow >= dataH)
                    continue;   // above or below the image
                for (int kx = -radiusX; kx <= radiusX; ++kx) {
                    const int srcCol = col + kx;
                    if (srcCol < 0 || srcCol >= dataW)
                        continue;   // left or right of the image
                    acc += input[srcRow * dataW + srcCol] * filter[(radiusY - ky) * k_Width + (radiusX - kx)];
                }
            }
            result[dst] = acc;
        }
    }
}
Part of the main() code is :
dim3 blocks(16, 16);
dim3 grids(width/16, height/16);
checkCudaErrors( cudaMalloc( (void **)&d_data, data_size ));
checkCudaErrors( cudaMalloc( (void **)&d_result, data_size ));
checkCudaErrors( cudaMemcpy(d_data, indata, data_size, cudaMemcpyHostToDevice) );
checkCudaErrors( cudaThreadSynchronize() );
convolution2D_cuda<<<grids, blocks>>>(d_result, d_data, width, height);
checkCudaErrors( cudaThreadSynchronize() );
checkCudaErrors( cudaMemcpy(output, d_result, data_size, cudaMemcpyDeviceToHost) );
checkCudaErrors( cudaThreadSynchronize() );
//check with result of CPU
convolution2D_cpu(c_result, indata, filter, width, height, len, len, MASK_RADIUS, MASK_RADIUS);
I managed to resolve this mystery. The error is in the thread index calculation. threadIdx.x is an unsigned int, so nvcc evaluates (threadIdx.x + j) as an unsigned int. For example, if threadIdx.x is 0 and j is -1, the sum is interpreted as 4294967295 (0xffffffff), so it is never less than 0 and the boundary check is incorrect.

Creating identity matrix with CUDA

Hi, I am trying to create an identity matrix with CUDA, but the output is all zeros:
// Attempt to fill devMatrix with an identity matrix.
// NOTE(review): only blockIdx is used to form the position, and the target index
// `offset` is the product x * y, which does not change inside the loops; every
// iteration therefore overwrites the same element and only the value from the
// last (i, j) pair survives.
// NOTE(review): numC is not used.
__global__ void initIdentityGPU(int *devMatrix, int numR, int numC) {
int x = blockIdx.x;
int y = blockIdx.y;
int offset = x * y;
// Runs x * numR iterations; no iteration at all when x == 0.
for (int i = 0; i < x ; i++) {
for (int j = 0; j < numR; j++) {
if (i == j)
devMatrix[offset] = 1;
else
devMatrix[offset] = 0;
}
}
}
Why only it puts 0s ?
The simplest way how to do it is:
// Writes an identity pattern into a numR x numC matrix stored as an array of
// row pointers: 1 where row == column, 0 elsewhere.
// One thread per cell; threads outside the matrix exit without writing.
__global__ void initIdentityGPU(int **devMatrix, int numR, int numC) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
    const int row = blockDim.y*blockIdx.y + threadIdx.y;
    if (row >= numR || col >= numC)
        return;
    devMatrix[row][col] = (row == col) ? 1 : 0;
}
and you launch it as:
dim3 blockDim(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 gridDim((numC + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (numR + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y);
initIdentityGPU<<<gridDim, blockDim>>>(matrix, numR, numC);
It simply runs as many threads as matrix cells, each thread obtains the coordinates of its cell and in a case the cell is in the diagonal of matrix it assigns 1 or 0 otherwise. Note the code is untested.