Getting wrong results from CUDA matrix multiplication kernel [duplicate] - cuda

This question already has answers here:
Multiply Rectangular Matrices in CUDA
(5 answers)
Closed 7 years ago.
I am new to CUDA. I have a kernel to do matrix multiplication. It seems alright for me but it is failing in some cases. Please help me where the problem is.
__global__ void matrixMultiply(float * A, float * B, float * C,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns)
{
//## Insert code to implement matrix multiplication here
int Row = blockIdx.y * blockDim.y + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
if (numAColumns != numBRows) return;
if ((Row < numARows) && (Col < numBColumns)){
float Cvalue = 0;
for (int k = 0 ; k < numAColumns ; ++k )
Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col];
C[Row*numCColumns + Col] = Cvalue;
__syncthreads();
}
}
I am invoking the kernel as follows.
int BLOCKX = (int)(ceil((numCRows / 8.0)));
int BLOCKY = (int)(ceil((numCColumns / 8.0)));
printf("Number of blocks: %d\t%d\n", BLOCKX, BLOCKY);
dim3 DimGrid(BLOCKX, BLOCKY);
dim3 DimBlock(8 , 8, 1);

Your code will deadlock in the below :
if ((Row < numARows) && (Col < numBColumns)){
float Cvalue = 0;
for (int k = 0 ; k < numAColumns ; ++k )
Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col];
C[Row*numCColumns + Col] = Cvalue;
__syncthreads();
}
Consider a block where for some threads, the condition is satisfied, while for some it is not. In that case, this will deadlock. Put __syncthreads() outside the if conditions
Also replace dim3 DimGrid(BLOCKX, BLOCKY); by dim3 DimGrid(BLOCKY, BLOCKX);. That should fix it

Related

Shared Memory slows down the blurring operation compared to the one without shared memory

When I use shared memory on gaussian blur kernel, the execution time is slower than the one without shared memory. The code is as the following. Could you help me to resolve this issue?
The execution time for shared memory is 0.27 ms however, the execution time for the one without shared memory is 0.18 ms.
In addition to them the number of inactive threads is almost two times more than the one without shared memory.
__global__
void gaus_xdirection_shared(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = blockDim.x * blockIdx.x + threadIdx.x;
__shared__ float columns[1024];
int b_row = threadIdx.y;
int b_col = threadIdx.x;
int b_width = blockDim.x;
if(row < height && col < width){
int p = row * width + col;
//Load starts
int b_p = b_row * b_width + b_col; // block pixel = b_p
columns[b_p] = in_channel[p];
__syncthreads();
//Load ends
float p_val = 0.0f;
for(int i = -k; i < k+1; ++i){
if((b_col + i) > -1 && (b_col + i) < b_width){
p_val += gaussian_kernel[k + i] * columns[b_p + i];
}
else{
if((col + i) > -1 && (col + i) < width){
p_val += gaussian_kernel[k + i] * in_channel[p + i];
}
}
}
output_channel[p] = p_val;
}
}
The blurring kernel in which shared memory is not used is as the following
__global__
void gaus_xdirection(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = blockDim.x * blockIdx.x + threadIdx.x;
if(row < height && col < width){
int p = row * width + col;
float p_val = 0.0f;
for(int i = -k; i < k+1; ++i){
if((col + i) > -1 && (col + i) < width){
p_val += gaussian_kernel[k + i] * in_channel[p + i];
}
}
output_channel[p] = p_val;
}
}
The problem is you are making ineffective use of shared memory. Replacing a few of the global loads with shared loads is not going to be sufficient. As a result, your else clause:
else{
if((col + i) > -1 && (col + i) < width){
p_val += gaussian_kernel[k + i] * in_channel[p + i];
}
is getting invoked too many times, and is drowning out any benefit of shared usage in the if clause.
Instead you want to arrange a shared memory tile in such a way that all the data can be retrieved from shared memory, after it is properly loaded.
The following is an example of how it could be done (in gaus_xdirection_shared_i):
$ cat t145.cu
__global__
void gaus_xdirection_shared(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = blockDim.x * blockIdx.x + threadIdx.x;
__shared__ float columns[1024];
int b_row = threadIdx.y;
int b_col = threadIdx.x;
int b_width = blockDim.x;
if(row < height && col < width){
int p = row * width + col;
//Load starts
int b_p = b_row * b_width + b_col; // block pixel = b_p
columns[b_p] = in_channel[p];
__syncthreads();
//Load ends
float p_val = 0.0f;
for(int i = -k; i < k+1; ++i){
if((b_col + i) > -1 && (b_col + i) < b_width){
p_val += gaussian_kernel[k + i] * columns[b_p + i];
}
else{
if((col + i) > -1 && (col + i) < width){
p_val += gaussian_kernel[k + i] * in_channel[p + i];
}
}
}
output_channel[p] = p_val;
}
}
__global__
void gaus_xdirection_shared_i(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = blockDim.x * blockIdx.x + threadIdx.x;
extern __shared__ float columns[];
int b_row = threadIdx.y;
int b_col = threadIdx.x;
int b_width = blockDim.x + 2*k;
int p = row * width + col;
//Load starts
int b_p = b_row * b_width + b_col+k; // block pixel = b_p
float temp;
if( row < height && col < width)
temp = in_channel[p];
else
temp = 0;
columns[b_p] = temp;
if (threadIdx.x < k){
// handle left edge/border
if (((p-k) >= row*width) && ((p-k) < width*height)) temp = in_channel[p-k];
else temp = 0;
columns[b_p-k] = temp;
// handle right edge/border
if (((p+blockDim.x) < (row+1)*width) && (row < height))
temp = in_channel[p+blockDim.x];
else
temp = 0;
columns[b_p+blockDim.x] = temp;}
__syncthreads();
//Load ends
temp = 0.0f;
for(int i = -k; i < k+1; ++i)
temp += gaussian_kernel[k+i] * columns[b_p + i];
if( row < height && col < width)
output_channel[p] = temp;
}
__global__
void gaus_xdirection(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = blockDim.x * blockIdx.x + threadIdx.x;
if( row < height && col < width){
int p = row * width + col;
float p_val = 0.0f;
for(int i = -k; i < k+1; ++i){
if( (col + i) > -1 && (col + i) < width){
p_val += gaussian_kernel[k + i] * in_channel[p + i];
}
}
output_channel[p] = p_val;
}
}
int main(){
float *in_channel;
float *output_channel;
float *gaussian_kernel;
int width, height, k;
int th = 32;
width = 1024;
height = 1024;
k = 7;
cudaMalloc(&in_channel, width*height*sizeof(float));
cudaMalloc(&output_channel, width*height*sizeof(float));
cudaMalloc(&gaussian_kernel, (2*k+1)*sizeof(float));
dim3 b(th, th);
dim3 g((width+b.x-1)/b.x,(height+b.y-1)/b.y);
gaus_xdirection_shared<<<g,b>>>(in_channel, output_channel, gaussian_kernel, width, height,k);
gaus_xdirection_shared_i<<<g,b,th*(th+2*k)*sizeof(float)>>>(in_channel, output_channel, gaussian_kernel, width, height,k);
gaus_xdirection<<<g,b>>>(in_channel, output_channel, gaussian_kernel, width, height,k);
cudaDeviceSynchronize();
}
$ nvcc -o t145 t145.cu
$ cuda-memcheck ./t145
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t145
==27500== NVPROF is profiling process 27500, command: ./t145
==27500== Profiling application: ./t145
==27500== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 44.53% 1.0205ms 1 1.0205ms 1.0205ms 1.0205ms gaus_xdirection_shared(float*, float*, float*, int, int, int)
33.35% 764.46us 1 764.46us 764.46us 764.46us gaus_xdirection(float*, float*, float*, int, int, int)
22.12% 506.95us 1 506.95us 506.95us 506.95us gaus_xdirection_shared_i(float*, float*, float*, int, int, int)
API calls: 97.88% 141.58ms 3 47.192ms 115.32us 141.22ms cudaMalloc
1.58% 2.2808ms 1 2.2808ms 2.2808ms 2.2808ms cudaDeviceSynchronize
0.36% 514.21us 202 2.5450us 165ns 118.09us cuDeviceGetAttribute
0.10% 146.33us 2 73.166us 52.335us 93.998us cuDeviceTotalMem
0.04% 58.346us 2 29.173us 26.147us 32.199us cuDeviceGetName
0.03% 50.393us 3 16.797us 6.9170us 34.369us cudaLaunchKernel
0.01% 9.5440us 2 4.7720us 1.8600us 7.6840us cuDeviceGetPCIBusId
0.00% 1.3980us 3 466ns 279ns 801ns cuDeviceGetCount
0.00% 1.3100us 4 327ns 186ns 712ns cuDeviceGet
0.00% 564ns 2 282ns 237ns 327ns cuDeviceGetUuid
$
I have not carefully tested the above code, it may contain defects. But it should give you an idea of how to structure a larger shared memory tile, and it seems to run without runtime error, and it seems to be faster.

Calculating indices for nested loops in CUDA

I'm trying to learn CUDA and I'm a bit confused about calculating thread indices. Let's say I have this loop I'm trying to parallelize:
...
for(int x = 0; x < DIM_x; x++){
for(int y = 0; y < DIM_y; y++){
for(int dx = 0; dx < psize; dx++){
array[y*DIM_x + x + dx] += 1;
}
}
}
In PyCUDA, I set:
block = (8, 8, 8)
grid = (96, 96, 16)
Most of the examples I've seen for parallelizing loops calculate thread indices like this:
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int dx = blockIdx.z * blockDim.z + threadIdx.z;
if (x >= DIM_x || y >= DIM_y || dx >= psize)
return;
atomicAdd(&array[y*DIM_x + x + dx], 1)
DIM_x = 580, DIM_y = 550, psize = 50
However, if I print x, I see that multiple threads with the same thread Id are created, and the final result is wrong.
Instead, if I use this (3D grid of 3D blocks):
int blockId = blockIdx.x + blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int x = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x) + threadIdx.x;
It fixes the multiple same thread Ids problem for x, but I'm not sure how I'd parallelize y and dx.
If anyone could help me understand where I'm going wrong, and show me the right way to parallelize the loops, I'd really appreciate it.
However, if I print x, I see that multiple threads with the same
thread Id are created, and the final result is wrong.
It would be normal for you to see multiple threads with the same x thread ID in a multi-dimensional grid, as it would also be normal to observe many iterations of the loops in your host code with the same x value. If the result is wrong, it has nothing to do with any of the code you have shown, viz:
#include <vector>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <assert.h>
void host(int* array, int DIM_x, int DIM_y, int psize)
{
for(int x = 0; x < DIM_x; x++){
for(int y = 0; y < DIM_y; y++){
for(int dx = 0; dx < psize; dx++){
array[y*DIM_x + x + dx] += 1;
}
}
}
}
__global__
void kernel(int* array, int DIM_x, int DIM_y, int psize)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int dx = blockIdx.z * blockDim.z + threadIdx.z;
if (x >= DIM_x || y >= DIM_y || dx >= psize)
return;
atomicAdd(&array[y*DIM_x + x + dx], 1);
}
int main()
{
dim3 block(8, 8, 8);
dim3 grid(96, 96, 16);
int DIM_x = 580, DIM_y = 550, psize = 50;
std::vector<int> array_h(DIM_x * DIM_y * psize, 0);
std::vector<int> array_hd(DIM_x * DIM_y * psize, 0);
thrust::device_vector<int> array_d(DIM_x * DIM_y * psize, 0);
kernel<<<grid, block>>>(thrust::raw_pointer_cast(array_d.data()), DIM_x, DIM_y, psize);
host(&array_h[0], DIM_x, DIM_y, psize);
thrust::copy(array_d.begin(), array_d.end(), array_hd.begin());
cudaDeviceSynchronize();
for(int i=0; i<DIM_x * DIM_y * psize; i++) {
assert( array_h[i] == array_hd[i] );
}
return 0;
}
which when compiled and run
$ nvcc -arch=sm_52 -std=c++11 -o looploop loop_the_loop.cu
$ cuda-memcheck ./looploop
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
emits no errors and passes the check of all elements against the host code in your question.
If you are getting incorrect results, it is likely that you have a problem with initialization of the device memory before running the kernel. Otherwise I fail to see how incorrect results could be emitted by the code you have shown.
In general, performing a large number of atomic memory transactions, as your code does, is not the optimal way to perform computation on the GPU. Using non-atomic transactions would probably need to rely on other a priori information about the structure of the problem (such as a graph decomposition or a precise description of the write patterns of the problem).
In a 3D grid with 3D blocks, the thread ID is:
unsigned long blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
unsigned long threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
Not the x you computed. The x is only the x index of that 3D matrix.
There is a nice cheatsheet in this blog

CUDA in-place transpose doesn't complete transpose total matrix [closed]

Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 7 years ago.
Improve this question
I've written the CUDA code below. It's supposed to transpose a matrix using tiling blocks, and the code works when using small values, but when using, for example:
TILE = 32, matrix 128 x 128, it doesn't complete the transpose, it stops after 96. In host this is my dimension thread/block
dim3 dimGrid((nEven + TILE_DIM - 1) / TILE_DIM, (nEven + TILE_DIM - 1) / TILE_DIM);
dim3 dimBlock(TILE_DIM, TILE_DIM);
where I let the threads number == to tile block number,
the global code is simple and it should theoretically work:
__global__ void transposeMain( int *idata)
{
__shared__ int tile2[TILE_DIM][TILE_DIM];
int yyy = blockIdx.y * TILE_DIM ; // col values (0,32,64,96)
int xxx = blockIdx.x * TILE_DIM ; // row values (0,32,64,96)
if (xxx < nEven && yyy < nEven)
{
tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];
__syncthreads();
idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];
}
}
Any idea what might be the problem?
The problem is you are trying to do an in-place transpose.
CUDA device code execution is broken up into threadblocks. Threadblocks (groups of threads) can execute in any order, and do not all (typically) execute at the same time. So when you read a tile in here:
tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];
That is OK. But when you write the tile:
idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];
You are frequently over-writing data (in some other tile in the original matrix) which you haven't read yet (because the threadblock responsible for reading that tile hasn't even begun to execute yet). Once you overwrite it like this, it's lost.
The solution (for square matrix transpose) has several aspects to it:
Each threadblock must first read 2 tiles. These 2 tiles from the input data will be swapped.
Then each threadblock can write those two tiles.
The tiles along the main diagonal need special casing.
since most threadblocks are handling 2 tiles, only threadblocks on or on one side of the main diagonal need do any work.
You haven't shown a complete MCVE (which is expected when you have questions like this), and your code has other issues such as the potential for uncoalesced access (lower performance) so I'm not going to try to "fix" your code.
Instead, here's a fully worked example, lifted from here:
$ cat t469.cu
#include <stdio.h>
#include <cublas_v2.h>
#include <time.h>
#include <sys/time.h>
#define uS_PER_SEC 1000000
#define uS_PER_mS 1000
#define N 4096
#define M 4096
#define TILE_DIM 32
#define BLOCK_ROWS 8
__global__ void transposeCoalesced(float *odata, const float *idata)
{
__shared__ float tile[TILE_DIM][TILE_DIM+1];
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
__syncthreads();
x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
y = blockIdx.x * TILE_DIM + threadIdx.y;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
}
__global__ void iptransposeCoalesced(float *data)
{
__shared__ float tile_s[TILE_DIM][TILE_DIM+1];
__shared__ float tile_d[TILE_DIM][TILE_DIM+1];
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
if (blockIdx.y>blockIdx.x) { // handle off-diagonal case
int dx = blockIdx.y * TILE_DIM + threadIdx.x;
int dy = blockIdx.x * TILE_DIM + threadIdx.y;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile_s[threadIdx.y+j][threadIdx.x] = data[(y+j)*width + x];
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile_d[threadIdx.y+j][threadIdx.x] = data[(dy+j)*width + dx];
__syncthreads();
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
data[(dy+j)*width + dx] = tile_s[threadIdx.x][threadIdx.y + j];
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
data[(y+j)*width + x] = tile_d[threadIdx.x][threadIdx.y + j];
}
else if (blockIdx.y==blockIdx.x){ // handle on-diagonal case
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile_s[threadIdx.y+j][threadIdx.x] = data[(y+j)*width + x];
__syncthreads();
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
data[(y+j)*width + x] = tile_s[threadIdx.x][threadIdx.y + j];
}
}
int validate(const float *mat, const float *mat_t, int n, int m){
int result = 1;
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++)
if (mat[(i*m)+j] != mat_t[(j*n)+i]) result = 0;
return result;
}
int main(){
timeval t1, t2;
float *matrix = (float *) malloc (N * M * sizeof(float));
for (int i = 0; i < N; i ++)
for (int j = 0; j < M; j++)
matrix[(i*M) + j] = i;
// Starting the timer
gettimeofday(&t1, NULL);
float *matrixT = (float *) malloc (N * M * sizeof(float));
for (int i = 0; i < N; i++)
for (int j = 0; j < M; j++)
matrixT[(j*N)+i] = matrix[(i*M)+j]; // matrix is obviously filled
//Ending the timer
gettimeofday(&t2, NULL);
if (!validate(matrix, matrixT, N, M)) {printf("fail!\n"); return 1;}
float et1 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
printf("CPU time = %fms\n", et1);
float *h_matrixT , *d_matrixT , *d_matrix;
h_matrixT = (float *) (malloc (N * M * sizeof(float)));
cudaMalloc((void **)&d_matrixT , N * M * sizeof(float));
cudaMalloc((void**)&d_matrix , N * M * sizeof(float));
cudaMemcpy(d_matrix , matrix , N * M * sizeof(float) , cudaMemcpyHostToDevice);
//Starting the timer
gettimeofday(&t1, NULL);
const float alpha = 1.0;
const float beta = 0.0;
cublasHandle_t handle;
//gettimeofday(&t1, NULL);
cublasCreate(&handle);
gettimeofday(&t1, NULL);
cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, &alpha, d_matrix, M, &beta, d_matrix, N, d_matrixT, N);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
cublasDestroy(handle);
//Ending the timer
float et2 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
printf("GPU Sgeam time = %fms\n", et2);
cudaMemcpy(h_matrixT , d_matrixT , N * M * sizeof(float) , cudaMemcpyDeviceToHost);
if (!validate(matrix, h_matrixT, N, M)) {printf("fail!\n"); return 1;}
cudaMemset(d_matrixT,0, N*M*sizeof(float));
memset(h_matrixT, 0, N*M*sizeof(float));
dim3 threads(TILE_DIM, BLOCK_ROWS);
dim3 blocks(N/TILE_DIM, M/TILE_DIM);
gettimeofday(&t1, NULL);
transposeCoalesced<<<blocks, threads >>>(d_matrixT, d_matrix);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
cudaMemcpy(h_matrixT , d_matrixT , N * M * sizeof(float) , cudaMemcpyDeviceToHost);
if (!validate(matrix, h_matrixT, N, M)) {printf("fail!\n"); return 1;}
float et3 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
printf("GPU kernel time = %fms\n", et3);
memset(h_matrixT, 0, N*M*sizeof(float));
gettimeofday(&t1, NULL);
iptransposeCoalesced<<<blocks, threads >>>(d_matrix);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
cudaMemcpy(h_matrixT , d_matrix , N * M * sizeof(float) , cudaMemcpyDeviceToHost);
if (!validate(matrix, h_matrixT, N, M)) {printf("fail!\n"); return 1;}
float et4 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
printf("GPU in-place kernel time = %fms\n", et4);
cudaFree(d_matrix);
cudaFree(d_matrixT);
return 0;
}
$ nvcc -arch=sm_20 -o t469 t469.cu -lcublas
$ ./t469
CPU time = 450.095001ms
GPU Sgeam time = 1.937000ms
GPU kernel time = 1.694000ms
GPU in-place kernel time = 1.839000ms
$
Note that this compares several different approaches to matrix transpose.
If you study the iptransposeCoalesced you will see that it is adhering to the 4 specific aspects I outlined above.
It is fishy to use __syncthreads(); in the if statement in CUDA. Try to move it outside this block by simple:
if (xxx < nEven && yyy < nEven)
{
tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];
}
__syncthreads();
if (xxx < nEven && yyy < nEven)
{
idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];
}

CUDA 2d convolution boundary incorrect

I implemented a CUDA 2D convolution code with naive way and cannot get the boundary value correct. The error happens on the top and left borders with half-of-filter wide. For example, if my filter is 7x7, the error reside in top 3 pixels and left 3 pixels (compared to C result). Can some one help me to resolve this bug? Your help is very appreciated!
Attached is my cuda code and c code:
#define ISIZE 32//input image size ISIZE*ISIZE
#define MASK_RADIUS 3
#define MASK_WIDTH (2 * MASK_RADIUS + 1)
const int FILTER_SIZE = MASK_WIDTH * MASK_WIDTH * sizeof(float);
__device__ __constant__ float d_filter[FILTER_SIZE];
__global__ void convolution2D_cuda(float* d_Result, float* d_Data, int dataH, int dataW)
{
// global mem address for this thread
const int gLoc = threadIdx.x + blockIdx.x * blockDim.x +
(threadIdx.y + blockIdx.y * blockDim.y) * dataW;
float sum = 0;
float value = 0;
for(int i = -MASK_RADIUS; i <= MASK_RADIUS; i++) //row wise
{
for (int j = -MASK_RADIUS; j <= MASK_RADIUS; j++) //col wise
{
// check row
if ( (blockIdx.x == 0) && ((threadIdx.x + j) < 0) ) //left apron
value = 0;
else if ( blockIdx.x == (gridDim.x -1) && (threadIdx.x + j) > (blockDim.x-1) ) //right apron
value = 0;
else {
// check col
if ( blockIdx.y == 0 && (threadIdx.y + i) < 0) //top apron
value = 0;
else if ( blockIdx.y == (gridDim.y-1) && (threadIdx.y + i) > (blockDim.y-1) ) //bottom apron
value = 0;
else // load data
value = d_Data[gLoc + i * dataW + j];
}
//2d array case: non-separable filter
sum += value * d_filter[ (MASK_RADIUS - i) * MASK_WIDTH + (MASK_RADIUS - j) ];
}
}
d_Result[gLoc] = sum;
}
//c code
void convolution2D_cpu(float* result, float* input, float* filter, int dataW, int dataH, int k_Width, int k_Height, int radiusY, int radiusX)
{
int y, x, ky, kx;
for (y = 0; y < dataH; y++) { //row
for (x = 0; x < dataW; x++) {
result[y*dataW + x] = 0;
float sum=0;
for(ky = -radiusY; ky <= radiusY; ky++) {
for(kx = -radiusX; kx <= radiusX; kx++) {
int dy = y + ky;
int dx = x + kx;
if (dy >= 0 && dy < dataH) //left & upper borders
if (dx >= 0 && dx < dataW) //right & lower borders
sum += input[dy*dataW + dx] * filter[(radiusY-ky)*k_Width + (radiusX - kx)];
}
}
result[y*dataW+x] = sum;
}
}
}
Part of the main() code is :
dim3 blocks(16, 16);
dim3 grids(width/16, height/16);
checkCudaErrors( cudaMalloc( (void **)&d_data, data_size ));
checkCudaErrors( cudaMalloc( (void **)&d_result, data_size ));
checkCudaErrors( cudaMemcpy(d_data, indata, data_size, cudaMemcpyHostToDevice) );
checkCudaErrors( cudaThreadSynchronize() );
convolution2D_cuda<<<grids, blocks>>>(d_result, d_data, width, height);
checkCudaErrors( cudaThreadSynchronize() );
checkCudaErrors( cudaMemcpy(output, d_result, data_size, cudaMemcpyDeviceToHost) );
checkCudaErrors( cudaThreadSynchronize() );
//check with result of CPU
convolution2D_cpu(c_result, indata, filter, width, height, len, len, MASK_RADIUS, MASK_RADIUS);
I get to resolve this mystery. The error happens on thread index calculation. threadIdx is uint, nvcc thinks (threadIdx.x + j) as unsigned int. Ex. if j is -1, it is interpreted as 4294967295 (ffffffff) and the boundary index is incorrect.

What did I miss while converting from CUDA to OpenCL? Or why is my kernel returning different output than serial code

This is my code for multiplication of a sparse matrix in compressed column format
__kernel void mykernel(__global int* colvector,
__global int* val,
__global int* result,
__global int* index,
__global int* rowptr,
__global int* sync )
{
__local int vals[1000];
for(int i=0;i<4;i++)
{
result[i]=0;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
const int items_per_row=32;//total threads working in a row
const int thread_id=get_global_id(0)+get_local_id(0);//total threads in the program
const int warpid = thread_id/items_per_row;//warp id is actual row
int lane=thread_id&(items_per_row-1);//thread id within the warp
int row = warpid;
if(row<4)
{
int sum = 0;
int row_start = rowptr[row];
int row_end = rowptr[row+1];
vals[get_global_id(0)]=0;
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
for (int i = row_start+lane; i<row_end; i+=items_per_row)
{
vals[get_local_id(0)]+=val[i]*colvector[index[i]];
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
if (lane < 16 ) vals[get_local_id(0)] += vals[get_local_id(0) + 16];
if (lane < 8 ) vals[get_local_id(0)] += vals[get_local_id(0) + 8];
if (lane < 4 ) vals[get_local_id(0)] += vals[get_local_id(0) +4];
if (lane < 2 ) vals[get_local_id(0)] += vals[get_local_id(0) + 2];
if (lane < 1 ) vals[get_local_id(0)] += vals[get_local_id(0) + 1];
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
if(lane==0)
{
result[row] += vals[get_local_id(0)];
}
}
}
the above OpenCL code was converted from the CUDA code given below:
spmv_csr_vector_kernel(const int num_rows,
const int * ptr,
const int * indices,
const float * data,
const float * x,
float * y )
{
__shared__ float vals[];
int thread_id = blockDim.x * blockIdx.x + threadIdx.x; // global thread index
int warp_id = thread_id / 32; // global warp index
int lane = thread_id & (32 - 1); // thread index within the warp
// one warp per row
int row = warp_id;
if (row < num_rows)
{
int row_start = ptr[row];
int row_end = ptr[row+1];
// compute running sum per thread
vals[threadIdx.x] = 0;
for(int jj = row_start + lane; jj < row_end; jj += 32)
{
vals[threadIdx.x] += data[jj] * x[indices[jj]];
}
// parallel reduction in shared memory
if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16];
if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8];
if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4];
if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2];
if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1];
// first thread writes the result
if (lane == 0)
{
y[row] += vals[threadIdx.x];
}
}
}
The CUDA code is correct but my OpenCL kernel is not returning correct output. I have been trying for a week now but no solution. Does anybody know what mistake I am making?
I can at least see one mistake. thread_id is not the same in each code. blockDim.x * blockIdx.x + threadIdx.x in CUDA == get_global_id(0) in OpenCL, not get_global_id(0)+get_local_id(0). Also get_local_id(0) == threadIdx.x
Try using swan, this might help you understand your problem.
you can find an article here about it.