Having an issue detecting problem in my code | CUDA c - cuda

I'm having a hard time understanding where is the bug in my code.
The purpose of the project is to multiply two matrices and compare the time between sequential and parallel.
When I'm printing the Matrices I see that the device matrix is basically empty.
Also, I treated the matrices as an array of size n*n .
Thanks!
//This program computes the multiplication of two Matrices GPU using CUDA
#include <stdio.h>
#include <cassert>
__global__ void matrixMul(int * m,int * n,int * p,int size)
{
//Calculate Row and Column
int row=threadIdx.y*blockDim.y+threadIdx.y;
int column=threadIdx.x*blockDim.x+threadIdx.x;
int p_sum=0;
for (int i = 0; i < size; i++)
{
p_sum += m[row*size + i] * n[i*size +column];
}
p[row*size + column] = p_sum;
}
void matrixMul_seq(int * m,int * n,int * p,int size){
for(int i = 0; i < size; i++){
for(int j = 0; j < size; j++){
for(int k = 0; k < size; k++){
p[i*size +j] += m[i*size +k] +n[k*size +j];
}
}
}
}
//Initialize matricies
void init_matricies(int * mat,int n){
for(int i = 0; i < n; i++) {
for (int j = 0; j < n; j++)
{
mat[i*n+j]=rand()%1024;
}
}
}
int main(int argc,char **argv)
{
//Set our problem Size(Default = 2^10 == 1024)
int n = 1<<10;
printf("Square Matrix of size:%d\n",n);
//Size in Bytes
size_t bytes=n*n*sizeof(bytes);
//Host matricies
int *h_m;
int *h_p;
int *h_n;
int *h_p_seq;
//Host matricies
int *d_m;
int *d_p;
int *d_n;
//Memory allocation for Host Matricies
h_m=(int*)malloc(bytes);
h_n=(int*)malloc(bytes);
h_p=(int*)malloc(bytes);
h_p_seq=(int*)malloc(bytes);
init_matricies(h_m,n);
init_matricies(h_n,n);
//Allocate memory on device side
cudaMalloc(&d_n, bytes);
cudaMalloc(&d_m, bytes);
cudaMalloc(&d_p, bytes);
//Copy data to Device
cudaMemcpy(d_m,h_m, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_n,h_n, bytes, cudaMemcpyHostToDevice);
int threads_per_block =16;
dim3 block_size(threads_per_block,threads_per_block);
dim3 grid_size( n / block_size.x , n / block_size.y);
printf("Grid size X:%d, Grid size y:%d\n",grid_size.x,grid_size.y);
printf("THE RESULT OF THE SIZES: 2^6 * 2^4 * 2^6 * 2^4 \n");
matrixMul <<<grid_size,block_size>>>(d_m,d_n,d_p,n);
matrixMul_seq(h_m,h_n,h_p_seq,n);
cudaMemcpy(h_p,d_p, bytes, cudaMemcpyDeviceToHost);
for(int i = 0; i < n; i++){
for(int j = 0; j < n; j++){
//printf("Grid size X:%d, Grid size y:%d\n",h_p[ n * i + j],h_p_seq[ n * i + j]);
assert(h_p[ n * i + j]==h_p_seq[ n * i + j]);
}
}
free(h_m);
free(h_p);
free(h_n);
free(h_p_seq);
cudaFree(d_m);
cudaFree(d_n);
cudaFree(d_p);
return 0;
}

You have a variety of problems in your code:
You are calculating kernel index variables incorrectly. This is incorrect:
int row=threadIdx.y*blockDim.y+threadIdx.y;
int column=threadIdx.x*blockDim.x+threadIdx.x;
it should be:
int row=blockIdx.y*blockDim.y+threadIdx.y;
int column=blockIdx.x*blockDim.x+threadIdx.x;
The matrix operations in your calculation functions don't match each other. Kernel:
p_sum += m[row*size + i] * n[i*size +column];
^
multiplication
host code:
p[i*size +j] += m[i*size +k] +n[k*size +j];
^
addition
we also observe, from above, that the host code is doing a summation to the output variable (+=), whereas the the kernel is doing an assignment to the output variable (=):
p[row*size + column] = p_sum;
This has implications for the next issue.
malloc doesn't initialize data. Since this operation is creating the output array that will be used by the host code, which is doing a summation to it, we must initialize this allocation to zero:
h_p_seq=(int*)malloc(bytes);
memset(h_p_seq, 0, bytes); // must add this line to initialize to zero
The calculation of the size of your arrays in bytes is too large. You have defined your arrays to be of type int. But your size calculation is like this:
size_t bytes=n*n*sizeof(bytes);
An int is a 4-byte quantity, whereas a size_t variable like bytes is an 8-byte quantity. This doesn't cause an actual problem, but is unnecessary. I would suggest changing it to:
size_t bytes=n*n*sizeof(int);
With the above items addressed, your code runs correctly for me.

Related

Passing a row of pointers to __global__ function

I am trying to pass a row of pointers of a two dimensional array of pointers in CUDA. See my code below. Here the array of pointers is noLocal. Because I am doing an atomicAdd I am expecting a number different of zero in line printf("Holaa %d\n", local[0][0]);, but the value I get is 0. Could you help me to pass an arrow in CUDA by reference, please?
__global__ void myadd(int *data[8])
{
unsigned int x = blockIdx.x;
unsigned int y = threadIdx.x;
unsigned int z = threadIdx.y;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
//printf("Ola sou a td %d\n", tid);
for (int i; i<8; i++)
atomicAdd(&(*data)[i],10);
}
int main(void)
{
int local[20][8] = { 0 };
int *noLocal[20][8];
for (int d = 0; d< 20;d++) {
for (int dd = 0; dd< 8; dd++) {
cudaMalloc(&(noLocal[d][dd]), sizeof(int));
cudaMemcpy(noLocal[d][dd], &(local[d][dd]), sizeof(int), cudaMemcpyHostToDevice);
}
myadd<<<20, dim3(10, 20)>>>(noLocal[d]);
}
for (int d = 0; d< 20;d++)
for (int dd = 0; dd < 8; dd++)
cudaMemcpy(&(local[d][dd]), noLocal[d][dd], sizeof(int), cudaMemcpyDeviceToHost);
printf("Holaa %d\n", local[0][0]);
for (int d = 0; d < 20; d++)
for (int dd = 0; dd < 8; dd++)
cudaFree(noLocal[d][dd]);
}
I believe you received good advice in the other answer. I don't recommend this coding pattern. For general reference material on creating 2D arrays in CUDA, see this answer.
When I compile the code you have shown, I get warnings of the form "i is used before its value is set". This kind of warning should not be ignored. It arises from this statement which doesn't make sense to me:
for (int i; i<8; i++)
that should be:
for (int i = 0; i<8; i++)
It's not clear you understand the C++ concepts of pointers and arrays. This:
int local[20][8] = { 0 };
represents an array of 20x8 = 160 integers. If you want to imagine it as an array of pointers, you could pretend that it includes 20 pointers of the form local[0], local[1]..local[19]. Each of those "pointers" points to an array of 8 integers. But there is no sensible comparison to suggest that it has 160 pointers in it. Furthermore the usage pattern you indicate in your kernel does not suggest that you expect 160 pointers to individual integers. But that is exactly what you are creating here:
int *noLocal[20][8]; //this is declaring a 2D array of 160 *pointers*
for (int d = 0; d< 20;d++) { // the combination of these loops means
for (int dd = 0; dd< 8; dd++) { // you will create 160 *pointers*
cudaMalloc(&(noLocal[d][dd]), sizeof(int));
To mimic your host array (local) you want to create 20 pointers each of which is pointing to an allocation of 8 int quantities. The usage in your kernel code here:
&(*data)[i]
means that you intend to take a single pointer, and offset it by i values ranging from 0 to 7. It does not mean that you expect to receive 8 individual pointers. Again, this is C++ behavior, not unique or specific to CUDA.
In order to make your code "sensible" there were a variety of changes I had to make. Here's a "fixed" version:
$ cat t1858.cu
#include <cstdio>
__global__ void myadd(int data[8])
{
// unsigned int x = blockIdx.x;
// unsigned int y = threadIdx.x;
// unsigned int z = threadIdx.y;
// int tid = blockDim.x * blockIdx.x + threadIdx.x;
//printf("Ola sou a td %d\n", tid);
for (int i = 0; i<8; i++)
atomicAdd(data+i,10);
}
int main(void)
{
int local[20][8] = { 0 };
int *noLocal[20];
for (int d = 0; d< 20;d++) {
cudaMalloc(&(noLocal[d]), 8*sizeof(int));
cudaMemcpy(noLocal[d], local[d], 8*sizeof(int), cudaMemcpyHostToDevice);
myadd<<<20, dim3(10, 20)>>>(noLocal[d]);
}
for (int d = 0; d< 20;d++)
cudaMemcpy(local[d], noLocal[d], 8*sizeof(int), cudaMemcpyDeviceToHost);
printf("Holaa %d\n", local[0][0]);
for (int d = 0; d < 20; d++)
cudaFree(noLocal[d]);
}
$ nvcc -o t1858 t1858.cu
$ cuda-memcheck ./t1858
========= CUDA-MEMCHECK
Holaa 40000
========= ERROR SUMMARY: 0 errors
$
The number 40000 is correct. It comes about because every thread is doing an atomic add of 10, and you have 20x200 threads that are doing that. 10x20x200 = 40000.
You should simply not be doing anything like that. You are wasting time and memory with these excessive allocations. And - your kernel would be pretty slow as well. I am 100% certain this is not what you were asked, nor what you wanted, to do.
Instead, you should:
Allocate a single large buffer on the device to fit the data you need.
Avoid using pointers on the device side, except to that buffer, unless absolutely necessary.
If you somehow have to use a 2D pointer array - add relevant offsets to your buffer's base pointer to get different pointers into it.

How to create and use a 1D layered texture in CUDA

I am new to CUDA. I have figured out how to do 1D and 2D textures in CUDA. However, I am struggling with how to use a 1D layered texture. The output of my kernel which uses the texture is all zeros, which is definitely incorrect. However, I am not sure what I am doing wrong. I have serious doubts that I set up this texture correctly, but I checked for cuda errors everywhere and couldn't find any issues. Can someone show me how to correctly set up a 1D layered texture and use it. Here is my code. Thanks in advance:
// To Compile: nvcc backproj.cu -o backproj.out
// To Run: ./backproj.out
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#define pi acos(-1)
// 1D float textures
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;
// 1D interpolation kernel: Should be very similar to what you get if you used 1D interpolation on MATLAB
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int layer) {
unsigned int location_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (location_idx < numlocations) {
// Get the location you want to interpolate from the array
float loc2find = (float) d_locations[location_idx] + 0.5f;
// Read from texture and write to global memory
d_output[location_idx] = tex1DLayered(texRef, loc2find, layer);
}
}
// Host code
int main()
{
// Setup h_data and locations to interpolate from
const unsigned int len = 10;
const unsigned int numlayers = 3;
const unsigned int upsamp = 3;
const unsigned int loclen = 1 + (len - 1) * upsamp;
float idx_spacing = 1/(float)upsamp;
float h_data[len][numlayers], h_loc[loclen];
for (int i = 0; i < len; i++)
for (int j = 0; j < numlayers; j++)
h_data[i][j] = 1+cosf((float) pi*i/(j+1.0f));
for (int i = 0; i < loclen; i ++)
h_loc[i] = i*idx_spacing;
// Get the memory locations you want
float* d_loc;
cudaMalloc(&d_loc, loclen * sizeof(float));
cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice);
// Allocate CUDA array in device memory
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray* cuArray;
cudaMallocArray(&cuArray, &channelDesc, len, numlayers);
// Copy to device memory some data located at address h_data in host memory
cudaMemcpyToArray(cuArray, 0, 0, h_data, len * numlayers * sizeof(float), cudaMemcpyHostToDevice);
// Set texture reference parameters
texRef.addressMode[0] = cudaAddressModeBorder;
texRef.filterMode = cudaFilterModeLinear;
texRef.normalized = false;
// Bind the array to the texture reference
cudaBindTextureToArray(texRef, cuArray, channelDesc);
// Allocate result of transformation in device memory
float* d_output;
cudaMalloc(&d_output, loclen * sizeof(float));
// Invoke kernel
int thdsPerBlk = 256;
int blksPerGrid = (int) (loclen / thdsPerBlk) + 1;
printf("Threads Per Block: %d, Blocks Per Grid: %d\n", thdsPerBlk, blksPerGrid);
interp1Kernel <<<blksPerGrid, thdsPerBlk >>>(d_output, d_loc, loclen, 0);
// Print Results
printf("\n Original Indices \n");
for (int i = 0; i < len; i++) printf(" %d ", i);
printf("\n Original array \n");
for (int i = 0; i < len; i++) printf("%5.3f ", h_data[i][0]);
printf("\n Output Indices \n");
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("\n Output Array \n");
cudaMemcpy(h_loc, d_output, loclen * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("\n");
// Free device memory
cudaFreeArray(cuArray);
cudaFree(d_output);
return 0;
}
You must use cudaMalloc3DArray with the cudaArrayLayered flag set to allocate memory for layered textures. There is a complete example of layered texture usage in the toolkit samples which you can study to see how they work.
Unfortunately, the CUDA SDK only shows you how to do it when you have 2D layered texture. There is some more trickiness when it comes to 1D layered textures. It turns out you have to put a 0 into the second argument for make_cudaExtent when making the extentDesc as follows:
cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers); // <-- 0 height required for 1Dlayered
However, when using make_cudaExtent for mParams.extent for cudaMemcpy3D, you still need to put a 1 for the second argument:
mParams.extent = make_cudaExtent(len, 1, numlayers); // <<-- non zero height required for memcpy to do anything
Furthermore, there are some other non-obvious details such as the pitch for make_cudaPitchedPtr. So I have included my complete and functioning code for the 1D layered texture. I couldn't find an example of this anywhere. So hopefully this will help out others who are in the same boat:
// To Compile: nvcc layeredTexture1D.cu -o layeredTexture1D.out
// To Run: ./layeredTexture1D.out
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#define pi acos(-1)
// 1D float textures: x is for input values, y is for corresponding output values
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;
// 1D interpolation kernel: Should be very similar to what you get if you used 1D interpolation on MATLAB
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int numlayers) {
unsigned int location_idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int layer = blockIdx.y * blockDim.y + threadIdx.y;
if (location_idx < numlocations && layer < numlayers) {
// Get the location you want to interpolate from the array
float loc2find = (float)d_locations[location_idx] + 0.5f;
// Read from texture and write to global memory
d_output[location_idx + layer*numlocations] = tex1DLayered(texRef, loc2find, layer);
//printf("location=%d layer=%d loc2find=%f result=%f \n", location_idx, layer, loc2find, d_output[location_idx]);
}
}
// Host code
int main()
{
// Setup h_data and locations to interpolate from
const unsigned int len = 7;
const unsigned int numlayers = 3;
const unsigned int upsamp = 4;
const unsigned int loclen = 1 + (len - 1) * upsamp;
float idx_spacing = 1 / (float)upsamp;
float h_data[numlayers*len], h_loc[loclen];
for (int i = 0; i < len; i++)
for (int j = 0; j < numlayers; j++)
h_data[len*j + i] = 1 + cosf((float)pi*i / (j + 1.0f));
for (int i = 0; i < loclen; i++)
h_loc[i] = i*idx_spacing;
// Get the memory locations you want
float* d_loc;
cudaMalloc(&d_loc, loclen * sizeof(float));
cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice);
// Allocate CUDA array in device memory
cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers); // <-- 0 height required for 1Dlayered
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaMemcpy3DParms mParams = { 0 };
mParams.srcPtr = make_cudaPitchedPtr(h_data, len*sizeof(float), len, 1);
mParams.kind = cudaMemcpyHostToDevice;
mParams.extent = make_cudaExtent(len, 1, numlayers); // <<-- non zero height required for memcpy to do anything
cudaArray* cuArray;
cudaMalloc3DArray(&cuArray, &channelDesc, extentDesc, cudaArrayLayered);
mParams.dstArray = cuArray;
cudaMemcpy3D(&mParams);
// Set texture reference parameters
texRef.addressMode[0] = cudaAddressModeBorder;
texRef.filterMode = cudaFilterModeLinear;
texRef.normalized = false;
// Bind the array to the texture reference
cudaBindTextureToArray(texRef, cuArray, channelDesc);
// Allocate result of transformation in device memory
float *d_output;
cudaMalloc(&d_output, loclen * numlayers * sizeof(float));
float h_output[loclen * numlayers];
// Invoke kernel
dim3 dimBlock(16, 16, 1);
dim3 dimGrid((loclen + dimBlock.x - 1) / dimBlock.x,
(numlayers + dimBlock.y - 1) / dimBlock.y, 1);
interp1Kernel<<<dimGrid, dimBlock>>>(d_output, d_loc, loclen, numlayers);
// Print Results
printf("\n Original Indices \n");
for (int i = 0; i < len; i++) printf(" %d ", i);
printf("\n Original array \n");
for (int j = 0; j < numlayers; j++) {
for (int i = 0; i < len; i++) {
printf("%5.3f ", h_data[i + j*len]);
}
printf("\n");
}
printf("\n Output Indices \n");
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("\n Output Array \n");
cudaMemcpy(h_output, d_output, loclen * numlayers * sizeof(float), cudaMemcpyDeviceToHost);
for (int j = 0; j < numlayers; j++) {
for (int i = 0; i < loclen; i++) {
printf("%5.3f ", h_output[i + j*loclen]);
}
printf("\n");
}
printf("\n");
// Free device memory
cudaFreeArray(cuArray);
cudaFree(d_output);
return 0;
}

Find max of matrix with window size in CUDA [duplicate]

I just started in CUDA. Now I have a question.
I have N*N matrix, and a window scale is 8x8. I want subdivided this matrix into multiple sub-matrix and find max value of this.
For example if I have 64*64 matrix so I will have 8 small matrix with 8*8 scale and find out 8 max values. Finally I save all max values into new array, but its order always change. I want find solution to keep them in right order
__global__ void calculate_emax_kernel(float emap[],float emax[], int img_height, int img_width,int windows_size)
{
int x_index = blockIdx.x*blockDim.x+threadIdx.x;
int y_index = blockIdx.y*blockDim.y+threadIdx.y;
int num_row_block = img_height/windows_size;
int num_col_block = img_width/windows_size;
__shared__ float window_elements[256];
__shared__ int counter;
__shared__ int emax_count;
if (threadIdx.x == 0) emax_count = 0;
__syncthreads();
int index;
int emax_idx = 0;
if(y_index >= img_height|| x_index >= img_width) return;
for(int i = 0; i < num_row_block; i++)
{
for(int j = 0; j < num_col_block; j++)
{
counter = 0;
if(y_index >= i*windows_size && y_index < (i+1)*windows_size
&& x_index >= j*windows_size && x_index < (j+1)*windows_size)
{
int idx = y_index*img_height + x_index;
index = atomicAdd(&counter, 1);
window_elements[index] = emap[idx];
__syncthreads();
// reduction
unsigned int k = (windows_size*windows_size)/2;
while(k != 0)
{
if(index < k)
{
window_elements[index] = fmaxf(window_elements[index], window_elements[index+k]);
}
k /= 2;
}
if(index == 0)
{
emax[i*num_row_block+j] = window_elements[index];
}
}
__syncthreads();
}
__syncthreads();
}
__syncthreads();
}
This is my configuration
void construct_emax(float *input,float *output, int img_height, int img_width)
{
int windows_size = 4;
float * d_input, * d_output;
cudaMalloc(&d_input, img_width*img_height*sizeof(float));
cudaMalloc(&d_output, img_width*img_height*sizeof(float));
cudaMemcpy(d_input, input, img_width*img_height*sizeof(float), cudaMemcpyHostToDevice);
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
calculate_emax_kernel<<<gridsize,blocksize>>>(d_input,d_output,img_height,img_width,windows_size);
}
With CUDA, parallel reduction is tricky; segmented parallel reduction is trickier. Now you are doing it in 2-D, and your segment/window is smaller than the thread block.
For large window size, I don't think it is a problem. You could use one thread block to reduce one window. For example if you have a 16x16 window, you could simply use 16x16 thread block. If you have even larger window size, for example 64x64, you could still use 16x16 thread block. First reduce the 64x64 window to 16x16 elements during data loading, then reduce to 1 scalar within the thread block.
For window size smaller than the block size, you will have to reduce multiple windows per thread block for higher performance. You could use your current block/grid configuration, where each 256-thread block (16x16) is responsible for 16 4x4 windows. But this will not be optimal because each 32-thread wrap is organized in two parts (2x16). This is not good for coalesced global memory access, and it is hard to map a 2x16 warp to one or more 4x4 windows for efficient parallel reduction.
Alternatively I would suggest you use 1-D thread block with 256 threads. Every m threads reduce one mxm window. Then you could use 2-D grid to cover the whole image.
const int m = window_size;
dim3 blocksize(256);
dim3 gridsize((img_width+255)/256, (img_height+m-1)/m);
In the kernel function, you could
reduce each mxm window to a 1xm vector during global data loading;
use tree reduction method to reduce the 1xm vector to a scalar.
This following code is a conceptual demo which works when m is a power of 2 and m <= 32. You could further modify it for arbitrary m and better boundary checking.
#include <assert.h>
#include <cuda.h>
#include <thrust/device_vector.h>
__global__ void calculate_emax_kernel(const float* input, float* output,
int height, int width, int win_size,
int out_width) {
const int tid = threadIdx.x;
const int i = blockIdx.y * win_size;
const int j = blockIdx.x * 256 + tid;
const int win_id = j % win_size;
__shared__ float smax[256];
float tmax = -1e20;
if (j < width) {
for (int tile = 0; tile < win_size; tile++) {
if (i + tile < height) {
tmax = max(tmax, input[(i + tile) * width + j]);
}
}
}
smax[tid] = tmax;
for (int shift = win_size / 2; shift > 0; shift /= 2) {
if (win_id < shift) {
smax[tid] = max(smax[tid], smax[tid + shift]);
}
}
if (win_id == 0 && j < width) {
output[blockIdx.y * out_width + (j / win_size)] = smax[tid];
}
}
int main() {
const int height = 1024;
const int width = 1024;
const int m = 4;
thrust::device_vector<float> in(height * width);
thrust::device_vector<float> out(
((height + m - 1) / m) * ((width + m - 1) / m));
dim3 blocksize(256);
dim3 gridsize((width + 255) / 256, (height + m - 1) / m);
assert(m == 2 || m == 4 || m == 8 || m == 16 || m == 32);
calculate_emax_kernel<<<gridsize, blocksize>>>(
thrust::raw_pointer_cast(in.data()),
thrust::raw_pointer_cast(out.data()),
height, width, m, (width + m - 1) / m);
return 0;
}
In case you're willing to use a library, few pointers:
use NPP, set of primitives (from nvidia)
https://docs.nvidia.com/cuda/npp/group__image__filter__max.html
a lower level library, for other reduce operations and more granularity in the way you use the hardware (from nvidia / nvlabs)
http://nvlabs.github.io/cub/

Optimize vector matrix multiplication in cuda with large number of zeros

I am using the following kernel to optimize vector-matrix multiplication for the case where both the vector and the matrix have a large number of zeros. The use of this kernel may reduce the time taken for such a multiplication by up to half of the time taken by cublasSgemv, for the case where there are more than 90% zeros. But, it is still much longer than an equivalent blas gemm host call on Ubuntu 14.04
vec = 1 x m, mat = m x m and prod = 1 x m; all are in row-major order
m >= 5000
__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
if(x < m)
{
prod[x] = 0;
for(int i = 0; i < m; i++)
{
int offset = i*m + x;
if( mat[offset] != 0 && vec[i] != 0 )
prod[x] += vec[i] * mat[i*m+x];
}
}
}
What can be done to further enhance the performance of this kernel apart from libraries like cuSparse?
Would be nice if this optimization was compatible with Compute Capability of 1.2
Thanks
EDIT
Corrected: prod = 1 x m
GPU = Quadro FX 1800M, Cuda v.5.0 on Ubuntu 14.04
EDIT
Complete code that performs multiplication using i. blas, ii. cublas, iii. above kernel for m = 6000. Please enter 0, when asked to enter a value
#include <iostream>
#include <stdio.h>
#include <time.h>
#include <cblas.h>
#include <cublas_v2.h>
#include <math.h>
using namespace std;
const int m = 6000;
const int BS = 512; // threads per block
const int NB = ceil((float) m / BS); // number of blocks
__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
if(x < m)
{
prod[x] = 0;
for(int i = 0; i < m; i++)
{
int offset = i*m + x;
if( mat[offset] != 0 && vec[i] != 0 )
prod[x] += vec[i] * mat[i*m+x];
}
}
}
int main()
{
timespec blas_start, blas_end, cublas_start, cublas_end, opt_start, opt_end;
long totalnsec; //total nano sec
double totalsec, totaltime;
int i, j;
float *A = new float[m]; // 1 x m
float *B = new float[m*m]; // m x m
float *C = new float[m]; // 1 x m
float input;
cout<<"Enter a value to populate the vector (0 to make it sparse) ";
cin>>input;
// input martix A: every 600th element is non-zero i.e 90% zero
for(i = 0; i < m; i++)
{
A[i] = input;
if( i % 600 == 0) //adjust for sparsity
A[i] = i;
}
// input matrix B: identity matrix
for(i = 0; i < m; i++)
for(j = 0; j < m; j++)
B[j*m + i] = (i==j);
//blas on host
clock_gettime(CLOCK_REALTIME, &blas_start);
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 1, m, m, 1.0f, A, m, B, m, 0.0f, C, m);
//cblas_sgemv(CblasRowMajor, CblasTrans, m, m, 1.0f, B, m, A, 1, 0.0f, C, 1);
clock_gettime(CLOCK_REALTIME, &blas_end);
/* for(i = 0; i < m; i++) printf("%f ", C[i]); */
//cublas section
cudaError_t cudaStat;
cublasHandle_t handle;
cublasCreate(&handle);
float *A_d, *B_d, *C_d;
cudaStat = cudaMalloc(&A_d, sizeof(float)*m);
if(cudaStat != cudaSuccess) printf("Error Allocating Memory for A_d\n");
cudaStat = cudaMalloc(&B_d, sizeof(float)*m*m);
if(cudaStat != cudaSuccess) printf("Error Allocating Memory for B_d\n");
cudaStat = cudaMalloc(&C_d, sizeof(float)*m);
if(cudaStat != cudaSuccess) printf("Error Allocating Memory for C_d\n");
cudaMemcpy(A_d, A, sizeof(float)*m, cudaMemcpyHostToDevice);
cudaMemcpy(B_d, B, sizeof(float)*m*m, cudaMemcpyHostToDevice);
float alpha = 1.0f, beta = 0.0f;
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &cublas_start);
cublasSgemv(handle, CUBLAS_OP_N, m, m, &alpha, B_d, m, A_d, 1, &beta, C_d, 1);
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &cublas_end);
cudaMemcpy(C, C_d, sizeof(float)*m, cudaMemcpyDeviceToHost);
/* for(i = 0; i < m; i++) printf("%f ", C[i]); */
// Call kernel having Optimization for Zeros
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &opt_start);
/////////////////// call kernel //////////////////
calc_v_m<<<NB, BS>>>(A_d, B_d, C_d, m);
//////////////////////////////////////////////////
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &opt_end);
cudaMemcpy(C, C_d, sizeof(float)*m, cudaMemcpyDeviceToHost);
/*for(i = 0; i < m; i++) printf("%f ", C[i]); */
// Print times
// blas time
totalsec = (double)blas_end.tv_sec - (double)blas_start.tv_sec;
totalnsec = blas_end.tv_nsec - blas_start.tv_nsec;
if(totalnsec < 0)
{
totalnsec += 1e9;
totalsec -= 1;
}
totaltime = totalsec + (double)totalnsec*1e-9;
cout<<"blas Time = "<< totaltime << "\n";
//cublas time
totalsec = (double)cublas_end.tv_sec - (double)cublas_start.tv_sec;
totalnsec = cublas_end.tv_nsec - cublas_start.tv_nsec;
if(totalnsec < 0)
{
totalnsec += 1e9;
totalsec -= 1;
}
totaltime = totalsec + (double)totalnsec*1e-9;
cout<<"cublas Time = "<< totaltime << "\n";
//Optimized Kernel Time
totalsec = (double)opt_end.tv_sec - (double)opt_start.tv_sec;
totalnsec = opt_end.tv_nsec - opt_start.tv_nsec;
if(totalnsec < 0)
{
totalnsec += 1e9;
totalsec -= 1;
}
totaltime = totalsec + (double)totalnsec*1e-9;
cout<<"Opt Kernel Time = "<< totaltime << "\n";
return 0;
}
Results
$ nvcc -arch=sm_12 blascomp.cu -o blascomp.o -lblas -lcublas
$ ./blascomp.o
Enter a value to populate the vector (0 to make it sparse) 0
blas Time = 0.000105207
cublas Time = 0.0070294
Opt Kernel Time = 0.00642797
At least on my system blas is still the fastest for such a scenario
Things get even more interesting if every '1200th' element instead of '600th' is set to 0
Enter a value to populate the vector (0 to make it sparse) 0
blas Time = 7.84e-05
cublas Time = 0.00698783
Opt Kernel Time = 0.00643042
The important thing to recognise here is that the gemv operation you are concerned with is fundamentally memory throughput limited on GPUs, rather than compute throughput limited. This implies that an "optimisation" as you have shown in your kernel:
__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
if(x < m)
{
prod[x] = 0;
for(int i = 0; i < m; i++)
{
int offset = i*m + x;
if( mat[offset] != 0 && vec[i] != 0 )
prod[x] += vec[i] * mat[i*m+x];
}
}
}
isn't really an optmisation at all, simply because the memory transactions are the performance bottleneck in the kernel, not the floating point arithmetic, and your code must perform most of the memory transactions irrespective of whether the multiply add operation will be performed because of zero detection or not.
Consider the following, instrumented version of roughly the same code:
__constant__ float cvec1[2];
__global__ void
__launch_bounds__(512,4)
calc_v_m1(const float* __restrict__ vec,
const float* __restrict__ mat,
float* __restrict__ prod,
int m,
int do_reads = 1,
int do_write = 1)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
if(x < m)
{
float res = 0;
float mval = cvec1[0], vval = cvec1[1];
#pragma unroll 8
for(int i = 0; i < m; i++)
{
int offset = i*m + x;
if (do_reads) {
mval = mat[offset];
vval = vec[i];
}
res += mval * vval;
}
if (do_write) prod[x] = res;
}
}
Here I have added two optional arguments which control whether the kernel will load from global memory, and whether the kernel will store to global memory. This allows me to quantify the performance impact of the memory loads, computation, and memory stores independently. The results using your test code are instructive:
Function nvprof time
-----------------------------------------------
cublasSgemv 942.75us
calc_v_m 2798.4us
calc_v_m1(do_reads=1, do_write=1) 962.40us
calc_v_m1(do_reads=1, do_write=0) 970.40us
calc_v_m1(do_reads=0, do_write=1) 55.166us
calc_v_m1(do_reads=0, do_write=0) 55.102us
[All benchmarking done on a GTX970 using the CUDA 7.5 release toolchain and CUBLAS 7.5 library]
In no particular order:
The full instrumented kernel runtime is within a few percent of the equivalent CUBLAS call
The memory fetches from global memory are the bottleneck
The actual computations in the kernel only constitute 5% of the kernel running time
The "fire-and-forget" nature of write operations in CUDA means that the latency of the write has no significant effect on throughput.
Your "optimised" kernel is considerably slower than either CUBLAS or the instrumented kernel, probably because all you are introducing is branch divergence without addressing the source of the kernel bottleneck (the latency of the memory loads).
The only times conditionally executing the FMAD operation makes sense would be in an architecture where memory has near zero latency and floating point throughput was severely constrained. The GPU definitely doesn't fall into that category.
The only other option for optimising this would be to exploit a priori information about the sparsity patterns in the LHS matrix to remove the need to read zero entries. Which is precisely what sparse matrix formats and linear algebra codes are designed to accommodate.

Cuda program not working

i'm a beginner in cuda programming. I'm trying an own easy code but it's not working and I don't know what else to do.
My code:
#include <mpi.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/wait.h>
// Prototypes
__global__ void helloWorld(char*);
__device__ int getGlobalIdx_2D_2D();
// Host function
int main(int argc, char** argv)
{
unsigned int i, N, gridX, gridY, blockX, blockY;
N = 4096000;
char *str = (char *) malloc(N*sizeof(char));
for(i=0; i < N; i++) str[i]='c';
MPI_Init (&argc, &argv);
char *d_str;
size_t size = (size_t) N*sizeof(char);
cudaMalloc((void**)&d_str, size);
cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
gridX = 100;
gridY = 10;
blockX = blockY = 64;
dim3 dimGrid(gridX, gridY); // 4096 chars per block
dim3 dimBlock(blockX, blockY); // one thread per character, 2D
printf("dimGrid(%d, %d)\t", gridX, gridY);
printf("dimBlock(%d, %d)\t", blockX, blockY);
helloWorld<<< dimGrid, dimBlock >>>(d_str);
cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
MPI_Barrier (MPI_COMM_WORLD);
cudaFree(d_str);
printf("\nRes:\n");
for(i = 0; i < N; i++) printf("\t[%u] %c\n", i, str[i]);
MPI_Finalize ();
free(str);
return 0.0;
}
// Device kernel
__global__ void helloWorld(char* str)
{
// determine where in the thread grid we are
int pos = getGlobalIdx_2D_2D();
if (pos % 2 == 0) str[pos] -= 2;
else str[pos] += 8;
}
__device__ int getGlobalIdx_2D_2D()
{
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y) +
(threadIdx.y * blockDim.x) + threadIdx.x;
return threadId;
}
My desired output is: jajajajajajaja... x4096000
I've read that '%' operation is not efficient, but I don't think is the problem there.
Thanks!
You are performing absolutely no CUDA error checking, it is really beneficial to do so. Once you enable it you can find that block dimensions 64 x 64 are invalid as it results into 4096 threads within one block, which is not a valid configuration.