Creating identity matrix with CUDA - cuda

Hi i try to create an identity matrix with CUDA but the output is just : zeros
__global__ void initIdentityGPU(int *devMatrix, int numR, int numC) {
int x = blockIdx.x;
int y = blockIdx.y;
int offset = x * y;
for (int i = 0; i < x ; i++) {
for (int j = 0; j < numR; j++) {
if (i == j)
devMatrix[offset] = 1;
else
devMatrix[offset] = 0;
}
}
}
Why only it puts 0s ?

The simplest way how to do it is:
__global__ void initIdentityGPU(int **devMatrix, int numR, int numC) {
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(y < numR && x < numC) {
if(x == y)
devMatrix[y][x] = 1;
else
devMatrix[y][x] = 0;
}
}
and you launch it as:
dim3 blockDim(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 gridDim((numC + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (numR + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y);
initIdentityGPU<<<gridDim, blockDim>>>(matrix, numR, numC);
It simply runs as many threads as matrix cells, each thread obtains the coordinates of its cell and in a case the cell is in the diagonal of matrix it assigns 1 or 0 otherwise. Note the code is untested.

Related

Cuda Implementation of Partitioned Subgroup

is there a more efficient way to implement the "Partitioned Subgroup" functions of Vulkan/OpenGL, which do not have to loop over all elements in the subgroup? My current implementation just uses a loop from 0 to WARP_SIZE.
References:
(slide 37+38) https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9909-nvidia-vulkan-features-update.pdf
https://github.com/KhronosGroup/GLSL/blob/master/extensions/nv/GL_NV_shader_subgroup_partitioned.txt
Simple Implementation:
__device__ uint32_t subgroupPartitionNV(ivec2 p)
{
uint32_t result = 0;
for (int i = 0; i < 32; ++i)
{
int x = __shfl_sync(0xFFFFFFFF, p(0), i);
int y = __shfl_sync(0xFFFFFFFF, p(1), i);
uint32_t b = __ballot_sync(0xFFFFFFFF, p(0) == x && p(1) == y);
if (i == threadIdx.x & 31) result = b;
}
return result;
}
__device__ uint32_t subgroupPartitionedAddNV(float value, uint32_t ballot)
{
float result = 0;
for ( unsigned int i = 0; i < 32; ++i)
{
float other_value = __shfl_sync(0xFFFFFFFF, value, i);
if ((1U << i) & ballot) result += other_value;
}
return result;
}
Thanks to the hint of Abator I came up with a more efficient solution. It's a little ugly because labeled_partition is only implemented for int but works quite well.
template <int GROUP_SIZE = 32>
__device__ cooperative_groups::coalesced_group subgroupPartitionNV(ivec2 p)
{
using namespace cooperative_groups;
thread_block block = this_thread_block();
thread_block_tile<GROUP_SIZE> tile32 = tiled_partition<GROUP_SIZE>(block);
coalesced_group g1 = labeled_partition(tile32, p(0));
coalesced_group g2 = labeled_partition(tile32, p(1));
details::_coalesced_group_data_access acc;
return acc.construct_from_mask<coalesced_group>(acc.get_mask(g1) & acc.get_mask(g2));
}
template <typename T, int GROUP_SIZE = 32>
__device__ T subgroupPartitionedAddNV(T value, cooperative_groups::coalesced_group group)
{
int s = group.size();
int r = group.thread_rank();
for (int offset = GROUP_SIZE / 2; offset > 0; offset /= 2)
{
auto v = group.template shfl_down(value, offset);
if (r + offset < s) value += v;
}
return value;
}

How can I convolution image in CUDA

I have a question about image convolution in CUDA. When I test it with small maxtrix (16*16) evething is ok. But with larger matrix, the result is always change when I run.
I think problem is 2 for loops into kernel.
__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv,
int img_width, const int img_height,
const int kernel_width, const int kernel_height )
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
float sum = 0;
for ( int j = 0; j < kernel_height; j++ )
{
for ( int i = 0; i < kernel_width; i++ )
{
int dX = x + i - kernel_width / 2;
int dY = y + j - kernel_height / 2;
if ( dX < 0 )
dX = 0;
if ( dX >= img_width )
dX = img_width - 1;
if ( dY < 0 )
dY = 0;
if ( dY >= img_height )
dY = img_height - 1;
const int idMat = j * kernel_width + i;
const int idPixel = dY * img_width + dX;
sum += (float)input[idPixel] * kernelConv[idMat];
}
}
const int idOut = y * img_width + x;
out[idOut] = abs(sum);
}
void image_convolution(float * input,float* output, int img_height, int img_width)
{
int kernel_height = 3;
int kernel_width = 3;
float kernel[] ={ 0,-0.25,0,
-0.25,1,-0.25,
0,-0.25,0
};
float * mask = new float[kernel_height*kernel_width];
for (int i = 0; i < kernel_height*kernel_width; i++)
{
mask[i] = kernel[i];
}
float * d_input, * d_output, * d_kernel;
cudaMalloc(&d_input, img_width*img_height*sizeof(float));
cudaMalloc(&d_output, img_width*img_height*sizeof(float));
cudaMalloc(&d_kernel, kernel_height*kernel_width*sizeof(float));
cudaMemcpy(d_input, input, img_width*img_height*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, mask, kernel_height*kernel_width*sizeof(float), cudaMemcpyHostToDevice);
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height);
cudaMemcpy(output, d_output, img_width*img_height*sizeof(float), cudaMemcpyDeviceToHost);
for (int i=0; i < img_width*img_height; i++)
{
printf("%d, ",(int)output[i]);
}
printf("\n\n");
}
Here is my result, I test it with 24*24 image, I run it 2 time, and I also write simple function to compared the output.
And here is result when I compare the output, there are 32 differents,at index 240, 241 ....
You have made a fairly common error in your program. When you create a grid of threads like this:
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
you are intentionally creating (usually) extra threads in each dimension, so as to fully cover the problem space (i.e. image size). There is nothing wrong with this.
However, it means we will be launching extra threads, which are outside the valid image dimension. We must ensure that these threads do nothing. The usual approach is to add a thread check to the kernel, so that threads outside the valid image dimensions do nothing. Here's a modified kernel and fully worked example showing that change:
$ cat t1219.cu
#include <iostream>
#include <cstdlib>
const int iw = 1025;
const int ih = 1025;
const int rng = 10;
__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv,
int img_width, const int img_height,
const int kernel_width, const int kernel_height )
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if ((x < img_width) && (y < img_height)){ // thread check
float sum = 0;
for ( int j = 0; j < kernel_height; j++ )
{
for ( int i = 0; i < kernel_width; i++ )
{
int dX = x + i - kernel_width / 2;
int dY = y + j - kernel_height / 2;
if ( dX < 0 )
dX = 0;
if ( dX >= img_width )
dX = img_width - 1;
if ( dY < 0 )
dY = 0;
if ( dY >= img_height )
dY = img_height - 1;
const int idMat = j * kernel_width + i;
const int idPixel = dY * img_width + dX;
sum += (float)input[idPixel] * kernelConv[idMat];
}
}
const int idOut = y * img_width + x;
out[idOut] = abs(sum);
}
}
void image_convolution(float * input,float* output, int img_height, int img_width)
{
int kernel_height = 3;
int kernel_width = 3;
float kernel[] ={ 0,-0.25,0,
-0.25,1,-0.25,
0,-0.25,0
};
float * mask = new float[kernel_height*kernel_width];
for (int i = 0; i < kernel_height*kernel_width; i++)
{
mask[i] = kernel[i];
}
float * d_input, * d_output, * d_kernel;
cudaMalloc(&d_input, img_width*img_height*sizeof(float));
cudaMalloc(&d_output, img_width*img_height*sizeof(float));
cudaMalloc(&d_kernel, kernel_height*kernel_width*sizeof(float));
cudaMemcpy(d_input, input, img_width*img_height*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, mask, kernel_height*kernel_width*sizeof(float), cudaMemcpyHostToDevice);
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height);
cudaMemcpy(output, d_output, img_width*img_height*sizeof(float), cudaMemcpyDeviceToHost);
}
int main(){
float *in, *out;
int is = ih*iw;
in = new float[is];
out = new float[is];
for (int i = 0; i < is; i++) {in[i] = rand()%rng; out[i] = -1;}
image_convolution(in,out, ih, iw);
for (int iy = 1; iy < ih-1; iy++)
for (int ix = 1; ix < iw-1; ix++){
float temp = abs(-0.25 * (in[iy*iw + ix -1] + in[iy*iw + ix +1] + in[(iy-1)*iw + ix] + in[(iy+1)*iw + ix]) + in[iy*iw+ix]);
if (out[iy*iw+ix] != temp) {std::cout << "mismatch x: " << ix << " y: " << iy << " was: " << out[iy*iw+ix] << " should be: " << temp << std::endl; return 1;}}
return 0;
}
$ nvcc -o t1219 t1219.cu
$ cuda-memcheck ./t1219
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
For image dimensions which are exact multiples of the block size (16,16) (which was true for my previous test case) this problem won't show up -- the code will work correctly. For all other test cases, we need such a thread check.

My CUDA kernel code is not working

I try to make a small code to generate numbers and return the result in array but once I run this code it's not working, I have tried to use Nsight debugger to understand where is my problem but it freezes and closes immediately.
Could you help me please to understand where is the problem in this code?
__global__ void mykernel( int* PF_tmp, int* PL_tmp, int* QF_tmp, int* QL_tmp,
int m[2], int p[5], int q[5], int i, int* n,
int out[10][5], int N)
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
int idx = blockIdx.x;
int idy = blockIdx.y;
int w = idx/100;
int x = idx%100;
int y = idy;
int z = threadIdx.x;
int len = ((i * 2) + 5);
// Fill PF_tmp & QF_tmp
if( i > 0){
for(int k = 0; k < (i * 2); k++)
{
p[k] = PF_tmp[k];
q[k] = QF_tmp[k];
}
}
// Fill X
if( x > 10)
{
p[(i*2)] = (x - (x % 10)) / 10;
p[(i*2)+1] = x % 10;
}else{
p[(i*2)] = 0;
p[(i*2)+1] = x;
}
// Fill Y
if( y > 10)
{
q[(i*2)] = (y - (y % 10)) / 10;
q[(i*2)+1] = y % 10;
}else{
q[(i*2)] = 0;
q[(i*2)+1] = y;
}
// Fill m
p[(i * 2)+2] = m[0];
q[(i * 2)+2] = m[1];
// Fill W
if( w > 10)
{
p[(i*2)+3] = (w - (w % 10)) / 10;
p[(i*2)+4] = w % 10;
}else{
p[(i*2)+3] = 0;
p[(i*2)+4] = w;
}
// Fill Z
if( z > 10)
{
q[(i*2)+3] = (z - (z % 10)) / 10;
q[(i*2)+4] = z % 10;
}else{
q[(i*2)+3] = 0;
q[(i*2)+4] = z;
}
// Fill PL_tmp & QL_tmp
if( i > 0)
{
for(int k = 0; k < (i * 2); k++)
{
p[(len-(i * 2))+k] = PL_tmp[k];
q[(len-(i * 2))+k] = QL_tmp[k];
}
}
if(id<10)
{
for(int k =0; k<5; k++)
out[id][k] = p[k];
}
}
int main()
{
cudaError err;
dim3 blocks(10000, 100);
dim3 threads(100);
int m[2] = {4,5};
int hst_out[10][5];
int p[5];
int q[5];
err = cudaMalloc((void **)&p, 5);
err = cudaMalloc((void **)&q, 5);
err = cudaMalloc((void **)&hst_out, 50);
mykernel<<<blocks, threads>>>(NULL, NULL, NULL, NULL, m, p, q, 0, NULL, hst_out, 100000000);
return 0;
}
The error very obvious, it is all C programming.
when you declare
int m[2] = {4,5};
int hst_out[10][5];
int p[5];
int q[5];
now hst_out, p, q are not a pointer, but later it is used as a pointer:
err = cudaMalloc((void **)&p, 5);
err = cudaMalloc((void **)&q, 5);
err = cudaMalloc((void **)&hst_out, 50);
so u should have declare it initially as a pointer instead, eg,
int *p;
and used it as this way:
err = cudaMalloc((void **)&p, 5*sizeof(int));
And notice too that the size you have declared is just 5 bytes....whereas I declared it as 5*sizeof(int).
For more example see:
http://cuda-programming.blogspot.sg/2013/03/how-to-avoid-uses-of-cudamalloc-in.html

CUDA 2d convolution boundary incorrect

I implemented a CUDA 2D convolution code with naive way and cannot get the boundary value correct. The error happens on the top and left borders with half-of-filter wide. For example, if my filter is 7x7, the error reside in top 3 pixels and left 3 pixels (compared to C result). Can some one help me to resolve this bug? Your help is very appreciated!
Attached is my cuda code and c code:
#define ISIZE 32//input image size ISIZE*ISIZE
#define MASK_RADIUS 3
#define MASK_WIDTH (2 * MASK_RADIUS + 1)
const int FILTER_SIZE = MASK_WIDTH * MASK_WIDTH * sizeof(float);
__device__ __constant__ float d_filter[FILTER_SIZE];
__global__ void convolution2D_cuda(float* d_Result, float* d_Data, int dataH, int dataW)
{
// global mem address for this thread
const int gLoc = threadIdx.x + blockIdx.x * blockDim.x +
(threadIdx.y + blockIdx.y * blockDim.y) * dataW;
float sum = 0;
float value = 0;
for(int i = -MASK_RADIUS; i <= MASK_RADIUS; i++) //row wise
{
for (int j = -MASK_RADIUS; j <= MASK_RADIUS; j++) //col wise
{
// check row
if ( (blockIdx.x == 0) && ((threadIdx.x + j) < 0) ) //left apron
value = 0;
else if ( blockIdx.x == (gridDim.x -1) && (threadIdx.x + j) > (blockDim.x-1) ) //right apron
value = 0;
else {
// check col
if ( blockIdx.y == 0 && (threadIdx.y + i) < 0) //top apron
value = 0;
else if ( blockIdx.y == (gridDim.y-1) && (threadIdx.y + i) > (blockDim.y-1) ) //bottom apron
value = 0;
else // load data
value = d_Data[gLoc + i * dataW + j];
}
//2d array case: non-separable filter
sum += value * d_filter[ (MASK_RADIUS - i) * MASK_WIDTH + (MASK_RADIUS - j) ];
}
}
d_Result[gLoc] = sum;
}
//c code
void convolution2D_cpu(float* result, float* input, float* filter, int dataW, int dataH, int k_Width, int k_Height, int radiusY, int radiusX)
{
int y, x, ky, kx;
for (y = 0; y < dataH; y++) { //row
for (x = 0; x < dataW; x++) {
result[y*dataW + x] = 0;
float sum=0;
for(ky = -radiusY; ky <= radiusY; ky++) {
for(kx = -radiusX; kx <= radiusX; kx++) {
int dy = y + ky;
int dx = x + kx;
if (dy >= 0 && dy < dataH) //left & upper borders
if (dx >= 0 && dx < dataW) //right & lower borders
sum += input[dy*dataW + dx] * filter[(radiusY-ky)*k_Width + (radiusX - kx)];
}
}
result[y*dataW+x] = sum;
}
}
}
Part of the main() code is :
dim3 blocks(16, 16);
dim3 grids(width/16, height/16);
checkCudaErrors( cudaMalloc( (void **)&d_data, data_size ));
checkCudaErrors( cudaMalloc( (void **)&d_result, data_size ));
checkCudaErrors( cudaMemcpy(d_data, indata, data_size, cudaMemcpyHostToDevice) );
checkCudaErrors( cudaThreadSynchronize() );
convolution2D_cuda<<<grids, blocks>>>(d_result, d_data, width, height);
checkCudaErrors( cudaThreadSynchronize() );
checkCudaErrors( cudaMemcpy(output, d_result, data_size, cudaMemcpyDeviceToHost) );
checkCudaErrors( cudaThreadSynchronize() );
//check with result of CPU
convolution2D_cpu(c_result, indata, filter, width, height, len, len, MASK_RADIUS, MASK_RADIUS);
I get to resolve this mystery. The error happens on thread index calculation. threadIdx is uint, nvcc thinks (threadIdx.x + j) as unsigned int. Ex. if j is -1, it is interpreted as 4294967295 (ffffffff) and the boundary index is incorrect.

The Floyd-Warshall algorithm in CUDA

This is the sequential piece of code I am trying to parallelize in CUDA
/*
Sequential (Single Thread) APSP on CPU.
*/
void floyd_sequential(int *mat, const size_t N)
{
for(int k = 0; k < N; k ++)
for(int i = 0; i < N; i ++)
for(int j = 0; j < N; j ++)
{
int i0 = i*N + j;
int i1 = i*N + k;
int i2 = k*N + j;
if(mat[i1] != -1 && mat[i2] != -1)
mat[i0] = (mat[i0] != -1 && mat[i0] < mat[i1] + mat[i2]) ?
mat[i0] : (mat[i1] + mat[i2]);
}
}
This is my CUDA implementation
// ParallelComputing.cpp : Defines the entry point for the console application.
//
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#define DIMENSION 10;
__global__ void gpu_Floyd(int *result, int N)
{
int j,k;
int Row = blockIdx.y * blockDim.y + threadIdx.y;
for(k = 0; k < N; k++)
{
for(j = 0; j < N; j++)
{
int i0 = Row * N + j;
int i1 = Row * N + k;
int i2 = k * N + j;
if(result[i0] != -1 && result[i2] != -1)
result[i0] = (result[i0] != -1 && result[i0] < result[i1] + result[i2]) ?
result[i0] : (result[i1] + result[i2]);
__syncthreads();
}
}
}
void GenMatrix(int *mat, const size_t N)
{
for(int i = 0; i < N*N; i ++)
mat[i] = rand()%32 - 1;
}
bool CmpArray(const int *l, const int *r, const size_t eleNum)
{
for(int i = 0; i < eleNum; i ++)
if(l[i] != r[i])
{
printf("ERROR: l[%d] = %d, r[%d] = %d\n", i, l[i], i, r[i]);
return false;
}
return true;
}
int main(int argc, char **argv)
{
// generate a random matrix.
size_t N = 10;
int *mat = (int*)malloc(sizeof(int)*N*N);
GenMatrix(mat, N);
// compute the reference result.
int *ref = (int*)malloc(sizeof(int)*N*N);
memcpy(ref, mat, sizeof(int)*N*N);
Floyd_sequential(ref, N);
//CUDA Portion
int Grid_Dim_x = 1, Grid_Dim_y = 1;
int noThreads_x, noThreads_y;
int *result = (int*)malloc(sizeof(int)*N*N);
memcpy(result, mat, sizeof(int)*N*N);
int *d_result;
// compute your results
cudaMalloc((void **)&d_result, N*N);
cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
gpu_Floyd<<<1024, 256>>>(d_result, N);
cudaMemcpy(result, d_result, cudaMemcpyDeviceToHost);
// compare your result with reference result
if(CmpArray(result, ref, N*N))
printf("The matrix matches.\n");
else
printf("The matrix do not match.\n");
free(ref);
free(result);
cudaFree(d_result);
}
However, my output always shows the matrices do not match.
I understand that in CUDA we try to map each element in the matrix to each row. However, I am trying to explore possibilities by mapping each row of the matrix to a thread instead.
As has already been mentioned, your provided GPU code does not compile, so I'm curious how you got to the observation that your output matrices do not match.
Here are some of the problems with your code:
cudaMalloc, just like malloc allocates bytes, so this is not correct:
cudaMalloc((void **)&d_result, N*N);
instead you want this:
cudaMalloc((void **)&d_result, N*N*sizeof(int));
likewise cudaMemcpy, just like memcpy, operates on bytes, and furthermore cudaMemcpy requires 4 parameters so this is not correct:
cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
instead you probably want this:
cudaMemcpy(d_result, result, N * N*sizeof(int), cudaMemcpyHostToDevice);
and your other cudaMemcpy line needs to be fixed similarly.
I'd also advise doing proper cuda error checking
Your kernel is written as if it's expecting a 2 dimensional thread array, or at least one dimensional in y, whereas you are launching a one dimensional grid in x:
gpu_Floyd<<<1024, 256>>>(d_result, N);
therefore all your kernel built-in variables in y will be 1 or 0 always, and this line of code:
int Row = blockIdx.y * blockDim.y + threadIdx.y;
will evaluate to zero for all threads in your 1-D grid in x.
Your gpu kernel is putting the results in the same matrix as the input data. For sequential code this may or may not matter, but for code that is intended to run in parallel, it can often lead to race conditions, because the order of operations (i.e. order of thread execution) is largely undefined.
Below you will find a canonical, simple implementation of the Floyd-Warshall algorithm in CUDA.
The CUDA code is accompanied with a sequential implementation and both are based on the simplifying assumption that the edges are non-negative. The full, minimum distance paths are also reconstructed in both the cases. Despite the simplifying assumption, it should be possible to grasp the relevant parallelization idea, namely that a two-dimensional thread grid is exploited and that each thread along x is assigned to a matrix column, while each block along y is assigned to a matrix row. In this way, all the columns are loaded by the threadIdx.x == 0 threads of each block in shared memory.
// --- Assumption: graph with positive edges
#include <stdio.h>
#include <string>
#include <map>
#include <iostream>
#include <fstream>
#include "Utilities.cuh"
#define BLOCKSIZE 256
using namespace std;
map<string, int> nameToNum; // --- names of vertices
map<string, map<string, int>> weightMap; // --- weights of edges
/************************/
/* READ GRAPH FROM FILE */
/************************/
int *readGraphFromFile(int &N, char *fileName) {
string vertex1, vertex2;
ifstream graphFile;
int currentWeight;
N = 0; // --- Init the number of found vertices
graphFile.open(fileName); // --- Open the graph file
graphFile >> vertex1; // --- Read first vertex
while(vertex1 != "--END--") { // --- Loop untile end of file has not been found
graphFile >> vertex2; // --- Read second vertex
graphFile >> currentWeight; // --- Read weight between first and second vertex
if (nameToNum.count(vertex1) == 0) { // --- If vertex has not yet been added ...
nameToNum[vertex1] = N; // assign a progressive number to the vertex
weightMap[vertex1][vertex1] = 0; // assign a zero weight to the "self-edge"
N++; // --- Update the found number of vertices
}
if (nameToNum.count(vertex2) == 0) {
nameToNum[vertex2] = N;
weightMap[vertex2][vertex2] = 0;
N++;
}
weightMap[vertex1][vertex2] = currentWeight; // --- Update weight between vertices 1 and 2
graphFile >> vertex1;
}
graphFile.close(); // --- Close the graph file
// --- Construct the array
int *weightMatrix = (int*) malloc(N * N * sizeof(int));
// --- Loop over all the vertex couples in the wights matrix
for (int ii = 0; ii < N; ii++)
for (int jj = 0; jj < N; jj++)
weightMatrix[ii * N + jj] = INT_MAX / 2; // --- Init the weights matrix elements to infinity
map<string, int>::iterator i, j;
// --- Loop over all the vertex couples in the map
// (*i).first and (*j).first are the weight entries of the map, while (*i).second and (*j).second are their corresponding indices
for (i = nameToNum.begin(); i != nameToNum.end(); ++i)
for (j = nameToNum.begin(); j != nameToNum.end(); ++j) {
// --- If there is connection between vertices (*i).first and (*j).first, the update the weight matrix
if (weightMap[(*i).first].count((*j).first) != 0)
weightMatrix[N * (*i).second + (*j).second] = weightMap[(*i).first][(*j).first];
}
return weightMatrix;
}
/************************************/
/* PRINT MINIMUM DISTANCES FUNCTION */
/************************************/
void printMinimumDistances(int N, int *a) {
map<string, int>::iterator i;
// --- Prints all the node labels at the first row
for (i = nameToNum.begin(); i != nameToNum.end(); ++i) printf("\t%s", i->first.c_str());
printf("\n");
i = nameToNum.begin();
// --- Loop over the rows
for (int p = 0; p < N; p++) {
printf("%s\t", i -> first.c_str());
// --- Loop over the columns
for (int q = 0; q < N; q++) {
int dd = a[p * N + q];
if (dd != INT_MAX / 2) printf("%d\t", dd);
else printf("--\t");
}
printf("\n");
i++;
}
}
void printPathRecursive(int row, int col, int *minimumDistances, int *path, int N) {
map<string, int>::iterator i = nameToNum.begin();
map<string, int>::iterator j = nameToNum.begin();
if (row == col) {advance(i, row); printf("%s\t", i -> first.c_str()); }
else {
if (path[row * N + col] == INT_MAX / 2) printf("%row %row %row No path exists\t\n", minimumDistances[row * N + col], row, col);
else {
printPathRecursive(row, path[row * N + col], minimumDistances, path, N);
advance(j, col);
printf("%s\t", j -> first.c_str());
}
}
}
void printPath(int N, int *minimumDistances, int *path) {
map<string, int>::iterator i;
map<string, int>::iterator j;
// --- Loop over the rows
i = nameToNum.begin();
for (int p = 0; p < N; p++) {
// --- Loop over the columns
j = nameToNum.begin();
for (int q = 0; q < N; q++) {
printf("From %s to %s\t", i -> first.c_str(), j -> first.c_str());
printPathRecursive(p, q, minimumDistances, path, N);
printf("\n");
j++;
}
i++;
}
}
/**********************/
/* FLOYD-WARSHALL CPU */
/**********************/
void h_FloydWarshall(int *h_graphMinimumDistances, int *h_graphPath, const int N) {
for (int k = 0; k < N; k++)
for (int row = 0; row < N; row++)
for (int col = 0; col < N; col++) {
if (h_graphMinimumDistances[row * N + col] > (h_graphMinimumDistances[row * N + k] + h_graphMinimumDistances[k * N + col])) {
h_graphMinimumDistances[row * N + col] = (h_graphMinimumDistances[row * N + k] + h_graphMinimumDistances[k * N + col]);
h_graphPath[row * N + col] = h_graphPath[k * N + col];
}
}
}
/*************************/
/* FLOYD-WARSHALL KERNEL */
/*************************/
__global__ void d_FloydWarshall(int k, int *d_graphMinimumDistances, int *d_graphPath, int N) {
int col = blockIdx.x * blockDim.x + threadIdx.x; // --- Each thread along x is assigned to a matrix column
int row = blockIdx.y; // --- Each block along y is assigned to a matrix row
if (col >= N) return;
int arrayIndex = N * row + col;
// --- All the blocks load the entire k-th column into shared memory
__shared__ int d_graphMinimumDistances_row_k;
if(threadIdx.x == 0) d_graphMinimumDistances_row_k = d_graphMinimumDistances[N * row + k];
__syncthreads();
if (d_graphMinimumDistances_row_k == INT_MAX / 2) // --- If element (row, k) = infinity, no update is needed
return;
int d_graphMinimumDistances_k_col = d_graphMinimumDistances[k * N + col];
if(d_graphMinimumDistances_k_col == INT_MAX / 2) // --- If element (k, col) = infinity, no update is needed
return;
int candidateBetterDistance = d_graphMinimumDistances_row_k + d_graphMinimumDistances_k_col;
if (candidateBetterDistance < d_graphMinimumDistances[arrayIndex]) {
d_graphMinimumDistances[arrayIndex] = candidateBetterDistance;
d_graphPath[arrayIndex] = d_graphPath[k * N + col];
}
}
/********/
/* MAIN */
/********/
int main() {
int N = 0; // --- Number of vertices
// --- Read graph array from file
int *h_graphArray = readGraphFromFile(N, "graph2.txt");
printf("\n******************\n");
printf("* Original graph *\n");
printf("******************\n");
printMinimumDistances(N, h_graphArray);
// --- Floyd-Warshall on CPU
int *h_graphMinimumDistances = (int *) malloc(N * N * sizeof(int));
int *h_graphPath = (int *) malloc(N * N * sizeof(int));
memcpy(h_graphMinimumDistances, h_graphArray, N * N * sizeof(int));
for (int k = 0; k < N; k++)
for (int l = 0; l < N; l++)
if (h_graphArray[k * N + l] == INT_MAX / 2) h_graphPath[k * N + l] = INT_MAX / 2;
else h_graphPath[k * N + l] = k;
h_FloydWarshall(h_graphMinimumDistances, h_graphPath, N);
printf("\n*************************\n");
printf("* CPU result: distances *\n");
printf("*************************\n");
printMinimumDistances(N, h_graphMinimumDistances);
printf("\n********************\n");
printf("* CPU result: path *\n");
printf("********************\n");
printPath(N, h_graphMinimumDistances, h_graphPath);
// --- Graph array device allocation and host-device memory transfer
int *d_graphMinimumDistances; gpuErrchk(cudaMalloc(&d_graphMinimumDistances, N * N * sizeof(int)));
gpuErrchk(cudaMemcpy(d_graphMinimumDistances, h_graphArray, N * N * sizeof(int), cudaMemcpyHostToDevice));
int *d_graphPath; gpuErrchk(cudaMalloc(&d_graphPath, N * N * sizeof(int)));
for (int k = 0; k < N; k++)
for (int l = 0; l < N; l++)
if (h_graphArray[k * N + l] == INT_MAX / 2) h_graphPath[k * N + l] = INT_MAX / 2;
else h_graphPath[k * N + l] = k;
gpuErrchk(cudaMemcpy(d_graphPath, h_graphPath, N * N * sizeof(int), cudaMemcpyHostToDevice));
// --- Iterations
for (int k = 0; k < N; k++) {
d_FloydWarshall <<<dim3(iDivUp(N, BLOCKSIZE), N), BLOCKSIZE>>>(k, d_graphMinimumDistances, d_graphPath, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
// --- Copy results back to the host
gpuErrchk(cudaMemcpy(h_graphMinimumDistances, d_graphMinimumDistances, N * N * sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_graphPath, d_graphPath, N * N * sizeof(int), cudaMemcpyDeviceToHost));
printf("\n**************\n");
printf("* GPU result *\n");
printf("**************\n");
printMinimumDistances(N, h_graphMinimumDistances);
printf("\n********************\n");
printf("* GPU result: path *\n");
printf("********************\n");
printPath(N, h_graphMinimumDistances, h_graphPath);
}