CUDA loop unrolling in array add

I want to compute out = alpha * px + beta * py, where px and py are arrays.
I have a simple kernel:
__global__
void saxpyGPU2( float *out, const float *px, const float *py, size_t N, float alpha, float beta )
{
    size_t i = blockDim.x*blockIdx.x + threadIdx.x;
    while (i < N)
    {
        out[i] = alpha * px[i] + beta * py[i];
        i += blockDim.x*gridDim.x;
    }
}
It works, so now I want to unroll the loop.
The code in the CUDA Handbook is:
template<const int n>
__device__
void saxpy_unrolled(float *out, const float *px, const float *py, size_t N, float alpha, float beta)
{
    float x[n], y[n];
    size_t i;
    for ( i = n*blockIdx.x*blockDim.x+threadIdx.x; i < N-n*blockDim.x*gridDim.x; i += n*blockDim.x*gridDim.x ) {
        for ( int j = 0; j < n; j++ ) {
            size_t index = i+j*blockDim.x;
            x[j] = px[index];
            y[j] = py[index];
        }
        for ( int j = 0; j < n; j++ ) {
            size_t index = i+j*blockDim.x;
            out[index] = alpha*x[j]+beta*y[j];
        }
    }
    // to avoid the (index<N) conditional in the inner loop,
    // we left off some work at the end
    for ( int j = 0; j < n; j++ ) {
        for ( int j = 0; j < n; j++ ) {
            size_t index = i+j*blockDim.x;
            if ( index<N ) {
                x[j] = px[index];
                y[j] = py[index];
            }
        }
        for ( int j = 0; j < n; j++ ) {
            size_t index = i+j*blockDim.x;
            if ( index<N ) out[index] = alpha*x[j]+beta*y[j];
        }
    }
}
__global__
void saxpyGPU( float *out, const float *px, const float *py, size_t N, float alpha, float beta )
{
    saxpy_unrolled<4>( out, px, py, N, alpha, beta );
}
I don't understand the second branch, which handles the case when i >= N - n*blockDim.x*gridDim.x. Why use an outer loop here:
for ( int j = 0; j < n; j++ ) {
    for ( int j = 0; j < n; j++ ) ... }
I tested those two kernels; the first one is OK, but the second one, copied from the book, gives incorrect results.
I initialize two arrays with a[i] = i and b[i] = 10*i for i < 1024, and I want to compute c = alpha*a + beta*b using the two kernels above, but the loop-unrolled kernel produces about 4.3e8 for every element of c.
This is my test code:
#include <iostream>
using namespace std;

int main(){
    int arraySize = 1024;
    float* a = new float[arraySize];
    float* b = new float[arraySize];
    float* c = new float[arraySize];
    for (int i = 0; i < arraySize; i++)
    {
        a[i] = 1.0 * i;
        b[i] = 10.0 * i;
        c[i] = 0.0;
    }
    float* d_a;
    float* d_b;
    float* d_c;
    cudaMalloc((void**)&d_a, sizeof(float)*arraySize);
    cudaMemcpy(d_a, a, sizeof(float)*arraySize, cudaMemcpyHostToDevice);
    cudaMalloc((void**)&d_b, sizeof(float)*arraySize);
    cudaMemcpy(d_b, b, sizeof(float)*arraySize, cudaMemcpyHostToDevice);
    cudaMalloc((void**)&d_c, sizeof(float)*arraySize);
    // CPU reference result (alpha = beta = 1).
    for (int i = 0; i < arraySize; i++)
    {
        c[i] = a[i] + b[i];
    }
    dim3 block_size(256, 1, 1);
    dim3 grid_size((arraySize - 1)/block_size.x + 1, 1, 1);
    float alpha = 1.0;
    float beta = 1.0;
    float* temp = new float[arraySize];
    bool flag = true;
    if (flag)
    {
        saxpyGPU<<<grid_size, block_size>>>(d_c, d_a, d_b, arraySize, alpha, beta);
        cudaMemcpy(temp, d_c, arraySize*sizeof(float), cudaMemcpyDeviceToHost);
        for (int i = 0; i < arraySize; i++)
        {
            cout << (temp[i] - c[i]) << ",";
        }
    }
    else
    {
        saxpyGPU2<<<grid_size, block_size>>>(d_c, d_a, d_b, arraySize, alpha, beta);
        cudaMemcpy(temp, d_c, arraySize*sizeof(float), cudaMemcpyDeviceToHost);
        for (int i = 0; i < arraySize; i++)
        {
            cout << (temp[i] - c[i]) << ",";
        }
    }
    return 0;
}
Those two kernels show different results.

The kernel code you posted is perfectly correct and produces the expected results. This can be demonstrated using the following code:
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
#include <vector>
#include <algorithm>
#include <cmath>

template<const int n>
__device__
void saxpy_unrolled(float *out, const float *px, const float *py,
                    size_t N, float alpha, float beta) {
    float x[n], y[n];
    size_t i;
    for ( i = n*blockIdx.x*blockDim.x+threadIdx.x;
          i < N-n*blockDim.x*gridDim.x;
          i += n*blockDim.x*gridDim.x ) {
        for ( int j = 0; j < n; j++ ) {
            size_t index = i+j*blockDim.x;
            x[j] = px[index];
            y[j] = py[index];
        }
        for ( int j = 0; j < n; j++ ) {
            size_t index = i+j*blockDim.x;
            out[index] = alpha*x[j]+beta*y[j];
        }
    }
    for ( int j = 0; j < n; j++ ) {
        for ( int j = 0; j < n; j++ ) {
            size_t index = i+j*blockDim.x;
            if ( index<N ) {
                x[j] = px[index];
                y[j] = py[index];
            }
        }
        for ( int j = 0; j < n; j++ ) {
            size_t index = i+j*blockDim.x;
            if ( index<N ) {
                out[index] = alpha*x[j] + beta*y[j];
            }
        }
    }
}

__global__
void saxpyGPU( float *out, const float *px, const float *py,
               size_t N, float alpha, float beta ) {
    saxpy_unrolled<4>( out, px, py, N, alpha, beta );
}

struct prg {
    float a, b;

    __host__ __device__
    prg(float _a=0.f, float _b=1.f) : a(_a), b(_b) {};

    __host__ __device__
    float operator()(const unsigned int n) const {
        thrust::default_random_engine rng;
        thrust::uniform_real_distribution<float> dist(a, b);
        rng.discard(n);
        return dist(rng);
    }
};

int main(void) {
    const int N = 100000;
    const float alpha = 0.12345f, beta = 0.9876f;
    prg gen(1.f, 2.f);

    thrust::device_vector<float> x(N), y(N), z(N);
    thrust::counting_iterator<unsigned int> iseqx(0);
    thrust::counting_iterator<unsigned int> iseqy(N);
    thrust::transform(iseqx, iseqx + N, x.begin(), gen);
    thrust::transform(iseqy, iseqy + N, y.begin(), gen);

    float *xp = thrust::raw_pointer_cast(&x[0]);
    float *yp = thrust::raw_pointer_cast(&y[0]);
    float *zp = thrust::raw_pointer_cast(&z[0]);

    dim3 blockdim(128);
    dim3 griddim(16);
    saxpyGPU<<<griddim, blockdim>>>(zp, xp, yp, N, alpha, beta);
    cudaDeviceSynchronize();

    std::vector<float> xh(N), yh(N), zh(N);
    thrust::copy(x.begin(), x.end(), xh.begin());
    thrust::copy(y.begin(), y.end(), yh.begin());
    thrust::copy(z.begin(), z.end(), zh.begin());

    float maxabserr = -1.f, maxrelerr = -1.f;
    for(int i=0; i<N; i++) {
        float saxpyval = alpha * xh[i] + beta * yh[i];
        float abserr = fabs(zh[i]-saxpyval);
        float relerr = abserr / fmaxf(fabs(zh[i]), fabs(saxpyval));
        maxabserr = fmaxf(abserr, maxabserr);
        maxrelerr = fmaxf(relerr, maxrelerr);
    }
    std::cout.precision(10);
    std::cout << "Maximum absolute error = " << maxabserr << std::endl;
    std::cout << "Maximum relative error = " << maxrelerr << std::endl;

    return 0;
}
which gives me the following:
$ nvcc -arch=sm_30 -o unrolled_saxpy unrolled_saxpy.cu
$ ./unrolled_saxpy
Maximum absolute error = 2.384185791e-07
Maximum relative error = 1.1920676e-07
If you (still) do not understand why the kernel is written as it is, follow what I showed you in your previous question and manually unroll the saxpy function. Start with n=1 and confirm it is functionally the same as your simple, non-unrolled kernel, and then try n=2, n=4, etc. to see what the action of loop unrolling is.
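To make that concrete, here is a sketch (mine, not taken from the handbook) of what saxpy_unrolled<1> collapses to once the single-iteration j loops are removed. It assumes N >= blockDim.x*gridDim.x so that the size_t subtraction in the loop bound does not wrap, and it should behave exactly like the plain grid-stride kernel, with one guarded cleanup pass at the end:
// Sketch of saxpy_unrolled<1> after manual expansion.
__global__
void saxpy_n1(float *out, const float *px, const float *py,
              size_t N, float alpha, float beta)
{
    size_t i;
    // Main loop: every index visited here is known to be < N, so no bounds check is needed.
    for ( i = blockIdx.x*blockDim.x + threadIdx.x;
          i < N - blockDim.x*gridDim.x;
          i += blockDim.x*gridDim.x ) {
        out[i] = alpha*px[i] + beta*py[i];
    }
    // Cleanup pass: only the last partial stride needs the (i < N) guard.
    if ( i < N ) {
        out[i] = alpha*px[i] + beta*py[i];
    }
}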

Related

Cuda passing an array of structs

I am new to CUDA and am trying to parallelize the very simple program shown below, which was inspired by this link: https://devblogs.nvidia.com/even-easier-introduction-cuda/
typedef struct{
    int temp;
    int newtemp;
    int neighbors[20];
} S;

void add(int n, S * s){
    for(int i = 0; i < n; i++){
        int newTemp = 0;
        for(int j = 0; j < 20; j++){
            newTemp += s[s[i].neighbors[j]].temp;
        }
        newTemp /= 3;
        s[i].newtemp = newTemp;
    }
}

int main(int argc, char *argv[]){
    int n = 1<<21;
    S grid[n];
    for(int i = 0; i < n; i++){
        S tmp1;
        tmp1.temp = rand();
        for(int j = 0; j<20; j++){
            tmp1.neighbors[j] = rand()%n;
        }
        grid[i] = tmp1;
    }
    struct timespec start, end;
    double gettime_diff, time_diff;
    clock_t t, starttime, endtime;
    clock_gettime(CLOCK_REALTIME, &start);
    t = clock();
    time(&starttime);
    add(n,grid);
    for(int i = 0; i < n; i++){
        grid[i].temp = grid[i].newtemp;
        if(i%83940==1)printf("%d\n",grid[i].temp);
    }
    return 0;
}
However, I am not getting the desired results: when I update temp, all the new values are 0. I think the issue is that the array of structs I am passing to my add function cannot be accessed in device memory, but I am having a hard time figuring out how to fix this. I found this post on Stack Overflow and am a little unsure what the suggested answer did to fix the issue: Array of structs of arrays CUDA C
The CUDA code I have for reference is here:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define SIZE 1000
#define NS_PER_US 1000

typedef struct{
    int temp;
    int newtemp;
    int neighbors[20];
} S;

__global__ void add(int n, S * s){
    int index = threadIdx.x;
    int stride = blockDim.x;
    //printf("%d\n",(n-index)/stride);
    //printf("%d\n",s[0].temp);
    for(int i = index; i < n; i+=stride){
        printf("%d\n",index);
        int newTemp = 0;
        for(int j = 0; j < 20; j++){
            newTemp += s[s[i].neighbors[j]].temp;
        }
        printf("%d\n",index);
        newTemp /= 3;
        s[i].newtemp = newTemp;
    }
}

int main(int argc, char *argv[]){
    int *h_a;
    int *d_a;
    int num_blocks= 2;
    int num_th_per_blk= 5;
    int n = 1<<21;
    S grid[n];
    for(int i = 0; i < n; i++){
        S tmp1;
        tmp1.temp = rand();
        for(int j = 0; j<20; j++){
            tmp1.neighbors[j] = rand()%n;
        }
        grid[i] = tmp1;
    }
    struct timespec start, end;
    double gettime_diff, time_diff;
    clock_t t, starttime, endtime;
    clock_gettime(CLOCK_REALTIME, &start);
    t = clock();
    time(&starttime);
    size_t memSize;
    memSize = num_blocks* num_th_per_blk* sizeof(int);
    h_a= (int*) malloc(memSize);
    cudaMallocManaged((void **)&grid, n * sizeof(S));
    cudaMalloc( (void**) &d_a, memSize);
    dim3 dimGrid(num_blocks);
    dim3 dimBlock(num_th_per_blk);
    add<<< dimGrid, dimBlock >>>(n,grid);
    cudaMemcpy( h_a, d_a, memSize,cudaMemcpyDeviceToHost);
    for(int i = 0; i < n; i++){
        grid[i].temp = grid[i].newtemp;
        if(i%83940==1)printf("%d\n",grid[i].newtemp);
    }
    clock_gettime(CLOCK_REALTIME, &end);
    t = clock() - t;
    time(&endtime);
    gettime_diff = (double) ((end.tv_sec - start.tv_sec)*CLOCKS_PER_SEC) + (double)((end.tv_nsec - start.tv_nsec)/NS_PER_US);
    time_diff = difftime(endtime, starttime);
    printf("\ttime (clock_gettime) %f\n", gettime_diff);
    printf("\ttime (clock) %f\n", ((float)t)/CLOCKS_PER_SEC);
    printf("\ttime (time) %f\n", time_diff);
    return 0;
}
I feel like there is a simple fix here that I am not seeing, or maybe I am missing a key concept. Whatever the case, any help would be greatly appreciated.
There is actually a lot wrong in your code, so much so that it is easier to post a working version than point out all the individual mistakes:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define NS_PER_US 1000

typedef struct{
    int temp;
    int newtemp;
    int neighbors[20];
} S;

__global__
void add(int n, S * s)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    for(int i = index; i < n; i+=stride){
        int newTemp = 0;
        for(int j = 0; j < 20; j++){
            newTemp += s[s[i].neighbors[j]].temp;
        }
        newTemp /= 3;
        s[i].newtemp = newTemp;
    }
}

int main(int argc, char *argv[]){
    int n = 1<<10;
    S* grid;
    cudaMallocManaged((void **)&grid, n * sizeof(S));
    for(int i = 0; i < n; i++){
        S tmp1;
        tmp1.temp = rand()%n;
        for(int j = 0; j<20; j++){
            tmp1.neighbors[j] = rand()%n;
        }
        grid[i] = tmp1;
    }
    struct timespec start, end;
    double gettime_diff, time_diff;
    clock_t t, starttime, endtime;
    clock_gettime(CLOCK_REALTIME, &start);
    t = clock();
    time(&starttime);
    int num_th_per_blk= 32;
    int num_blocks = (n / num_th_per_blk) + ((n % num_th_per_blk > 0) ? 1 : 0);
    dim3 dimGrid(num_blocks);
    dim3 dimBlock(num_th_per_blk);
    add<<< dimGrid, dimBlock >>>(n,grid);
    cudaDeviceSynchronize();
    for(int i = 0; i < n; i++){
        grid[i].temp = grid[i].newtemp;
        if(i%10==1)printf("%d %d\n",i,grid[i].temp);
    }
    clock_gettime(CLOCK_REALTIME, &end);
    t = clock() - t;
    time(&endtime);
    gettime_diff = (double) ((end.tv_sec - start.tv_sec)*CLOCKS_PER_SEC) + (double)((end.tv_nsec - start.tv_nsec)/NS_PER_US);
    time_diff = difftime(endtime, starttime);
    printf("\ttime (clock_gettime) %f\n", gettime_diff);
    printf("\ttime (clock) %f\n", ((float)t)/CLOCKS_PER_SEC);
    printf("\ttime (time) %f\n", time_diff);
    return 0;
}
The most egregious error is how you handle grid in the host code. Doing this:
S grid[n];
// code initializing grid
cudaMallocManaged((void **)&grid, n * sizeof(S));
is both illegal (grid is an array, not a pointer, so you cannot assign it a new pointer value) and nonsensical. cudaMallocManaged allocates new memory, so all you are doing is initializing grid, then throwing away all the carefully initialized memory and replacing it with uninitialized memory which you pass to the kernel. The kernel then operates on random data. Note also that the grid-stride loop within the kernel is incorrect, and both the original code and the CUDA version potentially suffer from integer overflow because the temp members of the structure are initialized with rand() in both versions.
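A minimal sketch of the intended pattern (assuming managed memory is what you want): allocate first, then initialize that allocation in place, and only then launch the kernel. This mirrors what the working version above does:
// Sketch only: allocate managed memory before touching it, so the same
// allocation is visible to both the host initialization loop and the kernel.
S *grid = NULL;
cudaMallocManaged((void **)&grid, n * sizeof(S));   // allocate first
for (int i = 0; i < n; i++) {                       // then initialize in place
    grid[i].temp = rand() % n;
    for (int j = 0; j < 20; j++) grid[i].neighbors[j] = rand() % n;
}
add<<<dimGrid, dimBlock>>>(n, grid);                // the kernel now sees the initialized data
cudaDeviceSynchronize();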

CUDA batched Cholesky factorization

I kind of understand how to deal with 2D CUDA, but batched Cholesky has a 4D loop nest towards the end of the algorithm. I have attached the Cholesky code and my CUDA code; I would appreciate a hint.
int i, k, m, n;

// Batched Cholesky factorization.
for (i = 0; i < batch; i++) {
    float *pA = &dA[i*N*N];

    // Single Cholesky factorization.
    for (k = 0; k < N; k++) {
        // Panel factorization.
        pA[k*N+k] = sqrtf(pA[k*N+k]);
        for (m = k+1; m < N; m++)
            pA[k*N+m] /= pA[k*N+k];

        // Update of the trailing submatrix.
        for (n = k+1; n < N; n++)
            for (m = n; m < N; m++)
                pA[n*N+m] -= (pA[k*N+n]*pA[k*N+m]);
    }
}
CUDA kernel body:
int i = blockIdx.x * blockDim.x + threadIdx.x;
int k = blockIdx.y * blockDim.y + threadIdx.y;
int m = blockIdx.z * blockDim.z + threadIdx.z;
int n = blockIdx.z * blockDim.z + threadIdx.z;
if( k >= N || m >= N || n >= N || i >= batch ) return;
float *pA = &dA[i*N*N];
pA[k*N+k] = sqrtf(pA[k*N+k]);
pA[k*N+m] /= pA[k*N+k];
pA[n*N+m] -= (pA[k*N+n]*pA[k*N+m]);
Kernel launch:
dim3 dimBlock( (batch+31)/32, (n+31)/32, (n+31)/32 );
dim3 dimGrid( 32, 32, 32);
spotrf_batched_kernel<<< dimBlock, dimGrid, 0, stream>>>(n, batch, dA);
I am going to leave this here without much comment. The code is relatively self-explanatory. This implementation is completely faithful to your serial version, with the following features:
Each block performs exactly one factorization in the batch. Run as many blocks as there are batched matrices to factorize.
Because the factorization is all done at block scope, synchronization between parallel operations is possible, so the order of operations of the factorization is respected.
The only parallelism the algorithm exposes is within the row operations of the factorization and the update operations.
Blocks should be sized according to the number of rows in the batched matrices, rounded up to a multiple of the warp size (32 on all CUDA capable devices to date).
The code below has been extremely lightly tested and is not guaranteed to work or be correct. Use at your own peril:
#include <iostream>
#include <algorithm>

__global__
void batchkernel(float** batches, int nbatches, int N, int LDA)
{
    if (blockIdx.x < nbatches) {
        float* pA = batches[blockIdx.x];

        for (int k = 0; k < N; k++) {
            // Panel factorization.
            if (threadIdx.x == 0) {
                pA[k*LDA+k] = sqrtf(pA[k*LDA+k]);
            }
            __syncthreads();

            for (int m = threadIdx.x; ((m < N) && (threadIdx.x > k)); m+=blockDim.x) {
                pA[k*LDA+m] /= pA[k*LDA+k];
            }
            __syncthreads();

            // Update of the trailing submatrix.
            for (int n = k+1; (n < N); n++) {
                for (int m = threadIdx.x; ((m < N) && (threadIdx.x >= n)); m+=blockDim.x) {
                    pA[n*LDA+m] -= pA[k*LDA+n] * pA[k*LDA+m];
                }
            }
            __syncthreads();
        }
    }
}

void refCholeskey(float* pA, int N)
{
    int k, m, n;

    // Single Cholesky factorization.
    for (k = 0; k < N; k++) {
        // Panel factorization.
        pA[k*N+k] = sqrtf(pA[k*N+k]);
        for (m = k+1; m < N; m++)
            pA[k*N+m] /= pA[k*N+k];

        // Update of the trailing submatrix.
        for (n = k+1; n < N; n++)
            for (m = n; m < N; m++)
                pA[n*N+m] -= (pA[k*N+n]*pA[k*N+m]);
    }
}

int main()
{
    // B = np.random.random((10,10))
    // SPDmatrix = (0.5*(B+B.T)) + B.shape[0]*np.eye(B.shape[0])
    const int N = 10;
    const int LDA = 10;
    float SPDmatrix[LDA*N] = {
10.22856331, 0.17380577, 0.61779525, 0.66592082, 0.46915566,
0.09946502, 0.69386511, 0.35224291, 0.53155506, 0.51441469,
0.17380577, 10.67971161, 0.34481401, 0.64766522, 0.22372943,
0.55896022, 0.59083588, 0.48872497, 0.54049871, 0.74764959,
0.61779525, 0.34481401, 10.229388, 0.40904432, 0.5015491,
0.52152334, 0.19684814, 0.28262256, 0.04384535, 0.61919751,
0.66592082, 0.64766522, 0.40904432, 10.78410647, 0.12708693,
0.3241063, 0.6984497, 0.65074097, 0.08027563, 0.56332844,
0.46915566, 0.22372943, 0.5015491, 0.12708693, 10.52234091,
0.76346103, 0.80932473, 0.8234331, 0.52737611, 0.65777357,
0.09946502, 0.55896022, 0.52152334, 0.3241063, 0.76346103,
10.54906761, 0.32865411, 0.32467483, 0.80720007, 0.36287463,
0.69386511, 0.59083588, 0.19684814, 0.6984497, 0.80932473,
0.32865411, 10.29729551, 0.34707933, 0.69379356, 0.87612982,
0.35224291, 0.48872497, 0.28262256, 0.65074097, 0.8234331,
0.32467483, 0.34707933, 10.42929929, 0.78849458, 0.159371,
0.53155506, 0.54049871, 0.04384535, 0.08027563, 0.52737611,
0.80720007, 0.69379356, 0.78849458, 10.49604818, 0.43871288,
0.51441469, 0.74764959, 0.61919751, 0.56332844, 0.65777357,
0.36287463, 0.87612982, 0.159371, 0.43871288, 10.94535485 };
    const int nbatches = 8;
    float** batches;
    cudaMallocManaged((void **)&batches, nbatches * sizeof(float*));
    for(int i=0; i<nbatches; i++) {
        cudaMallocManaged((void **)&batches[i], N * LDA * sizeof(float));
        cudaMemcpy(batches[i], SPDmatrix, N * LDA * sizeof(float), cudaMemcpyDefault);
    }

    int blocksz = 32;
    int nblocks = nbatches;
    batchkernel<<<nblocks, blocksz>>>(batches, nbatches, N, LDA);

    refCholeskey(SPDmatrix, N);

    cudaDeviceSynchronize();

    float maxabsrelerror = 0.0f;
    for(int i = 0; i < N*N; i++) {
        float absrelerror = std::fabs(SPDmatrix[i] - batches[0][i]) / std::fabs(SPDmatrix[i]);
        maxabsrelerror = std::max(absrelerror, maxabsrelerror);
    }
    std::cout << "Maximum absolute relative error = " << maxabsrelerror << std::endl;

    cudaDeviceReset();
    return 0;
}
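If you want a slightly stronger check, a possible extension (my sketch, not part of the original answer) is to compare every matrix in the batch against the CPU reference rather than only batches[0], since each block wrote a separate factorization:
// Sketch: verify all nbatches factorizations against the already-factored SPDmatrix.
float maxerr = 0.0f;
for (int b = 0; b < nbatches; b++) {
    for (int i = 0; i < N * LDA; i++) {
        float err = std::fabs(SPDmatrix[i] - batches[b][i]) / std::fabs(SPDmatrix[i]);
        maxerr = std::max(err, maxerr);
    }
}
std::cout << "Maximum error over all batches = " << maxerr << std::endl;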

Thrust/CUDA replicate an array multiple times combined with the values of another array

Let's say I have two arrays
A = {1, 2, 3}
and
B = {10,20,30,40,50}
I want to generate a new array which would have a size of
sizeof(A) * sizeof(B)
I want to replicate B sizeof(A) times, and on each repetition i, the resultant array should have A[i] added to it. So the result would be something like
{11,21,31,41,51,12,22,32,42,52,13,23,33,43,53}
This task can be interpreted as a 2-dimensional problem where the output array can be treated as a matrix of dimensions sizeof(A) times sizeof(B). In this way, we can use 2D CUDA indexing to achieve the desired functionality. A sample CUDA C++ code of this 2D implementation is shown below:
#include <iostream>
#include <cuda_runtime.h>
#include <cassert>

using namespace std;

__global__ void kernel_replicate(int* a, int* b, int* c, int alen, int blen, int clen)
{
    const int ai = blockIdx.x * blockDim.x + threadIdx.x;
    const int bi = blockIdx.y * blockDim.y + threadIdx.y;
    if(ai<alen && bi<blen)
    {
        const int ci = ai * blen + bi;
        c[ci] = a[ai] + b[bi];
    }
}

void replicate_device(int* a, int* b, int* c, int alen, int blen, int clen)
{
    dim3 block(16,16);
    dim3 grid;
    grid.x = (alen + block.x - 1) / block.x;
    grid.y = (blen + block.y - 1) / block.y;
    kernel_replicate<<<grid, block>>>(a,b,c,alen,blen,clen);
    assert(cudaSuccess == cudaDeviceSynchronize());
}

void replicate(int* a, int* b, int* c, int alen, int blen, int clen)
{
    int *ad, *bd, *cd;
    size_t abytes = alen * sizeof(int);
    size_t bbytes = blen * sizeof(int);
    size_t cbytes = clen * sizeof(int);

    cudaMalloc(&ad, abytes);
    cudaMalloc(&bd, bbytes);
    cudaMalloc(&cd, cbytes);

    cudaMemcpy(ad,a, abytes, cudaMemcpyHostToDevice);
    cudaMemcpy(bd,b, bbytes, cudaMemcpyHostToDevice);

    replicate_device(ad,bd,cd, alen,blen,clen);

    cudaMemcpy(c,cd, cbytes, cudaMemcpyDeviceToHost);

    cudaFree(ad);
    cudaFree(bd);
    cudaFree(cd);
}

int main()
{
    const int alen = 3;
    const int blen = 5;
    const int clen = alen * blen;

    int A[alen] = {1,2,3};
    int B[blen] = {10,20,30,40,50};
    int C[clen] = {0};

    replicate(A,B,C,alen, blen, clen);

    for(int i=0; i<alen; i++)
    {
        cout<<A[i]<<" ";
    }
    cout<<endl;

    for(int i=0; i<blen; i++)
    {
        cout<<B[i]<<" ";
    }
    cout<<endl;

    for(int i=0; i<clen; i++)
    {
        cout<<C[i]<<" ";
    }
    cout<<endl;

    return 0;
}
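Since the question title also mentions Thrust, here is a rough Thrust-only alternative (a sketch of mine, not part of the answer above, assuming a functor that reads raw device pointers is acceptable). Each output index i simply maps to A[i / blen] + B[i % blen]:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>

// Functor: output element i is a[i / blen] + b[i % blen].
struct replicate_add
{
    const int* a;
    const int* b;
    int blen;
    replicate_add(const int* a_, const int* b_, int blen_) : a(a_), b(b_), blen(blen_) {}
    __host__ __device__ int operator()(int i) const { return a[i / blen] + b[i % blen]; }
};

int main()
{
    const int alen = 3, blen = 5;
    thrust::host_vector<int> hA(alen), hB(blen);
    hA[0] = 1; hA[1] = 2; hA[2] = 3;
    hB[0] = 10; hB[1] = 20; hB[2] = 30; hB[3] = 40; hB[4] = 50;

    thrust::device_vector<int> A = hA, B = hB, C(alen * blen);
    thrust::counting_iterator<int> first(0);
    thrust::transform(first, first + alen * blen, C.begin(),
                      replicate_add(thrust::raw_pointer_cast(A.data()),
                                    thrust::raw_pointer_cast(B.data()), blen));

    for (int i = 0; i < alen * blen; i++) std::cout << C[i] << " ";
    std::cout << std::endl;
    return 0;
}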

prefix scan for large arrays

I want to write a prefix scan for large arrays using the instructions in GPU Gems; it's homework for my parallel computing class. I followed all the steps in the book, but my code is still not working. I got it to work for array size 4096, but it fails for larger arrays. Here is my code:
#include <stdio.h>
#include <sys/time.h>

#define THREADS 1024

typedef int mytype;

__global__ void phaseI(mytype *g_odata, mytype *g_idata, int n, mytype *aux)
{
    __shared__ mytype temp[THREADS];
    const int tid1 = threadIdx.x;
    int offset = 1;

    temp[2*tid1] = g_idata[2*tid1]; // load input into shared memory
    temp[2*tid1+1] = g_idata[2*tid1+1];

    for (int d = THREADS>>1; d > 0; d >>= 1) // build sum in place up the tree
    {
        __syncthreads();
        if (tid1 < d)
        {
            int ai = offset*(2*tid1+1)-1;
            int bi = offset*(2*tid1+2)-1;
            temp[bi] += temp[ai];
        }
        offset *= 2;
    }
    __syncthreads();

    if (tid1 == 0) {
        aux[blockIdx.x] = temp[THREADS - 1];
        temp[THREADS - 1] = 0;
    }

    for (int d = 1; d < THREADS; d *= 2) // traverse down tree & build scan
    {
        offset >>= 1;
        __syncthreads();
        if (tid1 < d)
        {
            int ai = offset*(2*tid1+1)-1;
            int bi = offset*(2*tid1+2)-1;
            mytype t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();

    g_odata[2*thid] = temp[2*thid]; // write results to device memory
    g_odata[2*thid+1] = temp[2*thid+1];
}

__global__ void phaseII(mytype *g_odata, mytype *aux, int n)
{
    const int tid1 = threadIdx.x;
    const int B = (n / THREADS);
    int offset = 1;

    for (int d = B>>1; d > 0; d >>= 1) // build sum in place up the tree
    {
        __syncthreads();
        if (tid1 < d)
        {
            int ai = offset*(2*tid1+1)-1;
            int bi = offset*(2*tid1+2)-1;
            temp[bi] += temp[ai];
        }
        offset *= 2;
    }
    __syncthreads();

    if (tid1 == 0 && blockIdx.x == 0) {
        aux[B - 1] = 0;
    }

    for (int d = 1; d < B; d *= 2) // traverse down tree & build scan
    {
        offset >>= 1;
        __syncthreads();
        if (tid1 < d)
        {
            int ai = offset*(2*tid1+1)-1;
            int bi = offset*(2*tid1+2)-1;
            mytype t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();

    g_odata[2*thid] += aux[blockIdx.x];
    g_odata[2*thid+1] += aux[blockIdx.x];
}

int main(int argc, char *argv[])
{
    if (argc != 2) {
        printf("usage: %s n\n", argv[0]);
        return -1;
    }
    const int n = atoi(argv[1]);

    mytype *h_i, *d_i, *h_o, *d_o, *d_temp;
    const int size = n * sizeof(mytype);

    h_i = (mytype *)malloc(size);
    h_o = (mytype *)malloc(size);
    if ((h_i == NULL) || (h_o == NULL)) {
        printf("malloc failed\n");
        return -1;
    }

    for (int i = 0; i < n; i++) {
        h_i[i] = i;
        h_o[i] = 0;
    }

    cudaMalloc(&d_i, size);
    cudaMalloc(&d_temp, (n / THREADS) );
    cudaMalloc(&d_o, size);
    cudaMemset(d_o, 0, size);
    cudaMemset(d_temp, 0, (n / THREADS));
    cudaMemcpy(d_i, h_i, size, cudaMemcpyHostToDevice);

    int blocks = n / THREADS;
    phaseI<<<blocks, THREADS / 2 >>>(d_o, d_i, n, d_temp);
    phaseII<<<blocks, THREADS / 2>>>(d_o, d_temp, n);
    cudaThreadSynchronize();
    cudaMemcpy(h_o, d_o, size, cudaMemcpyDeviceToHost);

    printf("\n");
    for (int i = 0; i < n ; i++) {
        printf(" %d", h_o[i]);
    }
    printf("\n\n");

    return 0;
}
Does anyone have any idea what I'm doing wrong?
One possible error I see in your code is here:
aux[thid] = temp[THREADS];
If your temp array is temp[1024], as you say, and each block has 1024 threads, as you say, then if THREADS is 1024, temp[THREADS] will access your shared memory array out of bounds (one past the end). An array of 1024 elements only has valid indices from 0 to 1023.
Beyond that, it seems like you're asking how to take the last element out of a shared memory array (temp) and place it in a position in a (presumably global) aux array, which has one element for each block.
Here's a fully worked example:
$ cat t831.cu
#include <stdio.h>
#define THREADS 1024
#define BLOCKS 20
__global__ void kernel(int *aux){
    __shared__ int temp[THREADS];
    temp[threadIdx.x] = threadIdx.x + blockIdx.x;
    __syncthreads();
    if (threadIdx.x == 0)
        aux[blockIdx.x] = temp[THREADS-1];
}

int main(){
    int *h_data, *d_data;
    const int dsize = BLOCKS*sizeof(int);
    h_data=(int *)malloc(dsize);
    cudaMalloc(&d_data, dsize);
    memset(h_data, 0, dsize);
    cudaMemset(d_data, 0, dsize);
    kernel<<<BLOCKS, THREADS>>>(d_data);
    cudaMemcpy(h_data, d_data, dsize, cudaMemcpyDeviceToHost);
    for (int i = 0; i < BLOCKS; i++) printf("%d, ", h_data[i]);
    printf("\n");
    return 0;
}
$ nvcc -o t831 t831.cu
$ cuda-memcheck ./t831
========= CUDA-MEMCHECK
1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042,
========= ERROR SUMMARY: 0 errors
$

The Floyd-Warshall algorithm in CUDA

This is the sequential piece of code I am trying to parallelize in CUDA
/*
    Sequential (Single Thread) APSP on CPU.
*/
void floyd_sequential(int *mat, const size_t N)
{
    for(int k = 0; k < N; k ++)
        for(int i = 0; i < N; i ++)
            for(int j = 0; j < N; j ++)
            {
                int i0 = i*N + j;
                int i1 = i*N + k;
                int i2 = k*N + j;
                if(mat[i1] != -1 && mat[i2] != -1)
                    mat[i0] = (mat[i0] != -1 && mat[i0] < mat[i1] + mat[i2]) ?
                              mat[i0] : (mat[i1] + mat[i2]);
            }
}
This is my CUDA implementation
// ParallelComputing.cpp : Defines the entry point for the console application.
//

#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>

#define DIMENSION 10;

__global__ void gpu_Floyd(int *result, int N)
{
    int j,k;
    int Row = blockIdx.y * blockDim.y + threadIdx.y;

    for(k = 0; k < N; k++)
    {
        for(j = 0; j < N; j++)
        {
            int i0 = Row * N + j;
            int i1 = Row * N + k;
            int i2 = k * N + j;
            if(result[i0] != -1 && result[i2] != -1)
                result[i0] = (result[i0] != -1 && result[i0] < result[i1] + result[i2]) ?
                             result[i0] : (result[i1] + result[i2]);
            __syncthreads();
        }
    }
}

void GenMatrix(int *mat, const size_t N)
{
    for(int i = 0; i < N*N; i ++)
        mat[i] = rand()%32 - 1;
}

bool CmpArray(const int *l, const int *r, const size_t eleNum)
{
    for(int i = 0; i < eleNum; i ++)
        if(l[i] != r[i])
        {
            printf("ERROR: l[%d] = %d, r[%d] = %d\n", i, l[i], i, r[i]);
            return false;
        }
    return true;
}

int main(int argc, char **argv)
{
    // generate a random matrix.
    size_t N = 10;
    int *mat = (int*)malloc(sizeof(int)*N*N);
    GenMatrix(mat, N);

    // compute the reference result.
    int *ref = (int*)malloc(sizeof(int)*N*N);
    memcpy(ref, mat, sizeof(int)*N*N);
    Floyd_sequential(ref, N);

    //CUDA Portion
    int Grid_Dim_x = 1, Grid_Dim_y = 1;
    int noThreads_x, noThreads_y;
    int *result = (int*)malloc(sizeof(int)*N*N);
    memcpy(result, mat, sizeof(int)*N*N);
    int *d_result;

    // compute your results
    cudaMalloc((void **)&d_result, N*N);
    cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
    gpu_Floyd<<<1024, 256>>>(d_result, N);
    cudaMemcpy(result, d_result, cudaMemcpyDeviceToHost);

    // compare your result with reference result
    if(CmpArray(result, ref, N*N))
        printf("The matrix matches.\n");
    else
        printf("The matrix do not match.\n");

    free(ref);
    free(result);
    cudaFree(d_result);
}
However, my output always shows the matrices do not match.
I understand that in CUDA we usually try to map each element of the matrix to a thread. However, I am trying to explore possibilities by mapping each row of the matrix to a thread instead.
As has already been mentioned, your provided GPU code does not compile, so I'm curious how you got to the observation that your output matrices do not match.
Here are some of the problems with your code:
cudaMalloc, just like malloc, allocates bytes, so this is not correct:
cudaMalloc((void **)&d_result, N*N);
instead you want this:
cudaMalloc((void **)&d_result, N*N*sizeof(int));
Likewise cudaMemcpy, just like memcpy, operates on bytes, and furthermore cudaMemcpy requires 4 parameters, so this is not correct:
cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
instead you probably want this:
cudaMemcpy(d_result, result, N * N*sizeof(int), cudaMemcpyHostToDevice);
and your other cudaMemcpy line needs to be fixed similarly.
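For example, the device-to-host copy at the end would presumably become something like:
cudaMemcpy(result, d_result, N * N * sizeof(int), cudaMemcpyDeviceToHost);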
I'd also advise doing proper CUDA error checking.
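One common pattern for this (shown here as a sketch; it is the same gpuErrchk idiom used in the Floyd-Warshall code further down) is to wrap every runtime API call in a checking macro:
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Example use:
gpuErrchk(cudaMalloc((void **)&d_result, N * N * sizeof(int)));
gpuErrchk(cudaPeekAtLastError()); // after a kernel launch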
Your kernel is written as if it's expecting a 2 dimensional thread array, or at least one dimensional in y, whereas you are launching a one dimensional grid in x:
gpu_Floyd<<<1024, 256>>>(d_result, N);
therefore all your kernel built-in variables in y will be 1 or 0 always, and this line of code:
int Row = blockIdx.y * blockDim.y + threadIdx.y;
will evaluate to zero for all threads in your 1-D grid in x.
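A launch configuration that would actually populate the y dimension, one thread per matrix row, might look roughly like this (a sketch of mine; the kernel would also need a bounds check such as if (Row >= N) return;):
dim3 threads(1, 256);
dim3 blocks(1, (N + threads.y - 1) / threads.y);
gpu_Floyd<<<blocks, threads>>>(d_result, N);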
Your GPU kernel is putting the results in the same matrix as the input data. For sequential code this may or may not matter, but for code that is intended to run in parallel, it can often lead to race conditions, because the order of operations (i.e. order of thread execution) is largely undefined.
Below you will find a canonical, simple implementation of the Floyd-Warshall algorithm in CUDA.
The CUDA code is accompanied with a sequential implementation and both are based on the simplifying assumption that the edges are non-negative. The full, minimum distance paths are also reconstructed in both the cases. Despite the simplifying assumption, it should be possible to grasp the relevant parallelization idea, namely that a two-dimensional thread grid is exploited and that each thread along x is assigned to a matrix column, while each block along y is assigned to a matrix row. In this way, all the columns are loaded by the threadIdx.x == 0 threads of each block in shared memory.
// --- Assumption: graph with positive edges

#include <stdio.h>
#include <string>
#include <map>
#include <iostream>
#include <fstream>

#include "Utilities.cuh"

#define BLOCKSIZE 256

using namespace std;

map<string, int> nameToNum;              // --- names of vertices
map<string, map<string, int>> weightMap; // --- weights of edges

/************************/
/* READ GRAPH FROM FILE */
/************************/
int *readGraphFromFile(int &N, char *fileName) {

    string vertex1, vertex2;
    ifstream graphFile;
    int currentWeight;
    N = 0;                                       // --- Init the number of found vertices

    graphFile.open(fileName);                    // --- Open the graph file

    graphFile >> vertex1;                        // --- Read first vertex
    while(vertex1 != "--END--") {                // --- Loop until the end of the file is found
        graphFile >> vertex2;                    // --- Read second vertex
        graphFile >> currentWeight;              // --- Read weight between first and second vertex
        if (nameToNum.count(vertex1) == 0) {     // --- If vertex has not yet been added ...
            nameToNum[vertex1] = N;              //     assign a progressive number to the vertex
            weightMap[vertex1][vertex1] = 0;     //     assign a zero weight to the "self-edge"
            N++;                                 // --- Update the found number of vertices
        }
        if (nameToNum.count(vertex2) == 0) {
            nameToNum[vertex2] = N;
            weightMap[vertex2][vertex2] = 0;
            N++;
        }
        weightMap[vertex1][vertex2] = currentWeight; // --- Update weight between vertices 1 and 2
        graphFile >> vertex1;
    }
    graphFile.close();                           // --- Close the graph file

    // --- Construct the array
    int *weightMatrix = (int*) malloc(N * N * sizeof(int));

    // --- Loop over all the vertex couples in the weights matrix
    for (int ii = 0; ii < N; ii++)
        for (int jj = 0; jj < N; jj++)
            weightMatrix[ii * N + jj] = INT_MAX / 2; // --- Init the weights matrix elements to infinity

    map<string, int>::iterator i, j;

    // --- Loop over all the vertex couples in the map
    //     (*i).first and (*j).first are the vertex names in the map, while (*i).second and (*j).second are their corresponding indices
    for (i = nameToNum.begin(); i != nameToNum.end(); ++i)
        for (j = nameToNum.begin(); j != nameToNum.end(); ++j) {

            // --- If there is a connection between vertices (*i).first and (*j).first, then update the weight matrix
            if (weightMap[(*i).first].count((*j).first) != 0)
                weightMatrix[N * (*i).second + (*j).second] = weightMap[(*i).first][(*j).first];
        }

    return weightMatrix;
}

/************************************/
/* PRINT MINIMUM DISTANCES FUNCTION */
/************************************/
void printMinimumDistances(int N, int *a) {

    map<string, int>::iterator i;

    // --- Prints all the node labels at the first row
    for (i = nameToNum.begin(); i != nameToNum.end(); ++i) printf("\t%s", i->first.c_str());
    printf("\n");

    i = nameToNum.begin();

    // --- Loop over the rows
    for (int p = 0; p < N; p++) {

        printf("%s\t", i -> first.c_str());

        // --- Loop over the columns
        for (int q = 0; q < N; q++) {
            int dd = a[p * N + q];
            if (dd != INT_MAX / 2) printf("%d\t", dd);
            else printf("--\t");
        }
        printf("\n");

        i++;
    }
}

void printPathRecursive(int row, int col, int *minimumDistances, int *path, int N) {
    map<string, int>::iterator i = nameToNum.begin();
    map<string, int>::iterator j = nameToNum.begin();
    if (row == col) { advance(i, row); printf("%s\t", i -> first.c_str()); }
    else {
        if (path[row * N + col] == INT_MAX / 2) printf("%i %i %i No path exists\t\n", minimumDistances[row * N + col], row, col);
        else {
            printPathRecursive(row, path[row * N + col], minimumDistances, path, N);
            advance(j, col);
            printf("%s\t", j -> first.c_str());
        }
    }
}

void printPath(int N, int *minimumDistances, int *path) {

    map<string, int>::iterator i;
    map<string, int>::iterator j;

    // --- Loop over the rows
    i = nameToNum.begin();
    for (int p = 0; p < N; p++) {

        // --- Loop over the columns
        j = nameToNum.begin();
        for (int q = 0; q < N; q++) {
            printf("From %s to %s\t", i -> first.c_str(), j -> first.c_str());
            printPathRecursive(p, q, minimumDistances, path, N);
            printf("\n");
            j++;
        }

        i++;
    }
}

/**********************/
/* FLOYD-WARSHALL CPU */
/**********************/
void h_FloydWarshall(int *h_graphMinimumDistances, int *h_graphPath, const int N) {

    for (int k = 0; k < N; k++)
        for (int row = 0; row < N; row++)
            for (int col = 0; col < N; col++) {
                if (h_graphMinimumDistances[row * N + col] > (h_graphMinimumDistances[row * N + k] + h_graphMinimumDistances[k * N + col])) {
                    h_graphMinimumDistances[row * N + col] = (h_graphMinimumDistances[row * N + k] + h_graphMinimumDistances[k * N + col]);
                    h_graphPath[row * N + col] = h_graphPath[k * N + col];
                }
            }
}

/*************************/
/* FLOYD-WARSHALL KERNEL */
/*************************/
__global__ void d_FloydWarshall(int k, int *d_graphMinimumDistances, int *d_graphPath, int N) {

    int col = blockIdx.x * blockDim.x + threadIdx.x; // --- Each thread along x is assigned to a matrix column
    int row = blockIdx.y;                            // --- Each block along y is assigned to a matrix row

    if (col >= N) return;

    int arrayIndex = N * row + col;

    // --- All the blocks load the entire k-th column into shared memory
    __shared__ int d_graphMinimumDistances_row_k;
    if(threadIdx.x == 0) d_graphMinimumDistances_row_k = d_graphMinimumDistances[N * row + k];
    __syncthreads();

    if (d_graphMinimumDistances_row_k == INT_MAX / 2)   // --- If element (row, k) = infinity, no update is needed
        return;

    int d_graphMinimumDistances_k_col = d_graphMinimumDistances[k * N + col];
    if(d_graphMinimumDistances_k_col == INT_MAX / 2)    // --- If element (k, col) = infinity, no update is needed
        return;

    int candidateBetterDistance = d_graphMinimumDistances_row_k + d_graphMinimumDistances_k_col;
    if (candidateBetterDistance < d_graphMinimumDistances[arrayIndex]) {
        d_graphMinimumDistances[arrayIndex] = candidateBetterDistance;
        d_graphPath[arrayIndex] = d_graphPath[k * N + col];
    }
}

/********/
/* MAIN */
/********/
int main() {

    int N = 0;                 // --- Number of vertices

    // --- Read graph array from file
    int *h_graphArray = readGraphFromFile(N, "graph2.txt");
    printf("\n******************\n");
    printf("* Original graph *\n");
    printf("******************\n");
    printMinimumDistances(N, h_graphArray);

    // --- Floyd-Warshall on CPU
    int *h_graphMinimumDistances = (int *) malloc(N * N * sizeof(int));
    int *h_graphPath = (int *) malloc(N * N * sizeof(int));
    memcpy(h_graphMinimumDistances, h_graphArray, N * N * sizeof(int));

    for (int k = 0; k < N; k++)
        for (int l = 0; l < N; l++)
            if (h_graphArray[k * N + l] == INT_MAX / 2) h_graphPath[k * N + l] = INT_MAX / 2;
            else h_graphPath[k * N + l] = k;

    h_FloydWarshall(h_graphMinimumDistances, h_graphPath, N);
    printf("\n*************************\n");
    printf("* CPU result: distances *\n");
    printf("*************************\n");
    printMinimumDistances(N, h_graphMinimumDistances);
    printf("\n********************\n");
    printf("* CPU result: path *\n");
    printf("********************\n");
    printPath(N, h_graphMinimumDistances, h_graphPath);

    // --- Graph array device allocation and host-device memory transfer
    int *d_graphMinimumDistances; gpuErrchk(cudaMalloc(&d_graphMinimumDistances, N * N * sizeof(int)));
    gpuErrchk(cudaMemcpy(d_graphMinimumDistances, h_graphArray, N * N * sizeof(int), cudaMemcpyHostToDevice));
    int *d_graphPath; gpuErrchk(cudaMalloc(&d_graphPath, N * N * sizeof(int)));

    for (int k = 0; k < N; k++)
        for (int l = 0; l < N; l++)
            if (h_graphArray[k * N + l] == INT_MAX / 2) h_graphPath[k * N + l] = INT_MAX / 2;
            else h_graphPath[k * N + l] = k;

    gpuErrchk(cudaMemcpy(d_graphPath, h_graphPath, N * N * sizeof(int), cudaMemcpyHostToDevice));

    // --- Iterations
    for (int k = 0; k < N; k++) {
        d_FloydWarshall <<<dim3(iDivUp(N, BLOCKSIZE), N), BLOCKSIZE>>>(k, d_graphMinimumDistances, d_graphPath, N);
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
#endif
    }

    // --- Copy results back to the host
    gpuErrchk(cudaMemcpy(h_graphMinimumDistances, d_graphMinimumDistances, N * N * sizeof(int), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_graphPath, d_graphPath, N * N * sizeof(int), cudaMemcpyDeviceToHost));

    printf("\n**************\n");
    printf("* GPU result *\n");
    printf("**************\n");
    printMinimumDistances(N, h_graphMinimumDistances);
    printf("\n********************\n");
    printf("* GPU result: path *\n");
    printf("********************\n");
    printPath(N, h_graphMinimumDistances, h_graphPath);
}