Using cudaMemcpy3D to transfer an int*** pointer in CUDA

I am trying to use cudaMemcpy3D to transfer a dynamically allocated 3D matrix (tensor). The tensor is allocated as a contiguous block of memory (see the code below). I tried various combinations of cudaExtent and cudaMemcpy3DParms, but the order of elements gets mixed up. I created the following example to demonstrate the issue:
#include <stdio.h>
int ***alloc_tensor(int Nx, int Ny, int Nz) {
int i, j;
int ***tensor;
tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));
tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));
tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int)));
for(j = 1; j < Ny; j++)
tensor[0][j] = tensor[0][j-1] + Nz;
for(i = 1; i < Nx; i++) {
tensor[i] = tensor[i - 1] + Ny;
tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
for(j = 1; j < Ny; j++)
tensor[i][j] = tensor[i][j - 1] + Nz;
}
return tensor;
}
__global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
int i, j, k;
char *tensorslice;
int *tensorrow;
for (i = 0; i < Nx; i++) {
for (j = 0; j < Ny; j++) {
for (k = 0; k < Nz; k++) {
tensorslice = ((char *)tensor.ptr) + k * tensor.pitch * Nx;
tensorrow = (int *)(tensorslice + i * tensor.pitch);
printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, tensorrow[j]);
}
}
}
}
int main() {
int i, j, k, value = 0;
int Nx = 2, Ny = 6, Nz = 4;
int ***h_tensor;
struct cudaPitchedPtr d_tensor;
h_tensor = alloc_tensor(Nx, Ny, Nz);
cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));
for(i = 0; i < Nx; i++) {
for(j = 0; j < Ny; j++) {
for(k = 0; k < Nz; k++) {
h_tensor[i][j][k] = value++;
printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[i][j][k]);
}
}
}
cudaMemcpy3DParms cpy = { 0 };
cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);
cpy.dstPtr = d_tensor;
cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
cpy.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(&cpy);
kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);
// ... clean-up
}
The output for the host variable (h_tensor) and the device variable (d_tensor) differs, looking like:
h_tensor[0][0][0] = 0
h_tensor[0][0][1] = 1
h_tensor[0][0][2] = 2
h_tensor[0][0][3] = 3
h_tensor[0][1][0] = 4
h_tensor[0][1][1] = 5
h_tensor[0][1][2] = 6
...
d_tensor[0][0][0] = 0
d_tensor[0][0][1] = 12
d_tensor[0][0][2] = 24
d_tensor[0][0][3] = 36
d_tensor[0][1][0] = 1
d_tensor[0][1][1] = 13
d_tensor[0][1][2] = 25
...
What am I doing wrong? What would be the correct way to use cudaMemcpy3D?

Any time you are having trouble with a CUDA code, it's a good idea to do proper CUDA error checking. The code you have posted here, at least, does not run correctly for me - the cudaMemcpy3D line throws an error. This is due to the incorrect make_cudaPitchedPtr call discussed below. (I suspect the code you used to generate the output was not identical to the code you have shown here, but that's just a guess.)
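For reference, a minimal error-checking macro of the kind I mean (one common variant; the name cudaCheck is illustrative, and it assumes <stdio.h> and <stdlib.h> are included):
#define cudaCheck(call) do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error: %s at %s:%d\n", \
                cudaGetErrorString(err), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
// usage: cudaCheck(cudaMemcpy3D(&cpy));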
Your usage of make_cudaPitchedPtr is not correct:
cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);
Review the API documentation. Making a CUDA pitched pointer this way is no different between 2D and 3D, so it makes no sense to pass 3 different dimensions as you are doing. Instead do this:
cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
The remaining issues I found I attribute to an incorrect understanding of 3 dimensions in C. The last subscript on a multiply-subscripted array is the rapidly varying dimension, i.e. it is the one where adjacent values in memory occupy adjacent index values. Your usage of Z as the 3rd dimension is confusing to me because of this. Your host allocation used Nx in the first subscript place, but your device indexing didn't match. There are obviously multiple ways to handle this. If you don't like my arrangement, you can change it, but the host and device indexing must match.
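For reference, with a cudaMalloc3D allocation of extent (Nx * sizeof(int), Ny, Nz), the address of logical element (x, y, z) is computed like this (variable names are illustrative); the corrected kernel below performs exactly this arithmetic:
char *slice = (char *)tensor.ptr + z * tensor.pitch * Ny; // pick the z-slice (slice pitch is pitch * Ny)
int *row = (int *)(slice + y * tensor.pitch);             // pick row y within that slice
int value = row[x];                                       // pick element x within that row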
Anyway, the following code modifications worked for me:
#include <stdio.h>
int ***alloc_tensor(int Nx, int Ny, int Nz) {
int i, j;
int ***tensor;
tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));
tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));
tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int)));
for(j = 1; j < Ny; j++)
tensor[0][j] = tensor[0][j-1] + Nz;
for(i = 1; i < Nx; i++) {
tensor[i] = tensor[i - 1] + Ny;
tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
for(j = 1; j < Ny; j++)
tensor[i][j] = tensor[i][j - 1] + Nz;
}
return tensor;
}
__global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
int i, j, k;
char *tensorslice;
int *tensorrow;
for (i = 0; i < Nx; i++) {
for (j = 0; j < Ny; j++) {
for (k = 0; k < Nz; k++) {
tensorslice = ((char *)tensor.ptr) + k * tensor.pitch * Ny;
tensorrow = (int *)(tensorslice + j * tensor.pitch);
printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, tensorrow[i]);
}
}
}
}
int main() {
int i, j, k, value = 0;
int Nx = 2, Ny = 6, Nz = 4;
int ***h_tensor;
struct cudaPitchedPtr d_tensor;
h_tensor = alloc_tensor(Nz, Ny, Nx);
cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));
for(i = 0; i < Nx; i++) {
for(j = 0; j < Ny; j++) {
for(k = 0; k < Nz; k++) {
h_tensor[k][j][i] = value++;
//printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[i][j][k]);
}
}
}
for(i = 0; i < Nx; i++) {
for(j = 0; j < Ny; j++) {
for(k = 0; k < Nz; k++) {
//h_tensor[i][j][k] = value++;
printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[k][j][i]);
}
}
}
cudaMemcpy3DParms cpy = { 0 };
cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
cpy.dstPtr = d_tensor;
cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
cpy.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(&cpy);
kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);
cudaDeviceSynchronize();
// ... clean-up
}
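For completeness, a sketch of the omitted clean-up, mirroring the three malloc calls in alloc_tensor (error checking omitted):
free(h_tensor[0][0]);    // the contiguous data block
free(h_tensor[0]);       // the row-pointer block
free(h_tensor);          // the slice-pointer block
cudaFree(d_tensor.ptr);  // the pitched device allocation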

Related

CUDA batched Cholesky factorization

I kind of understand how to deal with 2D CUDA, but batched Cholesky has a 4D structure towards the end of the algorithm. I have attached the Cholesky algorithm and my CUDA code; could anyone give me a hint?
int i, k, m, n;
// Batched Cholesky factorization.
for (i = 0; i < batch; i++) {
float *pA = &dA[i*N*N];
// Single Cholesky factorization.
for (k = 0; k < N; k++) {
// Panel factorization.
pA[k*N+k] = sqrtf(pA[k*N+k]);
for (m = k+1; m < N; m++)
pA[k*N+m] /= pA[k*N+k];
// Update of the trailing submatrix.
for (n = k+1; n < N; n++)
for (m = n; m < N; m++)
pA[n*N+m] -= (pA[k*N+n]*pA[k*N+m]);
}
}
CUDA:
int i = blockIdx.x * blockDim.x + threadIdx.x;
int k = blockIdx.y * blockDim.y + threadIdx.y;
int m = blockIdx.z * blockDim.z + threadIdx.z;
int n = blockIdx.z * blockDim.z + threadIdx.z;
if( k >= N || m >= N || n >= N || i >= batch ) return;
float *pA = &dA[i*N*N];
pA[k*N+k] = sqrtf(pA[k*N+k]);
pA[k*N+m] /= pA[k*N+k];
pA[n*N+m] -= (pA[k*N+n]*pA[k*N+m]);
Kernel launch:
dim3 dimBlock( (batch+31)/32, (n+31)/32, (n+31)/32 );
dim3 dimGrid( 32, 32, 32);
spotrf_batched_kernel<<< dimBlock, dimGrid, 0, stream>>>(n, batch, dA);
I am going to leave this here without much comment. The code is relatively self-explanatory. This implementation is completely faithful to your serial version, with the following features:
Each block performs exactly one factorization in the batch. Run as many blocks as there are batched matrices to factorize.
Because the factorization is all done at block scope, synchronization between parallel operations is possible, so the order of operations of the factorization is respected.
The only parallelism the algorithm exposes is within the row operations of the factorization and update steps.
Blocks should be sized according to the number of rows in the batched matrix, rounded up to a multiple of the warp size (32 on all CUDA-capable devices to date); e.g., one warp of 32 threads comfortably covers the N = 10 matrices used below.
The code below has been extremely lightly tested and is not guaranteed to work or be correct. Use at your own peril:
#include <iostream>
#include <algorithm>
#include <cmath>   // for std::fabs
__global__
void batchkernel(float** batches, int nbatches, int N, int LDA)
{
if (blockIdx.x < nbatches) {
float* pA = batches[blockIdx.x];
for (int k = 0; k < N; k++) {
// Panel factorization.
if (threadIdx.x == 0) {
pA[k*LDA+k] = sqrtf(pA[k*LDA+k]);
}
__syncthreads();
for (int m = threadIdx.x; ((m < N) && (threadIdx.x > k)); m+=blockDim.x) {
pA[k*LDA+m] /= pA[k*LDA+k];
}
__syncthreads();
// Update of the trailing submatrix.
for (int n = k+1; (n < N); n++) {
for (int m = threadIdx.x; ((m < N) && (threadIdx.x >= n)); m+=blockDim.x) {
pA[n*LDA+m] -= pA[k*LDA+n] * pA[k*LDA+m];
}
}
__syncthreads();
}
}
}
void refCholeskey(float* pA, int N)
{
int k, m, n;
// Single Cholesky factorization.
for (k = 0; k < N; k++) {
// Panel factorization.
pA[k*N+k] = sqrtf(pA[k*N+k]);
for (m = k+1; m < N; m++)
pA[k*N+m] /= pA[k*N+k];
// Update of the trailing submatrix.
for (n = k+1; n < N; n++)
for (m = n; m < N; m++)
pA[n*N+m] -= (pA[k*N+n]*pA[k*N+m]);
}
}
int main()
{
// B = np.random.random((10,10))
// SPDmatrix = (0.5*(B+B.T)) + B.shape[0]*np.eye(B.shape[0])
const int N = 10;
const int LDA = 10;
float SPDmatrix[LDA*N] = {
10.22856331, 0.17380577, 0.61779525, 0.66592082, 0.46915566,
0.09946502, 0.69386511, 0.35224291, 0.53155506, 0.51441469,
0.17380577, 10.67971161, 0.34481401, 0.64766522, 0.22372943,
0.55896022, 0.59083588, 0.48872497, 0.54049871, 0.74764959,
0.61779525, 0.34481401, 10.229388, 0.40904432, 0.5015491,
0.52152334, 0.19684814, 0.28262256, 0.04384535, 0.61919751,
0.66592082, 0.64766522, 0.40904432, 10.78410647, 0.12708693,
0.3241063, 0.6984497, 0.65074097, 0.08027563, 0.56332844,
0.46915566, 0.22372943, 0.5015491, 0.12708693, 10.52234091,
0.76346103, 0.80932473, 0.8234331, 0.52737611, 0.65777357,
0.09946502, 0.55896022, 0.52152334, 0.3241063, 0.76346103,
10.54906761, 0.32865411, 0.32467483, 0.80720007, 0.36287463,
0.69386511, 0.59083588, 0.19684814, 0.6984497, 0.80932473,
0.32865411, 10.29729551, 0.34707933, 0.69379356, 0.87612982,
0.35224291, 0.48872497, 0.28262256, 0.65074097, 0.8234331,
0.32467483, 0.34707933, 10.42929929, 0.78849458, 0.159371,
0.53155506, 0.54049871, 0.04384535, 0.08027563, 0.52737611,
0.80720007, 0.69379356, 0.78849458, 10.49604818, 0.43871288,
0.51441469, 0.74764959, 0.61919751, 0.56332844, 0.65777357,
0.36287463, 0.87612982, 0.159371, 0.43871288, 10.94535485 };
const int nbatches = 8;
float** batches;
cudaMallocManaged((void **)&batches, nbatches * sizeof(float*));
for(int i=0; i<nbatches; i++) {
cudaMallocManaged((void **)&batches[i], N * LDA * sizeof(float));
cudaMemcpy(batches[i], SPDmatrix, N * LDA * sizeof(float), cudaMemcpyDefault);
}
int blocksz = 32;
int nblocks = nbatches;
batchkernel<<<nblocks, blocksz>>>(batches, nbatches, N, LDA);
refCholeskey(SPDmatrix, N);
cudaDeviceSynchronize();
float maxabsrelerror = 0.0f;
for(int i = 0; i < N*N; i++) {
float absrelerror = std::fabs(SPDmatrix[i] - batches[0][i]) / std::fabs(SPDmatrix[i]);
maxabsrelerror = std::max(absrelerror, maxabsrelerror);
}
std::cout << "Maximum absolute relative error = " << maxabsrelerror << std::endl;
cudaDeviceReset();
return 0;
}

How can I convolve an image in CUDA?

I have a question about image convolution in CUDA. When I test it with a small matrix (16*16) everything is OK. But with a larger matrix, the result changes every time I run it.
I think the problem is the two for loops in the kernel.
__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv,
int img_width, const int img_height,
const int kernel_width, const int kernel_height )
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
float sum = 0;
for ( int j = 0; j < kernel_height; j++ )
{
for ( int i = 0; i < kernel_width; i++ )
{
int dX = x + i - kernel_width / 2;
int dY = y + j - kernel_height / 2;
if ( dX < 0 )
dX = 0;
if ( dX >= img_width )
dX = img_width - 1;
if ( dY < 0 )
dY = 0;
if ( dY >= img_height )
dY = img_height - 1;
const int idMat = j * kernel_width + i;
const int idPixel = dY * img_width + dX;
sum += (float)input[idPixel] * kernelConv[idMat];
}
}
const int idOut = y * img_width + x;
out[idOut] = abs(sum);
}
void image_convolution(float * input,float* output, int img_height, int img_width)
{
int kernel_height = 3;
int kernel_width = 3;
float kernel[] ={ 0,-0.25,0,
-0.25,1,-0.25,
0,-0.25,0
};
float * mask = new float[kernel_height*kernel_width];
for (int i = 0; i < kernel_height*kernel_width; i++)
{
mask[i] = kernel[i];
}
float * d_input, * d_output, * d_kernel;
cudaMalloc(&d_input, img_width*img_height*sizeof(float));
cudaMalloc(&d_output, img_width*img_height*sizeof(float));
cudaMalloc(&d_kernel, kernel_height*kernel_width*sizeof(float));
cudaMemcpy(d_input, input, img_width*img_height*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, mask, kernel_height*kernel_width*sizeof(float), cudaMemcpyHostToDevice);
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height);
cudaMemcpy(output, d_output, img_width*img_height*sizeof(float), cudaMemcpyDeviceToHost);
for (int i=0; i < img_width*img_height; i++)
{
printf("%d, ",(int)output[i]);
}
printf("\n\n");
}
Here is my result. I tested it with a 24*24 image, ran it 2 times, and also wrote a simple function to compare the outputs.
Here is the result when I compare the outputs: there are 32 differences, at indices 240, 241, ...
You have made a fairly common error in your program. When you create a grid of threads like this:
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
you are intentionally creating (usually) extra threads in each dimension, so as to fully cover the problem space (i.e. image size). There is nothing wrong with this. For example, with your 24*24 image and 16*16 blocks, the grid is 2x2 blocks, i.e. 32*32 = 1024 threads, of which 1024 - 576 = 448 fall outside the image.
However, it means we will be launching extra threads, which are outside the valid image dimension. We must ensure that these threads do nothing. The usual approach is to add a thread check to the kernel, so that threads outside the valid image dimensions do nothing. Here's a modified kernel and fully worked example showing that change:
$ cat t1219.cu
#include <iostream>
#include <cstdlib>
const int iw = 1025;
const int ih = 1025;
const int rng = 10;
__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv,
int img_width, const int img_height,
const int kernel_width, const int kernel_height )
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if ((x < img_width) && (y < img_height)){ // thread check
float sum = 0;
for ( int j = 0; j < kernel_height; j++ )
{
for ( int i = 0; i < kernel_width; i++ )
{
int dX = x + i - kernel_width / 2;
int dY = y + j - kernel_height / 2;
if ( dX < 0 )
dX = 0;
if ( dX >= img_width )
dX = img_width - 1;
if ( dY < 0 )
dY = 0;
if ( dY >= img_height )
dY = img_height - 1;
const int idMat = j * kernel_width + i;
const int idPixel = dY * img_width + dX;
sum += (float)input[idPixel] * kernelConv[idMat];
}
}
const int idOut = y * img_width + x;
out[idOut] = abs(sum);
}
}
void image_convolution(float * input,float* output, int img_height, int img_width)
{
int kernel_height = 3;
int kernel_width = 3;
float kernel[] ={ 0,-0.25,0,
-0.25,1,-0.25,
0,-0.25,0
};
float * mask = new float[kernel_height*kernel_width];
for (int i = 0; i < kernel_height*kernel_width; i++)
{
mask[i] = kernel[i];
}
float * d_input, * d_output, * d_kernel;
cudaMalloc(&d_input, img_width*img_height*sizeof(float));
cudaMalloc(&d_output, img_width*img_height*sizeof(float));
cudaMalloc(&d_kernel, kernel_height*kernel_width*sizeof(float));
cudaMemcpy(d_input, input, img_width*img_height*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, mask, kernel_height*kernel_width*sizeof(float), cudaMemcpyHostToDevice);
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height);
cudaMemcpy(output, d_output, img_width*img_height*sizeof(float), cudaMemcpyDeviceToHost);
}
int main(){
float *in, *out;
int is = ih*iw;
in = new float[is];
out = new float[is];
for (int i = 0; i < is; i++) {in[i] = rand()%rng; out[i] = -1;}
image_convolution(in,out, ih, iw);
for (int iy = 1; iy < ih-1; iy++)
for (int ix = 1; ix < iw-1; ix++){
float temp = abs(-0.25 * (in[iy*iw + ix -1] + in[iy*iw + ix +1] + in[(iy-1)*iw + ix] + in[(iy+1)*iw + ix]) + in[iy*iw+ix]);
if (out[iy*iw+ix] != temp) {std::cout << "mismatch x: " << ix << " y: " << iy << " was: " << out[iy*iw+ix] << " should be: " << temp << std::endl; return 1;}}
return 0;
}
$ nvcc -o t1219 t1219.cu
$ cuda-memcheck ./t1219
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
For image dimensions which are exact multiples of the block size (16,16) (which was true for my previous test case) this problem won't show up -- the code will work correctly. For all other test cases, we need such a thread check. (Note the example above deliberately uses a 1025x1025 image, which is not a multiple of 16.)

My CUDA kernel code is not working

I am trying to write a small code that generates numbers and returns the result in an array, but once I run this code it doesn't work. I have tried to use the Nsight debugger to understand where my problem is, but it freezes and closes immediately.
Could you please help me understand where the problem in this code is?
__global__ void mykernel( int* PF_tmp, int* PL_tmp, int* QF_tmp, int* QL_tmp,
int m[2], int p[5], int q[5], int i, int* n,
int out[10][5], int N)
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
int idx = blockIdx.x;
int idy = blockIdx.y;
int w = idx/100;
int x = idx%100;
int y = idy;
int z = threadIdx.x;
int len = ((i * 2) + 5);
// Fill PF_tmp & QF_tmp
if( i > 0){
for(int k = 0; k < (i * 2); k++)
{
p[k] = PF_tmp[k];
q[k] = QF_tmp[k];
}
}
// Fill X
if( x > 10)
{
p[(i*2)] = (x - (x % 10)) / 10;
p[(i*2)+1] = x % 10;
}else{
p[(i*2)] = 0;
p[(i*2)+1] = x;
}
// Fill Y
if( y > 10)
{
q[(i*2)] = (y - (y % 10)) / 10;
q[(i*2)+1] = y % 10;
}else{
q[(i*2)] = 0;
q[(i*2)+1] = y;
}
// Fill m
p[(i * 2)+2] = m[0];
q[(i * 2)+2] = m[1];
// Fill W
if( w > 10)
{
p[(i*2)+3] = (w - (w % 10)) / 10;
p[(i*2)+4] = w % 10;
}else{
p[(i*2)+3] = 0;
p[(i*2)+4] = w;
}
// Fill Z
if( z > 10)
{
q[(i*2)+3] = (z - (z % 10)) / 10;
q[(i*2)+4] = z % 10;
}else{
q[(i*2)+3] = 0;
q[(i*2)+4] = z;
}
// Fill PL_tmp & QL_tmp
if( i > 0)
{
for(int k = 0; k < (i * 2); k++)
{
p[(len-(i * 2))+k] = PL_tmp[k];
q[(len-(i * 2))+k] = QL_tmp[k];
}
}
if(id<10)
{
for(int k =0; k<5; k++)
out[id][k] = p[k];
}
}
int main()
{
cudaError err;
dim3 blocks(10000, 100);
dim3 threads(100);
int m[2] = {4,5};
int hst_out[10][5];
int p[5];
int q[5];
err = cudaMalloc((void **)&p, 5);
err = cudaMalloc((void **)&q, 5);
err = cudaMalloc((void **)&hst_out, 50);
mykernel<<<blocks, threads>>>(NULL, NULL, NULL, NULL, m, p, q, 0, NULL, hst_out, 100000000);
return 0;
}
The error is very obvious; it is all C programming.
When you declare
int m[2] = {4,5};
int hst_out[10][5];
int p[5];
int q[5];
hst_out, p, and q are not pointers, but later they are used as pointers:
err = cudaMalloc((void **)&p, 5);
err = cudaMalloc((void **)&q, 5);
err = cudaMalloc((void **)&hst_out, 50);
so you should have declared them as pointers initially, e.g.,
int *p;
and used them this way:
err = cudaMalloc((void **)&p, 5*sizeof(int));
Notice too that the size you requested is just 5 bytes, whereas I requested 5*sizeof(int).
For more examples, see:
http://cuda-programming.blogspot.sg/2013/03/how-to-avoid-uses-of-cudamalloc-in.html
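To make the fix concrete, here is a minimal sketch of how the host side could look (the device pointer names d_m, d_p, d_q, d_out are illustrative; note that m, being a host array, also needs a device copy, which the answer above does not mention, and the kernel logic itself may still have problems):
int m[2] = {4, 5};
int hst_out[10][5];
int *d_m, *d_p, *d_q;
int (*d_out)[5]; // matches the decayed type of the out[10][5] parameter
cudaMalloc((void **)&d_m, 2 * sizeof(int));
cudaMalloc((void **)&d_p, 5 * sizeof(int));
cudaMalloc((void **)&d_q, 5 * sizeof(int));
cudaMalloc((void **)&d_out, 10 * 5 * sizeof(int));
cudaMemcpy(d_m, m, 2 * sizeof(int), cudaMemcpyHostToDevice);
mykernel<<<blocks, threads>>>(NULL, NULL, NULL, NULL, d_m, d_p, d_q, 0, NULL, d_out, 100000000);
cudaMemcpy(hst_out, d_out, 10 * 5 * sizeof(int), cudaMemcpyDeviceToHost);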

CUDA loop unroll in array add

I want to compute out = alpha * px + beta * py, where px and py are arrays.
I have a simple kernel:
__global__
void saxpyGPU2( float *out, const float *px, const float *py, size_t N, float alpha,float beta )
{
size_t i = blockDim.x*blockIdx.x + threadIdx.x;
while (i < N)
{
out[i] = alpha * px[i] + beta * py[i];
i += blockDim.x*gridDim.x;
}
}
It works, so I want to unroll the loop.
The code in the CUDA Handbook is:
template<const int n>
__device__
void saxpy_unrolled(float *out, const float *px, const float *py, size_t N, float alpha,float beta)
{
float x[n], y[n];
size_t i;
for ( i = n*blockIdx.x*blockDim.x+threadIdx.x; i < N-n*blockDim.x*gridDim.x; i += n*blockDim.x*gridDim.x ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
x[j] = px[index];
y[j] = py[index];
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
out[index] = alpha*x[j]+beta* y[j];
}
}
// to avoid the (index<N) conditional in the inner loop,
// we left off some work at the end
for ( int j = 0; j < n; j++ ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) {
x[j] = px[index];
y[j] = py[index];
}
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) out[index] = alpha*x[j]+beta* y[j];
}
}
}
__global__
void saxpyGPU( float *out, const float *px, const float *py, size_t N, float alpha,float beta )
{
saxpy_unrolled<4>( out, px, py, N, alpha ,beta);
}
I don't understand the second branch, when i >= N - n*blockDim.x*gridDim.x. Why use an outer loop:
for ( int j = 0; j < n; j++ ) {
for ( int j = 0; j < n; j++ )....}
And I tested those two kernels; the first one is OK, but the second one, which I copied from the book, is incorrect.
I initialize two arrays with a[i] = i and b[i] = 10*i for i < 1024, and I want to compute c = alpha*a + beta*b using the two kernels above, but the result from the loop-unrolled kernel is 4.3e8 for every element of c.
This is my test code:
#include <iostream>
using namespace std;
int main(){
int arraySize = 1024;
float* a =new float[arraySize];
float* b =new float[arraySize];
float* c =new float[arraySize];
for (int i =0;i<arraySize;i++)
{
a[i] = 1.0* i;
b[i] = 10.0*i;
c[i] = 0.0;
}
float* d_a;
float* d_b;
float* d_c;
cudaMalloc((void**)&d_a,sizeof(float)*arraySize);
cudaMemcpy(d_a,a,sizeof(float)*arraySize,cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_b,sizeof(float)*arraySize);
cudaMemcpy(d_b,b,sizeof(float)*arraySize,cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_c,sizeof(float)*arraySize);
for (int i=0;i<arraySize;i++)
{
c[i] = a[i] + b[i];
}
dim3 block_size(256,1,1);
dim3 grid_size((arraySize -1)/block_size.x+1,1,1);
float alpha = 1.0;
float beta = 1.0;
bool flag = true;
if(flag)
{
saxpyGPU<<<grid_size,block_size>>>(d_c,d_a,d_b,arraySize,alpha,beta);
float* temp = new float[arraySize];
cudaMemcpy(temp,d_c,arraySize*sizeof(float),cudaMemcpyDeviceToHost);
for (int i = 0;i<arraySize;i++)
{
cout<<(temp[i] - c[i])<<",";
}
}
else
{
saxpyGPU2<<<grid_size,block_size>>>(d_c,d_a,d_b,arraySize,alpha,beta);
float* temp = new float[arraySize];
cudaMemcpy(temp,d_c,arraySize*sizeof(float),cudaMemcpyDeviceToHost);
for (int i = 0;i<arraySize;i++)
{
cout<<(temp[i] - c[i])<<",";
}
}
return 0;
}
Those two kernels show different results.
The kernel code you posted is perfectly correct and produces the expected results. This can be demonstrated using the following code:
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
#include <vector>
#include <algorithm>
#include <cmath>
template<const int n>
__device__
void saxpy_unrolled(float *out, const float *px, const float *py,
size_t N, float alpha,float beta) {
float x[n], y[n];
size_t i;
for ( i = n*blockIdx.x*blockDim.x+threadIdx.x;
i < N-n*blockDim.x*gridDim.x;
i += n*blockDim.x*gridDim.x ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
x[j] = px[index];
y[j] = py[index];
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
out[index] = alpha*x[j]+beta* y[j];
}
}
for ( int j = 0; j < n; j++ ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) {
x[j] = px[index];
y[j] = py[index];
}
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) {
out[index] = alpha*x[j] + beta*y[j];
}
}
}
}
__global__
void saxpyGPU( float *out, const float *px, const float *py,
size_t N, float alpha,float beta ) {
saxpy_unrolled<4>( out, px, py, N, alpha ,beta);
}
struct prg {
float a, b;
__host__ __device__
prg(float _a=0.f, float _b=1.f) : a(_a), b(_b) {};
__host__ __device__
float operator()(const unsigned int n) const {
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> dist(a, b);
rng.discard(n);
return dist(rng);
}
};
int main(void) {
const int N = 100000;
const float alpha = 0.12345f, beta = 0.9876f;
prg gen(1.f, 2.f);
thrust::device_vector<float> x(N), y(N), z(N);
thrust::counting_iterator<unsigned int> iseqx(0);
thrust::counting_iterator<unsigned int> iseqy(N);
thrust::transform(iseqx, iseqx + N, x.begin(), gen);
thrust::transform(iseqy, iseqy + N, y.begin(), gen);
float *xp = thrust::raw_pointer_cast(&x[0]);
float *yp = thrust::raw_pointer_cast(&y[0]);
float *zp = thrust::raw_pointer_cast(&z[0]);
dim3 blockdim(128);
dim3 griddim(16);
saxpyGPU<<<griddim, blockdim>>>(zp, xp, yp, N, alpha, beta);
cudaDeviceSynchronize();
std::vector<float> xh(N), yh(N), zh(N);
thrust::copy(x.begin(), x.end(), xh.begin());
thrust::copy(y.begin(), y.end(), yh.begin());
thrust::copy(z.begin(), z.end(), zh.begin());
float maxabserr = -1.f, maxrelerr = -1.f;
for(int i=0; i<N; i++) {
float saxpyval = alpha * xh[i] + beta * yh[i];
float abserr = fabs(zh[i]-saxpyval);
float relerr = abserr / fmaxf(fabs(zh[i]), fabs(saxpyval));
maxabserr = fmaxf(abserr, maxabserr);
maxrelerr = fmaxf(relerr, maxrelerr);
}
std::cout.precision(10);
std::cout << "Maximum absolute error = " << maxabserr << std::endl;
std::cout << "Maximum relative error = " << maxrelerr << std::endl;
return 0;
}
which gives me the following:
$ nvcc -arch=sm_30 -o unrolled_saxpy unrolled_saxpy.cu
$ ./unrolled_saxpy
Maximum absolute error = 2.384185791e-07
Maximum relative error = 1.1920676e-07
If you (still) do not understand why the kernel is written as it is, follow what I showed you in your previous question and manually unroll the saxpy function. Start with n=1 and confirm it is functionally the same as the non-unrolled version, and then try n=2, n=4, etc. to see what the action of loop unrolling is.
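To illustrate that suggestion, here is a hand-expanded sketch of the n=1 case (just substitute n=1 and collapse the single-iteration j loops; note the main-loop bound is computed in size_t arithmetic, so it implicitly assumes N is at least n*blockDim.x*gridDim.x):
__device__
void saxpy_n1(float *out, const float *px, const float *py,
              size_t N, float alpha, float beta)
{
    size_t i;
    // Main loop: runs while a full grid-stride of work remains, so no bounds check is needed.
    for (i = blockIdx.x*blockDim.x + threadIdx.x;
         i < N - blockDim.x*gridDim.x;
         i += blockDim.x*gridDim.x) {
        out[i] = alpha*px[i] + beta*py[i];
    }
    // Cleanup pass: the leftover elements at the end, now with a bounds check.
    if (i < N) out[i] = alpha*px[i] + beta*py[i];
}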

The Floyd-Warshall algorithm in CUDA

This is the sequential piece of code I am trying to parallelize in CUDA
/*
Sequential (Single Thread) APSP on CPU.
*/
void floyd_sequential(int *mat, const size_t N)
{
for(int k = 0; k < N; k ++)
for(int i = 0; i < N; i ++)
for(int j = 0; j < N; j ++)
{
int i0 = i*N + j;
int i1 = i*N + k;
int i2 = k*N + j;
if(mat[i1] != -1 && mat[i2] != -1)
mat[i0] = (mat[i0] != -1 && mat[i0] < mat[i1] + mat[i2]) ?
mat[i0] : (mat[i1] + mat[i2]);
}
}
This is my CUDA implementation
// ParallelComputing.cpp : Defines the entry point for the console application.
//
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#define DIMENSION 10;
__global__ void gpu_Floyd(int *result, int N)
{
int j,k;
int Row = blockIdx.y * blockDim.y + threadIdx.y;
for(k = 0; k < N; k++)
{
for(j = 0; j < N; j++)
{
int i0 = Row * N + j;
int i1 = Row * N + k;
int i2 = k * N + j;
if(result[i0] != -1 && result[i2] != -1)
result[i0] = (result[i0] != -1 && result[i0] < result[i1] + result[i2]) ?
result[i0] : (result[i1] + result[i2]);
__syncthreads();
}
}
}
void GenMatrix(int *mat, const size_t N)
{
for(int i = 0; i < N*N; i ++)
mat[i] = rand()%32 - 1;
}
bool CmpArray(const int *l, const int *r, const size_t eleNum)
{
for(int i = 0; i < eleNum; i ++)
if(l[i] != r[i])
{
printf("ERROR: l[%d] = %d, r[%d] = %d\n", i, l[i], i, r[i]);
return false;
}
return true;
}
int main(int argc, char **argv)
{
// generate a random matrix.
size_t N = 10;
int *mat = (int*)malloc(sizeof(int)*N*N);
GenMatrix(mat, N);
// compute the reference result.
int *ref = (int*)malloc(sizeof(int)*N*N);
memcpy(ref, mat, sizeof(int)*N*N);
Floyd_sequential(ref, N);
//CUDA Portion
int Grid_Dim_x = 1, Grid_Dim_y = 1;
int noThreads_x, noThreads_y;
int *result = (int*)malloc(sizeof(int)*N*N);
memcpy(result, mat, sizeof(int)*N*N);
int *d_result;
// compute your results
cudaMalloc((void **)&d_result, N*N);
cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
gpu_Floyd<<<1024, 256>>>(d_result, N);
cudaMemcpy(result, d_result, cudaMemcpyDeviceToHost);
// compare your result with reference result
if(CmpArray(result, ref, N*N))
printf("The matrix matches.\n");
else
printf("The matrix do not match.\n");
free(ref);
free(result);
cudaFree(d_result);
}
However, my output always shows that the matrices do not match.
I understand that in CUDA we usually try to map each element of the matrix to a thread. However, I am trying to explore possibilities by mapping each row of the matrix to a thread instead.
As has already been mentioned, your provided GPU code does not compile, so I'm curious how you got to the observation that your output matrices do not match.
Here are some of the problems with your code:
cudaMalloc, just like malloc, allocates bytes, so this is not correct:
cudaMalloc((void **)&d_result, N*N);
instead you want this:
cudaMalloc((void **)&d_result, N*N*sizeof(int));
Likewise cudaMemcpy, just like memcpy, operates on bytes; furthermore cudaMemcpy requires 4 parameters, so this is not correct:
cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
instead you probably want this:
cudaMemcpy(d_result, result, N * N*sizeof(int), cudaMemcpyHostToDevice);
and your other cudaMemcpy line needs to be fixed similarly.
I'd also advise doing proper CUDA error checking.
Your kernel is written as if it's expecting a 2 dimensional thread array, or at least one dimensional in y, whereas you are launching a one dimensional grid in x:
gpu_Floyd<<<1024, 256>>>(d_result, N);
therefore all your kernel built-in variables in y will be 1 or 0 always, and this line of code:
int Row = blockIdx.y * blockDim.y + threadIdx.y;
will evaluate to zero for all threads in your 1-D grid in x.
Your GPU kernel is putting the results in the same matrix as the input data. For sequential code this may or may not matter, but for code that is intended to run in parallel, it can often lead to race conditions, because the order of operations (i.e. order of thread execution) is largely undefined.
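Putting the fixes above together, a minimal sketch of the corrected host-side calls might look like this (illustrative only; it does not address the in-place update issue, and the kernel would additionally need a Row < N guard for the excess threads):
cudaMalloc((void **)&d_result, N * N * sizeof(int));                        // size in bytes
cudaMemcpy(d_result, result, N * N * sizeof(int), cudaMemcpyHostToDevice);  // 4 parameters
dim3 block(1, 256);                         // threads along y, matching the kernel's Row computation
dim3 grid(1, (N + block.y - 1) / block.y);  // enough blocks along y to cover all N rows
gpu_Floyd<<<grid, block>>>(d_result, N);
cudaMemcpy(result, d_result, N * N * sizeof(int), cudaMemcpyDeviceToHost);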
Below you will find a canonical, simple implementation of the Floyd-Warshall algorithm in CUDA.
The CUDA code is accompanied by a sequential implementation, and both are based on the simplifying assumption that the edges are non-negative. The full minimum-distance paths are also reconstructed in both cases. Despite the simplifying assumption, it should be possible to grasp the relevant parallelization idea, namely that a two-dimensional thread grid is exploited and that each thread along x is assigned to a matrix column, while each block along y is assigned to a matrix row. In this way, each block's element of the k-th column is loaded into shared memory by its threadIdx.x == 0 thread.
// --- Assumption: graph with positive edges
#include <stdio.h>
#include <string>
#include <map>
#include <iostream>
#include <fstream>
#include "Utilities.cuh"
#define BLOCKSIZE 256
using namespace std;
map<string, int> nameToNum; // --- names of vertices
map<string, map<string, int>> weightMap; // --- weights of edges
/************************/
/* READ GRAPH FROM FILE */
/************************/
int *readGraphFromFile(int &N, char *fileName) {
string vertex1, vertex2;
ifstream graphFile;
int currentWeight;
N = 0; // --- Init the number of found vertices
graphFile.open(fileName); // --- Open the graph file
graphFile >> vertex1; // --- Read first vertex
while(vertex1 != "--END--") { // --- Loop until the end of the file is found
graphFile >> vertex2; // --- Read second vertex
graphFile >> currentWeight; // --- Read weight between first and second vertex
if (nameToNum.count(vertex1) == 0) { // --- If vertex has not yet been added ...
nameToNum[vertex1] = N; // assign a progressive number to the vertex
weightMap[vertex1][vertex1] = 0; // assign a zero weight to the "self-edge"
N++; // --- Update the found number of vertices
}
if (nameToNum.count(vertex2) == 0) {
nameToNum[vertex2] = N;
weightMap[vertex2][vertex2] = 0;
N++;
}
weightMap[vertex1][vertex2] = currentWeight; // --- Update weight between vertices 1 and 2
graphFile >> vertex1;
}
graphFile.close(); // --- Close the graph file
// --- Construct the array
int *weightMatrix = (int*) malloc(N * N * sizeof(int));
// --- Loop over all the vertex couples in the weights matrix
for (int ii = 0; ii < N; ii++)
for (int jj = 0; jj < N; jj++)
weightMatrix[ii * N + jj] = INT_MAX / 2; // --- Init the weights matrix elements to infinity
map<string, int>::iterator i, j;
// --- Loop over all the vertex couples in the map
// (*i).first and (*j).first are the vertex names in the map, while (*i).second and (*j).second are their corresponding indices
for (i = nameToNum.begin(); i != nameToNum.end(); ++i)
for (j = nameToNum.begin(); j != nameToNum.end(); ++j) {
// --- If there is a connection between vertices (*i).first and (*j).first, then update the weight matrix
if (weightMap[(*i).first].count((*j).first) != 0)
weightMatrix[N * (*i).second + (*j).second] = weightMap[(*i).first][(*j).first];
}
return weightMatrix;
}
/************************************/
/* PRINT MINIMUM DISTANCES FUNCTION */
/************************************/
void printMinimumDistances(int N, int *a) {
map<string, int>::iterator i;
// --- Prints all the node labels at the first row
for (i = nameToNum.begin(); i != nameToNum.end(); ++i) printf("\t%s", i->first.c_str());
printf("\n");
i = nameToNum.begin();
// --- Loop over the rows
for (int p = 0; p < N; p++) {
printf("%s\t", i -> first.c_str());
// --- Loop over the columns
for (int q = 0; q < N; q++) {
int dd = a[p * N + q];
if (dd != INT_MAX / 2) printf("%d\t", dd);
else printf("--\t");
}
printf("\n");
i++;
}
}
void printPathRecursive(int row, int col, int *minimumDistances, int *path, int N) {
map<string, int>::iterator i = nameToNum.begin();
map<string, int>::iterator j = nameToNum.begin();
if (row == col) {advance(i, row); printf("%s\t", i -> first.c_str()); }
else {
if (path[row * N + col] == INT_MAX / 2) printf("%d %d %d No path exists\t\n", minimumDistances[row * N + col], row, col);
else {
printPathRecursive(row, path[row * N + col], minimumDistances, path, N);
advance(j, col);
printf("%s\t", j -> first.c_str());
}
}
}
void printPath(int N, int *minimumDistances, int *path) {
map<string, int>::iterator i;
map<string, int>::iterator j;
// --- Loop over the rows
i = nameToNum.begin();
for (int p = 0; p < N; p++) {
// --- Loop over the columns
j = nameToNum.begin();
for (int q = 0; q < N; q++) {
printf("From %s to %s\t", i -> first.c_str(), j -> first.c_str());
printPathRecursive(p, q, minimumDistances, path, N);
printf("\n");
j++;
}
i++;
}
}
/**********************/
/* FLOYD-WARSHALL CPU */
/**********************/
void h_FloydWarshall(int *h_graphMinimumDistances, int *h_graphPath, const int N) {
for (int k = 0; k < N; k++)
for (int row = 0; row < N; row++)
for (int col = 0; col < N; col++) {
if (h_graphMinimumDistances[row * N + col] > (h_graphMinimumDistances[row * N + k] + h_graphMinimumDistances[k * N + col])) {
h_graphMinimumDistances[row * N + col] = (h_graphMinimumDistances[row * N + k] + h_graphMinimumDistances[k * N + col]);
h_graphPath[row * N + col] = h_graphPath[k * N + col];
}
}
}
/*************************/
/* FLOYD-WARSHALL KERNEL */
/*************************/
__global__ void d_FloydWarshall(int k, int *d_graphMinimumDistances, int *d_graphPath, int N) {
int col = blockIdx.x * blockDim.x + threadIdx.x; // --- Each thread along x is assigned to a matrix column
int row = blockIdx.y; // --- Each block along y is assigned to a matrix row
if (col >= N) return;
int arrayIndex = N * row + col;
// --- Each block loads its row's element of the k-th column into shared memory
__shared__ int d_graphMinimumDistances_row_k;
if(threadIdx.x == 0) d_graphMinimumDistances_row_k = d_graphMinimumDistances[N * row + k];
__syncthreads();
if (d_graphMinimumDistances_row_k == INT_MAX / 2) // --- If element (row, k) = infinity, no update is needed
return;
int d_graphMinimumDistances_k_col = d_graphMinimumDistances[k * N + col];
if(d_graphMinimumDistances_k_col == INT_MAX / 2) // --- If element (k, col) = infinity, no update is needed
return;
int candidateBetterDistance = d_graphMinimumDistances_row_k + d_graphMinimumDistances_k_col;
if (candidateBetterDistance < d_graphMinimumDistances[arrayIndex]) {
d_graphMinimumDistances[arrayIndex] = candidateBetterDistance;
d_graphPath[arrayIndex] = d_graphPath[k * N + col];
}
}
/********/
/* MAIN */
/********/
int main() {
int N = 0; // --- Number of vertices
// --- Read graph array from file
int *h_graphArray = readGraphFromFile(N, "graph2.txt");
printf("\n******************\n");
printf("* Original graph *\n");
printf("******************\n");
printMinimumDistances(N, h_graphArray);
// --- Floyd-Warshall on CPU
int *h_graphMinimumDistances = (int *) malloc(N * N * sizeof(int));
int *h_graphPath = (int *) malloc(N * N * sizeof(int));
memcpy(h_graphMinimumDistances, h_graphArray, N * N * sizeof(int));
for (int k = 0; k < N; k++)
for (int l = 0; l < N; l++)
if (h_graphArray[k * N + l] == INT_MAX / 2) h_graphPath[k * N + l] = INT_MAX / 2;
else h_graphPath[k * N + l] = k;
h_FloydWarshall(h_graphMinimumDistances, h_graphPath, N);
printf("\n*************************\n");
printf("* CPU result: distances *\n");
printf("*************************\n");
printMinimumDistances(N, h_graphMinimumDistances);
printf("\n********************\n");
printf("* CPU result: path *\n");
printf("********************\n");
printPath(N, h_graphMinimumDistances, h_graphPath);
// --- Graph array device allocation and host-device memory transfer
int *d_graphMinimumDistances; gpuErrchk(cudaMalloc(&d_graphMinimumDistances, N * N * sizeof(int)));
gpuErrchk(cudaMemcpy(d_graphMinimumDistances, h_graphArray, N * N * sizeof(int), cudaMemcpyHostToDevice));
int *d_graphPath; gpuErrchk(cudaMalloc(&d_graphPath, N * N * sizeof(int)));
for (int k = 0; k < N; k++)
for (int l = 0; l < N; l++)
if (h_graphArray[k * N + l] == INT_MAX / 2) h_graphPath[k * N + l] = INT_MAX / 2;
else h_graphPath[k * N + l] = k;
gpuErrchk(cudaMemcpy(d_graphPath, h_graphPath, N * N * sizeof(int), cudaMemcpyHostToDevice));
// --- Iterations
for (int k = 0; k < N; k++) {
d_FloydWarshall <<<dim3(iDivUp(N, BLOCKSIZE), N), BLOCKSIZE>>>(k, d_graphMinimumDistances, d_graphPath, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
// --- Copy results back to the host
gpuErrchk(cudaMemcpy(h_graphMinimumDistances, d_graphMinimumDistances, N * N * sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_graphPath, d_graphPath, N * N * sizeof(int), cudaMemcpyDeviceToHost));
printf("\n**************\n");
printf("* GPU result *\n");
printf("**************\n");
printMinimumDistances(N, h_graphMinimumDistances);
printf("\n********************\n");
printf("* GPU result: path *\n");
printf("********************\n");
printPath(N, h_graphMinimumDistances, h_graphPath);
}
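Utilities.cuh is not shown; gpuErrchk and iDivUp are assumed to be the usual helpers, roughly:
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
int iDivUp(int a, int b) { return (a + b - 1) / b; } // integer division, rounded up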