CUB reduction using 2D grid of blocks - cuda

I'm trying to make a sum using the CUB reduction method.
The big problem is:
I'm not sure how to return the values of each block to the Host when using 2-dimensional grids.
#include <iostream>
#include <math.h>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <iomanip>
#define nat 1024
#define BLOCK_SIZE 32
#define GRID_SIZE 32
struct frame
{
int natm;
char title[100];
float conf[nat][3];
};
using namespace std;
using namespace cub;
__global__
void add(frame* s, float L, float rc, float* blocksum)
{
int i = blockDim.x*blockIdx.x + threadIdx.x;
int j = blockDim.y*blockIdx.y + threadIdx.y;
float E=0.0, rij, dx, dy, dz;
// Your calculations first so that each thread holds its result
dx = fabs(s->conf[j][0] - s->conf[i][0]);
dy = fabs(s->conf[j][1] - s->conf[i][1]);
dz = fabs(s->conf[j][2] - s->conf[i][2]);
dx = dx - round(dx/L)*L;
dy = dy - round(dy/L)*L;
dz = dz - round(dz/L)*L;
rij = sqrt(dx*dx + dy*dy + dz*dz);
if ((rij <= rc) && (rij > 0.0))
{E = (4*((1/pow(rij,12))-(1/pow(rij,6))));}
// E = 1.0;
__syncthreads();
// Block wise reduction so that one thread in each block holds sum of thread results
typedef cub::BlockReduce<float, BLOCK_SIZE, BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
float aggregate = BlockReduce(temp_storage).Sum(E);
if (threadIdx.x == 0 && threadIdx.y == 0)
blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
}
int main(void)
{
frame * state = (frame*)malloc(sizeof(frame));
float *blocksum = (float*)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
state->natm = nat; //inicializando o numero de atomos;
char name[] = "estado1";
strcpy(state->title,name);
for (int i = 0; i < nat; i++) {
state->conf[i][0] = i;
state->conf[i][1] = i;
state->conf[i][2] = i;
}
frame * d_state;
float *d_blocksum;
cudaMalloc((void**)&d_state, sizeof(frame));
cudaMalloc((void**)&d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)));
cudaMemcpy(d_state, state, sizeof(frame),cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 gridBlock(GRID_SIZE,GRID_SIZE);
add<<<gridBlock,dimBlock>>>(d_state, 3000, 15, d_blocksum);
cudaError_t status = cudaMemcpy(blocksum, d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);
float Etotal = 0.0;
for (int k = 0; k < GRID_SIZE*GRID_SIZE; k++){
Etotal += blocksum[k];
}
cout << endl << "energy: " << Etotal << endl;
if (cudaSuccess != status)
{
cout << cudaGetErrorString(status) << endl;
}
// Free memory
cudaFree(d_state);
cudaFree(d_blocksum);
return cudaThreadExit();
}
What is happening is that if the value of GRID_SIZE is the same asBLOCK_SIZE, as written above. The calculation is correct. But if I change the value of GRID_SIZE, the result goes wrong. Which leads me to think that the error is in this code:
blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
The idea here is to return a 1D array, which contains the sum of each block.
I do not intend to change the BLOCK_SIZE value, but the value of GRID_SIZE depends on the system I'm looking at, I intend to use values greater than 32 (always multiples of that).
I looked for some example that use 2D grid with CUB, but did not find.
I really new in CUDA program, maybe I'm making a mistake.
edit: I put the complete code.
For comparison, when I calculate these exact values for a serial program, it gives me energy: -297,121

Probably the main issue is that your output indexing is not correct. Here's a reduced version of your code demonstrating correct results for arbitrary GRID_SIZE:
$ cat t1360.cu
#include <stdio.h>
#include <cub/cub.cuh>
#define BLOCK_SIZE 32
#define GRID_SIZE 25
__global__
void add(float* blocksum)
{
float E = 1.0;
// Block wise reduction so that one thread in each block holds sum of thread results
typedef cub::BlockReduce<float, BLOCK_SIZE, cub::BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
float aggregate = BlockReduce(temp_storage).Sum(E);
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0)
blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
}
int main(){
float *d_result, *h_result;
h_result = (float *)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
cudaMalloc(&d_result, GRID_SIZE*GRID_SIZE*sizeof(float));
dim3 grid = dim3(GRID_SIZE,GRID_SIZE);
dim3 block = dim3(BLOCK_SIZE, BLOCK_SIZE);
add<<<grid, block>>>(d_result);
cudaMemcpy(h_result, d_result, GRID_SIZE*GRID_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {printf("cuda error: %s\n", cudaGetErrorString(err)); return -1;}
float result = 0;
for (int i = 0; i < GRID_SIZE*GRID_SIZE; i++) result += h_result[i];
if (result != (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE)) printf("mismatch, should be: %f, was: %f\n", (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE), result);
else printf("Success\n");
return 0;
}
$ nvcc -o t1360 t1360.cu
$ ./t1360
Success
$
The important change I made to your kernel code was in the output indexing:
blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
We want a simulated 2D index into an array that has width and height of GRID_SIZE consisting of one float quantity per point. Therefore the width of this array is given by gridDim.x (not blockDim). The gridDim variable gives the dimensions of the grid in terms of blocks - and this lines up exactly with how our results array is set up.
Your posted code will fail if GRID_SIZE and BLOCK_SIZE are different (for example, if GRID_SIZE were smaller than BLOCK_SIZE, cuda-memcheck will show illegal accesses, and if GRID_SIZE is larger than BLOCK_SIZE then this indexing error will result in blocks overwriting each other's values in the output array) because of this mixup between blockDim and gridDim.
Also note that float operations typically only have around 5 decimal digits of precision. So small differences in the 5th or 6th decimal place may be attributable to order of operations differences when doing floating-point arithmetic. You can prove this to yourself by switching to double arithmetic.

Related

issues of cuBLAS performance on batched complex linear system solver

I am new to cuda and cuBlas, and recently I am trying to use batched cuBlas API to solve multiple systems of linear equations. Here's my code:
The size of the matrix is N, and the number of matrices (batch size) is numOfMat.
#include <stdio.h>
#include <stdlib.h>
#include <cstdio>
#include <iostream>
#include <chrono>
#include <random>
#include <cuda.h>
#include <cusolverDn.h>
#include <cuda_runtime.h>
#include <cuComplex.h> // deal with complex numbers
#include <cuda_profiler_api.h>
using namespace std::chrono;
#define N 6
#define numOfMat 500000
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main() {
std::random_device device;
std::mt19937 generator(device());
std::uniform_real_distribution<double> distribution(1., 5.);
high_resolution_clock::time_point t1;
high_resolution_clock::time_point t2;
double duration = 0;
double duration_1 = 0;
// step 1: cuda solver initialization
cublasHandle_t cublas_handle;
cublasCreate_v2(&cublas_handle);
cublasStatus_t stat;
int* PivotArray;
int* infoArray;
cudaError_t cudaStatUnified1 = cudaSuccess;
cudaError_t cudaStatUnified2 = cudaSuccess;
const cuDoubleComplex alpha = make_cuDoubleComplex(1.0f, 0.0f);
cudaStatUnified1 = cudaMallocManaged(&PivotArray, N * numOfMat * sizeof(int));
cudaStatUnified2 = cudaMallocManaged(&infoArray, numOfMat * sizeof(int));
if ((cudaSuccess != cudaStatUnified1) || (cudaSuccess != cudaStatUnified2))
std::cout <<"unified memory allocated unsuccessful!"<<std::endl;
//ALLOCATE MEMORY - using unified memory
cuDoubleComplex** h_A;
cudaMallocManaged(&h_A, sizeof(cuDoubleComplex*) * numOfMat);
cudaMallocManaged(&(h_A[0]), sizeof(cuDoubleComplex)*numOfMat*N*N);
for (int nm = 1; nm < numOfMat; nm++)
h_A[nm] = h_A[nm-1]+ N * N;
cuDoubleComplex** h_b;
cudaMallocManaged(&h_b, sizeof(cuDoubleComplex*) * numOfMat);
cudaMallocManaged(&(h_b[0]), sizeof(cuDoubleComplex) * numOfMat * N);
for (int nm = 1; nm < numOfMat; nm++)
h_b[nm] = h_b[nm-1] + N;
// FILL MATRICES
for (int nm = 0; nm < numOfMat; nm++)
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A[nm][j * N + i] = make_cuDoubleComplex(distribution(generator), distribution(generator));
// FILL COEFFICIENTS
for (int nm = 0; nm < numOfMat; nm++)
for (int i = 0; i < N; i++)
h_b[nm][i] = make_cuDoubleComplex(distribution(generator), distribution(generator));
t1 = high_resolution_clock::now();
// step 2: Perform CUBLAS LU solver
stat = cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("-data download failed");
gpuErrchk( cudaDeviceSynchronize() );
// check if the input matrix is singular
/*for (int i = 0; i < numOfMat; i++)
if (infoArray[i] != 0) {
fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
}*/
// step 3: INVERT UPPER AND LOWER TRIANGULAR MATRICES
// --- Function solves the triangular linear system with multiple RHSs
// --- Function overrides b as a result
stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("--data download failed");
gpuErrchk( cudaDeviceSynchronize() );
stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("---data download failed");
gpuErrchk( cudaDeviceSynchronize() );
t2 = high_resolution_clock::now();
duration = duration_cast<microseconds>(t2 - t1).count();
std::cout<<duration<<std::endl;
}
The code works fine, but when I plot the computational time versus the number of matrices, the curve looks like this:
My question is: why does the computational time show linear to the number of matrices? Intuitively, the curve should look to be flat when the batch size is large in some extent. However, when the batch size reaches up to 500,000, the time still appears to be linear to the batch size.
How can it be? Is there any explanation behind such a circumstance?
I think you need to look more closely at your data. If I run a modification of your code on Google Colab (Tesla T4) I get this:
Which looks largely like your figure. But look more closely (log scales help):
You can clearly see that up to a certain point, the runtime is largely independent of the number of matrices (around 2^8 = 64), but then scaling is linear as sizes increase. That is the transition from being able to parallelize the workload to reaching parallel capacity and having to schedule many parallel groups of operations to execute the workload. You might infer that for this particular GPU, the GPU run out of parallel capacity at between 64 and 128 concurrent operations (The T4 has 40 SM, so it might well be 80 if an SM could accommodate 2 operations per SM concurrently), after which runtime scales with multiples of that limiting size.
This is completely normal behaviour for any parallel computation architecture I am familiar with.

sum vectors values with cuda C++

I try to sum many vectors values using CUDA c++. I found some solution for two vectors. As you can see, just possible to add two vectors but I wanna generate vectors dynamically with the same length.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n)
{
// Get our global thread ID
int id = blockIdx.x*blockDim.x+threadIdx.x;
// Make sure we do not go out of bounds
if (id < n)
c[id] = a[id] + b[id];
}
int main( int argc, char* argv[] )
{
// Size of vectors
int n = 100000;
// Host input vectors
double *h_a;
double *h_b;
//Host output vector
double *h_c;
// Device input vectors
double *d_a;
double *d_b;
//Device output vector
double *d_c;
// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);
// Allocate memory for each vector on host
h_a = (double*)malloc(bytes);
h_b = (double*)malloc(bytes);
h_c = (double*)malloc(bytes);
// Allocate memory for each vector on GPU
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);
int i;
// Initialize vectors on host
for( i = 0; i < n; i++ ) {
h_a[i] = sin(i)*sin(i);
h_b[i] = cos(i)*cos(i);
}
// Copy host vectors to device
cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);
int blockSize, gridSize;
// Number of threads in each thread block
blockSize = 1024;
// Number of thread blocks in grid
gridSize = (int)ceil((float)n/blockSize);
// Execute the kernel
vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
// Copy array back to host
cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );
// Sum up vector c and the print result divided by n, this should equal 1
within error
double sum = 0;
for(i=0; i<n; i++)
sum += h_c[i];
printf("final result: %f\n", sum/n);
// Release device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
// Release host memory
free(h_a);
free(h_b);
free(h_c);
return 0;
}
Is there a way to do this for many vectors? My vectors size are:
#vector length
N = 1000
#number of vectors
i = 300000
v[i] = [1,2,..., N]
As result i need to get:
out[i]= [sum(v[1]), sum(v[2]),..., sum(v[i])]
Thanks for any advice.
Summing multiple vectors together in a fashion similar to the code you have shown (i.e. generating elementwise sums) is equivalent to summing the columns of a matrix. And this idea represents a sensible way to realize the solution.
We will treat your vectors as a matrix, where each vector is a row in the matrix. The CUDA kernel will assign one thread to each column, and will sum the elements of that column, producing a single number result. That single number result will become one element of the vector result of the entire problem.
Here is a fully worked example demonstrating one possible approach:
$ cat t2.cu
#include <iostream>
typedef double mt;
const int nTPB = 64;
template <typename T>
__global__ void column_sum(T *matrix, T *sums, unsigned n_vectors, unsigned vector_length){
unsigned idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < vector_length){
T temp = 0;
for (unsigned i = 0; i < n_vectors; i++)
temp += matrix[i*vector_length+idx];
sums[idx] = temp;}
}
int main(){
const unsigned vlen = 1000;
const unsigned nvec = 300000;
mt *h_matrix, *d_matrix, *h_sums, *d_sums;
// create the desired number of vectors as a single matrix
h_sums = new mt[vlen];
h_matrix = new mt[vlen*nvec];
cudaMalloc(&d_matrix, vlen*nvec*sizeof(mt));
cudaMalloc(&d_sums, vlen*sizeof(mt));
size_t count = 0;
for (unsigned i = 0; i < nvec; i++)
for (unsigned j = 0; j < vlen; j++)
h_matrix[count++] = j;
cudaMemcpy(d_matrix, h_matrix, vlen*nvec*sizeof(mt), cudaMemcpyHostToDevice);
column_sum<<<(vlen+nTPB-1)/nTPB,nTPB>>>(d_matrix, d_sums, nvec, vlen);
cudaMemcpy(h_sums, d_sums, vlen*sizeof(mt), cudaMemcpyDeviceToHost);
for (unsigned i = 0; i < vlen; i++) if (h_sums[i] != ((mt)nvec)*i) {std::cout << " mismatch at " << i << " was: " << h_sums[i] << " should be: " << ((mt)nvec)*i << std::endl; return -1;}
std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
}
$ nvcc -o t2 t2.cu
$ cuda-memcheck ./t2
========= CUDA-MEMCHECK
no error
========= ERROR SUMMARY: 0 errors
$
Note that this methodology only creates as many threads on the GPU as there are vector elements (1000 in the above example). 1000 threads would be enough to keep only the smallest GPUs busy. However this algorithm will be efficient on most GPUs if your vector length is 10,000 or longer. If you'd like to explore creating more efficient algorithms for small problem sizes, you can study the idea of a classical parallel reduction.

CUDA_SAFE_CALL: an illegal memory access was encountered

I am trying to do simple matrix multiplication on CUDA. I know arrays can be flattened for passing it to the device. However I am using cudaMallocPitch and cudaMemcpy2d to do the multiplication. While executing the code below I get an error " illegal memory was encountered" when I try to copy the result onto the host I highly appreciate any advice on where I am going wrong. Thanks!
weights-first matrix,dim:30x784
input- second matrix,dim:784x100
results_d - result on the device(GPU)
result - result copied on the host
#include <stdio.h>
#include <math.h>
#include <cstdio>
#include <cstdlib>
#define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void MatrixMulKernel(double *input,double *weights,double *results_d,size_t in_pitch,size_t w1_pitch,size_t result_pitch)
{
int row = threadIdx.x;
int col= threadIdx.y;
double value;
double *result_matrix;
result_matrix = ((double*)((char*)results_d + row*result_pitch + col));
printf("%d",threadIdx);
for(int i =0 ; i < in_pitch ; i++)
{
double *element1 = ((double*)((char*)input + row*in_pitch) + i) ;
double *element2 = ((double*)((char*)weights + i*w1_pitch) + col);
value =+ (*element1) * (*element2);
}
*result_matrix = value;
}
int main()
{
static double arr1[30][784];
static double arr2[784][100];
static double result[30][100];
for (int i = 0 ; i < 30; i++)
{
for(int j =0;j <784 ; j ++)
arr1[i][j] = 5;
}
for (int i =0 ; i < 784; i ++)
{
for(int j=0;j < 100 ; j++)
arr2[i][j] = 3;
}
double *input;
double *weights;
double *results_d;
size_t in_pitch,w1_pitch,result_pitch;
//allocating memory in GPU for 2 inputs and result
CUDA_SAFE_CALL(cudaMallocPitch((void**)&input,&in_pitch,100*sizeof(double),784));
CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights,&w1_pitch,784*sizeof(double),30));
CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d,&result_pitch,100*sizeof(double),30));
//Copy matrix from host to device
CUDA_SAFE_CALL(cudaMemcpy2D(input,in_pitch,arr2,100*sizeof(double),100*sizeof(double),784,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(weights,w1_pitch,arr1,784*sizeof(double),784*sizeof(double),30,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(results_d,result_pitch,result,100*sizeof(double),100*sizeof(double),30,cudaMemcpyHostToDevice));
//using GPU
dim3 dimGrid(1,1,1);
dim3 dimBlock(32,32,1);
printf("before kernel fucntion");
MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights,results_d,in_pitch,w1_pitch,result_pitch);
printf("after kernel fucntion");
cudaThreadSynchronize();
//copying back to host
CUDA_SAFE_CALL(cudaMemcpy2D(result,result_pitch,results_d,100*sizeof(double),100*sizeof(double),30,cudaMemcpyDeviceToHost));
//printing and seeing whether the result matrix has been updated
for (int i =0 ; i < 100; i ++)
{
for(int j=0;j < 30 ; j++)
{
printf("%f",result);
}
printf("\n");
}
CUDA_SAFE_CALL(cudaFree(input));
CUDA_SAFE_CALL(cudaFree(weights));
CUDA_SAFE_CALL(cudaFree(results_d));
return 0;
}
There are a number of errors in this code. First of all, it's not clear that doing pitched allocations is going to give any benefit here. Second, if you're serious about wanting fast matrix multiply performance, you should use CUBLAS.
Issues:
You don't seem to understand pitched allocations. The pitch value returned is a value in bytes. You cannot sensibly use that for a loop index for matrix multiply. Also, the pitch value is the overall width of the pitch allocation. It does not correspond to the valid data area. For that, you should use the appropriate matrix dimension.
Your code will not do a matrix multiplication over the entire matrix area. You are only creating a single block of 32x32 threads, but you need enough blocks/threads to cover the entire matrix area. This requires changes to your grid dimensions, passing matrix dimensions to your kernel, as well as a "thread check" in your kernel to prevent out-of-bounds access.
This construct for pitched access is not correct:
result_matrix = ((double*)((char*)results_d + row*result_pitch + col));
it does not match the other constructions you have for the 2 input matrices, it has a misplaced close parenthesis.
You have the sense of your two input matrices reversed. You are indexing into the input matrix as if it were the weight matrix, and vice-versa. We need to swap the sense of row, column and i to make these match the actual matrix dimensions.
Your final cudaMemcpy2D operation has the pitch values reversed:
cudaMemcpy2D(result,result_pitch,results_d,100*sizeof(double),100*sizeof(double),30,cudaMemcpyDeviceToHost)
^^^^^ ^^^^^
You forgot to initialize to zero your loop sum variable:
double value;
I don't know what you intended here, it should be += not =+:
value =+ ...
The following code has these issues addressed, and seems to run without error for me:
$ cat t104.cu
#include <stdio.h>
#include <math.h>
#include <cstdio>
#include <cstdlib>
const int d1 = 30;
const int d2 = 784;
const int d3 = 100;
double arr1[d1][d2];
double arr2[d2][d3];
double result[d1][d3];
#define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void MatrixMulKernel(double *input,double *weights,double *results_d,size_t in_pitch,size_t w1_pitch,size_t result_pitch, int dim, int rrow, int rcol)
{
int col = threadIdx.x + blockDim.x*blockIdx.x;
int row= threadIdx.y + blockDim.y*blockIdx.y;
if ((row >= rrow) || (col >= rcol)) return;
double value = 0;
double *result_matrix;
result_matrix = ((double*)((char*)results_d + row*result_pitch) + col);
for(int i =0 ; i < dim ; i++)
{
double *element1 = ((double*)((char*)input + i*in_pitch) + col) ;
double *element2 = ((double*)((char*)weights + row*w1_pitch) + i);
value += (*element1) * (*element2);
}
*result_matrix = value;
}
int main()
{
for (int i = 0 ; i < d1; i++)
{
for(int j =0;j <d2 ; j ++)
arr1[i][j] = 5;
}
for (int i =0 ; i < d2; i ++)
{
for(int j=0;j < d3 ; j++)
arr2[i][j] = 3;
}
double *input;
double *weights;
double *results_d;
size_t in_pitch,w1_pitch,result_pitch;
//allocating memory in GPU for 2 inputs and result
CUDA_SAFE_CALL(cudaMallocPitch((void**)&input,&in_pitch,d3*sizeof(double),d2));
CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights,&w1_pitch,d2*sizeof(double),d1));
CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d,&result_pitch,d3*sizeof(double),d1));
//Copy matrix from host to device
CUDA_SAFE_CALL(cudaMemcpy2D(input,in_pitch,arr2,d3*sizeof(double),d3*sizeof(double),d2,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(weights,w1_pitch,arr1,d2*sizeof(double),d2*sizeof(double),d1,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(results_d,result_pitch,result,d3*sizeof(double),d3*sizeof(double),d1,cudaMemcpyHostToDevice));
//using GPU
dim3 dimBlock(32,32,1);
dim3 dimGrid(((d3+dimBlock.x-1)/dimBlock.x),((d1+dimBlock.y-1)/dimBlock.y),1);
MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights,results_d,in_pitch,w1_pitch,result_pitch, d2, d1, d3);
//copying back to host
CUDA_SAFE_CALL(cudaMemcpy2D(result,d3*sizeof(double),results_d,result_pitch,d3*sizeof(double),d1,cudaMemcpyDeviceToHost));
//printing and seeing whether the result matrix has been updated
for (int i =0 ; i < d3; i ++)
{
for(int j=0;j < d1 ; j++)
{
printf("%f", result[j][i]);
}
printf("\n");
}
CUDA_SAFE_CALL(cudaFree(input));
CUDA_SAFE_CALL(cudaFree(weights));
CUDA_SAFE_CALL(cudaFree(results_d));
return 0;
}
$ nvcc -arch=sm_61 -o t104 t104.cu
$

Unspecified launch failure after cudaDeviceSynchronize() call when program starts. But no errors using step-through debugging. CUDA

I've spent several hours struggling with unspecified launch failure.
I've come up with a tiny task for myself in order to understand how shared memory works.
Task is to divide array [1, 2, 3, ... , N] into K group of (N / K) elements and find the sum of each group. (Difference between current and previous element of the array equals 1)
I was planning to use N threads in grid divided between K blocks. So every threadblock contains (N / K) threads. Thus one threadblock could be used to compute sum of one group. Also I wanted to dynamically allocate shared memory.
When I start program I got unspecified launch failure after cudaDeviceSynchronize() call. But when I try step-through debugging everthing is ok and works fine.
What am I doing wrong? (Visual Studio 2012 Professional, Compute Capability 2.1) I would very appreciate any help.
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf("%s\n",cudaGetErrorString(x)); \
system("pause"); \
return EXIT_FAILURE;}} while(0)
extern __shared__ double shrd[];
__global__ void kernel(double * a){
size_t threadID_block = blockDim.x * threadIdx.y + threadIdx.x;
size_t blockID_global = (gridDim.x * blockIdx.y + blockIdx.x );
size_t threadID_global = blockID_global * blockDim.x * blockDim.y + threadID_block;
double * temp = &shrd[blockID_global * blockDim.x * blockDim.y];
temp[threadID_block] = static_cast<double>(threadID_global);
__syncthreads();
if (threadID_block == 0){
a[blockID_global] = 0.0;
for (size_t index = 0; index < blockDim.x * blockDim.y; index++){
a[blockID_global] += temp[index];
}
}
}
int main(){
int devNum = 0;
CUDA_CALL(cudaGetDevice(&devNum));
CUDA_CALL(cudaSetDevice(devNum));
dim3 gridSize(2,2,1);
dim3 blockSize(4,4,1);
double * dev_a = NULL;
size_t length = gridSize.x * gridSize.y ;
size_t byteSize = length * sizeof(double);
CUDA_CALL(cudaMalloc(&dev_a,byteSize));
size_t shmem_perBlock = blockSize.x * blockSize.y * sizeof(double);
kernel <<< gridSize, blockSize, shmem_perBlock >>> (dev_a);
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaDeviceSynchronize());
double * a = new double [length];
CUDA_CALL(cudaMemcpy(a,dev_a,byteSize,cudaMemcpyDeviceToHost));
for (size_t index = 0; index < length; index++){
printf("%.3f\n",a[index]);
}
printf("\n");
CUDA_CALL(cudaFree(dev_a));
CUDA_CALL(cudaDeviceReset());
delete[]a;
system("pause");
return 0;
}
If you are on kepler or later first read this:
http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
Otherwise if you are pre-kepler read this:
http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf
There are some fundamentals you are missing in terms of CUDA programming. I have given you a template of your code below. It is for clarification of some of these fundamentals. Do not expect this to be optimized as I am expecting you to program the parallel reduction. This will get you started with an understanding on how to use shared memory.
Good Luck!
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N 10000
#define K 100
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf("%s\n",cudaGetErrorString(x)); \
system("pause"); \
return EXIT_FAILURE;}} while(0)
__global__ void kernel(double* a, double* results){
extern __shared__ double shared[];
size_t tid, tid_local, stride;
tid = blockDim.x*blockIdx.x+threadIdx.x; //thread id within all blocks
tid_local = threadIdx.x; //thread id within a block
stride = blockDim.x*gridDim.x; //total number of threads
double *start = &a[K*blockIdx.x]; //each block will get K of a block.
shared[tid_local]=start[tid_local]; //copy K elements into shared memory
__syncthreads();
//Perform Parallel reduction, you will have to implement this
//After parallel reduction, result should be in shared[0]
//for demonstration I made the code serial for each block on thread 0.
//This is for demonstration only.
double sum=0;
if(tid_local==0){
for(int i=0; i<K; i++){
sum+=shared[i];
}
a[blockIdx.x]=sum;
}
}
int main(){
int devNum = 0;
CUDA_CALL(cudaGetDevice(&devNum));
CUDA_CALL(cudaSetDevice(devNum));
double * dev_a = NULL;
double * dev_results=NULL;
CUDA_CALL(cudaMalloc(&dev_a, N*sizeof(double) ));
CUDA_CALL(cudaMalloc(&dev_results, (N/K)*sizeof(double)));
//copy dev_a onto GPU (this is the array you are summing).
dim3 block_size(K, 1, 1);
dim3 grid_size (N/K, 1, 1);
size_t shmem_perBlock = K * sizeof(double);
kernel <<< grid_size, block_size, shmem_perBlock >>> (dev_a, dev_results);
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaDeviceSynchronize());
//copy dev_results back to CPU, this is your result.
CUDA_CALL(cudaFree(dev_a));
CUDA_CALL(cudaFree(dev_results));
system("pause");
return 0;
}

Cuda Summation per block. I get 0 returned to the sums. What is wrong?

I tried summation in cuda . I cant find what i did wrong here.
The sum is always returned 0. Can anyone help.
The shared tag defines the variable common in each block.
So i tried to sum one block at a time and finally sum up the result for overall sum.
But the sum doesnt work for block. And i am stuck.
Can anyone help.
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <stdlib.h>
//#define BLOCK_SIZE 32 // size of vectors
__global__ void add( float * i_data, float * sum){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ float s_data;
s_data = 0;
// must be synchronized
__syncthreads();
// reduce and sum
// typical in GPU computings
for (int i = 0; i<blockDim.x; i++)
{
__syncthreads();
if (tid <= i)
{
//s_data[blockIdx.x]+ = s_data[tid] + s_data[i+tid];
s_data+= i_data[tid];
}
}
if (tid == 0)
sum[blockIdx.x]=s_data;
}
int main() {
int T = 10, B = 5; // threads per block and blocks per grid
float *a,*b; // host pointers
float *dev_a, *dev_b; // device pointers to host memory
int sizeIN = T*B*sizeof(int);
int sizeOUT = B*sizeof(int);
a= new float[T*B];
b= new float[B];
for(int i = 0;i<B;i++)
{
for (int j=0;j<T;j++)
{
a[i*T+j]=i;
}
}
for(int i = 0;i<B;i++)
{
b[i]=0;
}
cudaMalloc((void **) &dev_a, sizeIN);
cudaMalloc((void **) &dev_b, sizeOUT);
cudaMemcpy(dev_a, a, sizeIN, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, sizeOUT, cudaMemcpyHostToDevice);
add<<< B, T >>> (dev_a, dev_b);
cudaMemcpy(a,dev_a, sizeIN, cudaMemcpyDeviceToHost);
cudaMemcpy(b,dev_b, sizeOUT, cudaMemcpyDeviceToHost);
for(int i = 0;i<B;i++)
{
for (int j=0;j<T;j++)
{
std::cout<< a[i*T+j]<<"\t";
std::cout<<std::endl;
}
std::cout<<std::endl<<std::endl<<"sum is: "<<b[i]<<std::endl;
}
std::cout<<std::endl<<std::endl;
cudaFree(dev_a);
cudaFree(dev_b);
free(a);
free(b);
return 0;
}
This is wrong in 2 ways:
if (tid = 0)
First, you should be doing a comparison == not an assignment =. I don't know why your compiler didn't warn you about this.
Second, tid is only zero for one thread in the entire grid:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
You want one thread in each block to write the block result out to global memory:
if (threadIdx.x == 0)
This is also a problem, similarly:
if (tid <= i)
This is only satisfied for threads in the first block. Beyond that, I have to start to guess at what you want. I guess you're trying to sum the values in each block. Your construction is not a parallel reduction, but to make the minimum changes to get it "functional" I would rewrite the end of your kernel like this:
// reduce and sum
// typical in GPU computings
for (int i = 0; i<blockDim.x; i++)
{
if (threadIdx.x == i)
{
//s_data[blockIdx.x]+ = s_data[tid] + s_data[i+tid];
s_data+= i_data[tid];
}
__syncthreads();
}
if (threadIdx.x == 0)
sum[blockIdx.x]=s_data;
}
Although you didn't have any CUDA API errors, it's good practice to use proper cuda error checking and also run your code with cuda-memcheck any time you are having trouble with a cuda code.
I mentioned that your code above is not a classical reduction. Its just an unoptimal for-loop.
To learn about a CUDA parallel reduction, study the cuda sample code and the accompanying presentation, and there are many examples here on the CUDA tag on SO as well that you can search on.