Finding minimum in GPU slower than CPU - cuda

I have implemented the reduction described here: http://www.cuvilib.com/Reduction.pdf in order to calculate the minimum of the elements of a matrix.
However, on the GPU it runs much slower than on the CPU.
I have an i7 processor and an NVIDIA GT 540M graphics card.
Is it supposed to be that way, or am I doing something wrong?
EDIT: I use version 3 of the above code on Ubuntu 13.04 and I compile it using Eclipse Nsight. The matrix has 2097152 elements. It executes in 3.6 ms, whereas the CPU version runs in around 1.0 ms. Below is the whole code:
#include <stdio.h>
#include <stdlib.h>
#include <thrust/sort.h>
#include <sys/time.h>
#include <omp.h>
#include <iostream>
#include <algorithm>

#define MIN(a,b) (((a)<(b))?(a):(b))

static const int WORK_SIZE = 2097152;

int find_min(int *a, int length){
    int min = a[0];
    for (int i = 1; i < length; i++)
        if (a[i] < min)
            min = a[i];
    return min;
}

__global__ static void red_min(int *g_idata, int *g_odata) {
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    for (unsigned int s = blockDim.x/2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] = MIN(sdata[tid], sdata[tid + s]);
        }
        __syncthreads();
    }
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

int main(void) {
    int *d1, *d2;
    int i, *result;
    int *idata, *fdata;
    srand(time(NULL));
    result = (int *)malloc(sizeof(int));
    idata = (int *)malloc(WORK_SIZE*sizeof(int));
    fdata = (int *)malloc(WORK_SIZE*sizeof(int));
    cudaMalloc((int**)&d1, WORK_SIZE*sizeof(int));
    cudaMalloc((int**)&d2, WORK_SIZE*sizeof(int));
    for (i = 0; i < WORK_SIZE; i++){
        idata[i] = rand();
        fdata[i] = i;
    }
    struct timeval begin, end;
    gettimeofday(&begin, NULL);
    *result = find_min(idata, WORK_SIZE);
    printf("Minimum Element CPU: %d \n", *result);
    gettimeofday(&end, NULL);
    int time = (end.tv_sec * (unsigned int)1e6 + end.tv_usec) - (begin.tv_sec * (unsigned int)1e6 + begin.tv_usec);
    printf("Microseconds elapsed CPU: %d\n", time);
    cudaMemcpy(d1, idata, WORK_SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    int num_blocks = 16384;
    bool flag = true;
    while (num_blocks > 0){
        if (flag) {
            red_min<<<num_blocks,128,128*sizeof(int)>>>(d1,d2);
        }
        else {
            red_min<<<num_blocks,128,128*sizeof(int)>>>(d2,d1);
        }
        num_blocks /= 128;
        flag = !flag;
    }

The GT540M is a mobile GPU, so I assume you're running on a laptop, and furthermore you may be hosting the X display on the 540M GPU.
I built a complete version of your code:
#include <stdio.h>
#include <stdlib.h>
#include <thrust/sort.h>
#include <sys/time.h>
#include <omp.h>
#include <iostream>
#include <algorithm>

#define MIN(a,b) (((a)<(b))?(a):(b))

static const int WORK_SIZE = 2097152;

int find_min(int *a, int length){
    int min = a[0];
    for (int i = 1; i < length; i++)
        if (a[i] < min)
            min = a[i];
    return min;
}

__global__ static void red_min(int *g_idata, int *g_odata) {
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    for (unsigned int s = blockDim.x/2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] = MIN(sdata[tid], sdata[tid + s]);
        }
        __syncthreads();
    }
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

int main(void) {
    int *d1, *d2;
    int i, *result;
    int *idata, *fdata;
    srand(time(NULL));
    result = (int *)malloc(sizeof(int));
    idata = (int *)malloc(WORK_SIZE*sizeof(int));
    fdata = (int *)malloc(WORK_SIZE*sizeof(int));
    cudaMalloc((int**)&d1, WORK_SIZE*sizeof(int));
    cudaMalloc((int**)&d2, WORK_SIZE*sizeof(int));
    for (i = 0; i < WORK_SIZE; i++){
        idata[i] = rand();
        fdata[i] = i;
    }
    struct timeval begin, end;
    gettimeofday(&begin, NULL);
    *result = find_min(idata, WORK_SIZE);
    printf("Minimum Element CPU: %d \n", *result);
    gettimeofday(&end, NULL);
    int time = (end.tv_sec * (unsigned int)1e6 + end.tv_usec) - (begin.tv_sec * (unsigned int)1e6 + begin.tv_usec);
    printf("Microseconds elapsed CPU: %d\n", time);
    cudaMemcpy(d1, idata, WORK_SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    int num_blocks = 16384;
    bool flag = true;
    int loops = 0;
    while (num_blocks > 0){
        if (flag) {
            red_min<<<num_blocks,128,128*sizeof(int)>>>(d1,d2);
        }
        else {
            red_min<<<num_blocks,128,128*sizeof(int)>>>(d2,d1);
        }
        num_blocks /= 128;
        flag = !flag;
        loops++;
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float et = 0.0f;
    cudaEventElapsedTime(&et, start, stop);
    printf("GPU time: %fms, in %d loops\n", et, loops);
    int gpuresult;
    if (flag)
        cudaMemcpy(&gpuresult, d1, sizeof(int), cudaMemcpyDeviceToHost);
    else
        cudaMemcpy(&gpuresult, d2, sizeof(int), cudaMemcpyDeviceToHost);
    printf("GPU min: %d\n", gpuresult);
    return 0;
}
compiled it:
$ nvcc -O3 -arch=sm_20 -o t264 t264.cu
and ran it on an M2050 GPU (RHEL 5.5, CUDA 5.5, Xeon X5650 CPU):
$ ./t264
Minimum Element CPU: 288
Microseconds elapsed CPU: 1217
GPU time: 0.621408ms, in 3 loops
GPU min: 288
$
So my CPU results were pretty close to yours, but my GPU results were about 5-6x faster. If we compare the M2050 to your GT540M, the M2050 has 14 SMs whereas the GT540M has 2. More importantly, the M2050 has about 5x the memory bandwidth of your GT540M (28.8 GB/s peak theoretical for the GT540M vs. ~150 GB/s peak theoretical for the M2050).
Since a well-written parallel reduction is memory-bandwidth-bound on GPUs, the speed difference between your GPU and mine makes sense.
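A back-of-the-envelope check (my own numbers, assuming the first pass dominates and the input is read from global memory once):

// bytes moved ~= 2097152 elements * 4 bytes ~= 8.4 MB
// GT540M lower bound: 8.4e6 B / 28.8e9 B/s ~= 0.29 ms   (you measured ~3.6 ms)
// M2050  lower bound: 8.4e6 B / 150e9  B/s ~= 0.06 ms   (I measured ~0.62 ms)
// Both measurements sit above the bound (launch overhead, the later passes),
// but the ~5x bandwidth ratio tracks the ~5-6x difference in measured times.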
So I would say your results are probably about what is expected, and to get better results you will probably need a faster GPU.
Also, if your GT540M is hosting an X display, the GPU timing may be corrupted by display activity. When timing a single kernel, this is not normally an issue; the kernel execution briefly interrupts display processing. But when timing a sequence of kernels in succession, display tasks can jump in and execute between the kernel calls (the GPU is multi-tasking when it is asked to both drive a display and run CUDA code). Therefore, this may be a performance impact in your case as well.
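If you want to check for display interference, one approach (a sketch reusing the events and kernel from the code above; it was not part of my test) is to time just the first reduction pass by itself:

cudaEventRecord(start, 0);
red_min<<<16384,128,128*sizeof(int)>>>(d1,d2);   // first pass only, nearly all of the work
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&et, start, stop);
printf("first pass only: %fms\n", et);

Since the first pass does nearly all of the work, a 3-kernel sequence that takes much longer than this single kernel suggests something is executing between the launches.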

Related

Why is my vectorized access kernel so slow?

I am trying to understand vectorized memory access and implemented a simple example to evaluate its performance. But I found that the vectorized kernel is slower than the naive one.
In the vectorized kernel, I recast the int pointer to an int2 pointer and then do the data copy.
This is the code I used:
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>  // for rand
#include <string.h>  // for memset
#include <chrono>
#include <iostream>

void initData_int(int *p, int size){
    for (int t = 0; t < size; t++){
        p[t] = (int)(rand()&0xff);
    }
}

__global__ void naiveCopy(int *d_in, int *d_out, int size)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    for (int i = tid; i < size; i += blockDim.x*gridDim.x)
    {
        d_out[i] = d_in[i];
    }
}

__global__ void vecCopy(int *d_in, int *d_out, int size)
{
    int2* in = (int2*)d_in;
    int2* out = (int2*)d_out;
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    for (int i = tid; i < size/2; i += blockDim.x*gridDim.x)
    {
        out[i] = in[i];
    }
    if (tid == size/2 && size%2 == 1)
        d_out[size-1] = d_in[size-1];
}

int main(int argc, char **argv)
{
    int size = 1<<24;
    //int size = 128;
    int nBytes = size*sizeof(int);
    int *d_h;
    cudaMallocHost((int**)&d_h, nBytes);
    initData_int(d_h, size);
    //printData(d_h, size);
    int *res = (int*)malloc(nBytes);
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    int *d_in, *d_out;
    dim3 block(128, 1);
    dim3 grid((size-1)/block.x+1, 1);
    cudaMalloc((int**)&d_in, nBytes);
    cudaMalloc((int**)&d_out, nBytes);
    cudaMemcpyAsync(d_in, d_h, nBytes, cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);
    auto s_0 = std::chrono::system_clock::now();
    naiveCopy<<<grid, block, 0, stream>>>(d_in, d_out, size);
    cudaStreamSynchronize(stream);
    auto e_0 = std::chrono::system_clock::now();
    std::chrono::duration<double> diff = e_0 - s_0;
    printf("Naive Kernel time cost is: %2f.\n", diff.count());
    memset(res, 0, nBytes);
    cudaMemset(d_out, 0, nBytes);
    //vectorized access:
    cudaStreamSynchronize(stream);
    s_0 = std::chrono::system_clock::now();
    vecCopy<<<grid, block, 0, stream>>>(d_in, d_out, size);
    cudaStreamSynchronize(stream);
    e_0 = std::chrono::system_clock::now();
    diff = e_0 - s_0;
    printf("Vectorized kernel time cost is: %2f.\n", diff.count());
    cudaStreamDestroy(stream);
    cudaFree(d_h);
    cudaFree(d_in);
    cudaFree(d_out);
    free(res);
    return 0;
}
This is the data from nvprof:
            Type  Time(%)      Time  Calls       Avg       Min       Max  Name
 GPU activities:   89.28%  5.5024ms      1  5.5024ms  5.5024ms  5.5024ms  [CUDA memcpy HtoD]
                    4.82%  296.94us      1  296.94us  296.94us  296.94us  vecCopy(int*, int*, int)
                    3.99%  246.19us      1  246.19us  246.19us  246.19us  naiveCopy(int*, int*, int)
Could you please explain what causes the performance degradation?
You are not doing a good job of grid sizing. Your grid dimensions might be sensible for the naive kernel:
dim3 grid((size-1)/block.x+1, 1);
But it is unnecessarily twice as large as it needs to be for the vectorized copy kernel, which handles two ints per thread.
When I cut the grid size in half for the vectorized kernel (to match the methodology for the naive kernel):
dim3 grid2((size/2+block.x-1)/block.x);
then according to my testing, the vectorized copy kernel becomes faster:
                    3.88%  233.99us      1  233.99us  233.99us  233.99us  naiveCopy(int*, int*, int)
                    2.84%  171.33us      1  171.33us  171.33us  171.33us  vecCopy(int*, int*, int)
Notes:
cudaFree is not the correct API to use with cudaMallocHost. The correct API is cudaFreeHost.
We could probably do a better job of grid sizing, as was mentioned in the comments, by sizing the grid to match the GPU you are running on. However, we don't need to take that step to demonstrate the improvement here; a minimal sketch of both notes follows.
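A minimal sketch of both notes (the blocks-per-SM multiplier is an illustrative choice of mine, not a tuned value):

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);                    // assumes device 0
int blocksPerSM = 4;                                  // tunable occupancy target
dim3 grid3(prop.multiProcessorCount * blocksPerSM);   // the grid-stride loop covers the rest
vecCopy<<<grid3, block, 0, stream>>>(d_in, d_out, size);
// and at cleanup time:
cudaFreeHost(d_h);                                    // pairs with cudaMallocHost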

Use of shared memory not showing desired result

I am trying to learn the usage of shared memory to increase performance. Here I am trying to copy global memory to shared memory. With a single block (256 threads) it gives the correct result, but with more than one block it gives a random result.
#include <cuda.h>
#include <stdio.h>

__global__ void staticReverse(int *d, int n)
{
    __shared__ int s[400];
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    d[t] = d[t]*d[t];
    s[t] = d[t];
    __syncthreads();
    d[t] = s[t];
}

__global__ void dynamicReverse(int *d, int n)
{
    extern __shared__ int s[];
    int t = threadIdx.x;
    s[t] = d[t]*d[t];
    __syncthreads();
    d[t] = s[t];
}

int main(void)
{
    const int n = 400;
    int a[n], d[n];
    for (int i = 0; i < n; i++)
    {
        a[i] = i;
    }
    int *d_d;
    cudaMalloc(&d_d, n * sizeof(int));
    // run version with static shared memory
    int block_size = 256;
    int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);
    cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice);
    staticReverse<<<n_blocks,block_size>>>(d_d, n);
    cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++)
    {
        printf("%d\n",d[i]);
    }
}
1) What does the third argument in the kernel call dynamicReverse<<<n_blocks,block_size,n*sizeof(int)>>>(d_d, n); do? Does it allocate shared memory for the entire block or per thread?
2) If I need more than 64 KB of shared memory per multiprocessor on compute capability 5.0, what do I need to do?
In your static shared memory allocation code you had three issues:
The size of the statically allocated shared memory should match the block size, not the size of the input array;
You should use the local thread index for indexing shared memory, instead of the global one;
You had no out-of-bounds check on array accesses.
The dynamic shared memory allocation code had the same issues #2 and #3 as above, plus the fact that you were indexing global memory with the local thread index instead of the global one. The third launch argument specifies the size of the shared memory to be allocated, for the whole block, not per thread. In particular, you should allocate 256 ints, i.e., an amount tied to the block size, just as in the static shared memory allocation case.
Here is the complete working code:
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/***********************************/
/* SHARED MEMORY STATIC ALLOCATION */
/***********************************/
__global__ void staticReverse(int *d, int n)
{
    __shared__ int s[256];
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    if (t < n) {
        d[t] = d[t]*d[t];
        s[threadIdx.x] = d[t];
        __syncthreads();
        d[t] = s[threadIdx.x];
    }
}

/************************************/
/* SHARED MEMORY DYNAMIC ALLOCATION */
/************************************/
__global__ void dynamicReverse(int *d, int n)
{
    extern __shared__ int s[];
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    if (t < n) {
        s[threadIdx.x] = d[t]*d[t];
        __syncthreads();
        d[t] = s[threadIdx.x];
    }
}

int main(void)
{
    const int n = 400;
    int* a = (int*) malloc(n*sizeof(int));
    int* d = (int*) malloc(n*sizeof(int));
    for (int i = 0; i < n; i++) { a[i] = i; }
    int *d_d; gpuErrchk(cudaMalloc(&d_d, n * sizeof(int)));
    // run version with static shared memory
    int block_size = 256;
    int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);
    gpuErrchk(cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice));
    //staticReverse<<<n_blocks,block_size>>>(d_d, n);
    dynamicReverse<<<n_blocks,block_size,256*sizeof(int)>>>(d_d, n);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost));
    for (int i = 0; i < n; i++) { printf("%d\n",d[i]); }
}

How can I check the progress of matrix multiplication?

For now, I only need to show the intermediate progress of a matrix multiplication.
for(unsigned int col=0; col<mtxSize; col++) {
    unsigned tmp = 0;
    for(unsigned int row=0; row<mtxSize; row++) {
        for(unsigned int idx=0; idx<mtxSize; idx++) {
            tmp += h_A[col*mtxSize+idx] * h_B[idx*mtxSize+row];
        }
        h_Rs[col*mtxSize+row] = tmp;
        tmp = 0;
        int rate_tmp = (col*mtxSize + (row+1))*100;
        // Maybe like this...
        fprintf(stdout, "Progress : %d.%d %%\r", rate_tmp/actMtxSize, rate_tmp%actMtxSize);
        fflush(stdout);
    }
}
In the case of the host code (on the CPU), this is very easy because it runs sequentially, so the progress can be checked directly.
But in the case of the GPU, which processes in parallel, what should I do?
Once a kernel is launched, it does not return until the kernel execution finishes,
so I can't check intermediate data during the kernel execution.
I think I need to use an asynchronous kernel call, but I do not know that mechanism well.
And even if an asynchronous kernel call is used, in order to see the data across the blocks spread over the multiprocessors, do I have to use atomicAdd() (in other words, global memory access), which carries some overhead?
Give me some advice or hints.
And I want to know how to do this in CUDA specifically.
Here is code that demonstrates how to check progress from a matrix multiply kernel:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define TIME_INC 100000000
#define INCS 10
#define USE_PROGRESS 1
#define MAT_DIMX 4000
#define MAT_DIMY MAT_DIMX

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

__global__ void mykernel(volatile int *data){
    unsigned long time;
    for (int i = 0; i < INCS; i++){
        atomicAdd((int *)data, 1);
        __threadfence_system();
        time = clock64();
        while ((clock64() - time) < TIME_INC) {};
    }
    printf("progress check finished\n");
}

__global__ void matmult(float *a, float *b, float *c, unsigned int rowA, unsigned int colA, unsigned int colB, volatile int *progress){
    unsigned int row = threadIdx.x+blockDim.x*blockIdx.x;
    unsigned int col = threadIdx.y+blockDim.y*blockIdx.y;
    if ((row < rowA) && (col < colB)){
        float temp = 0.0f;
        for (unsigned int k = 0; k < colA; k++)
            temp += a[(row*colA)+k] * b[(k*colB) + col];
        c[(row*colB)+col] = temp;
#if USE_PROGRESS
        if (!(threadIdx.x || threadIdx.y)){
            atomicAdd((int *)progress, 1);
            __threadfence_system();
        }
#endif
    }
}

int main(){
    // simple test to demonstrate reading progress data from kernel
    volatile int *d_data, *h_data;
    cudaSetDeviceFlags(cudaDeviceMapHost);
    cudaCheckErrors("cudaSetDeviceFlags error");
    cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped);
    cudaCheckErrors("cudaHostAlloc error");
    cudaHostGetDevicePointer((int **)&d_data, (int *)h_data, 0);
    cudaCheckErrors("cudaHostGetDevicePointer error");
    *h_data = 0;
    printf("kernel starting\n");
    mykernel<<<1,1>>>(d_data);
    cudaCheckErrors("kernel fail");
    int value = 0;
    do {
        int value1 = *h_data;
        if (value1 > value){
            printf("h_data = %d\n", value1);
            value = value1;
        }
    } while (value < (INCS-1));
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail 2");
    // now try matrix multiply with progress
    float *h_c, *d_a, *d_b, *d_c;
    h_c = (float *)malloc(MAT_DIMX*MAT_DIMY*sizeof(float));
    if (h_c == NULL) {printf("malloc fail\n"); return 1;}
    cudaMalloc((void **)&d_a, MAT_DIMX*MAT_DIMY*sizeof(float));
    cudaCheckErrors("cudaMalloc a fail");
    cudaMalloc((void **)&d_b, MAT_DIMX*MAT_DIMY*sizeof(float));
    cudaCheckErrors("cudaMalloc b fail");
    cudaMalloc((void **)&d_c, MAT_DIMX*MAT_DIMY*sizeof(float));
    cudaCheckErrors("cudaMalloc c fail");
    for (int i = 0; i < MAT_DIMX*MAT_DIMY; i++) h_c[i] = rand()/(float)RAND_MAX;
    cudaMemcpy(d_a, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy a fail");
    cudaMemcpy(d_b, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy b fail");
    cudaEvent_t start, stop;
    cudaEventCreate(&start); cudaEventCreate(&stop);
    *h_data = 0;
    dim3 block(16,16);
    dim3 grid(((MAT_DIMX+block.x-1)/block.x), ((MAT_DIMY+block.y-1)/block.y));
    printf("matrix multiply kernel starting\n");
    cudaEventRecord(start);
    matmult<<<grid,block>>>(d_a, d_b, d_c, MAT_DIMY, MAT_DIMX, MAT_DIMX, d_data);
    cudaEventRecord(stop);
#if USE_PROGRESS
    unsigned int num_blocks = grid.x*grid.y;
    float my_progress = 0.0f;
    value = 0;
    printf("Progress:\n");
    do {
        cudaEventQuery(stop);  // may help WDDM scenario
        int value1 = *h_data;
        float kern_progress = (float)value1/(float)num_blocks;
        if ((kern_progress - my_progress) > 0.1f) {
            printf("percent complete = %2.1f\n", (kern_progress*100));
            my_progress = kern_progress;
        }
    } while (my_progress < 0.9f);
    printf("\n");
#endif
    cudaEventSynchronize(stop);
    cudaCheckErrors("event sync fail");
    float et;
    cudaEventElapsedTime(&et, start, stop);
    cudaCheckErrors("event elapsed time fail");
    cudaDeviceSynchronize();
    cudaCheckErrors("mat mult kernel fail");
    printf("matrix multiply finished. elapsed time = %f milliseconds\n", et);
    return 0;
}
The code associated with the first kernel call is just to demonstrate the basic idea of having a kernel report its progress back while it runs.
The second part of the code shows a sample, naive matrix multiply on the GPU, with the GPU reporting its progress back. I have included the ability to remove the progress-check code via a preprocessor macro, as well as the ability to time the matrix multiply kernel. For the case I have here, there was no discernible difference in timing with or without the progress code. So while the progress-reporting code probably does add some overhead, compared to the scope of a reasonably sized matrix multiply kernel it adds no significant time that I can see.
Some other uses of signalling are discussed here.

CUDA: How can I run the parallel reduction code for summation that is described in the NVIDIA paper by Mark Harris?

Although I understand the logic behind the parallel reduction described in this paper, I can't seem to be able to run it for a simple example where the input array is filled with 1s.
Here is what I achieved so far. Keep in mind that I'm using the thrust library to manage input and output data.
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <ctime>
#include <sys/time.h>
#include <sstream>
#include <string>
#include <fstream>

using namespace std;

__global__ void reduce0(int *g_idata, int *g_odata){
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    for (unsigned int s=1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

int main(void){
    int size = 10;
    thrust::host_vector<int> data_h_i(size, 1);
    //initialize the data, all values will be 1
    //so the final sum will be equal to 10
    int threadsPerBlock = 256;
    int totalBlocks = size/threadsPerBlock + 1;
    dim3 dimGrid(totalBlocks, 1, 1);
    dim3 dimBlock(threadsPerBlock, 1, 1);
    thrust::device_vector<int> data_v_i = data_h_i;
    thrust::device_vector<int> data_v_o(size);
    int* output = thrust::raw_pointer_cast(data_v_o.data());
    int* input = thrust::raw_pointer_cast(data_v_i.data());
    reduce0<<<dimGrid, dimBlock>>>(input, output);
    data_v_i.clear();
    data_v_i.shrink_to_fit();
    thrust::host_vector<int> data_h_o = data_v_o;
    data_v_o.clear();
    data_v_o.shrink_to_fit();
    cout << data_h_o[0] << endl;
    return 0;
}
The code is simple: I create a host_vector of the given size and initialize all of its values to 1.
Then I specify that we need 256 threads per block, and compute dynamically the number of blocks needed for my example.
To keep things simple, I create an array of only 10 values, which means we will need only one block. So one kernel invocation will be enough to produce the final result.
My questions are the following:
Question 1
After compiling the above example (nvcc -O3 reduction.cu -arch=sm_21) and running ./a.out, I get the following message:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): unspecified launch failure
I'm not sure what's going on here, but it seems to me that the error comes from the line
sdata[tid] = g_idata[i]
The kernel is an exact copy of the kernel described in the paper, so I'm not sure what changes are required in order to fix this problem.
Question 2
If we fix the first problem, how could we make the above code work for an arbitrary input array size? If, for example, our size is more than 256, then we would need at least two blocks, so each block would produce an output that then has to be combined with the outputs of the other blocks. The paper says we would need multiple invocations of the kernel; however, I'm not sure how this can be done dynamically.
Thank you in advance
EDIT1: For Question 1, it seems that I wasn't allocating memory for the shared memory correctly. Calling the kernel like this: reduce0<<<dimGrid, dimBlock, size*sizeof(int)>>>(input, output); and also checking whether tid is out of range makes the code work properly.
The new kernel is the following:
__global__ void reduce0(int *g_idata, int *g_odata, int size){
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (tid < size){
        sdata[tid] = g_idata[i];
        __syncthreads();
        for (unsigned int s=1; s < size; s *= 2) {
            if (tid % (2*s) == 0) {
                sdata[tid] += sdata[tid + s];
            }
            __syncthreads();
        }
        if (tid == 0) g_odata[blockIdx.x] = sdata[0];
    }
}
I'm still not sure about Question 2 though.
Question 1
Your kernel is using dynamically allocated shared memory:
extern __shared__ int sdata[];
...
sdata[tid] = g_idata[i];
But you are not allocating any dynamic shared memory in your kernel call:
reduce0<<<dimGrid, dimBlock>>>(input, output);
                           ^
                           |
                           missing shared memory size parameter
So when you attempt to access the shared memory, you get a kernel fault.
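The fix is to pass the dynamic shared memory size as the third launch configuration argument, sized to the block (a one-line sketch of the corrected call):

reduce0<<<dimGrid, dimBlock, threadsPerBlock*sizeof(int)>>>(input, output);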
By the way, you can still do CUDA error checking on your kernel calls (even though you are using thrust elsewhere), for example as shown below.
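A minimal check after the launch (a sketch; any reporting style will do):

reduce0<<<dimGrid, dimBlock, threadsPerBlock*sizeof(int)>>>(input, output);
cudaError_t err = cudaGetLastError();          // catches launch/configuration errors
if (err != cudaSuccess)
    printf("kernel launch failed: %s\n", cudaGetErrorString(err));
err = cudaDeviceSynchronize();                 // catches errors during execution
if (err != cudaSuccess)
    printf("kernel execution failed: %s\n", cudaGetErrorString(err));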
Question 2
Question 2 is pretty well answered in Mark's paper here. You can see at the bottom of slide 9 that each block writes its partial result to an array in global memory (g_odata[]), which stores one result per block. We then simply launch another kernel of essentially the same type that operates on g_odata[] instead of on the original input data. We can repeat this process successively until our partial results (e.g. g_odata[]) contain only 256 results, or however many threads we launch per threadblock. We can then sum that final result set with a single threadblock and produce a single answer value.
Examples are given in the CUDA sample code here.
Here's an edited version of your code that shows how to call the two kernels in sequence to handle a larger size. I don't consider this a paragon of reduction programming, just a simple extension of what you already wrote to illustrate the concept. Note that there are a variety of changes throughout the kernel and the main code to let the kernel handle larger data sizes. This method still won't scale beyond a data size of threadsPerBlock^2, but again, it's just to illustrate the concept of calling multiple kernels in sequence to sum partial results, with the fewest modifications to your code.
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <ctime>
#include <sys/time.h>
#include <sstream>
#include <string>
#include <fstream>

using namespace std;

__global__ void reduce0(int *g_idata, int *g_odata, int size){
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = 0;
    if (i < size)
        sdata[tid] = g_idata[i];
    __syncthreads();
    for (unsigned int s=1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

int main(void){
    int size = 40000;
    thrust::host_vector<int> data_h_i(size, 1);
    //initialize the data, all values will be 1
    //so the final sum will be equal to size
    int threadsPerBlock = 256;
    int totalBlocks = (size+(threadsPerBlock-1))/threadsPerBlock;
    thrust::device_vector<int> data_v_i = data_h_i;
    thrust::device_vector<int> data_v_o(totalBlocks);
    int* output = thrust::raw_pointer_cast(data_v_o.data());
    int* input = thrust::raw_pointer_cast(data_v_i.data());
    reduce0<<<totalBlocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(input, output, size);
    reduce0<<<1, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(output, input, totalBlocks);
    data_v_o[0] = data_v_i[0];
    data_v_i.clear();
    data_v_i.shrink_to_fit();
    thrust::host_vector<int> data_h_o = data_v_o;
    data_v_o.clear();
    data_v_o.shrink_to_fit();
    cout << data_h_o[0] << endl;
    return 0;
}
After modifying the code made by Robert Crovella to answer my question, here is the final version, which supports an arbitrary number of input values.
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <ctime>
#include <cmath>  // for ceil
#include <sys/time.h>
#include <sstream>
#include <string>
#include <fstream>

using namespace std;

__global__ void reduce0(int *g_idata, int *g_odata, int size){
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = 0;
    if (i < size)
        sdata[tid] = g_idata[i];
    __syncthreads();
    for (unsigned int s=1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

int main(void){
    int size = 939289;
    thrust::host_vector<int> data_h_i(size, 1);
    //initialize the data, all values will be 1
    //so the final sum will be equal to size
    int threadsPerBlock = 256;
    int totalBlocks = (size+(threadsPerBlock-1))/threadsPerBlock;
    thrust::device_vector<int> data_v_i = data_h_i;
    thrust::device_vector<int> data_v_o(totalBlocks);
    int* output = thrust::raw_pointer_cast(data_v_o.data());
    int* input = thrust::raw_pointer_cast(data_v_i.data());
    bool turn = true;
    while (true){
        if (turn){
            reduce0<<<totalBlocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(input, output, size);
            turn = false;
        }
        else{
            reduce0<<<totalBlocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(output, input, size);
            turn = true;
        }
        if (totalBlocks == 1) break;
        size = totalBlocks;
        totalBlocks = ceil((double)totalBlocks/threadsPerBlock);
    }
    thrust::host_vector<int> data_h_o;
    if (turn)
        data_h_o = data_v_i;
    else
        data_h_o = data_v_o;
    data_v_i.clear();
    data_v_i.shrink_to_fit();
    data_v_o.clear();
    data_v_o.shrink_to_fit();
    cout << data_h_o[0] << endl;
    return 0;
}

Performance of static versus dynamic CUDA shared memory allocation

I have 2 kernels that do exactly the same thing. One of them allocates shared memory statically, while the other allocates it dynamically at run time. I am using the shared memory as a 2D array, so for the dynamic allocation I have a macro that computes the memory location. Now, the results generated by the 2 kernels are exactly the same. However, the timing results I got from the two kernels are about 3x apart! The static memory allocation version is much faster. I am sorry that I can't post any of my code. Can someone give a justification for this?
I have no evidence that static shared memory allocation is faster than dynamic shared memory allocation. As was noted in the comments above, it would be impossible to answer your question without a reproducer. In at least the case of the code below, the timings of the same kernel, when run with static or dynamic shared memory allocation, are exactly the same:
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 512

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/***********************************/
/* SHARED MEMORY STATIC ALLOCATION */
/***********************************/
__global__ void kernel_static_memory_allocation(int *d_inout, int N)
{
    __shared__ int s[BLOCK_SIZE];
    const int tid = threadIdx.x;
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        s[tid] = d_inout[i];
        __syncthreads();
        s[tid] = s[tid] * s[tid];
        __syncthreads();
        d_inout[i] = s[tid];
    }
}

/************************************/
/* SHARED MEMORY DYNAMIC ALLOCATION */
/************************************/
__global__ void kernel_dynamic_memory_allocation(int *d_inout, int N)
{
    extern __shared__ int s[];
    const int tid = threadIdx.x;
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        s[tid] = d_inout[i];
        __syncthreads();
        s[tid] = s[tid] * s[tid];
        __syncthreads();
        d_inout[i] = s[tid];
    }
}

/********/
/* MAIN */
/********/
int main(void)
{
    int N = 1000000;
    int* a = (int*)malloc(N*sizeof(int));
    for (int i = 0; i < N; i++) { a[i] = i; }
    int *d_inout; gpuErrchk(cudaMalloc(&d_inout, N * sizeof(int)));
    int n_blocks = N/BLOCK_SIZE + (N%BLOCK_SIZE == 0 ? 0:1);
    gpuErrchk(cudaMemcpy(d_inout, a, N*sizeof(int), cudaMemcpyHostToDevice));
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    kernel_static_memory_allocation<<<n_blocks,BLOCK_SIZE>>>(d_inout, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Static allocation - elapsed time: %3.3f ms \n", time);
    cudaEventRecord(start, 0);
    kernel_dynamic_memory_allocation<<<n_blocks,BLOCK_SIZE,BLOCK_SIZE*sizeof(int)>>>(d_inout, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Dynamic allocation - elapsed time: %3.3f ms \n", time);
}
The likely reason for this is that the disassembled code for the two kernels is exactly the same, and it does not change even when replacing int N = 1000000; with int N = rand();.
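One way to verify this on your own code (assuming the CUDA toolkit's cuobjdump tool is on your PATH; the file name is illustrative):

$ nvcc -O3 -o shmem_test shmem_test.cu
$ cuobjdump -sass shmem_test

Comparing the SASS emitted for the two kernels then shows whether the compiler generated identical machine code, as it did for the code above.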