I am trying to learn the usage of shared memory with a view to increasing performance. Here I am trying to copy global memory to shared memory. With a single block (256 threads) it gives the correct result, but with more than one block it gives random results.
#include <cuda.h>
#include <stdio.h>
__global__ void staticReverse(int *d, int n)
{
__shared__ int s[400];
int t = blockIdx.x * blockDim.x + threadIdx.x;
d[t] = d[t]*d[t];
s[t] =d[t];
__syncthreads();
d[t] = s[t];
}
__global__ void dynamicReverse(int *d, int n)
{
extern __shared__ int s[];
int t = threadIdx.x;
s[t] = d[t]*d[t];
__syncthreads();
d[t] = s[t];
}
int main(void)
{
const int n = 400;
int a[n], d[n];
for (int i = 0; i < n; i++)
{
a[i] = i;
}
int *d_d;
cudaMalloc(&d_d, n * sizeof(int));
// run version with static shared memory
int block_size = 256;
int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);
cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice);
staticReverse<<<n_blocks,block_size>>>(d_d, n);
cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < n; i++)
{
printf("%d\n",d[i]);
}
}
1) What does the third argument in the dynamicReverse<<<n_blocks,block_size,n*sizeof(int)>>>(d_d, n);
kernel call do? Does it allocate shared memory for the entire block or per thread?
2) If I require more than 64 KB of shared memory per multiprocessor on compute capability 5.0, what do I need to do?
In your static shared memory allocation code, you had three issues:
The size of the statically allocated shared memory should match the block size, not the size of the input array;
You should use the local thread index for indexing shared memory, instead of the global one;
You had no out-of-bounds checking on the array accesses.
The dynamic shared memory allocation code had the same issues #2 and #3 as above, plus the fact that you were indexing global memory with the local thread index instead of the global one. The third kernel launch argument specifies the size, in bytes, of the dynamic shared memory to be allocated per block (not per thread). In particular, you should allocate room for 256 ints, i.e., an amount tied to the block size, just as in the static shared memory allocation case.
Here is the complete working code:
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/***********************************/
/* SHARED MEMORY STATIC ALLOCATION */
/***********************************/
#include <cuda.h>
#include <stdio.h>
__global__ void staticReverse(int *d, int n)
{
__shared__ int s[256];
int t = blockIdx.x * blockDim.x + threadIdx.x;
if (t < n) {
d[t] = d[t]*d[t];
s[threadIdx.x] =d[t];
__syncthreads();
d[t] = s[threadIdx.x];
}
}
/************************************/
/* SHARED MEMORY DYNAMIC ALLOCATION */
/************************************/
__global__ void dynamicReverse(int *d, int n)
{
extern __shared__ int s[];
int t = blockIdx.x * blockDim.x + threadIdx.x;
if (t < n) {
s[threadIdx.x] = d[t]*d[t];
__syncthreads();
d[t] = s[threadIdx.x];
}
}
int main(void)
{
const int n = 400;
int* a = (int*) malloc(n*sizeof(int));
int* d = (int*) malloc(n*sizeof(int));
for (int i = 0; i < n; i++) { a[i] = i; }
int *d_d; gpuErrchk(cudaMalloc(&d_d, n * sizeof(int)));
// run version with static shared memory
int block_size = 256;
int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);
gpuErrchk(cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice));
//staticReverse<<<n_blocks,block_size>>>(d_d, n);
dynamicReverse<<<n_blocks,block_size,256*sizeof(int)>>>(d_d, n);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < n; i++) { printf("%d\n",d[i]); }
}
I'm running a toy CUDA sample on my GeForce 1080 Ti (Pascal) on windows 10 and CUDA 9.2.
Goal is to test cudaMemPrefetchAsync to the CPU, as it's supposed to work.
However, I get a CUDA error (invalid device ordinal) on this particular line.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
void fill(int* a, int val, int N) {
for (int k = 0; k < N; ++k) {
a[k] = val;
}
}
__global__ void add(int* a, int* b, int N)
{
for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += blockDim.x * gridDim.x) {
a[i] += b[i];
}
}
inline void check(cudaError_t err, const char* file, int line) {
if (err != cudaSuccess) {
::fprintf(stderr, "ERROR at %s[%d] : %s\n", file, line, cudaGetErrorString(err));
abort();
}
}
#define CUDA_CHECK(err) do { check(err, __FILE__, __LINE__); } while(0)
int main()
{
int deviceId;
CUDA_CHECK(cudaGetDevice(&deviceId));
const int N = 1024*1024*32;
int *a, *b;
CUDA_CHECK(cudaMallocManaged(&a, N * sizeof(int)));
CUDA_CHECK(cudaMallocManaged(&b, N * sizeof(int)));
CUDA_CHECK(cudaMemPrefetchAsync(a, N * sizeof(int), cudaCpuDeviceId)); // program breaks here
CUDA_CHECK(cudaMemPrefetchAsync(b, N * sizeof(int), cudaCpuDeviceId));
fill(a, 1, N);
fill(b, 2, N);
CUDA_CHECK(cudaMemPrefetchAsync(a, N * sizeof(int), deviceId));
CUDA_CHECK(cudaMemPrefetchAsync(b, N * sizeof(int), deviceId));
add<<<32, 256>>>(a, b, N);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
return 0;
}
Is that a hardware/driver/OS limitation? Can I simply ignore the error?
Is that a hardware/driver/OS limitation?
Yes, the latter. Quoting from the documentation:
GPUs with SM architecture 6.x or higher (Pascal class or newer)
provide additional Unified Memory features such as on-demand page
migration and GPU memory oversubscription that are outlined throughout
this document. Note that currently these features are only supported
on Linux operating systems.
So asynchronous page migration is not supported on Windows at the moment, and that is why you get an error when you try to enable it.
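If you want the same code to run on both Linux and Windows, one option (a minimal sketch reusing the CUDA_CHECK macro and the variables from your program) is to query the cudaDevAttrConcurrentManagedAccess device attribute and only prefetch when it is supported:
// Query whether the device supports concurrent managed access; on Windows
// (and on pre-Pascal GPUs) this attribute is 0 and prefetching is unsupported.
int concurrentManagedAccess = 0;
CUDA_CHECK(cudaDeviceGetAttribute(&concurrentManagedAccess,
                                  cudaDevAttrConcurrentManagedAccess,
                                  deviceId));
if (concurrentManagedAccess) {
    CUDA_CHECK(cudaMemPrefetchAsync(a, N * sizeof(int), cudaCpuDeviceId));
    CUDA_CHECK(cudaMemPrefetchAsync(b, N * sizeof(int), cudaCpuDeviceId));
}
// Otherwise, simply skip the prefetch and rely on on-demand migration.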
I implemented reduction #1 from the well-known slides by Mark Harris, but I obtain 0 as the result. I filled the input array with the same values shown in the slides. I compiled with CUDA 7.0 using the command nvcc reduction1.cu -o red1. Where is the mistake? Thanks.
#include <stdio.h>
#include <cuda_runtime.h>
#define THREADS_PER_BLOCK 16
__global__ void reduce1(int *g_idata, int *g_odata) {
extern __shared__ int sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for(unsigned int s=1; s < blockDim.x; s *= 2)
{
if (tid % (2*s) == 0) sdata[tid] += sdata[tid + s];
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
int main()
{
int inputLength=16;
int hostInput[16]={10,1,8,-1,0,-2,3,5,-2,-3,2,7,0,11,0,2};
int hostOutput=0;
int *deviceInput;
int *deviceOutput;
cudaMalloc((void **)&deviceInput, inputLength * sizeof(int));
cudaMalloc((void **)&deviceOutput, sizeof(int));
cudaMemcpy(deviceInput, hostInput, inputLength * sizeof(int),cudaMemcpyHostToDevice);
reduce1<<<1,THREADS_PER_BLOCK>>>(deviceInput, deviceOutput);
cudaDeviceSynchronize();
cudaMemcpy(&hostOutput, deviceOutput,sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\n",hostOutput);
cudaFree(deviceInput);
cudaFree(deviceOutput);
return 0;
}
As talonmies said, you are using dynamic shared memory, but you are not allocating any memory space for it. You have to specify the size of this memory as the third argument of your kernel execution configuration.
reduce1<<<1, THREADS_PER_BLOCK, 64>>>(deviceInput, deviceOutput);
^^
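Note that 64 here is THREADS_PER_BLOCK (16) times sizeof(int) (4). A form that stays correct if the block size changes would be, for instance:
reduce1<<<1, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(int)>>>(deviceInput, deviceOutput);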
Another way to fix this code is to use static shared memory. Declare your shared memory like this:
__shared__ int sdata[16];
Please read this before asking CUDA questions.
I'm new to CUDA. I want to add two 2D arrays into a third array.
I use the following code:
cudaMallocPitch((void**)&device_a, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_b, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_c, &pitch, 2*sizeof(int),2);
Now my problem is that I don't want to use these arrays as flattened 2D arrays.
All I want to do in my kernel code is use two for loops and put the result in the third array, like this:
__global__ void add(int *dev_a, int *dev_b, int *dev_c)
{
for (int i=0;i<2;i++)
{
for (int j=0;j<2;j++)
{
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
How can I do this in CUDA?
Please tell me how to use a 2D array in this way.
What should the kernel call be for using a 2D array?
If possible, please explain using code samples.
The short answer is, you can't. The cudaMallocPitch() function does exactly what its name implies: it allocates pitched linear memory, where the pitch is chosen to be optimal for the GPU memory controller and texture hardware.
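For completeness, here is a sketch (my illustration, following the indexing pattern given in the cudaMallocPitch documentation; the kernel name is illustrative) of how the pitched allocations from the question would normally be accessed inside a kernel, with the pitch returned by cudaMallocPitch passed in as an extra argument:
__global__ void add_pitched(int *dev_a, int *dev_b, int *dev_c, size_t pitch)
{
    for (int i = 0; i < 2; i++) {
        // pitch is in bytes, so compute each row's base address on a char* and cast back
        int *row_a = (int *)((char *)dev_a + i * pitch);
        int *row_b = (int *)((char *)dev_b + i * pitch);
        int *row_c = (int *)((char *)dev_c + i * pitch);
        for (int j = 0; j < 2; j++) {
            row_c[j] = row_a[j] + row_b[j];
        }
    }
}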
If you wanted to use arrays of pointers in the kernel, the kernel code would have to look like this:
__global__ void add(int *dev_a[], int *dev_b[], int *dev_c[])
{
for (int i=0;i<2;i++) {
for (int j=0;j<2;j++) {
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
and then you would need nested cudaMalloc calls on the host side to construct the array of pointers and copy it to device memory. For your rather trivial 2x2 example, the code to allocate a single array would look like this:
int ** h_a = (int **)malloc(2 * sizeof(int *));
cudaMalloc((void**)&h_a[0], 2*sizeof(int));
cudaMalloc((void**)&h_a[1], 2*sizeof(int));
int **d_a;
cudaMalloc((void ***)&d_a, 2 * sizeof(int *));
cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice);
This would leave the allocated device array of pointers in d_a, and you would pass that to your kernel.
For code complexity and performance reasons, you really don't want to do that: using arrays of pointers in CUDA code is both harder and slower than the alternative using linear memory.
To show what folly using arrays of pointers is in CUDA, here is a complete working example of your sample problem which combines the two ideas above:
#include <cstdio>
__global__ void add(int * dev_a[], int * dev_b[], int * dev_c[])
{
for(int i=0;i<2;i++)
{
for(int j=0;j<2;j++)
{
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
inline void GPUassert(cudaError_t code, const char * file, int line, bool Abort=true)
{
if (code != 0) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
if (Abort) exit(code);
}
}
#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }
int main(void)
{
const int aa[2][2]={{1,2},{3,4}};
const int bb[2][2]={{5,6},{7,8}};
int cc[2][2];
int ** h_a = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_a[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_a[i], &aa[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int **d_a;
GPUerrchk(cudaMalloc((void ***)&d_a, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_b = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_b[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_b[i], &bb[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int ** d_b;
GPUerrchk(cudaMalloc((void ***)&d_b, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_b, h_b, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_c = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_c[i], 2*sizeof(int)));
}
int ** d_c;
GPUerrchk(cudaMalloc((void ***)&d_c, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_c, h_c, 2*sizeof(int *), cudaMemcpyHostToDevice));
add<<<1,1>>>(d_a,d_b,d_c);
GPUerrchk(cudaPeekAtLastError());
for(int i=0; i<2;i++){
GPUerrchk(cudaMemcpy(&cc[i][0], h_c[i], 2*sizeof(int), cudaMemcpyDeviceToHost));
}
for(int i=0;i<2;i++) {
for(int j=0;j<2;j++) {
printf("(%d,%d):%d\n",i,j,cc[i][j]);
}
}
return cudaThreadExit();
}
I recommend you study it until you understand what it does, and why it is such a poor idea compared to using linear memory.
You don't need to use for loops inside the kernel. Try this code.
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <time.h>
#define N 800
__global__ void matrixAdd(float* A, float* B, float* C){
int i = threadIdx.x;
int j = blockIdx.x;
C[N*j+i] = A[N*j+i] + B[N*j+i];
}
int main (void) {
clock_t start = clock();
float a[N][N], b[N][N], c[N][N];
float *dev_a, *dev_b, *dev_c;
cudaMalloc((void **)&dev_a, N * N * sizeof(float));
cudaMalloc((void **)&dev_b, N * N * sizeof(float));
cudaMalloc((void **)&dev_c, N * N * sizeof(float));
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
a[i][j] = rand() % 10;
b[i][j] = rand() % 10;
}
}
cudaMemcpy(dev_a, a, N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * N * sizeof(float), cudaMemcpyHostToDevice);
matrixAdd <<<N,N>>> (dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N * N * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
printf("[%d, %d ]= %f + %f = %f\n",i,j, a[i][j], b[i][j], c[i][j]);
}
}
printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
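One caveat about the code above (my note, not part of the original answer): with N defined as 800, the three stack arrays a, b and c occupy over 7 MB, which can overflow the default stack size on some platforms. A safer variant allocates them on the heap, for example:
// Heap-allocated replacements for the stack arrays; a[i][j] indexing still works.
float (*a)[N] = (float (*)[N])malloc(N * N * sizeof(float));
float (*b)[N] = (float (*)[N])malloc(N * N * sizeof(float));
float (*c)[N] = (float (*)[N])malloc(N * N * sizeof(float));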
I have implemented this code: http://www.cuvilib.com/Reduction.pdf in order to calculate the sum of the elements of a matrix.
However, on the GPU it runs much slower than on the CPU.
I have an i7 processor and an NVIDIA GT 540M graphics card.
Is it supposed to be that way, or is something else going on?
EDIT: I use version 3 of the above code on Ubuntu 13.04 and I compile it using Eclipse Nsight. The size of the matrix is 2097152 elements. It executes in 3.6 ms, whereas the CPU version takes around 1.0 ms. Below is the whole code:
#include <stdio.h>
#include <stdlib.h>
#include <thrust/sort.h>
#include <sys/time.h>
#include <omp.h>
#include <iostream>
#include <algorithm>
#define MIN(a,b) (((a)<(b))?(a):(b))
static const int WORK_SIZE = 2097152;
int find_min(int *a,int length){
int min = a[0];
for (int i=1;i<length;i++)
if (a[i]<min)
min=a[i];
return min;
}
__global__ static void red_min(int *g_idata,int *g_odata) {
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid]= g_idata[i];
__syncthreads();
for(unsigned int s=blockDim.x/2; s > 0; s >>= 1) {
if (tid<s) {
sdata[tid] = MIN(sdata[tid],sdata[tid + s]);
}
__syncthreads();
}
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
int main(void) {
int *d1,*d2;
int i,*result;
int *idata,*fdata;
srand ( time(NULL) );
result = (int *)malloc(sizeof(int));
idata = (int *)malloc(WORK_SIZE*sizeof(int));
fdata = (int *)malloc(WORK_SIZE*sizeof(int));
cudaMalloc((int**)&d1,WORK_SIZE*sizeof(int));
cudaMalloc((int**)&d2,WORK_SIZE*sizeof(int));
for (i = 0; i < WORK_SIZE; i++){
idata[i] = rand();
fdata[i] = i;
}
struct timeval begin, end;
gettimeofday(&begin, NULL);
*result = find_min(idata,WORK_SIZE);
printf( "Minimum Element CPU: %d \n", *result);
gettimeofday(&end, NULL);
int time = (end.tv_sec * (unsigned int)1e6 + end.tv_usec) - (begin.tv_sec * (unsigned int)1e6 + begin.tv_usec);
printf("Microseconds elapsed CPU: %d\n", time);
cudaMemcpy(d1,idata,WORK_SIZE*sizeof(int),cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate( &start);
cudaEventCreate( &stop);
cudaEventRecord(start,0);
int num_blocks = 16384;
bool flag = true;
while (num_blocks>0){
if (flag) {
red_min<<<num_blocks,128,128*sizeof(int)>>>(d1,d2);
}
else {
red_min<<<num_blocks,128,128*sizeof(int)>>>(d2,d1);
}
num_blocks /= 128;
flag = !flag;
}
GT540M is a mobile GPU, so I assume you're running on a laptop, and furthermore you may be hosting the X display on the 540M GPU.
I built a complete version of your code:
#include <stdio.h>
#include <stdlib.h>
#include <thrust/sort.h>
#include <sys/time.h>
#include <omp.h>
#include <iostream>
#include <algorithm>
#define MIN(a,b) (((a)<(b))?(a):(b))
static const int WORK_SIZE = 2097152;
int find_min(int *a,int length){
int min = a[0];
for (int i=1;i<length;i++)
if (a[i]<min)
min=a[i];
return min;
}
__global__ static void red_min(int *g_idata,int *g_odata) {
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid]= g_idata[i];
__syncthreads();
for(unsigned int s=blockDim.x/2; s > 0; s >>= 1) {
if (tid<s) {
sdata[tid] = MIN(sdata[tid],sdata[tid + s]);
}
__syncthreads();
}
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
int main(void) {
int *d1,*d2;
int i,*result;
int *idata,*fdata;
srand ( time(NULL) );
result = (int *)malloc(sizeof(int));
idata = (int *)malloc(WORK_SIZE*sizeof(int));
fdata = (int *)malloc(WORK_SIZE*sizeof(int));
cudaMalloc((int**)&d1,WORK_SIZE*sizeof(int));
cudaMalloc((int**)&d2,WORK_SIZE*sizeof(int));
for (i = 0; i < WORK_SIZE; i++){
idata[i] = rand();
fdata[i] = i;
}
struct timeval begin, end;
gettimeofday(&begin, NULL);
*result = find_min(idata,WORK_SIZE);
printf( "Minimum Element CPU: %d \n", *result);
gettimeofday(&end, NULL);
int time = (end.tv_sec * (unsigned int)1e6 + end.tv_usec) - (begin.tv_sec * (unsigned int)1e6 + begin.tv_usec);
printf("Microseconds elapsed CPU: %d\n", time);
cudaMemcpy(d1,idata,WORK_SIZE*sizeof(int),cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate( &start);
cudaEventCreate( &stop);
cudaEventRecord(start,0);
int num_blocks = 16384;
bool flag = true;
int loops = 0;
while (num_blocks>0){
if (flag) {
red_min<<<num_blocks,128,128*sizeof(int)>>>(d1,d2);
}
else {
red_min<<<num_blocks,128,128*sizeof(int)>>>(d2,d1);
}
num_blocks /= 128;
flag = !flag;
loops++;
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float et = 0.0f;
cudaEventElapsedTime(&et, start, stop);
printf("GPU time: %fms, in %d loops\n", et, loops);
int gpuresult;
if (flag)
cudaMemcpy(&gpuresult, d1, sizeof(int), cudaMemcpyDeviceToHost);
else
cudaMemcpy(&gpuresult, d2, sizeof(int), cudaMemcpyDeviceToHost);
printf("GPU min: %d\n", gpuresult);
return 0;
}
compiled it:
$ nvcc -O3 -arch=sm_20 -o t264 t264.cu
and ran it on a M2050 GPU, RHEL 5.5, CUDA 5.5, Xeon X5650 CPU
$ ./t264
Minimum Element CPU: 288
Microseconds elapsed CPU: 1217
GPU time: 0.621408ms, in 3 loops
GPU min: 288
$
So my CPU results were pretty close to yours, but my GPU results were about 5-6x faster. If we compare the M2050 to your GT540M, we see that the M2050 has 14 SMs whereas the GT540M has 2. More importantly, the M2050 has about 5x the memory bandwidth of your GT540M GPU (28.8 GB/s peak theoretical for the GT540M vs. ~150 GB/s peak theoretical for the M2050).
Since a well written parallel reduction is a memory bandwidth constrained code on GPUs, the speed difference between your GPU and my GPU makes sense.
So I would say your results are probably about what is expected, and to get better results you will probably need a faster GPU.
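As a rough sanity check (my addition, appended after the event timing in the code above; the helper variables are illustrative), you can turn the measured time into an achieved bandwidth and compare it against the card's theoretical peak. The dominant traffic is the first pass reading WORK_SIZE ints:
// Approximate effective bandwidth of the reduction sequence.
double bytesMoved = (double)WORK_SIZE * sizeof(int);       // first-pass reads dominate
double gbPerSec   = bytesMoved / (et * 1.0e-3) / 1.0e9;    // et is in milliseconds
printf("Approximate achieved bandwidth: %.1f GB/s\n", gbPerSec);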
Also, if your GT540M is also hosting an X display, it's possible that the GPU timing is corrupted by display activity. If we are timing a single kernel, this is not normally an issue - the kernel execution interrupts the display processing briefly. But when we are timing a sequence of kernels in succession, it's possible for the display tasks to jump in and execute in-between kernel calls (the GPU is multi-tasking when it is asked to both support a display and also process CUDA code). Therefore, this may be a possible performance impact in your case as well.
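If you suspect display interference, one common mitigation (a sketch reusing the variables from the code above; the repetition count is arbitrary, and only the first, dominant pass is timed) is to time many back-to-back launches and report the average, which smooths out occasional interruptions:
// Time 'reps' back-to-back launches of the first reduction pass (for timing
// purposes only) and report the average per-launch time.
const int reps = 100;
cudaEventRecord(start, 0);
for (int r = 0; r < reps; r++) {
    red_min<<<16384, 128, 128*sizeof(int)>>>(d1, d2);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float avg = 0.0f;
cudaEventElapsedTime(&avg, start, stop);
printf("Average first-pass time: %f ms\n", avg / reps);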
I have 2 kernels that do exactly the same thing. One of them allocates shared memory statically, while the other allocates the memory dynamically at run time. I am using the shared memory as a 2D array, so for the dynamic allocation I have a macro that computes the memory location. Now, the results generated by the 2 kernels are exactly the same. However, the timing results I got from the two kernels differ by a factor of 3: the static memory allocation version is much faster. I am sorry that I can't post any of my code. Can someone give a justification for this?
I have no evidence that static shared memory allocation is faster than dynamic shared memory allocation. As was evidenced in the comments above, it would be impossible to answer your question without a reproducer. In at least the case of the code below, the timings of the same kernel, when run with static or dynamic shared memory allocations, are exactly the same:
#include <cuda.h>
#include <stdio.h>
#define BLOCK_SIZE 512
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/***********************************/
/* SHARED MEMORY STATIC ALLOCATION */
/***********************************/
__global__ void kernel_static_memory_allocation(int *d_inout, int N)
{
__shared__ int s[BLOCK_SIZE];
const int tid = threadIdx.x;
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) {
s[tid] = d_inout[i];
__syncthreads();
s[tid] = s[tid] * s[tid];
__syncthreads();
d_inout[i] = s[tid];
}
}
/************************************/
/* SHARED MEMORY DYNAMIC ALLOCATION */
/************************************/
__global__ void kernel_dynamic_memory_allocation(int *d_inout, int N)
{
extern __shared__ int s[];
const int tid = threadIdx.x;
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) {
s[tid] = d_inout[i];
__syncthreads();
s[tid] = s[tid] * s[tid];
__syncthreads();
d_inout[i] = s[tid];
}
}
/********/
/* MAIN */
/********/
int main(void)
{
int N = 1000000;
int* a = (int*)malloc(N*sizeof(int));
for (int i = 0; i < N; i++) { a[i] = i; }
int *d_inout; gpuErrchk(cudaMalloc(&d_inout, N * sizeof(int)));
int n_blocks = N/BLOCK_SIZE + (N%BLOCK_SIZE == 0 ? 0:1);
gpuErrchk(cudaMemcpy(d_inout, a, N*sizeof(int), cudaMemcpyHostToDevice));
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
kernel_static_memory_allocation<<<n_blocks,BLOCK_SIZE>>>(d_inout, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Static allocation - elapsed time: %3.3f ms \n", time);
cudaEventRecord(start, 0);
kernel_dynamic_memory_allocation<<<n_blocks,BLOCK_SIZE,BLOCK_SIZE*sizeof(int)>>>(d_inout, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Dynamic allocation - elapsed time: %3.3f ms \n", time);
}
The likely reason is that the disassembled code for the two kernels is exactly the same, and it does not change even when int N = 1000000; is replaced by int N = rand();.
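If you want to verify this on your own code, one way (a sketch; the file names and architecture flag are illustrative) is to dump the generated SASS for both kernels and compare them:
$ nvcc -arch=sm_35 -cubin -o shared_timing.cubin shared_timing.cu
$ cuobjdump -sass shared_timing.cubin > shared_timing.sass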