cudaMallocManaged and cudaDeviceSynchronize() - cuda

I have the following two mostly identical example codes. code1.cu use cudaMalloc and cudaMemcpy to handling device/host variable value exchange.
The code2.cu use cudaMallocManaged and thus cudaMemcpy is not needed. When cudaMallocManaged is used, I have to include cudaDeviceSynchronize() to get the correct results, while for the one with cudaMalloc, this is not needed. I would appreciate some hint on why this is happening
code2.cu
#include <iostream>
#include <math.h>
#include <vector>
//
using namespace std;
// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if(i < max_x && j<max_y) {
atomicAdd(tot, x[i] + y[j]);
}
}
int main(void)
{
int Nx = 1<<15;
int Ny = 1<<15;
float *d_x = NULL, *d_y = NULL;
float *d_tot = NULL;
cudaMalloc((void **)&d_x, sizeof(float)*Nx);
cudaMalloc((void **)&d_y, sizeof(float)*Ny);
cudaMallocManaged((void **)&d_tot, sizeof(float));
// Allocate Unified Memory – accessible from CPU or GPU
vector<float> vx;
vector<float> vy;
// initialize x and y arrays on the host
for (int i = 0; i < Nx; i++)
vx.push_back(i);
for (int i = 0; i < Ny; i++)
vy.push_back(i*10);
//
float tot = 0;
for(int i = 0; i<vx.size(); i++)
for(int j = 0; j<vy.size(); j++)
tot += vx[i] + vy[j];
cout<<"CPU: tot: "<<tot<<endl;
//
cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);
//
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);
//.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
//.. bx*bx = blockSize && bx/by=Nx/Ny, solve the equation
int bx = sqrt(blockSize*Nx/(float)Ny);
int by = bx*Ny/(float)Nx;
dim3 blockSize_3D(bx, by);
dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);
cout<<"blockSize: "<<blockSize<<endl;
cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);
int device;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor /
props.warpSize);
printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
blockSize, occupancy);
// Run kernel on 1M elements on the GPU
tot = 0;
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
// Wait for GPU to finish before accessing on host
//cudaDeviceSynchronize();
tot =*d_tot;
//
//
cout<<" GPU: tot: "<<tot<<endl;
// Free memory
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_tot);
return 0;
}
code1.cu
#include <iostream>
#include <math.h>
#include <vector>
//
using namespace std;
// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if(i < max_x && j<max_y) {
atomicAdd(tot, x[i] + y[j]);
}
}
int main(void)
{
int Nx = 1<<15;
int Ny = 1<<15;
float *d_x = NULL, *d_y = NULL;
float *d_tot = NULL;
cudaMalloc((void **)&d_x, sizeof(float)*Nx);
cudaMalloc((void **)&d_y, sizeof(float)*Ny);
cudaMalloc((void **)&d_tot, sizeof(float));
// Allocate Unified Memory – accessible from CPU or GPU
vector<float> vx;
vector<float> vy;
// initialize x and y arrays on the host
for (int i = 0; i < Nx; i++)
vx.push_back(i);
for (int i = 0; i < Ny; i++)
vy.push_back(i*10);
//
float tot = 0;
for(int i = 0; i<vx.size(); i++)
for(int j = 0; j<vy.size(); j++)
tot += vx[i] + vy[j];
cout<<"CPU: tot: "<<tot<<endl;
//
cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);
//
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);
//.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
//.. bx*bx = blockSize && bx/by=Nx/Ny, solve the equation
int bx = sqrt(blockSize*Nx/(float)Ny);
int by = bx*Ny/(float)Nx;
dim3 blockSize_3D(bx, by);
dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);
cout<<"blockSize: "<<blockSize<<endl;
cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);
int device;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor /
props.warpSize);
printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
blockSize, occupancy);
// Run kernel on 1M elements on the GPU
tot = 0;
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
// Wait for GPU to finish before accessing on host
//cudaDeviceSynchronize();
//
cudaMemcpy(&tot, d_tot, sizeof(float), cudaMemcpyDeviceToHost);
//
cout<<" GPU: tot: "<<tot<<endl;
// Free memory
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_tot);
return 0;
}
//Code2.cu has the following output:
//
//CPU: tot: 8.79609e+12
//blockSize: 1024
//bx: 32 by: 32 gx: 1024 gy: 1025
//Launched blocks of size 1024. Theoretical occupancy: 1.000000
//GPU: tot: 0
After remove the comment on cudaDeviceSynchronize(),
GPU: tot: 8.79609e+12

CUDA kernel launches are asynchronous. That means that they execute independently of the CPU thread that launched them.
Because of this asynchronous launch, the CUDA kernel is not guaranteed to be finished (or even started) by the time your CPU thread code begins testing the result.
Therefore it is necessary to wait until the GPU kernel is complete, and cudaDeviceSynchronize() does exactly that. cudaMemcpy also has a synchronizing effect, so when you remove the cudaMemcpy operations, you lose that synchronization, but cudaDeviceSynchronize() restores it.

Related

Performance difference due to indexing during matrix multiplication

I'm trying out the difference between using a tiled and naive implementation in CUDA C++. I expect to see a performance gap in these variations because of the repeated usage of shared memory. However, the speedup was only about twice as fast (naive ~12ms and tiled ~6ms). Here are the code snippets:
#include <iostream>
#include <assert.h>
using namespace std;
# define N 1024
# define THREADS 16
# define IDX(x, y, s) (x*s + y)
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
void init_values(int *a, int *b, int sz) {
for(int i=0; i<sz; i++) {
a[i] = rand()%513 - 256;
b[i] = rand()%513 - 256;
}
}
__global__
void matmul(int *a, int *b, int *c, int n) {
// perform parallel matmul
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int t = 0;
for(int i=0; i<n; i++) {
t += (a[IDX(x, i, n)] * b[IDX(i, y, n)]);
}
c[IDX(x, y, n)] = t;
}
void matmul_verify(int *a, int *b, int *c, int n) {
for(int i=0; i<n; i++) {
for(int j=0; j<n; j++) {
int t = 0;
for(int k=0; k<n; k++)
t += a[IDX(i, k, n)] * b[IDX(k, j, n)];
// cout << i << " " << j << " " << c[IDX(i, j, n)] << " " << t << endl;
assert(c[IDX(i, j, n)] == t);
}
}
}
int main()
{
int *a, *b, *c;
int *da, *db, *dc;
size_t sz = N * N * sizeof(int);
a = (int*)malloc(sz);
b = (int*)malloc(sz);
c = (int*)malloc(sz);
init_values(a, b, N*N);
gpuErrchk(cudaMalloc((void**)&da, sz));
gpuErrchk(cudaMalloc((void**)&db, sz));
gpuErrchk(cudaMalloc((void**)&dc, sz));
gpuErrchk(cudaMemcpy(da, a, sz, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(db, b, sz, cudaMemcpyHostToDevice));
// init grid size
dim3 grids(N/THREADS, N/THREADS);
dim3 blocks(THREADS, THREADS);
// time it
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
matmul<<<grids, blocks>>>(da, db, dc, N);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
cout << "Took " << milliseconds << " milliseconds.\n";
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(c, dc, sz, cudaMemcpyDeviceToHost));
matmul_verify(a, b, c, N);
cudaFree(da);
cudaFree(db);
cudaFree(dc);
free(a);
free(b);
free(c);
cudaEventDestroy(start);
cudaEventDestroy(stop);
return 0;
}
and for the tiled implementation, I change the kernel as
__global__
void matmul(int *a, int *b, int *c, int n) {
// perform parallel matmul
int ty = threadIdx.y, by = blockIdx.y;
int tx = threadIdx.x, bx = blockIdx.x;
int x = bx * blockDim.x + tx;
int y = by * blockDim.y + ty;
// block IDs tell us which block to solve for
// (bx, by) --> (bx: bx + tx, by:by + ty)
__shared__ int A[SHMEM_SIZE];
__shared__ int B[SHMEM_SIZE];
const int tile_size = THREADS;
// to get value of tile [tx, ty] in block [bx, by], we need blocks A[bx, *] and blocks B[*, by]
int res = 0;
for(int blk=0; blk < n; blk+=tile_size) {
// block index
A[IDX(tx, ty, tile_size)] = a[IDX(x, blk + ty, n)];
B[IDX(tx, ty, tile_size)] = b[IDX(blk + tx, y, n)];
__syncthreads();
for(int k=0; k<tile_size; k++) {
res += (A[IDX(tx, k, tile_size)] * B[IDX(k, ty, tile_size)]);
}
__syncthreads();
}
// for(int k=0; k<n; k++)
// res += a[IDX(x, k, n)] * b[IDX(k, y, n)];
c[IDX(x, y, n)] = res;
}
nothing else really changes. However, in the tiled implementation, if I simply change
int ty = threadIdx.x, by = blockIdx.x;
int tx = threadIdx.y, bx = blockIdx.y;
for the initialization of thread and block indices, I get about a ~1ms runtime (12x speedup). How is this happening? I read from the book "CUDA By Example" that the thread and block indices in 2 dimensions are just for programmer convenience and do not reflect any difference in performance. This seems to be false. Any clarification is really appreciated.
CUDA thread blocks are partitioned into warps of 32 threads. Ideally the neighboring lanes of a warp should always load neighboring elements from global memory. This is called coalescing and allows for maximum memory bandwidth. In hardware all the coalesced loads from a warp will be bundled into a minimal number of memory transactions.
Other factors that can deteriorate memory bandwidth are the size of the load (one can try to use the builtin vector types to get bigger loads for optimization, e.g. int2, int4, float2, etc.) and alignment.
The mapping from 3D threadIdx to warp lanes always takes the first dimension .x as the continuous dimension, i.e. a block of dimensions (32, 2, 1) will have one warp with threadIdx.y == 0 and one warp with threadIdx.y == 1 where the lanes of each warp correspond to threadIdx.x.
Therefore to allow for coalescing, you have to access memory as
A[ty * s + tx] // coalesced access
instead of
A[tx * s + ty] // strided access
to achieve optimal performance.
What is probably meant in the book you mentioned is that there shouldn't be a performance difference between launching a grid of (32, 2, 1) blocks and a grid of (64, 1, 1) blocks while manually getting ty = threadIdx.x / 32 and tx = threadIdx.x % 32. These divisions probably happen internally when having a block that is not flat in the first place.

Cuda program not working

i'm a beginner in cuda programming. I'm trying an own easy code but it's not working and I don't know what else to do.
My code:
#include <mpi.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/wait.h>
// Prototypes
__global__ void helloWorld(char*);
__device__ int getGlobalIdx_2D_2D();
// Host function
int main(int argc, char** argv)
{
unsigned int i, N, gridX, gridY, blockX, blockY;
N = 4096000;
char *str = (char *) malloc(N*sizeof(char));
for(i=0; i < N; i++) str[i]='c';
MPI_Init (&argc, &argv);
char *d_str;
size_t size = (size_t) N*sizeof(char);
cudaMalloc((void**)&d_str, size);
cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
gridX = 100;
gridY = 10;
blockX = blockY = 64;
dim3 dimGrid(gridX, gridY); // 4096 chars per block
dim3 dimBlock(blockX, blockY); // one thread per character, 2D
printf("dimGrid(%d, %d)\t", gridX, gridY);
printf("dimBlock(%d, %d)\t", blockX, blockY);
helloWorld<<< dimGrid, dimBlock >>>(d_str);
cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
MPI_Barrier (MPI_COMM_WORLD);
cudaFree(d_str);
printf("\nRes:\n");
for(i = 0; i < N; i++) printf("\t[%u] %c\n", i, str[i]);
MPI_Finalize ();
free(str);
return 0.0;
}
// Device kernel
__global__ void helloWorld(char* str)
{
// determine where in the thread grid we are
int pos = getGlobalIdx_2D_2D();
if (pos % 2 == 0) str[pos] -= 2;
else str[pos] += 8;
}
__device__ int getGlobalIdx_2D_2D()
{
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y) +
(threadIdx.y * blockDim.x) + threadIdx.x;
return threadId;
}
My desired output is: jajajajajajaja... x4096000
I've read that '%' operation is not efficient, but I don't think is the problem there.
Thanks!
You are performing absolutely no CUDA error checking, it is really beneficial to do so. Once you enable it you can find that block dimensions 64 x 64 are invalid as it results into 4096 threads within one block, which is not a valid configuration.

loop unrolling with dynamic parallelism decrease the time performance

I have a simple program to calculate square root, loop unrolling was done as
loop unrolling
#include <stdio.h>
#include <cuda.h>
__global__ void square(float *a, int N,int idx);
// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
int idx = 2*(blockIdx.x * blockDim.x + threadIdx.x);
int n=N;
//printf("%d\n",n);
for(int q=0;q<2;q++)
{
if(N<2000)
{
arr[idx+q] = arr[idx+q] * arr[idx+q];
}
}
}
// main routine that executes on the host
int main(void)
{
clock_t start = clock(),diff;
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 1000; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 4;
//int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
first <<< 4, 128 >>> (a_d, N);
//cudaThreadSynchronize();
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
diff = clock() - start;
int msec = diff * 1000 / CLOCKS_PER_SEC;
printf("Time taken %d seconds %d milliseconds\n", msec/1000, msec%1000);
}
then realizing that the loop calculation can be minimized with dynamic parallelism .
unrolling with dynamic parallelism was implemented as
unrolling with dynamic parallelism
#include <stdio.h>
#include <cuda.h>
__global__ void square(float *a, int N,int idx);
// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
int idx = 2*(blockIdx.x * blockDim.x + threadIdx.x);
int n=N;
square <<< 1,2 >>> (arr, n,idx);
}
__global__ void square(float *a, int N,int idx)
{
int tdx = blockIdx.x * blockDim.x + threadIdx.x;
printf("%d\n",N);
if(N<2000)
{
a[tdx+idx] = a[tdx+idx] * a[tdx+idx];
}
}
// main routine that executes on the host
int main(void)
{
clock_t start = clock(),diff;
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 1000; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 4;
//int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
first <<< 4, 128 >>> (a_d, N);
//cudaThreadSynchronize();
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
diff = clock() - start;
int msec = diff * 1000 / CLOCKS_PER_SEC;
printf("Time taken %d seconds %d milliseconds\n", msec/1000, msec%1000);
}
the implementation of dynamic parallelism with unrolling takes more time for executio than only unrolling. Aren,t we suppose to improve execution time with dynamic parallelism in such case?
Dynamic parallelism is mainly useful in cases where you have parallelism that is dynamic. That is: cases where you don't know how much parallelism you're going to need until you've done some calculation. Rather than transfer data back to the host which is then instantly fed into parameterising another launch, you launch from within the kernel. In this pattern, with memcpys between kernel launches avoided, you'll see speedup.
In your example above this is not the case. You could have just launched twice as many threads from the host. There's nothing dynamic required as there's no parallelism available there that you didn't know about at the time of the first kernel launch.
Furthermore, performance requirements for kernels launched using dynamic parallelism are similar to that of those launched from the host. You have to launch a reasonable amount of work or the launch latency will dominate your computation time.

the Kernel delay increase by increasing the blocksPerGrid and threadsPerBlock in CUDA VecAdd example, what does it mean?

when i tested the following example, i found that by increasing the blocksPerGrid and threadsPerBlock the Kernel delay increase
such that if
int threadsPerBlock = 1;
int blocksPerGrid = 1;
blocksPerGrid and threadsPerBlock equal 1 the delay of the kernel = .0072 ms
but when i make the following it the delay become higher = .049 ms
int threadsPerBlock = 1024;
int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock;
where
N = 50000; //the no. of array elements
on the following the complete VecAdd example. you can test it
// Includes
#include <stdio.h>
#include <cutil_inline.h>
#include <shrQATest.h>
// Variables
float* h_A;
float* h_B;
float* h_C;
float* d_A;
float* d_B;
float* d_C;
bool noprompt = false;
// Functions
void CleanupResources(void);
void RandomInit(float*, int);
void ParseArguments(int, char**);
// Device code
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N)
C[i] = A[i] + B[i];
}
// Host code
int main(int argc, char** argv)
{
shrQAStart(argc, argv);
cudaEvent_t event1, event2;
cudaEventCreate(&event1);
cudaEventCreate(&event2);
printf("Vector Addition\n");
int N = 50000;
size_t size = N * sizeof(float);
ParseArguments(argc, argv);
// Allocate input vectors h_A and h_B in host memory
h_A = (float*)malloc(size);
if (h_A == 0) CleanupResources();
h_B = (float*)malloc(size);
if (h_B == 0) CleanupResources();
h_C = (float*)malloc(size);
if (h_C == 0) CleanupResources();
// Initialize input vectors
RandomInit(h_A, N);
RandomInit(h_B, N);
// Allocate vectors in device memory
cutilSafeCall( cudaMalloc((void**)&d_A, size) );
cutilSafeCall( cudaMalloc((void**)&d_B, size) );
cutilSafeCall( cudaMalloc((void**)&d_C, size) );
// Copy vectors from host memory to device memory
cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice) );
// Invoke kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock;
cudaEventRecord(event1, 0);
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
cudaEventRecord(event2, 0);
cudaEventSynchronize(event1); //optional
cudaEventSynchronize(event2);
float dt_ms;
cudaEventElapsedTime(&dt_ms, event1, event2);
printf("delay_time = %f\n", dt_ms);
cutilCheckMsg("kernel launch failure");
#ifdef _DEBUG
cutilSafeCall( cutilDeviceSynchronize() );
#endif
// Copy result from device memory to host memory
// h_C contains the result in host memory
cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );
// Verify result
int i;
for (i = 0; i < N; ++i) {
float sum = h_A[i] + h_B[i];
if (fabs(h_C[i] - sum) > 1e-5)
break;
}
CleanupResources();
shrQAFinishExit(argc, (const char **)argv, (i==N) ? QA_PASSED : QA_FAILED);
}
void CleanupResources(void)
{
// Free device memory
if (d_A)
cudaFree(d_A);
if (d_B)
cudaFree(d_B);
if (d_C)
cudaFree(d_C);
// Free host memory
if (h_A)
free(h_A);
if (h_B)
free(h_B);
if (h_C)
free(h_C);
cutilDeviceReset();
}
// Allocates an array with random float entries.
void RandomInit(float* data, int n)
{
for (int i = 0; i < n; ++i)
data[i] = rand() / (float)RAND_MAX;
}
// Parse program arguments
void ParseArguments(int argc, char** argv)
{
for (int i = 0; i < argc; ++i) {
if (strcmp(argv[i], "--noprompt") == 0 ||
strcmp(argv[i], "-noprompt") == 0)
{
noprompt = true;
break;
}
}
}
can any one explain for me what does it mean?
In case 1, a kernel of size 1 thread is launched and performs 2 reads and 1 write operation. In case 2, a kernel of size 50176 threads are launched and perform 100,000 reads and 50,000 writes operations. Increasing the workload by 50,000 increased execution time by ~7x. The work done by the two launches is significantly different.

Why is texture memory version of below program slower than global memory version

i am confused why my texture version is slower than my global memory version because the texture version should exploit spatial locality. I am trying to compute the dot product in the below case. Thus, if one thread accesses index i, its neighbour should access i+1. Thus, we see spatial locality.
Below is the texture memory version:
#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))
texture<float> arr1;
texture<float> arr2;
const int n = 4;
__global__ void addVal( float *c){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
//Using shared memory to temporary store results
__shared__ float cache[TPB];
float temp = 0;
while(tid < n){
temp += tex1Dfetch(arr1,tid) * tex1Dfetch(arr2,tid);
tid += gridDim.x * blockDim.x;
}
cache[threadIdx.x] = temp;
__syncthreads();
int i = blockDim.x/2;
while( i !=0){
if(threadIdx.x < i){
cache[threadIdx.x] = cache[threadIdx.x] +cache[threadIdx.x + i] ;
}
__syncthreads();
i = i/2;
}
if(threadIdx.x == 1){
c[blockIdx.x ] = cache[0];
}
}
int main(){
float a[n] , b[n] , c[BPG];
float *deva, *devb, *devc;
int i;
//Filling with random values to test
for( i =0; i< n; i++){
a[i] = i;
b[i] = i*2;
}
printf("Not using constant memory\n");
cudaMalloc((void**)&deva, n * sizeof(float));
cudaMalloc((void**)&devb, n * sizeof(float));
cudaMalloc((void**)&devc, BPG * sizeof(float));
cudaMemcpy(deva, a, n *sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(devb, b, n *sizeof(float), cudaMemcpyHostToDevice);
cudaBindTexture(NULL,arr1, deva,sizeof(float) * n); // note: deva shd be in gpu
cudaBindTexture(NULL,arr2, devb,sizeof(float) * n); // note: deva shd be in gpu
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//Call function to do dot product
addVal<<<BPG, TPB>>>(devc);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time;
cudaEventElapsedTime(&time,start, stop);
printf("The elapsed time is: %f\n", time);
//copy result back
cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
float sum =0 ;
for ( i = 0 ; i< BPG; i++){
sum+=c[i];
}
//display answer
printf("%f\n",sum);
cudaUnbindTexture(arr1);
cudaUnbindTexture(arr2);
cudaFree(devc);
getchar();
return 0;
}
Global Memory version:
#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))
const int n = 4;
__global__ void addVal(float *a, float *b, float *c){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
//Using shared memory to temporary store results
__shared__ float cache[TPB];
float temp = 0;
while(tid < n){
temp += a[tid] * b[tid];
tid += gridDim.x * blockDim.x;
}
cache[threadIdx.x] = temp;
__syncthreads();
int i = blockDim.x/2;
while( i !=0){
if(threadIdx.x < i){
cache[threadIdx.x] = cache[threadIdx.x] +cache[threadIdx.x + i] ;
}
__syncthreads();
i = i/2;
}
if(threadIdx.x == 1){
c[blockIdx.x ] = cache[0];
}
}
int main(){
float a[n] , b[n] , c[BPG];
float *deva, *devb, *devc;
int i;
//Filling with random values to test
for( i =0; i< n; i++){
a[i] = i;
b[i] = i*2;
}
printf("Not using constant memory\n");
cudaMalloc((void**)&deva, n * sizeof(float));
cudaMalloc((void**)&devb, n * sizeof(float));
cudaMalloc((void**)&devc, BPG * sizeof(float));
cudaMemcpy(deva, a, n *sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(devb, b, n *sizeof(float), cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//Call function to do dot product
addVal<<<BPG, TPB>>>(deva, devb, devc);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time;
cudaEventElapsedTime(&time,start, stop);
printf("The elapsed time is: %f\n", time);
//copy result back
cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
float sum =0 ;
for ( i = 0 ; i< BPG; i++){
sum+=c[i];
}
//display answer
printf("%f\n",sum);
getchar();
return 0;
}
While know your graphic device may help, for some type of problems, with compute capability 2.x the L1 and L2 cache works as good the texture cache.
In this case, you are not exploiting the texture cache, as you only read once value per thread. On the other chand, you are exploiting spatial locality in 1D what can be hide with global memory coalesced access.
I recommend you the book 'CUDA by Example: An Introduction to General-Purpose GPU Programming'. Great book for beginners. With graphics examples like JuliaSet or a very basic Raycasting (there are also the common add, reduce and dot product examples if you prefer thouse :).
Hope this help.
Further to pQB's answer, there is no data reuse in your program -- each input is read only once, and used only once. Memory indices are sequential across threads, and therefore perfectly coalesced. Because of these two reasons, there is no need for any device memory cacheing, so global memory access is more efficient than texture access. Add to this additional latency overhead in the texture cache (texture cache is designed to increase throughput, not decrease latency, unlike L1/L2 data caches), and the slowdown is explained.
BTW, what you are doing is a parallel reduction, so you may want to see the "reduction" example in the CUDA SDK for a fast implementation.