Hi all,
I referred to simpleMultiCopy.cu in the CUDA SDK 4.0 and wrote my own version; see the code below.
simpleMultiCopy.cu demonstrates overlapping copy and compute operations in a loop. Mine is similar: each iteration of the loop sends a slice of data to the GPU for processing, with the copies and the kernel overlapped.
This is just a test/demo, so don't worry about the logic of the kernel (increment_kernel); it is only there to burn some time. The main logic lies in the processWithStreams function.
But this program fails with the following output:
i: 0, current_stream: 0, next_stream: 1
i: 1, current_stream: 1, next_stream: 0
Cuda error in file 'ttt.cu' in line 132 : unspecified launch failure.
line 132 is:
CUDA_SAFE_CALL( cudaMemcpyAsync(
d_data_in[next_stream],
h_data_in[next_stream],
memsize,
cudaMemcpyHostToDevice,
stream[next_stream]) ); //this is line 132
I don't have much idea of how CUDA works under the hood, so please help.
Any help will be appreciated.
Code:
#include <stdio.h>
#include <cutil_inline.h>
float processWithStreams(int streams_used);
#define STREAM_COUNT 2
int N = 1 << 24;
int *h_data_source;
int *h_data_sink;
int *h_data_in[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];
int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];
cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];
cudaEvent_t start, stop;
dim3 block(512);
dim3 grid;
int memsize;
__global__ void increment_kernel(int *g_data, int inc_value)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
//g_data[idx] = g_data[idx] + inc_value;
int i = blockDim.x * gridDim.x;
for(; i > 0; i /= 2)
{
if(idx > i)
g_data[idx]++;
}
}
int main(int argc, char *argv[])
{
if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId());
h_data_source = (int *)malloc(sizeof(int) * N);
memset(h_data_source, 0, sizeof(int) * N);
int i;
memsize = 1024 * 1024 * sizeof(int);
for(i = 0; i < STREAM_COUNT; i++)
{
CUDA_SAFE_CALL( cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault) );
CUDA_SAFE_CALL( cudaMalloc(&d_data_in[i], memsize) );
CUDA_SAFE_CALL( cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault) );
CUDA_SAFE_CALL( cudaMalloc(&d_data_out[i], memsize) );
CUDA_SAFE_CALL( cudaStreamCreate(&stream[i]) );
CUDA_SAFE_CALL( cudaEventCreate(&cycleDone[i]) );
cudaEventRecord(cycleDone[i], stream[i]);
}
CUDA_SAFE_CALL( cudaEventCreate(&start) );
CUDA_SAFE_CALL( cudaEventCreate(&stop) );
grid.x = N / block.x;
grid.y = 1;
float time1 = processWithStreams(STREAM_COUNT);
printf("time: %f\n", time1);
free( h_data_source );
free( h_data_sink );
for( i = 0; i < STREAM_COUNT; ++i ) {
cudaFreeHost(h_data_in[i]);
cudaFree(d_data_in[i]);
cudaStreamDestroy(stream[i]);
cudaEventDestroy(cycleDone[i]);
}
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaThreadExit();
cutilExit(argc, argv);
return 0;
}
float processWithStreams(int streams_used) {
int current_stream = 0;
float time;
cudaEventRecord(start, 0);
for( int i=0; i < N / 1024 / 1024; ++i ) {
int next_stream = (current_stream + 1 ) % streams_used;
printf("i: %d, current_stream: %d, next_stream: %d\n", i, current_stream, next_stream);
// Ensure that processing and copying of the last cycle has finished
cudaEventSynchronize(cycleDone[next_stream]);
// Process current frame
increment_kernel<<<grid, block, 0, stream[current_stream]>>>(
d_data_in[current_stream], 1);
// Upload next frame
CUDA_SAFE_CALL( cudaMemcpyAsync(
d_data_in[next_stream],
h_data_in[next_stream],
memsize,
cudaMemcpyHostToDevice,
stream[next_stream]) );
CUDA_SAFE_CALL( cudaEventRecord(
cycleDone[next_stream],
stream[next_stream]) );
// Download current frame
CUDA_SAFE_CALL( cudaMemcpyAsync(
h_data_out[current_stream],
d_data_out[current_stream],
memsize,
cudaMemcpyDeviceToHost,
stream[current_stream]) );
CUDA_SAFE_CALL( cudaEventRecord(
cycleDone[current_stream],
stream[current_stream]) );
current_stream = next_stream;
}
cudaEventRecord(stop, 0);
cudaEventElapsedTime(&time, start, stop);
return time;
}
The problem is in your kernel. One thing to know about error checking in CUDA is that errors which occurred previously and went unchecked will be reported the next time you check for an error. That line is the first place you check for errors after the kernel launch, which is what returned the error you are seeing.
The error unspecified launch failure is usually associated with out of bounds accesses to memory if I recall correctly.
You are launching your kernel with 32768 blocks and 512 threads per block. Calculating the idx value for the last thread of the last block we have 32767 * 512 + 511 = 16777215. In the first iteration idx < i and in the following ones you are trying to read and write to position 16777215 of g_data when you only allocated space for 1024 * 1024 integers.
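As a minimal sketch of the kind of fix this implies (the num_elements parameter is my addition, not part of the original code): pass the number of elements actually allocated for one slice (memsize / sizeof(int)) and guard every access against it. If the time-wasting loop is kept, the same guard must protect the access inside it.
__global__ void increment_kernel(int *g_data, int inc_value, int num_elements)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num_elements)   // prevents the out-of-bounds read/write
        g_data[idx] += inc_value;
}
The launch configuration should then cover one slice, e.g. grid.x = (memsize / sizeof(int) + block.x - 1) / block.x, rather than N / block.x.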
edit: just noticed, why the tag operator overloading?
I have the following two mostly identical example codes. code1.cu uses cudaMalloc and cudaMemcpy to handle device/host value exchange.
code2.cu uses cudaMallocManaged, so cudaMemcpy is not needed. When cudaMallocManaged is used, I have to include cudaDeviceSynchronize() to get the correct results, while for the cudaMalloc version this is not needed. I would appreciate some hint on why this is happening.
code2.cu
#include <iostream>
#include <math.h>
#include <vector>
//
using namespace std;
// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if(i < max_x && j<max_y) {
atomicAdd(tot, x[i] + y[j]);
}
}
int main(void)
{
int Nx = 1<<15;
int Ny = 1<<15;
float *d_x = NULL, *d_y = NULL;
float *d_tot = NULL;
cudaMalloc((void **)&d_x, sizeof(float)*Nx);
cudaMalloc((void **)&d_y, sizeof(float)*Ny);
cudaMallocManaged((void **)&d_tot, sizeof(float));
// Allocate Unified Memory – accessible from CPU or GPU
vector<float> vx;
vector<float> vy;
// initialize x and y arrays on the host
for (int i = 0; i < Nx; i++)
vx.push_back(i);
for (int i = 0; i < Ny; i++)
vy.push_back(i*10);
//
float tot = 0;
for(int i = 0; i<vx.size(); i++)
for(int j = 0; j<vy.size(); j++)
tot += vx[i] + vy[j];
cout<<"CPU: tot: "<<tot<<endl;
//
cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);
//
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve maximum occupancy
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);
//.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
//.. bx*bx = blockSize && bx/by=Nx/Ny, solve the equation
int bx = sqrt(blockSize*Nx/(float)Ny);
int by = bx*Ny/(float)Nx;
dim3 blockSize_3D(bx, by);
dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);
cout<<"blockSize: "<<blockSize<<endl;
cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);
int device;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor /
props.warpSize);
printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
blockSize, occupancy);
// Run kernel on 1M elements on the GPU
tot = 0;
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
// Wait for GPU to finish before accessing on host
//cudaDeviceSynchronize();
tot =*d_tot;
//
//
cout<<" GPU: tot: "<<tot<<endl;
// Free memory
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_tot);
return 0;
}
code1.cu
#include <iostream>
#include <math.h>
#include <vector>
//
using namespace std;
// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if(i < max_x && j<max_y) {
atomicAdd(tot, x[i] + y[j]);
}
}
int main(void)
{
int Nx = 1<<15;
int Ny = 1<<15;
float *d_x = NULL, *d_y = NULL;
float *d_tot = NULL;
cudaMalloc((void **)&d_x, sizeof(float)*Nx);
cudaMalloc((void **)&d_y, sizeof(float)*Ny);
cudaMalloc((void **)&d_tot, sizeof(float));
// Allocate Unified Memory – accessible from CPU or GPU
vector<float> vx;
vector<float> vy;
// initialize x and y arrays on the host
for (int i = 0; i < Nx; i++)
vx.push_back(i);
for (int i = 0; i < Ny; i++)
vy.push_back(i*10);
//
float tot = 0;
for(int i = 0; i<vx.size(); i++)
for(int j = 0; j<vy.size(); j++)
tot += vx[i] + vy[j];
cout<<"CPU: tot: "<<tot<<endl;
//
cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);
//
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve maximum occupancy
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);
//.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
//.. bx*bx = blockSize && bx/by=Nx/Ny, solve the equation
int bx = sqrt(blockSize*Nx/(float)Ny);
int by = bx*Ny/(float)Nx;
dim3 blockSize_3D(bx, by);
dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);
cout<<"blockSize: "<<blockSize<<endl;
cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);
int device;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor /
props.warpSize);
printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
blockSize, occupancy);
// Run kernel on 1M elements on the GPU
tot = 0;
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
// Wait for GPU to finish before accessing on host
//cudaDeviceSynchronize();
//
cudaMemcpy(&tot, d_tot, sizeof(float), cudaMemcpyDeviceToHost);
//
cout<<" GPU: tot: "<<tot<<endl;
// Free memory
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_tot);
return 0;
}
//Code2.cu has the following output:
//
//CPU: tot: 8.79609e+12
//blockSize: 1024
//bx: 32 by: 32 gx: 1024 gy: 1025
//Launched blocks of size 1024. Theoretical occupancy: 1.000000
//GPU: tot: 0
After removing the comment on cudaDeviceSynchronize(), the output becomes:
GPU: tot: 8.79609e+12
CUDA kernel launches are asynchronous. That means that they execute independently of the CPU thread that launched them.
Because of this asynchronous launch, the CUDA kernel is not guaranteed to be finished (or even started) by the time your CPU thread code begins testing the result.
Therefore it is necessary to wait until the GPU kernel is complete, and cudaDeviceSynchronize() does exactly that. cudaMemcpy also has a synchronizing effect, so when you remove the cudaMemcpy operations, you lose that synchronization, but cudaDeviceSynchronize() restores it.
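As a minimal sketch of the fix in code2.cu (same names as above, error checking omitted):
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
cudaDeviceSynchronize(); // block the host until the kernel has finished
tot = *d_tot;            // safe now: the managed value is up to date
In code1.cu, the cudaMemcpy of d_tot back to the host provides the equivalent synchronization point.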
I am very new to CUDA programming. Currently I have difficulty understanding the behavior of the following program, which calculates the dot product of two vectors.
The dot product kernel, dotProd, calculates the product of each pair of elements and reduces the results to a shorter vector of length blockDim.x*gridDim.x. Then the results in the vector *out are copied back to the host for further reduction.
The second version, dotProdWithSharedMem is copied from the CUDA By Example book, see here.
My questions are:
1. When the kernel is launched with enough threads (nThreadsPerBlock*nblocks >= vector_length), the result of dotProd matches the one calculated by the CPU, but the result of dotProdWithSharedMem differs from both. What are the possible causes? A possible output of $ dot_prod.o 17 512:
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads : 131072
Length of vectors : 131072
GPU using registers: 9.6904191971, time consummed: 0.56154 ms
GPU using shared : 9.6906833649, time consummed: 0.04473 ms
CPU result : 9.6904191971, time consummed: 0.28504 ms
2. When the kernel is launched with too few threads (nThreadsPerBlock*nblocks < vector_length), the GPU results seem to be less accurate. However, the while loop is supposed to handle this case. I guess something might happen to the register variable temp in the loop; otherwise the result should remain the same as in question 1. A possible output of $ dot_prod.o 17 256:
Number of threads per block : 256
Number of blocks in the grid: 256
Total number of threads : 65536
Length of vectors : 131072
GPU using registers: 9.6906890869, time consummed: 0.31478 ms
GPU using shared : 9.6906604767, time consummed: 0.03530 ms
CPU result : 9.6904191971, time consummed: 0.28404 ms
3. I don't quite understand the size of cache in dotProdWithSharedMem. Why is it nThreadsPerBlock elements rather than the total number of threads nThreadsPerBlock * nblocks? I think that should be the right number of temp values; is this correct?
The code:
#include <iostream>
#include <string>
#include <cmath>
#include <chrono>
#include <cuda.h>
#define PI (float) 3.141592653589793
const size_t nThreadsPerBlock = 256;
static void HandleError(cudaError_t err, const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__global__ void dotProd(int length, float *u, float *v, float *out) {
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned tid_const = threadIdx.x + blockDim.x * blockIdx.x;
float temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
out[tid_const] = temp;
}
__global__ void dotProdWithSharedMem(int length, float *u, float *v, float *out) {
__shared__ float cache[nThreadsPerBlock];
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned cid = threadIdx.x;
float temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
cache[cid] = temp;
__syncthreads();
int i = blockDim.x/2;
while (i != 0) {
if (cid < i) {
cache[cid] += cache[cid + i];
}
__syncthreads();
i /= 2;
}
if (cid == 0) {
out[blockIdx.x] = cache[0];
}
}
int main(int argc, char* argv[]) {
size_t vec_len = 1 << std::stoi(argv[1]);
size_t size = vec_len * sizeof(float);
size_t nblocks = std::stoi(argv[2]);
size_t size_out = nThreadsPerBlock*nblocks*sizeof(float);
size_t size_out_2 = nblocks*sizeof(float);
float *u = (float *)malloc(size);
float *v = (float *)malloc(size);
float *out = (float *)malloc(size_out);
float *out_2 = (float *)malloc(size_out_2);
float *dev_u, *dev_v, *dev_out, *dev_out_2; // Device arrays
float res_gpu = 0;
float res_gpu_2 = 0;
float res_cpu = 0;
dim3 dimGrid(nblocks, 1, 1);
dim3 dimBlocks(nThreadsPerBlock, 1, 1);
// Initiate values
for(size_t i=0; i<vec_len; ++i) {
u[i] = std::sin(i*PI*1E-2);
v[i] = std::cos(i*PI*1E-2);
}
HANDLE_ERROR( cudaMalloc((void**)&dev_u, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_v, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out, size_out) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out_2, size_out_2) );
HANDLE_ERROR( cudaMemcpy(dev_u, u, size, cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(dev_v, v, size, cudaMemcpyHostToDevice) );
auto t1_gpu = std::chrono::system_clock::now();
dotProd <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out, dev_out, size_out, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nThreadsPerBlock*nblocks; ++i) {
res_gpu += out[i];
}
auto t2_gpu = std::chrono::system_clock::now();
// GPU version with shared memory
dotProdWithSharedMem <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out_2);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out_2, dev_out_2, size_out_2, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nblocks; ++i) {
res_gpu_2 += out_2[i];
}
auto t3_gpu = std::chrono::system_clock::now();
// CPU version for result-check
for(size_t i=0; i<vec_len; ++i) {
res_cpu += u[i] * v[i];
}
auto t2_cpu = std::chrono::system_clock::now();
double t_gpu = std::chrono::duration <double, std::milli> (t2_gpu - t1_gpu).count();
double t_gpu_2 = std::chrono::duration <double, std::milli> (t3_gpu - t2_gpu).count();
double t_cpu = std::chrono::duration <double, std::milli> (t2_cpu - t3_gpu).count();
printf("Number of threads per block : %i \n", nThreadsPerBlock);
printf("Number of blocks in the grid: %i \n", nblocks);
printf("Total number of threads : %i \n", nThreadsPerBlock*nblocks);
printf("Length of vectors : %i \n\n", vec_len);
printf("GPU using registers: %.10f, time consummed: %.5f ms\n", res_gpu, t_gpu);
printf("GPU using shared : %.10f, time consummed: %.5f ms\n", res_gpu_2, t_gpu_2);
printf("CPU result : %.10f, time consummed: %.5f ms\n", res_cpu, t_cpu);
cudaFree(dev_u);
cudaFree(dev_v);
cudaFree(dev_out);
cudaFree(dev_out_2);
free(u);
free(v);
free(out);
free(out_2);
return 0;
}
Thank you for your patience in reading this LONG post! Any help will be deeply appreciated!
Niko
You're exploring the limits of float precision combined with the variation associated with floating point order of operations. The actual "accuracy" here will depend on the exact data and exact order of operations. The different algorithms will have different order of operations, and therefore different results.
You may want to read this paper.
One of the assumptions you seem to be making is that the CPU result is the accurate one without any justification for that assumption.
If we define "accuracy" as the difference (i.e. "closeness") between the result and the numerically correct result, I suspect that the shared memory result is the more accurate one: its pairwise (tree) reduction tends to add partial sums of similar magnitude, so rounding error grows more slowly than in one long sequential sum.
If we convert your code to use double type instead of float type, we observe that:
The results of all 3 approaches are much closer (identical in the printout).
The double results don't match any of the float results.
The shared memory result from the float case is actually the one closest to the double results.
Here's a test case demonstrating this:
$ cat t397.cu
#include <iostream>
#include <string>
#include <cmath>
#include <chrono>
#include <cuda.h>
#ifndef USE_DOUBLE
typedef float ft;
#else
typedef double ft;
#endif
#define PI (ft) 3.141592653589793
const size_t nThreadsPerBlock = 256;
static void HandleError(cudaError_t err, const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__global__ void dotProd(int length, ft *u, ft *v, ft *out) {
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned tid_const = threadIdx.x + blockDim.x * blockIdx.x;
ft temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
out[tid_const] = temp;
}
__global__ void dotProdWithSharedMem(int length, ft *u, ft *v, ft *out) {
__shared__ ft cache[nThreadsPerBlock];
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned cid = threadIdx.x;
ft temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
cache[cid] = temp;
__syncthreads();
int i = blockDim.x/2;
while (i != 0) {
if (cid < i) {
cache[cid] += cache[cid + i];
}
__syncthreads();
i /= 2;
}
if (cid == 0) {
out[blockIdx.x] = cache[0];
}
}
int main(int argc, char* argv[]) {
size_t vec_len = 1 << std::stoi(argv[1]);
size_t size = vec_len * sizeof(ft);
size_t nblocks = std::stoi(argv[2]);
size_t size_out = nThreadsPerBlock*nblocks*sizeof(ft);
size_t size_out_2 = nblocks*sizeof(ft);
ft *u = (ft *)malloc(size);
ft *v = (ft *)malloc(size);
ft *out = (ft *)malloc(size_out);
ft *out_2 = (ft *)malloc(size_out_2);
ft *dev_u, *dev_v, *dev_out, *dev_out_2; // Device arrays
ft res_gpu = 0;
ft res_gpu_2 = 0;
ft res_cpu = 0;
dim3 dimGrid(nblocks, 1, 1);
dim3 dimBlocks(nThreadsPerBlock, 1, 1);
// Initiate values
for(size_t i=0; i<vec_len; ++i) {
u[i] = std::sin(i*PI*1E-2);
v[i] = std::cos(i*PI*1E-2);
}
HANDLE_ERROR( cudaMalloc((void**)&dev_u, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_v, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out, size_out) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out_2, size_out_2) );
HANDLE_ERROR( cudaMemcpy(dev_u, u, size, cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(dev_v, v, size, cudaMemcpyHostToDevice) );
auto t1_gpu = std::chrono::system_clock::now();
dotProd <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out, dev_out, size_out, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nThreadsPerBlock*nblocks; ++i) {
res_gpu += out[i];
}
auto t2_gpu = std::chrono::system_clock::now();
// GPU version with shared memory
dotProdWithSharedMem <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out_2);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out_2, dev_out_2, size_out_2, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nblocks; ++i) {
res_gpu_2 += out_2[i];
}
auto t3_gpu = std::chrono::system_clock::now();
// CPU version for result-check
for(size_t i=0; i<vec_len; ++i) {
res_cpu += u[i] * v[i];
}
auto t2_cpu = std::chrono::system_clock::now();
double t_gpu = std::chrono::duration <double, std::milli> (t2_gpu - t1_gpu).count();
double t_gpu_2 = std::chrono::duration <double, std::milli> (t3_gpu - t2_gpu).count();
double t_cpu = std::chrono::duration <double, std::milli> (t2_cpu - t3_gpu).count();
printf("Number of threads per block : %i \n", nThreadsPerBlock);
printf("Number of blocks in the grid: %i \n", nblocks);
printf("Total number of threads : %i \n", nThreadsPerBlock*nblocks);
printf("Length of vectors : %i \n\n", vec_len);
printf("GPU using registers: %.10f, time consummed: %.5f ms\n", res_gpu, t_gpu);
printf("GPU using shared : %.10f, time consummed: %.5f ms\n", res_gpu_2, t_gpu_2);
printf("CPU result : %.10f, time consummed: %.5f ms\n", res_cpu, t_cpu);
cudaFree(dev_u);
cudaFree(dev_v);
cudaFree(dev_out);
cudaFree(dev_out_2);
free(u);
free(v);
free(out);
free(out_2);
return 0;
}
$ nvcc -std=c++11 t397.cu -o t397
$ ./t397 17 512
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads : 131072
Length of vectors : 131072
GPU using registers: 9.6904191971, time consummed: 0.89290 ms
GPU using shared : 9.6906833649, time consummed: 0.04289 ms
CPU result : 9.6904191971, time consummed: 0.41527 ms
$ nvcc -std=c++11 t397.cu -o t397 -DUSE_DOUBLE
$ ./t397 17 512
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads : 131072
Length of vectors : 131072
GPU using registers: 9.6913433287, time consummed: 1.33016 ms
GPU using shared : 9.6913433287, time consummed: 0.05032 ms
CPU result : 9.6913433287, time consummed: 0.41275 ms
$
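As an aside (mine, not part of the original answer): if staying with float is a requirement, compensated (Kahan) summation in the host-side reduction loops is a classic way to reduce the accumulated rounding error. A self-contained sketch:
float kahan_sum(const float *data, size_t n)
{
    float sum = 0.0f, c = 0.0f;      // c accumulates the lost low-order bits
    for (size_t i = 0; i < n; ++i) {
        float y = data[i] - c;
        float t = sum + y;
        c = (t - sum) - y;           // what was rounded away when forming t
        sum = t;
    }
    return sum;
}
It could replace, for example, the loop that accumulates res_gpu from out[].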
I am trying to represent an array as a tex2D in CUDA. After hours of debugging, I noticed that 19 out of the one million elements were copied wrong to the texture, meaning that as a binary array, I got 0 instead of 1.
void evolve_gpu( byte* h_in, byte* h_out)
{
//int SIZE = N * N * N * N * sizeof( float );
cudaEvent_t start, stop;
size_t d_in_pitch;
size_t d_out_pitch;
int len = 1002;
checkCudaErrors( cudaEventCreate(&start) );
checkCudaErrors( cudaEventCreate(&stop) );
// Allocate the device input image array
unsigned char *d_in = NULL;
unsigned char *d_out = NULL;
checkCudaErrors(cudaMallocPitch(&d_in, &d_in_pitch, sizeof(unsigned char)*len, len));
checkCudaErrors(cudaMallocPitch(&d_out, &d_out_pitch, sizeof(unsigned char)*len, len));
// Copy the host input image to the device memory
checkCudaErrors(cudaMemcpy2D(d_in, d_in_pitch, h_in, sizeof(unsigned char)*len
, sizeof(unsigned char)*len, len, cudaMemcpyHostToDevice));
/**************************** TEXTURE CONFIGURATION ******************************/
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = d_in;
resDesc.res.pitch2D.pitchInBytes = d_in_pitch;
resDesc.res.pitch2D.width = len;
resDesc.res.pitch2D.height = len;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<unsigned char>();
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords=false;
texDesc.addressMode[0]=cudaAddressModeBorder;
texDesc.addressMode[1]=cudaAddressModeBorder;
cudaTextureObject_t tex;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
/*********************************************************************************/
checkCudaErrors( cudaEventRecord(start, NULL) );
// Launch the CUDA Kernel
dim3 block = dim3(THREADS_X, THREADS_Y);
dim3 grid = dim3((len+block.x-1)/block.x,(len+block.y-1)/block.y);//25*50
evolve_kernel<<<grid, block>>>( tex, d_out );
//******** kernel<<< number of blocks, number of threads, dynamic memory per block, associated stream >>> *******//
// Copy the device result to the host
checkCudaErrors(cudaMemcpy2D(h_out, d_out_pitch,
d_out, d_out_pitch,
sizeof(unsigned char)*len, len,
cudaMemcpyDeviceToHost));
for(int i=0;i<1002*1002;i++){
if(h_in[i] != h_out[i])
printf("i = %d\n",i);
}
checkCudaErrors( cudaGetLastError() );
checkCudaErrors( cudaEventRecord(stop, NULL) );
checkCudaErrors( cudaEventSynchronize(stop) );
checkCudaErrors( cudaFree(d_in) );
checkCudaErrors( cudaFree(d_out) );
float msec = 0.f;
checkCudaErrors( cudaEventElapsedTime(&msec, start, stop) );
printf("Basic version took: %f ms\n", msec);
}
One problem I can see in your code is your device->host copy:
checkCudaErrors(cudaMemcpy2D(h_out, d_out_pitch,
d_out, d_out_pitch,
sizeof(unsigned char)*len, len,
cudaMemcpyDeviceToHost));
Referring to the documentation, the 2nd parameter for this cudaMemcpy2D call is the pitch of the destination allocation (i.e. the pitch of h_out, in this case). But h_out is unlikely to refer to a pitched allocation, and even if it somehow were, the pitch is unlikely to be given by d_out_pitch.
Although you haven't shown a complete code, assuming that h_out and h_in are similar allocations, that 2nd parameter should be changed to the (un-pitched) width of the h_out array:
checkCudaErrors(cudaMemcpy2D(h_out, len*sizeof(unsigned char),
d_out, d_out_pitch,
sizeof(unsigned char)*len, len,
cudaMemcpyDeviceToHost));
I'm also curious how your kernel can operate correctly on d_out (a pitched allocation) when you are not passing the pitch of d_out to it:
evolve_kernel<<<grid, block>>>( tex, d_out );
I would have expected to see a call like this:
evolve_kernel<<<grid, block>>>( tex, d_out, d_out_pitch);
but you haven't shown your kernel code.
Here's a fully worked example that I created around the code you have shown, with the above issues fixed and a few other changes to build an example:
$ cat t648.cu
#include <stdio.h>
#include <helper_cuda.h>
#define THREADS_X 16
#define THREADS_Y 16
const int len = 1002;
typedef unsigned char byte;
__global__ void evolve_kernel(cudaTextureObject_t tex, unsigned char *d_out, size_t pitch ){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int idy = threadIdx.y+blockDim.y*blockIdx.y;
if ((idx < len) && (idy < len))
d_out[idy*pitch+idx] = tex2D<unsigned char>(tex, idx, idy);
}
void evolve_gpu( byte* h_in, byte* h_out)
{
//int SIZE = N * N * N * N * sizeof( float );
cudaEvent_t start, stop;
size_t d_in_pitch;
size_t d_out_pitch;
checkCudaErrors( cudaEventCreate(&start) );
checkCudaErrors( cudaEventCreate(&stop) );
// Allocate the device input image array
unsigned char *d_in = NULL;
unsigned char *d_out = NULL;
checkCudaErrors(cudaMallocPitch(&d_in, &d_in_pitch, sizeof(unsigned char)*len, len));
checkCudaErrors(cudaMallocPitch(&d_out, &d_out_pitch, sizeof(unsigned char)*len, len));
// Copy the host input image to the device memory
checkCudaErrors(cudaMemcpy2D(d_in, d_in_pitch, h_in, sizeof(unsigned char)*len
, sizeof(unsigned char)*len, len, cudaMemcpyHostToDevice));
/**************************** TEXTURE CONFIGURATION ******************************/
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = d_in;
resDesc.res.pitch2D.pitchInBytes = d_in_pitch;
resDesc.res.pitch2D.width = len;
resDesc.res.pitch2D.height = len;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<unsigned char>();
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords=false;
texDesc.addressMode[0]=cudaAddressModeBorder;
texDesc.addressMode[1]=cudaAddressModeBorder;
cudaTextureObject_t tex;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
/*********************************************************************************/
checkCudaErrors( cudaEventRecord(start, NULL) );
// Launch the CUDA Kernel
dim3 block = dim3(THREADS_X, THREADS_Y);
dim3 grid = dim3((len+block.x-1)/block.x,(len+block.y-1)/block.y);//25*50
evolve_kernel<<<grid, block>>>( tex, d_out, d_out_pitch );
//******** kernel<<< number of blocks, number of threads, dynamic memory per block, associated stream >>> *******//
// Copy the device result to the host
checkCudaErrors(cudaMemcpy2D(h_out, len*sizeof(unsigned char),
d_out, d_out_pitch,
sizeof(unsigned char)*len, len,
cudaMemcpyDeviceToHost));
for(int i=0;i<1002*1002;i++){
if(h_in[i] != h_out[i])
printf("i = %d\n",i);
}
checkCudaErrors( cudaGetLastError() );
checkCudaErrors( cudaEventRecord(stop, NULL) );
checkCudaErrors( cudaEventSynchronize(stop) );
checkCudaErrors( cudaFree(d_in) );
checkCudaErrors( cudaFree(d_out) );
float msec = 0.f;
checkCudaErrors( cudaEventElapsedTime(&msec, start, stop) );
printf("Basic version took: %f ms\n", msec);
}
int main(){
byte *h_data_in, *h_data_out;
h_data_in = (byte *)malloc(len*len*sizeof(byte));
h_data_out = (byte *)malloc(len*len*sizeof(byte));
for (int i = 0; i < len*len; i++){
h_data_in[i] = 3;
h_data_out[i] = 0;}
evolve_gpu(h_data_in, h_data_out);
return 0;
}
$ nvcc -arch=sm_35 -I/usr/local/cuda/samples/common/inc t648.cu -o t648
$ ./t648
Basic version took: 3.868576 ms
$
It seems to work correctly and pass the test you have created.
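One more note (mine, not part of the original answer): indexing d_out as d_out[idy*pitch+idx] in the kernel above only works because the elements are single bytes, so the pitch in bytes equals the pitch in elements. For a wider element type, the row base must be computed on a char pointer first. A sketch, assuming float data:
__device__ float read_pitched(const float *base, size_t pitch, int x, int y)
{
    // pitch is in bytes, so offset a char* and cast back
    const float *row = (const float *)((const char *)base + y * pitch);
    return row[x];
}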
When I tested the following example, I found that increasing blocksPerGrid and threadsPerBlock increases the kernel delay.
Such that with
int threadsPerBlock = 1;
int blocksPerGrid = 1;
i.e. blocksPerGrid and threadsPerBlock both equal to 1, the delay of the kernel = .0072 ms,
but when I use the following, the delay becomes higher = .049 ms,
int threadsPerBlock = 1024;
int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock;
where
N = 50000; //the no. of array elements
Below is the complete VecAdd example; you can test it.
// Includes
#include <stdio.h>
#include <cutil_inline.h>
#include <shrQATest.h>
// Variables
float* h_A;
float* h_B;
float* h_C;
float* d_A;
float* d_B;
float* d_C;
bool noprompt = false;
// Functions
void CleanupResources(void);
void RandomInit(float*, int);
void ParseArguments(int, char**);
// Device code
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N)
C[i] = A[i] + B[i];
}
// Host code
int main(int argc, char** argv)
{
shrQAStart(argc, argv);
cudaEvent_t event1, event2;
cudaEventCreate(&event1);
cudaEventCreate(&event2);
printf("Vector Addition\n");
int N = 50000;
size_t size = N * sizeof(float);
ParseArguments(argc, argv);
// Allocate input vectors h_A and h_B in host memory
h_A = (float*)malloc(size);
if (h_A == 0) CleanupResources();
h_B = (float*)malloc(size);
if (h_B == 0) CleanupResources();
h_C = (float*)malloc(size);
if (h_C == 0) CleanupResources();
// Initialize input vectors
RandomInit(h_A, N);
RandomInit(h_B, N);
// Allocate vectors in device memory
cutilSafeCall( cudaMalloc((void**)&d_A, size) );
cutilSafeCall( cudaMalloc((void**)&d_B, size) );
cutilSafeCall( cudaMalloc((void**)&d_C, size) );
// Copy vectors from host memory to device memory
cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice) );
// Invoke kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock;
cudaEventRecord(event1, 0);
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
cudaEventRecord(event2, 0);
cudaEventSynchronize(event1); //optional
cudaEventSynchronize(event2);
float dt_ms;
cudaEventElapsedTime(&dt_ms, event1, event2);
printf("delay_time = %f\n", dt_ms);
cutilCheckMsg("kernel launch failure");
#ifdef _DEBUG
cutilSafeCall( cutilDeviceSynchronize() );
#endif
// Copy result from device memory to host memory
// h_C contains the result in host memory
cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );
// Verify result
int i;
for (i = 0; i < N; ++i) {
float sum = h_A[i] + h_B[i];
if (fabs(h_C[i] - sum) > 1e-5)
break;
}
CleanupResources();
shrQAFinishExit(argc, (const char **)argv, (i==N) ? QA_PASSED : QA_FAILED);
}
void CleanupResources(void)
{
// Free device memory
if (d_A)
cudaFree(d_A);
if (d_B)
cudaFree(d_B);
if (d_C)
cudaFree(d_C);
// Free host memory
if (h_A)
free(h_A);
if (h_B)
free(h_B);
if (h_C)
free(h_C);
cutilDeviceReset();
}
// Allocates an array with random float entries.
void RandomInit(float* data, int n)
{
for (int i = 0; i < n; ++i)
data[i] = rand() / (float)RAND_MAX;
}
// Parse program arguments
void ParseArguments(int argc, char** argv)
{
for (int i = 0; i < argc; ++i) {
if (strcmp(argv[i], "--noprompt") == 0 ||
strcmp(argv[i], "-noprompt") == 0)
{
noprompt = true;
break;
}
}
}
Can anyone explain to me what this means?
In case 1, a kernel of size 1 thread is launched and performs 2 read and 1 write operations. In case 2, a kernel of size 50176 threads is launched and performs 100,000 read and 50,000 write operations. Increasing the workload by a factor of 50,000 increased the execution time by only ~7x. The work done by the two launches is significantly different.
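To make the comparison concrete (my arithmetic based on the numbers in the question, not the answerer's): normalizing by bytes moved shows how differently the two launches use the hardware.
float bytes1 = 3.0f * 1     * sizeof(float);   // 2 reads + 1 write: 12 bytes
float bytes2 = 3.0f * 50000 * sizeof(float);   // 600,000 bytes
float bw1 = bytes1 / 0.0072e-3f;               // ~1.7 MB/s effective
float bw2 = bytes2 / 0.049e-3f;                // ~12 GB/s effective
The single-thread launch is dominated almost entirely by fixed launch overhead, which is why the time grows only ~7x for 50,000x the work.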
I am confused about why my texture version is slower than my global memory version, since the texture version should exploit spatial locality. I am trying to compute the dot product in the code below. If one thread accesses index i, its neighbour accesses i+1; thus, we see spatial locality.
Below is the texture memory version:
#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))
texture<float> arr1;
texture<float> arr2;
const int n = 4;
__global__ void addVal( float *c){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
//Using shared memory to temporary store results
__shared__ float cache[TPB];
float temp = 0;
while(tid < n){
temp += tex1Dfetch(arr1,tid) * tex1Dfetch(arr2,tid);
tid += gridDim.x * blockDim.x;
}
cache[threadIdx.x] = temp;
__syncthreads();
int i = blockDim.x/2;
while( i !=0){
if(threadIdx.x < i){
cache[threadIdx.x] = cache[threadIdx.x] +cache[threadIdx.x + i] ;
}
__syncthreads();
i = i/2;
}
if(threadIdx.x == 1){
c[blockIdx.x ] = cache[0];
}
}
int main(){
float a[n] , b[n] , c[BPG];
float *deva, *devb, *devc;
int i;
//Filling with random values to test
for( i =0; i< n; i++){
a[i] = i;
b[i] = i*2;
}
printf("Not using constant memory\n");
cudaMalloc((void**)&deva, n * sizeof(float));
cudaMalloc((void**)&devb, n * sizeof(float));
cudaMalloc((void**)&devc, BPG * sizeof(float));
cudaMemcpy(deva, a, n *sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(devb, b, n *sizeof(float), cudaMemcpyHostToDevice);
cudaBindTexture(NULL,arr1, deva,sizeof(float) * n); // note: deva shd be in gpu
cudaBindTexture(NULL,arr2, devb,sizeof(float) * n); // note: deva shd be in gpu
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//Call function to do dot product
addVal<<<BPG, TPB>>>(devc);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time;
cudaEventElapsedTime(&time,start, stop);
printf("The elapsed time is: %f\n", time);
//copy result back
cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
float sum =0 ;
for ( i = 0 ; i< BPG; i++){
sum+=c[i];
}
//display answer
printf("%f\n",sum);
cudaUnbindTexture(arr1);
cudaUnbindTexture(arr2);
cudaFree(devc);
getchar();
return 0;
}
Global Memory version:
#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))
const int n = 4;
__global__ void addVal(float *a, float *b, float *c){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
//Using shared memory to temporary store results
__shared__ float cache[TPB];
float temp = 0;
while(tid < n){
temp += a[tid] * b[tid];
tid += gridDim.x * blockDim.x;
}
cache[threadIdx.x] = temp;
__syncthreads();
int i = blockDim.x/2;
while( i !=0){
if(threadIdx.x < i){
cache[threadIdx.x] = cache[threadIdx.x] +cache[threadIdx.x + i] ;
}
__syncthreads();
i = i/2;
}
if(threadIdx.x == 1){
c[blockIdx.x ] = cache[0];
}
}
int main(){
float a[n] , b[n] , c[BPG];
float *deva, *devb, *devc;
int i;
//Filling with random values to test
for( i =0; i< n; i++){
a[i] = i;
b[i] = i*2;
}
printf("Not using constant memory\n");
cudaMalloc((void**)&deva, n * sizeof(float));
cudaMalloc((void**)&devb, n * sizeof(float));
cudaMalloc((void**)&devc, BPG * sizeof(float));
cudaMemcpy(deva, a, n *sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(devb, b, n *sizeof(float), cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//Call function to do dot product
addVal<<<BPG, TPB>>>(deva, devb, devc);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time;
cudaEventElapsedTime(&time,start, stop);
printf("The elapsed time is: %f\n", time);
//copy result back
cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
float sum =0 ;
for ( i = 0 ; i< BPG; i++){
sum+=c[i];
}
//display answer
printf("%f\n",sum);
getchar();
return 0;
}
While knowing your graphics device may help for some types of problems, with compute capability 2.x the L1 and L2 caches work as well as the texture cache.
In this case, you are not exploiting the texture cache, since you read each value only once per thread. On the other hand, you are exploiting spatial locality in 1D, which can equally well be served by coalesced global memory access.
I recommend the book 'CUDA by Example: An Introduction to General-Purpose GPU Programming'. It is a great book for beginners, with graphics examples like the Julia set or a very basic ray caster (there are also the usual add, reduce and dot product examples if you prefer those :).
Hope this helps.
Further to pQB's answer, there is no data reuse in your program -- each input is read only once, and used only once. Memory indices are sequential across threads, and therefore perfectly coalesced. Because of these two reasons, there is no need for any device memory caching, so global memory access is more efficient than texture access. Add to this the additional latency overhead of the texture cache (the texture cache is designed to increase throughput, not to decrease latency, unlike the L1/L2 data caches), and the slowdown is explained.
BTW, what you are doing is a parallel reduction, so you may want to see the "reduction" example in the CUDA SDK for a fast implementation.
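For reference (a sketch of mine, not part of the original answer): on compute capability 3.0+ with CUDA 9 or later, the fast reductions that the SDK sample builds up to can use warp shuffles instead of shared memory for the intra-warp stage:
__inline__ __device__ float warpReduceSum(float val)
{
    // each step halves the number of active lanes, adding values pairwise
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}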