CUDA Transfer Timing using events vs windows - cuda

I'm transferring up 48kb data blocks (with pinned memory), and although cuda events see it go up at 5gb/sec, by the time we get back to windows we only see half that speed. Is this just unavoidable driver overhead, or are there ways to mitigate this? I've encapsulated the process in the test program below.
// Times one pinned host-to-device copy two ways and prints both results:
//   * GPU-side, as the elapsed time between two CUDA events that bracket the copy;
//   * host-side, as the wall-clock time from just before the first event is
//     recorded until cudaEventSynchronize() returns.
//
// size: number of `unsigned` elements to transfer (the copy moves
//       size * sizeof(unsigned) bytes).
//
// No stream argument is passed to the event records or the copy, so they all
// use the API's default stream.
void transferUp(size_t size)
{
    // Compute the byte count once; every allocation and copy below uses it.
    const size_t bytes = size * sizeof(unsigned);

    StopWatchWin timer;
    timer.start();

    float tUpCopyStart, tUpCopyStop;
    cudaEvent_t sendUpStopEvent, sendUpStartEvent;
    checkCudaErrors(cudaEventCreate(&sendUpStartEvent));
    checkCudaErrors(cudaEventCreate(&sendUpStopEvent));

    // Pinned host buffer. It is allocated only through cudaHostAlloc: a prior
    // malloc() into the same pointer would be overwritten and leaked.
    unsigned *cpu_sending = NULL;
    checkCudaErrors(cudaHostAlloc(&cpu_sending, bytes, cudaHostAllocPortable));

    unsigned *gpu_receiving = NULL;
    checkCudaErrors(cudaMalloc(&gpu_receiving, bytes));

    // Host clock starts immediately before the first event is enqueued ...
    tUpCopyStart = timer.getTime();
    checkCudaErrors(cudaEventRecord(sendUpStartEvent));
    checkCudaErrors(cudaMemcpyAsync(gpu_receiving, cpu_sending, bytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaEventRecord(sendUpStopEvent));
    // ... and stops only once the host has observed completion of the stop event,
    // so it includes submission and wake-up latency that the events do not see.
    checkCudaErrors(cudaEventSynchronize(sendUpStopEvent));
    tUpCopyStop = timer.getTime();

    double sendTimeWindows = tUpCopyStop - tUpCopyStart;
    float sendTimeCuda;
    checkCudaErrors(cudaEventElapsedTime(&sendTimeCuda, sendUpStartEvent, sendUpStopEvent));

    // bytes / (ms * 1e6) == GB/s. Done in floating point so that small
    // transfers are not truncated by an integer division.
    double GbSec_cuda = (double)bytes / ((double)sendTimeCuda * 1.0e6);
    double GbSec_win = (double)bytes / (sendTimeWindows * 1.0e6);

    // size_t is printed through an explicit unsigned long long cast; "%d" is
    // the wrong conversion for it on 64-bit targets.
    printf("size=%06llu bytes eventTime=%.03fms windowsTime=%0.3fms cudaSpeed=%.01f gb/s winSpeed=%.01f gb/s\n",
           (unsigned long long)bytes, sendTimeCuda, sendTimeWindows, GbSec_cuda, GbSec_win);

    checkCudaErrors(cudaEventDestroy(sendUpStartEvent));
    checkCudaErrors(cudaEventDestroy(sendUpStopEvent));
    checkCudaErrors(cudaFreeHost(cpu_sending));
    checkCudaErrors(cudaFree(gpu_receiving));
}

The overhead of timing this small operation is overwhelming the measurement.
For small host->device copies (e.g., 64K or smaller), the CUDA driver will inline the data into the command buffer, so even the purportedly-synchronous memcpy calls are actually done asynchronously. But, the cudaEventSynchronize() call in your code forces the CPU to wait instead of continuing execution.

Related

What is the difference between __ldg() intrinsic and a normal execution?

I am trying to explore '__ldg intrinsic'. I have gone through NVIDIA's documentation for this but didn't get any satisfactory answer over its use and implementations. Moreover with reference to THIS I tried implementing __ldg in a simple 1024*1024 matrix multiplication example.
#include<stdio.h>
#include<stdlib.h>
// Naive square matrix multiply: cd = ad * bd, all N x N, row-major.
//
// Launch layout: 2D grid of 2D blocks, one thread per output element
// (x index -> column, y index -> row). Threads that fall outside the N x N
// output do nothing, so the grid may over-cover the matrix.
//
// ad, bd : input matrices; they are only read here.
// cd     : output matrix.
// N      : matrix dimension.
//
// The loads go through __ldg(), i.e. the read-only data cache path; per the
// CUDA programming guide this is available on compute capability 3.5 devices,
// so the code must be built for such a target.
__global__ void matrix_mul(float * ad,float * bd,float * cd,int N)
{
    //find Row and Column corresponding to a data element for each thread
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: without it a grid that does not tile N exactly
    // would read and write out of bounds.
    if (Row >= N || Col >= N)
        return;

    float pvalue = 0.0f;
    //calculate dot product of Row of First Matrix and Column of Second Matrix
    for (int i = 0; i < N; ++i)
    {
        // Variant 1: read-only cache loads.
        float m = __ldg(&ad[Row * N + i]);
        float n = __ldg(&bd[i * N + Col]);
        // Variant 2 (plain global loads), for comparison:
        // float m = ad[Row * N+i];
        // float n = bd[i * N + Col];
        pvalue += m * n;
    }
    //store dot product at corresponding position in resultant Matrix
    cd[Row * N + Col] = pvalue;
}
// Aborts the program with a readable message when a CUDA runtime call fails.
static void cudaCheckOrDie(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Writes an n x n row-major matrix to `out`, one row per line, after `heading`.
static void writeMatrix(FILE *out, const char *heading, const float *m, int n)
{
    fprintf(out, "%s", heading);
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
            fprintf(out, "%f ", m[i*n+j]);
        fprintf(out, "\n");
    }
}

// Multiplies two constant-filled 1024 x 1024 matrices on the GPU with
// matrix_mul, times the kernel with CUDA events, and dumps the inputs, the
// result and the timing to a text file.
//
// Returns 0 on success; exits with EXIT_FAILURE on any host or CUDA error.
int main()
{
    const int N = 1024; //N == size of square matrix
    const char *outputPath = "Parallel Multiply_ldg.txt";
    int i,j;
    float *a,*b,*c;
    float *ad,*bd,*cd;

    //open a file for outputting the result
    FILE *f = fopen(outputPath,"w");
    if (f == NULL)
    {
        perror(outputPath);
        return EXIT_FAILURE;
    }

    size_t size=sizeof(float)* N * N;

    //allocate host side memory
    a=(float*)malloc(size);
    b=(float*)malloc(size);
    c=(float*)malloc(size);
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)size);
        return EXIT_FAILURE;
    }
    for(i=0;i<N;i++)
    {
        for(j=0;j<N;j++)
        {
            a[i*N+j]=2.0f; //(float)(i*N+j); //initializing each value with its own index
            b[i*N+j]=1.0f; //(float)(i*N+j); //random functions can be used alternatively
        }
    }

    //allocate device memory
    cudaCheckOrDie(cudaMalloc(&ad,size), "cudaMalloc ad");
    cudaCheckOrDie(cudaMalloc(&bd,size), "cudaMalloc bd");
    cudaCheckOrDie(cudaMalloc(&cd,size), "cudaMalloc cd");

    //copy value from host to device
    cudaCheckOrDie(cudaMemcpy(ad,a,size,cudaMemcpyHostToDevice), "copy a to device");
    cudaCheckOrDie(cudaMemcpy(bd,b,size,cudaMemcpyHostToDevice), "copy b to device");
    printf("\nAfter HostToDevice Memcpy\n%s\n",cudaGetErrorString(cudaGetLastError()));

    //calculate execution configuration
    dim3 blocksize(16,16); //each block contains 16 * 16 (=256) threads
    dim3 gridsize(N/16,N/16); //exactly covers the matrix because N is a multiple of 16

    //GPU timer code
    float elapsedMs = 0.0f;
    cudaEvent_t start,stop;
    cudaCheckOrDie(cudaEventCreate(&start), "cudaEventCreate start");
    cudaCheckOrDie(cudaEventCreate(&stop), "cudaEventCreate stop");
    cudaCheckOrDie(cudaEventRecord(start,0), "cudaEventRecord start");
    matrix_mul <<< gridsize, blocksize >>> (ad,bd,cd, N);
    // A launch that fails never runs the kernel, and the event pair would then
    // report a near-zero "kernel time". Check the launch before trusting it.
    cudaCheckOrDie(cudaGetLastError(), "matrix_mul launch");
    // Record stop right behind the kernel in the same stream; a host-side
    // device synchronize in between would only add host latency to the interval.
    cudaCheckOrDie(cudaEventRecord(stop,0), "cudaEventRecord stop");
    // Waiting on the stop event also surfaces errors raised while the kernel ran.
    cudaCheckOrDie(cudaEventSynchronize(stop), "matrix_mul execution");
    cudaCheckOrDie(cudaEventElapsedTime(&elapsedMs,start,stop), "cudaEventElapsedTime");
    cudaCheckOrDie(cudaEventDestroy(start), "cudaEventDestroy start");
    cudaCheckOrDie(cudaEventDestroy(stop), "cudaEventDestroy stop");

    //copy back results
    cudaCheckOrDie(cudaMemcpy(c,cd,size,cudaMemcpyDeviceToHost), "copy c to host");
    printf("\nAfter DeviceToHost Memcpy\n%s\n",cudaGetErrorString(cudaGetLastError()));

    //output results in output_file
    writeMatrix(f, "Array A was---\n", a, N);
    writeMatrix(f, "\nArray B was---\n", b, N);
    //if correctly computed, then all values of C must be 2*N
    writeMatrix(f, "\nMultiplication of A and B gives C----\n", c, N);

    printf("\nYou can see output in %s file in project directory", outputPath);
    printf("\n\nTime taken is %f (ms)\n",elapsedMs);
    fprintf(f,"\n\nTime taken is %f (ms)\n",elapsedMs);
    fclose(f);

    cudaCheckOrDie(cudaFree(ad), "cudaFree ad");
    cudaCheckOrDie(cudaFree(bd), "cudaFree bd");
    cudaCheckOrDie(cudaFree(cd), "cudaFree cd");
    free(a);free(b);free(c);

    // cudaDeviceReset() is the non-deprecated replacement for cudaThreadExit().
    cudaCheckOrDie(cudaDeviceReset(), "cudaDeviceReset");
    return 0;
}
I commented that __ldg part in my kernel and executed by normal execution, and vice versa.
In both cases it gives me the correct multiplication result. I am confused by the time difference between these executions, because it is huge: more than 100X!
In case of __ldg it gives me: Time taken is 0.014432 (ms)
And in case of normal execution without __ldg it gives me : Time taken is 36.858398 (ms)
Is this the correct way of using the __ldg intrinsic? What is the significance of the __ldg intrinsic and what is the proper way of using it? Apparently what I did above in my code is wrong and naive. I am looking for an explanation and an example. Thanks in advance.
From the CUDA C Programming Guide
Global memory accesses for devices of compute capability 3.x are cached in L2 and for devices of compute capability 3.5, may also be cached in the read-only data cache described in the previous section; they are not cached in L1.
...
Data that is read-only for the entire lifetime of the kernel can also be cached in the read-only data cache described in the previous section by reading it using the __ldg() function (see Read-Only Data Cache Load Function). When the compiler detects that the read-only condition is satisfied for some data, it will use __ldg() to read it. The compiler might not always be able to detect that the read-only condition is satisfied for some data. Marking pointers used for loading such data with both the const and __restrict__ qualifiers increases the likelihood that the compiler will detect the read-only condition.
The read only cache accesses have a much lower latency than the global memory accesses. Because matrix multiplication accesses the same values from memory many times, caching in the read only cache gives a huge speedup (in memory bound applications).
NVIDIA GPUs contain texture hardware: a memory path with special, relatively simple logic that was designed for working with images.
This texture memory is another type of memory available on the GPU. In particular, constant memory, global memory and the register file have no relation to this texture memory.
Kepler GPUs and later add the ability to use this memory from "GPU texture pipeline".
But let's specify the difference between constant cache and read-only cache.
Constant Cache
Data loaded through the constant cache must be relatively small and must be accessed in such way that all threads of a warp should access the same location at any given time.
Read-only Cache or Texture Memory Cache
Cache can be much larger and can be accessed in a non-uniform pattern.
Read Only cache has granularity 32 bytes.
You can use this as "read-only cache" for your CUDA kernel.
1. Data stored in global memory can be cached in that place GPU Texture Memory
2. With doing that you give promise to the compiler that data is read-only for the
duration of a kernel execution in GPU.
There are two ways to achieve this.
A. Using an intrinsic function __ldg
Example: output[i] += __ldg(&input[j]);
B. Qualifying pointers to global memory
const float* __restrict__ input
output[idx] += input[idx];
Comparison:
The intrinsic __ldg is a better choice for deep compiler reasons.

Monitoring how thread blocks are allocated to SMs across execution time?

I am a beginner with CUDA profiling. I basically want to generate a timeline that shows each SM and the thread block that was assigned to it across execution time.
Something similar to this:
Author: Sreepathi Pai
I have read about reading %smid register, but I don't know how to incorporate it with the code that I want to test, or how to relate that to thread blocks or time.
The full code is beyond the scope of this answer so this answer provides the building blocks for you to implement block trace.
Allocate a buffer 16 bytes * number of blocks. This can be done per launch or a larger buffer can be allocated and maintained for multiple launches.
Pass the pointer of the block either through a constant variable or as an additional kernel parameter.
Modify your global functions to accept the parameter and perform the code listed below. I recommend writing new global function wrappers and have the wrapper kernel call the old code. This makes it easier to handle kernels with multiple exit points.
Visualizing Data
On compute capability 2.x devices the timestamp function should be clock64. This clock is not synchronized across SMs. The recommended approach is to sort the times per SM and use the lowest time per SM as the time of the kernel launch. This will only be off by 100s of cycles from the real time, so for reasonably sized kernels this drift is negligible.
Remove the smid from the lower 4-bits of the first 8 byte value. Clear the lower 4-bits of the end timestamp.
Allocate a device buffer equal to number of blocks * 16 bytes. Each 16 byte records will store the start and end timestamp as well as a 5-bit smid packed into the start time.
// Reads the PTX special register %smid for the calling thread and returns it.
// The asm is marked volatile so the read is not removed or merged by the compiler.
static __device__ inline uint32_t __smid()
{
    uint32_t smIndex;
    asm volatile(
        "mov.u32 %0, %%smid;"
        : "=r"(smIndex));
    return smIndex;
}
// Returns a 64-bit timestamp for the calling thread.
//
// * compute capability >= 3.0 (kepler and maxwell): reads the PTX special
//   register %globaltimer.
// * compute capability 2.x (fermi): falls back to clock64(). That counter is
//   not synchronized across SMs, so values taken on different SMs must be
//   normalised per SM before they are compared.
//
// The choice is made at compile time from __CUDA_ARCH__, so the same source
// builds for either target.
static __device__ inline uint64_t __timestamp()
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 300)
    return (uint64_t)clock64();
#else
    uint64_t globaltime;
    asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(globaltime) );
    return globaltime;
#endif
}
// Skeleton kernel that records, for every thread block, where and when it ran.
//
// pBlockTime: device buffer holding two uint64_t values (16 bytes) per block,
//             indexed by the flattened block index:
//               [2*b + 0] start timestamp with the SM id packed into its low bits
//               [2*b + 1] end timestamp
//
// Works for 1D, 2D and 3D grids. The start record is written by thread
// (0,0,0) of each block; the end record is written by every thread.
__global__ void blocktime(uint64_t* pBlockTime)
{
    // START TIMESTAMP
    uint64_t startTime = __timestamp();

    // flatBlockIdx should be adjusted to 1D, 2D, and 3D launches to minimize
    // overhead. Reduce to uint32_t if launch index does not exceed 32-bit.
    // The first operand is widened before multiplying so the products are
    // formed in 64-bit arithmetic rather than overflowing in 32-bit.
    uint64_t flatBlockIdx = ((uint64_t)blockIdx.z * gridDim.x * gridDim.y)
                          + ((uint64_t)blockIdx.y * gridDim.x)
                          + blockIdx.x;

    // reduce this based upon dimensions of block to minimize overhead
    if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
    {
        // Put the smid in the 4 lower bits. If the MultiprocessorCount exceeds
        // 16 then increase to 5-bits. The lower 5-bits of globaltimer are
        // junk. If using clock64 and you want to improve precision then use
        // the most significant 4-5 bits.
        const uint64_t smidMask = 0xFULL;
        uint64_t smid = __smid();
        // Clear the low bits of the timestamp, then drop the SM id into them.
        // Masking with 0xF instead would discard the timestamp and keep only
        // its junk bits.
        uint64_t data = (startTime & ~smidMask) | (smid & smidMask);
        pBlockTime[flatBlockIdx * 2 + 0] = data;
    }

    // do work
    // I would recommend moving the body of your current __global__ function
    // into a __device__ function and calling it here. This will result
    // in easier handling of kernels that have multiple exit points.

    // END TIMESTAMP
    // All threads in block will write out. This is not very efficient.
    // Depending on the kernel this can be reduced to 1 thread or 1 thread per warp.
    uint64_t endTime = __timestamp();
    pBlockTime[flatBlockIdx * 2 + 1] = endTime;
}
// Returns the value of the PTX special register %smid for the calling thread.
//
// The register name is written as "%%smid": inside an asm statement that has
// operands, a single '%' introduces an operand reference, so a literal '%'
// has to be doubled. The asm is volatile so the read is not removed or merged.
__noinline__ __device__ uint get_smid(void)
{
    uint ret;
    asm volatile("mov.u32 %0, %%smid;" : "=r"(ret) );
    return ret;
}
Source here.

Low memory copy throughput Host to Device

I have a vector of vectors vector<vector<double>> data.
I want to copy only the information contained in that "2D matrix" as there are no vectors in CUDA.
So the first approach I used was
vector<vector<double>> *values;
vector<vector<double>>::iterator it;
double *d_values;
double *dst;
checkCudaErr(
cudaMalloc((void**)&d_values, sizeof(double)*M*N)
);
dst = d_values;
for (it = values->begin(); it != values->end(); ++it){
double *src = &((*it)[0]);
size_t s = it->size();
checkCudaErr(
cudaMemcpy(dst, src, sizeof(double)*s, cudaMemcpyHostToDevice)
);
dst += s;
}
After profiling with NVVP I got a very low cudaMemcpy throughput. I think this is logical, as I'm sending a very small number of
bytes in each cudaMemcpy call.
So I decided to change a little bit the code to try to improve this, so the second approach is
double *h_values = new double[M*N];
dst = h_values;
for (it = values->begin(); it != values->end(); ++it){
double *src = &((*it)[0]);
size_t s = it->size();
memcpy(dst, src, sizeof(double)*s);
dst += s;
}
checkCudaErr(
cudaMemcpy(d_values, h_values, sizeof(double)*M*N, cudaMemcpyHostToDevice)
);
the result after profiling is still a low memcpy throughput.
So, my question is, how can I improve the copies from host to device?
I'm using a Quadro K4000. I'm getting 25 MB/s for the first case and about 2 GB/s on the second one. M = 5 and N = 2000000. I must say the value for M is a common value, but sometimes it can get up to 50.
One reason for your slow throughput may be that you allocate your double matrix with new. This memory is not page-locked. You can either use a system function (I don't know which system you use) or the CUDA function that provides this functionality, which is cudaMallocHost.
Just remove your =new double[M*N] and allocate h_values with cudaMallocHost(&h_values, sizeof(double)*M*N) (and of course don't delete it, but free it with cudaFreeHost).
Btw. the theoretical top speed is 8 GB/s (PCI 2.0 x 16 lanes), practical you will stay below it (around 6 GB/s).

Kernel Launch Failure

I'm operating on a Linux system and a Tesla C2075 machine. I am launching a kernel that is a modified version of the reduction kernel. My aim is to find the mean and a step by step averaged version(time_avg) of a large data set (result). See code below.
Size of "result" and "time_avg" is same and equal to "nsamps". "time_avg" contains successive averaged sets of the array result. So, first half contains averages of every two non-overlapping samples, the quarter after that has averages of every four non-overlapping samples, the next eighth of 8 samples and so on.
// Block-wise pairwise reduction over `result`.
//
// result   : input samples (read two per thread).
// nsamps   : device pointer to the number of samples.
// time_avg : output; receives partial sums divided by 2*stride at each reduction level.
// mean     : output accumulator; thread 0 of every block atomically adds its
//            block's temp[0] to it (a sum is added, no division is done here).
//
// Each thread stages two elements in shared memory, so a block uses
// 2*blockDim.x slots of temp[]; with temp sized 1024 this requires
// blockDim.x <= 512.
//
// NOTE(review): the reduction loop below does not terminate as written; see
// the comment on the for statement.
__global__ void timeavg_mean(float *result, unsigned int *nsamps, float *time_avg, float *mean) {
__shared__ float temp[1024];
int ltid = threadIdx.x, gtid = blockIdx.x*blockDim.x + threadIdx.x, stride;
int start = 0, index;
unsigned int npts = *nsamps;
// Debug output: executed by every thread of the launch.
printf("here here\n");
// Store chunk of memory=2*blockDim.x (which is to be reduced) into shared memory
// NOTE(review): when npts is odd the last in-range thread also reads
// result[2*gtid + 1], which is index npts — confirm the buffer is padded.
if ( (2*gtid) < npts ){
temp[2*ltid] = result[2*gtid];
temp[2*ltid+1] = result[2*gtid + 1];
}
// NOTE(review): stride starts at 1 and is shifted RIGHT, so it becomes 0 after
// the first iteration. 0 < blockDim.x is always true, so the loop never exits,
// and from the second iteration on `ltid % (stride*2)` and `gtid/stride`
// divide by zero. A left shift (stride <<= 1) is presumably what was intended.
for (stride=1; stride<blockDim.x; stride>>=1) {
// Barrier is outside the divergent branches, so all threads of the block reach it.
__syncthreads();
if (ltid % (stride*2) == 0){
if ( (2*gtid) < npts ){
temp[2*ltid] += temp[2*ltid + stride];
index = (int)(start + gtid/stride);
// 2.0 is a double literal, so this division is carried out in double precision.
time_avg[index] = (float)( temp[2*ltid]/(2.0*stride) );
}
}
// Advance the write offset into time_avg past the entries of this level.
start += npts/(2*stride);
}
__syncthreads();
// One atomic per block folds the block total into the global accumulator.
if (ltid == 0)
{
atomicAdd(mean, temp[0]);
}
__syncthreads();
// Debug output: executed by every thread. __syncthreads() only orders threads
// within a block, so other blocks may not have added their part to *mean yet.
printf("%f\n", *mean);
}
Launch configuration is 40 blocks, 512 threads. Data set is ~40k samples.
In my main code, I call cudaGetLastError() after the kernel call and it returns no error. Memory allocations and memory copies return no errors. If I write cudaDeviceSynchronize() (or a cudaMemcpy to check for the value of mean) after the kernel call, the program hangs completely after the kernel call. If I remove it, program runs and exits. In neither case, do I get the outputs "here here" or the mean value printed. I understand that unless the kernel executes successfully, the printf's won't print.
Has this got to do with __syncthreads() in a recursion? All threads will go till the same depth so I think that checks out.
What is the problem here?
Thank you!
A kernel call is asynchronous, if the kernel starts successfully your host code will continue to run and you will see no error. Errors that happen during the kernel run appear only after you do an explicit synchronization or call a function that causes an implicit synchronization.
If your host hangs on synchronization, then your kernel probably didn't finish running - it is either running some infinite loop or it is waiting on some __syncthreads() or some other synchronization primitive.
Your code seems to contain an infinite loop: for (stride=1; stride<blockDim.x; stride>>=1). You probably want to shift the stride left not right: stride<<=1.
You mentioned recursion but your code contains only one __global__ function, there are no recursive calls.
Your kernel has an infinite loop. Replace the for loop with
for (stride=1; stride<blockDim.x; stride<<=1) {

CUDA GPU slower than CPU

I am having trouble figuring out why my cuda code runs slower than my cpu code
my desktop configuration is i7 2600S, geforce 560ti
and my code is as follows:
// Uploads the two energy matrices and the seam to the GPU, runs gpu_shiftSeam
// over them, adds the kernel's event-measured time to execTime, and copies the
// result matrix back into newE.
//
// MCEnergyMat : host matrix, `width` row pointers each addressing `height` ints.
// newE        : host matrix with the same layout; overwritten with the result.
// seam        : host array with `height` entries when direction == 1,
//               otherwise `width` entries.
// width, height : matrix dimensions.
// direction   : selects the seam length as described above.
//
// Returns newE.
//
// Relies on file-level state declared elsewhere: global_host_MC, new_host_MC,
// global_MC, global_MC2, new_MC, new_MC2 and execTime.
int** kernel_shiftSeam(int **MCEnergyMat, int **newE, int *seam, int width, int height, int direction)
{
    //time measurement
    float elapsed_time_ms = 0;
    cudaEvent_t start, stop;

    //threads per block
    dim3 threads(16,16);
    //blocks (ceil-div so the grid covers the whole matrix)
    dim3 blocks((width+threads.x-1)/threads.x, (height+threads.y-1)/threads.y);

    // The seam is copied to the device straight from the caller's array; an
    // intermediate malloc'd host copy is not needed (and was never freed).
    const int seamCount = (direction == 1) ? height : width;
    const size_t seamSize = (size_t)seamCount * sizeof(int);
    int *device_Seam = 0;
    cudaMalloc((void**)&device_Seam, seamSize);
    cudaMemcpy(device_Seam, seam, seamSize, cudaMemcpyHostToDevice);

    global_host_MC = MCEnergyMat;
    new_host_MC = newE;

    //copy host array to device: first the table of row pointers, then each row
    cudaMemcpy(global_MC, global_MC2, sizeof(int*)*width, cudaMemcpyHostToDevice);
    for(int i=0;i<width;i++)
        cudaMemcpy(global_MC2[i], global_host_MC[i], sizeof(int)*height, cudaMemcpyHostToDevice);
    cudaMemcpy(new_MC, new_MC2, sizeof(int*)*width, cudaMemcpyHostToDevice);
    for(int i=0;i<width;i++)
        cudaMemcpy(new_MC2[i], new_host_MC[i], sizeof(int)*height, cudaMemcpyHostToDevice);

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    //do some operations on the 2d matrix
    gpu_shiftSeam<<< blocks,threads >>>(global_MC, new_MC, device_Seam, width, height);
    //measure end time for gpu calculations
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_time_ms, start, stop );
    execTime += elapsed_time_ms;
    // Events are created on every call, so release them on every call.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    //copy out the data back to host (RESULT)
    for(int i=0;i<width;i++)
    {
        cudaMemcpy(newE[i], new_MC2[i], sizeof(int)*height, cudaMemcpyDeviceToHost);
    }

    // The kernel has finished (the stop event was synchronized above), so the
    // per-call seam buffer can be released instead of leaking once per call.
    cudaFree(device_Seam);
    return newE;
}
I looped it 800 times and I got the follow results:
GPU
Computation Time (the gpu_shiftseam part) : 1176ms
Total program run time: 22s
CPU
Computation Time (same operation as gpu_shiftseam but on host) : 12522ms
Total program run time: 12s
Apparently the GPU computation time is way shorter than the one on CPU, but
for some reason the total program run time for gpu is a lot longer, does
anyone know why? Is it because of the number of threads/blocks I am assigning
is incorrect? Or is the "slowness" coming from allocating memory on device?
Thanks a lot!
In my experience memory accesses are the #1 reason for slowness.
Profile your array copies to see how much time is being spent. If it is a considerable amount, perhaps try optimizing your code. Instead of copying inside of a for-loop, perhaps see if you can copy sizeof(int) * height * width bytes directly (this requires the matrix to be stored in one contiguous buffer). Reducing the number of times you call memcpy should help.
// Upload the table of row pointers, as in the question's code.
cudaMemcpy(global_MC, global_MC2, sizeof(int*)*width, cudaMemcpyHostToDevice);
// Replace the per-row loop with one transfer of width*height ints.
// NOTE(review): this is only valid when source and destination are each a
// single contiguous block of width*height ints. In the question,
// global_host_MC is assigned from an int** (an array of row pointers), so the
// data would have to be flattened into one buffer first — confirm before use.
cudaMemcpy(global_MC2, global_host_MC, sizeof(int)*height*width,cudaMemcpyHostToDevice);
I had similar experience and found that cudaMalloc was the bottleneck while cudaMemcpy wasn't. In my device, I remember that 16 MB allocation took 160 ms. CUDA memory allocation however can be done before actual computation, for example, by another function call. Thus, the memory allocation time can be removed from overall performance measure, e.g., speedup although I would include cudaMemcpy operation in the speedup calculation.