How do I know that cudaMemcpyAsync is done reading host memory? - cuda

For example... Here's what I see in NVIDIA's docs:
cudaMemcpyAsync(a_d, a_h, size, cudaMemcpyHostToDevice, 0);
kernel<<<grid, block>>>(a_d);
cpuFunction();
Let's say this is wrapped in a function...
void consume() {
cudaMemcpyAsync(a_d, a_h, size, cudaMemcpyHostToDevice, 0);
kernel<<<grid, block>>>(a_d);
}
What if I also have a function
void produce() {
// do stuff
a_h[0] = 1;
a_h[1] = 3;
a_h[2] = 5;
//...
}
If I call:
produce();
consume();
produce(); // problem??
The second produce() will start to change the memory on the host at a_h
How do I know that CUDA isn't still reading the host memory during the asynchronous memory copy routine?
How can I safely write to the host a_h memory without disrupting that asynchronous mem copy?
EDIT---
I know I can call cudaDeviceSynchronize() or cudaStreamSynchronize() but that will also wait for kernel to complete. I would prefer to not wait until kernel is done.
I want to start writing to host a_h as soon as possible, while not waiting for kernel to finish.

If you use a stream for the cudaMemcpyAsync call, you can insert an event into the stream after the asynchronous transfer and then use cudaEventSynchronize to synchronize on that event. This guarantees that the copy has finished, but doesn't rely on the device being idle or the stream being empty.

Related

CUDA Transfer Timing using events vs windows

I'm transferring up 48kb data blocks (with pinned memory), and although cuda events see it go up at 5gb/sec, by the time we get back to windows we only see half that speed. Is this just unavoidable driver overhead, or are there ways to mitigate this? I've encapsulated the process in the test program below.
void transferUp(size_t size)
{
StopWatchWin timer;
timer.start();
float tUpCopyStart,tUpCopyStop;
cudaEvent_t sendUpStopEvent,sendUpStartEvent;
checkCudaErrors(cudaEventCreate( &sendUpStartEvent ));
checkCudaErrors(cudaEventCreate( &sendUpStopEvent ));
unsigned *cpu_sending = (unsigned *)malloc(size);
checkCudaErrors(cudaHostAlloc(&cpu_sending, size*sizeof(unsigned), cudaHostAllocPortable));
unsigned *gpu_receiving;
checkCudaErrors(cudaMalloc(&gpu_receiving, size*sizeof(unsigned)));
tUpCopyStart = timer.getTime();
checkCudaErrors(cudaEventRecord(sendUpStartEvent));
checkCudaErrors(cudaMemcpyAsync(gpu_receiving, cpu_sending, size*sizeof(unsigned), cudaMemcpyHostToDevice));
checkCudaErrors(cudaEventRecord(sendUpStopEvent));
checkCudaErrors(cudaEventSynchronize(sendUpStopEvent));
tUpCopyStop = timer.getTime();
double sendTimeWindows = tUpCopyStop - tUpCopyStart;
float sendTimeCuda;
checkCudaErrors(cudaEventElapsedTime( &sendTimeCuda,sendUpStartEvent,sendUpStopEvent));
float GbSec_cuda = (size*sizeof(unsigned)/1000)/(sendTimeCuda*1000);
float GbSec_win = (size*sizeof(unsigned)/1000)/(sendTimeWindows*1000);
printf("size=%06d bytes eventTime=%.03fms windowsTime=%0.3fms cudaSpeed=%.01f gb/s winSpeed=%.01f gb/s\n",
size*sizeof(unsigned),sendTimeCuda,sendTimeWindows,GbSec_cuda,GbSec_win);
checkCudaErrors(cudaEventDestroy( sendUpStartEvent ));
checkCudaErrors(cudaEventDestroy( sendUpStopEvent ));
checkCudaErrors(cudaFreeHost(cpu_sending));
checkCudaErrors(cudaFree(gpu_receiving));
}
The overhead of timing this small operation is overwhelming the measurement.
For small host->device copies (e.g., 64K or smaller), the CUDA driver will inline the data into the command buffer, so even the purportedly-synchronous memcpy calls are actually done asynchronously. But, the cudaEventSynchronize() call in your code forces the CPU to wait instead of continuing execution.

Are cudaMalloc and cudaFree synchronous or asynchronous calls?

I want to test whether cudaMalloc and cudaFree are synchronous calls, so I did some modification to the "simpleMultiGPU.cu" sample code in CUDA SDK. Following is the part I changed (the added lines are not indented):
float *dd[GPU_N];;
for (i = 0; i < GPU_N; i++){cudaSetDevice(i); cudaMalloc((void**)&dd[i], sizeof(float));}
//Start timing and compute on GPU(s)
printf("Computing with %d GPUs...\n", GPU_N);
StartTimer();
//Copy data to GPU, launch the kernel and copy data back. All asynchronously
for (i = 0; i < GPU_N; i++)
{
//Set device
checkCudaErrors(cudaSetDevice(i));
//Copy input data from CPU
checkCudaErrors(cudaMemcpyAsync(plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream));
//Perform GPU computations
reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
getLastCudaError("reduceKernel() execution failed.\n");
//Read back GPU results
checkCudaErrors(cudaMemcpyAsync(plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N *sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream));
cudaMalloc((void**)&dd[i],sizeof(float));
cudaFree(dd[i]);
//cudaStreamSynchronize(plan[i].stream);
}
By commenting out the cudaMalloc line and cudaFree line respectively in the large loop, I found that for a 2-GPU system, the GPU processing time are 30 milliseconds and 20 milliseconds respectively, so I concluded that cudaMalloc is an asynchronous call and cudaFree is a synchronous call. Not sure this is true or not, and not sure what is the designing concern of the CUDA architecture.
My computation capability is 2.0, and I tried both cuda4.0 and cuda5.0.
Both functions are synchronous.

synchronizing device memory access with host thread

Is it possible for a CUDA kernel to synchronize writes to device-mapped memory without any host-side invocation (e.g., of cudaDeviceSynchronize)? When I run the following program, it doesn't seem that the kernel waits for the writes to device-mapped memory to complete before terminating because examining the page-locked host memory immediately after the kernel launch does not show any modification of the memory (unless a delay is inserted or the call to cudaDeviceSynchronize is uncommented):
#include <stdio.h>
#include <cuda.h>
__global__ void func(int *a, int N) {
int idx = threadIdx.x;
if (idx < N) {
a[idx] *= -1;
__threadfence_system();
}
}
int main(void) {
int *a, *a_gpu;
const int N = 8;
size_t size = N*sizeof(int);
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaHostAlloc((void **) &a, size, cudaHostAllocMapped);
cudaHostGetDevicePointer((void **) &a_gpu, (void *) a, 0);
for (int i = 0; i < N; i++) {
a[i] = i;
}
for (int i = 0; i < N; i++) {
printf("%i ", a[i]);
}
printf("\n");
func<<<1, N>>>(a_gpu, N);
// cudaDeviceSynchronize();
for (int i = 0; i < N; i++) {
printf("%i ", a[i]);
}
printf("\n");
cudaFreeHost(a);
}
I'm compiling the above for sm_20 with CUDA 4.2.9 on Linux and running it on a Fermi GPU (S2050).
A kernel launch will immediately return to the host code before any kernel activity has occurred. Kernel execution is in this way asynchronous to host execution and does not block host execution. So it's no surprise that you have to wait a bit or else use a barrier (like cudaDeviceSynchronize()) to see the results of the kernel.
As described here:
In order to facilitate concurrent execution between host and device,
some function calls are asynchronous: Control is returned to the host
thread before the device has completed the requested task. These are:
Kernel launches;
Memory copies between two addresses to the same device memory;
Memory copies from host to device of a memory block of 64 KB or less;
Memory copies performed by functions that are suffixed with Async;
Memory set function calls.
This is all intentional of course, so that you can use the GPU and CPU simultaneously. If you don't want this behavior, a simple solution as you've already discovered is to insert a barrier. If your kernel is producing data which you will immediately copy back to the host, you don't need a separate barrier. The cudaMemcpy call after the kernel will wait until the kernel is completed before it begins it's copy operation.
I guess to answer your question, you are wanting kernel launches to be synchronous without you having even to use a barrier (why do you want to do this? Is adding the cudaDeviceSynchronize() call a problem?) It's possible to do this:
"Programmers can globally disable asynchronous kernel launches for all
CUDA applications running on a system by setting the
CUDA_LAUNCH_BLOCKING environment variable to 1. This feature is
provided for debugging purposes only and should never be used as a way
to make production software run reliably. "
If you want this synchronous behavior, it's better just to use the barriers (or depend on another subsequent cuda call, like cudaMemcpy). If you use the above method and depend on it, your code will break as soon as somebody else tries to run it without the environment variable set. So it's really not a good idea.

Get rid of busy waiting during asynchronous cuda stream executions

I looking for a way how to get rid of busy waiting in host thread in fallowing code (do not copy that code, it only shows an idea of my problem, it has many basic bugs):
cudaStream_t steams[S_N];
for (int i = 0; i < S_N; i++) {
cudaStreamCreate(streams[i]);
}
int sid = 0;
for (int d = 0; d < DATA_SIZE; d+=DATA_STEP) {
while (true) {
if (cudaStreamQuery(streams[sid])) == cudaSuccess) { //BUSY WAITING !!!!
cudaMemcpyAssync(d_data, h_data + d, DATA_STEP, cudaMemcpyHostToDevice, streams[sid]);
kernel<<<gridDim, blockDim, smSize streams[sid]>>>(d_data, DATA_STEP);
break;
}
sid = ++sid % S_N;
}
}
Is there a way to idle host thread and wait somehow to some stream to finish, and then prepare and run another stream?
EDIT: I added while(true) into the code, to emphasize busy waiting. Now I execute all the streams, and check which of them finished to run another new one. cudaStreamSynchronize waits for particular stream to finish, but I want to wait for any of the streams which as a first finished the job.
EDIT2: I got rid of busy-waiting in fallowing way:
cudaStream_t steams[S_N];
for (int i = 0; i < S_N; i++) {
cudaStreamCreate(streams[i]);
}
int sid = 0;
for (int d = 0; d < DATA_SIZE; d+=DATA_STEP) {
cudaMemcpyAssync(d_data, h_data + d, DATA_STEP, cudaMemcpyHostToDevice, streams[sid]);
kernel<<<gridDim, blockDim, smSize streams[sid]>>>(d_data, DATA_STEP);
sid = ++sid % S_N;
}
for (int i = 0; i < S_N; i++) {
cudaStreamSynchronize(streams[i]);
cudaStreamDestroy(streams[i]);
}
But it appears to be a little bit slower than the version with busy-waiting on host thread. I think it is because, now I statically distribute the jobs on streams, so when the one stream finishes work it is idle till each of the stream finishes the work. The previous version dynamically distributed the work to the first idle stream, so it was more efficient, but there was busy-waiting on the host thread.
The real answer is to use cudaThreadSynchronize to wait for all previous launches to complete, cudaStreamSynchronize to wait for all launches in a certain stream to complete, and cudaEventSynchronize to wait for only a certain event on a certain stream to be recorded.
However, you need to understand how streams and sychronization work before you will be able to use them in your code.
What happens if you do not use streams at all? Consider the following code:
kernel <<< gridDim, blockDim >>> (d_data, DATA_STEP);
host_func1();
cudaThreadSynchronize();
host_func2();
The kernel is launched and the host moves on to execute host_func1 and kernel concurrently. Then, the host and the device are synchronized, ie the host waits for kernel to finish before moving on to host_func2().
Now, what if you have two different kernels?
kernel1 <<<gridDim, blockDim >>> (d_data + d1, DATA_STEP);
kernel2 <<<gridDim, blockDim >>> (d_data + d2, DATA_STEP);
kernel1 is launched asychronously! the host moves on, and kernel2 is launched before kernel1 finishes! however, kernel2 will not execute until after kernel1 finishes, because they have both been launched on stream 0 (the default stream). Consider the following alternative:
kernel1 <<<gridDim, blockDim>>> (d_data + d1, DATA_STEP);
cudaThreadSynchronize();
kernel2 <<<gridDim, blockDim>>> (d_data + d2, DATA_STEP);
There is absolutely no need to do this because the device already synchronizes kernels launched on the same stream.
So, I think that the functionality that you are looking for already exists... because a kernel always waits for previous launches in the same stream to finish before starting (even though the host passes by). That is, if you want to wait for any previous launch to finish, then simply don't use streams. This code will work fine:
for (int d = 0; d < DATA_SIZE; d+=DATA_STEP) {
cudaMemcpyAsync(d_data, h_data + d, DATA_STEP, cudaMemcpyHostToDevice, 0);
kernel<<<gridDim, blockDim, smSize, 0>>>(d_data, DATA_STEP);
}
Now, on to streams. you can use streams to manage concurrent device execution.
Think of a stream as a queue. You can put different memcpy calls and kernel launches into different queues. Then, kernels in stream 1 and launches in stream 2 are asynchronous! They may be executed at the same time, or in any order. If you want to be sure that only one memcpy/kernel is being executed on the device at a time, then don't use streams. Similarly, if you want kernels to be executed in a specific order, then don't use streams.
That said, keep in mind that anything put into a stream 1, is executed in order, so don't bother synchronizing. Synchronization is for synchronizing host and device calls, not two different device calls. So, if you want to execute several of your kernels at the same time because they use different device memory and have no effect on each other, then use streams. Something like...
cudaStream_t steams[S_N];
for (int i = 0; i < S_N; i++) {
cudaStreamCreate(streams[i]);
}
int sid = 0;
for (int d = 0; d < DATA_SIZE; d+=DATA_STEP) {
cudaMemcpyAsync(d_data, h_data + d, DATA_STEP, cudaMemcpyHostToDevice, streams[sid]);
kernel<<<gridDim, blockDim, smSize streams[sid]>>>(d_data, DATA_STEP);
sid = ++sid % S_N;
}
No explicit device synchronization necessary.
My idea to solve that problem is to have one host thread per one stream. That host thread would invoke cudaStreamSynchronize to wait till the stream commands are completed.
Unfortunately it is not possible in CUDA 3.2 since it allows only one host thread deal with one CUDA context, it means one host thread per one CUDA enabled GPU.
Hopefully, in CUDA 4.0 it will be possible: CUDA 4.0 RC news
EDIT: I have tested in CUDA 4.0 RC, using open mp. I created one host thread per cuda stream. And it started to work.
There is: cudaEventRecord(event, stream) and cudaEventSynchronize(event). The reference manual http://developer.download.nvidia.com/compute/cuda/3_2/toolkit/docs/CUDA_Toolkit_Reference_Manual.pdf has all the details.
Edit: BTW streams are handy for concurrent execution of kernels and memory transfers. Why do you want to serialize the execution by waiting on the current stream to finish?
Instead of cudaStreamQuery, you want cudaStreamSynchronize
int sid = 0;
for (int d = 0; d < DATA_SIZE; d+=DATA_STEP) {
cudaStreamSynchronize(streams[sid]);
cudaMemcpyAssync(d_data, h_data + d, DATA_STEP, cudaMemcpyHostToDevice, streams[sid]);
kernel<<<gridDim, blockDim, smSize streams[sid]>>>(d_data, DATA_STEP);
sid = ++sid % S_N;
}
(You can also use cudaThreadSynchronize to wait for launches across all streams, and events with cudaEventSynchronize for more advanced host/device synchronization.)
You can further control the type of waiting that occurs with these synchronization functions. Look at the reference manual for the cudaDeviceBlockingSync flag and others. The default is probably what you want, though.
You need to copy the data-chunk and execute kernel on that data-chunk in different for loops. That'll be more efficient.
like this:
size = N*sizeof(float)/nStreams;
for (i=0; i<nStreams; i++){
offset = i*N/nStreams;
cudaMemcpyAsync(a_d+offset, a_h+offset, size, cudaMemcpyHostToDevice, stream[i]);
}
for (i=0; i<nStreams; i++){
offset = i*N/nStreams;
kernel<<<N(nThreads*nStreams), nThreads, 0, stream[i]>>> (a_d+offset);
}
In this way the memory copy doesn't have to wait for kernel execution of previous stream and vice versa.

CUDA 4.0 RC - many host threads per one GPU - cudaStreamQuery and cudaStreamSynchronize behaviour

I wrote a code which uses many host (OpenMP) threads per one GPU. Each thread has its own CUDA stream to order it requests. It looks very similar to below code:
#pragma omp parallel for num_threads(STREAM_NUMBER)
for (int sid = 0; sid < STREAM_NUMBER; sid++) {
cudaStream_t stream;
cudaStreamCreate(&stream);
while (hasJob()) {
//... code to prepare job - dData, hData, dataSize etc
cudaError_t streamStatus = cudaStreamQuery(stream);
if (streamStatus == cudaSuccess) {
cudaMemcpyAsync(dData, hData, dataSize, cudaMemcpyHostToDevice, stream);
doTheJob<<<gridDim, blockDim, smSize, stream>>>(dData, dataSize);
else {
CUDA_CHECK(streamStatus);
}
cudaStreamSynchronize(stream);
}
cudaStreamDestroy(stream);
}
And everything were good till I got many small jobs. In that case, from time to time, cudaStreamQuery returns cudaErrorNotReady, which is for me unexpected because I use cudaStreamSynchronize. Till now I were thinking that cudaStreamQuery will always return cudaSuccess if it is called after cudaStreamSynchronize. Unfortunately it appeared that cudaStreamSynchronize may finish even when cudaStreamQuery still returns cudaErrorNotReady.
I changed the code into the following and everything works correctly.
#pragma omp parallel for num_threads(STREAM_NUMBER)
for (int sid = 0; sid < STREAM_NUMBER; sid++) {
cudaStream_t stream;
cudaStreamCreate(&stream);
while (hasJob()) {
//... code to prepare job - dData, hData, dataSize etc
cudaError_t streamStatus;
while ((streamStatus = cudaStreamQuery(stream)) == cudaErrorNotReady) {
cudaStreamSynchronize();
}
if (streamStatus == cudaSuccess) {
cudaMemcpyAsync(dData, hData, dataSize, cudaMemcpyHostToDevice, stream);
doTheJob<<<gridDim, blockDim, smSize, stream>>>(dData, dataSize);
else {
CUDA_CHECK(streamStatus);
}
cudaStreamSynchronize(stream);
}
cudaStreamDestroy(stream);
}
So my question is.... is it a bug or a feature?
EDIT: it is similar to JAVA
synchronize {
while(waitCondition) {
wait();
}
}
What is under
//... code to prepare job - dData, hData, dataSize etc
Do you have any functions of kind cudaMemcpyAsync there, or the only memory transfer is in the code you have shown? Those are asynchronous functions may exit early, even when the code is not at the destination yet. When that happens cudaStreamQuery will return cudaSuccess only when memory transfers succeed.
Also, does hasJob() uses any of the host-CUDA functions?
If I am not mistaken, in a single stream, it is not possible to execute both kernel and memory transfers. Therefore, calling cudaStreamQuery is necessary only when a kernel depends on the data transferred by a different stream.
Didn't notice it earlier: cudaStreamSynchronize() should take a parameter (stream). I am not sure which stream you are synchronising when parameter is ommited, could be that it defaults to stream 0.