I am trying to implement a producer-consumer relationship between
two GPUs: my application makes the producer GPU record an event, and the
consumer GPU then inserts a stream-wait on that event into its command
stream. The consumer GPU stops processing its commands when it encounters the stream-wait,
until the producer GPU has passed the 'point of execution' where cudaEventRecord was called.
I started with a staging-buffer copy like this:
cudaError_t chCpyP2P(void *_dst, int dstDevice, const void *_src, int srcDevice, size_t N) {
cudaError_t status;
char *dst = (char*) _dst;
const char *src = (const char*) _src;
int stagingIndex = 0;
size_t copySize = min(N, STAGING_BUFFER_SIZE);
while ( N ) {
cudaSetDevice(srcDevice);
cudaStreamWaitEvent(0, g_events[dstDevice][stagingIndex],0);
cudaMemcpyAsync(g_hostBuffers[stagingIndex], src, copySize, cudaMemcpyDeviceToHost, NULL);
cudaEventRecord(g_events[srcDevice][stagingIndex] );
cudaSetDevice(dstDevice);
cudaMemcpyAsync(dst, g_hostBuffers[stagingIndex], copySize, cudaMemcpyHostToDevice, NULL);
dst += copySize;
src += copySize;
N -= copySize;
stagingIndex = 1 - stagingIndex;
}
}
But I am somehow missing an essential step, because it is not working as expected, and I cannot see where my reasoning goes wrong.
Does anyone know what I am missing?
Thanks for any help; I hope my question isn't too dumb.
On the host, the staging buffer is allocated, and the memcpy starts by having
the source GPU copy source data into the staging buffer and record an event.
But unlike a host-to-device memcpy, the CPU does not need to
synchronize, because all synchronization is done by the GPUs. Since the memcpy
and the event-record are asynchronous, the CPU can, directly after issuing the initial memcpy,
request the destination GPU to wait on that initial event and start a memcpy out of the same buffer.
To let the two GPUs use the staging buffers concurrently, two staging buffers and two CUDA events are needed. The CPU loops over the input and output buffers, issuing memcpy and event-record
commands until it has requested copies for all bytes, and then waits for both GPUs to finish processing.
cudaError_t chCpyP2P(void *_dst, int dstDevice, const void *_src, int srcDevice, size_t N)
{
    cudaError_t status = cudaSuccess;
    char *dst = (char *) _dst;
    const char *src = (const char *) _src;
    int stg_idx = 0; // staging-buffer index (ping-pongs between 0 and 1)
    while (N) {
        size_t sz_cpy = min(N, STAGING_BUFFER_SIZE);
        cudaSetDevice(srcDevice);
        // wait until the destination GPU has finished reading this staging buffer
        cudaStreamWaitEvent(0, g_events[dstDevice][stg_idx], 0);
        cudaMemcpyAsync(g_hostBuffers[stg_idx], src, sz_cpy, cudaMemcpyDeviceToHost, NULL);
        cudaEventRecord(g_events[srcDevice][stg_idx]);
        cudaSetDevice(dstDevice);
        // wait until the source GPU has filled this staging buffer
        cudaStreamWaitEvent(0, g_events[srcDevice][stg_idx], 0);
        cudaMemcpyAsync(dst, g_hostBuffers[stg_idx], sz_cpy, cudaMemcpyHostToDevice, NULL);
        cudaEventRecord(g_events[dstDevice][stg_idx]);
        dst += sz_cpy;
        src += sz_cpy;
        N -= sz_cpy;
        stg_idx = 1 - stg_idx;
    }
    cudaSetDevice(srcDevice);
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
        return status;
    cudaSetDevice(dstDevice);
    status = cudaDeviceSynchronize();
    return status;
}
You also need to define size_t sz_cpy outside the loop ;-)
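For completeness, here is one way the globals used above could be set up. This is only a sketch under assumptions that are not in the original post (two GPUs with ordinals 0 and 1, a 1 MiB staging-buffer size, no error checking); the essential points are that the host staging buffers are pinned, so cudaMemcpyAsync can actually run asynchronously, and that there is one event per device and per staging buffer:
#define STAGING_BUFFER_SIZE (1 << 20)   // assumed: 1 MiB per staging buffer

void       *g_hostBuffers[2];           // two pinned host staging buffers
cudaEvent_t g_events[2][2];             // [device ordinal][staging-buffer index]

void chInitP2P()
{
    // Pinned (page-locked) host memory, usable by both devices.
    cudaHostAlloc(&g_hostBuffers[0], STAGING_BUFFER_SIZE, cudaHostAllocPortable);
    cudaHostAlloc(&g_hostBuffers[1], STAGING_BUFFER_SIZE, cudaHostAllocPortable);

    for (int dev = 0; dev < 2; dev++) {
        cudaSetDevice(dev);
        for (int i = 0; i < 2; i++)
            // Timing is not needed; the events only order the two GPUs.
            cudaEventCreateWithFlags(&g_events[dev][i], cudaEventDisableTiming);
    }
}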
I have two kernels that process some data sequentially (each launched with only one thread). I want to combine the two so that I have one kernel to launch with two threads. After doing so, I was expecting an execution time of max(kernel1, kernel2), but what I got was the sum of the two execution times. I narrowed the problem down to something like the code below.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<iostream>
#include<string>
#include<vector>
#include<random>
#include<functional>
#include<algorithm>
#include<iterator>
__global__ void dummyKernel(const float *d_data_Re, const float *d_data_Im,
float *d_out_Re, float *d_out_Im, const int dataLen) {
int i{ threadIdx.x };
if (i == 0) {
printf("Thread zero started \n");
for (int j{}; j < 1000000; j++)
d_out_Re[j%dataLen] = sqrtf(2) + d_data_Re[j%dataLen] * (j % 4 == 1);
printf("Thread zero finished \n");
}
else if (i == 1) {
printf("Thread one started \n");
for (int j{}; j < 1000000; j++)
d_out_Im[j%dataLen] = sqrtf(2) + d_data_Im[j%dataLen] * (j % 4 == 1);
printf("Thread one finished \n");
}
}
__global__ void dummyKernel2(const float *d_data_Re, const float *d_data_Im,
float *d_out_Re, float *d_out_Im, const int dataLen) {
int i{ threadIdx.x };
//if (i == 0) {
printf("Thread zero started \n");
for (int j{}; j < 1000000; j++)
d_out_Re[j%dataLen] = sqrtf(2) + d_data_Re[j%dataLen] * (j % 4 == 1);
printf("Thread zero finished \n");
//}
//else if (i == 1) {
// printf("Thread one started \n");
// for (int j{}; j < 1000000; j++)
// d_out_Im[j%dataLen] = sqrtf(2) + d_data_Im[j%dataLen] * (j % 4 == 1);
// printf("Thread one finished \n");
//}
}
int main()
{
cudaError_t cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
return 1;
}
const int sizeOfFrame = 2 * 1024 * 1024;
std::vector<float> data_re(sizeOfFrame), data_im;
//random number generator
std::uniform_real_distribution<float> distribution(0.0f, 2.0f); //Values between 0 and 2
std::mt19937 engine; // Mersenne twister MT19937
auto generator = std::bind(distribution, engine);
std::generate_n(data_re.begin(), sizeOfFrame, generator);
std::copy(data_re.begin(), data_re.end(), std::back_inserter(data_im));
//
float *d_data_re, *d_data_im;
cudaMalloc(&d_data_re, sizeOfFrame * sizeof(float));
cudaMalloc(&d_data_im, sizeOfFrame * sizeof(float));
cudaMemcpy(d_data_re, data_re.data(), sizeOfFrame * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_data_im, data_im.data(), sizeOfFrame * sizeof(float), cudaMemcpyHostToDevice);
float *d_pll_out_re, *d_pll_out_im;
cudaMalloc(&d_pll_out_re, sizeOfFrame * sizeof(float));
cudaMalloc(&d_pll_out_im, sizeOfFrame * sizeof(float));
dummyKernel<<<1, 2>>>(d_data_re, d_data_im,
    d_pll_out_re, d_pll_out_im, sizeOfFrame);
cudaDeviceSynchronize();
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
btw, I got the code for the random number generator from an answer to this question. So the dummyKernel doesn't do anything useful; I just wanted a kernel that takes relatively long to finish. If you launch dummyKernel, the order of the output will be "Thread zero started", "Thread zero finished", "Thread one started", "Thread one finished". Sequential. But if you launch dummyKernel2, the order of the output will be "Thread zero started", "Thread zero started", "Thread zero finished", "Thread zero finished", and the execution time is almost half that of dummyKernel. I don't understand this behavior and the effect of the if-else I used.
OS: Windows 10, GTX 1050 Ti, CUDA Driver/Runtime version: 11.1/10.1.
Each CUDA multiprocessor has execution units (several each for int, float, special functions, ...). These work as pipelines: a calculation takes several cycles to complete, but in each cycle a new calculation can be inserted (scheduled), so several calculations are in flight at different stages of the pipeline at the same time.
Groups of 32 threads (warps) within a block are issued the same instruction at the same time (in the same cycle, or often over two cycles, depending on how many execution and datapath resources the architecture has available and how many this instruction needs), together with a bitfield stating for which threads the instruction should actively be executed. If some threads of a warp evaluated an if clause as false, they are temporarily deactivated; or some threads may have already exited the kernel.
The effect is that if the 32 threads of a warp diverge (branch differently), each execution path has to be run through for the whole warp, with the threads not on that path deactivated. That should be avoided for performance reasons, as the computation resources are reserved nevertheless. Threads from different warps don't have this interdependency, so the algorithm should be structured in a way that takes this into account (see the sketch below).
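Applied to the kernel from the question, a minimal sketch of such a restructuring (my own naming, not from the original post): selecting the work by blockIdx.x instead of threadIdx.x puts the two loops into different warps, so they no longer serialize against each other.
__global__ void dummyKernelPerBlock(const float *d_data_Re, const float *d_data_Im,
                                    float *d_out_Re, float *d_out_Im, const int dataLen) {
    if (blockIdx.x == 0) {          // block 0 handles the real part
        for (int j = 0; j < 1000000; j++)
            d_out_Re[j % dataLen] = sqrtf(2) + d_data_Re[j % dataLen] * (j % 4 == 1);
    } else {                        // block 1 handles the imaginary part
        for (int j = 0; j < 1000000; j++)
            d_out_Im[j % dataLen] = sqrtf(2) + d_data_Im[j % dataLen] * (j % 4 == 1);
    }
}
// launched as: dummyKernelPerBlock<<<2, 1>>>(d_data_re, d_data_im, d_pll_out_re, d_pll_out_im, sizeOfFrame);
Because the branch condition is now uniform within each warp, no thread of a warp is masked off, and the two blocks can make progress concurrently.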
With Volta, Independent Thread Scheduling was introduced. Each thread has its own instruction counter (and manages a separate function call stack). But the scheduler will still schedule groups of 32 threads (warps) with bitfields of active threads. What changed is that the scheduler can interleave the diverging paths: instead of executing CCCIIIEEECCC as pre-Volta (instructions: C=common, I=if branch, E=else branch), it could execute CCCIEEIIECCC, if that fits the available execution units or the memory latency better. As a programmer, one has to be careful, as it can no longer be assumed that the threads have not diverged, even when they are executing the same instruction. That is why __syncwarp was introduced and all kinds of cooperation functions (e.g. the shuffle instructions) got a sync variant. Nevertheless (although we cannot know for sure whether the threads have diverged), one still has to program in a way that all 32 threads can work together when executed synchronously, especially for coalesced memory accesses. Putting __syncwarp after each possibly diverging instruction can help to ensure convergence (but do performance profiling). A small example follows below.
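A minimal sketch of that idea (hypothetical kernel, not from the original post): re-converge the warp with __syncwarp after a data-dependent branch, then use the sync variant of a shuffle for a warp-level reduction. It assumes blockDim.x is a multiple of 32.
__global__ void warpSum(const int *in, int *out) {
    int lane = threadIdx.x % 32;
    int v = in[threadIdx.x];
    if (v < 0)              // threads of the warp may diverge here
        v = 0;
    __syncwarp();           // make sure the full warp has re-converged
    // warp-level reduction with the sync shuffle variant; the mask
    // names the participating lanes (all 32 here)
    for (int offset = 16; offset > 0; offset /= 2)
        v += __shfl_down_sync(0xffffffffu, v, offset);
    if (lane == 0)
        out[threadIdx.x / 32] = v;   // one partial sum per warp (single-block sketch)
}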
The Independent Thread Scheduling is also the reason why __syncthreads must definitely be called correctly on the RTX 3080, with every thread participating. A typical fix for the deadlock case you mentioned in the comment is to close the if clause, synchronize all the threads, and open a new if clause with the same condition as the previous one, as sketched below.
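A schematic sketch of that pattern (hypothetical kernels and condition, shown only to illustrate the restructuring):
// Risky: __syncthreads inside a branch that not every thread of the block takes.
__global__ void risky_kernel(float *data) {
    if (threadIdx.x < 16) {
        // ... first part of the work on data ...
        __syncthreads();   // threads with threadIdx.x >= 16 never reach this: deadlock risk
        // ... second part of the work ...
    }
}

// Fixed: close the if clause, sync all threads, reopen the same condition.
__global__ void fixed_kernel(float *data) {
    if (threadIdx.x < 16) {
        // ... first part of the work on data ...
    }
    __syncthreads();       // every thread of the block participates
    if (threadIdx.x < 16) {
        // ... second part of the work ...
    }
}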
What is the definition of the start and the end of a kernel launch on the CPU and on the GPU (the yellow blocks)? Where is the boundary between them?
Please notice that the start, end, and duration of those yellow blocks on the CPU and on the GPU are different. Why does the CPU invocation of vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n); take that long?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n)
{
// Get our global thread ID
int id = blockIdx.x*blockDim.x+threadIdx.x;
//printf("id = %d \n", id);
// Make sure we do not go out of bounds
if (id < n)
c[id] = a[id] + b[id];
}
int main( int argc, char* argv[] )
{
// Size of vectors
int n = 1000000;
// Host input vectors
double *h_a;
double *h_b;
//Host output vector
double *h_c;
// Device input vectors
double *d_a;
double *d_b;
//Device output vector
double *d_c;
// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);
// Allocate memory for each vector on host
h_a = (double*)malloc(bytes);
h_b = (double*)malloc(bytes);
h_c = (double*)malloc(bytes);
// Allocate memory for each vector on GPU
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);
int i;
// Initialize vectors on host
for( i = 0; i < n; i++ ) {
h_a[i] = sin(i)*sin(i);
h_b[i] = cos(i)*cos(i);
}
// Copy host vectors to device
cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);
int blockSize, gridSize;
// Number of threads in each thread block
blockSize = 1024;
// Number of thread blocks in grid
gridSize = (int)ceil((float)n/blockSize);
// Execute the kernel
vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
// Copy array back to host
cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );
// Sum up vector c and print result divided by n, this should equal 1 within error
double sum = 0;
for(i=0; i<n; i++)
sum += h_c[i];
printf("final result: %f\n", sum/n);
// Release device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
// Release host memory
free(h_a);
free(h_b);
free(h_c);
return 0;
}
(Screenshots from the profiler timeline: one showing the CPU yellow block, one showing the GPU yellow block.)
Note that you mention nvprof, but the pictures you are showing are from nvvp, the visual profiler; nvprof is the command-line profiler.
GPU Kernel launches are asynchronous. That means that the CPU thread launches the kernel but does not wait for the kernel to complete. In fact, the CPU activity is actually placing the kernel in a launch queue - the actual execution of the kernel may be delayed if anything else is happening on the GPU.
So there is no defined relationship between the CPU (API) activity, and the GPU activity with respect to time, except that the CPU kernel launch must obviously precede (at least slightly) the GPU kernel execution.
The CPU (API) yellow block represents the duration of time that the CPU thread spends in a library call into the CUDA Runtime library, to launch the kernel (i.e. place it in the launch queue). This library call activity usually has some time overhead associated with it, in the range of 5-50 microseconds. The start of this period is marked by the start of the call into the library. The end of this period is marked by the time at which the library returns control to your code (i.e. your next line of code after the kernel launch).
The GPU yellow block represents the actual time period during which the kernel was executing on the GPU. The start and end of this yellow block are marked by the start and end of kernel activity on the GPU. The duration here is a function of what the code in your kernel is doing, and how long it takes.
I don't think the exact reason why a GPU kernel launch takes ~5-50 microseconds of CPU time is documented or explained anywhere in an authoritative fashion, and it is a closed source library, so you will need to acknowledge that overhead as something you have little control over. If you design kernels that run for a long time and do a lot of work, this overhead can become insignificant.
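If you want to see the two durations separately in your own code, one rough way (a sketch only, not from the original answer; it assumes the variables from the question's main() and additionally requires #include <chrono>) is to time the launch call itself and then time until the kernel has actually finished:
// inside main(), replacing the plain kernel launch from the question
auto t0 = std::chrono::high_resolution_clock::now();
vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);  // returns as soon as the launch is queued
auto t1 = std::chrono::high_resolution_clock::now();
cudaDeviceSynchronize();                            // blocks until the kernel has finished
auto t2 = std::chrono::high_resolution_clock::now();

printf("launch (CPU/API) overhead: %lld us\n",
       (long long)std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count());
printf("launch + kernel execution: %lld us\n",
       (long long)std::chrono::duration_cast<std::chrono::microseconds>(t2 - t0).count());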
I am refreshing my memory of CUDA, especially unified memory (my last real CUDA development was 3 years ago); I am a bit rusty.
The problem:
I am creating a task from a container that uses unified memory. However, I get a crash. After a few days of investigation,
I can say where the crash happens (the copy constructor), but not why, because all pointers are allocated correctly.
As far as I can tell, I am not contradicting the NVIDIA post about C++ and unified memory
(https://devblogs.nvidia.com/parallelforall/unified-memory-in-cuda-6/).
#include <cuda.h>
#include <cstdio>
template<class T>
struct container{
container(int size = 1){ cudaMallocManaged(&p,size*sizeof(T));}
~container(){cudaFree(p);}
__device__ __host__ T& operator[](int i){ return p[i];}
T * p;
};
struct task{
int* a;
};
__global__ void kernel_gpu(task& t, container<task>& v){
printf(" gpu value task %i, should be 2 \n", *(t.a)); // this work
task tmp(v[0]); // BUG
printf(" gpu value task from vector %i, should be 1 \n", *(tmp.a));
}
void kernel_cpu(task& t, container<task>& v){
printf(" cpu value task %i, should be 2 \n", *(t.a)); // this work
task tmp(v[0]);
printf(" cpu value task from vector %i, should be 1 \n", *(tmp.a));
}
int main(int argc, const char * argv[]) {
int* p1;
int* p2;
cudaMallocManaged(&p1,sizeof(int));
cudaMallocManaged(&p2,sizeof(int));
*p1 = 1;
*p2 = 2;
task t1,t2;
t1.a=p1;
t2.a=p2;
container<task> c(2);
c[0] = t1;
c[1] = t2;
//gpu does not work
kernel_gpu<<<1,1>>>(c[1],c);
cudaDeviceSynchronize();
//cpu should work, no concurent access
kernel_cpu(c[1],c);
printf("job done !\n");
cudaFree(p1);
cudaFree(p2);
return 0;
}
Objectively, I can pass an object as an argument as long as its memory has been allocated properly. However, it looks like it is not possible to use a second level
of indirection (here, the container).
I am making a conceptual mistake, but I do not see where.
Best,
Timocafe
My machine: CUDA 7.5, gcc 4.8.2, Tesla K20m
Although the memory it manages was allocated as unified memory, the container object itself is declared in host code and lives in ordinary host memory: container<task> c(2);. You cannot pass it by reference to device code, and dereferencing it in a kernel will very likely result in an illegal memory access.
You may want to use cuda-memcheck to identify such issues.
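One possible fix, sketched below, follows the 'Managed' base-class pattern from the NVIDIA post linked in the question: give the container object itself managed storage so that a reference to it is also valid on the device. The container must then be heap-allocated with new so the overloaded operator is used.
// Base class from the NVIDIA unified-memory post: objects of derived classes
// are placed in managed memory when allocated with new.
struct Managed {
    void *operator new(size_t len) {
        void *ptr;
        cudaMallocManaged(&ptr, len);
        cudaDeviceSynchronize();
        return ptr;
    }
    void operator delete(void *ptr) {
        cudaDeviceSynchronize();
        cudaFree(ptr);
    }
};

template<class T>
struct container : public Managed {
    container(int size = 1) { cudaMallocManaged(&p, size * sizeof(T)); }
    ~container() { cudaFree(p); }
    __device__ __host__ T& operator[](int i) { return p[i]; }
    T *p;
};

// usage (sketch): the container itself now lives in managed memory
//   container<task> *c = new container<task>(2);
//   (*c)[0] = t1; (*c)[1] = t2;
//   kernel_gpu<<<1,1>>>((*c)[1], *c);
//   cudaDeviceSynchronize();
//   delete c;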
I am basically looking for a way to synchronize a stream from within the device. I want to avoid using cudaDeviceSynchronize(), as it would serialize the execution of the kernels that I want to run concurrently in different streams.
More detailed description: I have written a kernel that is a stabilized bi-conjugate gradient solver. I want to launch this kernel concurrently on different data using streams.
This kernel uses cuBLAS functions, which are called from within the kernel.
One of the operations required by the solver is the calculation of a dot product of two vectors. This could be done with cublasSdot(), but as that call is synchronous, execution of the kernels in different streams gets serialized. Instead of calling a dot-product function, I calculate the dot product using cublasSgemv(), which is called asynchronously. The problem is that this function returns before the result is calculated. I therefore want to synchronize the stream from the device; I am looking for an equivalent of cudaStreamSynchronize() that is callable from the device.
__device__ float _cDdot(cublasHandle_t & cublasHandle, const int n, real_t * x, real_t * y) {
float *norm; norm = new float;
float alpha = 1.0f; float beta = 0.0f;
cublasSgemv_v2(cublasHandle, CUBLAS_OP_N ,1 , n, &alpha, x, 1, y, 1, &beta, norm, 1);
return *norm;
}
What can I do to make sure, that the result is calculated before the function returns? Of course insertion of cudaDeviceSynchronize() works, but as I mentioned, it serializes the execution of my kernel across streams.
Probably if you read the programming guide dynamic parallelism section carefully (especially streams, events, and synchronization), you may get some ideas. Here's what I came up with:
There is an implicit NULL stream (on the device) associated with the execution sequence that calls your _cDdot function (oddly named, IMHO, since you're working with float quantities in that case, i.e. using Sgemv). Therefore, any cuda kernel or API call issued after the call to cublasSgemv_v2 in your function should wait until any cuda activity associated with the cublasSgemv_v2 function is complete. If you insert an innocuous cuda API call, or else a dummy kernel call, after the call to cublasSgemv_v2, it should wait for that to be complete. This should give you the thread-level synchronization you are after. You might also be able to use a cudaEventRecord call followed by a cudaStreamWaitEvent call.
Here's an example to show the implicit stream synchronization approach:
#include <stdio.h>
#include <cublas_v2.h>
#define SZ 16
__global__ void dummy_kernel(float *in, float *out){
*out = *in;
}
__device__ float _cDdot(cublasHandle_t & cublasHandle, const int n, float * x, float * y, const int wait) {
float *norm; norm = new float;
float alpha = 1.0f; float beta = 0.0f;
*norm = 0.0f;
cublasSgemv_v2(cublasHandle, CUBLAS_OP_N ,1 , n, &alpha, x, 1, y, 1, &beta, norm, 1);
if (wait){
dummy_kernel<<<1,1>>>(norm, norm);
}
return *norm;
}
__global__ void compute(){
cublasHandle_t my_h;
cublasStatus_t status;
status = cublasCreate(&my_h);
if (status != CUBLAS_STATUS_SUCCESS) printf("cublasCreate fail\n");
float *x, *y;
x = new float[SZ];
y = new float[SZ];
for (int i = 0; i < SZ; i++){
x[i] = 1.0f;
y[i] = 1.0f;}
float result = _cDdot(my_h, SZ, x, y, 0);
printf("result with no wait = %f\n", result);
result = _cDdot(my_h, SZ, x, y, 1);
printf("result with wait = %f\n", result);
}
int main(){
compute<<<1,1>>>();
cudaDeviceSynchronize();
return 0;
}
compile with:
nvcc -arch=sm_35 -rdc=true -o t302 t302.cu -lcudadevrt -lcublas -lcublas_device
results:
$ ./t302
result with no wait = 0.000000
result with wait = 16.000000
$
Unfortunately, I tried a completely empty dummy_kernel; that did not work unless I compiled with -G. So the compiler may be smart enough to optimize out a completely empty child kernel call.
I've been trying to debug my code, as I know something is going wrong in the Kernel, and I've been trying to figure out what specifically. If I try to step into the kernel it seems to completely step over the kernel functions, and will eventually cause an error on quitting:
Single stepping until exit from function dyld_stub_cudaSetupArgument,
which has no line number information.
[Launch of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),(4,1,1)>>>) on
Device 0]
[Termination of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),
(4,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
[Termination of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
add (below=0x124400, newtip=0x124430, newfork=0x125ac0) at test.cu:1223
And if I try to break in the Kernel my entire computer crashes and I have to restart it.
I figure there must be something wrong with the way I'm calling the kernel, but I can't figure out what.
The code is rather long, so I'm only including an excerpt of it:
__global__ void fillinOne(seqptr qset, long max) {
int i, j;
aas aa;
int idx = blockIdx.x;
__shared__ long qs[3];
if(idx < max)
{
memcpy(qs, qset[idx], sizeof(long[3]));
for (i = 0; i <= 1; i++)
{
for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1))
{
if (((1L << ((long)aa)) & qs[i]) != 0)
{
for (j = i + 1; j <= 2; j++)
qs[j] |= cudaTranslate[(long)aa - (long)ala][j - i];
}
}
}
}
}
//Kernel for left!= NULL and rt != NULL
void fillin(node *p, node *left, node *rt)
{
cudaError_t err = cudaGetLastError();
size_t stepsize = chars * sizeof(long);
size_t sitesize = chars * sizeof(sitearray);
//int i, j;
if (left == NULL)
{
//copy rt->numsteps into p->numsteps--doesn't actually require CUDA, because no computation to do
memcpy(p->numsteps, rt->numsteps, stepsize);
checkCUDAError("memcpy");
//allocate siteset (array of sitearrays) on device
seqptr qsites; //as in array of qs's
cudaMalloc((void **) &qsites, sitesize);
checkCUDAError("malloc");
//copy rt->siteset into device array (equivalent to memcpy(qs, rs) but for whole array)
cudaMemcpy(qsites, rt->siteset, sitesize, cudaMemcpyHostToDevice);
checkCUDAError("memcpy");
//do loop in device
int block_size = 1; //each site operated on independently
int n_blocks = chars;
fillinOne <<< n_blocks, block_size>>> (qsites, chars);
cudaThreadSynchronize();
//put qset in p->siteset--equivalent to memcpy(p->siteset[m], qs)
cudaMemcpy(p->siteset, qsites, sitesize, cudaMemcpyDeviceToHost);
checkCUDAError("memcpy");
//Cleanup
cudaFree(qsites);
}
If anyone has any ideas at all, please respond! Thanks in advance!
I suppose you have a single-card configuration. When you are debugging a CUDA kernel and you break inside it, you effectively pause the display driver. That causes what you think is a crash. If you want to use cuda-gdb with only one graphics card, you must use it in command-line mode (don't start X, or press Ctrl-Alt-Fn from X).
If you have two cards, you must run the code on the card that is not driving the display. Use cudaSetDevice(n); a sketch of how to pick such a card follows below.
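A small sketch (my own, not from the original answer) of one way to pick a card that is not driving the display: the kernelExecTimeoutEnabled device property is normally set on GPUs with a display watchdog attached, so a device where it is 0 is a reasonable choice.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int dev = 0; dev < count; dev++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);
        // No kernel-execution watchdog usually means no display is attached.
        if (!prop.kernelExecTimeoutEnabled) {
            printf("using device %d (%s) for debugging\n", dev, prop.name);
            cudaSetDevice(dev);
            break;
        }
    }
    // ... launch the kernels to be debugged here ...
    return 0;
}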