Using multiple streams in a CUDA graph, the execution order is uncontrolled

I am using the CUDA graph stream capture API to implement a small demo with multiple streams, following the example in the CUDA Programming Guide here. I wrote the complete code below. To my understanding, kernelB should execute on stream1, but with nsys I found that kernelB executes on a completely new stream, which is out of my control. (The nsys scheduling timeline is not reproduced here.)
Here is my code:
#include <iostream>

__global__ void kernelA() {}
__global__ void kernelB() {}
__global__ void kernelC() {}

int main() {
    cudaStream_t stream1, stream2;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);

    cudaGraphExec_t graphExec = NULL;
    cudaEvent_t event1, event2;
    cudaEventCreate(&event1);
    cudaEventCreate(&event2);

    for (int i = 0; i < 10; i++) {
        cudaGraph_t graph;
        cudaGraphExecUpdateResult updateResult;
        cudaGraphNode_t errorNode;

        cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal);
        kernelA<<<512, 512, 0, stream1>>>();
        cudaEventRecord(event1, stream1);
        cudaStreamWaitEvent(stream2, event1, 0);
        kernelB<<<256, 512, 0, stream1>>>();
        kernelC<<<16, 512, 0, stream2>>>();
        cudaEventRecord(event2, stream2);
        cudaStreamWaitEvent(stream1, event2, 0);
        cudaStreamEndCapture(stream1, &graph);

        if (graphExec != NULL) {
            cudaGraphExecUpdate(graphExec, graph, &errorNode, &updateResult);
        }
        if (graphExec == NULL || updateResult != cudaGraphExecUpdateSuccess) {
            if (graphExec != NULL) {
                cudaGraphExecDestroy(graphExec);
            }
            cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
        }
        cudaGraphDestroy(graph);
        cudaGraphLaunch(graphExec, stream1);
        cudaStreamSynchronize(stream1);
    }
}

"An operation may be scheduled at any time once the nodes on which it depends are complete. Scheduling is left up to the CUDA system." Here.

I also asked on the NVIDIA forums, and Robert answered the question there, which helped me a lot. Anyone interested in the scheduling of CUDA graphs can also refer to that answer here.
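One way to convince yourself that the captured graph records only dependencies, not the capture streams, is to query its node and edge counts right after capture. A minimal sketch (my addition, not part of the original demo), to be dropped in right after cudaStreamEndCapture(stream1, &graph) in the loop above, assuming <cstdio> is included:

// Sketch: with the capture sequence above we expect 3 kernel nodes (A, B, C) and
// 2 dependency edges (A->B and A->C). The streams used during capture are not
// recorded, so B and C may run on whatever stream the runtime picks once A finishes.
size_t numNodes = 0, numEdges = 0;
cudaGraphGetNodes(graph, NULL, &numNodes);        // NULL array: just query the count
cudaGraphGetEdges(graph, NULL, NULL, &numEdges);  // NULL arrays: just query the count
printf("nodes: %zu, edges: %zu\n", numNodes, numEdges);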

Related

Unified memory and struct with arrays

I have a big struct of arrays of structs on CUDA that is constant and read-only for my application. A quite simplified example would be:
struct Graph{
    Node * nodes;
    int nNode;
};

struct Node{
    int* pos;
    int nPos;
};
My kernels would need to navigate this graph and query it. As you know, copying this struct to GPU memory with cudaMalloc and cudaMemcpy takes a lot of code, which unified memory is supposed to remove the need for.
In my code, I generated the graph on the CPU and then, for testing, I designed the following kernel:
__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
};
being called as:
// using malloc for testing to make sure I know what I am doing
int * d_res,* h_res;
cudaMalloc((void **)&d_res,sizeof(int));
h_res=(int*)malloc(sizeof(int));
testKernel<<<1,1>>>(graph,d_res);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaMemcpy(h_res,d_res,sizeof(int),cudaMemcpyDeviceToHost));
with the error checks from here.
When I use the testKernel as is shown, it works fine, but if I change the kernel to:
__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nodes[0].nPos;
};
I get illegal memory access errors.
Is this because the unified memory does not handle this type of data correctly?
Is there a way to make sure I can avoid writing all the explicit copies to GPU memory?
Full MCVE:
#include <algorithm>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>
#include <cuda.h>

typedef struct node{
    int* pos;
    int nPos;
}Node;

typedef struct Graph{
    Node * nodes;
    int nNode;
}Graph;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
    // d_res[0]=graph.nodes[0].nPos; // Not working
};

int main(void){
    // fake data, this comes from another process
    Graph graph;
    graph.nodes = (Node*)malloc(2*sizeof(Node));
    graph.nNode = 2;
    for (int i = 0; i < 2; i++){
        // They can have different sizes in the original code
        graph.nodes[i].pos = (int*)malloc(3 * sizeof(int));
        graph.nodes[i].pos[0] = 0;
        graph.nodes[i].pos[1] = 1;
        graph.nodes[i].pos[2] = 2;
        graph.nodes[i].nPos = 3;
    }
    printf("%d\n", graph.nNode); // Change to the kernel variable for comparison
    int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(int));
    h_res = (int*)malloc(sizeof(int));
    testKernel<<<1, 1>>>(graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));
    printf("%d", h_res[0]);
    return 0;
}
Your code isn't using CUDA unified memory. UM is not "automatic" in any way. It requires specific programming steps to take advantage of it and it has specific system requirements.
All of this is covered in the UM section of the programming guide.
Is there a way to make sure I can avoid writing all the explicit copies to GPU memory?
Proper use of UM should allow this. Here is a fully worked example. The only thing I have done is mechanically convert your malloc operations in host code to equivalent cudaMallocManaged operations.
$ cat t1389.cu
#include <algorithm>
#include <stdio.h>

typedef struct node{
    int* pos;
    int nPos;
}Node;

typedef struct Graph{
    Node * nodes;
    int nNode;
}Graph;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
    d_res[0] = graph.nodes[0].nPos; // now works with the managed allocations
};

int main(void){
    // fake data, this comes from another process
    Graph graph;
    cudaMallocManaged(&(graph.nodes), 2*sizeof(Node));
    graph.nNode = 2;
    for (int i = 0; i < 2; i++){
        // They can have different sizes in the original code
        cudaMallocManaged(&(graph.nodes[i].pos), 3 * sizeof(int));
        graph.nodes[i].pos[0] = 0;
        graph.nodes[i].pos[1] = 1;
        graph.nodes[i].pos[2] = 2;
        graph.nodes[i].nPos = 3;
    }
    printf("%d\n", graph.nNode); // Change to the kernel variable for comparison
    int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(int));
    h_res = (int*)malloc(sizeof(int));
    testKernel<<<1, 1>>>(graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));
    printf("%d", h_res[0]);
    return 0;
}
$ nvcc t1389.cu -o t1389
$ cuda-memcheck ./t1389
========= CUDA-MEMCHECK
2
3========= ERROR SUMMARY: 0 errors
$
UM has a number of system requirements that are documented. I'm not going to try to recite them all here. Primarily you need a cc3.0 or higher GPU. Your MCVE did not include any standard error checking, and I didn't try to add it. But if you still have problems with this code, be sure to use proper CUDA error checking and run it with cuda-memcheck.
If your entire data structure, including embedded pointers, is allocated using ordinary host allocators, and you have no control over that, then you won't be able to use it directly in a UM regime, without doing some sort of involved copying. The exception here would be on an IBM Power9 system as mentioned in section K.1.6 of the above linked programming guide section.
Before attempting to use a host allocator (e.g. malloc) with UM, you should first test the pageableMemoryAccessUsesHostPageTables property, as mentioned in that section.
That property currently won't be set on any system except a properly configured IBM Power9 system. No x86 system currently has this property set/available.
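As a side note on that last point, the property can be queried at runtime. Here is a minimal sketch (my addition, not part of the original answer), compiled as an ordinary .cu file with nvcc:

#include <cstdio>

int main() {
    // Query whether pageable (plain malloc'd) host memory is directly accessible by
    // the GPU through the host's page tables. This is 0 on typical x86 systems and
    // 1 only on specially configured systems such as the Power9/NVLink case above.
    int usesHostPageTables = 0;
    cudaDeviceGetAttribute(&usesHostPageTables,
                           cudaDevAttrPageableMemoryAccessUsesHostPageTables, 0);
    printf("pageableMemoryAccessUsesHostPageTables: %d\n", usesHostPageTables);
    return 0;
}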

Atomic operation on the circular global buffer in cuda

I am implementing a circular buffer in global memory so that all threads can read/write data to the same buffer simultaneously. It is a very simple producer/consumer algorithm on the CPU, but I found something wrong in my CUDA code.
The circular buffer was defined as follows:
#define BLOCK_NUM 1024
#define THREAD_NUM 64
#define BUFFER_SIZE (BLOCK_NUM*THREAD_NUM*10)

struct Stack {
    bool bDirty[BUFFER_SIZE];
    unsigned int index;
    unsigned int iStackSize;
};
The read device function is implemented as:
__device__ void read(Stack *pStack) {
    unsigned int index = atomicDec(&pStack->index, BUFFER_SIZE-1);
    if(--index >= BUFFER_SIZE)
        index = BUFFER_SIZE - 1;
    // check
    if(pStack->bDirty[index] == false) {
        printf("no data\n");
        return;
    }
    // set read flag
    pStack->bDirty[index] = false;
    atomicSub(&pStack->iStackSize, 1);
}
The write device function is:
__device__ void write(Stack *pStack) {
    unsigned int index = atomicInc(&pStack->index, BUFFER_SIZE - 1);
    // check
    if(pStack->bDirty[index] == true) {
        printf("why dirty\n");
        return;
    }
    pStack->bDirty[index] = true;
    atomicAdd(&pStack->iStackSize, 1);
}
In order to test the read/write functions in a more robust way, I wrote the following kernels:
__global__ void kernelWrite(Stack *pStack) {
    if(threadIdx.x != 0) // make writes fewer than the thread count, for testing purposes
        write(pStack);
}

__global__ void kernelRead(Stack *pStack) {
    read(pStack);
    __syncthreads();
    if(threadIdx.x % 3 != 0) // make writes fewer than reads
        write(pStack);
    __syncthreads();
}
In the main function, I use an infinite loop to test whether the read/write is atomic.
int main() {
    Stack *pHostStack = (Stack*)malloc(sizeof(Stack));
    Stack *pStack;
    cudaMalloc(&pStack, sizeof(Stack));
    cudaMemset(pStack, 0, sizeof(Stack));

    while(true) { // endless stress loop
        kernelWrite<<<BLOCK_NUM, THREAD_NUM>>>(pStack);
        cudaDeviceSynchronize();
        cudaMemcpy(pHostStack, pStack, sizeof(Stack), cudaMemcpyDeviceToHost);
        while(pHostStack->iStackSize >= BLOCK_NUM*THREAD_NUM) {
            kernelRead<<<BLOCK_NUM, THREAD_NUM>>>(pStack);
            cudaDeviceSynchronize();
            cudaMemcpy(pHostStack, pStack, sizeof(Stack), cudaMemcpyDeviceToHost);
        }
    }
    return 0;
}
When I execute the above code, I get the error messages "why dirty" and "no data". What is wrong with the read/write logic?
By the way, I do not map the thread ID to a linear buffer address because in my application perhaps only 10% of the threads write to the buffer, and which ones is unpredictable/random.
The key problem is that the operation is not truly atomic: the index is updated atomically, but the subsequent check and update of the bDirty flag are separate, non-atomic accesses to the same buffer, so readers and writers can race on a slot. The strange thing is that when the total thread count is less than 4096, no error message is shown.
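To illustrate that point, here is a sketch (my addition, not from the original thread) of one way to make the per-slot check-and-set a single atomic step. It assumes bDirty is changed from bool[] to int[] so that atomicCAS can operate on the flags:

// Sketch only: Stack is assumed to declare `int bDirty[BUFFER_SIZE];`
__device__ void writeClaim(Stack *pStack) {
    unsigned int index = atomicInc(&pStack->index, BUFFER_SIZE - 1);
    // Claim the slot only if it is currently clean (0). The test and the flag update
    // happen in one atomic operation, so two writers cannot both claim the slot, and
    // a writer cannot interleave with a reader's separate check/clear of the flag.
    if (atomicCAS(&pStack->bDirty[index], 0, 1) != 0) {
        printf("slot %u still dirty\n", index); // producer has wrapped onto unread data
        return;
    }
    atomicAdd(&pStack->iStackSize, 1);
}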

Can't I call a __host__ __device__ function from a __device__ function?

In the CUDA documentation I found that cudaDeviceGetAttribute is a __host__ __device__ function, so I thought I could call it in my __global__ function to get some attributes of my device. Sadly it seems to mean something different, because I get a compile error even if I put it into a __device__ function and call that from my __global__ function.
Is it possible to call cudaDeviceGetAttribute on my GPU, or what else does __host__ __device__ mean?
Here is my source code:
__device__ void GetAttributes(int* unique)
{
    cudaDeviceAttr attr = cudaDevAttrMaxThreadsPerBlock;
    cudaDeviceGetAttribute(unique, attr, 0);
}

__global__ void ClockTest(int* a, int* b, long* return_time, int* unique)
{
    clock_t start = clock();

    //some complex calculations
    *a = *a + *b;
    *b = *a + *a;
    GetAttributes(unique);
    *a = *a + *b - *a;

    clock_t end = clock();
    *return_time = end - start;
}

int main()
{
    int a = 2;
    int b = 3;
    long time = 0;
    int uni;

    int* dev_a;
    int* dev_b;
    long* dev_time;
    int* unique;

    for (int i = 0; i < 10; ++i) {
        cudaMalloc(&dev_a, sizeof(int));
        cudaMalloc(&dev_b, sizeof(int));
        cudaMalloc(&dev_time, sizeof(long));
        cudaMalloc(&unique, sizeof(int));

        cudaMemcpy(dev_a, &a, sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(dev_b, &b, sizeof(int), cudaMemcpyHostToDevice);

        ClockTest<<<1,1>>>(dev_a, dev_b, dev_time, unique);

        cudaMemcpy(&a, dev_a, sizeof(int), cudaMemcpyDeviceToHost);
        cudaMemcpy(&time, dev_time, sizeof(long), cudaMemcpyDeviceToHost);
        cudaMemcpy(&uni, unique, sizeof(int), cudaMemcpyDeviceToHost);

        cudaFree(&dev_a);
        cudaFree(&dev_b);
        cudaFree(&dev_time);
        cudaFree(&unique);

        printf("%d\n", time);
        printf("unique: %d\n", uni);
        cudaDeviceReset();
    }
    return 0;
}
EDIT: sorry, my previous answer was not correct. There does seem to be a problem in nvcc (see below).
cudaDeviceGetAttribute can work correctly in device code. Here is a worked example on a K20X with CUDA 8.0.61:
$ cat t1305.cu
#include <stdio.h>

__global__ void tkernel(){
    int val;
    cudaError_t err = cudaDeviceGetAttribute(&val, cudaDevAttrMaxThreadsPerBlock, 0);
    printf("err = %d, %s\n", err, cudaGetErrorString(err));
    printf("val = %d\n", val);
}

int main(){
    tkernel<<<1,1>>>();
    cudaDeviceSynchronize();
}
$ nvcc -arch=sm_35 -o t1305 t1305.cu -rdc=true -lcudadevrt
$ cuda-memcheck ./t1305
========= CUDA-MEMCHECK
err = 0, no error
val = 1024
========= ERROR SUMMARY: 0 errors
$
There are various runtime API functions supported for use in device code.
For the supported runtime API functions, it's generally necessary to:
compile for a cc 3.5 or higher device
compile with relocatable device code
link against the cuda device runtime library
In addition, your code has some other coding errors: we do not pass the address of the pointer to cudaFree, just the pointer itself.
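For reference, a short illustration of that cleanup applied to the question's variables (my addition):

// cudaFree takes the device pointer itself, not the address of the pointer variable:
cudaFree(dev_a);     // correct
cudaFree(dev_b);
cudaFree(dev_time);
cudaFree(unique);
// cudaFree(&dev_a); // what the question does: passes the address of the host-side pointer (wrong)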
Caveats for this particular function:
There appears to be a problem in the CUDA compiler: if this device runtime API call is used without any other runtime API call in the kernel code, code generation will not happen correctly. The workaround at this time is to make sure your kernel contains at least one other CUDA runtime API call; in my example above I used cudaGetErrorString, but you could, for example, use cudaDeviceSynchronize() or anything else (a sketch of the workaround applied to the question's code follows these caveats). I have filed an internal NVIDIA bug to report this issue.
There appears to be a documentation error in the list of device runtime API calls supported in the CDP section of the programming guide (link above). The function cudaGetDeviceProperty does not exist, but I believe it should refer to cudaDeviceGetAttribute. I have filed an internal NVIDIA bug for this documentation error.
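As an illustration of the first caveat, here is a sketch (my addition, not from the original answer) of the workaround applied to the question's helper function, assuming the same -rdc=true -lcudadevrt compile settings as the example above:

__device__ void GetAttributes(int* unique)
{
    cudaDeviceAttr attr = cudaDevAttrMaxThreadsPerBlock;
    cudaError_t err = cudaDeviceGetAttribute(unique, attr, 0);
    // Any second device-side runtime API call satisfies the workaround; here the
    // error-string call doubles as error reporting.
    printf("cudaDeviceGetAttribute: %s\n", cudaGetErrorString(err));
}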

Is there any way I can have a barrier within Device code that is controlled by Host?

For example, my code is something like this (but it doesn't work and the kernel stalls):
__device__ __managed__ int x;

__global__ void kernel() {
    // do something
    while(x == 1); // a barrier
    // do the rest
}

int main() {
    x = 1;
    kernel<<< 1, 1 >>>();
    x = 0;
    //...
}
Is there any way I can do this?
You cannot do this with the current implementation of managed memory, because managed memory requires exclusive access to managed data by the device while kernels are running. Host access to managed data while a kernel is running will lead to undefined behavior, typically a seg fault.
This should be possible using zero-copy techniques, however, including the volatile recommendation from @Cicada.
Here's a worked example:
$ cat t736.cu
#include <stdio.h>
#include <unistd.h>

__global__ void mykernel(volatile int *idata, volatile int *odata){
    *odata = *idata;
    while (*idata == 1);
    *odata = *idata+5;
}

int main(){
    int *idata, *odata;
    cudaHostAlloc(&idata, sizeof(int), cudaHostAllocMapped);
    cudaHostAlloc(&odata, sizeof(int), cudaHostAllocMapped);
    *odata = 0;
    *idata = 1; // set barrier
    mykernel<<<1,1>>>(idata, odata);
    sleep(1);
    printf("odata = %d\n", *odata); // expect this to be 1
    *idata = 0; // release barrier
    sleep(1);
    printf("odata = %d\n", *odata); // expect this to be 5
    cudaDeviceSynchronize(); // if kernel is hung, we will hang
    return 0;
}
$ nvcc -o t736 t736.cu
$ cuda-memcheck ./t736
========= CUDA-MEMCHECK
odata = 1
odata = 5
========= ERROR SUMMARY: 0 errors
$
The above assumes a linux 64 bit environment.
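One small addition of my own: if portability is a concern, the mapped (zero-copy) capability can be checked up front. A minimal sketch:

#include <cstdio>

int main() {
    // Zero-copy (mapped) host memory requires canMapHostMemory; it is available on
    // essentially all modern GPUs, but checking makes the assumption explicit.
    int canMap = 0;
    cudaDeviceGetAttribute(&canMap, cudaDevAttrCanMapHostMemory, 0);
    printf("canMapHostMemory: %d\n", canMap);
    return 0;
}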

Using CUDA Thrust algorithms sequentially on the host

I wish to compare a Thrust algorithm's runtime when executed sequentially on a single CPU core versus a parallel execution on a GPU.
Thrust specifies the thrust::seq execution policy, but how can I explicitly target the host backend system? I wish to avoid executing the algorithm sequentially on the GPU.
CUDA Thrust is architecture agnostic. Accordingly, consider the code I provided as an answer to
Cumulative summation in CUDA
In that code, MatingProbability and CumulativeProbability were thrust::device_vectors. thrust::transform and thrust::inclusive_scan were automatically able to recognize that and operate accordingly on the GPU.
Below, I'm providing the same code by changing thrust::device_vector to thrust::host_vector. Again, thrust::transform and thrust::inclusive_scan are able to automatically recognize that the vectors to operate on reside on the CPU and to operate accordingly.
#include <thrust/host_vector.h>
#include <thrust/transform.h>
#include <thrust/scan.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdio>

template <class T>
struct scaling {
    const T _a;
    scaling(T a) : _a(a) { }
    __host__ __device__ T operator()(const T &x) const { return _a * x; }
};

int main()
{
    const int N = 20;

    double a = -(double)N;
    double b = 0.;
    double Dx = -1./(0.5*N*(N+1));

    thrust::host_vector<double> MatingProbability(N);
    thrust::host_vector<double> CumulativeProbability(N+1, 0.);

    thrust::transform(thrust::make_counting_iterator(a), thrust::make_counting_iterator(b),
                      MatingProbability.begin(), scaling<double>(Dx));

    thrust::inclusive_scan(MatingProbability.begin(), MatingProbability.end(),
                           CumulativeProbability.begin() + 1);

    for(int i=0; i<N+1; i++)
    {
        double val = CumulativeProbability[i];
        printf("%d %3.15f\n", i, val);
    }

    return 0;
}
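If you prefer to state the backend explicitly rather than rely on the container type, Thrust algorithms also accept an execution policy as their first argument. A short sketch (my addition, not from the original answer): thrust::host dispatches to the host backend, while thrust::seq additionally forces sequential execution.

#include <thrust/execution_policy.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>

int main()
{
    thrust::host_vector<double> x(20, 2.0), y(20);

    // Run on the host backend (may still be parallelized by that backend).
    thrust::transform(thrust::host, x.begin(), x.end(), y.begin(),
                      thrust::negate<double>());

    // Run sequentially, with no parallelization at all.
    thrust::transform(thrust::seq, x.begin(), x.end(), y.begin(),
                      thrust::negate<double>());
    return 0;
}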