Can we get cuda kernel function name in cudaLaunchKernel? - cuda

I was trying to insert some code into cudaLaunchKernel and need to store the launched kernel's function name, but I cannot find a direct API that gets the kernel function name. I have considered CUPTI, but it uses a callback to deliver the information, so I cannot change the behavior of the kernel launch (or it would need heavy inter-process communication, which is ugly).
Is there any way I can get the function name in cudaLaunchKernel (maybe from the function pointer)?
An example is as follows.
cudaLaunchKernel(...) {
    kernel_name = getKernelNameBySomeMethod(); // this is what I want
    send_to_other_process(kernel_name);
    return ::cudaLaunchKernel(...);
}
// in the other process
receive_kernel_name_from_other_process;
store_it;
Edit: An identifier is also OK. I may send the ID to another process to store, so I need a way to distinguish different CUDA kernels.

There are no APIs to do this, either public or private, as far as I know. The compiler emits a lot of static host-side boilerplate to perform the runtime API magic we take for granted; it isn't done by the runtime library itself.
However, the nature of that boilerplate means you can build your own lookup table pretty easily -- some hacking over a lunch break got me this partial proof of concept, which does what I think you want:
#include <cstdio>
#include <map>
#include <string>
#include <iostream>

__global__ void kernel_1(float *in, float *out, int N)
{
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    if (tidx == 0) printf("Running kernel_1\n");
    if (tidx < N) out[tidx] = in[tidx];
}

__global__ void kernel_2(float *in, float *out, int N)
{
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    if (tidx == 0) printf("Running kernel_2\n");
    if (tidx < N) out[tidx] = 2.f * in[tidx];
}

__global__ void kernel_3(float *in, float *out, int N)
{
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    if (tidx == 0) printf("Running kernel_3\n");
    if (tidx < N) out[tidx] = 3.f * in[tidx];
}

// a plain host function with the same signature, to demonstrate the failure case
void notakernel(float *in, float *out, int N)
{
    printf("Something bad happened\n");
}

// lookup table mapping host-side kernel stub addresses to kernel names
std::map<void*, std::string> ktable = {
    { (void*)kernel_1, "kernel_1" },
    { (void*)kernel_2, "kernel_2" },
    { (void*)kernel_3, "kernel_3" } };

cudaError_t MyLaunchKernel(void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream)
{
    auto it = ktable.find(func);
    if (it != ktable.end()) {
        std::cout << "Received request to call " << it->second << std::endl;
    } else {
        std::cout << "Received request to call unknown function!" << std::endl;
    }
    return cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream);
}

int main()
{
    int N = 100;
    float* a; cudaMalloc<float>(&a, N * sizeof(float));
    float* b; cudaMalloc<float>(&b, N * sizeof(float));
    void* args[] = { (void*)&a, (void*)&b, (void*)&N };

    MyLaunchKernel((void*)kernel_1, dim3(1), dim3(1), args, 0, NULL);
    cudaDeviceSynchronize();

    MyLaunchKernel((void*)kernel_2, dim3(1), dim3(1), args, 0, NULL);
    cudaDeviceSynchronize();

    MyLaunchKernel((void*)kernel_3, dim3(1), dim3(1), args, 0, NULL);
    cudaDeviceSynchronize();

    MyLaunchKernel((void*)notakernel, dim3(1), dim3(1), args, 0, NULL);
    cudaDeviceSynchronize();

    return 0;
}
which appears to work:
$ nvcc -std=c++11 -arch=sm_52 -o lookup lookup.cu
$ cuda-memcheck ./lookup
========= CUDA-MEMCHECK
Received request to call kernel_1
Running kernel_1
Received request to call kernel_2
Running kernel_2
Received request to call kernel_3
Running kernel_3
Received request to call unknown function!
========= Program hit cudaErrorInvalidDeviceFunction (error 98) due to "invalid device function" on CUDA API call to cudaLaunchKernel.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3b9803]
========= Host Frame:./lookup [0x4ca95]
========= Host Frame:./lookup [0x746c]
========= Host Frame:./lookup [0x769f]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:./lookup [0x722a]
=========
========= ERROR SUMMARY: 1 error
Obviously things need to be a bit more complex in a complete implementation for your use case -- you would need the reverse lookup for another call to go from a name/ID back to a pointer, and if you have multiple source files compiled separately, you would need to concatenate the per-file tables into one working list at runtime. But it is important to remember that the function pointers you are passing around are actually host pointers, not device pointers (thanks to the runtime API magic), so the cost and complexity of the runtime setup is trivial when you can use pre-baked C++ standard library containers, algorithms and function adapters to do most of the heavy lifting.
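For the name-to-pointer direction, a minimal sketch (assuming the same ktable as above; build_reverse_table and lookup_kernel are hypothetical helpers, not part of any CUDA API):
std::map<std::string, void*> rtable;  // name/ID -> host-side kernel stub

// invert ktable once at startup (or after each per-file table is merged in)
void build_reverse_table()
{
    for (const auto& kv : ktable)
        rtable[kv.second] = kv.first;
}

// the launching side maps a received name back to something launchable
void* lookup_kernel(const std::string& name)
{
    auto it = rtable.find(name);
    return (it != rtable.end()) ? it->second : nullptr;
}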

Related

Cuda C threads synchronization with printf or other functions

I have a problem with thread IDs while the block executes.
I would like to get a sentence like "My temporary string is printed via GPU!". As you can see (in the attached photo earlier), the sentence is displayed in the wrong order and I don't know how to fix it.
Code:
__global__ void Print(const char* const __string, const size_t* const loop_repeat)
{
    int id_x = threadIdx.x + blockIdx.x * blockDim.x;
    while (id_x < static_cast<int>(*loop_repeat))
    {
        printf("%c", __string[id_x]);
        __syncthreads();
        id_x += blockDim.x * gridDim.x;
    }
}

int main()
{
    const char* my_string = "My temporary string is printed via GPU!";
    size_t temp{};
    temp = Get_String_Length(my_string); //get the string length

    //GPU MEMORY ALLOCATION
    size_t* my_string_length{};
    cudaMalloc((void**)&my_string_length, sizeof(size_t));

    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(my_string_length, &temp, sizeof(size_t), HostToDevice);

    char* string_GPU{};
    cudaMalloc((void**)&string_GPU, (temp) * sizeof(char));

    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(string_GPU, my_string, (temp) * sizeof(char), HostToDevice);

    dim3 grid_size(1);
    dim3 block_size((temp));

    Print <<< grid_size, temp >>> (string_GPU, my_string_length);
    cudaError_t final_error = cudaDeviceSynchronize(); //for synchronization e.g Hello_World then printf

    if (final_error == cudaSuccess)
    {
        printf("%cKernel executed successfully with code: %d !%\n", NEW_LINE, final_error);
    }
    else
    {
        printf("%cKernel executed with code error: %d !\n", NEW_LINE, final_error);
    }

    cudaFree(my_string_length);
    cudaFree(string_GPU);
    return 0;
}
I will be grateful for any help given.
The main issue here is that you are expecting the threads or warps to execute in some predictable order. Actually, they do not, and your usage of __syncthreads() doesn't fix or address this issue.
If you want the warps to execute in a predictable order (not recommended), you would need to impose that order yourself. Here is an example that demonstrates that for this very simple code. It is not extensible to larger strings without modification, and this method will completely break down if you introduce more than one threadblock.
$ cat t1543.cu
#include <stdio.h>
#include <stdlib.h>

__global__ void Print(const char* const __string, const size_t* const loop_repeat)
{
    int id_x = threadIdx.x + blockIdx.x * blockDim.x;
    int warp_ID = threadIdx.x >> 5;
    while (id_x < static_cast<int>(*loop_repeat))
    {
        // force warp 0 to print its characters before warp 1
        if (warp_ID == 0)
            printf("%c", __string[id_x]);
        __syncthreads();
        if (warp_ID == 1)
            printf("%c", __string[id_x]);
        __syncthreads();
        id_x += blockDim.x * gridDim.x;
    }
}

int main()
{
    const char* my_string = "My temporary string is printed via GPU!";
    size_t temp;
    temp = 40; //get the string length

    //GPU MEMORY ALLOCATION
    size_t* my_string_length;
    cudaMalloc((void**)&my_string_length, sizeof(size_t));

    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(my_string_length, &temp, sizeof(size_t), cudaMemcpyHostToDevice);

    char* string_GPU;
    cudaMalloc((void**)&string_GPU, (temp) * sizeof(char));

    //COPY VALUE FROM CPU(RAM) TO GPU
    cudaMemcpy(string_GPU, my_string, (temp) * sizeof(char), cudaMemcpyHostToDevice);

    dim3 grid_size(1);
    dim3 block_size((temp));

    Print <<< grid_size, temp >>> (string_GPU, my_string_length);
    cudaError_t final_error = cudaDeviceSynchronize(); //for synchronization e.g Hello_World then printf

    if (final_error == cudaSuccess)
    {
        printf("\nKernel executed successfully with code: %d !%\n", final_error);
    }
    else
    {
        printf("\nKernel executed with code error: %d !\n", final_error);
    }

    cudaFree(my_string_length);
    cudaFree(string_GPU);
    return 0;
}
$ nvcc -o t1543 t1543.cu
$ cuda-memcheck ./t1543
========= CUDA-MEMCHECK
My temporary string is printed via GPU!
Kernel executed successfully with code: 0 !%
========= ERROR SUMMARY: 0 errors
$
Note that I'm not suggesting the above is good coding style. It's provided for understanding of the issue. Even this code is relying on the idea that the threads within a warp will call the printf function in a predictable order, which is not guaranteed by the CUDA programming model. So the code is really still broken.
This happens because "the multiprocessor creates, manages, schedules, and executes threads in groups of 32 parallel threads called warps", as described in the CUDA Programming Guide. So the first 32 threads cover "My temporary string is printed v" and the remaining threads cover "ia GPU!". It seems that the kernel executed the latter warp before the first one.
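As a minimal illustration of that grouping (a hypothetical kernel, not from the question), each thread can compute its warp and lane index from the built-in warpSize constant:
__global__ void whoami()
{
    int warp_id = threadIdx.x / warpSize; // which warp within the block (warpSize is 32 on current GPUs)
    int lane_id = threadIdx.x % warpSize; // position within that warp
    printf("block %d, warp %d, lane %d\n", blockIdx.x, warp_id, lane_id);
}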

Unified memory and struct with arrays

I have a big Struct of Arrays of Structs on CUDA, that is constant and read only for my application. A quite simplified example would be
struct Graph{
    Node * nodes;
    int nNode;
};
struct Node{
    int* pos;
    int nPos;
};
My kernels need to navigate this graph and query it. As you know, copying such a struct to GPU memory with cudaMalloc and cudaMemcpy takes a lot of code, which unified memory is supposed to remove the need for.
In my code, I generated the graph on the CPU and then, for testing, I designed the following kernel
__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
}
being called as:
// using malloc for testing to make sure I know what I am doing
int * d_res,* h_res;
cudaMalloc((void **)&d_res,sizeof(int));
h_res=(int*)malloc(sizeof(int));
testKernel<<<1,1>>>(graph,d_res);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaMemcpy(h_res,d_res,sizeof(int),cudaMemcpyDeviceToHost));
with the error checks from here.
When I use the testKernel as is shown, it works fine, but if I change the kernel to:
__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nodes[0].nPos;
}
I get illegal memory access errors.
Is this because the unified memory does not handle this type of data correctly?
Is there a way to make sure I can avoid writing all the explicit copies to GPU memory?
Full MCVE:
#include <algorithm>
#include <stdio.h>
#include <cuda_runtime_api.h>
#include <cuda.h>

typedef struct node{
    int* pos;
    int nPos;
}Node;

typedef struct Graph{
    Node * nodes;
    int nNode;
}Graph;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
    // d_res[0] = graph.nodes[0].nPos; // Not working
}

int main(void){
    // fake data, this comes from another process
    Graph graph;
    graph.nodes = (Node*)malloc(2 * sizeof(Node));
    graph.nNode = 2;
    for (int i = 0; i < 2; i++){
        // They can have different sizes in the original code
        graph.nodes[i].pos = (int*)malloc(3 * sizeof(int));
        graph.nodes[i].pos[0] = 0;
        graph.nodes[i].pos[1] = 1;
        graph.nodes[i].pos[2] = 2;
        graph.nodes[i].nPos = 3;
    }
    printf("%d\n", graph.nNode); // Change to the kernel variable for comparison

    int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(int));
    h_res = (int*)malloc(sizeof(int));

    testKernel<<<1, 1>>>(graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));
    printf("%d", h_res[0]);
    return 0;
}
Your code isn't using CUDA unified memory. UM is not "automatic" in any way. It requires specific programming steps to take advantage of it and it has specific system requirements.
All of this is covered in the UM section of the programming guide.
Is there a way to make sure I can avoid writing all the explicit copies to GPU memory?
Proper use of UM should allow this. Here is a fully worked example. The only thing I have done is mechanically convert your malloc operations in host code to equivalent cudaMallocManaged operations.
$ cat t1389.cu
#include <algorithm>
#include <stdio.h>

typedef struct node{
    int* pos;
    int nPos;
}Node;

typedef struct Graph{
    Node * nodes;
    int nNode;
}Graph;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
    d_res[0] = graph.nodes[0].nPos; // now works
}

int main(void){
    // fake data, this comes from another process
    Graph graph;
    cudaMallocManaged(&(graph.nodes), 2 * sizeof(Node));
    graph.nNode = 2;
    for (int i = 0; i < 2; i++){
        // They can have different sizes in the original code
        cudaMallocManaged(&(graph.nodes[i].pos), 3 * sizeof(int));
        graph.nodes[i].pos[0] = 0;
        graph.nodes[i].pos[1] = 1;
        graph.nodes[i].pos[2] = 2;
        graph.nodes[i].nPos = 3;
    }
    printf("%d\n", graph.nNode); // Change to the kernel variable for comparison

    int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(int));
    h_res = (int*)malloc(sizeof(int));

    testKernel<<<1, 1>>>(graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));
    printf("%d", h_res[0]);
    return 0;
}
$ nvcc t1389.cu -o t1389
$ cuda-memcheck ./t1389
========= CUDA-MEMCHECK
2
3========= ERROR SUMMARY: 0 errors
$
UM has a number of system requirements that are documented. I'm not going to try to recite them all here. Primarily you need a cc3.0 or higher GPU. Your MCVE did not include any standard error checking, and I didn't try to add it. But if you still have problems with this code, be sure to use proper CUDA error checking and run it with cuda-memcheck.
If your entire data structure, including embedded pointers, is allocated using ordinary host allocators, and you have no control over that, then you won't be able to use it directly in a UM regime, without doing some sort of involved copying. The exception here would be on an IBM Power9 system as mentioned in section K.1.6 of the above linked programming guide section.
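For reference, a minimal sketch of what that involved copying looks like for the Graph in this question (a hypothetical helper; error checking omitted):
// deep-copies a host-built Graph to the device, fixing up embedded pointers
Graph copy_graph_to_device(const Graph& h_graph)
{
    // stage a host-side array of Node whose pos members are device pointers
    Node* staged = (Node*)malloc(h_graph.nNode * sizeof(Node));
    for (int i = 0; i < h_graph.nNode; i++){
        staged[i].nPos = h_graph.nodes[i].nPos;
        cudaMalloc(&staged[i].pos, staged[i].nPos * sizeof(int));
        cudaMemcpy(staged[i].pos, h_graph.nodes[i].pos,
                   staged[i].nPos * sizeof(int), cudaMemcpyHostToDevice);
    }
    // then copy the fixed-up node array itself
    Graph d_graph;
    d_graph.nNode = h_graph.nNode;
    cudaMalloc(&d_graph.nodes, d_graph.nNode * sizeof(Node));
    cudaMemcpy(d_graph.nodes, staged, d_graph.nNode * sizeof(Node), cudaMemcpyHostToDevice);
    free(staged);
    return d_graph; // pass by value to the kernel, as in the question
}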
Before attempting to use a host allocator (e.g. malloc) with UM, you should first test the pageableMemoryAccessUsesHostPageTables property, as mentioned in that section.
That property currently won't be set on any system except a properly configured IBM Power9 system. No x86 system currently has this property set/available.
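A minimal sketch of that test, assuming a CUDA version new enough (9.2+) to define the corresponding runtime attribute:
int attr = 0;
cudaDeviceGetAttribute(&attr, cudaDevAttrPageableMemoryAccessUsesHostPageTables, 0);
if (attr)
    printf("malloc'ed host memory is directly accessible from device code\n");
else
    printf("ordinary host allocators will not work with UM on this system\n");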

Can't I call a __host__ __device__ function from a __device__ function?

In the CUDA documentation I found that cudaDeviceGetAttribute is a __host__ __device__ function, so I thought I could call it in my __global__ function to get some attributes of my device. Sadly it seems to mean something different, because I get a compile error even if I put it into a __device__ function and call that one from my global.
Is it possible to call cudaDeviceGetAttribute on my GPU? Or what else does __host__ __device__ mean?
Here is my source code:
__device__ void GetAttributes(int* unique)
{
    cudaDeviceAttr attr = cudaDevAttrMaxThreadsPerBlock;
    cudaDeviceGetAttribute(unique, attr, 0);
}

__global__ void ClockTest(int* a, int* b, long* return_time, int* unique)
{
    clock_t start = clock();

    //some complex calculations
    *a = *a + *b;
    *b = *a + *a;

    GetAttributes(unique);

    *a = *a + *b - *a;

    clock_t end = clock();
    *return_time = end - start;
}

int main()
{
    int a = 2;
    int b = 3;
    long time = 0;
    int uni;

    int* dev_a;
    int* dev_b;
    long* dev_time;
    int* unique;

    for (int i = 0; i < 10; ++i) {
        cudaMalloc(&dev_a, sizeof(int));
        cudaMalloc(&dev_b, sizeof(int));
        cudaMalloc(&dev_time, sizeof(long));
        cudaMalloc(&unique, sizeof(int));

        cudaMemcpy(dev_a, &a, sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(dev_b, &b, sizeof(int), cudaMemcpyHostToDevice);

        ClockTest <<<1,1>>> (dev_a, dev_b, dev_time, unique);

        cudaMemcpy(&a, dev_a, sizeof(int), cudaMemcpyDeviceToHost);
        cudaMemcpy(&time, dev_time, sizeof(long), cudaMemcpyDeviceToHost);
        cudaMemcpy(&uni, unique, sizeof(int), cudaMemcpyDeviceToHost);

        cudaFree(&dev_a); // bug: passes the address of the pointer -- see the answer below
        cudaFree(&dev_b);
        cudaFree(&dev_time);
        cudaFree(&unique);

        printf("%d\n", time);
        printf("unique: %d\n", uni);
        cudaDeviceReset();
    }
    return 0;
}
EDIT: sorry, my previous answer was not correct. There does seem to be a problem in nvcc (see below).
cudaDeviceGetAttribute can work correctly in device code, here is a worked example on K20X, CUDA 8.0.61:
$ cat t1305.cu
#include <stdio.h>

__global__ void tkernel(){
    int val;
    cudaError_t err = cudaDeviceGetAttribute(&val, cudaDevAttrMaxThreadsPerBlock, 0);
    printf("err = %d, %s\n", err, cudaGetErrorString(err));
    printf("val = %d\n", val);
}

int main(){
    tkernel<<<1,1>>>();
    cudaDeviceSynchronize();
}
$ nvcc -arch=sm_35 -o t1305 t1305.cu -rdc=true -lcudadevrt
$ cuda-memcheck ./t1305
========= CUDA-MEMCHECK
err = 0, no error
val = 1024
========= ERROR SUMMARY: 0 errors
$
There are various runtime API functions supported for use in device code.
For the supported runtime API functions, it's generally necessary to:
compile for a cc 3.5 or higher device
compile with relocatable device code
link against the cuda device runtime library
In addition, your code has another error: we do not pass the address of the pointer to cudaFree, just the pointer itself.
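That is:
cudaFree(&dev_a); // wrong: address of the pointer variable
cudaFree(dev_a);  // right: the device pointer returned by cudaMalloc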
Caveats for this particular function:
There appears to be a problem in the CUDA compiler: if this device runtime API call is used without any other runtime API call in the kernel code, code generation will not happen correctly. The workaround at this time is to make sure your kernel contains at least one other CUDA runtime API call. In my example above I used cudaGetErrorString, but you could e.g. use cudaDeviceSynchronize() or anything else, I think. I have filed an internal NVIDIA bug to report this issue.
There appears to be a documentation error in the list of device runtime API calls supported in the CDP section of the programming guide (link above). The function cudaGetDeviceProperty does not exist, but I believe it should refer to cudaDeviceGetAttribute. I have filed an internal NVIDIA bug for this documentation error.

Counting occurrences of specific events in CUDA kernels

Problem
I am trying to find the best way to count how many times my program ends up in some specific branches of my CUDA kernels. The idea is that some events should nearly never happen, but since the data processed by the GPU is given by a numerical optimization solver, there may be some situations where ill-defined cases become more common. Thus, I want to be able to track/monitor these phenomenons over multiple simulations to make some global statistics later.
Possible idea
The most straightforward way to do this may be to use a structure dedicated to monitoring such occurrences. Then, when entering a monitored branch, we increment the associated counter using atomicAdd. At the end of the simulation, we copy the counters back to the host and store them for some future statistics processing.
In my case, the cost of using atomicAdd should not matter much, since I should not be entering those branches often. Still, I may want to monitor some of the common branches later on, so what would be a better approach in that case? Since this is just for monitoring, I do not want the overhead to be too significant.
I guess I could also have one monitoring structure per block and do a sum at the end, since it should not use much global memory anyway (1 unsigned int per monitored branch).
Code example
#include <iostream>
#include <time.h>
#include <cuda.h>
#include <stdio.h>

#define CUDA_CHECK_ERROR() __cuda_check_errors(__FILE__, __LINE__)
#define CUDA_SAFE_CALL(err) __cuda_safe_call(err, __FILE__, __LINE__)

inline void __cuda_check_errors(const char *filename, const int line_number)
{
    cudaError err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        printf("CUDA error %i at %s:%i: %s\n",
               err, filename, line_number, cudaGetErrorString(err));
        exit(-1);
    }
}

inline void __cuda_safe_call(cudaError err, const char *filename, const int line_number)
{
    if (err != cudaSuccess)
    {
        printf("CUDA error %i at %s:%i: %s\n",
               err, filename, line_number, cudaGetErrorString(err));
        exit(-1);
    }
}

struct Stats
{
    unsigned int even;
};

__global__ void test_kernel(int* A, int* B, Stats* stats)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int res = A[tid] + (int)tid;
    if (res % 2 == 0)
        atomicAdd(&(stats->even), 1);
    B[tid] = res;
}

int get_random_int(int min, int max)
{
    return min + (rand() % (int)(max - min + 1));
}

void print_array(int* ar, unsigned int n)
{
    for (unsigned int i = 0; i < n; ++i)
        std::cout << ar[i] << " ";
    std::cout << std::endl;
}

void print_stats(Stats* s)
{
    std::cout << "even: " << s->even << std::endl;
}

int main()
{
    // vector size
    const unsigned int N = 10;

    // device vectors
    int *d_A, *d_B;
    Stats *d_stats;

    // host vectors
    int *h_A, *h_B;
    Stats *h_stats;

    // allocate device memory
    CUDA_SAFE_CALL(cudaMalloc(&d_A, N * sizeof(int)));
    CUDA_SAFE_CALL(cudaMalloc(&d_B, N * sizeof(int)));
    CUDA_SAFE_CALL(cudaMalloc(&d_stats, sizeof(Stats)));

    // allocate host memory
    h_A = new int[N];
    h_B = new int[N];
    h_stats = new Stats;

    // initialize host data
    srand(time(NULL));
    for (unsigned int i = 0; i < N; ++i)
    {
        h_A[i] = get_random_int(0, 10);
        h_B[i] = 0;
    }
    memset(h_stats, 0, sizeof(Stats));

    // copy data to the device
    CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, N * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_stats, h_stats, sizeof(Stats), cudaMemcpyHostToDevice));

    // launch kernel
    dim3 grid_size, block_size;
    grid_size.x = N;
    test_kernel<<<grid_size, block_size>>>(d_A, d_B, d_stats);

    // copy result back to host
    CUDA_SAFE_CALL(cudaMemcpy(h_B, d_B, N * sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaMemcpy(h_stats, d_stats, sizeof(Stats), cudaMemcpyDeviceToHost));

    print_array(h_B, N);
    print_stats(h_stats);

    // free device memory
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_stats));

    // free host memory
    delete [] h_A;
    delete [] h_B;
    delete h_stats;
}
Hardware/software information
The solution I am looking for should work for CC >= 2.0 devices and CUDA >= 5.0.
atomicAdd is one possibility, and I would probably go that route. If you do not use the return value of the atomicAdd call, the compiler will emit a reduction operation such as RED.E.ADD. Reduction is very fast as long as there are not many conflicts (I actually use it sometimes even when I do not need the operation to be atomic, because it can be quicker than loading a value from global memory, doing an arithmetic operation and storing back to global memory).
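If you later monitor frequently-hit branches, one way to cut down global atomic traffic is the per-block aggregation the question already suggests: count in shared memory and issue a single global atomicAdd per block. A minimal sketch (a hypothetical reworking of the question's test_kernel, not from this answer):
__global__ void test_kernel_blockwise(int* A, int* B, Stats* stats)
{
    __shared__ unsigned int block_even; // per-block counter
    if (threadIdx.x == 0) block_even = 0;
    __syncthreads();

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int res = A[tid] + (int)tid;
    if (res % 2 == 0)
        atomicAdd(&block_even, 1); // cheap shared-memory atomic
    B[tid] = res;

    __syncthreads();
    if (threadIdx.x == 0) // one global atomic per block
        atomicAdd(&(stats->even), block_even);
}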
The second option is to use a profiler counter and analyze the result with the profiler. Please see the Profiler Counter Function section of the programming guide for more details.
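A minimal sketch of that approach, assuming counter 0 is otherwise unused (the counts are then read out through the profiler rather than copied back by your code):
__global__ void test_kernel_proftrigger(int* A, int* B)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int res = A[tid] + (int)tid;
    if (res % 2 == 0)
        __prof_trigger(0); // increments per-multiprocessor hardware counter 0
    B[tid] = res;
}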

CUDA pinned memory flushing from the device

CUDA 5, device compute capability 3.5, VS 2012, 64-bit Windows Server 2012.
There is no shared memory access between threads; every thread is standalone.
I am using pinned memory with zero-copy. From the host, I can only read the pinned memory the device has written once I issue a cudaDeviceSynchronize on the host.
I want to be able to:
Flush into the pinned memory as soon as the device has updated it.
Not block the device thread (maybe by copying asynchronously)
I tried calling __threadfence_system and __threadfence after each device write, but that didn't flush.
Below is a full sample CUDA code that demonstrates my question:
#include <conio.h>
#include <cstdio>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    printf("Kernel %u: Before Writing in Kernel\n", tid);
    hResult[tid] = tid + 1;
    __threadfence_system();
    // expecting that the data is getting flushed to host here!
    printf("Kernel %u: After Writing in Kernel\n", tid);
    // time waster for-loop (sleep)
    for (int timeWater = 0; timeWater < 100000000; timeWater++);
}

void main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
    Kernel<<<1,blocks>>>(hResult);

    int filledElementsCounter = 0;
    // naive thread implementation that could be implemented using
    // another host thread
    while (filledElementsCounter < blocks)
    {
        // blocks until the value changes; this moves sequentially
        // while threads have no order (fine for this sample).
        while (hResult[filledElementsCounter] == 0);
        printf("%f\n", hResult[filledElementsCounter]);
        filledElementsCounter++;
    }
    cudaFreeHost((void *)hResult);
    system("pause");
}
Currently this sample will wait indefinitely as nothing is being read from the device unless I issue cudaDeviceSynchronize. The sample below works, but it is NOT what I want as it defeats the purpose of async copying:
void main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
    Kernel<<<1,blocks>>>(hResult);
    cudaError_t error = cudaDeviceSynchronize();
    if (error != cudaSuccess) { throw; }
    for (int i = 0; i < blocks; i++)
    {
        printf("%f\n", hResult[i]);
    }
    cudaFreeHost((void *)hResult);
    system("pause");
}
I played with your code on CentOS 6.2 with CUDA 5.5 and a Tesla M2090, and can conclude this:
The problem that it does not work on your system must be a driver issue, and I suggest that you get the TCC drivers.
I attached my code, which runs fine and does what you want. The values appear on the host side before the kernel ends. As you can see, I added some compute code to prevent the for loop from being removed by compiler optimizations, and I added a stream and a callback that gets executed after all work in the stream has finished. The program outputs 1 2, then does nothing for a long time until "stream finished..." is printed to the console.
#include <iostream>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define SEC_CUDA_CALL(val) checkCall ( (val), #val, __FILE__, __LINE__ )

bool checkCall(cudaError_t result, char const* const func, const char *const file, int const line)
{
    if (result != cudaSuccess)
    {
        std::cout << "CUDA (runtime api) error: " << func << " failed! " << cudaGetErrorString(result) << " (" << result << ") " << file << ":" << line << std::endl;
    }
    return result != cudaSuccess;
}

class Callback
{
public:
    static void CUDART_CB dispatch(cudaStream_t stream, cudaError_t status, void *userData);
private:
    void call();
};

void CUDART_CB Callback::dispatch(cudaStream_t stream, cudaError_t status, void *userData)
{
    Callback* cb = (Callback*) userData;
    cb->call();
}

void Callback::call()
{
    std::cout << "stream finished..." << std::endl;
}

__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    hResult[tid] = tid + 1;
    __threadfence_system();
    // busy work so the kernel keeps running after the writes, and the
    // compiler cannot remove the loop
    float A = 0;
    for (int timeWater = 0; timeWater < 100000000; timeWater++)
    {
        A = sin(cos(log(hResult[0] * hResult[1]))) + A;
        A = sqrt(A);
    }
}

int main(int argc, char* argv[])
{
    size_t blocks = 2;
    volatile float* hResult;
    SEC_CUDA_CALL(cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped));

    cudaStream_t stream;
    SEC_CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    Callback obj;
    Kernel<<<1,blocks,0,stream>>>(hResult);
    SEC_CUDA_CALL(cudaStreamAddCallback(stream, Callback::dispatch, &obj, 0));

    int filledElementsCounter = 0;
    while (filledElementsCounter < blocks)
    {
        while (hResult[filledElementsCounter] == 0);
        std::cout << hResult[filledElementsCounter] << std::endl;
        filledElementsCounter++;
    }

    SEC_CUDA_CALL(cudaStreamDestroy(stream));
    SEC_CUDA_CALL(cudaFreeHost((void *)hResult));
}
No call returned an error and cuda-memcheck didn't find any problems. This works as intended. You should really try the TCC driver.
You cannot pass the host pointer directly to the kernel. If you allocate host memory using cudaHostAlloc with cudaHostAllocMapped flag, then first you have to retrieve the device pointer of the mapped host memory before you can use it in the kernel. Use cudaHostGetDevicePointer to get the device pointer of mapped host memory.
float* hResult, *dResult;
cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
cudaHostGetDevicePointer((void**)&dResult, (void*)hResult, 0); // note the required flags argument (must be 0)
Kernel<<<1,blocks>>>(dResult);
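Note that on systems without unified addressing, the device must also support and enable host memory mapping before any allocation is made; a minimal sketch of that setup:
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
if (!prop.canMapHostMemory)
    exit(1); // this device cannot map host memory
// must be set before the context is created, e.g. before cudaHostAlloc
cudaSetDeviceFlags(cudaDeviceMapHost);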
Calling __threadfence_system() will ensure that the write is visible to the system before proceeding, but your CPU will be caching the h_result variable, so you are just spinning on the old value in an infinite loop. Try marking h_result as volatile.