Is it possible to get assertion info from within a CUDA kernel?

Is there any way to get a kernel assert message/line number back from a kernel failure?
That is, if I have:
__global__ void my_kernel(int x){
assert(x!=0);
}
int main(){
CUDA_CHECK(my_kernel<<<1,1>>>(0));
CHECK_WITH_ASSERTION_FETCH(cudaDeviceSynchronize());
}
My understanding is that CUDA_CHECK() passes here and that cudaDeviceSynchronize() would return a failure code (specifically, CUDA error: device-side assert triggered).
Is there a function CHECK_WITH_ASSERTION_FETCH that can somehow get info about which assertion failed when it observes that cudaDeviceSynchronize() is returning an error? The file and line number in which the assertion failed would be sufficient.

Is there a function CHECK_WITH_ASSERTION_FETCH that can somehow get info about which assertion failed when it observes that cudaDeviceSynchronize() is returning an error?
No there isn't.
As per the documentation, one way that you can see which line of code triggered the assertion and in which block and thread the assertion was raised is by attaching the debugger to the running kernel.
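For instance, a sketch of that workflow (assuming a Linux machine with cuda-gdb installed): compile with device-side debug info and run the program under the debugger; when the assert fires, cuda-gdb stops and reports the kernel name, the block/thread coordinates, and the source line.
$ nvcc -g -G main.cu -o main
$ cuda-gdb ./main
(cuda-gdb) run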

Robert Crovella is the authoritative voice here, and he says it isn't possible for a kernel assert to get information about itself back to the host. So we need some workarounds.
A major complicating factor is that if assert is called on the device then we are no longer able to communicate with it from the host, so any data we write to device memory is lost forever. (Reference).
Below I offer three:
Using Unified Memory to pass info from the GPU to the CPU even "after" an assert is called. This is the best answer.
Improving the GPU's assertion error messages by passing stacks to the GPU.
Passing info from the GPU to the CPU by dropping asserts and writing to memory. You'd only use this if UVM wasn't an option for some reason.
Using Unified Memory
Unified Memory allows the CUDA device and the host to transparently shuffle bits between each other without the need for cudaMemcpy. The result is that even though a triggered assert blocks our access to the device via regular API calls, we can still transfer signals back to the CPU via Unified Memory.
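As a minimal sketch of the idea (illustrative names only; it assumes the same post-assert UM behavior that the full example below relies on), a single __managed__ variable is enough to smuggle the failing line number back to the host:
#include <cassert>
#include <cstdio>
// A managed variable remains readable from the host even after a
// device-side assert makes regular API calls fail.
__managed__ int failed_line = -1;
__global__ void my_kernel(int x){
    if(x==0){
        failed_line = __LINE__; // record where we are about to fail
    }
    assert(x!=0);
}
int main(){
    my_kernel<<<1,1>>>(0);
    if(cudaDeviceSynchronize()!=cudaSuccess && failed_line!=-1){
        printf("Device-side assertion failed near line %d\n", failed_line);
    }
    return 0;
}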
Note that if we want kernels to be able to run asynchronously we need a way to associate kernel launches with assertion failures. The circular buffer here provides this functionality.
The code for this is:
//Compile with nvcc -g main.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/assert/source_location.hpp>
#include <boost/stacktrace.hpp>
#include <array>
#include <cassert>
#include <iostream>
// Number of assertion failure messages we can store. If this is too small
// threads will fail silently.
#define DEVICE_SIDE_ASSERTION_COUNT 10
// Used by preprocessor to convert things to strings
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define LINE_STRING TOSTRING(__LINE__)
// Standard CUDA success check
#define CUDA_CHECK_API_CALL(error) \
do { \
const auto error_code = error; \
if(error_code!=cudaSuccess){ \
std::cout<<"CUDA API call failure detected at ("<<__FILE__<<":"<<__LINE__<<"): "<<cudaGetErrorString(error_code)<<std::endl; \
std::cout<< boost::stacktrace::stacktrace() << std::endl; \
}} while(false)
// Copy string from `src` to `dst`
__device__ void dstrcpy(char *dst, const char *src){
for(;*src!='\0';dst++,src++){
*dst = *src;
}
*dst = '\0';
}
// Used to hold assertion data generated by the device
struct AssertionData {
char assertion_msg[1000];
char filename[1000];
char function_name[1000];
int line_number;
uint32_t caller;
dim3 block_id;
dim3 thread_id;
};
// Used to hold assertions generated by the device
struct AssertionsData {
int assertion_count;
AssertionData assertions[DEVICE_SIDE_ASSERTION_COUNT];
// Get the next place to insert an assertion failure message
__device__ int next_id(){
// Atomically increment so other threads can fail at the same time
return atomicAdd(&assertion_count, 1);
}
__device__ void insert(
const char *assertion_msg0,
const char *filename0,
const char *function_name0,
const int line_number0,
const uint32_t caller0,
const dim3 block_id0,
const dim3 thread_id0
){
const auto nid = next_id();
if(nid>=DEVICE_SIDE_ASSERTION_COUNT){
printf("RAN OUT OF ASSERTION BUFFER SPACE!");
return;
}
auto& self = assertions[nid];
dstrcpy(self.assertion_msg, assertion_msg0);
dstrcpy(self.filename, filename0);
dstrcpy(self.function_name, function_name0);
self.line_number = line_number0;
self.caller = caller0;
self.block_id = block_id0;
self.thread_id = thread_id0;
}
};
// Pointer to device memory allocated to hold assertion failure messages
AssertionsData *uvm_assertions = nullptr;
// Used to hold stack traces generated by the host so that we can run kernels
// asynchronously and still associate stacks to assertion failures
struct StackTraceInfo {
boost::stacktrace::stacktrace stacktrace;
int device;
cudaStream_t stream;
uint32_t generation_number;
StackTraceInfo() = default;
StackTraceInfo(int generation_number0, cudaStream_t stream0) {
// Technically we'd want to lop the top few layers off of this
generation_number = generation_number0;
stacktrace = boost::stacktrace::stacktrace();
CUDA_CHECK_API_CALL(cudaGetDevice(&device));
stream = stream0;
}
};
// Circular buffer used to hold stacks generated by the host
struct CircularTraceBuffer {
// Assume that this is the max number of items that might ever be enqueued
// across all streams
static constexpr int max_size = 1024;
// How many stacktraces we've inserted. Used to ensure that circular queue
// doesn't provide false information by always increasing, but also to mark
// where we are inserting into the queue
uint32_t generation_number = 0;
// The buffer
std::array<StackTraceInfo, max_size> traces;
uint32_t insert(cudaStream_t stream_id) {
traces[generation_number % max_size] = StackTraceInfo(generation_number, stream_id);
return generation_number++;
}
};
// Circular buffer of host stacktraces for associating with kernel launches
CircularTraceBuffer circular_trace_buffer;
// Emulates a kernel assertion. Records information about the failure in UM-backed
// storage before triggering the real device-side assert, which traps the kernel.
#define CUDA_COMMUNICATING_KERNEL_ASSERTION(condition, assertions_data, caller) \
do { \
if (! (condition)) { \
/* Atomically increment so other threads can fail at the same time */ \
assertions_data->insert( \
TOSTRING(condition), \
__FILE__, \
__FUNCTION__, \
__LINE__, \
caller, \
blockIdx, \
threadIdx \
); \
\
assert(condition); \
} \
} while (false)
// NOTE: Our kernels now need a pointer to the assertions data and an id for the caller
// NOTE: We can simplify our code by assuming these variables always have the same names
// so that they do not need to be passed to the preprocessor macro
__global__ void my_failing_kernel(int x, AssertionsData *const assertions_data, const uint32_t caller){
CUDA_COMMUNICATING_KERNEL_ASSERTION(x!=5, assertions_data, caller);
}
// Check that kernels ran correctly by acquiring the message buffer. BLOCKING.
void CUDA_CHECK_KERNEL_SUCCESS(const boost::source_location& location = BOOST_CURRENT_LOCATION){
if(cudaDeviceSynchronize()==cudaSuccess){
return;
}
std::cout<<"CUDA API call failure detected at ("<<location.file_name()<<":"<<location.line()<<":"<<location.column()<<"): "<<std::endl;
std::cout<< boost::stacktrace::stacktrace()<<std::endl;
for(int i=0;i<uvm_assertions->assertion_count && i<DEVICE_SIDE_ASSERTION_COUNT;i++){
std::cout<<"Assertion failure "<<i<<std::endl;
const auto &self = uvm_assertions->assertions[i];
const auto &stack = circular_trace_buffer.traces[self.caller % CircularTraceBuffer::max_size];
std::cout<<"GPU "<<self.filename<<":"
<<self.line_number<<"("
<<self.function_name<<"): "
<<self.assertion_msg<<std::endl;
if(stack.generation_number == self.caller){
std::cout<<stack.stacktrace
<<"Device = "<<stack.device<<", "
<<"Stream = "<<stack.stream
<<std::endl;
} else {
std::cout<<"CPU stack has been overwritten!"<<std::endl;
}
}
}
int main(){
CUDA_CHECK_API_CALL(cudaMallocManaged(&uvm_assertions, sizeof(AssertionsData)));
uvm_assertions->assertion_count = 0; // cudaMallocManaged does not zero the memory
CUDA_CHECK_API_CALL(cudaMemAdvise(
uvm_assertions, sizeof(AssertionsData), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId
));
// GPU will establish direct mapping of data in CPU memory, no page faults will be generated
CUDA_CHECK_API_CALL(cudaMemAdvise(
uvm_assertions, sizeof(AssertionsData), cudaMemAdviseSetAccessedBy, 0
));
my_failing_kernel<<<1, 1, 0>>>(4, uvm_assertions, circular_trace_buffer.insert(0));
my_failing_kernel<<<1, 1, 0>>>(5, uvm_assertions, circular_trace_buffer.insert(0));
CUDA_CHECK_KERNEL_SUCCESS();
CUDA_CHECK_API_CALL(cudaFree(uvm_assertions));
return 0;
}
The output for the above is:
main_assert_um_from_device.cu:162: void my_failing_kernel(int, AssertionsData *, unsigned int): block: [0,0,0], thread: [0,0,0] Assertion `x!=5` failed.
CUDA API call failure detected at (main_assert_um_from_device.cu:167:0):
0# 0x000055D3D8CEAFF2 in ./a.out
1# 0x000055D3D8CEB700 in ./a.out
2# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
3# 0x000055D3D8CEADAE in ./a.out
Assertion failure 0
GPU main_assert_um_from_device.cu:162(my_failing_kernel): x!=5
0# 0x000055D3D8CECEF9 in ./a.out
1# 0x000055D3D8CED135 in ./a.out
2# 0x000055D3D8CEB6B9 in ./a.out
3# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
4# 0x000055D3D8CEADAE in ./a.out
Device = 0, Stream = 0
Better Assert Messages
This workaround improves the device-side assert message. To do so, we collect stacktrace strings on the host and transfer them to the GPU. Then, when we call a kernel, we pass a pointer to the stacktrace string. If the kernel fails an assertion condition, it prints out the stacktrace before triggering the assertion.
The code for that is:
//Compile with nvcc -g main.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/stacktrace.hpp>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
// Used by preprocessor to convert things to strings
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
// Print a beefy kernel assertion message followed by inducing failure using
// the actual assertion
#define CUDA_DEVICE_ASSERT_WITH_STACKTRACE(condition, message) \
do { \
if (! (condition)) { \
printf("Assertion '%s' failed at %s:%d as part of stacktrace:\n%s", \
TOSTRING(condition), \
__FILE__, \
__LINE__, \
message); \
} \
/* Perform actual assertion to stop kernel progress */ \
assert(condition); \
} while (false)
__global__ void my_failing_kernel(int x, const char *d_stacktrace){
CUDA_DEVICE_ASSERT_WITH_STACKTRACE(x!=5, d_stacktrace);
}
// Increases performance by caching stack traces so we don't repeatedly
// transfer the same data to the GPU
std::unordered_map<std::string, char*> cached_stacks;
// Send a stacktrace to the GPU, cache the pointer it's stored at, return
// said pointer
char* setup_device_stacktrace(){
std::stringstream ss;
ss << boost::stacktrace::stacktrace();
const auto cached_stack = cached_stacks.find(ss.str());
if(cached_stack!=cached_stacks.end()){
std::cerr<<"Using cached stacktrace!"<<std::endl;
return cached_stack->second;
}
char *d_stacktrace = nullptr;
cudaMalloc(&d_stacktrace, 10000);
cudaMemcpy(d_stacktrace, ss.str().c_str(), ss.str().size()+1, cudaMemcpyHostToDevice); // +1 so the null terminator is copied too
cached_stacks[ss.str()] = d_stacktrace;
return d_stacktrace;
}
// Make an interesting stack
void nested_n(int depth, int val){
if(depth<5){
nested_n(depth+1, val);
} else {
const char* d_stacktrace = setup_device_stacktrace();
my_failing_kernel<<<1, 1>>>(val, d_stacktrace);
cudaDeviceSynchronize();
}
}
// Make an interesting stack
void nested3(int val){ nested_n(0, val); }
void nested2(int val){ nested3(val); }
void nested1(int val){ nested2(val); }
int main(){
for(int i=4;i<6;i++){
std::cerr<<"Running with value = "<<i<<std::endl;
nested1(i);
}
// Clean-up
for(const auto &x: cached_stacks){
cudaFree(x.second);
}
return 0;
}
This gives the output:
Running with value = 4
Running with value = 5
Using cached stacktrace!
Assertion 'x!=5' failed at main.cu:31 as part of stacktrace:
0# 0x000055BBF4A3CF76 in ./a.out
1# 0x000055BBF4A3D262 in ./a.out
2# 0x000055BBF4A3D258 in ./a.out
3# 0x000055BBF4A3D258 in ./a.out
4# 0x000055BBF4A3D258 in ./a.out
5# 0x000055BBF4A3D258 in ./a.out
6# 0x000055BBF4A3D258 in ./a.out
7# 0x000055BBF4A3D313 in ./a.out
8# 0x000055BBF4A3D32F in ./a.out
9# 0x000055BBF4A3D34B in ./a.out
10# 0x000055BBF4A3D3CF in ./a.out
11# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
12# 0x000055BBF4A3CE0E in ./a.out
main.cu:31: void my_failing_kernel(int, const char *): block: [0,0,0], thread: [0,0,0] Assertion `x!=5` failed.
Replace The Device Assertion With Magic
Here the idea is to replace the device-side assert with our Own Special Assert. Our OSA writes information about itself to device memory, which the host then reads to see what went wrong. Note that we'd only want to do this if the Unified Memory solution wasn't possible for some reason.
Here, rather than have the kernel fail with an assert, we have any failing threads early-exit the kernel while the rest of the threads continue working. The result is garbage, but at least we can get information about why!
The code for this is:
//Compile with nvcc -g main.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/assert/source_location.hpp>
#include <boost/stacktrace.hpp>
#include <array>
#include <cassert>
#include <iostream>
// Pointer to device memory allocated to hold assertion failure messages
char *d_assert_buffer = nullptr;
// Number of assertion failure messages we can store. If this is too small
// threads will fail silently.
#define DEVICE_SIDE_ASSERTION_COUNT 10
// Length of each assertion failure message - if this is too small we get
// garbage as threads overwrite each other
#define DEVICE_SIDE_ASSERTION_LENGTH 500
// Total size of the assertion failure message buffer. First 4 bytes stores the
// number of logged messages
#define DEVICE_SIDE_ASSERTION_BUFFER_LEN (4 + DEVICE_SIDE_ASSERTION_COUNT * DEVICE_SIDE_ASSERTION_LENGTH)
// Used by preprocessor to convert things to strings
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define LINE_STRING TOSTRING(__LINE__)
// Emulates a kernel assertion. The assertion won't stop the kernel's progress, so you
// should assume everything the kernel produces is garbage if there's an assertion failure.
#define CUDA_COMMUNICATING_KERNEL_ASSERTION(condition, buffer) \
do { \
if (! (condition)) { \
/* The first four bytes of the buffer store the number of messages */ \
uint32_t *const msgnum_ptr = reinterpret_cast<uint32_t*>(buffer); \
/* Atomically increment so other threads can fail at the same time */ \
const uint32_t msg_num = atomicAdd(msgnum_ptr, 1); \
if(msg_num>=DEVICE_SIDE_ASSERTION_COUNT){ \
printf("RAN OUT OF ASSERTION BUFFER SPACE!\n"); \
return; \
} \
\
/* Find the start of the buffer we'll be writing to */ \
char *const msg_ptr = buffer + 4 + msg_num * DEVICE_SIDE_ASSERTION_LENGTH; \
\
constexpr char const assertion_string[] = TOSTRING(condition); \
constexpr char const line_string[] = LINE_STRING; \
constexpr int assertion_size = sizeof(assertion_string); \
constexpr int filename_size = sizeof(__FILE__)-1; \
\
/* __LINE__ gets turned into a buffer of length 6, it seems, so we need to find */ \
/* the actual length in order to print the message */ \
int line_size = 0; \
for(int i=0;i<20;i++){ \
if(line_string[i]!='\0'){ \
line_size++; \
} else { \
break; \
} \
} \
\
memcpy(msg_ptr, __FILE__, filename_size); \
msg_ptr[filename_size] = ':'; \
memcpy(msg_ptr+filename_size+1, line_string, line_size); \
msg_ptr[filename_size+1+line_size] = ':'; \
memcpy(msg_ptr+filename_size+1+line_size+1, assertion_string, assertion_size); \
msg_ptr[filename_size+1+line_size+1+assertion_size] = '\0'; \
/* If we actually assert then we can't ever get the message to the host, so we */ \
/* return and let the kernel generate garbage */ \
return; \
} \
} while (false)
// Standard CUDA success check
#define CUDA_CHECK_API_CALL(error) \
do { \
const auto error_code = error; \
if(error_code!=cudaSuccess){ \
std::cout<<"CUDA API call failure detected at ("<<__FILE__<<":"<<__LINE__<<"): "<<cudaGetErrorString(error_code)<<std::endl; \
std::cout<< boost::stacktrace::stacktrace() << std::endl; \
}} while(false)
__global__ void my_failing_kernel(int x, char *d_assert_buffer){
CUDA_COMMUNICATING_KERNEL_ASSERTION(x!=5, d_assert_buffer);
}
// Check that kernels ran correctly by acquiring the message buffer. BLOCKING.
void CUDA_CHECK_KERNEL_SUCCESS(const boost::source_location& location = BOOST_CURRENT_LOCATION){
std::array<char, DEVICE_SIDE_ASSERTION_BUFFER_LEN> cuda_assert_buffer = {0};
CUDA_CHECK_API_CALL(cudaDeviceSynchronize());
assert(d_assert_buffer!=nullptr);
// NOTE: We could maybe save time by only moving the message count initially and copying the messages
// conditionally.
CUDA_CHECK_API_CALL(cudaMemcpy(cuda_assert_buffer.data(), d_assert_buffer, DEVICE_SIDE_ASSERTION_BUFFER_LEN, cudaMemcpyDeviceToHost));
CUDA_CHECK_API_CALL(cudaDeviceSynchronize()); // NOTE: Needed for buffers of <64kB
const uint32_t& msg_num = *reinterpret_cast<uint32_t*>(cuda_assert_buffer.data());
if(msg_num==0){
return;
}
std::cout<<"CUDA API call failure detected at ("<<location.file_name()<<":"<<location.line()<<":"<<location.column()<<"): "<<std::endl;
std::cout<< boost::stacktrace::stacktrace();
std::cout<<"Assertion messages ("<<msg_num<<" messages):"<<std::endl;
for(uint32_t i=0;i<msg_num && i<DEVICE_SIDE_ASSERTION_COUNT;i++){
std::cout<<" "<<i<<" "<<cuda_assert_buffer.data()+(4+i*DEVICE_SIDE_ASSERTION_LENGTH)<<std::endl;
}
}
int main(){
CUDA_CHECK_API_CALL(cudaMalloc(&d_assert_buffer, DEVICE_SIDE_ASSERTION_BUFFER_LEN));
CUDA_CHECK_API_CALL(cudaMemset(d_assert_buffer, 0, DEVICE_SIDE_ASSERTION_BUFFER_LEN)); // cudaMalloc does not zero the message counter
my_failing_kernel<<<1, 1>>>(4, d_assert_buffer);
CUDA_CHECK_KERNEL_SUCCESS();
my_failing_kernel<<<1, 1>>>(5, d_assert_buffer);
CUDA_CHECK_KERNEL_SUCCESS();
// Clean-up
cudaFree(d_assert_buffer);
return 0;
}
And the output looks like:
CUDA API call failure detected at (main_assert_from_device.cu:91:0):
0# 0x00005573A1F633A5 in ./a.out
1# 0x00005573A1F637C2 in ./a.out
2# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
3# 0x00005573A1F62D9E in ./a.out
Assertion messages (1 messages):
0 main_assert_from_device.cu:86:x!=5

Related

Why is cudaPointerGetAttributes() returning invalid argument for host pointer?

I want to write a function that tells me whether a pointer is a host or device pointer. This is essentially a wrapper around cudaPointerGetAttributes() that returns 1 if the pointer is for the device and 0 if it is not.
What I can't understand is why cudaPointerGetAttributes fails my error checking by returning invalid argument when I'm testing a host pointer. An example is provided below.
#include <stdio.h>
#include <stdlib.h>
#define CUDA_ERROR_CHECK(fun) \
do{ \
cudaError_t err = fun; \
if(err != cudaSuccess) \
{ \
fprintf(stderr, "Cuda error %d %s:: %s\n", __LINE__, __func__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
}while(0);
int is_device_pointer(const void *ptr)
{
int is_device_ptr = 0;
cudaPointerAttributes attributes;
CUDA_ERROR_CHECK(cudaPointerGetAttributes(&attributes, ptr));
if(attributes.devicePointer != NULL)
{
is_device_ptr = 1;
}
return is_device_ptr;
}
int main()
{
int *host_ptr, x = 0;
int is_dev_ptr;
host_ptr = &x;
int *dev_ptr;
cudaMalloc((void **)&dev_ptr, 16);
//is_dev_ptr = is_device_pointer((const void *)host_ptr); //Causes invalid argument
is_dev_ptr = is_device_pointer((const void *)dev_ptr); //Works
if(is_dev_ptr == 1)
{
fprintf(stdout, "Device pointer\n");
}
else
{
fprintf(stdout, "Not device Pointer\n");
}
CUDA_ERROR_CHECK(cudaFree((void *)dev_ptr));
CUDA_ERROR_CHECK(cudaDeviceReset());
return EXIT_SUCCESS;
}
This is expected behavior. cudaPointerGetAttributes can only introspect pointers that have been recorded in some fashion with the CUDA runtime API. Refer to the documentation:
If pointer was not allocated in, mapped by or registered with context supporting unified addressing cudaErrorInvalidValue is returned.
What this is saying is that the pointer must have been returned or passed through an API such as cudaMalloc, cudaMallocManaged, cudaHostRegister, etc. for it to be "recognized" by cudaPointerGetAttributes. You must be in a UVA regime, and you must have acquired the pointer using an appropriate method.
In your case, passing a bare host pointer this way doesn't meet the requirements spelled out in the documentation, so the error return is expected.
This particular error return code is a "non-sticky" CUDA error, meaning it can be cleared out via cudaGetLastError(). In my view, it should be safe to interpret this error return code as "this is an ordinary host pointer". But of course, if you pass a garbage value, or an unallocated pointer, you will get the same error code.
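A sketch of that interpretation (assuming the pre-CUDA-11 behavior described above, where a plain host pointer yields cudaErrorInvalidValue) could be a drop-in replacement for the question's is_device_pointer:
int is_device_pointer(const void *ptr)
{
    cudaPointerAttributes attributes;
    cudaError_t err = cudaPointerGetAttributes(&attributes, ptr);
    if (err == cudaErrorInvalidValue) {
        cudaGetLastError(); // clear the non-sticky error
        return 0;           // treat as an ordinary host pointer
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "Cuda error: %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE); // any other error is still fatal
    }
    return attributes.devicePointer != NULL;
}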

CUDA mapped memory: device -> host writes are not visible on host

What I am trying to do is modify a variable which resides in mapped memory, so as to cause the main program to exit.
But instead of this, the main program keeps spinning on the while (var == 0); line. I don't know how the new value could be flushed out so that it becomes visible on the host side too.
Btw. the variable is declared as volatile everywhere and I tried using the __threadfence_system() function with no success.
The host -> device direction works well.
System: Windows 7 x64, driver 358.50, GTX 560
Here is the piece of code that I can't get working:
static void handleCUDAError(cudaError_t err, const char *file, int line)
{
if (err != cudaSuccess) {
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}
#define CUDA_ERROR_CHECK(err) (handleCUDAError(err, __FILE__, __LINE__ ))
__global__ void echoKernel(volatile int* semaphore)
{
*semaphore = 1;
__threadfence_system();
}
int main()
{
CUDA_ERROR_CHECK(cudaSetDevice(0));
CUDA_ERROR_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));
volatile int var = 0;
volatile int *devptr;
CUDA_ERROR_CHECK(cudaHostRegister((int*)&var, sizeof (int), cudaHostRegisterMapped));
CUDA_ERROR_CHECK(cudaHostGetDevicePointer(&devptr, (int*)&var, 0));
echoKernel <<< 1, 1 >>> (devptr);
while (var == 0) ;
CUDA_ERROR_CHECK(cudaDeviceSynchronize());
CUDA_ERROR_CHECK(cudaHostUnregister((int*)&var));
CUDA_ERROR_CHECK(cudaDeviceReset());
return 0;
}
When I run your code on Linux, it runs as-is without issue.
However on Windows, there is a problem with WDDM command batching. In effect, your kernel launch is queued but not actually issued to the GPU before you enter the while-loop that hangs.
The WDDM command queue is a queue of commands that will eventually go to the GPU device. Various events will cause this queue to be "flushed" and the contents to be delivered as a "batch" of commands to the GPU.
Various cuda runtime API calls may effectively force the "flushing" of the command queue, such as cudaDeviceSynchronize() or cudaMemcpy(). However after the kernel launch, you are not issuing any runtime API calls before entering your while-loop. As a result, in this scenario it seems that the kernel call is getting "stuck" in the queue and never "flushed".
You can work around this in a variety of ways, for example by recording an event after the launch of the kernel and then querying the status of that event. This will have the effect of flushing the queue, which will launch the kernel.
Here's an example modification of your code that works for me:
#include <stdio.h>
static void handleCUDAError(cudaError_t err, const char *file, int line)
{
if (err != cudaSuccess) {
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}
#define CUDA_ERROR_CHECK(err) (handleCUDAError(err, __FILE__, __LINE__ ))
__global__ void echoKernel(volatile int* semaphore)
{
*semaphore = 1;
__threadfence_system();
}
int main()
{
CUDA_ERROR_CHECK(cudaSetDevice(0));
CUDA_ERROR_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));
volatile int var = 0;
volatile int *devptr;
CUDA_ERROR_CHECK(cudaHostRegister((int*)&var, sizeof(int), cudaHostRegisterMapped));
CUDA_ERROR_CHECK(cudaHostGetDevicePointer(&devptr, (int*)&var, 0));
cudaEvent_t my_event;
CUDA_ERROR_CHECK(cudaEventCreate(&my_event));
echoKernel << < 1, 1 >> > (devptr);
CUDA_ERROR_CHECK(cudaEventRecord(my_event));
cudaEventQuery(my_event);
while (var == 0);
CUDA_ERROR_CHECK(cudaDeviceSynchronize());
CUDA_ERROR_CHECK(cudaHostUnregister((int*)&var));
CUDA_ERROR_CHECK(cudaDeviceReset());
return 0;
}
Tested on CUDA 7.5, Driver 358.50, Win7 x64 release project, GTX460M.
Note that we don't wrap the cudaEventQuery call in a standard error checker, because the expected behavior for it is to return a non-zero status when the event has not been completed yet.
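Assuming the same WDDM batching behavior, another commonly suggested way to force the flush is to query the null stream right after the launch; like cudaEventQuery, the call is deliberately left unchecked because cudaErrorNotReady is an expected return value:
echoKernel <<< 1, 1 >>> (devptr);
cudaStreamQuery(0); // flushes the WDDM command queue; may return cudaErrorNotReady
while (var == 0);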

Loading multiple modules in JCuda is not working

In JCuda one can load CUDA files in PTX or CUBIN format and call (launch) __global__ functions (kernels) from Java.
With that in mind, I want to develop a framework with JCuda that gets a user's __device__ function in a .cu file at run-time, then loads and runs it.
I have already implemented a __global__ function, in which each thread finds out the start point of its related data, performs some computation and initialization, and then calls the user's __device__ function.
Here is my kernel pseudo code:
extern "C" __device__ void userFunc(args);
extern "C" __global__ void kernel(){
// initialize
userFunc(args);
// rest of the kernel
}
And user's __device__ function:
extern "C" __device__ void userFunc(args){
// do something
}
And on the Java side, here is the part where I load the modules (the modules are made from PTX files which are successfully created from CUDA files with this command: nvcc -m64 -ptx path/to/cudaFile -o cudaFile.ptx)
CUmodule kernelModule = new CUmodule(); // 1
CUmodule userFuncModule = new CUmodule(); // 2
cuModuleLoad(kernelModule, ptxKernelFileName); // 3
cuModuleLoad(userFuncModule, ptxUserFuncFileName); // 4
When I try to run it I get an error at line 3: CUDA_ERROR_NO_BINARY_FOR_GPU. After some searching I gathered that my PTX file has some syntax error. After running this suggested command:
ptxas -arch=sm_30 kernel.ptx
I got:
ptxas fatal : Unresolved extern function 'userFunc'
Even when I swap lines 3 and 4 to load userFunc before kernel, I get the same error. I am stuck at this phase. Is this the correct way to load multiple modules that need to be linked together in JCuda? Or is it even possible?
Edit:
Second part of the question is here
The really short answer is: No, you can't load multiple modules into a context in the runtime API.
You can do what you want, but it requires explicit setup and execution of a JIT linking call. I have no idea how (or even whether) that has been implemented in JCUDA, but I can show you how to do it with the standard driver API. Hold on...
If you have a device function in one file, and a kernel in another, for example:
// test_function.cu
#include <math.h>
__device__ float mathop(float &x, float &y, float &z)
{
float res = sin(x) + cos(y) + sqrt(z);
return res;
}
and
// test_kernel.cu
extern __device__ float mathop(float & x, float & y, float & z);
__global__ void kernel(float *xvals, float * yvals, float * zvals, float *res)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
res[tid] = mathop(xvals[tid], yvals[tid], zvals[tid]);
}
You can compile them to PTX as usual:
$ nvcc -arch=sm_30 -ptx test_function.cu
$ nvcc -arch=sm_30 -ptx test_kernel.cu
$ head -14 test_kernel.ptx
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19324607
// Cuda compilation tools, release 7.0, V7.0.27
// Based on LLVM 3.4svn
//
.version 4.2
.target sm_30
.address_size 64
// .globl _Z6kernelPfS_S_S_
.extern .func (.param .b32 func_retval0) _Z6mathopRfS_S_
At runtime, your code must create a JIT link session, add each PTX to the linker session, then finalise the linker session. This will give you a handle to a compiled cubin image which can be loaded as a module as usual. The simplest possible driver API code to put this together looks like this:
#include <cstdio>
#include <cuda.h>
#define drvErrChk(ans) { drvAssert(ans, __FILE__, __LINE__); }
inline void drvAssert(CUresult code, const char *file, int line, bool abort=true)
{
if (code != CUDA_SUCCESS) {
fprintf(stderr, "Driver API Error %04d at %s %d\n", int(code), file, line);
exit(-1);
}
}
int main()
{
drvErrChk( cuInit(0) );
CUdevice device;
drvErrChk( cuDeviceGet(&device, 0) );
CUcontext context;
drvErrChk( cuCtxCreate(&context, 0, device) );
CUlinkState state;
drvErrChk( cuLinkCreate(0, 0, 0, &state) );
drvErrChk( cuLinkAddFile(state, CU_JIT_INPUT_PTX, "test_function.ptx", 0, 0, 0) );
drvErrChk( cuLinkAddFile(state, CU_JIT_INPUT_PTX, "test_kernel.ptx" , 0, 0, 0) );
size_t sz;
char * image;
drvErrChk( cuLinkComplete(state, (void **)&image, &sz) );
CUmodule module;
drvErrChk( cuModuleLoadData(&module, image) );
drvErrChk( cuLinkDestroy(state) );
CUfunction function;
drvErrChk( cuModuleGetFunction(&function, module, "_Z6kernelPfS_S_S_") );
return 0;
}
You should be able to compile and run this as posted and verify it works OK. It should serve as a template for a JCUDA implementation, if they have JIT linking support implemented.
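If you also want to launch the linked kernel, a minimal sketch (illustrative only: nblocks, nthreads and the CUdeviceptr arguments xvals, yvals, zvals and res are assumed to have been set up earlier with cuMemAlloc/cuMemcpyHtoD) would be:
// Launch the linked kernel through the driver API.
void *args[] = { &xvals, &yvals, &zvals, &res };
drvErrChk( cuLaunchKernel(function,
                          nblocks, 1, 1,   // grid dimensions
                          nthreads, 1, 1,  // block dimensions
                          0, 0,            // dynamic shared memory, stream
                          args, 0) );
drvErrChk( cuCtxSynchronize() );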

Unified Memory and Streams in C

I am trying to use streams with CUDA 6 and unified memory in C. My previous stream implementation looked like this:
for(x=0; x<DSIZE; x+=N*2){
gpuErrchk(cudaMemcpyAsync(array_d0, array_h+x, N*sizeof(char), cudaMemcpyHostToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(array_d1, array_h+x+N, N*sizeof(char), cudaMemcpyHostToDevice, stream1));
gpuErrchk(cudaMemcpyAsync(data_d0, data_h, wrap->size*sizeof(int), cudaMemcpyHostToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(data_d1, data_h, wrap->size*sizeof(int), cudaMemcpyHostToDevice, stream1));
searchGPUModified<<<N/128,128,0,stream0>>>(data_d0, array_d0, out_d0 );
searchGPUModified<<<N/128,128,0,stream1>>>(data_d1, array_d1, out_d1);
gpuErrchk(cudaMemcpyAsync(out_h+x, out_d0 , N * sizeof(int), cudaMemcpyDeviceToHost, stream0));
gpuErrchk(cudaMemcpyAsync(out_h+x+N, out_d1 ,N * sizeof(int), cudaMemcpyDeviceToHost, stream1));
}
but I cannot find an example of streams and unified memory using the same technique, where chunks of data are sent to the GPU. I am thus wondering if there is a way to do this?
You should read section J.2.2 of the programming guide (and preferably all of appendix J).
With Unified Memory, memory allocated using cudaMallocManaged is by default attached to all streams ("global") and we must modify this in order to make effective use of streams, e.g. for compute/copy overlap. We can do this with the cudaStreamAttachMemAsync function as described in section J.2.2.3. By associating each memory "chunk" with a stream in this fashion, the UM subsystem can make intelligent decisions about when to transfer each data item.
The following example demonstrates this:
#include <stdio.h>
#include <time.h>
#define DSIZE 1048576
#define DWAIT 100000ULL
#define nTPB 256
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef int mytype;
__global__ void mykernel(mytype *data){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < DSIZE) data[idx] = 1;
unsigned long long int tstart = clock64();
while (clock64() < tstart + DWAIT);
}
int main(){
mytype *data1, *data2, *data3;
cudaStream_t stream1, stream2, stream3;
cudaMallocManaged(&data1, DSIZE*sizeof(mytype));
cudaMallocManaged(&data2, DSIZE*sizeof(mytype));
cudaMallocManaged(&data3, DSIZE*sizeof(mytype));
cudaCheckErrors("cudaMallocManaged fail");
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaStreamCreate(&stream3);
cudaCheckErrors("cudaStreamCreate fail");
cudaStreamAttachMemAsync(stream1, data1);
cudaStreamAttachMemAsync(stream2, data2);
cudaStreamAttachMemAsync(stream3, data3);
cudaDeviceSynchronize();
cudaCheckErrors("cudaStreamAttach fail");
memset(data1, 0, DSIZE*sizeof(mytype));
memset(data2, 0, DSIZE*sizeof(mytype));
memset(data3, 0, DSIZE*sizeof(mytype));
mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream1>>>(data1);
mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream2>>>(data2);
mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream3>>>(data3);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
for (int i = 0; i < DSIZE; i++){
if (data1[i] != 1) {printf("data1 mismatch at %d, should be: %d, was: %d\n", i, 1, data1[i]); return 1;}
if (data2[i] != 1) {printf("data2 mismatch at %d, should be: %d, was: %d\n", i, 1, data2[i]); return 1;}
if (data3[i] != 1) {printf("data3 mismatch at %d, should be: %d, was: %d\n", i, 1, data3[i]); return 1;}
}
printf("Success!\n");
return 0;
}
The above program creates a kernel that runs artificially long using clock64(), so as to give us a simulated opportunity for compute/copy overlap (simulating a compute-intensive kernel). We are launching 3 instances of this kernel, each instance operating on a separate "chunk" of data.
When we profile the above program, the following is seen in the profiler timeline:
First, note that the 3rd kernel launch is highlighted in yellow, and it begins immediately after the second kernel launch highlighted in purple. The actual cudaLaunch runtime API event that launches this 3rd kernel is indicated in the runtime API line by the mouse pointer, also highlighted in yellow (and is preceded by the cudaLaunch events for the first 2 kernels). Since this launch happens during execution of the first kernel, and there is no intervening "empty space" from that point until the start of the 3rd kernel, we can observe that the transfer of the data for the 3rd kernel launch (i.e. data3) occurred while kernels 1 and 2 were executing. Therefore we have effective overlap of copy and compute. (We could make a similar observation about kernel 2).
Although I haven't shown it here, if we omit the cudaStreamAttachMemAsync lines, the program still compiles and runs correctly, but if we profile it, we observe a different relationship between the cudaLaunch events and the kernels. The overall profile looks similar, and the kernels are executing back to back, but the entire cudaLaunch process now begins and ends before the first kernel begins executing, and there are no cudaLaunch events during the kernel execution. This indicates that (since all the cudaMallocManaged memory is global) all of the data transfers are taking place prior to the first kernel launch. The program has no way to associate a "global" allocation with any particular kernel, so all such allocated memory must be transferred before the first kernel launch (even though that kernel is only using data1).
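A related detail: cudaMallocManaged also accepts an attachment flag, so an allocation can start out host-attached instead of "global". A sketch, assuming the same CUDA 6 UM behavior described in appendix J:
// Allocate managed memory initially attached to the host rather than to
// all streams, then hand it to a specific stream for compute/copy overlap.
cudaMallocManaged(&data1, DSIZE*sizeof(mytype), cudaMemAttachHost);
cudaStreamAttachMemAsync(stream1, data1);
cudaStreamSynchronize(stream1);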

printf() in my CUDA kernel doesn't produce any output

I have added some printf() statements in my CUDA program
__global__ void Kernel(float *, float *, int);
void DeviceFunc(float *temp_h , int numvar , float *temp1_h)
{ .....
//Kernel call
printf("calling kernel\n");
Kernel<<<dimGrid , dimBlock>>>(a_d , b_d , numvar);
printf("kernel called\n");
....
}
int main(int argc , char **argv)
{ ....
printf("beforeDeviceFunc\n\n");
DeviceFunc(a_h , numvar , b_h); //Showing the data
printf("after DeviceFunc\n\n");
....
}
Also in the Kernel.cu, I wrote:
#include<cuda.h>
#include <stdio.h>
__global__ void Kernel(float *a_d, float *b_d, int size)
{
int idx = threadIdx.x ;
int idy = threadIdx.y ;
//Allocating memory in the share memory of the device
__shared__ float temp[16][16];
//Copying the data to the shared memory
temp[idy][idx] = a_d[(idy * (size+1)) + idx] ;
printf("idx=%d, idy=%d, size=%d", idx, idy, size);
....
}
Then I compile using -arch=sm_20 like this:
nvcc -c -arch sm_20 main.cu
nvcc -c -arch sm_20 Kernel.cu
nvcc -arch sm_20 main.o Kernel.o -o main
Now when I run the program, I see:
beforeDeviceFunc
calling kernel
kernel called
after DeviceFunc
So the printf() inside the kernel is not printed. How can I fix that?
printf() output is only displayed if the kernel finishes successfully, so check the return codes of all CUDA function calls and make sure no errors are reported.
Furthermore printf() output is only displayed at certain points in the program. Appendix B.32.2 of the Programming Guide lists these as
Kernel launch via <<<>>> or cuLaunchKernel() (at the start of the launch, and if the CUDA_LAUNCH_BLOCKING environment variable is set to 1, at the end of the launch as well),
Synchronization via cudaDeviceSynchronize(), cuCtxSynchronize(), cudaStreamSynchronize(), cuStreamSynchronize(), cudaEventSynchronize(), or cuEventSynchronize(),
Memory copies via any blocking version of cudaMemcpy*() or cuMemcpy*(),
Module loading/unloading via cuModuleLoad() or cuModuleUnload(),
Context destruction via cudaDeviceReset() or cuCtxDestroy().
Prior to executing a stream callback added by cudaStreamAddCallback() or cuStreamAddCallback().
To check this is your problem, put the following code after your kernel invocation:
{
cudaError_t cudaerr = cudaDeviceSynchronize();
if (cudaerr != cudaSuccess)
printf("kernel launch failed with error \"%s\".\n",
cudaGetErrorString(cudaerr));
}
You should then see either the output of your kernel or an error message.
More conveniently, cuda-memcheck will automatically check all return codes for you if you run your executable under it. While you should always check for errors anyway, this comes in handy when resolving concrete issues.
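For example, assuming your executable is named main:
cuda-memcheck ./main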
I had the same error just now and decreasing the block size to 512 helped. According to the documentation, the maximum block size can be either 512 or 1024 depending on the GPU.
I have written a simple test that showed that my GTX 1070 has a maximum block size of 1024. UPD: you can check whether your kernel has ever executed by calling cudaPeekAtLastError(), which returns cudaSuccess if the kernel has started successfully; only after that is it worth calling cudaDeviceSynchronize().
Testing block size of 1023
Testing block size of 1024
Testing block size of 1025
CUDA error: invalid configuration argument
Block maximum size is 1024
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
__global__
void set1(int* t)
{
t[threadIdx.x] = 1;
}
inline bool failed(cudaError_t error)
{
if (cudaSuccess == error)
return false;
fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(error));
return true;
}
int main()
{
int blockSize;
for (blockSize = 1; blockSize < 1 << 12; blockSize++)
{
printf("Testing block size of %d\n", blockSize);
int* t;
if(failed(cudaMallocManaged(&t, blockSize * sizeof(int))))
{
failed(cudaFree(t));
break;
}
for (int i = 0; i < blockSize; i++)
t[i] = 0;
set1 <<<1, blockSize>>> (t);
if (failed(cudaPeekAtLastError()))
{
failed(cudaFree(t));
break;
}
if (failed(cudaDeviceSynchronize()))
{
failed(cudaFree(t));
break;
}
bool hasError = false;
for (int i = 0; i < blockSize; i++)
if (1 != t[i])
{
printf("CUDA error: t[%d] = %d but not 1\n", i, t[i]);
hasError = true;
break;
}
if (hasError)
{
failed(cudaFree(t));
break;
}
failed(cudaFree(t));
}
blockSize--;
if(blockSize <= 0)
{
printf("CUDA error: block size cannot be 0\n");
return 1;
}
printf("Block maximum size is %d", blockSize);
return 0;
}
P.S. Please note that the only hard constraint on block sizing is warp granularity, which is 32 nowadays, so if yourBlockSize % 32 == 0 the warps are used efficiently. The only reason to make blocks bigger than 32 is when the code needs synchronization, as synchronization is available only among threads in a single block, which forces a developer to use a single large block instead of many small ones. So running with a higher number of smaller blocks can be even more efficient than running with a lower number of larger blocks.
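If you'd rather not hand-tune the block size, the occupancy API (available since CUDA 6.5) can suggest one for you. A sketch, reusing the set1 kernel and failed() helper from the test above:
// Ask the runtime for the block size that maximizes occupancy for set1.
int minGridSize = 0, blockSize = 0;
if (!failed(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, set1, 0, 0)))
    printf("Suggested block size: %d\n", blockSize);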