Is there any way to get a kernel assert message/line number back from a kernel failure?
That is, if I have:
// Minimal kernel from the question: trips a device-side assert when x is 0.
__global__ void my_kernel(int x){
assert(x!=0);
}
// Illustrative host code from the question. CUDA_CHECK and
// CHECK_WITH_ASSERTION_FETCH are not defined in this snippet.
int main(){
// Launch my_kernel with x == 0, the value its assert rejects
CUDA_CHECK(my_kernel<<<1,1>>>(0));
// The question asks whether a wrapper like this could report which assertion failed
CHECK_WITH_ASSERTION_FETCH(cudaDeviceSynchronize());
}
My understanding is that CUDA_CHECK() passes here and cudaDeviceSynchronize() would return a failure code (specifically, the error "CUDA error: device-side assert triggered").
Is there a function CHECK_WITH_ASSERTION_FETCH that can somehow get info about which assertion failed when it observes that cudaDeviceSynchronize() is returning an error? The file and line number in which the assertion failed would be sufficient.
Is there a function CHECK_WITH_ASSERTION_FETCH that can somehow get info about which assertion failed when it observes that cudaDeviceSynchronize() is returning an error?
No there isn't.
As per the documentation, one way that you can see which line of code triggered the assertion and in which block and thread the assertion was raised is by attaching the debugger to the running kernel.
Robert Crovella's the authoritative voice here and says it isn't possible for a kernel assert to get information about itself back to the host. So we need some workarounds.
A major complicating factor is that if assert is called on the device then we are no longer able to communicate with it from the host, so any data we write to device memory is lost forever. (Reference).
Below I offer three workarounds:
Using Unified Memory to pass info from the GPU to the CPU even "after" an assert is called. This is the best answer.
Improving the GPU's assertion error messages by passing stacks to the GPU.
Passing info from the GPU to the CPU by dropping asserts and writing to memory. You'd only use this if UVM wasn't an option for some reason.
Using Unified Memory
Unified Memory allows the CUDA device and the host to transparently shuffle bits between each other without the need for cudaMemcpy. The result is that even though throwing an assert blocks our access to the device via regular API calls, we can still transfer signals back to the CPU via the Unified Memory.
Note that if we want kernels to be able to run asynchronously we need a way to associate kernel launches with assertion failures. The circular buffer here provides this functionality.
The code for this is:
//Compile with nvcc -g main.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/assert/source_location.hpp>
#include <boost/stacktrace.hpp>
#include <array>
#include <cassert>
#include <iostream>
// Number of assertion failure messages we can store. If this is too small
// threads will fail silently.
#define DEVICE_SIDE_ASSERTION_COUNT 10
// Used by preprocessor to convert things to strings
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define LINE_STRING TOSTRING(__LINE__)
// Standard CUDA success check.
// Evaluates `error` exactly once. On failure it prints the call site, the CUDA
// error string and a host stacktrace to std::cout, then lets execution
// continue (best effort; the macro does not abort).
// The argument is parenthesized and the temporary has a macro-specific name so
// that a call such as CUDA_CHECK_API_CALL(error_code) cannot end up
// initializing the temporary from itself.
#define CUDA_CHECK_API_CALL(error) \
do { \
  const auto cuda_check_api_call_status_ = (error); \
  if(cuda_check_api_call_status_!=cudaSuccess){ \
    std::cout<<"CUDA API call failure detected at ("<<__FILE__<<":"<<__LINE__<<"): "<<cudaGetErrorString(cuda_check_api_call_status_)<<std::endl; \
    std::cout<< boost::stacktrace::stacktrace() << std::endl; \
  } \
} while(false)
// Device-side strcpy: copies the characters of `src`, including the
// terminating '\0', into `dst`. No bounds checking is performed.
__device__ void dstrcpy(char *dst, const char *src){
  while(*src!='\0'){
    *dst++ = *src++;
  }
  // Terminate the destination string
  *dst = '\0';
}
// Used to hold assertion data generated by the device.
// One record describes a single failed assertion; it is filled in by
// AssertionsData::insert().
struct AssertionData {
// Text of the failed condition
char assertion_msg[1000];
// Source file containing the assertion
char filename[1000];
// Name of the function containing the assertion
char function_name[1000];
// Source line of the assertion
int line_number;
// Caller id passed to the kernel by the host (used to look up the host stacktrace)
uint32_t caller;
// Block index of the failing thread
dim3 block_id;
// Thread index of the failing thread
dim3 thread_id;
};
// Used to hold assertions generated by the device.
// `assertion_count` counts every reported failure, including those that did
// not fit; only the first DEVICE_SIDE_ASSERTION_COUNT are stored in
// `assertions`.
struct AssertionsData {
  // Number of assertion failures reported so far (may exceed the capacity)
  int assertion_count;
  // Storage for the recorded failures
  AssertionData assertions[DEVICE_SIDE_ASSERTION_COUNT];

  // Get the next place to insert an assertion failure message
  __device__ int next_id(){
    // Atomically increment so other threads can fail at the same time
    return atomicAdd(&assertion_count, 1);
  }

  // Record one assertion failure.
  // Copies the message, file name and function name into the next free slot
  // together with the line number, caller id and block/thread indices. If all
  // slots are taken, prints a warning and drops the record.
  __device__ void insert(
    const char *assertion_msg0,
    const char *filename0,
    const char *function_name0,
    const int line_number0,
    const uint32_t caller0,
    const dim3 block_id0,
    const dim3 thread_id0
  ){
    const auto nid = next_id();
    // Valid slots are [0, DEVICE_SIDE_ASSERTION_COUNT). The comparison must be
    // `>=`: with `>` the slot one past the end of the array would be written.
    if(nid>=DEVICE_SIDE_ASSERTION_COUNT){
      printf("RAN OUT OF ASSERTION BUFFER SPACE!");
      return;
    }

    auto& self = assertions[nid];
    dstrcpy(self.assertion_msg, assertion_msg0);
    dstrcpy(self.filename, filename0);
    dstrcpy(self.function_name, function_name0);
    self.line_number = line_number0;
    self.caller = caller0;
    self.block_id = block_id0;
    self.thread_id = thread_id0;
  }
};
// Pointer to unified (managed) memory allocated to hold assertion failure messages
AssertionsData *uvm_assertions = nullptr;
// Holds a stack trace captured on the host at kernel-launch time, so that
// kernels can run asynchronously and assertion failures can still be
// associated with the launch that caused them.
struct StackTraceInfo {
  boost::stacktrace::stacktrace stacktrace;
  int device;
  cudaStream_t stream;
  uint32_t generation_number;

  StackTraceInfo() = default;

  // Captures the current host stack and the current device, and remembers the
  // stream and generation number supplied by the caller.
  StackTraceInfo(int generation_number0, cudaStream_t stream0)
    // Technically we'd want to lop the top few layers off of this
    : stacktrace(boost::stacktrace::stacktrace()),
      stream(stream0),
      generation_number(generation_number0)
  {
    CUDA_CHECK_API_CALL(cudaGetDevice(&device));
  }
};
// Ring buffer of host stack traces, one per kernel launch
struct CircularTraceBuffer {
  // Assumed upper bound on the number of launches that may be in flight
  // across all streams at any time
  static constexpr int max_size = 1024;

  // Count of stack traces inserted so far. It only ever increases, which lets
  // readers detect overwritten slots, and it also determines the slot the
  // next trace is written to.
  uint32_t generation_number = 0;

  // The buffer
  std::array<StackTraceInfo, max_size> traces;

  // Store a trace for a launch on `stream_id` and return the id assigned to it
  uint32_t insert(cudaStream_t stream_id) {
    const uint32_t assigned_id = generation_number;
    traces[assigned_id % max_size] = StackTraceInfo(assigned_id, stream_id);
    ++generation_number;
    return assigned_id;
  }
};
// Circular buffer of host stacktraces for associating with kernel launches
CircularTraceBuffer circular_trace_buffer;
// Kernel assertion that also records the failure in `assertions_data`.
// If `condition` is false, the stringified condition, file, function, line,
// `caller` and the block/thread indices are passed to
// assertions_data->insert(), and then the regular device-side assert fires.
// You should assume everything the kernel produces is garbage if there's an
// assertion failure.
// The macro ends in `while (false)` without a semicolon so that it behaves
// like a single statement (e.g. inside if/else); call sites supply the `;`.
#define CUDA_COMMUNICATING_KERNEL_ASSERTION(condition, assertions_data, caller) \
do { \
  if (! (condition)) { \
    /* insert() claims its slot atomically so other threads can fail at the same time */ \
    (assertions_data)->insert( \
      TOSTRING(condition), \
      __FILE__, \
      __FUNCTION__, \
      __LINE__, \
      (caller), \
      blockIdx, \
      threadIdx \
    ); \
    \
    assert(condition); \
  } \
} while (false)
// NOTE: Our kernels now need a pointer to the assertions data and an id for the caller
// NOTE: We can simplify our code by assuming these variables always have the same names
// so that they do not need to be passed to the preprocessor macro
// Example kernel: the assertion fails when x == 5.
__global__ void my_failing_kernel(int x, AssertionsData *const assertions_data, const uint32_t caller){
CUDA_COMMUNICATING_KERNEL_ASSERTION(x!=5, assertions_data, caller);
}
// Check that kernels ran correctly by acquiring the message buffer. BLOCKING.
// Synchronizes the device; if that fails, prints the host call site and stack,
// then every assertion record stored in `uvm_assertions` together with the
// host stacktrace of the kernel launch that produced it (if still available).
void CUDA_CHECK_KERNEL_SUCCESS(const boost::source_location& location = BOOST_CURRENT_LOCATION){
  if(cudaDeviceSynchronize()==cudaSuccess){
    return;
  }

  std::cout<<"CUDA API call failure detected at ("<<location.file_name()<<":"<<location.line()<<":"<<location.column()<<"): "<<std::endl;
  std::cout<< boost::stacktrace::stacktrace()<<std::endl;

  // Nothing to report if the assertion buffer was never allocated
  if(uvm_assertions==nullptr){
    return;
  }

  // assertion_count counts every failure, including ones that did not fit in
  // the buffer, so clamp it to the number of records actually stored.
  const int reported = uvm_assertions->assertion_count;
  const int stored = reported<DEVICE_SIDE_ASSERTION_COUNT ? reported : DEVICE_SIDE_ASSERTION_COUNT;

  for(int i=0;i<stored;i++){
    std::cout<<"Assertion failure "<<i<<std::endl;
    const auto &self = uvm_assertions->assertions[i];
    // Traces are stored at (generation_number % max_size), so the lookup must
    // apply the same reduction or it indexes out of bounds after max_size launches.
    const auto &stack = circular_trace_buffer.traces[self.caller % CircularTraceBuffer::max_size];
    std::cout<<"GPU "<<self.filename<<":"
             <<self.line_number<<"("
             <<self.function_name<<"): "
             <<self.assertion_msg<<std::endl;
    // The slot is only valid if it still belongs to the launch that failed
    if(stack.generation_number == self.caller){
      std::cout<<stack.stacktrace
               <<"Device = "<<stack.device<<", "
               <<"Stream = "<<stack.stream
               <<std::endl;
    } else {
      std::cout<<"CPU stack has been overwritten!"<<std::endl;
    }
  }
}
// Demo driver: allocates the managed assertion buffer, launches one passing
// and one failing kernel, then reports the failure.
int main(){
  CUDA_CHECK_API_CALL(cudaMallocManaged(&uvm_assertions, sizeof(AssertionsData)));
  if(uvm_assertions==nullptr){
    // Allocation failed (already reported above); nothing else can work
    return 1;
  }
  // The kernels atomically increment this counter and the host uses it as a
  // loop bound, so it must start at zero; the allocation is not documented to
  // be zero-initialized.
  uvm_assertions->assertion_count = 0;

  CUDA_CHECK_API_CALL(cudaMemAdvise(
    uvm_assertions, sizeof(AssertionsData), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId
  ));
  // GPU will establish direct mapping of data in CPU memory, no page faults will be generated
  CUDA_CHECK_API_CALL(cudaMemAdvise(
    uvm_assertions, sizeof(AssertionsData), cudaMemAdviseSetAccessedBy, 0
  ));

  // x == 4 passes the assertion, x == 5 fails it
  my_failing_kernel<<<1, 1, 0>>>(4, uvm_assertions, circular_trace_buffer.insert(0));
  my_failing_kernel<<<1, 1, 0>>>(5, uvm_assertions, circular_trace_buffer.insert(0));
  CUDA_CHECK_KERNEL_SUCCESS();

  CUDA_CHECK_API_CALL(cudaFree(uvm_assertions));
  return 0;
}
The output for the above is:
main_assert_um_from_device.cu:162: void my_failing_kernel(int, AssertionsData *, unsigned int): block: [0,0,0], thread: [0,0,0] Assertion `x!=5` failed.
CUDA API call failure detected at (main_assert_um_from_device.cu:167:0):
0# 0x000055D3D8CEAFF2 in ./a.out
1# 0x000055D3D8CEB700 in ./a.out
2# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
3# 0x000055D3D8CEADAE in ./a.out
Assertion failure 0
GPU main_assert_um_from_device.cu:162(my_failing_kernel): x!=5
0# 0x000055D3D8CECEF9 in ./a.out
1# 0x000055D3D8CED135 in ./a.out
2# 0x000055D3D8CEB6B9 in ./a.out
3# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
4# 0x000055D3D8CEADAE in ./a.out
Device = 0, Stream = 0
Better Assert Messages
This workaround makes the device assert message better. To do so, we collect stacktrace strings on the host and transfer them to the GPU. Then, when we call a kernel, we pass a pointer to the stacktrace string. If the kernel fails an assertion condition, we print out the stacktrace before triggering the assertion.
The code for that is:
//Compile with nvcc -g main.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/stacktrace.hpp>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
// Used by preprocessor to convert things to strings
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
// Print a beefy kernel assertion message followed by inducing failure using
// the actual assertion.
// `condition` is the expression to check; `message` is passed to printf's %s
// (the kernel in this snippet passes a device pointer to a stacktrace string).
// NOTE: `condition` appears both in the `if` and in the `assert`, so it should
// be free of side effects.
#define CUDA_DEVICE_ASSERT_WITH_STACKTRACE(condition, message) \
do { \
if (! (condition)) { \
printf("Assertion '%s' failed at %s:%d as part of stacktrace:\n%s", \
TOSTRING(condition), \
__FILE__, \
__LINE__, \
message); \
} \
/* Perform actual assertion to stop kernel progress */ \
assert(condition); \
} while (false)
// Example kernel: the assertion fails when x == 5, printing the stacktrace
// string pointed to by d_stacktrace first.
__global__ void my_failing_kernel(int x, const char *d_stacktrace){
CUDA_DEVICE_ASSERT_WITH_STACKTRACE(x!=5, d_stacktrace);
}
// Increases performance by cacheing stack traces so we don't repeatedly
// transfer the same data to the GPU
std::unordered_map<std::string, char*> cached_stacks;
// Send a stacktrace to the GPU, cache the pointer it's stored at, return
// said pointer.
// Returns a device pointer to a NUL-terminated copy of the current host
// stacktrace, or nullptr if the allocation or the copy failed (the failure is
// reported on std::cerr and nothing is cached in that case).
char* setup_device_stacktrace(){
  std::stringstream ss;
  ss << boost::stacktrace::stacktrace();
  // Materialize the string once; every ss.str() call makes a fresh copy
  const std::string trace = ss.str();

  const auto cached_stack = cached_stacks.find(trace);
  if(cached_stack!=cached_stacks.end()){
    std::cerr<<"Using cached stacktrace!"<<std::endl;
    return cached_stack->second;
  }

  // Size the buffer from the string and include the terminating '\0':
  // cudaMalloc does not clear memory, so without copying the terminator the
  // device-side printf("%s") would read past the end of the text. This also
  // removes the fixed 10000-byte limit.
  const size_t bytes = trace.size() + 1;
  char *d_stacktrace = nullptr;
  cudaError_t status = cudaMalloc(&d_stacktrace, bytes);
  if(status==cudaSuccess){
    status = cudaMemcpy(d_stacktrace, trace.c_str(), bytes, cudaMemcpyHostToDevice);
  }
  if(status!=cudaSuccess){
    std::cerr<<"Failed to send stacktrace to the GPU: "<<cudaGetErrorString(status)<<std::endl;
    cudaFree(d_stacktrace);  // no-op when the allocation itself failed
    return nullptr;
  }

  cached_stacks[trace] = d_stacktrace;
  return d_stacktrace;
}
// Make an interesting stack: recurse until `depth` reaches 5, then launch
// the kernel with `val` and wait for it to finish.
void nested_n(int depth, int val){
  if(depth<5){
    nested_n(depth+1, val);
    return;
  }

  // Deepest level: capture the stack, hand it to the kernel, and block
  const char* const stack_on_device = setup_device_stacktrace();
  my_failing_kernel<<<1, 1>>>(val, stack_on_device);
  cudaDeviceSynchronize();
}
// Make an interesting stack: a chain of thin wrappers
// (nested1 -> nested2 -> nested3 -> nested_n) that only adds call frames.
void nested3(int val){ nested_n(0, val); }
void nested2(int val){ nested3(val); }
void nested1(int val){ nested2(val); }
// Demo driver: runs the nested call chain with values 4 and 5, then frees
// every cached device stacktrace.
int main(){
  int value = 4;
  while(value<6){
    std::cerr<<"Running with value = "<<value<<std::endl;
    nested1(value);
    ++value;
  }

  // Clean-up
  for(auto it=cached_stacks.begin(); it!=cached_stacks.end(); ++it){
    cudaFree(it->second);
  }
  return 0;
}
This gives the output:
Running with value = 4
Running with value = 5
Using cached stacktrace!
Assertion 'x!=5' failed at main.cu:31 as part of stacktrace:
0# 0x000055BBF4A3CF76 in ./a.out
1# 0x000055BBF4A3D262 in ./a.out
2# 0x000055BBF4A3D258 in ./a.out
3# 0x000055BBF4A3D258 in ./a.out
4# 0x000055BBF4A3D258 in ./a.out
5# 0x000055BBF4A3D258 in ./a.out
6# 0x000055BBF4A3D258 in ./a.out
7# 0x000055BBF4A3D313 in ./a.out
8# 0x000055BBF4A3D32F in ./a.out
9# 0x000055BBF4A3D34B in ./a.out
10# 0x000055BBF4A3D3CF in ./a.out
11# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
12# 0x000055BBF4A3CE0E in ./a.out
main.cu:31: void my_failing_kernel(int, const char *): block: [0,0,0], thread: [0,0,0] Assertion `x!=5` failed.
Replace The Device Assertion With Magic
Here the idea is to replace the device-side assert with our Own Special Assert (OSA). Our OSA will write information about itself to device-side memory and the host will read this to see what went wrong. Note that we'd only want to do this if the Unified Memory solution wasn't possible for some reason.
Here, rather than have the kernel fail with an assert, we have any failing threads early-exit the kernel while the rest of the threads continue working. The result is garbage, but at least we can get information about why!
The code for this is:
//Compile with nvcc -g main.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/assert/source_location.hpp>
#include <boost/stacktrace.hpp>
#include <array>
#include <cassert>
#include <iostream>
// Pointer to device memory allocated to hold assertion failure messages
char *d_assert_buffer = nullptr;
// Number of assertion failure messages we can store. If this is too small
// threads will fail silently.
#define DEVICE_SIDE_ASSERTION_COUNT 10
// Length of each assertion failure message - if this is too small we get
// garbage as threads overwrite each other
#define DEVICE_SIDE_ASSERTION_LENGTH 500
// Total size of the assertion failure message buffer. First 4 bytes stores the
// number of logged messages
#define DEVICE_SIDE_ASSERTION_BUFFER_LEN (4 + DEVICE_SIDE_ASSERTION_COUNT * DEVICE_SIDE_ASSERTION_LENGTH)
// Used by preprocessor to convert things to strings
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define LINE_STRING TOSTRING(__LINE__)
// Emulates a kernel assertion. The assertion won't stop the kernel's progress, so you
// should assume everything the kernel produces is garbage if there's an assertion failure.
//
// If `condition` is false, the failing thread writes "<file>:<line>:<condition>"
// into the next free message slot of `buffer` and returns from the enclosing
// function, so the macro can only be used in functions returning void.
// `buffer` layout: a 4-byte message counter followed by
// DEVICE_SIDE_ASSERTION_COUNT slots of DEVICE_SIDE_ASSERTION_LENGTH bytes.
// The counter must be zero before the first kernel launch.
//
// The macro writes through its `buffer` argument and stringifies its
// `condition` argument; it ends in `while (false)` without a semicolon so it
// behaves like a single statement (call sites supply the `;`).
#define CUDA_COMMUNICATING_KERNEL_ASSERTION(condition, buffer) \
do { \
  if (! (condition)) { \
    char *const assert_buffer_base = (buffer); \
    /* First four bytes of the buffer count the messages logged so far */ \
    uint32_t *const msgnum_ptr = reinterpret_cast<uint32_t*>(assert_buffer_base); \
    /* Atomically increment so other threads can fail at the same time */ \
    const uint32_t msg_num = atomicAdd(msgnum_ptr, 1); \
    if(msg_num>=DEVICE_SIDE_ASSERTION_COUNT){ \
      printf("RAN OUT OF ASSERTION BUFFER SPACE!\n"); \
      return; \
    } \
    \
    /* Find the start of the slot we'll be writing to */ \
    char *const msg_ptr = assert_buffer_base + 4 + msg_num * DEVICE_SIDE_ASSERTION_LENGTH; \
    \
    constexpr char const assertion_string[] = TOSTRING(condition); \
    constexpr char const line_string[] = LINE_STRING; \
    /* sizeof includes the terminating NUL, which is copied along with the text */ \
    constexpr int assertion_size = sizeof(assertion_string); \
    constexpr int filename_size = sizeof(__FILE__)-1; \
    \
    /* __LINE__ gets turned into a buffer of length 6, it seems, so we need to find */ \
    /* the actual length in order to print the message */ \
    int line_size = 0; \
    for(int i=0;i<20;i++){ \
      if(line_string[i]!='\0'){ \
        line_size++; \
      } else { \
        break; \
      } \
    } \
    \
    memcpy(msg_ptr, __FILE__, filename_size); \
    msg_ptr[filename_size] = ':'; \
    memcpy(msg_ptr+filename_size+1, line_string, line_size); \
    msg_ptr[filename_size+1+line_size] = ':'; \
    /* This copy carries the NUL terminator, so no extra write past the message is needed */ \
    memcpy(msg_ptr+filename_size+1+line_size+1, assertion_string, assertion_size); \
    /* If we actually assert then we can't ever get the message to the host, so we */ \
    /* return and let the kernel generate garbage */ \
    return; \
  } \
} while (false)
// Standard CUDA success check.
// Evaluates `error` exactly once. On failure it prints the call site, the CUDA
// error string and a host stacktrace to std::cout, then lets execution
// continue (best effort; the macro does not abort).
// The argument is parenthesized and the temporary has a macro-specific name so
// that a call such as CUDA_CHECK_API_CALL(error_code) cannot end up
// initializing the temporary from itself.
#define CUDA_CHECK_API_CALL(error) \
do { \
  const auto cuda_check_api_call_status_ = (error); \
  if(cuda_check_api_call_status_!=cudaSuccess){ \
    std::cout<<"CUDA API call failure detected at ("<<__FILE__<<":"<<__LINE__<<"): "<<cudaGetErrorString(cuda_check_api_call_status_)<<std::endl; \
    std::cout<< boost::stacktrace::stacktrace() << std::endl; \
  } \
} while(false)
// Example kernel: the emulated assertion fails when x == 5.
// The parameter d_assert_buffer has the same name as the file-scope host
// pointer and shadows it inside the kernel.
__global__ void my_failing_kernel(int x, char *d_assert_buffer){
CUDA_COMMUNICATING_KERNEL_ASSERTION(x!=5, d_assert_buffer);
}
// Check that kernels ran correctly by acquiring the message buffer. BLOCKING.
// Synchronizes the device, copies the whole assertion buffer to the host and,
// if any messages were logged, prints the host call site, a stacktrace and
// every stored message.
void CUDA_CHECK_KERNEL_SUCCESS(const boost::source_location& location = BOOST_CURRENT_LOCATION){
  std::array<char, DEVICE_SIDE_ASSERTION_BUFFER_LEN> cuda_assert_buffer = {0};

  CUDA_CHECK_API_CALL(cudaDeviceSynchronize());
  assert(d_assert_buffer!=nullptr);
  // NOTE: We could maybe save time by only moving the message count initially and copying the messages
  // conditionally.
  CUDA_CHECK_API_CALL(cudaMemcpy(cuda_assert_buffer.data(), d_assert_buffer, DEVICE_SIDE_ASSERTION_BUFFER_LEN, cudaMemcpyDeviceToHost));
  CUDA_CHECK_API_CALL(cudaDeviceSynchronize());  // NOTE: Needed for buffers of <64kB

  // First four bytes hold the number of failures the device reported
  const uint32_t& msg_num = *reinterpret_cast<uint32_t*>(cuda_assert_buffer.data());
  if(msg_num==0){
    return;
  }

  std::cout<<"CUDA API call failure detected at ("<<location.file_name()<<":"<<location.line()<<":"<<location.column()<<"): "<<std::endl;
  std::cout<< boost::stacktrace::stacktrace();
  std::cout<<"Assertion messages ("<<msg_num<<" messages):"<<std::endl;

  // The device keeps counting after the slots run out, so only the first
  // DEVICE_SIDE_ASSERTION_COUNT messages exist; clamp to avoid reading past
  // the end of the host buffer.
  const uint32_t stored = msg_num<DEVICE_SIDE_ASSERTION_COUNT ? msg_num : DEVICE_SIDE_ASSERTION_COUNT;
  for(uint32_t i=0;i<stored;i++){
    std::cout<<" "<<i<<" "<<cuda_assert_buffer.data()+(4+i*DEVICE_SIDE_ASSERTION_LENGTH)<<std::endl;
  }
}
// Demo driver: allocates the device-side assertion buffer, runs a passing and
// a failing kernel, and checks for logged assertion messages after each.
int main(){
  CUDA_CHECK_API_CALL(cudaMalloc(&d_assert_buffer, DEVICE_SIDE_ASSERTION_BUFFER_LEN));
  // cudaMalloc does not clear memory. The first four bytes are the message
  // counter, so the buffer must be zeroed or the host would report garbage.
  CUDA_CHECK_API_CALL(cudaMemset(d_assert_buffer, 0, DEVICE_SIDE_ASSERTION_BUFFER_LEN));

  // x == 4 passes the emulated assertion
  my_failing_kernel<<<1, 1>>>(4, d_assert_buffer);
  CUDA_CHECK_KERNEL_SUCCESS();

  // x == 5 fails it
  my_failing_kernel<<<1, 1>>>(5, d_assert_buffer);
  CUDA_CHECK_KERNEL_SUCCESS();

  // Clean-up
  CUDA_CHECK_API_CALL(cudaFree(d_assert_buffer));
  return 0;
}
And the output looks like:
CUDA API call failure detected at (main_assert_from_device.cu:91:0):
0# 0x00005573A1F633A5 in ./a.out
1# 0x00005573A1F637C2 in ./a.out
2# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
3# 0x00005573A1F62D9E in ./a.out
Assertion messages (1 messages):
0 main_assert_from_device.cu:86:x==5
I want to write a function that tells me if a pointer is a host or device pointer. This is essentially a wrapper around cudaPointerGetAttributes() that returns either 1 or 0 if the pointer is for the device or not.
What I can't understand is why cudaPointerGetAttributes fails my error checking by returning invalid argument when I'm testing a host pointer. An example is provided below.
#include <stdio.h>
#include <stdlib.h>
/* Evaluate the CUDA runtime call `fun` once; if it did not return
 * cudaSuccess, print the line, function and error string to stderr and exit.
 * The macro ends in `while(0)` without a semicolon so that
 * `CUDA_ERROR_CHECK(x);` is exactly one statement (safe inside if/else). */
#define CUDA_ERROR_CHECK(fun) \
do{ \
    cudaError_t err = (fun); \
    if(err != cudaSuccess) \
    { \
        fprintf(stderr, "Cuda error %d %s:: %s\n", __LINE__, __func__, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
}while(0)
/* Report whether `ptr` has a device address known to the CUDA runtime.
 *
 * Returns 1 if cudaPointerGetAttributes() yields a non-NULL devicePointer,
 * 0 otherwise. A pointer the runtime does not know about (an ordinary host
 * pointer, but also NULL or a garbage value) makes cudaPointerGetAttributes()
 * return cudaErrorInvalidValue; that is treated as "not a device pointer"
 * instead of a fatal error. Any other error is still fatal. */
int is_device_pointer(const void *ptr)
{
    cudaPointerAttributes attributes;
    const cudaError_t status = cudaPointerGetAttributes(&attributes, ptr);

    if(status == cudaErrorInvalidValue)
    {
        /* Non-sticky error: clear it so later runtime calls do not report it */
        (void)cudaGetLastError();
        return 0;
    }
    CUDA_ERROR_CHECK(status);

    return (attributes.devicePointer != NULL) ? 1 : 0;
}
/* Demo driver: classifies a cudaMalloc'ed pointer with is_device_pointer()
 * and prints the result. */
int main()
{
    int *host_ptr, x = 0;
    int is_dev_ptr;
    host_ptr = &x;
    (void)host_ptr; /* only used by the commented-out host-pointer test below */

    int *dev_ptr = NULL;
    /* Check the allocation so a failure is reported here, not later */
    CUDA_ERROR_CHECK(cudaMalloc((void **)&dev_ptr, 16));

    //is_dev_ptr = is_device_pointer((const void *)host_ptr); //Causes invalid argument
    is_dev_ptr = is_device_pointer((const void *)dev_ptr); //Works

    if(is_dev_ptr == 1)
    {
        fprintf(stdout, "Device pointer\n");
    }
    else
    {
        fprintf(stdout, "Not device Pointer\n");
    }

    CUDA_ERROR_CHECK(cudaFree((void *)dev_ptr));
    CUDA_ERROR_CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
This is expected behavior. cudaPointerGetAttributes can only introspect pointers that have been recorded in some fashion with the CUDA runtime API. Refer to the documentation:
If pointer was not allocated in, mapped by or registered with context supporting unified addressing cudaErrorInvalidValue is returned.
What this is saying is that the pointer must have been returned or passed through an API such as cudaMalloc, cudaMallocManaged, cudaHostRegister, etc. for it to be "recognized" by cudaPointerGetAttributes. You must be in a UVA regime, and you must have acquired the pointer using an appropriate method.
In your case, passing a bare host pointer this way doesn't meet the requirements spelled out in the documentation, so the error return is expected.
This particular error return code is a "non-sticky" CUDA error, meaning it can be cleared out via cudaGetLastError(). In my view, it should be safe to interpret this error return code as "this is an ordinary host pointer". But of course, if you pass a garbage value, or an unallocated pointer, you will get the same error code.
What I was trying to do is modifying a variable which resides in mapped memory that would cause the main program to exit.
But instead of this the main program keeps spinning on while (var == 0) ; line. I don't know how the new value could be flushed out so it would be visible on the host side too.
Btw. the variable is declared as volatile everywhere and I tried using the __threadfence_system() function with no success.
The host -> device direction works well.
System: Windows 7 x64, driver 358.50, GTX 560
Here is the piece of code that I can't get working:
/* Print the CUDA error string with its source location and terminate the
 * program; does nothing when `err` is cudaSuccess. */
static void handleCUDAError(cudaError_t err, const char *file, int line)
{
    if (err == cudaSuccess)
        return;

    printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
}
/* Wraps a runtime call so that failures are reported at the call site */
#define CUDA_ERROR_CHECK(err) (handleCUDAError(err, __FILE__, __LINE__ ))
// Sets *semaphore to 1 and then issues a system-wide memory fence.
__global__ void echoKernel(volatile int* semaphore)
{
*semaphore = 1;
__threadfence_system();
}
// Question's repro: registers a host variable as mapped memory, launches a
// kernel that sets it, and spins on the host until the new value is seen.
int main()
{
CUDA_ERROR_CHECK(cudaSetDevice(0));
CUDA_ERROR_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));
volatile int var = 0;
volatile int *devptr;
// Pin `var` and obtain the device-side address the kernel will write through
CUDA_ERROR_CHECK(cudaHostRegister((int*)&var, sizeof (int), cudaHostRegisterMapped));
CUDA_ERROR_CHECK(cudaHostGetDevicePointer(&devptr, (int*)&var, 0));
echoKernel <<< 1, 1 >>> (devptr);
// No runtime API call happens between the launch and this spin loop
while (var == 0) ;
CUDA_ERROR_CHECK(cudaDeviceSynchronize());
CUDA_ERROR_CHECK(cudaHostUnregister((int*)&var));
CUDA_ERROR_CHECK(cudaDeviceReset());
return 0;
}
When I run your code on linux, it runs as-is without issue.
However, on Windows there is a problem with WDDM command batching. In effect, your kernel has not actually been launched on the GPU by the time you enter the while-loop, so the loop hangs.
The WDDM command queue is a queue of commands that will eventually go to the GPU device. Various events will cause this queue to be "flushed" and the contents to be delivered as a "batch" of commands to the GPU.
Various cuda runtime API calls may effectively force the "flushing" of the command queue, such as cudaDeviceSynchronize() or cudaMemcpy(). However after the kernel launch, you are not issuing any runtime API calls before entering your while-loop. As a result, in this scenario it seems that the kernel call is getting "stuck" in the queue and never "flushed".
You can work around this in a variety of ways, for example by recording an event after the launch of the kernel and then querying the status of that event. This will have the effect of flushing the queue, which will launch the kernel.
Here's an example modification of your code that works for me:
#include <stdio.h>
/* Print the CUDA error string with its source location and terminate the
 * program; does nothing when `err` is cudaSuccess. */
static void handleCUDAError(cudaError_t err, const char *file, int line)
{
    if (err == cudaSuccess)
        return;

    printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
}
/* Wraps a runtime call so that failures are reported at the call site */
#define CUDA_ERROR_CHECK(err) (handleCUDAError(err, __FILE__, __LINE__ ))
// Sets *semaphore to 1 and then issues a system-wide memory fence.
__global__ void echoKernel(volatile int* semaphore)
{
*semaphore = 1;
__threadfence_system();
}
// Registers a host variable as mapped memory, launches a kernel that sets it,
// and spins on the host until the new value is visible. An event is recorded
// and queried right after the launch so that the queued kernel is actually
// submitted to the GPU before the spin loop starts.
int main()
{
    CUDA_ERROR_CHECK(cudaSetDevice(0));
    CUDA_ERROR_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));

    volatile int var = 0;
    volatile int *devptr;
    // Pin `var` and obtain the device-side address the kernel will write through
    CUDA_ERROR_CHECK(cudaHostRegister((int*)&var, sizeof(int), cudaHostRegisterMapped));
    CUDA_ERROR_CHECK(cudaHostGetDevicePointer(&devptr, (int*)&var, 0));

    cudaEvent_t my_event;
    CUDA_ERROR_CHECK(cudaEventCreate(&my_event));

    echoKernel<<<1, 1>>>(devptr);
    CUDA_ERROR_CHECK(cudaEventRecord(my_event));
    // Deliberately unchecked: a not-yet-completed event returns a non-success
    // status, which is expected here. The query is issued only to flush the queue.
    cudaEventQuery(my_event);

    while (var == 0);

    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
    // Release the event instead of leaving its cleanup to cudaDeviceReset()
    CUDA_ERROR_CHECK(cudaEventDestroy(my_event));
    CUDA_ERROR_CHECK(cudaHostUnregister((int*)&var));
    CUDA_ERROR_CHECK(cudaDeviceReset());
    return 0;
}
Tested on CUDA 7.5, Driver 358.50, Win7 x64 release project, GTX460M.
Note that we don't wrap the cudaEventQuery call in a standard error checker, because the expected behavior for it is to return a non-zero status when the event has not been completed yet.
I cannot allocate even only 4 bytes of memory with cudaMallocHost() because of 'out of memory' error. I tried cudaSetDevice(0), cudaDeviceSynchronize(), cudaThreadSynchronize(), and cudaFree(0) at the very first of my code for initializing, but they don't work.
I think this link ("cudaMalloc always gives out of memory") has the answer I want, but I cannot understand it. How can I solve this problem?
Here is my full code.
/* test.cu */
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <assert.h>
/* Pass-through error check: on failure prints the CUDA error string to stderr
 * and trips an assertion; always hands `result` back to the caller. */
inline cudaError_t checkCuda(cudaError_t result)
{
    if (result == cudaSuccess)
        return result;

    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
    return result;
}
/* Question's repro: issues several initialization-style runtime calls
 * (return values ignored), then allocates and frees one int of host memory
 * with cudaMallocHost. */
int main()
{
cudaSetDevice(0);
cudaDeviceSynchronize();
/* NOTE(review): cudaThreadSynchronize is deprecated in newer toolkits -- confirm against the CUDA version in use */
cudaThreadSynchronize();
cudaFree(0);
int *test_ptr;
/* The only checked call; this is where the reported "out of memory" surfaces */
checkCuda( cudaMallocHost((void **)&test_ptr, sizeof(int)) );
cudaFreeHost(test_ptr);
printf("Test Success.\n");
return 0;
}
I compiled with this instruction:
nvcc test.cu -o test
and when I execute this:
me#me:~$ ./test
CUDA Runtime Error: out of memory
test: test.cu:10: cudaError_t checkCuda(cudaError_t): Assertion `result == cudaSuccess' failed.
Aborted
My cuda version is 5.0, I'll post more specific device information if you need.
I just rebooted my system, and the error changed :
me#me:~$ nvidia-smi -q
NVIDIA: could not open the device file /dev/nvidiactl (No such file or directory).
NVIDIA-SMI has failed because it couldn't communicate with NVIDIA driver. Make sure that latest NVIDIA driver is installed and running.
Sorry for my unconsciousness, I'll finish this thread because it became another problem. Thank you for your replies.
I have allocated page-aligned memory on host using posix_memalign. The call to posix_memalign does not return any error. However, using this pointer as argument to cudaHostRegister gives me an 'invalid argument' error. What could be the issue?
CUDA API version: 4.0
gcc version: 4.4.5
GPU compute capability: 2.0
The memory allocation is done in the application code, and a pointer is passed to a library routine.
Application code snippet:
if(posix_memalign((void **)&h_A, getpagesize(), n * n * sizeof(float))) {
printf("Error allocating aligned memory for A\n");
return 1;
}
Shared library code snippet:
if((ret = cudaSetDeviceFlags(cudaDeviceMapHost)) != cudaSuccess) {
fprintf(stderr, "Error setting device flag: %s\n",
cudaGetErrorString(ret));
return NULL;
}
if((ret = cudaHostRegister(h_A, n2 * sizeof(float),
cudaHostRegisterMapped)) != cudaSuccess) {
fprintf(stderr, "Error registering page-locked memory for A: %s\n",
cudaGetErrorString(ret));
return NULL;
}
I cannot reproduce this. If I take the code snippets you supplied and make them into a minimal executable:
#include <unistd.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdio.h>
/* Minimal repro: allocate a page-aligned host buffer, enable mapped host
 * memory, and register the buffer with the CUDA runtime.
 * Returns 0 on success and -1 if any of the three steps fails. */
int main(void)
{
    const int n2 = 100 * 100;
    float *h_A;
    cudaError_t ret;

    /* posix_memalign returns non-zero on failure */
    const int alloc_status = posix_memalign((void **)&h_A, getpagesize(), n2 * sizeof(float));
    if(alloc_status != 0) {
        printf("Error allocating aligned memory for A\n");
        return -1;
    }

    ret = cudaSetDeviceFlags(cudaDeviceMapHost);
    if(ret != cudaSuccess) {
        fprintf(stderr, "Error setting device flag: %s\n",
                cudaGetErrorString(ret));
        return -1;
    }

    ret = cudaHostRegister(h_A, n2 * sizeof(float), cudaHostRegisterMapped);
    if(ret != cudaSuccess) {
        fprintf(stderr, "Error registering page-locked memory for A: %s\n",
                cudaGetErrorString(ret));
        return -1;
    }

    return 0;
}
it compiles and runs without error under both CUDA 4.2 and CUDA 5.0 on a 64 bit linux host with the 304.54 driver. I would, therefore, conclude that either you have a broken CUDA installation or your code has a problem somewhere you haven't shown us.
Perhaps you can compile and run this code exactly as I posted and see what happens. If it works, it might help narrow down what it is that might be going wrong here.