How to solve "Segmentation fault (core dumped)" - libpcap

Here is my code:
#include<stdio.h>
#include<pcap.h>

void pcapdump(u_char* argument,const struct pcap_pkthdr* packet_header,const u_char* packet_content);

int main()
{
    int i=0, devid,ret;
    char errbuf[PCAP_ERRBUF_SIZE];
    pcap_t *handle;
    bpf_u_int32 mask;
    bpf_u_int32 net;
    int num_packets=500;
    pcap_dumper_t *p;
    pcap_if_t *alldevs;
    pcap_if_t *pdev;
    const struct pcap_pkthdr *packet_header;
    const u_char *packet_content;

    ret=pcap_findalldevs(&alldevs,errbuf);
    if(ret=-1)
    {
        printf("%s",errbuf);
    };
    for (pdev = alldevs;pdev;pdev=pdev->next)
        printf("#%d: %s %s %s\n",++i,pdev->name,pdev->description,pdev->description);
    printf("select a device: ");
    scanf("%d", &devid);
    pdev=alldevs;
    while (--devid)
        pdev=pdev->next;
    printf("Selected %s \n", pdev->name);
    if (pcap_lookupnet(pdev->name,&net,&mask,errbuf)==-1)
    {
        printf("Couldn't get netmask for device %s: %s\n", pdev->name, errbuf);
        net = 0;
        mask = 0;
    };
    handle=pcap_open_live(pdev->name,BUFSIZ,1,0,errbuf);
    printf("Number of packets: %d\n", num_packets);
    pcap_dump_open(handle,"/home/jiangzhongbai/capturefiles/10.pcapng");
    pcap_loop(handle,num_packets,pcap_dump,NULL);
    pcap_dump_close(p);
    pcap_freealldevs(alldevs);
    pcap_close(handle);
    printf("\nCapture complete.\n");
    return 0;
}
The result is
eth0 (null) (null)
wlan0 (null) (null)
nflog Linux netfilter log (NFLOG) interface Linux netfilter log (NFLOG) interface
nfqueue Linux netfilter queue (NFQUEUE) interface Linux netfilter queue (NFQUEUE) interface
any Pseudo-device that captures on all interfaces Pseudo-device that captures on all interfaces
lo (null) (null)
select a device: 2
Selected wlan0
Number of packets: 500
Segmentation fault (core dumped)
I think there is something wrong with the function pcap_dump_open, but I don't know how to solve the "Segmentation fault (core dumped)" problem. Please help me.

How to solve Segmentation fault (core dumped)
If pcap_findalldevs() returns -1, don't just print an error message; quit, because alldevs isn't necessarily set to a valid value or to NULL.
Do not assume that pdev->description is non-null; only print it if it's non-null.
Assign the result of pcap_dump_open() to the variable p, and check it for NULL before using it.
Pass p, rather than NULL, as the fourth argument to pcap_loop(), so that pcap_dump() has a valid dump-file handle to write to (see the sketch below).
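Putting those fixes together, a minimal corrected sketch might look like this. The netmask lookup is omitted since no filter is ever compiled, and the NULL checks and input validation are additions beyond the list above, kept deliberately simple:

#include <stdio.h>
#include <pcap.h>

int main(void)
{
    char errbuf[PCAP_ERRBUF_SIZE];
    pcap_if_t *alldevs, *pdev;
    pcap_t *handle;
    pcap_dumper_t *p;
    int i = 0, devid, num_packets = 500;

    if (pcap_findalldevs(&alldevs, errbuf) == -1) {   /* == , not = */
        fprintf(stderr, "%s\n", errbuf);
        return 1;                      /* quit: alldevs may not be valid */
    }
    for (pdev = alldevs; pdev; pdev = pdev->next)
        printf("#%d: %s %s\n", ++i, pdev->name,
               pdev->description ? pdev->description : "(no description)");
    printf("select a device: ");
    if (scanf("%d", &devid) != 1 || devid < 1 || devid > i)
        return 1;
    for (pdev = alldevs; --devid; pdev = pdev->next)
        ;                              /* walk to the selected device */
    printf("Selected %s\n", pdev->name);

    handle = pcap_open_live(pdev->name, BUFSIZ, 1, 0, errbuf);
    if (handle == NULL) {
        fprintf(stderr, "pcap_open_live: %s\n", errbuf);
        return 1;
    }
    printf("Number of packets: %d\n", num_packets);

    p = pcap_dump_open(handle, "/home/jiangzhongbai/capturefiles/10.pcapng");
    if (p == NULL) {
        fprintf(stderr, "pcap_dump_open: %s\n", pcap_geterr(handle));
        return 1;
    }
    pcap_loop(handle, num_packets, pcap_dump, (u_char *)p);   /* pass p, not NULL */

    pcap_dump_close(p);
    pcap_close(handle);
    pcap_freealldevs(alldevs);
    printf("\nCapture complete.\n");
    return 0;
}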

Related

Is it possible to get assertion info from within a CUDA kernel?

Is there any way to get a kernel assert message/line number back from a kernel failure?
That is, if I have:
__global__ void my_kernel(int x){
    assert(x!=0);
}

int main(){
    CUDA_CHECK(my_kernel<<<1,1>>>(0));
    CHECK_WITH_ASSERTION_FETCH(cudaDeviceSynchronize());
}
My understanding is that CUDA_CHECK() passes here and cudaDeviceSynchronize() would return a failure code (specifically, CUDA error: device-side assert triggered).
Is there a function CHECK_WITH_ASSERTION_FETCH that can somehow get info about which assertion failed when it observes that cudaDeviceSynchronize() is returning an error? The file and line number in which the assertion failed would be sufficient.
Is there a function CHECK_WITH_ASSERTION_FETCH that can somehow get info about which assertion failed when it observes that cudaDeviceSynchronize() is returning an error?
No there isn't.
As per the documentation, one way that you can see which line of code triggered the assertion and in which block and thread the assertion was raised is by attaching the debugger to the running kernel.
Robert Crovella is the authoritative voice here, and he says it isn't possible for a kernel assert to get information about itself back to the host. So we need some workarounds.
A major complicating factor is that if assert is called on the device then we are no longer able to communicate with it from the host, so any data we write to device memory is lost forever. (Reference).
Below I offer three workarounds:
Using Unified Memory to pass info from the GPU to the CPU even "after" an assert is called. This is the best answer.
Improving the GPU's assertion error messages by passing stacks to the GPU.
Passing info from the GPU to the CPU by dropping asserts and writing to memory. You'd only use this if UVM wasn't an option for some reason.
Using Unified Memory
Unified Memory allows the CUDA device and the host to transparently shuffle bits between each other without the need for cudaMemcpy. The result is that even though throwing an assert blocks our access to the device via regular API calls, we can still transfer signals back to the CPU via the Unified Memory.
Note that if we want kernels to be able to run asynchronously we need a way to associate kernel launches with assertion failures. The circular buffer here provides this functionality.
The code for this is:
//Compile with nvcc -g main.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/assert/source_location.hpp>
#include <boost/stacktrace.hpp>
#include <array>
#include <cassert>
#include <iostream>
// Number of assertion failure messages we can store. If this is too small
// threads will fail silently.
#define DEVICE_SIDE_ASSERTION_COUNT 10
// Used by preprocessor to convert things to strings
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define LINE_STRING TOSTRING(__LINE__)
// Standard CUDA success check
#define CUDA_CHECK_API_CALL(error) \
do { \
const auto error_code = error; \
if(error_code!=cudaSuccess){ \
std::cout<<"CUDA API call failure detected at ("<<__FILE__<<":"<<__LINE__<<"): "<<cudaGetErrorString(error_code)<<std::endl; \
std::cout<< boost::stacktrace::stacktrace() << std::endl; \
}} while(false)
// Copy string from `src` to `dst`
__device__ void dstrcpy(char *dst, const char *src){
for(;*src!='\0';dst++,src++){
*dst = *src;
}
*dst = '\0';
}
// Used to hold assertion data generated by the device
struct AssertionData {
char assertion_msg[1000];
char filename[1000];
char function_name[1000];
int line_number;
uint32_t caller;
dim3 block_id;
dim3 thread_id;
};
// Used to hold assertions generated by the device
struct AssertionsData {
int assertion_count;
AssertionData assertions[DEVICE_SIDE_ASSERTION_COUNT];
// Get the next place to insert an assertion failure message
__device__ int next_id(){
// Atomically increment so other threads can fail at the same time
return atomicAdd(&assertion_count, 1);
}
__device__ void insert(
const char *assertion_msg0,
const char *filename0,
const char *function_name0,
const int line_number0,
const uint32_t caller0,
const dim3 block_id0,
const dim3 thread_id0
){
const auto nid = next_id();
if(nid>=DEVICE_SIDE_ASSERTION_COUNT){
printf("RAN OUT OF ASSERTION BUFFER SPACE!");
return;
}
auto& self = assertions[nid];
dstrcpy(self.assertion_msg, assertion_msg0);
dstrcpy(self.filename, filename0);
dstrcpy(self.function_name, function_name0);
self.line_number = line_number0;
self.caller = caller0;
self.block_id = block_id0;
self.thread_id = thread_id0;
}
};
// Pointer to device memory allocated to hold assertion failure messages
AssertionsData *uvm_assertions = nullptr;
// Use to hold stack traces generated by the host so that we can run kernels
// asynchronously and still associate stacks to assertion failures
struct StackTraceInfo {
boost::stacktrace::stacktrace stacktrace;
int device;
cudaStream_t stream;
uint32_t generation_number;
StackTraceInfo() = default;
StackTraceInfo(int generation_number0, cudaStream_t stream0) {
// Technically we'd want to lop the top few layers off of this
generation_number = generation_number0;
stacktrace = boost::stacktrace::stacktrace();
CUDA_CHECK_API_CALL(cudaGetDevice(&device));
stream = stream0;
}
};
// Circular buffer used to hold stacks generated by the host
struct CircularTraceBuffer {
// Assume that this is the max number of items that might ever be enqueued
// across all streams
static constexpr int max_size = 1024;
// How many stacktraces we've inserted. Used to ensure that circular queue
// doesn't provide false information by always increasing, but also to mark
// where we are inserting into the queue
uint32_t generation_number = 0;
// The buffer
std::array<StackTraceInfo, max_size> traces;
uint32_t insert(cudaStream_t stream_id) {
traces[generation_number % max_size] = StackTraceInfo(generation_number, stream_id);
return generation_number++;
}
};
// Circular buffer of host stacktraces for associating with kernel launches
CircularTraceBuffer circular_trace_buffer;
// Emulates a kernel assertion: records the failure details for the host and then
// raises the real device-side assert, which stops the kernel.
#define CUDA_COMMUNICATING_KERNEL_ASSERTION(condition, assertions_data, caller) \
do { \
if (! (condition)) { \
/* Atomically increment so other threads can fail at the same time */ \
assertions_data->insert( \
TOSTRING(condition), \
__FILE__, \
__FUNCTION__, \
__LINE__, \
caller, \
blockIdx, \
threadIdx \
); \
\
assert(condition); \
} \
} while (false);
// NOTE: Our kernels now need a pointer to the assertions data and an id for the caller
// NOTE: We can simplify our code by assuming these variables always have the same names
// so that they do not need to be passed to the preprocessor macro
__global__ void my_failing_kernel(int x, AssertionsData *const assertions_data, const uint32_t caller){
CUDA_COMMUNICATING_KERNEL_ASSERTION(x!=5, assertions_data, caller);
}
// Check that kernels ran correctly by acquiring the message buffer. BLOCKING.
void CUDA_CHECK_KERNEL_SUCCESS(const boost::source_location& location = BOOST_CURRENT_LOCATION){
if(cudaDeviceSynchronize()==cudaSuccess){
return;
}
std::cout<<"CUDA API call failure detected at ("<<location.file_name()<<":"<<location.line()<<":"<<location.column()<<"): "<<std::endl;
std::cout<< boost::stacktrace::stacktrace()<<std::endl;
for(int i=0;i<uvm_assertions->assertion_count;i++){
std::cout<<"Assertion failure "<<i<<std::endl;
const auto &self = uvm_assertions->assertions[i];
const auto &stack = circular_trace_buffer.traces[self.caller];
std::cout<<"GPU "<<self.filename<<":"
<<self.line_number<<"("
<<self.function_name<<"): "
<<self.assertion_msg<<std::endl;
if(stack.generation_number == self.caller){
std::cout<<stack.stacktrace
<<"Device = "<<stack.device<<", "
<<"Stream = "<<stack.stream
<<std::endl;
} else {
std::cout<<"CPU stack has been overwritten!"<<std::endl;
}
}
}
int main(){
CUDA_CHECK_API_CALL(cudaMallocManaged(&uvm_assertions, sizeof(AssertionsData)));
CUDA_CHECK_API_CALL(cudaMemAdvise(
uvm_assertions, sizeof(AssertionsData), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId
));
// GPU will establish direct mapping of data in CPU memory, no page faults will be generated
CUDA_CHECK_API_CALL(cudaMemAdvise(
uvm_assertions, sizeof(AssertionsData), cudaMemAdviseSetAccessedBy, 0
));
my_failing_kernel<<<1, 1, 0>>>(4, uvm_assertions, circular_trace_buffer.insert(0));
my_failing_kernel<<<1, 1, 0>>>(5, uvm_assertions, circular_trace_buffer.insert(0));
CUDA_CHECK_KERNEL_SUCCESS();
CUDA_CHECK_API_CALL(cudaFree(uvm_assertions));
return 0;
}
The output for the above is:
main_assert_um_from_device.cu:162: void my_failing_kernel(int, AssertionsData *, unsigned int): block: [0,0,0], thread: [0,0,0] Assertion `x!=5` failed.
CUDA API call failure detected at (main_assert_um_from_device.cu:167:0):
0# 0x000055D3D8CEAFF2 in ./a.out
1# 0x000055D3D8CEB700 in ./a.out
2# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
3# 0x000055D3D8CEADAE in ./a.out
Assertion failure 0
GPU main_assert_um_from_device.cu:162(my_failing_kernel): x!=5
0# 0x000055D3D8CECEF9 in ./a.out
1# 0x000055D3D8CED135 in ./a.out
2# 0x000055D3D8CEB6B9 in ./a.out
3# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
4# 0x000055D3D8CEADAE in ./a.out
Device = 0, Stream = 0
Better Assert Messages
This workaround makes the device-side assert message better. To do so, we collect stacktrace strings on the host and transfer them to the GPU. Then, when we call a kernel, we pass a pointer to the stacktrace string. If the kernel fails an assertion condition, we print the stacktrace before triggering the assertion.
The code for that is:
//Compile with nvcc -g main.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/stacktrace.hpp>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
// Used by preprocessor to convert things to strings
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
// Print a beefy kernel assertion message followed by inducing failure using
// the actual assertion
#define CUDA_DEVICE_ASSERT_WITH_STACKTRACE(condition, message) \
do { \
if (! (condition)) { \
printf("Assertion '%s' failed at %s:%d as part of stacktrace:\n%s", \
TOSTRING(condition), \
__FILE__, \
__LINE__, \
message); \
} \
/* Perform actual assertion to stop kernel progress */ \
assert(condition); \
} while (false)
__global__ void my_failing_kernel(int x, const char *d_stacktrace){
CUDA_DEVICE_ASSERT_WITH_STACKTRACE(x!=5, d_stacktrace);
}
// Increases performance by caching stack traces so we don't repeatedly
// transfer the same data to the GPU
std::unordered_map<std::string, char*> cached_stacks;
// Send a stacktrace to the GPU, cache the pointer it's stored at, return
// said pointer
char* setup_device_stacktrace(){
std::stringstream ss;
ss << boost::stacktrace::stacktrace();
const auto cached_stack = cached_stacks.find(ss.str());
if(cached_stack!=cached_stacks.end()){
std::cerr<<"Using cached stacktrace!"<<std::endl;
return cached_stack->second;
}
char *d_stacktrace = nullptr;
cudaMalloc(&d_stacktrace, 10000);
cudaMemcpy(d_stacktrace, ss.str().c_str(), ss.str().size(), cudaMemcpyHostToDevice);
cached_stacks[ss.str()] = d_stacktrace;
return d_stacktrace;
}
// Make an interesting stack
void nested_n(int depth, int val){
if(depth<5){
nested_n(depth+1, val);
} else {
const char* d_stacktrace = setup_device_stacktrace();
my_failing_kernel<<<1, 1>>>(val, d_stacktrace);
cudaDeviceSynchronize();
}
}
// Make an interesting stack
void nested3(int val){ nested_n(0, val); }
void nested2(int val){ nested3(val); }
void nested1(int val){ nested2(val); }
int main(){
for(int i=4;i<6;i++){
std::cerr<<"Running with value = "<<i<<std::endl;
nested1(i);
}
// Clean-up
for(const auto &x: cached_stacks){
cudaFree(x.second);
}
return 0;
}
This gives the output:
Running with value = 4
Running with value = 5
Using cached stacktrace!
Assertion 'x!=5' failed at main.cu:31 as part of stacktrace:
0# 0x000055BBF4A3CF76 in ./a.out
1# 0x000055BBF4A3D262 in ./a.out
2# 0x000055BBF4A3D258 in ./a.out
3# 0x000055BBF4A3D258 in ./a.out
4# 0x000055BBF4A3D258 in ./a.out
5# 0x000055BBF4A3D258 in ./a.out
6# 0x000055BBF4A3D258 in ./a.out
7# 0x000055BBF4A3D313 in ./a.out
8# 0x000055BBF4A3D32F in ./a.out
9# 0x000055BBF4A3D34B in ./a.out
10# 0x000055BBF4A3D3CF in ./a.out
11# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
12# 0x000055BBF4A3CE0E in ./a.out
main.cu:31: void my_failing_kernel(int, const char *): block: [0,0,0], thread: [0,0,0] Assertion `x!=5` failed.
Replace The Device Assertion With Magic
Here the idea is to replace the device-side assert with our Own Special Assert. Our OSA writes information about itself to device memory, and the host reads this to see what went wrong. Note that we'd only want to do this if the Unified Memory solution wasn't possible for some reason.
Rather than have the kernel fail with an assert, we have any failing threads early-exit the kernel while the rest of the threads continue working. The result is garbage, but at least we can get information about why!
The code for this is:
//Compile with nvcc -g main.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/assert/source_location.hpp>
#include <boost/stacktrace.hpp>
#include <array>
#include <cassert>
#include <iostream>
// Pointer to device memory allocated to hold assertion failure messages
char *d_assert_buffer = nullptr;
// Number of assertion failure messages we can store. If this is too small
// threads will fail silently.
#define DEVICE_SIDE_ASSERTION_COUNT 10
// Length of each assertion failure message - if this is too small we get
// garbage as threads overwrite each other
#define DEVICE_SIDE_ASSERTION_LENGTH 500
// Total size of the assertion failure message buffer. First 4 bytes stores the
// number of logged messages
#define DEVICE_SIDE_ASSERTION_BUFFER_LEN (4 + DEVICE_SIDE_ASSERTION_COUNT * DEVICE_SIDE_ASSERTION_LENGTH)
// Used by preprocessor to convert things to strings
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define LINE_STRING TOSTRING(__LINE__)
// Emulates a kernel assertion. The assertion won't stop the kernel's progress, so you
// should assume everything the kernel produces is garbage if there's an assertion failure.
#define CUDA_COMMUNICATING_KERNEL_ASSERTION(condition, buffer) \
do { \
if (! (condition)) { \
/* First four bytes of the buffer indicate which buffer we're using */ \
uint32_t *const msgnum_ptr = reinterpret_cast<uint32_t*>(d_assert_buffer); \
/* Atomically increment so other threads can fail at the same time */ \
const uint32_t msg_num = atomicAdd(msgnum_ptr, 1); \
if(msg_num>=DEVICE_SIDE_ASSERTION_COUNT){ \
printf("RAN OUT OF ASSERTION BUFFER SPACE!\n"); \
return; \
} \
\
/* Find the start of the buffer we'll be writing to */ \
char *const msg_ptr = d_assert_buffer + 4 + msg_num * DEVICE_SIDE_ASSERTION_LENGTH; \
\
constexpr char const assertion_string[] = TOSTRING(x==5); \
constexpr char const line_string[] = LINE_STRING; \
constexpr int assertion_size = sizeof(assertion_string); \
constexpr int filename_size = sizeof(__FILE__)-1; \
\
/* __LINE__ gets turned into a buffer of length 6, it seems, so we need to find */ \
/* the actual length in order to print the message */ \
int line_size = 0; \
for(int i=0;i<20;i++){ \
if(line_string[i]!='\0'){ \
line_size++; \
} else { \
break; \
} \
} \
\
memcpy(msg_ptr, __FILE__, filename_size); \
msg_ptr[filename_size] = ':'; \
memcpy(msg_ptr+filename_size+1, line_string, line_size); \
msg_ptr[filename_size+1+line_size] = ':'; \
memcpy(msg_ptr+filename_size+1+line_size+1, assertion_string, assertion_size); \
msg_ptr[filename_size+1+line_size+1+assertion_size] = '\0'; \
/* If we actually assert then we can't ever get the message to the host, so we */ \
/* return and let the kernel generate garbage */ \
return; \
} \
} while (false);
// Standard CUDA success check
#define CUDA_CHECK_API_CALL(error) \
do { \
const auto error_code = error; \
if(error_code!=cudaSuccess){ \
std::cout<<"CUDA API call failure detected at ("<<__FILE__<<":"<<__LINE__<<"): "<<cudaGetErrorString(error_code)<<std::endl; \
std::cout<< boost::stacktrace::stacktrace() << std::endl; \
}} while(false)
__global__ void my_failing_kernel(int x, char *d_assert_buffer){
CUDA_COMMUNICATING_KERNEL_ASSERTION(x!=5, d_assert_buffer);
}
// Check that kernels ran correctly by acquiring the message buffer. BLOCKING.
void CUDA_CHECK_KERNEL_SUCCESS(const boost::source_location& location = BOOST_CURRENT_LOCATION){
std::array<char, DEVICE_SIDE_ASSERTION_BUFFER_LEN> cuda_assert_buffer = {0};
CUDA_CHECK_API_CALL(cudaDeviceSynchronize());
assert(d_assert_buffer!=nullptr);
// NOTE: We could maybe save time by only moving the message count initially and copying the messages
// conditionally.
CUDA_CHECK_API_CALL(cudaMemcpy(cuda_assert_buffer.data(), d_assert_buffer, DEVICE_SIDE_ASSERTION_BUFFER_LEN, cudaMemcpyDeviceToHost));
CUDA_CHECK_API_CALL(cudaDeviceSynchronize()); // NOTE: Needed for buffers of <64kB
const uint32_t& msg_num = *reinterpret_cast<uint32_t*>(cuda_assert_buffer.data());
if(msg_num==0){
return;
}
std::cout<<"CUDA API call failure detected at ("<<location.file_name()<<":"<<location.line()<<":"<<location.column()<<"): "<<std::endl;
std::cout<< boost::stacktrace::stacktrace();
std::cout<<"Assertion messages ("<<msg_num<<" messages):"<<std::endl;
for(int i=0;i<msg_num;i++){
std::cout<<" "<<i<<" "<<cuda_assert_buffer.data()+(4+i*DEVICE_SIDE_ASSERTION_LENGTH)<<std::endl;
}
}
int main(){
CUDA_CHECK_API_CALL(cudaMalloc(&d_assert_buffer, DEVICE_SIDE_ASSERTION_BUFFER_LEN));
my_failing_kernel<<<1, 1>>>(4, d_assert_buffer);
CUDA_CHECK_KERNEL_SUCCESS();
my_failing_kernel<<<1, 1>>>(5, d_assert_buffer);
CUDA_CHECK_KERNEL_SUCCESS();
// Clean-up
cudaFree(d_assert_buffer);
return 0;
}
And the output looks like:
CUDA API call failure detected at (main_assert_from_device.cu:91:0):
0# 0x00005573A1F633A5 in ./a.out
1# 0x00005573A1F637C2 in ./a.out
2# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
3# 0x00005573A1F62D9E in ./a.out
Assertion messages (1 messages):
0 main_assert_from_device.cu:86:x==5

Why is cudaPointerGetAttributes() returning invalid argument for host pointer?

I want to write a function that tells me if a pointer is a host or device pointer. This is essentially a wrapper around cudaPointerGetAttributes() that returns either 1 or 0 if the pointer is for the device or not.
What I can't understand is why cudaPointerGetAttributes fails my error checking by returning invalid argument when I'm testing a host pointer. An example is provided below.
#include <stdio.h>
#include <stdlib.h>
#define CUDA_ERROR_CHECK(fun) \
do{ \
cudaError_t err = fun; \
if(err != cudaSuccess) \
{ \
fprintf(stderr, "Cuda error %d %s:: %s\n", __LINE__, __func__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
}while(0);
int is_device_pointer(const void *ptr)
{
int is_device_ptr = 0;
cudaPointerAttributes attributes;
CUDA_ERROR_CHECK(cudaPointerGetAttributes(&attributes, ptr));
if(attributes.devicePointer != NULL)
{
is_device_ptr = 1;
}
return is_device_ptr;
}
int main()
{
int *host_ptr, x = 0;
int is_dev_ptr;
host_ptr = &x;
int *dev_ptr;
cudaMalloc((void **)&dev_ptr, 16);
//is_dev_ptr = is_device_pointer((const void *)host_ptr); //Causes invalid argument
is_dev_ptr = is_device_pointer((const void *)dev_ptr); //Works
if(is_dev_ptr == 1)
{
fprintf(stdout, "Device pointer\n");
}
else
{
fprintf(stdout, "Not device Pointer\n");
}
CUDA_ERROR_CHECK(cudaFree((void *)dev_ptr));
CUDA_ERROR_CHECK(cudaDeviceReset());
return EXIT_SUCCESS;
}
This is expected behavior. cudaPointerGetAttributes can only introspect pointers that have been recorded in some fashion with the CUDA runtime API. Refer to the documentation:
If pointer was not allocated in, mapped by or registered with context supporting unified addressing cudaErrorInvalidValue is returned.
What this is saying is that the pointer must have been returned or passed through an API such as cudaMalloc, cudaMallocManaged, cudaHostRegister, etc. for it to be "recognized" by cudaPointerGetAttributes. You must be in a UVA regime, and you must have acquired the pointer using an appropriate method.
In your case, passing a bare host pointer this way doesn't meet the requirements spelled out in the documentation, so the error return is expected.
This particular error return code is a "non-sticky" CUDA error, meaning it can be cleared out via cudaGetLastError(). In my view, it should be safe to interpret this error return code as "this is an ordinary host pointer". But of course, if you pass a garbage value, or an unallocated pointer, you will get the same error code.
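For illustration, a sketch of the is_device_pointer() wrapper from the question, modified along those lines (treating the error return as "ordinary host pointer" and clearing the non-sticky error), might look like this:

#include <stdio.h>
#include <stdlib.h>

/* Returns 1 for a device pointer, 0 for anything the runtime does not
   recognize (e.g. an ordinary host pointer). Exits on unexpected errors. */
int is_device_pointer(const void *ptr)
{
    cudaPointerAttributes attributes;
    cudaError_t err = cudaPointerGetAttributes(&attributes, ptr);
    if (err == cudaErrorInvalidValue) {
        /* Pointer unknown to the runtime: clear the non-sticky error and
           treat it as a host pointer. */
        cudaGetLastError();
        return 0;
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaPointerGetAttributes: %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    return attributes.devicePointer != NULL;
}

As noted above, a garbage or unallocated pointer will also take the cudaErrorInvalidValue path, so this only distinguishes "known to the runtime" from "not known".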

How do identify STATUS_INVALID_CRUNTIME_PARAMETER exception

Platform is Windows 7 SP1.
I recently spent some time debugging an issue that was caused by code passing an invalid parameter to one of the "safe" CRT functions. As a result my application was aborted right away with no warning or anything -- not even a crash dialog.
At first, I tried to figure this out by attaching Windbg to my application. However, when the crash happened, by the time the code broke into Windbg pretty much every thread had been killed, save for the ONE thread that Windbg broke in on. There was no clue as to what was wrong. So I attached Visual Studio as a debugger instead, and when my application terminated I saw every thread exiting with error code 0xc0000417. That is what gave me the clue that there was an invalid parameter issue somewhere.
Next, the way I went about debugging this was to once again attach Windbg to my application, but this time place breakpoints by trial and error in various places like kernel32!TerminateThread, kernel32!UnhandledExceptionFilter and kernel32!SetUnhandledExceptionFilter.
Of the lot, placing a breakpoint at SetUnhandledExceptionFilter immediately showed the callstack of the offending thread when the crash occurred, and the CRT function that we were calling incorrectly.
Question: Is there anything intuitive that should have told me to place a breakpoint on SetUnhandledExceptionFilter right away? I would like to understand this a bit better and not do it by trial and error. My second question is about the error code I determined via Visual Studio: without resorting to VS, how do I determine thread exit codes in Windbg?
I was going to just comment, but this became bigger, so here is an answer.
Setting windbg as the postmortem debugger (by running windbg -I) will also route all unhandled exceptions to windbg.
windbg -I registers windbg as the postmortem debugger.
By default, Auto is set to 1 in the AeDebug registry key.
If you don't want to debug every program, you can edit this to 0, which gives you an additional "Do you want to debug?" option in the WER dialog.
reg query "hklm\software\microsoft\windows nt\currentversion\aedebug"
HKEY_LOCAL_MACHINE\software\microsoft\windows nt\currentversion\aedebug
Debugger REG_SZ "xxxxxxxxxx\windbg.exe" -p %ld -e %ld -g
Auto REG_SZ 0
Assuming you have registered a postmortem debugger and you run this code:
#include <stdio.h>
#include <stdlib.h>
int main (void)
{
    unsigned long input[] = {1,45,0xf001,0xffffffff};
    int i = 0;
    char buf[5] = {0};
    for(i=0;i<_countof(input);i++)
    {
        _ultoa_s(input[i],buf,sizeof(buf),16);
        printf("%s\n",buf);
    }
    return 1;
}
On the exception you will see a WER dialog, and you can choose to debug the program.
Windows also writes the exit code of the unhandled exception to the event log.
You can use PowerShell to retrieve the most recent such event like this:
PS C:\> Get-EventLog -LogName Application -Source "Application Error" -newest 1| format-list
Index : 577102
EntryType : Error
InstanceId : 1000
Message : Faulting application name:
ultos.exe, version: 0.0.0.0, time stamp: 0x577680f1
Faulting module name: ultos.exe, version:
0.0.0.0, time stamp: 0x577680f1
Exception code: 0xc0000417
Fault offset: 0x000211c2
Faulting process id: 0x4a8
Faulting application start time: 0x01d1d3aaf61c8aaa
Faulting application path: E:\test\ulto\ultos.exe
Faulting module path: E:\test\ulto\ultos.exe
Report Id: 348d86fc-3f9e-11e6-ade2-005056c00008
Category : Application Crashing Events
CategoryNumber : 100
ReplacementStrings : {ultos.exe, 0.0.0.0, 577680f1, ultos.exe...}
Source : Application Error
TimeGenerated : 7/1/2016 8:42:21 PM
TimeWritten : 7/1/2016 8:42:21 PM
UserName :
And if you choose to debug, you can view the call stack:
0:000> kPL
# ChildEBP RetAddr
00 001ffdc8 77cf68d4 ntdll!KiFastSystemCallRet
01 001ffdcc 75e91fdb ntdll!NtTerminateProcess+0xc
02 001ffddc 012911d3 KERNELBASE!TerminateProcess+0x2c
03 001ffdec 01291174 ultos!_invoke_watson(
wchar_t * expression = 0x00000000 "",
wchar_t * function_name = 0x00000000 "",
wchar_t * file_name = 0x00000000 "",
unsigned int line_number = 0,
unsigned int reserved = 0)+0x31
04 001ffe10 01291181 ultos!_invalid_parameter(
wchar_t * expression = <Value unavailable error>,
wchar_t * function_name = <Value unavailable error>,
wchar_t * file_name = <Value unavailable error>,
unsigned int line_number = <Value unavailable error>,
unsigned int reserved = <Value unavailable error>)+0x7a
05 001ffe28 0128ad96 ultos!_invalid_parameter_noinfo(void)+0xc
06 001ffe3c 0128affa ultos!common_xtox<unsigned long,char>(
unsigned long original_value = 0xffffffff,
char * buffer = 0x001ffea4 "",
unsigned int buffer_count = 5,
unsigned int radix = 0x10,
bool is_negative = false)+0x58
07 001ffe5c 0128b496 ultos!common_xtox_s<unsigned long,char>(
unsigned long value = 0xffffffff,
char * buffer = 0x001ffea4 "",
unsigned int buffer_count = 5,
unsigned int radix = 0x10,
bool is_negative = false)+0x59
08 001ffe78 012712b2 ultos!_ultoa_s(
unsigned long value = 0xffffffff,
char * buffer = 0x001ffea4 "",
unsigned int buffer_count = 5,
int radix = 0n16)+0x18
09 001ffeac 0127151b ultos!main(void)+0x52
0a (Inline) -------- ultos!invoke_main+0x1d
0b 001ffef8 76403c45 ultos!__scrt_common_main_seh(void)+0xff
0c 001fff04 77d137f5 kernel32!BaseThreadInitThunk+0xe
0d 001fff44 77d137c8 ntdll!__RtlUserThreadStart+0x70
0e 001fff5c 00000000 ntdll!_RtlUserThreadStart+0x1b

CUDA mapped memory: device -> host writes are not visible on host

What I am trying to do is modify a variable which resides in mapped memory, in order to cause the main program to exit.
But instead, the main program keeps spinning on the while (var == 0); line. I don't know how the new value could be flushed out so that it becomes visible on the host side too.
By the way, the variable is declared volatile everywhere, and I tried using the __threadfence_system() function, with no success.
The host -> device direction works well.
System: Windows 7 x64, driver 358.50, GTX 560
Here is the piece of code that I can't get working:
static void handleCUDAError(cudaError_t err, const char *file, int line)
{
if (err != cudaSuccess) {
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}
#define CUDA_ERROR_CHECK(err) (handleCUDAError(err, __FILE__, __LINE__ ))
__global__ void echoKernel(volatile int* semaphore)
{
*semaphore = 1;
__threadfence_system();
}
int main()
{
CUDA_ERROR_CHECK(cudaSetDevice(0));
CUDA_ERROR_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));
volatile int var = 0;
volatile int *devptr;
CUDA_ERROR_CHECK(cudaHostRegister((int*)&var, sizeof (int), cudaHostRegisterMapped));
CUDA_ERROR_CHECK(cudaHostGetDevicePointer(&devptr, (int*)&var, 0));
echoKernel <<< 1, 1 >>> (devptr);
while (var == 0) ;
CUDA_ERROR_CHECK(cudaDeviceSynchronize());
CUDA_ERROR_CHECK(cudaHostUnregister((int*)&var));
CUDA_ERROR_CHECK(cudaDeviceReset());
return 0;
}
When I run your code on Linux, it runs as-is without issue.
On Windows, however, there is a problem with WDDM command batching. In effect, your kernel is not actually launched on the GPU before you enter the while-loop, which then hangs.
The WDDM command queue is a queue of commands that will eventually go to the GPU device. Various events will cause this queue to be "flushed" and the contents to be delivered as a "batch" of commands to the GPU.
Various cuda runtime API calls may effectively force the "flushing" of the command queue, such as cudaDeviceSynchronize() or cudaMemcpy(). However after the kernel launch, you are not issuing any runtime API calls before entering your while-loop. As a result, in this scenario it seems that the kernel call is getting "stuck" in the queue and never "flushed".
You can work around this in a variety of ways, for example by recording an event after the launch of the kernel and then querying the status of that event. This will have the effect of flushing the queue, which will launch the kernel.
Here's an example modification of your code that works for me:
#include <stdio.h>
static void handleCUDAError(cudaError_t err, const char *file, int line)
{
if (err != cudaSuccess) {
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}
#define CUDA_ERROR_CHECK(err) (handleCUDAError(err, __FILE__, __LINE__ ))
__global__ void echoKernel(volatile int* semaphore)
{
*semaphore = 1;
__threadfence_system();
}
int main()
{
CUDA_ERROR_CHECK(cudaSetDevice(0));
CUDA_ERROR_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));
volatile int var = 0;
volatile int *devptr;
CUDA_ERROR_CHECK(cudaHostRegister((int*)&var, sizeof(int), cudaHostRegisterMapped));
CUDA_ERROR_CHECK(cudaHostGetDevicePointer(&devptr, (int*)&var, 0));
cudaEvent_t my_event;
CUDA_ERROR_CHECK(cudaEventCreate(&my_event));
echoKernel << < 1, 1 >> > (devptr);
CUDA_ERROR_CHECK(cudaEventRecord(my_event));
cudaEventQuery(my_event);
while (var == 0);
CUDA_ERROR_CHECK(cudaDeviceSynchronize());
CUDA_ERROR_CHECK(cudaHostUnregister((int*)&var));
CUDA_ERROR_CHECK(cudaDeviceReset());
return 0;
}
Tested on CUDA 7.5, Driver 358.50, Win7 x64 release project, GTX460M.
Note that we don't wrap the cudaEventQuery call in a standard error checker, because the expected behavior for it is to return a non-zero status when the event has not been completed yet.
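If you do want some checking on that call, one option (a sketch, reusing handleCUDAError() and my_event from the code above) is to treat only statuses other than cudaSuccess and the documented cudaErrorNotReady as failures:

    // Flush the WDDM queue by querying the event; tolerate the expected
    // "not ready" status and report only genuine errors.
    cudaError_t qerr = cudaEventQuery(my_event);
    if (qerr != cudaSuccess && qerr != cudaErrorNotReady) {
        handleCUDAError(qerr, __FILE__, __LINE__);
    }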

CUDA kernel launch fails when using various offsets into input data

My code is giving an error message and I am trying to track down the cause of it. To make it easier to find the problem, I have stripped away code that apparently is not relevant to causing the error message. If you can tell me why the following simple code produces an error message, then I think I should be able to fix my original code:
#include "cuComplex.h"
#include <cutil.h>
__device__ void compute_energy(void *data, int isample, int nsamples) {
    cuDoubleComplex * const nminusarray = (cuDoubleComplex*)data;
    cuDoubleComplex * const f = (cuDoubleComplex*)(nminusarray+101);
    double * const abs_est_errorrow_all = (double*)(f+3);
    double * const rel_est_errorrow_all = (double*)(abs_est_errorrow_all+nsamples*51);
    int * const iid_all = (int*)(rel_est_errorrow_all+nsamples*51);
    int * const iiu_all = (int*)(iid_all+nsamples*21);
    int * const piv_all = (int*)(iiu_all+nsamples*21);
    cuDoubleComplex * const energyrow_all = (cuDoubleComplex*)(piv_all+nsamples*12);
    cuDoubleComplex * const refinedenergyrow_all = (cuDoubleComplex*)(energyrow_all+nsamples*51);
    cuDoubleComplex * const btplus_all = (cuDoubleComplex*)(refinedenergyrow_all+nsamples*51);
    cuDoubleComplex * const btplus = btplus_all+isample*21021;
    btplus[0] = make_cuDoubleComplex(0.0, 0.0);
}

__global__ void computeLamHeight(void *data, int nlambda) {
    compute_energy(data, blockIdx.x, nlambda);
}

int main(int argc, char *argv[]) {
    void *device_data;
    CUT_DEVICE_INIT(argc, argv);
    CUDA_SAFE_CALL(cudaMalloc(&device_data, 184465640));
    computeLamHeight<<<dim3(101, 1, 1), dim3(512, 1, 1), 45000>>>(device_data, 101);
    CUDA_SAFE_CALL(cudaThreadSynchronize());
}
I am using a GeForce GTX 480 and I am compiling the code like so:
nvcc -L /soft/cuda-sdk/4.0.17/C/lib -I /soft/cuda-sdk/4.0.17/C/common/inc -lcutil_x86_64 -arch sm_13 -O3 -Xopencc "-Wall" Main.cu
The output is:
Using device 0: GeForce GTX 480
Cuda error in file 'Main.cu' in line 31 : unspecified launch failure.
EDIT: I have now further simplified the code. The following simpler code still produces the error message:
#include <cutil.h>
__global__ void compute_energy(void *data) {
    *(double*)((int*)data+101) = 0.0;
}

int main(int argc, char *argv[]) {
    void *device_data;
    CUT_DEVICE_INIT(argc, argv);
    CUDA_SAFE_CALL(cudaMalloc(&device_data, 101*sizeof(int)+sizeof(double)));
    compute_energy<<<dim3(1, 1, 1), dim3(1, 1, 1)>>>(device_data);
    CUDA_SAFE_CALL(cudaThreadSynchronize());
}
Now it is easy to see that the offset should be valid. I tried running cuda-memcheck and it says the following:
========= CUDA-MEMCHECK
Using device 0: GeForce GTX 480
Cuda error in file 'Main.cu' in line 13 : unspecified launch failure.
========= Invalid __global__ write of size 8
========= at 0x00000020 in compute_energy
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x200200194 is misaligned
=========
========= ERROR SUMMARY: 1 error
I tried searching the internet to find what is meant by the address being misaligned, but I failed to find an explanation. What is the deal?
It was very hard to parse your original code with all of those magic constants, but your updated repro case makes the problem immediately obvious. The GPU architecture requires all memory accesses to be naturally aligned, i.e. aligned to the size of the type being accessed. Your kernel contains a pointer access which is not correctly aligned: double is a 64-bit type, and your address is not aligned to a 64-bit boundary. This:
*(double*)((int*)data+100) = 0.0; // 50th double
or this:
*(double*)((int*)data+102) = 0.0; // 51st double
are both legal. This:
*(double*)((int*)data+101) = 0.0; // not aligned to a 64 bit boundary
is not.
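If you do need a double to live right after 101 ints inside one allocation, a sketch of one possible fix for the simplified repro (not the original sub-allocation scheme) is to round the byte offset up to the alignment of double before the access, and size the allocation accordingly:

__global__ void compute_energy(void *data) {
    // 101 ints = 404 bytes; round up to the next multiple of sizeof(double)
    // (408 here) so that the 64-bit store is naturally aligned.
    size_t offset = 101 * sizeof(int);
    offset = (offset + sizeof(double) - 1) & ~(sizeof(double) - 1);
    *(double *)((char *)data + offset) = 0.0;
}

// Host side: allocate enough for the padded layout, e.g.
//   CUDA_SAFE_CALL(cudaMalloc(&device_data, 408 + sizeof(double)));

The same idea applies to the original code: order the sub-arrays by decreasing alignment, or pad each offset up to the alignment of the next type.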
The error indicates an out-of-bounds memory access; please check the offset value.