Re-throwing exception from OpenMP block with the main thread with Rcpp - exception

As a followup to this question, I am looking for a solution to catch one of the errors thrown inside an OpenMP block and then re-throw it with the main thread after the OpenMP block when calling C++ code from R using Rcpp. The other question is about what is going wrong.
I have followed the answer here and tried with std::exception_ptr. This seems to work on almost all platforms and compilers with the exception of Fedora with clang-11 using libc++. Here is an example:
// openmp-exception-issue.cpp
#include <omp.h>
#include <exception>
#include <stdexcept>
// [[Rcpp::plugins(openmp)]]
#include <Rcpp.h>
// [[Rcpp::export()]]
/**
 * Minimal reproduction: runs n_it loop iterations under an OpenMP
 * `parallel for` (4 threads, sum reduction) in which every iteration with a
 * non-negative index throws std::runtime_error("boh :(").
 *
 * The exception is caught inside the iteration that raised it, the first
 * one to reach the critical section is remembered, and it is re-thrown
 * once the loop has finished.
 *
 * @param n_it number of loop iterations.
 * @return the accumulated sum when no exception was captured.
 * @throws whatever exception was captured inside the loop.
 */
double that_cpp_func(int const n_it){
  std::exception_ptr first_error;  // default-constructed, i.e. null
  bool captured = false;
  double total = 0.;

#pragma omp parallel for num_threads(4) reduction(+:total)
  for(int idx = 0; idx < n_it; ++idx){
    try {
      if(idx >= 0)
        throw std::runtime_error("boh :(");
      total += idx;
    } catch (...) {
      // only one thread at a time may inspect or update the shared slot
#pragma omp critical
      {
        if(!captured){
          first_error = std::current_exception();
          captured = true;
        }
      }
    }
  }

  if(!first_error)
    return total;
  std::rethrow_exception(first_error);
}
Running (you will need to change the path)
/root/R-devel/bin/R -d valgrind -e "Rcpp::sourceCpp('/sdir/openmp-exception-issue.cpp'); that_cpp_func(100)"
with the setup below yields the following:
==15467== Memcheck, a memory error detector
==15467== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==15467== Using Valgrind-3.16.1 and LibVEX; rerun with -h for copyright info
==15467== Command: /root/R-devel/bin/exec/R -e Rcpp::sourceCpp('/sdir/openmp-exception-issue.cpp');~+~that_cpp_func(100)
==15467==
R Under development (unstable) (2021-02-24 r80033) -- "Unsuffered Consequences"
Copyright (C) 2021 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)
...
> Rcpp::sourceCpp('/sdir/openmp-exception-issue.cpp'); that_cpp_func(100)
In file included from openmp-exception-issue.cpp:7:
In file included from /root/R-devel/library/Rcpp/include/Rcpp.h:57:
/root/R-devel/library/Rcpp/include/Rcpp/DataFrame.h:136:18: warning: unused variable 'data' [-Wunused-variable]
SEXP data = Parent::get__();
^
1 warning generated.
==15467== Syscall param sched_setaffinity(mask) points to unaddressable byte(s)
==15467== at 0x550F55D: syscall (in /usr/lib64/libc-2.32.so)
==15467== by 0x539AD1C: ??? (in /usr/lib64/libomp.so)
==15467== by 0x536AAE9: ??? (in /usr/lib64/libomp.so)
==15467== by 0x535774A: ??? (in /usr/lib64/libomp.so)
==15467== by 0x5357B8C: ??? (in /usr/lib64/libomp.so)
==15467== by 0x10B0FB07: that_cpp_func(int) (openmp-exception-issue.cpp:10)
==15467== by 0x10B0FEAA: sourceCpp_1_that_cpp_func (openmp-exception-issue.cpp:47)
==15467== by 0x49DA3A: R_doDotCall (dotcode.c:598)
==15467== by 0x4A0510: do_dotcall (dotcode.c:1281)
==15467== by 0x4D50BC: Rf_eval (eval.c:830)
==15467== by 0x4EED11: R_execClosure (eval.c:0)
==15467== by 0x4EE288: Rf_applyClosure (eval.c:1823)
==15467== Address 0x0 is not stack'd, malloc'd or (recently) free'd
==15467==
==15467== Invalid read of size 8
==15467== at 0x71EC2DF: __cxa_end_catch (in /usr/lib64/libstdc++.so.6.0.28)
==15467== by 0x10B0FE08: .omp_outlined._debug__ (openmp-exception-issue.cpp:30)
==15467== by 0x10B0FE08: .omp_outlined. (openmp-exception-issue.cpp:15)
==15467== by 0x53B27C2: __kmp_invoke_microtask (in /usr/lib64/libomp.so)
==15467== by 0x5358068: ??? (in /usr/lib64/libomp.so)
==15467== by 0x535BFF2: __kmp_fork_call (in /usr/lib64/libomp.so)
==15467== by 0x534A740: __kmpc_fork_call (in /usr/lib64/libomp.so)
==15467== by 0x10B0FB63: that_cpp_func(int) (openmp-exception-issue.cpp:15)
==15467== by 0x10B0FEAA: sourceCpp_1_that_cpp_func (openmp-exception-issue.cpp:47)
==15467== by 0x49DA3A: R_doDotCall (dotcode.c:598)
==15467== by 0x4A0510: do_dotcall (dotcode.c:1281)
==15467== by 0x4D50BC: Rf_eval (eval.c:830)
==15467== by 0x4EED11: R_execClosure (eval.c:0)
==15467== Address 0x99d3990 is 96 bytes inside a block of size 144 free'd
==15467== at 0x483A9F5: free (vg_replace_malloc.c:538)
==15467== by 0xB6AE48A: __cxa_decrement_exception_refcount (in /usr/lib64/libc++abi.so.1.0)
==15467== by 0x10B0FDE9: .omp_outlined._debug__ (openmp-exception-issue.cpp:27)
==15467== by 0x10B0FDE9: .omp_outlined. (openmp-exception-issue.cpp:15)
==15467== by 0x53B27C2: __kmp_invoke_microtask (in /usr/lib64/libomp.so)
==15467== by 0x5358068: ??? (in /usr/lib64/libomp.so)
==15467== by 0x535BFF2: __kmp_fork_call (in /usr/lib64/libomp.so)
==15467== by 0x534A740: __kmpc_fork_call (in /usr/lib64/libomp.so)
==15467== by 0x10B0FB63: that_cpp_func(int) (openmp-exception-issue.cpp:15)
==15467== by 0x10B0FEAA: sourceCpp_1_that_cpp_func (openmp-exception-issue.cpp:47)
==15467== by 0x49DA3A: R_doDotCall (dotcode.c:598)
==15467== by 0x4A0510: do_dotcall (dotcode.c:1281)
==15467== by 0x4D50BC: Rf_eval (eval.c:830)
==15467== Block was alloc'd at
==15467== at 0x4839809: malloc (vg_replace_malloc.c:307)
==15467== by 0x71EC0C3: __cxa_allocate_exception (in /usr/lib64/libstdc++.so.6.0.28)
==15467== by 0x10B0FD64: .omp_outlined._debug__ (openmp-exception-issue.cpp:20)
==15467== by 0x10B0FD64: .omp_outlined. (openmp-exception-issue.cpp:15)
==15467== by 0x53B27C2: __kmp_invoke_microtask (in /usr/lib64/libomp.so)
==15467== by 0x5358068: ??? (in /usr/lib64/libomp.so)
==15467== by 0x535BFF2: __kmp_fork_call (in /usr/lib64/libomp.so)
==15467== by 0x534A740: __kmpc_fork_call (in /usr/lib64/libomp.so)
==15467== by 0x10B0FB63: that_cpp_func(int) (openmp-exception-issue.cpp:15)
==15467== by 0x10B0FEAA: sourceCpp_1_that_cpp_func (openmp-exception-issue.cpp:47)
==15467== by 0x49DA3A: R_doDotCall (dotcode.c:598)
==15467== by 0x4A0510: do_dotcall (dotcode.c:1281)
==15467== by 0x4D50BC: Rf_eval (eval.c:830)
==15467==
10 more errors...
==15467==
==15467== Invalid free() / delete / delete[] / realloc()
==15467== at 0x483AEDD: operator delete(void*) (vg_replace_malloc.c:584)
==15467== by 0x72030FB: std::runtime_error::~runtime_error() (in /usr/lib64/libstdc++.so.6.0.28)
==15467== by 0xB6AE47E: __cxa_decrement_exception_refcount (in /usr/lib64/libc++abi.so.1.0)
==15467== by 0x10B0FBC0: that_cpp_func(int) (openmp-exception-issue.cpp:36)
==15467== by 0x10B0FEAA: sourceCpp_1_that_cpp_func (openmp-exception-issue.cpp:47)
==15467== by 0x49DA3A: R_doDotCall (dotcode.c:598)
==15467== by 0x4A0510: do_dotcall (dotcode.c:1281)
==15467== by 0x4D50BC: Rf_eval (eval.c:830)
==15467== by 0x4EED11: R_execClosure (eval.c:0)
==15467== by 0x4EE288: Rf_applyClosure (eval.c:1823)
==15467== by 0x4D512D: Rf_eval (eval.c:850)
==15467== by 0x523CC9: Rf_ReplIteration (main.c:264)
==15467== Address 0x9604a10 is 0 bytes inside a block of size 31 free'd
==15467== at 0x483AEDD: operator delete(void*) (vg_replace_malloc.c:584)
==15467== by 0x72030FB: std::runtime_error::~runtime_error() (in /usr/lib64/libstdc++.so.6.0.28)
==15467== by 0xB6AE47E: __cxa_decrement_exception_refcount (in /usr/lib64/libc++abi.so.1.0)
==15467== by 0x10B0FDE9: .omp_outlined._debug__ (openmp-exception-issue.cpp:27)
==15467== by 0x10B0FDE9: .omp_outlined. (openmp-exception-issue.cpp:15)
==15467== by 0x53B27C2: __kmp_invoke_microtask (in /usr/lib64/libomp.so)
==15467== by 0x5358068: ??? (in /usr/lib64/libomp.so)
==15467== by 0x535BFF2: __kmp_fork_call (in /usr/lib64/libomp.so)
==15467== by 0x534A740: __kmpc_fork_call (in /usr/lib64/libomp.so)
==15467== by 0x10B0FB63: that_cpp_func(int) (openmp-exception-issue.cpp:15)
==15467== by 0x10B0FEAA: sourceCpp_1_that_cpp_func (openmp-exception-issue.cpp:47)
==15467== by 0x49DA3A: R_doDotCall (dotcode.c:598)
==15467== by 0x4A0510: do_dotcall (dotcode.c:1281)
==15467== Block was alloc'd at
==15467== at 0x4839E7D: operator new(unsigned long) (vg_replace_malloc.c:342)
==15467== by 0x7213B20: ??? (in /usr/lib64/libstdc++.so.6.0.28)
==15467== by 0x7213CB6: ??? (in /usr/lib64/libstdc++.so.6.0.28)
==15467== by 0x721409D: std::runtime_error::runtime_error(char const*) (in /usr/lib64/libstdc++.so.6.0.28)
==15467== by 0x10B0FD72: .omp_outlined._debug__ (openmp-exception-issue.cpp:20)
==15467== by 0x10B0FD72: .omp_outlined. (openmp-exception-issue.cpp:15)
==15467== by 0x53B27C2: __kmp_invoke_microtask (in /usr/lib64/libomp.so)
==15467== by 0x5358068: ??? (in /usr/lib64/libomp.so)
==15467== by 0x535BFF2: __kmp_fork_call (in /usr/lib64/libomp.so)
==15467== by 0x534A740: __kmpc_fork_call (in /usr/lib64/libomp.so)
==15467== by 0x10B0FB63: that_cpp_func(int) (openmp-exception-issue.cpp:15)
==15467== by 0x10B0FEAA: sourceCpp_1_that_cpp_func (openmp-exception-issue.cpp:47)
==15467== by 0x49DA3A: R_doDotCall (dotcode.c:598)
==15467==
==15467== Invalid free() / delete / delete[] / realloc()
==15467== at 0x483A9F5: free (vg_replace_malloc.c:538)
==15467== by 0xB6AE48A: __cxa_decrement_exception_refcount (in /usr/lib64/libc++abi.so.1.0)
==15467== by 0x10B0FBC0: that_cpp_func(int) (openmp-exception-issue.cpp:36)
==15467== by 0x10B0FEAA: sourceCpp_1_that_cpp_func (openmp-exception-issue.cpp:47)
==15467== by 0x49DA3A: R_doDotCall (dotcode.c:598)
==15467== by 0x4A0510: do_dotcall (dotcode.c:1281)
==15467== by 0x4D50BC: Rf_eval (eval.c:830)
==15467== by 0x4EED11: R_execClosure (eval.c:0)
==15467== by 0x4EE288: Rf_applyClosure (eval.c:1823)
==15467== by 0x4D512D: Rf_eval (eval.c:850)
==15467== by 0x523CC9: Rf_ReplIteration (main.c:264)
==15467== by 0x52545F: R_ReplConsole (main.c:314)
==15467== Address 0x99d3930 is 0 bytes inside a block of size 144 free'd
==15467== at 0x483A9F5: free (vg_replace_malloc.c:538)
==15467== by 0xB6AE48A: __cxa_decrement_exception_refcount (in /usr/lib64/libc++abi.so.1.0)
==15467== by 0x10B0FDE9: .omp_outlined._debug__ (openmp-exception-issue.cpp:27)
==15467== by 0x10B0FDE9: .omp_outlined. (openmp-exception-issue.cpp:15)
==15467== by 0x53B27C2: __kmp_invoke_microtask (in /usr/lib64/libomp.so)
==15467== by 0x5358068: ??? (in /usr/lib64/libomp.so)
==15467== by 0x535BFF2: __kmp_fork_call (in /usr/lib64/libomp.so)
==15467== by 0x534A740: __kmpc_fork_call (in /usr/lib64/libomp.so)
==15467== by 0x10B0FB63: that_cpp_func(int) (openmp-exception-issue.cpp:15)
==15467== by 0x10B0FEAA: sourceCpp_1_that_cpp_func (openmp-exception-issue.cpp:47)
==15467== by 0x49DA3A: R_doDotCall (dotcode.c:598)
==15467== by 0x4A0510: do_dotcall (dotcode.c:1281)
==15467== by 0x4D50BC: Rf_eval (eval.c:830)
==15467== Block was alloc'd at
==15467== at 0x4839809: malloc (vg_replace_malloc.c:307)
==15467== by 0x71EC0C3: __cxa_allocate_exception (in /usr/lib64/libstdc++.so.6.0.28)
==15467== by 0x10B0FD64: .omp_outlined._debug__ (openmp-exception-issue.cpp:20)
==15467== by 0x10B0FD64: .omp_outlined. (openmp-exception-issue.cpp:15)
==15467== by 0x53B27C2: __kmp_invoke_microtask (in /usr/lib64/libomp.so)
==15467== by 0x5358068: ??? (in /usr/lib64/libomp.so)
==15467== by 0x535BFF2: __kmp_fork_call (in /usr/lib64/libomp.so)
==15467== by 0x534A740: __kmpc_fork_call (in /usr/lib64/libomp.so)
==15467== by 0x10B0FB63: that_cpp_func(int) (openmp-exception-issue.cpp:15)
==15467== by 0x10B0FEAA: sourceCpp_1_that_cpp_func (openmp-exception-issue.cpp:47)
==15467== by 0x49DA3A: R_doDotCall (dotcode.c:598)
==15467== by 0x4A0510: do_dotcall (dotcode.c:1281)
==15467== by 0x4D50BC: Rf_eval (eval.c:830)
==15467==
1 more error...
Error in that_cpp_func(100) : c++ exception (unknown reason)
Calls: that_cpp_func -> .Call
Execution halted
==15467==
==15467== HEAP SUMMARY:
==15467== in use at exit: 55,351,131 bytes in 10,962 blocks
==15467== total heap usage: 30,797 allocs, 19,837 frees, 94,080,022 bytes allocated
==15467==
==15467== LEAK SUMMARY:
==15467== definitely lost: 0 bytes in 0 blocks
==15467== indirectly lost: 0 bytes in 0 blocks
==15467== possibly lost: 0 bytes in 0 blocks
==15467== still reachable: 55,351,131 bytes in 10,962 blocks
==15467== of which reachable via heuristic:
==15467== newarray : 4,264 bytes in 1 blocks
==15467== suppressed: 0 bytes in 0 blocks
==15467== Rerun with --leak-check=full to see details of leaked memory
==15467==
==15467== For lists of detected and suppressed errors, rerun with: -s
==15467== ERROR SUMMARY: 32 errors from 18 contexts (suppressed: 0 from 0)
It seems to work on all other platform and compiler configurations I have tested though. I have tried to make an example without Rcpp (a pure C++ example) but this does not produce the error shown above.
The motivating example is my mdgc package which yields the following error on CRAN's r-devel-linux-x86_64-fedora-clang:
*** caught segfault ***
address 0xffffffff, cause 'memory not mapped'
An irrecoverable exception occurred. R is aborting now ...
This is consistent with the Valgrind output.
Update
You also get similar errors if you remove the #pragma omp parallel for num_threads(4) reduction(+:out) and #pragma omp critical. I.e. it is not related to OpenMP.
Producing the Result
I ran the following to produce the results (like CRAN's r-devel-linux-x86_64-fedora-clang):
sudo docker run -ti rhub/fedora-clang
export _R_CHECK_INSTALL_DEPENDS_=true
export _R_CHECK_SUGGESTS_ONLY_=true
export _R_CHECK_NO_RECOMMENDED_=true
export _R_CHECK_DOC_SIZES2_=true
export _R_CHECK_DEPRECATED_DEFUNCT_=true
export _R_CHECK_SCREEN_DEVICE_=warn
export _R_CHECK_REPLACING_IMPORTS_=true
export _R_CHECK_TOPLEVEL_FILES_=true
export _R_CHECK_DOT_FIRSTLIB_=true
export _R_CHECK_RD_LINE_WIDTHS_=true
export _R_CHECK_S3_METHODS_NOT_REGISTERED_=true
export _R_CHECK_OVERWRITE_REGISTERED_S3_METHODS_=true
export _R_CHECK_CODE_USAGE_WITH_ONLY_BASE_ATTACHED_=TRUE
export _R_CHECK_NATIVE_ROUTINE_REGISTRATION_=true
export _R_CHECK_FF_CALLS_=registration
export _R_CHECK_PRAGMAS_=true
export _R_CHECK_COMPILATION_FLAGS_=true
export _R_CHECK_R_DEPENDS_=true
export _R_CHECK_PACKAGES_USED_IN_TESTS_USE_SUBDIRS_=true
export _R_CHECK_SHLIB_OPENMP_FLAGS_=true
export _R_CHECK_CODE_ASSIGN_TO_GLOBALENV_=true
export _R_CHECK_CODE_DATA_INTO_GLOBALENV_=true
export _R_CHECK_PKG_SIZES_=true
export _R_CHECK_LIMIT_CORES_=true
#export _R_CHECK_LENGTH_1_CONDITION_ package:_R_CHECK_PACKAGE_NAME_,abort,verbose
export _R_S3_METHOD_LOOKUP_BASEENV_AFTER_GLOBALENV_=true
export _R_CHECK_AUTOCONF_=true
export _R_CHECK_THINGS_IN_CHECK_DIR_=true
export _R_CHECK_THINGS_IN_TEMP_DIR_=true
export _R_CHECK_THINGS_IN_TEMP_DIR_EXCLUDE_="^ompi"
export _R_CHECK_BASHISMS_=true
export _R_CHECK_ORPHANED_=true
export _R_CHECK_DEPENDS_ONLY_DATA_=true
export _R_CHECK_XREFS_PKGS_ARE_DECLARED_=true
yum install wget -y
yum install java-openjdk-devel -y
yum install libcxx-devel -y
yum install rsync -y
yum install openssl-devel -y
mkdir ~/.R
echo "MAKEFLAGS = -j 6" >> ~/.R/Makevars
cd
wget -c "https://stat.ethz.ch/R/daily/R-devel.tar.gz"
tar -zxvf R-devel.tar.gz
cd R-devel
tools/rsync-recommended
# check with https://www.stats.ox.ac.uk/pub/bdr/Rconfig/r-devel-linux-x86_64-fedora-clang
./configure \
CC="clang" \
CXX="clang++ -stdlib=libc++" \
FC=gfortran \
MAKEFLAGS="-j 6" \
CFLAGS="-g -O3 -Wall -pedantic" \
FFLAGS="-g -O2 -mtune=native -Wall -pedantic" \
CXXFLAGS="-g -O3 -Wall -pedantic -frtti" \
LDFLAGS="-L/usr/local/lib64" \
JAVA_HOME=/usr/lib/jvm/java-11
make
make install
echo "options(repos = structure(c(CRAN = 'http://cran.rstudio.com')))" >> ~/.Rprofile
/root/R-devel/bin/R -e "install.packages('Rcpp')"
# replace the path with your path to openmp-exception-issue.cpp
/root/R-devel/bin/R -d valgrind -e "Rcpp::sourceCpp('/sdir/openmp-exception-issue.cpp'); that_cpp_func(100)"
Using RcppThread
RcppThread equally fails. Consider the following file:
// openmp-exception-issue.cpp
#include <exception>
#include <stdexcept>
// [[Rcpp::depends(RcppThread)]]
#include <RcppThread.h>
// [[Rcpp::export()]]
/**
 * Minimal reproduction using RcppThread instead of OpenMP: pushes n_it jobs
 * onto a 4-worker RcppThread::ThreadPool; every job whose argument is
 * greater than -1 throws std::runtime_error("boh :(").
 *
 * The pool is joined before returning.
 * NOTE(review): presumably join() is where the worker's exception reaches
 * the caller -- confirm against the RcppThread documentation.
 *
 * @param n_it number of jobs to push.
 * @return the accumulated value of `out`.
 */
double that_cpp_func(int const n_it){
  double out(0.);
  RcppThread::ThreadPool pool(4);
  for(int i = 0; i < n_it; ++i)
    pool.push([&](int const j) -> void {
      if(j > -1)
        throw std::runtime_error("boh :(");
      out += j; // ignore the race condition
    }, i);
  pool.join();
  return out;
}
Running the example as before yields:
==139== Invalid read of size 8
==139== at 0x71EC2DF: __cxa_end_catch (in /usr/lib64/libstdc++.so.6.0.28)
==139== by 0x10B13351: RcppThread::ThreadPool::doJob(std::__1::function<void ()>&&) (ThreadPool.hpp:321)
==139== by 0x10B13201: RcppThread::ThreadPool::startWorker()::{lambda()#1}::operator()() const (ThreadPool.hpp:304)
==139== by 0x10B12F36: __invoke<(lambda at /root/R-devel/library/RcppThread/include/RcppThread/ThreadPool.hpp:279:27)> (type_traits:3899)
==139== by 0x10B12F36: __thread_execute<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, (lambda at /root/R-devel/library/RcppThread/include/RcppThread/ThreadPool.hpp:279:27)> (thread:280)
==139== by 0x10B12F36: void* std::__1::__thread_proxy<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, RcppThread::ThreadPool::startWorker()::{lambda()#1}> >(std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, RcppThread::ThreadPool::startWorker()::{lambda()#1}>) (thread:291)
==139== by 0x53FA3F8: start_thread (in /usr/lib64/libpthread-2.32.so)
==139== by 0x5514B52: clone (in /usr/lib64/libc-2.32.so)
==139== Address 0x9151e90 is 96 bytes inside a block of size 144 free'd
==139== at 0x483A9F5: free (vg_replace_malloc.c:538)
==139== by 0xB6AE48A: __cxa_decrement_exception_refcount (in /usr/lib64/libc++abi.so.1.0)
==139== by 0x10B13344: RcppThread::ThreadPool::doJob(std::__1::function<void ()>&&) (ThreadPool.hpp:320)
==139== by 0x10B13201: RcppThread::ThreadPool::startWorker()::{lambda()#1}::operator()() const (ThreadPool.hpp:304)
==139== by 0x10B12F36: __invoke<(lambda at /root/R-devel/library/RcppThread/include/RcppThread/ThreadPool.hpp:279:27)> (type_traits:3899)
==139== by 0x10B12F36: __thread_execute<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, (lambda at /root/R-devel/library/RcppThread/include/RcppThread/ThreadPool.hpp:279:27)> (thread:280)
==139== by 0x10B12F36: void* std::__1::__thread_proxy<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, RcppThread::ThreadPool::startWorker()::{lambda()#1}> >(std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, RcppThread::ThreadPool::startWorker()::{lambda()#1}>) (thread:291)
==139== by 0x53FA3F8: start_thread (in /usr/lib64/libpthread-2.32.so)
==139== by 0x5514B52: clone (in /usr/lib64/libc-2.32.so)
==139== Block was alloc'd at
==139== at 0x4839809: malloc (vg_replace_malloc.c:307)
==139== by 0x71EC0C3: __cxa_allocate_exception (in /usr/lib64/libstdc++.so.6.0.28)
==139== by 0x10B1240C: operator() (openmp-exception-issue.cpp:17)
==139== by 0x10B1240C: operator() (ThreadPool.hpp:129)
==139== by 0x10B1240C: __invoke<(lambda at /root/R-devel/library/RcppThread/include/RcppThread/ThreadPool.hpp:129:23) &> (type_traits:3899)
==139== by 0x10B1240C: __call<(lambda at /root/R-devel/library/RcppThread/include/RcppThread/ThreadPool.hpp:129:23) &> (__functional_base:348)
==139== by 0x10B1240C: operator() (functional:1557)
==139== by 0x10B1240C: std::__1::__function::__func<void RcppThread::ThreadPool::push<that_cpp_func(int)::$_0, int&>(that_cpp_func(int)::$_0&&, int&)::{lambda()#1}, std::__1::allocator<{lambda()#1}>, void ()>::operator()() (functional:1731)
==139== by 0x10B132FA: operator() (functional:1884)
==139== by 0x10B132FA: operator() (functional:2556)
==139== by 0x10B132FA: RcppThread::ThreadPool::doJob(std::__1::function<void ()>&&) (ThreadPool.hpp:317)
==139== by 0x10B13201: RcppThread::ThreadPool::startWorker()::{lambda()#1}::operator()() const (ThreadPool.hpp:304)
==139== by 0x10B12F36: __invoke<(lambda at /root/R-devel/library/RcppThread/include/RcppThread/ThreadPool.hpp:279:27)> (type_traits:3899)
==139== by 0x10B12F36: __thread_execute<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, (lambda at /root/R-devel/library/RcppThread/include/RcppThread/ThreadPool.hpp:279:27)> (thread:280)
==139== by 0x10B12F36: void* std::__1::__thread_proxy<std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, RcppThread::ThreadPool::startWorker()::{lambda()#1}> >(std::__1::tuple<std::__1::unique_ptr<std::__1::__thread_struct, std::__1::default_delete<std::__1::__thread_struct> >, RcppThread::ThreadPool::startWorker()::{lambda()#1}>) (thread:291)
==139== by 0x53FA3F8: start_thread (in /usr/lib64/libpthread-2.32.so)
==139== by 0x5514B52: clone (in /usr/lib64/libc-2.32.so)
...
This seems to be the same error:
Invalid read at the end of the catch scope.
Memory was free'd when using std::current_exception.

Related

Command to run callback_profiling sample from CUPTI

I am running the sample code available for Nvidia CUDA CUPTI in /usr/local/cuda-11.8/extras/CUPTI/samples/callback_profiling. There is a Makefile, but I want to build the sample with a single command (without the Makefile) because the Makefile gives me permission errors. Based on the Makefile, this is the command I wrote:
nvcc --generate-line-info callback_profiling.cu -o callback_profiling -lnvperf_host -lnvperf_target -lcuda -lcupti -I/usr/local/cuda-11.8/extras/CUPTI/samples/callback_profiling/../extensions/include/profilerhost_util -I/usr/local/cuda-11.8/extras/CUPTI/samples/callback_profiling/../extensions/include/c_util -I/usr/local/cuda-11.8/extras/CUPTI/samples/callback_profiling/../../include -L /usr/local/cuda-11.8/extras/CUPTI/samples/callback_profiling/../extensions/src/profilerhost_util
I am getting the error
/usr/bin/ld: /tmp/tmpxft_00005e71_00000000-11_callback_profiling.o: in function `setupProfiling(ProfilingData_t*)':
tmpxft_00005e71_00000000-6_callback_profiling.cudafe1.cpp:(.text+0xe48): undefined reference to `NV::Metric::Config::GetConfigImage(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::vector<unsigned char, std::allocator<unsigned char> >&, unsigned char const*)'
/usr/bin/ld: tmpxft_00005e71_00000000-6_callback_profiling.cudafe1.cpp:(.text+0xed3): undefined reference to `NV::Metric::Config::GetCounterDataPrefixImage(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::vector<unsigned char, std::allocator<unsigned char> >&, unsigned char const*)'
/usr/bin/ld: /tmp/tmpxft_00005e71_00000000-11_callback_profiling.o: in function `main':
tmpxft_00005e71_00000000-6_callback_profiling.cudafe1.cpp:(.text+0x2751): undefined reference to `NV::Metric::Eval::PrintMetricValues(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::vector<unsigned char, std::allocator<unsigned char> > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, unsigned char const*)'
collect2: error: ld returned 1 exit status
Can someone help me with what will be the right command to run the application?
I have an Ubuntu machine with CUDA 11.8
There is a library (libprofilerHostUtil.a) that doesn't get built as part of the samples build process that you will need to build manually.
On a typical linux install, the Makefile to build that library is in /usr/local/cuda/extras/CUPTI/samples/extensions/src/profilerhost_util. As a root user you should be able to go into that directory, and type make, and then the necessary library will get built.
Once you have done that, as an ordinary user, you should be able to build the code you are asking about like this:
nvcc callback_profiling.cu -o callback_profiling -I/usr/local/cuda/extras/CUPTI/samples/callback_profiling/../extensions/include/profilerhost_util -I/usr/local/cuda/extras/CUPTI/samples/callback_profiling/../extensions/include/c_util -I/usr/local/cuda/extras/CUPTI/samples/callback_profiling/../../include -L /usr/local/cuda/extras/CUPTI/samples/callback_profiling/../extensions/src/profilerhost_util -L/usr/local/cuda/extras/CUPTI/lib64 -lcupti -lnvperf_host -lnvperf_target -lprofilerHostUtil -lcuda
wherever you see /cuda/ in the paths above, you may need to change that to /cuda-11.8/ or whatever is needed to match your install, if you have not set up the symbolic link that the CUDA installer asks to set up.
If you want to build the library I mentioned manually, in some other location, you will need to do something like this. First, change to the indicated directory (cd /usr/local/cuda/extras/CUPTI/samples/extensions/src/profilerhost_util), then use these commands, replacing /my/build/dir/ with the path to a location you have write access to:
nvcc -c --std=c++11 -Xcompiler -fPIC -I../../../../include -I../../../../../../include -I../../include/profilerhost_util -I../../include/c_util List.cpp -o /my/build/dir/List.o
nvcc -c --std=c++11 -Xcompiler -fPIC -I../../../../include -I../../../../../../include -I../../include/profilerhost_util -I../../include/c_util Metric.cpp -o /my/build/dir/Metric.o
nvcc -c --std=c++11 -Xcompiler -fPIC -I../../../../include -I../../../../../../include -I../../include/profilerhost_util -I../../include/c_util Eval.cpp -o /my/build/dir/Eval.o
nvcc -o /my/build/dir/libprofilerHostUtil.a -lib /my/build/dir/List.o /my/build/dir/Metric.o /my/build/dir/Eval.o -lcuda -L ../../../../../../lib64 -lnvperf_host -lnvperf_target
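And if you use that route, you will need to add a -L option pointing at that build directory (e.g. -L/my/build/dir, or -L. if you link from inside it) to the command that builds the executable.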

CUDA constant memory usage across multiple source files showing different behaviors on cuda-11.2 and cuda-11.4

Minimum repro:
kernel.cu:
#include <stdio.h>
// Device symbol in constant memory; wrapper_fn writes it and printKernel reads it.
__constant__ int N_GPU;
/**
 * Copies one int from the device buffer `ptr` into the constant symbol N_GPU.
 *
 * A failing copy is reported on stderr rather than being silently dropped.
 *
 * @param ptr device pointer to the int to copy (the driver passes a
 *            cudaMalloc'd buffer).
 */
void wrapper_fn(int *ptr)
{
    // cudaMemcpyToSymbol takes (symbol, src, count, offset, kind). The offset
    // has to be spelled out; otherwise the copy kind is consumed as the byte
    // offset and the kind silently falls back to its HostToDevice default.
    const cudaError_t status = cudaMemcpyToSymbol(
        N_GPU, ptr, sizeof(int), 0, cudaMemcpyDeviceToDevice);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaMemcpyToSymbol(N_GPU) failed: %s\n",
                cudaGetErrorString(status));
    }
}
// Kernel that prints the current value of the constant symbol N_GPU.
__global__ void printKernel() {
    const int n_on_device = N_GPU;
    printf("N = %d; \n", n_on_device);
}
driver.cu:
#include "cuda_runtime.h"
#include <stdio.h>
void wrapper_fn(int*);
__global__ void printKernel();
// Prints a diagnostic naming `what` and returns false when `status` is not
// cudaSuccess; returns true otherwise.
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Driver: uploads N = 10 to a device buffer, hands the buffer to wrapper_fn,
 * then launches printKernel with a single thread and waits for it.
 *
 * Every CUDA status is checked so that a failure is reported instead of
 * silently producing no output.
 *
 * @return 0 on success, 1 if any checked CUDA call failed.
 */
int main()
{
    int N = 10;
    int *d_N_ptr = nullptr;
    if (!cuda_ok(cudaMalloc(&d_N_ptr, sizeof(int)), "cudaMalloc"))
        return 1;

    int exit_code = 1;
    if (cuda_ok(cudaMemcpy(d_N_ptr, &N, sizeof(int), cudaMemcpyDefault),
                "cudaMemcpy")) {
        wrapper_fn(d_N_ptr);
        printKernel <<<1, 1 >>>();
        // A kernel launch returns no status itself, so query the last
        // recorded error, then wait for the kernel to finish.
        if (cuda_ok(cudaPeekAtLastError(), "cudaPeekAtLastError") &&
            cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
            exit_code = 0;
    }

    // Release the device buffer on every path that allocated it.
    cudaFree(d_N_ptr);
    return exit_code;
}
Both on cuda-11.4 and cuda-11.2, running nvcc kernel.cu driver.cu compiles. The expected output (i.e N = 10;) is only seen in 11.2 and not 11.4.
Upon running cuda-gdb on 11.4, I get the following:
...
[New Thread 0x7fffee240700 (LWP 54339)]
warning: Cuda API error detected: cudaMalloc returned (0xde)
warning: Cuda API error detected: cudaMemcpy returned (0xde)
warning: Cuda API error detected: cudaMemcpyToSymbol returned (0xde)
warning: Cuda API error detected: cudaLaunchKernel returned (0xde)
warning: Cuda API error detected: cudaPeekAtLastError returned (0xde)
warning: Cuda API error detected: cudaDeviceSynchronize returned (0xde)
[Thread 0x7fffee240700 (LWP 54339) exited]
...
Are there any particular nvcc flags I'm missing that are important in 11.4, or particular API changes I'm missing? Thanks in advance!
So the answer has to do with my driver version. The error code as seen from the cuda-gdb output (0xde = 222) is due to the fact that the compiled PTX is too new for the driver installed (my driver was 460.35), and the "CUDA Enhanced Compatibility" was used to run on my older driver, that didn't support the necessary PTX JIT.
TLDR; compiling to the exact architecture-specific SASS solved for cuda 11.4.
I did this by adding the -arch compute_70 flag to my nvcc compilation command.

Changing the compilation arguments passed to nvcc by Rust using cc

I am using cc to link a CUDA kernel written in C to Rust.
Here is my build.rs file:
extern crate cc;
/// Build script: compiles `kernel/kernel.cu` through the `cc` crate with CUDA
/// support enabled, passing the extra flags listed below.
fn main() {
    // Extra compiler flags, in the order they must appear on the command line.
    let nvcc_flags = ["-cudart=shared", "-gencode", "arch=compute_61,code=sm_61"];

    let mut build = cc::Build::new();
    build.cuda(true);
    for &flag in nvcc_flags.iter() {
        build.flag(flag);
    }
    build.file("kernel/kernel.cu");
    build.compile("kernel/kernel.a");
}
I get this error:
running: "nvcc" "-ccbin=c++" "-O0" "-Xcompiler" "-ffunction-sections"
"-Xcompiler" "-fdata-sections" "-Xcompiler" "-fPIC" "-G" "-Xcompiler"
"-g" "-m64" "-Xcompiler" "-Wall" "-Xcompiler" "-Wextra"
"-cudart=shared" "-gencode" "arch=compute_61,code=sm_61" "-o"
"/home/ltei/Dev/Workspaces/rust_cudnn/target/debug/build/rust_cudnn-df924982e63c2363/out/kernel/kernel.o"
"-c" "kernel/kernel.cu" cargo:warning=In file included from
/usr/include/cuda_runtime.h:78:0, cargo:warning= from
:0: cargo:warning=/usr/include/host_config.h:119:2:
error: #error -- unsupported GNU version! gcc versions later than 5
are not supported! cargo:warning= #error -- unsupported GNU version!
gcc versions later than 5 are not supported! cargo:warning= ^~~~~
exit code: 1
I know that it would work if I could change the -ccbin=c++ in the command into -ccbin=clang-3.8, but I have no idea how to do it.
I could also install another version of GCC, but I'd prefer the first solution.
You can set the CXX environment variable to whatever you want.
CXX=this-is-my-cpp-compiler cargo build
This will be used as the argument to ccbin:
"nvcc" "-ccbin=this-is-my-cpp-compiler" "-O0" "-Xcompiler" "-ffunction-sections" "-Xcompiler" "-fdata-sections" "-Xcompiler" "-fPIC" "-G" "-Xcompiler" "-g" "-m64" "-Xcompiler" "-Wall" "-Xcompiler" "-Wextra" "-cudart=shared" "-gencode" "arch=compute_61,code=sm_61" "-o" "/private/tmp/c/target/debug/build/c-67ec4fdcff2f35d1/out/kernel/kernel.o" "-c" "kernel/kernel.cu"

Error message when running 'make all' while installing Caffe

I ran
cp Makefile.config.example Makefile.config
make all
as suggested on the website to complete the installation.
I use Ubuntu 14.04 with CUDA and OpenBlas.
The error messages were as follows:
CXX/LD -o .build_release/tools/upgrade_net_proto_text.bin
.build_release/lib/libcaffe.so: undefined reference to caffe::curandGetErrorString(curandStatus)
.build_release/lib/libcaffe.so: undefined reference to caffe::BaseConvolutionLayer::weight_gpu_gemm(double const*, double const*, double*)
.build_release/lib/libcaffe.so: undefined reference to caffe::BaseConvolutionLayer::forward_gpu_bias(double*, double const*)
.build_release/lib/libcaffe.so: undefined reference to caffe::BaseConvolutionLayer::forward_gpu_bias(float*, float const*)
.build_release/lib/libcaffe.so: undefined reference to caffe::cudnn::dataType::zero
.build_release/lib/libcaffe.so: undefined reference to caffe::cudnn::dataType::one
.build_release/lib/libcaffe.so: undefined reference to caffe::BaseConvolutionLayer::backward_gpu_gemm(float const*, float const*, float*)
.build_release/lib/libcaffe.so: undefined reference to caffe::cublasGetErrorString(cublasStatus_t)
.build_release/lib/libcaffe.so: undefined reference to caffe::BaseConvolutionLayer::forward_gpu_gemm(double const*, double const*, double*, bool)
.build_release/lib/libcaffe.so: undefined reference to caffe::BaseConvolutionLayer::backward_gpu_gemm(double const*, double const*, double*)
.build_release/lib/libcaffe.so: undefined reference to caffe::BaseConvolutionLayer::backward_gpu_bias(double*, double const*)
.build_release/lib/libcaffe.so: undefined reference to caffe::BaseConvolutionLayer::forward_gpu_gemm(float const*, float const*, float*, bool)
.build_release/lib/libcaffe.so: undefined reference to caffe::cudnn::dataType::zero
.build_release/lib/libcaffe.so: undefined reference to caffe::BaseConvolutionLayer::weight_gpu_gemm(float const*, float const*, float*)
.build_release/lib/libcaffe.so: undefined reference to caffe::BaseConvolutionLayer::backward_gpu_bias(float*, float const*)
.build_release/lib/libcaffe.so: undefined reference to caffe::cudnn::dataType::one
collect2: error: ld returned 1 exit status
make: *** [.build_release/tools/upgrade_net_proto_text.bin] Error 1
I only modified Makefile.config.
The modified Makefile.config is shown as follows:
USE_CUDNN := 1
CUSTOM_CXX := g++
CUDA_DIR := /usr/local/cuda
-gencode arch=compute_20,code=sm_21 \
-gencode arch=compute_30,code=sm_30 \
-gencode arch=compute_35,code=sm_35 \
-gencode arch=compute_50,code=sm_50 \
-gencode arch=compute_50,code=compute_50
BLAS := OpenBlas
PYTHON_INCLUDE := /usr/include/python2.7 \
/usr/lib/python2.7/dist-packages/numpy/core/include
PYTHON_LIB := /usr/lib
INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include
LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib
BUILD_DIR := build
DISTRIBUTE_DIR := distribute
TEST_GPUID := 0
Q ?= #
You need to change the BLAS settings in Makefile.config to
BLAS := open
Rather than 'OpenBlas'.

How to pass structures into CUDA device?

I've been stuck on this for a while. When I pass my structures into CUDA via kernel parameters, they contain no data and everything is undefined inside of them.
Out in host global space
// Material description used in the question. It has no non-static data
// members: `cond` is a static member, so it is not stored inside any
// matl1 object.
struct matl1
{
// Declaration only; the value is supplied by the out-of-class definition below.
static const double cond;
};
// Out-of-class definition and initializer of the static member.
const double matl1::cond = 420.5;
Then inside of main()
// Host-side matl1 object. Nothing is written into it after malloc, and
// matl1 has no non-static members to fill in.
matl1 * h_matl1 = (matl1*)malloc(sizeof(matl1));
// Device-side pointer and an allocation of the same size.
matl1 * d_matl1;
cudaMalloc((void**)&d_matl1, sizeof(matl1));
// Copies the bytes of the host object to the device. The static member
// `cond` is not part of the object, so this copy does not transfer it.
cudaMemcpy(d_matl1, h_matl1, sizeof(matl1), cudaMemcpyHostToDevice);
// Launches the kernel with one block of one thread, passing the device pointer
// (remaining arguments are elided in the question).
kernel<<<1,1>>>(d_matl1,...);
Then inside of kernel()
// Kernel from the question; takes the device pointer produced in main()
// (remaining parameters are elided in the question).
__global__ void kernel(matl1* d_matl1,...)
{
// Reads the static member through the pointer. This is the only use of
// matl1::cond in device code, i.e. what the reported
// "identifier "matl1::cond" is undefined in device code" error refers to.
double cond = d_matl1->cond;
}
And I get the following error:
error : identifier "matl1::cond" is undefined in device code
As a quick test, if I do the following on the host in main()
cout << h_matl1->cond << endl;
It shows me the correct output of 420.5. I am not sure why this isn't making it into the device.
Here's the rest of my output
1>------ Build started: Project: test_struct, Configuration: Debug Win32 ------
Compiling CUDA source file kernel.cu...
C:\Users\User\Documents\Visual Studio 2012\Projects\test_struct\test_struct>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --use-local-env --cl-version 2012 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o Debug\kernel.cu.obj "C:\Users\User\Documents\Visual Studio 2012\Projects\test_struct\test_struct\kernel.cu"
1>C:/Users/User/Documents/Visual Studio 2012/Projects/test_struct/test_struct/kernel.cu(15): error : identifier "matl1::cond" is undefined in device code
C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V110\BuildCustomizations\CUDA 5.5.targets(592,9): error MSB3721: The command ""C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --use-local-env --cl-version 2012 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o Debug\kernel.cu.obj "C:\Users\User\Documents\Visual Studio 2012\Projects\test_struct\test_struct\kernel.cu"" exited with code 2.
========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ==========
I was able to reproduce your error if I do this:
// Failing arrangement: the static member is only declared, not yet defined,
// at the point where the kernel that reads it is compiled.
struct matl1
{
// Declaration only; no value is visible here.
static const double cond;
};
// The kernel appears BEFORE the definition of matl1::cond.
__global__ void kernel(matl1* d_matl1)
{
// Access to the static member from device code; with this ordering the
// answer reports the "undefined in device code" error.
double cond = d_matl1->cond;
printf("cond = %lf\n", cond);
}
// Definition and initializer come after the kernel - the ordering that
// triggers the error.
const double matl1::cond = 420.5;
But not if I do this:
// Working arrangement: same code as the failing variant, but the definition
// of the static member precedes the kernel.
struct matl1
{
// Declaration of the static member.
static const double cond;
};
// Definition and initializer placed BEFORE the kernel.
// NOTE(review): presumably this lets nvcc see the constant's value while
// compiling the device code - confirm against the CUDA Programming Guide.
const double matl1::cond = 420.5;
__global__ void kernel(matl1* d_matl1)
{
// Same access as in the failing variant; the answer reports that it
// compiles with this ordering.
double cond = d_matl1->cond;
printf("cond = %lf\n", cond);
}
You need to place the definition of the static constant (with its initializer) before the kernel definition. Here is a complete example that works for me.