How to pass structures into CUDA device? - cuda

I've been stuck on this for a while. When I pass my structures into CUDA via kernel parameters, they contain no data and everything is undefined inside of them.
Out in host global space
struct matl1
{
static const double cond;
};
const double matl1::cond = 420.5;
Then inside of main()
matl1 * h_matl1 = (matl1*)malloc(sizeof(matl1));
matl1 * d_matl1;
cudaMalloc((void**)&d_matl1, sizeof(matl1));
cudaMemcpy(d_matl1, h_matl1, sizeof(matl1), cudaMemcpyHostToDevice);
kernel<<<1,1>>>(d_matl1,...);
Then inside of kernel()
__global__ void kernel(matl1* d_matl1,...)
{
double cond = d_matl1->cond;
}
And I get the following error:
error : identifier "matl1::cond" is undefined in device code
As a quick test, if I do the following on the host in main()
cout << h_matl1->cond << endl;
It shows me the correct output of 420.5. I am not sure why this isn't making it into the device.
Here's the rest of my output
uild started: Project: test_struct, Configuration: Debug Win32 ------
Compiling CUDA source file kernel.cu...
C:\Users\User\Documents\Visual Studio 2012\Projects\test_struct\test_struct>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --use-local-env --cl-version 2012 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o Debug\kernel.cu.obj "C:\Users\User\Documents\Visual Studio 2012\Projects\test_struct\test_struct\kernel.cu"
1>C:/Users/User/Documents/Visual Studio 2012/Projects/test_struct/test_struct/kernel.cu(15): error : identifier "matl1::cond" is undefined in device code
C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V110\BuildCustomizations\CUDA 5.5.targets(592,9): error MSB3721: The command ""C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --use-local-env --cl-version 2012 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o Debug\kernel.cu.obj "C:\Users\User\Documents\Visual Studio 2012\Projects\test_struct\test_struct\kernel.cu"" exited with code 2.
========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ==========

I was able to reproduce your error if I do this:
struct matl1
{
static const double cond;
};
__global__ void kernel(matl1* d_matl1)
{
double cond = d_matl1->cond;
printf("cond = %lf\n", cond);
}
const double matl1::cond = 420.5;
But not if I do this:
struct matl1
{
static const double cond;
};
const double matl1::cond = 420.5;
__global__ void kernel(matl1* d_matl1)
{
double cond = d_matl1->cond;
printf("cond = %lf\n", cond);
}
You need to define the constant initializer before the kernel definition. Here is a complete example that works for me.

Related

PTX kernel name mangling

I cannot link my Cuda program when a kernel is compiled from ptx file.
main.cu:
extern
__global__ void kernel(int, float*);
int main()
{
...
kernel<<<...>>>(...);
...
}
kernel.cu
__global__
void kernel(int n, float* p)
{
...
}
If I compile like below, I have no problems and I get an executable:
nvcc -dc main.cu kernel.cu --gpu-architecture=sm_70
nvcc -dlink main.o kernel.o --gpu-architecture=sm_70 -o dlink.o
g++ dlink.o main.o kernel.o -lcudart
If I compile like below (by generating ptx), I get errors:
nvcc -ptx kernel.cu --gpu-architecture=sm_70
nvcc -dc main.cu kernel.ptx --gpu-architecture=sm_70
nvcc -dlink main.o kernel.o --gpu-architecture=sm_70 -o dlink.o
g++ dlink.o main.o kernel.o -lcudart
Error:
main.o: In function `main':
tmpxft_0000b5ce_00000000-5_main.cudafe1.cpp:(.text+0x4789): undefined reference to `kernel(int, float*)'
tmpxft_0000b5ce_00000000-5_main.cudafe1.cpp:(.text+0x497e): undefined reference to `kernel(int, float*)'
collect2: error: ld returned 1 exit status
I am following an example from CUDA_Compiler_Driver_NVCC.pdf.
What do I need to do to fix the error?
(This is CUDA 10.2).
If you want to write your own PTX (or modify PTX), the proper CUDA methodology to use is the CUDA driver API and associated compilation flow.
The CUDA vectorAddDrv sample code has all the plumbing and workflow that you need.

Runtime crash using fstream exceptions in c++

I am trying to use fstream and exceptions but alway I run a program using exception it crash, no erros reported from compiller.
I think is a bug in MinGw compiler or compatibility with my windows system.
could you help me solve my problem. Thank you!
Code::Blocks 17.12! rev 11256 Win32 / gcc version 3.4.5 (mingw-vista special r3) / Windows 7 ServicePack1
Example code:
#include <iostream>
#include <fstream>
int main() {
int ivalue;
try {
std::ifstream in("in.txt");
in.exceptions(std::ifstream::failbit);
in >> ivalue;
} catch (std::ios_base::failure &fail) {
// handle exception here
}
}
```
Link:https://es.cppreference.com/w/cpp/io/basic_ios/exceptions
Compiller:
-------------- Build: Release in Exceptions (compiler: GNU GCC Compiler)---------------
mingw32-g++.exe -Wall -fexceptions -O2 -IC:\C++\TestExecLibs\Include -c C:\Programing\C&C++\Exceptions\main.cpp -o obj\Release\main.o
mingw32-g++.exe -o bin\Release\Exceptions.exe obj\Release\main.o -s "C:\Program Files\CodeBlocks\MinGW\lib\libcomctl32.a"
Output file is bin\Release\Exceptions.exe with size 784.50 KB
Process terminated with status 0 (0 minute(s), 4 second(s))
0 error(s), 0 warning(s) (0 minute(s), 4 second(s))-------------- Build: Release in Exceptions (compiler: GNU GCC Compiler)---------------
mingw32-g++.exe -Wall -fexceptions -O2 -IC:\C++\TestExecLibs\Include -c C:\Programing\C&C++\Exceptions\main.cpp -o obj\Release\main.o
mingw32-g++.exe -o bin\Release\Exceptions.exe obj\Release\main.o -s "C:\Program Files\CodeBlocks\MinGW\lib\libcomctl32.a"
Output file is bin\Release\Exceptions.exe with size 784.50 KB
Process terminated with status 0 (0 minute(s), 4 second(s))
0 error(s), 0 warning(s) (0 minute(s), 4 second(s))
After Run CMD:
terminate called after throwing an instance of 'std::ios_base::failure'
what(): basic_ios::clear
This application has requested the Runtime to terminate it in an unusual way.
Please contact the application's support team for more information.
Process returned 3 (0x3) execution time : 2.948 s
Press any key to continue.
[CMD Window Result][1]
[1]: https://i.stack.imgur.com/vCudt.jpg

Cuda compiling error when coming to g++ compiling step

nvcc -D_DEBUG --use_fast_math -I"/usr/local/cuda-9.0//include" -I"/usr/include/eigen3" -I"/home/xingfu/NVIDIA_CUDA-9.0_Samples/common/inc" -dlink --machine 64 -arch=sm_50 -c -o kernel_cuda.o ../CudaTest/kernel.cu
g++ -c -pipe -g -std=gnu++11 -Wall -W -D_REENTRANT -fPIC -DQT_DEPRECATED_WARNINGS -DQT_QML_DEBUG -DQT_CORE_LIB -I../CudaTest -I. -I/usr/local/cuda-9.0/include -isystem /usr/include/eigen3 -I../NVIDIA_CUDA-9.0_Samples/common/inc -isystem /usr/local/include -I../Qt5.11.0/5.11.0/gcc_64/include -I../Qt5.11.0/5.11.0/gcc_64/include/QtCore -I. -I../Qt5.11.0/5.11.0/gcc_64/mkspecs/linux-g++ -o LBDM.o ../CudaTest/LBDM.cpp
The two steps above have passed, however, when run the following step, the error occured:
g++ -Wl,-rpath,/home/xingfu/Qt5.11.0/5.11.0/gcc_64/lib -o CudaTest kernel_cuda.o LBDM.o -L/usr/local/cuda-9.0//lib64/ -lcuda -lcudart -lcublas -L/home/xingfu/CudaTest/../../../usr/local/lib/ -lopencv_core -lopencv_highgui -lopencv_imgproc -lopencv_imgcodecs -L/home/xingfu/Qt5.11.0/5.11.0/gcc_64/lib -lQt5Core -lpthread
The compiler error shows:
kernel_cuda.o: In function `__sti____cudaRegisterAll()':
tmpxft_00000e7d_00000000-5_kernel.cudafe1.cpp:(.text+0x177e): undefined reference to `__cudaRegisterLinkedBinary_41_tmpxft_00000e7d_00000000_6_kernel_cpp1_ii_channel'
How can I fix the error?
What's more,
I add the -dlink, because it shows the error when dealing the following step:
nvcc -D_DEBUG --use_fast_math -I"/usr/local/cuda-9.0//include" -I"/usr/include/eigen3" -I"/home/xingfu/NVIDIA_CUDA-9.0_Samples/common/inc" --machine 64 -arch=sm_50 -c -o kernel_cuda.o ../CudaTest/kernel.cu
and the error is:
ptxas fatal : Unresolved extern function 'cublasCreate_v2'
However, when I add -dlink, the error occured like I said above.
BTW, before I add -dlink, I can run a simple function in another test project like this:
__global__ void add(float* x, float * y, float* z, int n)
{
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
{
z[i] = x[i] + y[i];
}
}
After I add -dlink, the test project shows an error:
cuda_code_cuda.o: In function `__sti____cudaRegisterAll()':
tmpxft_000017db_00000000-5_cuda_code.cudafe1.cpp:(.text+0x861): undefined reference to `__cudaRegisterLinkedBinary_44_tmpxft_000017db_00000000_6_cuda_code_cpp1_ii_5b538d80'
which is very similar to the above error.
For relocatable device code linking, which seems to be what you're after, the recommended sequence would be as follows. In addition, it appears your code is attempting to use the cublas device interface, so for good measure we'll add those libraries to the link steps:
#replace -dlink -c with -dc
nvcc -D_DEBUG --use_fast_math -I"/usr/local/cuda-9.0//include" -I"/usr/include/eigen3" -I"/home/xingfu/NVIDIA_CUDA-9.0_Samples/common/inc" -dc --machine 64 -arch=sm_50 -o kernel_cuda.o ../CudaTest/kernel.cu
#generate device-linked object with cublas device libraries
nvcc -D_DEBUG --use_fast_math -dlink --machine 64 -arch=sm_50 -o kernel_dlink.o kernel_cuda.o -lcublas -lcublas_device -lcudadevrt
#no change to this line
g++ -c -pipe -g -std=gnu++11 -Wall -W -D_REENTRANT -fPIC -DQT_DEPRECATED_WARNINGS -DQT_QML_DEBUG -DQT_CORE_LIB -I../CudaTest -I. -I/usr/local/cuda-9.0/include -isystem /usr/include/eigen3 -I../NVIDIA_CUDA-9.0_Samples/common/inc -isystem /usr/local/include -I../Qt5.11.0/5.11.0/gcc_64/include -I../Qt5.11.0/5.11.0/gcc_64/include/QtCore -I. -I../Qt5.11.0/5.11.0/gcc_64/mkspecs/linux-g++ -o LBDM.o ../CudaTest/LBDM.cpp
#add device-linked object to final link phase plus cublas device libraries
g++ -Wl,-rpath,/home/xingfu/Qt5.11.0/5.11.0/gcc_64/lib -o CudaTest kernel_cuda.o LBDM.o kernel_dlink.o -L/usr/local/cuda-9.0//lib64/ -lcuda -lcudart -lcublas -lcublas_device -lcudadevrt -L/home/xingfu/CudaTest/../../../usr/local/lib/ -lopencv_core -lopencv_highgui -lopencv_imgproc -lopencv_imgcodecs -L/home/xingfu/Qt5.11.0/5.11.0/gcc_64/lib -lQt5Core -lpthread

SWIG error for boost::posix_time::ptime conversion

I am warping a very simple C++ class by using SWIG.
The boost ptime is used in that C++ class.
When I try to execute command
swig -c++ -python example.i
There is an error:
example.h:7: Warning 315: Nothing known about 'boost::posix_time::ptime'.
example.h:7: Warning 315: Nothing known about 'boost::posix_time::ptime'.
How can I resolve this problem?
The example.i file is:
//File: example.i
%module example
%{
#define SWIG_FILE_WITH_INIT
#include <boost/date_time/posix_time/ptime.hpp>
#include "example.h"
%}
// for std:string
%include "std_string.i"
// for vector
%include "std_vector.i"
%include stl.i
%include "example.h"
The example.h file is:
#pragma once
#include <string>
#include <boost/date_time/posix_time/ptime.hpp>
using std::string;
using boost::posix_time::ptime;
class Example{
public:
Example(string name, ptime timestamp){
// doSomething...
}
};
Solved by friend Mike and me.
The correct interface file is as following (without mentioning boost header):
/* File: example.i */
%module example
%{
#define SWIG_FILE_WITH_INIT
#include "example.h"
%}
%include "example.h"
The swig execution command is (OSX):
swig -c++ -python example.i
g++ -O2 -fPIC -c example.h -std=c++11
g++ -O2 -fPIC -c example_wrap.cxx -I/Library/anaconda2/include/python2.7
g++ -bundle -flat_namespace -undefined suppress -o _example.so *.o
The swig execution command is (Ubuntu 14.04):
swig -c++ -python example.i
g++ -O2 -fPIC -c example.h -std=c++11
g++ -O2 -fPIC -c example_wrap.cxx -I/usr/include/python2.7
g++ -shared -o _example.so *.o
Only change the last line.

Why do I get a segmentation fault when calling a CUDA kernel directly from a shared library?

The way I tried it (see question title) it compiled, but I get a segmentation fault. So is it me, CMake or CUDA which doesn't support direct kernel calls from a shared library? The solution doesn't have to be with CMake
Further details:
I have the following file structure:
testKernel.hpp
__global__ void kernelTest( float x );
void callKernel( float x );
testKernel.cu
#include "testKernel.hpp"
__global__ void kernelTest( float x ) {}
void callKernel( float x ) { kernelTest<<<1,1>>>( x ); }
useKernel.cu
#include <cstdio>
#include "testKernel.hpp"
int main( void )
{
kernelTest<<<1,1>>>( 3.0f );
//callKernel( 3.0f );
printf("OK\n");
return 0;
}
CMakeLists.txt
cmake_minimum_required(VERSION 3.3.1)
project(testKernelCall)
find_package(CUDA REQUIRED)
cuda_add_library( ${PROJECT_NAME} SHARED testKernel.cu testKernel.hpp )
target_link_libraries( ${PROJECT_NAME} ${CUDA_LIBRARIES} )
cuda_add_executable("useKernel" useKernel.cu)
target_link_libraries("useKernel" ${PROJECT_NAME})
Compiling and running this with:
cmake .; make && ./useKernel
results in a segmentation fault. The backtrace with gdb is:
Program received signal SIGSEGV, Segmentation fault.
0x00007ffff75726bd in cudart::configData::addArgument(void const*, unsigned long, unsigned long) ()
from ./libtestKernelCall.so
(gdb) bt
#0 0x00007ffff75726bd in cudart::configData::addArgument(void const*, unsigned long, unsigned long) ()
from ./libtestKernelCall.so
#1 0x00007ffff7562eb7 in cudart::cudaApiSetupArgument(void const*, unsigned long, unsigned long) ()
from ./libtestKernelCall.so
#2 0x00007ffff7591ca2 in cudaSetupArgument ()
from ./libtestKernelCall.so
#3 0x00007ffff7556125 in __device_stub__Z10kernelTestf (__par0=3)
at /tmp/tmpxft_00003900_00000000-4_testKernel.cudafe1.stub.c:7
#4 0x00007ffff755616c in kernelTest (__cuda_0=3) at ./testKernel.cu:2
#5 0x000000000040280e in main () at ./useKernel.cu:6
Tested with (which means the segfault appears in those setups):
Setup 1
cmake 3.4.1
CUDA 7.0.27
g++ 4.9.2
Debian
Setup 2
cmake 3.3.1
CUDA 6.5.14
g++ 4.7.1
There are two ways to solve this error:
change SHARED to STATIC in CMakeList.txt
use the wrapper function callKernel instead of calling the kernel directly
I don't really know how to build a CUDA shared library without CMake. I know how to build a CUDA static library, but that case seems to work with CMake, so I didn't test it without CMake.
Here are the relevant CMake commands I got with make VERBOSE=1. I changed absolute paths to relative paths, where possible, but I wasn't sure about all these library paths. Putting these commands in a file and sourcing that file compiles the shared library and the program correctly and "correctly" leads to the segmentation fault. I also added command because for me nvcc is aliased with the `-ccbin`` option.
make.sh
command nvcc "./testKernel.cu" -c -o "./testKernel.cu.o" -ccbin /usr/bin/cc -m64 -DtestKernelCall_EXPORTS -Xcompiler ,\"-fPIC\",\"-g\" -DNVCC -I/opt/cuda-7.0/include -I/opt/cuda-7.0/include
/usr/bin/c++ -fPIC -shared -Wl,-soname,libtestKernelCall.so -o libtestKernelCall.so ./testKernel.cu.o /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so
command nvcc "./useKernel.cu" -c -o "./useKernel.cu.o" -ccbin /usr/bin/cc -m64 -Xcompiler ,\"-g\" -DNVCC -I/opt/cuda-7.0/include -I/opt/cuda-7.0/include
/usr/bin/c++ ./useKernel.cu.o -o useKernel -rdynamic /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so libtestKernelCall.so /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so -Wl,-rpath,"."
Your code compiles and runs correctly for me using ordinary nvcc commands (not CMake) if I add the -cudart shared switch to each nvcc command. Here's a fully-worked sequence:
$ cat testKernel.hpp
__global__ void kernelTest( float x );
void callKernel( float x );
$ cat testKernel.cu
#include "testKernel.hpp"
__global__ void kernelTest( float x ) {}
void callKernel( float x ) { kernelTest<<<1,1>>>( x ); }
$ cat useKernel.cu
#include <cstdio>
#include "testKernel.hpp"
int main( void )
{
kernelTest<<<1,1>>>( 3.0f );
//callKernel( 3.0f );
cudaDeviceSynchronize();
printf("OK\n");
return 0;
}
$ nvcc -shared -cudart shared -o test.so -Xcompiler -fPIC testKernel.cu
$ nvcc -cudart shared -o test test.so useKernel.cu
$ cuda-memcheck ./test
========= CUDA-MEMCHECK
OK
========= ERROR SUMMARY: 0 errors
$
If I omit -cudart shared on either of the above nvcc commands, then the compile will still proceed, but on execution I will witness the aforementioned seg fault. Tested with CUDA 7.5 on Fedora 20.
Regarding your CMake setup, it's necessary to link against the shared cudart, according to my testing. Therefore it's insufficient to add -cudart shared to the -c commands (which are compile commands. Sorry if I was unclear. My "compile" commands above are doing both compiling and linking, at each step.)
When linking with nvcc, the correct switch is -cudart shared. However, your make.sh indicates final link is being done by the host c++ compiler:
command nvcc "./testKernel.cu" -c -o "./testKernel.cu.o" -ccbin /usr/bin/cc -m64 -DtestKernelCall_EXPORTS -Xcompiler ,\"-fPIC\",\"-g\" -DNVCC -I/opt/cuda-7.0/include -I/opt/cuda-7.0/include
/usr/bin/c++ -fPIC -shared -Wl,-soname,libtestKernelCall.so -o libtestKernelCall.so ./testKernel.cu.o /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so
command nvcc "./useKernel.cu" -c -o "./useKernel.cu.o" -ccbin /usr/bin/cc -m64 -Xcompiler ,\"-g\" -DNVCC -I/opt/cuda-7.0/include -I/opt/cuda-7.0/include
/usr/bin/c++ ./useKernel.cu.o -o useKernel -rdynamic /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so libtestKernelCall.so /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so -Wl,-rpath,"."
In that case, you don't want to link against:
/opt/cuda-7.0/lib64/libcudart_static.a
but instead against libcudart.so:
/opt/cuda-7.0/lib64/libcudart.so
If you were editing your make.sh directly, you would want to make that change in both of the /usr/bin/c++ command lines you have shown. For example, if I were to modify my compile sequence already presented to reflect your usage of the host c++ compiler to do the linking, it would look like this:
$ nvcc -c -Xcompiler -fPIC testKernel.cu
$ g++ -fPIC -shared -o test.so -L/usr/local/cuda/lib64 -lcudart testKernel.o
$ nvcc -c useKernel.cu
$ g++ -o test -L/usr/local/cuda/lib64 -lcudart test.so useKernel.o
$ cuda-memcheck ./test
========= CUDA-MEMCHECK
OK
========= ERROR SUMMARY: 0 errors
$
put set(CUDA_USE_STATIC_CUDA_RUNTIME OFF) before find_package(CUDA REQUIRED) will do the job equivalent to set(CUDA_LIBRARIES "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart.so")
This is an extension to Robert Crovella's answer.
I use the following CMakeLists.txt and it works well.
cmake_minimum_required(VERSION 3.8)
project(cmake_and_cuda LANGUAGES CXX CUDA)
add_library(my_cu SHARED testKernel.cu testKernel.h)
target_link_libraries(my_cu PRIVATE cudart) #MUST!!
set(CMAKE_CUDA_FLAGS "-shared -cudart shared -Xcompiler -fPIC"
CACHE STRING "Use libcudart.dylib" FORCE)
set(CMAKE_MACOSX_RPATH FALSE)
add_executable(app useKernel.cu)
target_link_libraries(app PRIVATE cudart) #MUST!!
target_link_libraries(app PRIVATE my_cu)
I'm using CMake 3.10 and my OS is OS X EI Capitan 10.11.6.
For me, if I don't set CMAKE_MACOSX_RPATH to FALSE, I will get a Library not loaded error. Maybe it is not necessary for you.
Note that since CMake 3.8, the FindCUDA is superseded, so setting CUDA_USE_STATIC_CUDA_RUNTIME affects nothing.
You can check this post and this document for details.
In addition, this post provides a good example about how to deal with CUDA after CMake 3.8.