I try to use nvcc to build following multi-thread program which is built by "gcc -pthread a.c" before:
$ cat a.c
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
void *myThreadFun(void *vargp)
{
printf("myThreadFun \n");
return NULL;
}
int main()
{
pthread_t tid;
printf("Before Thread\n");
pthread_create(&tid, NULL, myThreadFun, NULL);
pthread_join(tid, NULL);
printf("After Thread\n");
exit(0);
}
Execute "nvcc -pthread a.c":
$ nvcc -pthread a.c
nvcc fatal : Unknown option 'pthread'
This topic said nvcc supports building multi-thread program without using -pthread option. And my test also seems right:
$ nvcc a.c
nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
$ ldd a.out
linux-vdso.so.1 (0x00007ffcff79e000)
librt.so.1 => /usr/lib/librt.so.1 (0x00007fd4f5a43000)
libpthread.so.0 => /usr/lib/libpthread.so.0 (0x00007fd4f5825000)
libdl.so.2 => /usr/lib/libdl.so.2 (0x00007fd4f5621000)
libstdc++.so.6 => /usr/lib/libstdc++.so.6 (0x00007fd4f5299000)
libm.so.6 => /usr/lib/libm.so.6 (0x00007fd4f4f86000)
libgcc_s.so.1 => /usr/lib/libgcc_s.so.1 (0x00007fd4f4d6f000)
libc.so.6 => /usr/lib/libc.so.6 (0x00007fd4f49cb000)
/lib64/ld-linux-x86-64.so.2 (0x00007fd4f5c4b000)
But I can't find the proof from nvcc official document. Could anyone help to affirm it?
No nvcc doesn't support a pthread option. In fact, it knows nothing about pthreads. The pthread dependency is coming from dependencies in the CUDA runtime libraries. It has nothing to do with what is in your code. nvcc doesn't even compile that code, it is passed to your host compiler. nvcc is a compiler driver. It just steers compilation using other compilers. In this case the host C++ compiler.
You can see what actually happens like this:
$ nvcc -arch=sm_52 -v pthread_confusion.c
#$ _SPACE_=
#$ _CUDART_=cudart
#$ _HERE_=/opt/cuda-8.0/bin
#$ _THERE_=/opt/cuda-8.0/bin
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_SIZE_=64
#$ TOP=/opt/cuda-8.0/bin/..
#$ NVVMIR_LIBRARY_DIR=/opt/cuda-8.0/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/opt/cuda-8.0/bin/../lib:/opt/cuda-8.0/lib64:/usr/lib/nx/X11/Xinerama:/usr/lib/nx/X11
#$ PATH=/opt/cuda-8.0/bin/../open64/bin:/opt/cuda-8.0/bin/../nvvm/bin:/opt/cuda-8.0/bin:/usr/local/bin:/usr/bin:/bin:/opt/cuda-8.0/bin
#$ INCLUDES="-I/opt/cuda-8.0/bin/..//include"
#$ LIBRARIES= "-L/opt/cuda-8.0/bin/..//lib64/stubs" "-L/opt/cuda-8.0/bin/..//lib64"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -c -x c -D__NVCC__ "-I/opt/cuda-8.0/bin/..//include" -D"__CUDACC_VER__=80044" -D"__CUDACC_VER_BUILD__=44" -D"__CUDACC_VER_MINOR__=0" -D"__CUDACC_VER_MAJOR__=8" -m64 -o "/tmp/tmpxft_000069ca_00000000-4_pthread_confusion.o" "pthread_confusion.c"
#$ nvlink --arch=sm_52 --register-link-binaries="/tmp/tmpxft_000069ca_00000000-2_a_dlink.reg.c" -m64 "-L/opt/cuda-8.0/bin/..//lib64/stubs" "-L/opt/cuda-8.0/bin/..//lib64" -cpu-arch=X86_64 "/tmp/tmpxft_000069ca_00000000-4_pthread_confusion.o" -lcudadevrt -o "/tmp/tmpxft_000069ca_00000000-5_a_dlink.sm_52.cubin"
#$ fatbinary --create="/tmp/tmpxft_000069ca_00000000-3_a_dlink.fatbin" -64 -link "--image=profile=sm_52,file=/tmp/tmpxft_000069ca_00000000-5_a_dlink.sm_52.cubin" --embedded-fatbin="/tmp/tmpxft_000069ca_00000000-3_a_dlink.fatbin.c"
#$ rm /tmp/tmpxft_000069ca_00000000-3_a_dlink.fatbin
#$ gcc -c -x c++ -DFATBINFILE="\"/tmp/tmpxft_000069ca_00000000-3_a_dlink.fatbin.c\"" -DREGISTERLINKBINARYFILE="\"/tmp/tmpxft_000069ca_00000000-2_a_dlink.reg.c\"" -I. "-I/opt/cuda-8.0/bin/..//include" -D"__CUDACC_VER__=80044" -D"__CUDACC_VER_BUILD__=44" -D"__CUDACC_VER_MINOR__=0" -D"__CUDACC_VER_MAJOR__=8" -m64 -o "/tmp/tmpxft_000069ca_00000000-6_a_dlink.o" "/opt/cuda-8.0/bin/crt/link.stub"
#$ g++ -m64 -o "a.out" -Wl,--start-group "/tmp/tmpxft_000069ca_00000000-6_a_dlink.o" "/tmp/tmpxft_000069ca_00000000-4_pthread_confusion.o" "-L/opt/cuda-8.0/bin/..//lib64/stubs" "-L/opt/cuda-8.0/bin/..//lib64" -lcudadevrt -lcudart_static -lrt -lpthread -ldl -Wl,--end-group
Here, the boilerplate linking phase includes both the CUDA runtime library and the pthreads library.
If you want to be explicit with the host compiler, pass -Xcompiler="-pthread"
Related
I cannot link my Cuda program when a kernel is compiled from ptx file.
main.cu:
extern
__global__ void kernel(int, float*);
int main()
{
...
kernel<<<...>>>(...);
...
}
kernel.cu
__global__
void kernel(int n, float* p)
{
...
}
If I compile like below, I have no problems and I get an executable:
nvcc -dc main.cu kernel.cu --gpu-architecture=sm_70
nvcc -dlink main.o kernel.o --gpu-architecture=sm_70 -o dlink.o
g++ dlink.o main.o kernel.o -lcudart
If I compile like below (by generating ptx), I get errors:
nvcc -ptx kernel.cu --gpu-architecture=sm_70
nvcc -dc main.cu kernel.ptx --gpu-architecture=sm_70
nvcc -dlink main.o kernel.o --gpu-architecture=sm_70 -o dlink.o
g++ dlink.o main.o kernel.o -lcudart
Error:
main.o: In function `main':
tmpxft_0000b5ce_00000000-5_main.cudafe1.cpp:(.text+0x4789): undefined reference to `kernel(int, float*)'
tmpxft_0000b5ce_00000000-5_main.cudafe1.cpp:(.text+0x497e): undefined reference to `kernel(int, float*)'
collect2: error: ld returned 1 exit status
I am following an example from CUDA_Compiler_Driver_NVCC.pdf.
What do I need to do to fix the error?
(This is CUDA 10.2).
If you want to write your own PTX (or modify PTX), the proper CUDA methodology to use is the CUDA driver API and associated compilation flow.
The CUDA vectorAddDrv sample code has all the plumbing and workflow that you need.
nvcc -D_DEBUG --use_fast_math -I"/usr/local/cuda-9.0//include" -I"/usr/include/eigen3" -I"/home/xingfu/NVIDIA_CUDA-9.0_Samples/common/inc" -dlink --machine 64 -arch=sm_50 -c -o kernel_cuda.o ../CudaTest/kernel.cu
g++ -c -pipe -g -std=gnu++11 -Wall -W -D_REENTRANT -fPIC -DQT_DEPRECATED_WARNINGS -DQT_QML_DEBUG -DQT_CORE_LIB -I../CudaTest -I. -I/usr/local/cuda-9.0/include -isystem /usr/include/eigen3 -I../NVIDIA_CUDA-9.0_Samples/common/inc -isystem /usr/local/include -I../Qt5.11.0/5.11.0/gcc_64/include -I../Qt5.11.0/5.11.0/gcc_64/include/QtCore -I. -I../Qt5.11.0/5.11.0/gcc_64/mkspecs/linux-g++ -o LBDM.o ../CudaTest/LBDM.cpp
The two steps above have passed, however, when run the following step, the error occured:
g++ -Wl,-rpath,/home/xingfu/Qt5.11.0/5.11.0/gcc_64/lib -o CudaTest kernel_cuda.o LBDM.o -L/usr/local/cuda-9.0//lib64/ -lcuda -lcudart -lcublas -L/home/xingfu/CudaTest/../../../usr/local/lib/ -lopencv_core -lopencv_highgui -lopencv_imgproc -lopencv_imgcodecs -L/home/xingfu/Qt5.11.0/5.11.0/gcc_64/lib -lQt5Core -lpthread
The compiler error shows:
kernel_cuda.o: In function `__sti____cudaRegisterAll()':
tmpxft_00000e7d_00000000-5_kernel.cudafe1.cpp:(.text+0x177e): undefined reference to `__cudaRegisterLinkedBinary_41_tmpxft_00000e7d_00000000_6_kernel_cpp1_ii_channel'
How can I fix the error?
What's more,
I add the -dlink, because it shows the error when dealing the following step:
nvcc -D_DEBUG --use_fast_math -I"/usr/local/cuda-9.0//include" -I"/usr/include/eigen3" -I"/home/xingfu/NVIDIA_CUDA-9.0_Samples/common/inc" --machine 64 -arch=sm_50 -c -o kernel_cuda.o ../CudaTest/kernel.cu
and the error is:
ptxas fatal : Unresolved extern function 'cublasCreate_v2'
However, when I add -dlink, the error occured like I said above.
BTW, before I add -dlink, I can run a simple function in another test project like this:
__global__ void add(float* x, float * y, float* z, int n)
{
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
{
z[i] = x[i] + y[i];
}
}
After I add -dlink, the test project shows an error:
cuda_code_cuda.o: In function `__sti____cudaRegisterAll()':
tmpxft_000017db_00000000-5_cuda_code.cudafe1.cpp:(.text+0x861): undefined reference to `__cudaRegisterLinkedBinary_44_tmpxft_000017db_00000000_6_cuda_code_cpp1_ii_5b538d80'
which is very similar to the above error.
For relocatable device code linking, which seems to be what you're after, the recommended sequence would be as follows. In addition, it appears your code is attempting to use the cublas device interface, so for good measure we'll add those libraries to the link steps:
#replace -dlink -c with -dc
nvcc -D_DEBUG --use_fast_math -I"/usr/local/cuda-9.0//include" -I"/usr/include/eigen3" -I"/home/xingfu/NVIDIA_CUDA-9.0_Samples/common/inc" -dc --machine 64 -arch=sm_50 -o kernel_cuda.o ../CudaTest/kernel.cu
#generate device-linked object with cublas device libraries
nvcc -D_DEBUG --use_fast_math -dlink --machine 64 -arch=sm_50 -o kernel_dlink.o kernel_cuda.o -lcublas -lcublas_device -lcudadevrt
#no change to this line
g++ -c -pipe -g -std=gnu++11 -Wall -W -D_REENTRANT -fPIC -DQT_DEPRECATED_WARNINGS -DQT_QML_DEBUG -DQT_CORE_LIB -I../CudaTest -I. -I/usr/local/cuda-9.0/include -isystem /usr/include/eigen3 -I../NVIDIA_CUDA-9.0_Samples/common/inc -isystem /usr/local/include -I../Qt5.11.0/5.11.0/gcc_64/include -I../Qt5.11.0/5.11.0/gcc_64/include/QtCore -I. -I../Qt5.11.0/5.11.0/gcc_64/mkspecs/linux-g++ -o LBDM.o ../CudaTest/LBDM.cpp
#add device-linked object to final link phase plus cublas device libraries
g++ -Wl,-rpath,/home/xingfu/Qt5.11.0/5.11.0/gcc_64/lib -o CudaTest kernel_cuda.o LBDM.o kernel_dlink.o -L/usr/local/cuda-9.0//lib64/ -lcuda -lcudart -lcublas -lcublas_device -lcudadevrt -L/home/xingfu/CudaTest/../../../usr/local/lib/ -lopencv_core -lopencv_highgui -lopencv_imgproc -lopencv_imgcodecs -L/home/xingfu/Qt5.11.0/5.11.0/gcc_64/lib -lQt5Core -lpthread
I'm using Ubuntu 14.04, and I installed libmysqlclient-dev package already. But I always get link error because mysql_init symbol missing.
My source code, Makefile, run-time result and symbol info are as follows:
igsrd#naivechou/~/project/m01/uuid_sign>cat main.cpp
#include <cstdio>
#include <cstdlib>
#include "mysql/mysql.h"
int main(int argc,char** argv)
{
auto con = mysql_init(nullptr);
//printf("mysql client version : %s\n",mysql_get_client_info());
exit(EXIT_SUCCESS);
}
igsrd#naivechou/~/project/m01/uuid_sign>LANG= && make -B
g++ -o main.o -c main.cpp -std=c++11 -ggdb
g++ -o uuid_sign -L/usr/lib/x86_64-linux-gnu -lmysqlclient -lpthread -lz -lm -ldl main.o
main.o: In function `main':
/home/igsrd/project/m01/uuid_sign/main.cpp:6: undefined reference to `mysql_init'
collect2: error: ld returned 1 exit status
make: *** [uuid_sign] Error 1
igsrd#naivechou/~/project/m01/uuid_sign>nm -C /usr/lib/x86_64-linux-gnu/libmysqlclient.a | grep mysql_init
0000000000002b10 T mysql_init
0000000000002d30 T mysql_init_character_set
U mysql_init_character_set
Source is very simple, just one line to call mysql_init().
Making process shows every options for compiler and linker, I think there are no missing options.
Error message is undefined reference of linking error, so I dump libmysqlclient.a to grep the mysql_init, and it is not on undefined state.
Now I really have no idea. What's wrong with this ?
Change your second command, try this (change the position of main.o):
g++ -o uuid_sign main.o -L/usr/lib/x86_64-linux-gnu -lmysqlclient -lpthread -lz -lm -ldl
I am warping a very simple C++ class by using SWIG.
The boost ptime is used in that C++ class.
When I try to execute command
swig -c++ -python example.i
There is an error:
example.h:7: Warning 315: Nothing known about 'boost::posix_time::ptime'.
example.h:7: Warning 315: Nothing known about 'boost::posix_time::ptime'.
How can I resolve this problem?
The example.i file is:
//File: example.i
%module example
%{
#define SWIG_FILE_WITH_INIT
#include <boost/date_time/posix_time/ptime.hpp>
#include "example.h"
%}
// for std:string
%include "std_string.i"
// for vector
%include "std_vector.i"
%include stl.i
%include "example.h"
The example.h file is:
#pragma once
#include <string>
#include <boost/date_time/posix_time/ptime.hpp>
using std::string;
using boost::posix_time::ptime;
class Example{
public:
Example(string name, ptime timestamp){
// doSomething...
}
};
Solved by friend Mike and me.
The correct interface file is as following (without mentioning boost header):
/* File: example.i */
%module example
%{
#define SWIG_FILE_WITH_INIT
#include "example.h"
%}
%include "example.h"
The swig execution command is (OSX):
swig -c++ -python example.i
g++ -O2 -fPIC -c example.h -std=c++11
g++ -O2 -fPIC -c example_wrap.cxx -I/Library/anaconda2/include/python2.7
g++ -bundle -flat_namespace -undefined suppress -o _example.so *.o
The swig execution command is (Ubuntu 14.04):
swig -c++ -python example.i
g++ -O2 -fPIC -c example.h -std=c++11
g++ -O2 -fPIC -c example_wrap.cxx -I/usr/include/python2.7
g++ -shared -o _example.so *.o
Only change the last line.
The way I tried it (see question title) it compiled, but I get a segmentation fault. So is it me, CMake or CUDA which doesn't support direct kernel calls from a shared library? The solution doesn't have to be with CMake
Further details:
I have the following file structure:
testKernel.hpp
__global__ void kernelTest( float x );
void callKernel( float x );
testKernel.cu
#include "testKernel.hpp"
__global__ void kernelTest( float x ) {}
void callKernel( float x ) { kernelTest<<<1,1>>>( x ); }
useKernel.cu
#include <cstdio>
#include "testKernel.hpp"
int main( void )
{
kernelTest<<<1,1>>>( 3.0f );
//callKernel( 3.0f );
printf("OK\n");
return 0;
}
CMakeLists.txt
cmake_minimum_required(VERSION 3.3.1)
project(testKernelCall)
find_package(CUDA REQUIRED)
cuda_add_library( ${PROJECT_NAME} SHARED testKernel.cu testKernel.hpp )
target_link_libraries( ${PROJECT_NAME} ${CUDA_LIBRARIES} )
cuda_add_executable("useKernel" useKernel.cu)
target_link_libraries("useKernel" ${PROJECT_NAME})
Compiling and running this with:
cmake .; make && ./useKernel
results in a segmentation fault. The backtrace with gdb is:
Program received signal SIGSEGV, Segmentation fault.
0x00007ffff75726bd in cudart::configData::addArgument(void const*, unsigned long, unsigned long) ()
from ./libtestKernelCall.so
(gdb) bt
#0 0x00007ffff75726bd in cudart::configData::addArgument(void const*, unsigned long, unsigned long) ()
from ./libtestKernelCall.so
#1 0x00007ffff7562eb7 in cudart::cudaApiSetupArgument(void const*, unsigned long, unsigned long) ()
from ./libtestKernelCall.so
#2 0x00007ffff7591ca2 in cudaSetupArgument ()
from ./libtestKernelCall.so
#3 0x00007ffff7556125 in __device_stub__Z10kernelTestf (__par0=3)
at /tmp/tmpxft_00003900_00000000-4_testKernel.cudafe1.stub.c:7
#4 0x00007ffff755616c in kernelTest (__cuda_0=3) at ./testKernel.cu:2
#5 0x000000000040280e in main () at ./useKernel.cu:6
Tested with (which means the segfault appears in those setups):
Setup 1
cmake 3.4.1
CUDA 7.0.27
g++ 4.9.2
Debian
Setup 2
cmake 3.3.1
CUDA 6.5.14
g++ 4.7.1
There are two ways to solve this error:
change SHARED to STATIC in CMakeList.txt
use the wrapper function callKernel instead of calling the kernel directly
I don't really know how to build a CUDA shared library without CMake. I know how to build a CUDA static library, but that case seems to work with CMake, so I didn't test it without CMake.
Here are the relevant CMake commands I got with make VERBOSE=1. I changed absolute paths to relative paths, where possible, but I wasn't sure about all these library paths. Putting these commands in a file and sourcing that file compiles the shared library and the program correctly and "correctly" leads to the segmentation fault. I also added command because for me nvcc is aliased with the `-ccbin`` option.
make.sh
command nvcc "./testKernel.cu" -c -o "./testKernel.cu.o" -ccbin /usr/bin/cc -m64 -DtestKernelCall_EXPORTS -Xcompiler ,\"-fPIC\",\"-g\" -DNVCC -I/opt/cuda-7.0/include -I/opt/cuda-7.0/include
/usr/bin/c++ -fPIC -shared -Wl,-soname,libtestKernelCall.so -o libtestKernelCall.so ./testKernel.cu.o /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so
command nvcc "./useKernel.cu" -c -o "./useKernel.cu.o" -ccbin /usr/bin/cc -m64 -Xcompiler ,\"-g\" -DNVCC -I/opt/cuda-7.0/include -I/opt/cuda-7.0/include
/usr/bin/c++ ./useKernel.cu.o -o useKernel -rdynamic /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so libtestKernelCall.so /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so -Wl,-rpath,"."
Your code compiles and runs correctly for me using ordinary nvcc commands (not CMake) if I add the -cudart shared switch to each nvcc command. Here's a fully-worked sequence:
$ cat testKernel.hpp
__global__ void kernelTest( float x );
void callKernel( float x );
$ cat testKernel.cu
#include "testKernel.hpp"
__global__ void kernelTest( float x ) {}
void callKernel( float x ) { kernelTest<<<1,1>>>( x ); }
$ cat useKernel.cu
#include <cstdio>
#include "testKernel.hpp"
int main( void )
{
kernelTest<<<1,1>>>( 3.0f );
//callKernel( 3.0f );
cudaDeviceSynchronize();
printf("OK\n");
return 0;
}
$ nvcc -shared -cudart shared -o test.so -Xcompiler -fPIC testKernel.cu
$ nvcc -cudart shared -o test test.so useKernel.cu
$ cuda-memcheck ./test
========= CUDA-MEMCHECK
OK
========= ERROR SUMMARY: 0 errors
$
If I omit -cudart shared on either of the above nvcc commands, then the compile will still proceed, but on execution I will witness the aforementioned seg fault. Tested with CUDA 7.5 on Fedora 20.
Regarding your CMake setup, it's necessary to link against the shared cudart, according to my testing. Therefore it's insufficient to add -cudart shared to the -c commands (which are compile commands. Sorry if I was unclear. My "compile" commands above are doing both compiling and linking, at each step.)
When linking with nvcc, the correct switch is -cudart shared. However, your make.sh indicates final link is being done by the host c++ compiler:
command nvcc "./testKernel.cu" -c -o "./testKernel.cu.o" -ccbin /usr/bin/cc -m64 -DtestKernelCall_EXPORTS -Xcompiler ,\"-fPIC\",\"-g\" -DNVCC -I/opt/cuda-7.0/include -I/opt/cuda-7.0/include
/usr/bin/c++ -fPIC -shared -Wl,-soname,libtestKernelCall.so -o libtestKernelCall.so ./testKernel.cu.o /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so
command nvcc "./useKernel.cu" -c -o "./useKernel.cu.o" -ccbin /usr/bin/cc -m64 -Xcompiler ,\"-g\" -DNVCC -I/opt/cuda-7.0/include -I/opt/cuda-7.0/include
/usr/bin/c++ ./useKernel.cu.o -o useKernel -rdynamic /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so libtestKernelCall.so /opt/cuda-7.0/lib64/libcudart_static.a -lpthread /usr/lib/x86_64-linux-gnu/librt.so /usr/lib/x86_64-linux-gnu/libdl.so -Wl,-rpath,"."
In that case, you don't want to link against:
/opt/cuda-7.0/lib64/libcudart_static.a
but instead against libcudart.so:
/opt/cuda-7.0/lib64/libcudart.so
If you were editing your make.sh directly, you would want to make that change in both of the /usr/bin/c++ command lines you have shown. For example, if I were to modify my compile sequence already presented to reflect your usage of the host c++ compiler to do the linking, it would look like this:
$ nvcc -c -Xcompiler -fPIC testKernel.cu
$ g++ -fPIC -shared -o test.so -L/usr/local/cuda/lib64 -lcudart testKernel.o
$ nvcc -c useKernel.cu
$ g++ -o test -L/usr/local/cuda/lib64 -lcudart test.so useKernel.o
$ cuda-memcheck ./test
========= CUDA-MEMCHECK
OK
========= ERROR SUMMARY: 0 errors
$
put set(CUDA_USE_STATIC_CUDA_RUNTIME OFF) before find_package(CUDA REQUIRED) will do the job equivalent to set(CUDA_LIBRARIES "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart.so")
This is an extension to Robert Crovella's answer.
I use the following CMakeLists.txt and it works well.
cmake_minimum_required(VERSION 3.8)
project(cmake_and_cuda LANGUAGES CXX CUDA)
add_library(my_cu SHARED testKernel.cu testKernel.h)
target_link_libraries(my_cu PRIVATE cudart) #MUST!!
set(CMAKE_CUDA_FLAGS "-shared -cudart shared -Xcompiler -fPIC"
CACHE STRING "Use libcudart.dylib" FORCE)
set(CMAKE_MACOSX_RPATH FALSE)
add_executable(app useKernel.cu)
target_link_libraries(app PRIVATE cudart) #MUST!!
target_link_libraries(app PRIVATE my_cu)
I'm using CMake 3.10 and my OS is OS X EI Capitan 10.11.6.
For me, if I don't set CMAKE_MACOSX_RPATH to FALSE, I will get a Library not loaded error. Maybe it is not necessary for you.
Note that since CMake 3.8, the FindCUDA is superseded, so setting CUDA_USE_STATIC_CUDA_RUNTIME affects nothing.
You can check this post and this document for details.
In addition, this post provides a good example about how to deal with CUDA after CMake 3.8.