I am trying to integrate CUDA code with my existing C++ application. As instructed on some website, I need to have a "file.cu" wherein I have a wrapper function which does the memory allocation on the GPU and launches the kernel. I followed that advice, but I am not able to compile the code now.
file.cu
#include <cuda.h>
#include <stdio.h>
void preComputeCorrelation_gpu( int * d )
{
    // I shall write the kernel later once I have confirmed that the CUDA code works
    cudaDeviceProp prop;
    cudaGetDeviceProperties( &prop, 0 );
    printf( "name = %s\n", prop.name );
}
main.cpp
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#define __CUDA_SUPPORT__
#ifdef __CUDA_SUPPORT__
// Definition to be found in "cudaWrap.cu"
extern void preComputeCorrelation_gpu( int * d );
#endif
int main()
{
    // code to read d from the file and other initialization
    int * d;
    .
    .
#ifdef __CUDA_SUPPORT__
    fprintf( stderr, "GPU Computation starts" );
    // Definition to be found in "cudaWrap.cu"
    preComputeCorrelation_gpu( d );
#else
    fprintf( stderr, "CPU Computation starts" );
    preComputeCorrelation( d );
#endif
    .
    .
    // more code
    return 0;
}
Now, I use the following commands to compile the code:
$ nvcc -c cudaWrap.cu
$ g++ -I /usr/local/cuda-5.0/include -L /usr/local/cuda-5.0/lib -o GA_omp GA_dev_omp.cpp main_omp.cpp data_stats.cpp cudaWrap.o
Compilation fails after the 2nd command, although the 1st command works. I get the following message:
cudaWrap.o: In function `preComputeCorrelation_gpu(DataSet*)':
tmpxft_00001061_00000000-3_cudaWrap.cudafe1.cpp:(.text+0x2f): undefined reference to `cudaGetDeviceProperties'
cudaWrap.o: In function `__cudaUnregisterBinaryUtil()':
tmpxft_00001061_00000000-3_cudaWrap.cudafe1.cpp:(.text+0x6b): undefined reference to `__cudaUnregisterFatBinary'
cudaWrap.o: In function `__sti____cudaRegisterAll_43_tmpxft_00001061_00000000_6_cudaWrap_cpp1_ii_f8a043c5()':
tmpxft_00001061_00000000-3_cudaWrap.cudafe1.cpp:(.text+0x8c): undefined reference to `__cudaRegisterFatBinary'
collect2: ld returned 1 exit status
How do I sort this out?
The solution to this problem is that the ordinary C++ code and the CUDA code need to be linked together with libcudart.so.
Compilation should look something like this:
$ nvcc -c cudaWrap.cu
$ g++ -lcudart -I /usr/local/cuda-5.0/include -L /usr/local/cuda-5.0/lib -o GA_omp GA_dev_omp.cpp main_omp.cpp data_stats.cpp cudaWrap.o
In this case, cudaWrap.cu contains the CUDA code. main_omp.cpp contains main(), and there are some other application files that need compiling too.
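Alternatively, a sketch along the same lines (assuming the same file names): let nvcc drive the final link, since it pulls in the CUDA runtime automatically and passes the .cpp files through to the host compiler:

$ nvcc -c cudaWrap.cu
$ nvcc -o GA_omp GA_dev_omp.cpp main_omp.cpp data_stats.cpp cudaWrap.o

This avoids having to spell out the CUDA include and library paths by hand.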
I am new to log4cpp and the SWIG wrapper. I am trying to write an interface for simple logging using log4cpp.
I have installed log4cpp and SWIG on my Ubuntu machine.
log4cpp.cpp:
#include "log4cpp/Category.hh"
#include "log4cpp/Appender.hh"
#include "log4cpp/FileAppender.hh"
#include "log4cpp/OstreamAppender.hh"
#include "log4cpp/Layout.hh"
#include "log4cpp/BasicLayout.hh"
#include "log4cpp/Priority.hh"
#include "log4cpp.h"
void writeLog() {
    log4cpp::Appender *appender1 = new log4cpp::OstreamAppender("console", &std::cout);
    appender1->setLayout(new log4cpp::BasicLayout());
    log4cpp::Appender *appender2 = new log4cpp::FileAppender("default", "program.log");
    appender2->setLayout(new log4cpp::BasicLayout());
    log4cpp::Category& root = log4cpp::Category::getRoot();
    root.setPriority(log4cpp::Priority::WARN);
    root.addAppender(appender1);
    log4cpp::Category& sub1 = log4cpp::Category::getInstance(std::string("sub1"));
    sub1.addAppender(appender2);
    // use of functions for logging messages
    root.error("root error");
    root.info("root info");
    sub1.error("sub1 error");
    sub1.warn("sub1 warn");
    // printf-style for logging variables
    root.warn("%d + %d == %s ?", 1, 1, "two");
    // use of streams for logging messages
    root << log4cpp::Priority::ERROR << "Streamed root error";
    root << log4cpp::Priority::INFO << "Streamed root info";
    sub1 << log4cpp::Priority::ERROR << "Streamed sub1 error";
    sub1 << log4cpp::Priority::WARN << "Streamed sub1 warn";
    // or this way:
    root.errorStream() << "Another streamed error";
}
log4cpp.h:
void writeLog(void);
log4cpp.i:
%module log4cpp
%{
#include "log4cpp.h"
%}
%inline %{
extern void writeLog(void);
%}
I have done the following steps to generate the log4cpp.so file:
swig -tcl -c++ log4cpp.i
g++ -c -fPIC log4cpp.cpp log4cpp_wrap.cxx -I/usr/include/tcl8.5
g++ -shared log4cpp.o log4cpp_wrap.o -o log4cpp.so
It generates the log4cpp_wrap.cxx, log4cpp.o, log4cpp_wrap.o and log4cpp.so files without any warning and error.
Whenever I run the command below in tcl:
load ./log4cpp.so
It generates an undefined symbol error:
% load ./log4cpp.so
couldn't load file "./log4cpp.so": ./log4cpp.so: undefined symbol: _ZN7log4cpp8Appender29AppenderMapStorageInitializerD1Ev
What should I do to remove this error?
You need to link your SWIG shared library against log4cpp like you would with any other C++ application that uses this library. So when you call
g++ -shared log4cpp.o log4cpp_wrap.o -o log4cpp.so
It really needs to be something like this (but adapted to your real library name and search path):
g++ -shared log4cpp.o log4cpp_wrap.o -L/path/to/your/install/of/log4cpp -llog4cpp -o log4cpp.so
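To sanity-check the result (assuming liblog4cpp is installed where the dynamic linker can find it), you can confirm the dependency was actually recorded in the shared library:

ldd ./log4cpp.so | grep log4cpp

If no liblog4cpp line shows up there, the Tcl load will keep failing with the same undefined symbol error.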
I'm trying to run relocatable device code in two shared libraries, both using CUDA Thrust. Everything runs fine if I stop using Thrust in kernel.cu, which is not an option.
edit: The program also works if RDC is disabled. That is not an option for me either.
It compiles fine but stops with a segfault when run. gdb tells me this:
Program received signal SIGSEGV, Segmentation fault.
0x0000000000422cc8 in cudart::globalState::registerEntryFunction(void**, char const*, char*, char const*, int, uint3*, uint3*, dim3*, dim3*, int*) ()
(cuda-gdb) bt
#0 0x0000000000422cc8 in cudart::globalState::registerEntryFunction(void**, char const*, char*, char const*, int, uint3*, uint3*, dim3*, dim3*, int*) ()
#1 0x000000000040876c in __cudaRegisterFunction ()
#2 0x0000000000402b58 in __nv_cudaEntityRegisterCallback(void**) ()
#3 0x00007ffff75051a3 in __cudaRegisterLinkedBinary(__fatBinC_Wrapper_t const*, void (*)(void**), void*) ()
from /home/mindoms/rdctestmcsimple/libkernel.so
#4 0x00007ffff75050b1 in __cudaRegisterLinkedBinary_66_tmpxft_00007a5f_00000000_16_cuda_device_runtime_compute_52_cpp1_ii_8b1a5d37 () from /home/user/rdctestmcsimple/libkernel.so
#5 0x000000000045285d in __libc_csu_init ()
#6 0x00007ffff65ea50f in __libc_start_main () from /lib64/libc.so.6
Here is my stripped down example (using cmake) that shows the error.
main.cpp:
#include "kernel.cuh"
#include "kernel2.cuh"
int main(){
    Kernel k;
    k.callKernel();
    Kernel2 k2;
    k2.callKernel2();
}
kernel.cuh:
#ifndef __KERNEL_CUH__
#define __KERNEL_CUH__
class Kernel{
public:
    void callKernel();
};
#endif
kernel.cu:
#include "kernel.cuh"
#include <stdio.h>
#include <iostream>
#include <thrust/device_vector.h>
__global__
void thekernel(int *data){
    if (threadIdx.x == 0)
        printf("the kernel says hello\n");
    data[threadIdx.x] = threadIdx.x * 2;
}
void Kernel::callKernel(){
    thrust::device_vector<int> D2;
    D2.resize(11);
    int * raw_ptr = thrust::raw_pointer_cast(&D2[0]);
    printf("Kernel::callKernel called\n");
    thekernel <<< 1, 10 >>> (raw_ptr);
    cudaThreadSynchronize();
    cudaError_t code = cudaGetLastError();
    if (code != cudaSuccess) {
        std::cout << "Cuda error: " << cudaGetErrorString(code) << " after callKernel!" << std::endl;
    }
    for (int i = 0; i < D2.size(); i++)
        std::cout << "Kernel D[" << i << "]=" << D2[i] << std::endl;
}
kernel2.cuh:
#ifndef __KERNEL2_CUH__
#define __KERNEL2_CUH__
class Kernel2{
public:
    void callKernel2();
};
#endif
kernel2.cu:
#include "kernel2.cuh"
#include <stdio.h>
#include <iostream>
#include <thrust/device_vector.h>
__global__
void thekernel2(int *data2){
    if (threadIdx.x == 0)
        printf("the kernel2 says hello\n");
    data2[threadIdx.x] = threadIdx.x * 2;
}
void Kernel2::callKernel2(){
    thrust::device_vector<int> D;
    D.resize(11);
    int * raw_ptr = thrust::raw_pointer_cast(&D[0]);
    printf("Kernel2::callKernel2 called\n");
    thekernel2 <<< 1, 10 >>> (raw_ptr);
    cudaThreadSynchronize();
    cudaError_t code = cudaGetLastError();
    if (code != cudaSuccess) {
        std::cout << "Cuda error: " << cudaGetErrorString(code) << " after callKernel2!" << std::endl;
    }
    for (int i = 0; i < D.size(); i++)
        std::cout << "Kernel2 D[" << i << "]=" << D[i] << std::endl;
}
The cmake file below was used originally, but I get the same problem when I compile "by hand":
nvcc -arch=sm_35 -Xcompiler -fPIC -dc kernel2.cu
nvcc -arch=sm_35 -shared -Xcompiler -fPIC kernel2.o -o libkernel2.so
nvcc -arch=sm_35 -Xcompiler -fPIC -dc kernel.cu
nvcc -arch=sm_35 -shared -Xcompiler -fPIC kernel.o -o libkernel.so
g++ -o main main.cpp libkernel.so libkernel2.so -L/opt/cuda/current/lib64
Adding -cudart shared to every nvcc call as suggested somewhere results in a different error:
warning: Cuda API error detected: cudaFuncGetAttributes returned (0x8)
terminate called after throwing an instance of 'thrust::system::system_error'
what(): function_attributes(): after cudaFuncGetAttributes: invalid device function
Program received signal SIGABRT, Aborted.
0x000000313c432625 in raise () from /lib64/libc.so.6
(cuda-gdb) bt
#0 0x000000313c432625 in raise () from /lib64/libc.so.6
#1 0x000000313c433e05 in abort () from /lib64/libc.so.6
#2 0x00000031430bea7d in __gnu_cxx::__verbose_terminate_handler() () from /usr/lib64/libstdc++.so.6
#3 0x00000031430bcbd6 in std::set_unexpected(void (*)()) () from /usr/lib64/libstdc++.so.6
#4 0x00000031430bcc03 in std::terminate() () from /usr/lib64/libstdc++.so.6
#5 0x00000031430bcc86 in __cxa_rethrow () from /usr/lib64/libstdc++.so.6
#6 0x00007ffff7d600eb in thrust::detail::vector_base<int, thrust::device_malloc_allocator<int> >::append(unsigned long) () from ./libkernel.so
#7 0x00007ffff7d5f740 in thrust::detail::vector_base<int, thrust::device_malloc_allocator<int> >::resize(unsigned long) () from ./libkernel.so
#8 0x00007ffff7d5b19a in Kernel::callKernel() () from ./libkernel.so
#9 0x00000000004006f8 in main ()
CMakeLists.txt: Please adjust to your environment
cmake_minimum_required(VERSION 2.6.2)
project(Cuda-project)
set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/CMake/cuda" ${CMAKE_MODULE_PATH})
SET(CUDA_TOOLKIT_ROOT_DIR "/opt/cuda/current")
SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52)
find_package(CUDA REQUIRED)
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
set(CUDA_SEPARABLE_COMPILATION ON)
set(BUILD_SHARED_LIBS ON)
list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC)
CUDA_ADD_LIBRARY(kernel
kernel.cu
)
CUDA_ADD_LIBRARY(kernel2
kernel2.cu
)
cuda_add_executable(rdctest main.cpp)
TARGET_LINK_LIBRARIES(rdctest kernel kernel2 cudadevrt)
About my system:
Fedora 23
kernel: 4.4.2-301.fc23.x86_64
Nvidia Driver: 361.28
Nvidia Toolkit: 7.5.18
g++: g++ (GCC) 5.3.1 20151207 (Red Hat 5.3.1-2)
Reproduced on:
CentOS release 6.7 (Final)
Kernel: 2.6.32-573.8.1.el6.x86_64
Nvidia Driver: 352.55
Nvidia Toolkit: 7.5.18
g++ (GCC) 4.4.7 20120313 (Red Hat 4.4.7-16)
glibc 2.12
cmake 3.5
Apparently, this has something to do with which CUDA runtime is used: shared or static.
I slightly modified your example: Instead of building two shared libraries and linking them to the executable individually, I create two static libraries that are linked together to one shared library, and that one is linked to the executable.
Also, here is an updated CMake file that uses the new (>= 3.8) native CUDA language support.
cmake_minimum_required(VERSION 3.8)
project (CudaSharedThrust CXX CUDA)
string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_61,code=compute_61")
if(BUILD_SHARED_LIBS)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
endif()
add_library(kernel STATIC kernel.cu)
set_target_properties(kernel PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
add_library(kernel2 STATIC kernel2.cu)
set_target_properties(kernel2 PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
add_library(allkernels empty.cu) # empty.cu is an empty file
set_target_properties(allkernels PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(allkernels kernel kernel2)
add_executable(rdctest main.cpp)
set_target_properties(rdctest PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(rdctest allkernels)
Building this without any CMake flags (static build), the build succeeds and the program works.
Building with -DBUILD_SHARED_LIBS=ON, the program compiles, but it crashes with the same error as yours.
Building with
cmake .. -DBUILD_SHARED_LIBS=ON -DCMAKE_CUDA_FLAGS:STRING="--cudart shared"
compiles, and actually makes it run! So for some reason, the shared CUDA runtime is required for this sort of thing.
Also note that the step from 2 SOs -> 2 static libs in 1 SO was necessary, because otherwise the program would crash with a thrust::system::system_error.
This, however, is expected because NVCC actually ignores shared object files during device linking: http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#libraries
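For completeness, here is a sketch of the same fix without CMake, reusing the file names and toolkit path from the question (untested here; the point is that one nvcc link step performs a single device link over both objects and selects the shared CUDA runtime):

nvcc -arch=sm_35 -Xcompiler -fPIC -dc kernel.cu kernel2.cu
nvcc -arch=sm_35 -shared -Xcompiler -fPIC -cudart shared kernel.o kernel2.o -o liballkernels.so
g++ -o main main.cpp liballkernels.so -L/opt/cuda/current/lib64 -lcudart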
In JCuda one can load CUDA files in PTX or CUBIN format and call (launch) __global__ functions (kernels) from Java.
Keeping that in mind, I want to develop a framework with JCuda that gets a user's __device__ function from a .cu file at run-time, loads it, and runs it.
I have already implemented a __global__ function, in which each thread finds out the start point of its related data, performs some computation and initialization, and then calls the user's __device__ function.
Here is my kernel pseudo code:
extern "C" __device__ void userFunc(args);
extern "C" __global__ void kernel(){
    // initialize
    userFunc(args);
    // rest of the kernel
}
And user's __device__ function:
extern "C" __device__ void userFunc(args){
    // do something
}
And on the Java side, here is the part where I load the modules (modules are made from PTX files which are successfully created from CUDA files with this command: nvcc -m64 -ptx path/to/cudaFile -o cudaFile.ptx)
CUmodule kernelModule = new CUmodule(); // 1
CUmodule userFuncModule = new CUmodule(); // 2
cuModuleLoad(kernelModule, ptxKernelFileName); // 3
cuModuleLoad(userFuncModule, ptxUserFuncFileName); // 4
When I try to run it I get an error at line 3: CUDA_ERROR_NO_BINARY_FOR_GPU. After some searching I gathered that my PTX file has some syntax error. After running this suggested command:
ptxas -arch=sm_30 kernel.ptx
I got:
ptxas fatal : Unresolved extern function 'userFunc'
Even when I swap lines 3 and 4 to load userFunc before kernel, I get this error. I am stuck at this point. Is this the correct way to load multiple modules that need to be linked together in JCuda? Or is it even possible?
Edit:
Second part of the question is here
The really short answer is: No, you can't load multiple modules into a context in the runtime API.
You can do what you want, but it requires explicit setup and execution of a JIT linking call. I have no idea how (or even whether) that has been implemented in JCUDA, but I can show you how to do it with the standard driver API. Hold on...
If you have a device function in one file, and a kernel in another, for example:
// test_function.cu
#include <math.h>
__device__ float mathop(float &x, float &y, float &z)
{
    float res = sin(x) + cos(y) + sqrt(z);
    return res;
}
and
// test_kernel.cu
extern __device__ float mathop(float & x, float & y, float & z);
__global__ void kernel(float *xvals, float * yvals, float * zvals, float *res)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    res[tid] = mathop(xvals[tid], yvals[tid], zvals[tid]);
}
You can compile them to PTX as usual:
$ nvcc -arch=sm_30 -ptx test_function.cu
$ nvcc -arch=sm_30 -ptx test_kernel.cu
$ head -14 test_kernel.ptx
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19324607
// Cuda compilation tools, release 7.0, V7.0.27
// Based on LLVM 3.4svn
//
.version 4.2
.target sm_30
.address_size 64
// .globl _Z6kernelPfS_S_S_
.extern .func (.param .b32 func_retval0) _Z6mathopRfS_S_
At runtime, your code must create a JIT link session, add each PTX to the linker session, then finalise the linker session. This will give you a handle to a compiled cubin image which can be loaded as a module as usual. The simplest possible driver API code to put this together looks like this:
#include <cstdio>
#include <cuda.h>
#define drvErrChk(ans) { drvAssert(ans, __FILE__, __LINE__); }
inline void drvAssert(CUresult code, const char *file, int line, bool abort=true)
{
    if (code != CUDA_SUCCESS) {
        fprintf(stderr, "Driver API Error %04d at %s %d\n", int(code), file, line);
        exit(-1);
    }
}
int main()
{
    cuInit(0);

    CUdevice device;
    drvErrChk( cuDeviceGet(&device, 0) );

    CUcontext context;
    drvErrChk( cuCtxCreate(&context, 0, device) );

    CUlinkState state;
    drvErrChk( cuLinkCreate(0, 0, 0, &state) );
    drvErrChk( cuLinkAddFile(state, CU_JIT_INPUT_PTX, "test_function.ptx", 0, 0, 0) );
    drvErrChk( cuLinkAddFile(state, CU_JIT_INPUT_PTX, "test_kernel.ptx", 0, 0, 0) );

    size_t sz;
    char * image;
    drvErrChk( cuLinkComplete(state, (void **)&image, &sz) );

    CUmodule module;
    drvErrChk( cuModuleLoadData(&module, image) );
    drvErrChk( cuLinkDestroy(state) );

    CUfunction function;
    drvErrChk( cuModuleGetFunction(&function, module, "_Z6kernelPfS_S_S_") );

    return 0;
}
You should be able to compile and run this as posted and verify it works OK. It should serve as a template for a JCUDA implementation, if they have JIT linking support implemented.
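If you want to see the linked kernel actually run, a hypothetical continuation of the template (buffer size and launch shape are invented for illustration) would go just before the return:

    // Hypothetical continuation: allocate device buffers and launch the JIT-linked kernel
    const int N = 256;
    CUdeviceptr xvals, yvals, zvals, res;
    drvErrChk( cuMemAlloc(&xvals, N * sizeof(float)) );
    drvErrChk( cuMemAlloc(&yvals, N * sizeof(float)) );
    drvErrChk( cuMemAlloc(&zvals, N * sizeof(float)) );
    drvErrChk( cuMemAlloc(&res,   N * sizeof(float)) );

    void *args[] = { &xvals, &yvals, &zvals, &res };
    drvErrChk( cuLaunchKernel(function, N / 256, 1, 1,  // grid dimensions
                              256, 1, 1,                // block dimensions
                              0, 0, args, 0) );         // shared mem, stream, params, extra
    drvErrChk( cuCtxSynchronize() );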
I compiled it the way I do with gcc:
nvcc a.cu ut.cu
but the compiler shows
ptxas fatal : Unresolved extern function '_Z1fi'
The problem only occurs when the function is a __device__ function.
[File ut.h]
__device__ int f(int);
[File ut.c]
#include "ut.h"
__device__ int f(int a){
    return a*a;
}
[File a.cu]
#include "ut.h"
__global__ void mk(){
    f(5);
}
int main(){
    mk<<<1,1>>>();
}
When a __device__ or __global__ function calls a __device__ function (or __global__ function, in the case of dynamic parallelism) in another translation unit (i.e. file), then it is necessary to use device linking. To enable device linking with your simple compile command, just add the -rdc=true switch:
nvcc -rdc=true a.cu ut.cu
That should fix the issue.
Note that in your compile command you list "ut.cu", but in your question you show "ut.c". I assume that should be the file "ut.cu"; if not, you will also need to rename the file from "ut.c" to "ut.cu".
You can read more about device linking in the nvcc manual.
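For larger projects, the same workflow as separate compile steps looks like this (a sketch; nvcc performs the device link in the final command):

nvcc -rdc=true -c a.cu
nvcc -rdc=true -c ut.cu
nvcc -rdc=true a.o ut.o -o app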
I cannot allocate even 4 bytes of memory with cudaMallocHost() because of an 'out of memory' error. I tried cudaSetDevice(0), cudaDeviceSynchronize(), cudaThreadSynchronize(), and cudaFree(0) at the very beginning of my code for initialization, but they don't work.
I think this link: cudaMalloc always gives out of memory - has the answer I want, but I cannot understand it. How can I solve this problem?
Here is my full code.
/* test.cu */
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <assert.h>
inline cudaError_t checkCuda(cudaError_t result)
{
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        assert(result == cudaSuccess);
    }
    return result;
}
int main()
{
    cudaSetDevice(0);
    cudaDeviceSynchronize();
    cudaThreadSynchronize();
    cudaFree(0);
    int *test_ptr;
    checkCuda( cudaMallocHost((void **)&test_ptr, sizeof(int)) );
    cudaFreeHost(test_ptr);
    printf("Test Success.\n");
    return 0;
}
I compiled with this command:
nvcc test.cu -o test
and when I execute this:
me#me:~$ ./test
CUDA Runtime Error: out of memory
test: test.cu:10: cudaError_t checkCuda(cudaError_t): Assertion `result == cudaSuccess' failed.
Aborted
My CUDA version is 5.0; I'll post more specific device information if you need it.
I just rebooted my system, and the error changed:
me#me:~$ nvidia-smi -q
NVIDIA: could not open the device file /dev/nvidiactl (No such file or directory).
NVIDIA-SMI has failed because it couldn't communicate with NVIDIA driver. Make sure that latest NVIDIA driver is installed and running.
Sorry for my carelessness; I'll close this thread because it became a different problem. Thank you for your replies.
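For anyone who hits the same symptom: an 'out of memory' on the very first CUDA call often means the runtime cannot talk to the driver at all, as the reboot here revealed. A minimal sketch (standard runtime API only) that separates a driver problem from a genuine allocation failure:

/* driver_check.cu - if the driver or device nodes are broken,
   cudaGetDeviceCount fails before any allocation is even attempted */
#include <cuda_runtime.h>
#include <stdio.h>

int main()
{
    int ndev = 0;
    cudaError_t err = cudaGetDeviceCount(&ndev);
    if (err != cudaSuccess) {
        // An error here points at the driver/installation,
        // not at a real memory shortage
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("found %d CUDA device(s)\n", ndev);
    return 0;
}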