How to compile multiple files in cuda? - cuda

I did it the same way I would with gcc:
nvcc a.cu ut.cu
but the compiler reports
ptxas fatal : Unresolved extern function '_Z1fi'
The problem only occurs when the called function is a __device__ function.
[File ut.h]
__device__ int f(int);

[File ut.c]
#include "ut.h"

__device__ int f(int a){
    return a*a;
}

[File a.cu]
#include "ut.h"

__global__ void mk(){
    f(5);
}

int main(){
    mk<<<1,1>>>();
}

When a __device__ or __global__ function calls a __device__ function (or __global__ function, in the case of dynamic parallelism) in another translation unit (i.e. file), then it is necessary to use device linking. To enable device linking with your simple compile command, just add the -rdc=true switch:
nvcc -rdc=true a.cu ut.cu
That should fix the issue.
Note that your compile command lists "ut.cu", but your question shows "ut.c"; I assume the file is actually "ut.cu". If not, you will also need to rename "ut.c" to "ut.cu".
You can read more about device linking in the nvcc manual.
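Equivalently (a sketch of the same thing done in separate steps), you can compile each file to an object with relocatable device code and let nvcc perform the device link in the final step:
$ nvcc -dc a.cu ut.cu
$ nvcc a.o ut.o -o app
The -dc switch is shorthand for --device-c and implies relocatable device code for each compilation.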

Related

Static __device__ variable and kernels in separate file

I want to statically declare a global variable with the __device__ qualifier, and at the same time keep the functions intended for the GPU in a separate file.
However, if I do so, the variable's value is not transferred to the GPU -- there are no errors at compile or run time, but the memcpy functions do nothing.
When I move the kernel function into the file with the host code, everything works.
I am sure it should be possible to split host and device functions into separate files in this case, but how? I have only seen examples where the kernels and the host code are in the same file.
I would also be very thankful if somebody explained why it behaves this way.
A sample code is listed below.
Thank you in advance.
Working directory:
$ ls
functionsGPU.cu functionsGPU.cuh staticGlobalMemory.cu
staticGlobalMemory.cu:
#include "functionsGPU.cuh"

#if VARIANT == 2
__global__ void checkGlobalVariable(){
    printf("Old value (dev): %f\n", devData);
    devData += 2.0f;
    printf("New value (dev): %f\n", devData);
}
#endif

int main(int argc, char **argv){
    int dev = 0;
    float val = 3.2;
    cudaSetDevice(dev);
    printf("---------\nVARIANT %i\n---------\n", VARIANT);
    printf("Old value (host): %f\n", val);
    cudaMemcpyToSymbol(devData, &val, sizeof(float));
    checkGlobalVariable<<<1, 1>>>();
    cudaMemcpyFromSymbol(&val, devData, sizeof(float));
    printf("New value (host): %f\n", val);
    cudaDeviceReset();
    return 0;
}
functionsGPU.cuh:
#ifndef FUNCTIONSGPU_CUH
#define FUNCTIONSGPU_CUH

#include <cuda_runtime.h>
#include <stdio.h>

#define VARIANT 1

__device__ float devData;

#if VARIANT == 1
__global__ void checkGlobalVariable();
#endif

#endif

functionsGPU.cu:
#include "functionsGPU.cuh"

#if VARIANT == 1
__global__ void checkGlobalVariable(){
    printf("Old value (dev): %f\n", devData);
    devData += 2.0f;
    printf("New value (dev): %f\n", devData);
}
#endif
This is compiled as
$ nvcc -arch=sm_61 staticGlobalMemory.cu functionsGPU.cu -o staticGlobalMemory
Output if the kernel and host code are in separate files (incorrect):
---------
VARIANT 1
---------
Old value (host): 3.200000
Old value (dev): 0.000000
New value (dev): 2.000000
New value (host): 3.200000
Output if the kernel and host code are in the same file (correct):
---------
VARIANT 2
---------
Old value (host): 3.200000
Old value (dev): 3.200000
New value (dev): 5.200000
New value (host): 5.200000
Your code structure, where device code in one compilation unit references device code or device entities in another compilation unit, requires CUDA relocatable device code compilation and linking.
In the case of __device__ variables such as the one you have here:

1. Add -rdc=true to your nvcc compilation command line, to enable relocatable device code.
2. Add extern in front of devData in functionsGPU.cuh, turning the definition into a declaration.
3. Add the definition __device__ float devData; to staticGlobalMemory.cu.

In the case of linking to a __device__ function in a separate file, you provide the prototype (typically via a header file, as you would with any function in C++) and add -rdc=true to your nvcc compilation command line to enable device code linking. Steps 2 and 3 above are not needed.
That should fix the issue. Step 1 provides the necessary cross-module linkage, and steps 2 and 3 fix the duplicate-definition problem you would otherwise have, since you are including the same variable via a header file in separate compilation units.
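Applied to the code above, the changes for the __device__ variable case would look like this (a sketch; everything else stays the same):

// functionsGPU.cuh: declare the variable, but no longer define it
extern __device__ float devData;

// staticGlobalMemory.cu: add the single definition
__device__ float devData;

...compiled with:
$ nvcc -rdc=true -arch=sm_61 staticGlobalMemory.cu functionsGPU.cu -o staticGlobalMemory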
For a reference on how to set relocatable device code compilation in Windows Visual Studio, see here.

How to call a Thrust function in a stream from a kernel?

I want to make thrust::scatter asynchronous by calling it in a device kernel (I could also do it by calling it in another host thread). thrust::cuda::par.on(stream) is a host function that cannot be called from a device kernel. The following code was tried with CUDA 10.1 on the Turing architecture.
__global__ void async_scatter_kernel(float* first,
    float* last,
    int* map,
    float* output)
{
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    thrust::scatter(thrust::cuda::par.on(stream), first, last, map, output);
    cudaDeviceSynchronize();
    cudaStreamDestroy(stream);
}
I know thrust uses dynamic parallelism to launch its kernels when called from the device; however, I couldn't find a way to specify the stream.
The following code compiles cleanly for me on CUDA 10.1.243:
$ cat t1518.cu
#include <thrust/scatter.h>
#include <thrust/execution_policy.h>
__global__ void async_scatter_kernel(float* first,
    float* last,
    int* map,
    float* output)
{
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    thrust::scatter(thrust::cuda::par.on(stream), first, last, map, output);
    cudaDeviceSynchronize();
    cudaStreamDestroy(stream);
}

int main(){
    float *first = NULL;
    float *last = NULL;
    float *output = NULL;
    int *map = NULL;
    async_scatter_kernel<<<1,1>>>(first, last, map, output);
    cudaDeviceSynchronize();
}
$ nvcc -arch=sm_35 -rdc=true t1518.cu -o t1518
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243
$
The -arch=sm_35 (or similar) and -rdc=true are necessary (but not in all cases sufficient) compile switches for any code that uses CUDA Dynamic Parallelism. If you omit, for example, the -rdc=true switch, you get an error similar to what you describe:
$ nvcc -arch=sm_35 t1518.cu -o t1518
t1518.cu(11): error: calling a __host__ function("thrust::cuda_cub::par_t::on const") from a __global__ function("async_scatter_kernel") is not allowed
t1518.cu(11): error: identifier "thrust::cuda_cub::par_t::on const" is undefined in device code
2 errors detected in the compilation of "/tmp/tmpxft_00003a80_00000000-8_t1518.cpp1.ii".
$
So, for the example you have shown here, your compilation error can be eliminated either by updating to the latest CUDA version or by specifying the proper command line, or both.
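As an aside, the other approach mentioned in the question also works: thrust::cuda::par.on(stream) is usable from ordinary host code, so the scatter can be issued on a user-created stream without dynamic parallelism. A minimal sketch (assuming first, last, map, and output are device pointers; note that, depending on the Thrust version, the algorithm may still block until completion):

cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
// issue the scatter on the user-created stream from the host
thrust::scatter(thrust::cuda::par.on(stream), first, last, map, output);
cudaStreamSynchronize(stream);  // wait for the scatter to finish
cudaStreamDestroy(stream);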

Loading multiple modules in JCuda is not working

In JCuda one can load CUDA modules in PTX or CUBIN format and call (launch) __global__ functions (kernels) from Java.
With that in mind, I want to develop a framework with JCuda that takes a user's __device__ function in a .cu file at run time, loads it, and runs it.
I have already implemented a __global__ function, in which each thread finds the start point of its related data, performs some computation and initialization, and then calls the user's __device__ function.
Here is my kernel pseudocode:
extern "C" __device__ void userFunc(args);
extern "C" __global__ void kernel(){
// initialize
userFunc(args);
// rest of the kernel
}
And user's __device__ function:
extern "C" __device__ void userFunc(args){
// do something
}
And on the Java side, here is the part where I load the modules (the modules are made from ptx files, which are successfully created from the cuda files with this command: nvcc -m64 -ptx path/to/cudaFile -o cudaFile.ptx):
CUmodule kernelModule = new CUmodule(); // 1
CUmodule userFuncModule = new CUmodule(); // 2
cuModuleLoad(kernelModule, ptxKernelFileName); // 3
cuModuleLoad(userFuncModule, ptxUserFuncFileName); // 4
When I try to run it I get an error at line 3: CUDA_ERROR_NO_BINARY_FOR_GPU. After some searching I concluded that my ptx file must be broken. After running this suggested command:
ptxas -arch=sm_30 kernel.ptx
I got:
ptxas fatal : Unresolved extern function 'userFunc'
Even when I swap lines 3 and 4 to load userFunc before kernel, I get this error. I am stuck at this point. Is this the correct way to load multiple modules that need to be linked together in JCuda? Or is it even possible?
Edit:
Second part of the question is here
The really short answer is: No, you can't load multiple modules into a context in the runtime API.
You can do what you want, but it requires explicit setup and execution of a JIT linking call. I have no idea how (or even whether) that has been implemented in JCUDA, but I can show you how to do it with the standard driver API. Hold on...
If you have a device function in one file, and a kernel in another, for example:
// test_function.cu
#include <math.h>
__device__ float mathop(float &x, float &y, float &z)
{
    float res = sin(x) + cos(y) + sqrt(z);
    return res;
}
and
// test_kernel.cu
extern __device__ float mathop(float &x, float &y, float &z);

__global__ void kernel(float *xvals, float *yvals, float *zvals, float *res)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    res[tid] = mathop(xvals[tid], yvals[tid], zvals[tid]);
}
You can compile them to PTX as usual:
$ nvcc -arch=sm_30 -ptx test_function.cu
$ nvcc -arch=sm_30 -ptx test_kernel.cu
$ head -14 test_kernel.ptx
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19324607
// Cuda compilation tools, release 7.0, V7.0.27
// Based on LLVM 3.4svn
//
.version 4.2
.target sm_30
.address_size 64
// .globl _Z6kernelPfS_S_S_
.extern .func (.param .b32 func_retval0) _Z6mathopRfS_S_
At runtime, your code must create a JIT link session, add each PTX to the linker session, then finalise the linker session. This will give you a handle to a compiled cubin image which can be loaded as a module as usual. The simplest possible driver API code to put this together looks like this:
#include <cstdio>
#include <cstdlib>
#include <cuda.h>

#define drvErrChk(ans) { drvAssert(ans, __FILE__, __LINE__); }

inline void drvAssert(CUresult code, const char *file, int line, bool abort=true)
{
    if (code != CUDA_SUCCESS) {
        fprintf(stderr, "Driver API Error %04d at %s %d\n", int(code), file, line);
        exit(-1);
    }
}

int main()
{
    cuInit(0);

    CUdevice device;
    drvErrChk( cuDeviceGet(&device, 0) );

    CUcontext context;
    drvErrChk( cuCtxCreate(&context, 0, device) );

    // create a JIT link session, add each PTX file, then finalise the link
    CUlinkState state;
    drvErrChk( cuLinkCreate(0, 0, 0, &state) );
    drvErrChk( cuLinkAddFile(state, CU_JIT_INPUT_PTX, "test_function.ptx", 0, 0, 0) );
    drvErrChk( cuLinkAddFile(state, CU_JIT_INPUT_PTX, "test_kernel.ptx", 0, 0, 0) );

    // the finalised link yields a cubin image that loads like any module
    size_t sz;
    char * image;
    drvErrChk( cuLinkComplete(state, (void **)&image, &sz) );

    CUmodule module;
    drvErrChk( cuModuleLoadData(&module, image) );
    drvErrChk( cuLinkDestroy(state) );

    CUfunction function;
    drvErrChk( cuModuleGetFunction(&function, module, "_Z6kernelPfS_S_S_") );

    return 0;
}
You should be able to compile and run this as posted and verify it works OK. It should serve as a template for a JCUDA implementation, if they have JIT linking support implemented.
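Once you have the CUfunction handle, launching it is ordinary driver API usage. A minimal sketch (the buffer names d_x, d_y, d_z, d_res are illustrative, assumed to have been allocated with cuMemAlloc and filled with N floats each):

unsigned int N = 256;
void *args[] = { &d_x, &d_y, &d_z, &d_res };
// one block of N threads, no shared memory, default stream
drvErrChk( cuLaunchKernel(function, 1, 1, 1, N, 1, 1, 0, 0, args, 0) );
drvErrChk( cuCtxSynchronize() );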

Update project from older CUDA version

In my older CUDA project I had the globals:
__device__ uint8_t dev_intersect;
__constant__ uint8_t dev_flags;
...and used them this way:
cudaGetSymbolAddress((void**)&ptr_dev_intersect, "dev_intersect");
cudaMemcpyToSymbol("dev_flags", &flags, sizeof(flags));
Now, since CUDA 5.0, the symbols must be passed directly (without the string), so I define the globals this way:
__device__ uint8_t *dev_intersect;
__constant__ uint8_t *dev_flags;
...and call the functions this way:
cudaGetSymbolAddress((void**)&ptr_dev_intersect, dev_intersect);
cudaMemcpyToSymbol(dev_flags, &flags, sizeof(flags));
Am I doing it right so far? I'm asking because when I update the code I start getting other errors, which makes me kind of suspicious. Thanks for any help.
Switching from a POD variable to a pointer is probably not what you want.
If you didn't make changes elsewhere in your code to account for that difference, I would expect things to break.
To update your cuda function calls, leave your variables as-is:
__device__ uint8_t dev_intersect;
__constant__ uint8_t dev_flags;
And just drop the quotes from your cuda API functions that use those variables:
cudaGetSymbolAddress((void**)&ptr_dev_intersect,dev_intersect);
cudaMemcpyToSymbol(dev_flags,&flags,sizeof(flags));
Here is a complete worked example:
$ cat t524.cu
#include <stdio.h>
typedef unsigned char uint8_t;
__device__ uint8_t dev_intersect;
__constant__ uint8_t dev_flags;
__global__ void mykernel(uint8_t *d1_ptr){
    printf("data 1 = %c\n", *d1_ptr);
    printf("dev_flags = %c\n", dev_flags);
}

int main(){
    uint8_t *ptr_dev_intersect;
    uint8_t flags = 'X';
    uint8_t dev_intersect_data = 'Y';
    cudaGetSymbolAddress((void**)&ptr_dev_intersect, dev_intersect);
    cudaMemcpyToSymbol(dev_flags, &flags, sizeof(flags));
    cudaMemcpyToSymbol(dev_intersect, &dev_intersect_data, sizeof(dev_intersect_data));
    mykernel<<<1,1>>>(ptr_dev_intersect);
    cudaDeviceSynchronize();
    return 0;
}
$ nvcc -arch=sm_20 -o t524 t524.cu
$ cuda-memcheck ./t524
========= CUDA-MEMCHECK
data 1 = Y
dev_flags = X
========= ERROR SUMMARY: 0 errors
$

CUDA with C/C++ Compilation fails

I am trying to integrate CUDA code with my existing C++ application. As instructed on some web site, I need a "file.cu" containing a wrapper function that does the memory allocation on the GPU and launches the kernel. I followed that advice, but I am not able to compile the code now.
file.cu
#include <cuda.h>
#include <stdio.h>

void preComputeCorrelation_gpu( int * d )
{
    // I shall write the kernel later, once I have confirmed that the CUDA code works
    cudaDeviceProp prop;
    cudaGetDeviceProperties( &prop, 0 );
    printf( "name = %s\n", prop.name );
}
main.cpp
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>

#define __CUDA_SUPPORT__

#ifdef __CUDA_SUPPORT__
// Definition to be found in "cudaWrap.cu"
extern void preComputeCorrelation_gpu( int * d );
#endif

int main()
{
    // code to read d from the file and other initialization
    int * d;
    .
    .
#ifdef __CUDA_SUPPORT__
    fprintf( stderr, "GPU Computation starts" );
    // Definition to be found in "cudaWrap.cu"
    preComputeCorrelation_gpu( d );
#else
    fprintf( stderr, "CPU Computation starts" );
    preComputeCorrelation( d );
#endif
    .
    .
    // more code
    return 0;
}
Now, I use the following commands to compile the code:
$ nvcc -c cudaWrap.cu
$ g++ -I /usr/local/cuda-5.0/include -L /usr/local/cuda-5.0/lib -o GA_omp GA_dev_omp.cpp main_omp.cpp data_stats.cpp cudaWrap.o
The 1st command works, but the 2nd fails with the following message:
cudaWrap.o: In function `preComputeCorrelation_gpu(DataSet*)':
tmpxft_00001061_00000000-3_cudaWrap.cudafe1.cpp:(.text+0x2f): undefined reference to `cudaGetDeviceProperties'
cudaWrap.o: In function `__cudaUnregisterBinaryUtil()':
tmpxft_00001061_00000000-3_cudaWrap.cudafe1.cpp:(.text+0x6b): undefined reference to `__cudaUnregisterFatBinary'
cudaWrap.o: In function `__sti____cudaRegisterAll_43_tmpxft_00001061_00000000_6_cudaWrap_cpp1_ii_f8a043c5()':
tmpxft_00001061_00000000-3_cudaWrap.cudafe1.cpp:(.text+0x8c): undefined reference to `__cudaRegisterFatBinary'
collect2: ld returned 1 exit status
How to I sort this out really?
The solution to this problem is that the link of the ordinary C++ code with the CUDA code needs to be done against libcudart.so.
Compilation should look something like:
$ nvcc -c cudaWrap.cu
$ g++ -I /usr/local/cuda-5.0/include -L /usr/local/cuda-5.0/lib -o GA_omp GA_dev_omp.cpp main_omp.cpp data_stats.cpp cudaWrap.o -lcudart
(Note that -lcudart goes after the object files, since GNU ld resolves libraries left to right.)
In this case, cudaWrap.cu contains the cuda code, main_omp.cpp contains main(), and there are some other application files that need compiling too.
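Alternatively (a sketch of an equivalent approach), nvcc can drive the final link itself; it forwards the plain .cpp files to the host compiler and adds the CUDA include paths and runtime library automatically:
$ nvcc -c cudaWrap.cu
$ nvcc -o GA_omp GA_dev_omp.cpp main_omp.cpp data_stats.cpp cudaWrap.o
This is often the simplest way to link mixed C++/CUDA projects.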