How to use a shared/dynamically linked library in Fortran?

I have a neural network trained in Matlab. Now, I want to use this network in Fortran. So, I followed the instructions given here: https://www.mathworks.com/help/deeplearning/ref/network.genfunction.html:
%% First, train a static network and calculate its outputs for the training data.
[x,t] = bodyfat_dataset;
bodyfatNet = feedforwardnet(10);
bodyfatNet = train(bodyfatNet,x,t);
y = bodyfatNet(x);
%% Next, generate and test a MATLAB function. Then the new function is compiled to a shared/dynamically linked library with mcc.
genFunction(bodyfatNet,'bodyfatFcn');
y2 = bodyfatFcn(x);
accuracy2 = max(abs(y-y2))
mcc -W lib:libBodyfat -T link:lib bodyfatFcn
This generates files with the extensions .c, .h, and .so.
In the Fortran code test.F90, I want to be able to compute y_test for a given x_test:
y_test = bodyfatNet(x_test)
Could you please tell me how this can be written?
Below is my Makefile. It builds an executable from the object file test.o and the shared object .so:
FORTRAN_COMPILER=gfortran
#FORTRAN_FLAGS=-O3 -Wall -Wextra -std=f2008
FORTRAN_FLAGS=-ffree-line-length-none
OBJ2 = libBodyfat.so
SRC1= test.F90
OBJ1 = $(SRC1:.F90=.o)
LIBS = $(OBJ1) $(OBJ2)
%.o: %.F90
#echo 'converting .F90 files to .o'
$(FORTRAN_COMPILER) $(FORTRAN_FLAGS) -o $@ -c $<
binary: $(LIBS)
#echo 'make an executable from object files (.o) and the shared object (.so)'
$(FORTRAN_COMPILER) $(FORTRAN_FLAGS) -o $@ $(LIBS)
clean:
#echo 'cleaning'
#rm -f *.mod *.o binary
I am not certain whether using only the '.so' is sufficient. But the more general question is how I can use the network from test.F90.
UPDATE:
As suggested by PierU, the problem is really one of "how to call C routines from Fortran?". Below is the content of the generated .h and .c files:
/*
* MATLAB Compiler: 8.2 (R2021a)
* Date: Wed Feb 8 15:21:13 2023
* Arguments: "-B""macro_default""-W""lib:libBodyfat""-T""link:lib""bodyfatFcn"
*/
#ifndef libBodyfat_h
#define libBodyfat_h 1
#if defined(__cplusplus) && !defined(mclmcrrt_h) && defined(__linux__)
# pragma implementation "mclmcrrt.h"
#endif
#include "mclmcrrt.h"
#ifdef __cplusplus
extern "C" { // sbcheck:ok:extern_c
#endif
/* This symbol is defined in shared libraries. Define it here
* (to nothing) in case this isn't a shared library.
*/
#ifndef LIB_libBodyfat_C_API
#define LIB_libBodyfat_C_API /* No special import/export declaration */
#endif
/* GENERAL LIBRARY FUNCTIONS -- START */
extern LIB_libBodyfat_C_API
bool MW_CALL_CONV libBodyfatInitializeWithHandlers(
mclOutputHandlerFcn error_handler,
mclOutputHandlerFcn print_handler);
extern LIB_libBodyfat_C_API
bool MW_CALL_CONV libBodyfatInitialize(void);
extern LIB_libBodyfat_C_API
void MW_CALL_CONV libBodyfatTerminate(void);
extern LIB_libBodyfat_C_API
void MW_CALL_CONV libBodyfatPrintStackTrace(void);
/* GENERAL LIBRARY FUNCTIONS -- END */
/* C INTERFACE -- MLX WRAPPERS FOR USER-DEFINED MATLAB FUNCTIONS -- START */
extern LIB_libBodyfat_C_API
bool MW_CALL_CONV mlxBodyfatFcn(int nlhs, mxArray *plhs[], int nrhs, mxArray *prhs[]);
/* C INTERFACE -- MLX WRAPPERS FOR USER-DEFINED MATLAB FUNCTIONS -- END */
/* C INTERFACE -- MLF WRAPPERS FOR USER-DEFINED MATLAB FUNCTIONS -- START */
extern LIB_libBodyfat_C_API bool MW_CALL_CONV mlfBodyfatFcn(int nargout, mxArray** Y, mxArray** Xf, mxArray** Af, mxArray* X, mxArray* _U4b, mxArray* _U4c);
#ifdef __cplusplus
}
#endif
/* C INTERFACE -- MLF WRAPPERS FOR USER-DEFINED MATLAB FUNCTIONS -- END */
#endif
This is the content of the .c file:
/*
* MATLAB Compiler: 8.2 (R2021a)
* Date: Wed Feb 8 15:21:13 2023
* Arguments: "-B""macro_default""-W""lib:libBodyfat""-T""link:lib""bodyfatFcn"
*/
#define EXPORTING_libBodyfat 1
#include "libBodyfat.h"
static HMCRINSTANCE _mcr_inst = NULL; /* don't use nullptr; this may be either C or C++ */
#ifdef __cplusplus
extern "C" { // sbcheck:ok:extern_c
#endif
static int mclDefaultPrintHandler(const char *s)
{
return mclWrite(1 /* stdout */, s, sizeof(char)*strlen(s));
}
#ifdef __cplusplus
} /* End extern C block */
#endif
#ifdef __cplusplus
extern "C" { // sbcheck:ok:extern_c
#endif
static int mclDefaultErrorHandler(const char *s)
{
int written = 0;
size_t len = 0;
len = strlen(s);
written = mclWrite(2 /* stderr */, s, sizeof(char)*len);
if (len > 0 && s[ len-1 ] != '\n')
written += mclWrite(2 /* stderr */, "\n", sizeof(char));
return written;
}
#ifdef __cplusplus
} /* End extern C block */
#endif
/* This symbol is defined in shared libraries. Define it here
* (to nothing) in case this isn't a shared library.
*/
#ifndef LIB_libBodyfat_C_API
#define LIB_libBodyfat_C_API /* No special import/export declaration */
#endif
LIB_libBodyfat_C_API
bool MW_CALL_CONV libBodyfatInitializeWithHandlers(
mclOutputHandlerFcn error_handler,
mclOutputHandlerFcn print_handler)
{
int bResult = 0;
if (_mcr_inst)
return true;
if (!mclmcrInitialize())
return false;
{
mclCtfStream ctfStream =
mclGetEmbeddedCtfStream((void *)(libBodyfatInitializeWithHandlers));
if (ctfStream) {
bResult = mclInitializeComponentInstanceEmbedded(&_mcr_inst,
error_handler,
print_handler,
ctfStream);
mclDestroyStream(ctfStream);
} else {
bResult = 0;
}
}
if (!bResult)
return false;
return true;
}
LIB_libBodyfat_C_API
bool MW_CALL_CONV libBodyfatInitialize(void)
{
return libBodyfatInitializeWithHandlers(mclDefaultErrorHandler,
mclDefaultPrintHandler);
}
LIB_libBodyfat_C_API
void MW_CALL_CONV libBodyfatTerminate(void)
{
if (_mcr_inst)
mclTerminateInstance(&_mcr_inst);
}
LIB_libBodyfat_C_API
void MW_CALL_CONV libBodyfatPrintStackTrace(void)
{
char** stackTrace;
int stackDepth = mclGetStackTrace(&stackTrace);
int i;
for(i=0; i<stackDepth; i++)
{
mclWrite(2 /* stderr */, stackTrace[i], sizeof(char)*strlen(stackTrace[i]));
mclWrite(2 /* stderr */, "\n", sizeof(char)*strlen("\n"));
}
mclFreeStackTrace(&stackTrace, stackDepth);
}
LIB_libBodyfat_C_API
bool MW_CALL_CONV mlxBodyfatFcn(int nlhs, mxArray *plhs[], int nrhs, mxArray *prhs[])
{
return mclFeval(_mcr_inst, "bodyfatFcn", nlhs, plhs, nrhs, prhs);
}
LIB_libBodyfat_C_API
bool MW_CALL_CONV mlfBodyfatFcn(int nargout, mxArray** Y, mxArray** Xf, mxArray** Af,
mxArray* X, mxArray* _U4b, mxArray* _U4c)
{
return mclMlfFeval(_mcr_inst, "bodyfatFcn", nargout, 3, 3, Y, Xf, Af, X, _U4b, _U4c);
}
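Given the generated header above, a driver has to: start the MATLAB Runtime, initialize the library, wrap the input in an mxArray, call mlfBodyfatFcn, and unpack the result. Here is a hedged C sketch of that sequence (untested; N_IN and the NULL values passed for the two trailing optional inputs _U4b/_U4c are assumptions, not something the header documents):

#include <stdio.h>
#include <string.h>
#include "libBodyfat.h"

#define N_IN 13   /* assumed number of input features for bodyfat_dataset */

int main(void)
{
    double x_test[N_IN] = { 0.0 };   /* fill with real input values */
    double y_test;

    /* start the MATLAB Runtime, then initialize the library */
    if (!mclInitializeApplication(NULL, 0)) return 1;
    if (!libBodyfatInitialize()) return 2;

    /* wrap the input in an mxArray (column-major, one sample per column) */
    mxArray *X = mxCreateDoubleMatrix(N_IN, 1, mxREAL);
    memcpy(mxGetPr(X), x_test, sizeof(x_test));

    /* Y, Xf, Af are outputs; the two trailing optional inputs are left NULL */
    mxArray *Y = NULL, *Xf = NULL, *Af = NULL;
    if (mlfBodyfatFcn(1, &Y, &Xf, &Af, X, NULL, NULL)) {
        y_test = *mxGetPr(Y);
        printf("y_test = %f\n", y_test);
    }

    mxDestroyArray(X);
    if (Y) mxDestroyArray(Y);
    if (Xf) mxDestroyArray(Xf);
    if (Af) mxDestroyArray(Af);

    libBodyfatTerminate();
    mclTerminateApplication();
    return 0;
}

The same call sequence can be reproduced directly in test.F90 through iso_c_binding interface blocks to these functions, or this file can serve as a thin C wrapper that the Fortran code calls with plain arrays. Either way, the link line typically needs the MATLAB Runtime library (e.g. -lmwmclmcrrt) in addition to libBodyfat.so.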

Related

Why does nvcc remove my if branch during compilation?

I found some strange behaviour when compiling CUDA code to PTX. If a __global__ function takes the return value of tex2DLod<uchar4> and passes it to a __device__ function containing an if-statement whose two branches each call a __device__ function taking uchar4 as an argument, the resulting PTX file only contains the code from the else branch.
An example is below. I compiled the following code with both CUDA 10.1 update 1 and update 2; the result is always the same. When I remove the if-statement and keep only the else part, the resulting PTX does not change, which means the first branch has been lost.
#include <stdint.h>
#include <cuda.h>
__device__ float3 rgba2rgb(uchar4 p)
{
return make_float3(p.x/255.0f, p.y/255.0f, p.z/255.0f);
}
__device__ float3 bgra2rgb(uchar4 p)
{
return make_float3(p.z/255.0f, p.y/255.0f, p.x/255.0f);
}
__device__ float3 pixel2rgb(uchar4 p, bool flag)
{
if(flag)
{
return bgra2rgb(p);
}
else
{
return rgba2rgb(p);
}
}
extern "C" __global__ void func2(
CUtexObject rgb_mip_texture,
size_t width, size_t height,
bool flag
)
{
size_t x_p = blockIdx.x * blockDim.x + threadIdx.x;
size_t y_p = blockIdx.y * blockDim.y + threadIdx.y;
if (x_p >= width || y_p >= height)
return;
uchar4 pixel = tex2DLod<uchar4>(rgb_mip_texture, x_p, y_p, (float)0);
//uchar4 pixel = make_uchar4(1, 2, 3, 4);
float3 rgb = pixel2rgb(pixel, flag);
printf("rgb=(%f,%f,%f)", rgb.x, rgb.y, rgb.z);
}
The nvcc command is below; ccbin is clang 8.0.
/usr/bin/nvcc -ptx \
-v --ptxas-options=-v \
--compiler-options "-v" \
-ccbin "${ccbin}" \
"${input_file}" \
-o "${ptx_file}"
If the pixel does not come from tex2DLod (for example, if it comes from make_uchar4), then both branches are preserved. Is this a known bug in nvcc?
This would appear to be a bug in nvcc 10.1 (the only version I have tested). It appears that the compiler's attempts at automatic inline expansion of the rgba2rgb and bgra2rgb functions are breaking somehow, so that the result of compiling this:
__device__ float3 pixel2rgb(uchar4 p, bool flag)
{
if(flag)
{
return bgra2rgb(p);
}
else
{
return rgba2rgb(p);
}
}
is effectively this:
__device__ float3 pixel2rgb(uchar4 p, bool flag)
{
return rgba2rgb(p);
}
It isn't related to textures per se, because I can reproduce the problem with this code reading directly from global memory:
#include <stdint.h>
#include <cuda.h>
#include <cstdio>
__device__ float3 rgba2rgb(uchar4 p)
{
return make_float3(p.x/255.0f, p.y/255.0f, p.z/255.0f);
}
__device__ float3 bgra2rgb(uchar4 p)
{
return make_float3(p.z/255.0f, p.y/255.0f, p.x/255.0f);
}
__device__ float3 pixel2rgb(uchar4 p, bool flag)
{
if(flag)
{
return bgra2rgb(p);
}
else
{
return rgba2rgb(p);
}
}
__global__ void func2(
uchar4* pixels,
size_t width, size_t height,
bool flag
)
{
size_t x_p = blockIdx.x * blockDim.x + threadIdx.x;
size_t y_p = blockIdx.y * blockDim.y + threadIdx.y;
if ((x_p < width) && (y_p < height)) {
size_t idx = x_p * width + y_p;
uchar4 pixel = pixels[idx];
float3 rgb = pixel2rgb(pixel, flag);
printf("flag=%d idx=%ld rgb=(%f,%f,%f)\n", flag, idx, rgb.x, rgb.y, rgb.z);
}
}
int main()
{
int width = 2, height = 2;
uchar4* data;
cudaMallocManaged(&data, width * height * sizeof(uchar4));
data[0] = make_uchar4(1, 2, 3, 4);
data[1] = make_uchar4(2, 3, 4, 5);
data[2] = make_uchar4(3, 4, 5, 6);
data[3] = make_uchar4(4, 5, 6, 7);
dim3 bdim(2,2);
func2<<<1, bdim>>>(data, width, height, true);
cudaDeviceSynchronize();
func2<<<1, bdim>>>(data, width, height, false);
cudaDeviceSynchronize();
cudaDeviceReset();
return 0;
}
$ nvcc -arch=sm_52 -o wangwang wangwang.cu
$ ./wangwang
flag=1 idx=0 rgb=(0.003922,0.007843,0.011765)
flag=1 idx=2 rgb=(0.011765,0.015686,0.019608)
flag=1 idx=1 rgb=(0.007843,0.011765,0.015686)
flag=1 idx=3 rgb=(0.015686,0.019608,0.023529)
flag=0 idx=0 rgb=(0.003922,0.007843,0.011765)
flag=0 idx=2 rgb=(0.011765,0.015686,0.019608)
flag=0 idx=1 rgb=(0.007843,0.011765,0.015686)
flag=0 idx=3 rgb=(0.015686,0.019608,0.023529)
I presume that the make_uchar4 version you mention works because the compiler pre-computes the results (the inputs are constant) and eliminates the conversion function code altogether.
Playing around, I was able to fix this by changing the code like this:
__device__ __inline__ float3 rgba2rgb(uchar4 p)
{
return make_float3(p.x/255.0f, p.y/255.0f, p.z/255.0f);
}
__device__ __inline__ float3 bgra2rgb(uchar4 p)
{
return make_float3(p.z/255.0f, p.y/255.0f, p.x/255.0f);
}
When I do this, the compiler injects some swizzling logic into the inline PTX expansion it generates:
ld.global.v4.u8 {%rs2, %rs3, %rs4, %rs5}, [%rd10];
and.b16 %rs8, %rs1, 255; <---- %rs1 is the input bool
setp.eq.s16 %p4, %rs8, 0;
selp.b16 %rs9, %rs2, %rs4, %p4;
and.b16 %rs10, %rs9, 255;
selp.b16 %rs11, %rs4, %rs2, %p4;
and.b16 %rs12, %rs11, 255;
and things work correctly (your mileage may vary):
$ nvcc -arch=sm_52 -o wangwang wangwang.cu
$ ./wangwang
flag=1 idx=0 rgb=(0.011765,0.007843,0.003922)
flag=1 idx=2 rgb=(0.019608,0.015686,0.011765)
flag=1 idx=1 rgb=(0.015686,0.011765,0.007843)
flag=1 idx=3 rgb=(0.023529,0.019608,0.015686)
flag=0 idx=0 rgb=(0.003922,0.007843,0.011765)
flag=0 idx=2 rgb=(0.011765,0.015686,0.019608)
flag=0 idx=1 rgb=(0.007843,0.011765,0.015686)
flag=0 idx=3 rgb=(0.015686,0.019608,0.023529)
I would report this as a bug to NVIDIA.

How is it possible to compile code from code?

I want to experiment with programs that write programs in C, and I want to use a construction like the following:
#include <stdio.h>
int main() {
char* srcCode="int f(int x) { return x+42; }";
int (*compiledFun)(int) = compile(srcCode);
printf("result=%d", (*compiledFun)(123));
return 0;
}
The desired output is "result=165".
My question is about the compile() function. I could put srcCode in a file, invoke an external compiler like gcc, read the produced binary, probably fix some addresses, and then fill the compiledFun memory. But that feels like a very inefficient stub. Is there any way to compile a program from within a program, directly from memory to memory? Maybe some library, or a subset that could be ripped out of the gcc sources, responsible for producing binary code from source text?
One possibly important addition: the source code to be compiled is always a function that takes arguments and returns a value. It will not call any external libraries or functions like printf; it will only do some calculations and return.
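For reference, the file-plus-external-compiler stub described above can be made to work via dlopen. A hedged sketch (fixed temporary paths, minimal error handling, assumes the generated function is named f as in the question; link with -ldl on Linux):

#include <stdio.h>
#include <stdlib.h>
#include <dlfcn.h>

/* write the source to a file, shell out to gcc to build a shared
   object, then load it with dlopen and look up the function */
static int (*compile(const char *srcCode))(int)
{
    FILE *f = fopen("/tmp/gen.c", "w");
    if (!f) return NULL;
    fputs(srcCode, f);
    fclose(f);
    if (system("gcc -shared -fPIC -o /tmp/gen.so /tmp/gen.c") != 0)
        return NULL;
    void *handle = dlopen("/tmp/gen.so", RTLD_NOW);
    if (!handle) return NULL;
    return (int (*)(int))dlsym(handle, "f");
}

int main(void)
{
    int (*compiledFun)(int) = compile("int f(int x) { return x+42; }");
    if (compiledFun)
        printf("result=%d\n", compiledFun(123)); /* result=165 */
    return 0;
}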
For a true memory-to-memory solution, use libtcc, the in-memory C compiler from TinyCC.
Here is a complete example from https://github.com/TinyCC/tinycc/blob/mob/tests/libtcc_test.c:
/*
* Simple Test program for libtcc
*
* libtcc can be useful to use tcc as a "backend" for a code generator.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "libtcc.h"
/* this function is called by the generated code */
int add(int a, int b)
{
return a + b;
}
/* this string is referenced by the generated code */
const char hello[] = "Hello World!";
char my_program[] =
"#include <tcclib.h>\n" /* include the "Simple libc header for TCC" */
"extern int add(int a, int b);\n"
"#ifdef _WIN32\n" /* dynamically linked data needs 'dllimport' */
" __attribute__((dllimport))\n"
"#endif\n"
"extern const char hello[];\n"
"int fib(int n)\n"
"{\n"
" if (n <= 2)\n"
" return 1;\n"
" else\n"
" return fib(n-1) + fib(n-2);\n"
"}\n"
"\n"
"int foo(int n)\n"
"{\n"
" printf(\"%s\\n\", hello);\n"
" printf(\"fib(%d) = %d\\n\", n, fib(n));\n"
" printf(\"add(%d, %d) = %d\\n\", n, 2 * n, add(n, 2 * n));\n"
" return 0;\n"
"}\n";
int main(int argc, char **argv)
{
TCCState *s;
int i;
int (*func)(int);
s = tcc_new();
if (!s) {
fprintf(stderr, "Could not create tcc state\n");
exit(1);
}
/* if tcclib.h and libtcc1.a are not installed, where can we find them */
for (i = 1; i < argc; ++i) {
char *a = argv[i];
if (a[0] == '-') {
if (a[1] == 'B')
tcc_set_lib_path(s, a+2);
else if (a[1] == 'I')
tcc_add_include_path(s, a+2);
else if (a[1] == 'L')
tcc_add_library_path(s, a+2);
}
}
/* MUST BE CALLED before any compilation */
tcc_set_output_type(s, TCC_OUTPUT_MEMORY);
if (tcc_compile_string(s, my_program) == -1)
return 1;
/* as a test, we add symbols that the compiled program can use.
You may also open a dll with tcc_add_dll() and use symbols from that */
tcc_add_symbol(s, "add", add);
tcc_add_symbol(s, "hello", hello);
/* relocate the code */
if (tcc_relocate(s, TCC_RELOCATE_AUTO) < 0)
return 1;
/* get entry symbol */
func = tcc_get_symbol(s, "foo");
if (!func)
return 1;
/* run the code */
func(32);
/* delete the state */
tcc_delete(s);
return 0;
}
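To build this example on Linux, something along these lines should work (a hedged guess; the exact library list depends on how tcc was installed):
gcc libtcc_test.c -o libtcc_test -ltcc -ldl -lpthread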

TCL tclOODecls.h Functions Not Exported

My compiled Tcl library (Tcl 8.6.7, Win7) didn't export the functions in tclOODecls.h. I exported the functions by putting #define TCLAPI extern DLLEXPORT in tclOODecls.h before the auto-generated section.
I would like to know why the functions are not exported by default, and what the preferred way of enabling the export is.
The symbols have been defined to use MODULE_SCOPE; see ticket 3010352, which mentioned this a while ago. The idea is that if you want to use these symbols, you should use the Tcl stubs linking mechanism: define the USE_TCLOO_STUBS macro and link with the stubs library (tclstub86). The functions are in there, just not exported in the DLL exports.
From tcl.h:
/*
* Include platform specific public function declarations that are accessible
* via the stubs table. Make all TclOO symbols MODULE_SCOPE (which only
* has effect on building it as a shared library). See ticket [3010352].
*/
Example of statically embedding Tcl with TclOO API usage
The following code will build an executable that works like the standard Tcl shell but includes an additional command that accesses the TclOO API as a demonstration.
/* Demonstrate embedding Tcl and using the TclOO API.
*
* Build with MSVC (adjust paths for local setup):
*
* cl -nologo -W3 -MT -Zi -GL -DSTATIC_BUILD -Ic:\opt\tcl\include test_embed_oo.c \
* -Fe:test_embed_oo.exe -link -debug -subsystem:console -ltcg -libpath:C:\src\tcl\kitgen\8.6\tcl\win\Release_VC13 \
* tcl86ts.lib user32.lib ws2_32.lib netapi32.lib
*
* Requires a static library of Tcl (tcl86ts.lib on Windows)
*/
#include <tcl.h>
#include <tclOO.h>
#include <locale.h>
static int
GetObjectNameCmd(ClientData clientData, Tcl_Interp *interp,
int objc, Tcl_Obj *const objv[])
{
int r = TCL_ERROR;
if (objc != 2) {
Tcl_WrongNumArgs(interp, 1, objv, "object");
return r;
}
Tcl_Object object = Tcl_GetObjectFromObj(interp, objv[1]);
if (object != NULL) {
Tcl_Obj *resultObj = Tcl_GetObjectName(interp, object);
if (resultObj != NULL) {
Tcl_SetObjResult(interp, resultObj);
r = TCL_OK;
}
}
return r;
}
#define TCL_LOCAL_APPINIT Custom_AppInit
int
Custom_AppInit(Tcl_Interp *interp)
{
Tcl_CreateObjCommand(interp, "getobjectname", GetObjectNameCmd, NULL, NULL);
return Tcl_Eval(interp, "source test_embed_oo.tcl");
}
#include "c:/src/tcl/kitgen/8.6/tcl/win/tclAppInit.c"
Embed Tcl using dynamic loading
Tcl can be linked to an application dynamically using the Tcl stubs linkage mechanism. This requires loading two functions from the DLL and then initializing the stubs table. The following shows this with TclOO access enabled.
/* Demonstrate embedding Tcl and using the TclOO API via stubs
*
* Build with MSVC (adjust paths for local setup):
*
* cl -nologo -W3 -MD -Zi -GL -Ic:\opt\tcl\include test_embed_oo_ex.c \
* -Fe:test_embed_oo_ex.exe -link -debug -ltcg -subsystem:console \
* -libpath:C:\opt\tcl\lib tclstub86.lib user32.lib
*
* Dynamically loads Tcl and then uses stubs for API access.
*/
#define WIN32_LEAN_AND_MEAN
#define STRICT
#include <windows.h>
#include <locale.h>
#define USE_TCL_STUBS
#define USE_TCLOO_STUBS
#include <tcl.h>
#include <tclOO.h>
static int
GetObjectNameCmd(ClientData clientData, Tcl_Interp *interp,
int objc, Tcl_Obj *const objv[])
{
int r = TCL_ERROR;
if (objc != 2) {
Tcl_WrongNumArgs(interp, 1, objv, "object");
return r;
}
Tcl_Object object = Tcl_GetObjectFromObj(interp, objv[1]);
if (object != NULL) {
Tcl_Obj *resultObj = Tcl_GetObjectName(interp, object);
if (resultObj != NULL) {
Tcl_SetObjResult(interp, resultObj);
r = TCL_OK;
}
}
return r;
}
typedef Tcl_Interp *(*LPFNTCLCREATEINTERP)();
typedef void *(*LPFNTCLFINDEXECUTABLE)(const char *);
static Tcl_Interp *
InitializeTcl(int argc, char *argv[])
{
Tcl_Interp *interp = NULL;
//Tcl_DString dString;
char szLibrary[16];
int nMinor;
HINSTANCE hTcl = NULL;
for (nMinor = 6; hTcl == NULL && nMinor > 4; nMinor--) {
wsprintfA(szLibrary, "tcl8%d.dll", nMinor);
hTcl = LoadLibraryA(szLibrary);
}
if (hTcl != NULL) {
LPFNTCLCREATEINTERP lpfnTcl_CreateInterp;
LPFNTCLFINDEXECUTABLE lpfnTcl_FindExecutable;
lpfnTcl_CreateInterp = (LPFNTCLCREATEINTERP)
GetProcAddress(hTcl, "Tcl_CreateInterp");
lpfnTcl_FindExecutable = (LPFNTCLFINDEXECUTABLE)
GetProcAddress(hTcl, "Tcl_FindExecutable");
if (lpfnTcl_CreateInterp != NULL) {
interp = lpfnTcl_CreateInterp();
if (interp != NULL) {
Tcl_InitStubs(interp, "8.6", 0);
#ifdef USE_TCLOO_STUBS
Tcl_OOInitStubs(interp);
#endif
lpfnTcl_FindExecutable(argv[0]);
Tcl_InitMemory(interp);
Tcl_Init(interp);
}
}
}
return interp;
}
/*
* Embed tcl interpreter into a C program.
*/
int
main(int argc, char *argv[])
{
Tcl_Interp *interp = NULL;
int r = TCL_ERROR;
setlocale(LC_ALL, "C");
interp = InitializeTcl(argc, argv);
if (interp == NULL) {
fprintf(stderr, "error: failed to initialize Tcl runtime\n");
} else {
Tcl_CreateObjCommand(interp, "getobjectname", GetObjectNameCmd, NULL, NULL);
if (argc > 1) {
r = Tcl_EvalFile(interp, argv[1]);
printf("%s\n", Tcl_GetStringResult(interp));
}
Tcl_DeleteInterp(interp);
}
return r;
}
To test this, create a file containing some Tcl code that calls getobjectname, and run the executable with the path to that file as its only argument.
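For example, a minimal (hypothetical) test script might be:

# create a TclOO object and ask the C command for its name
oo::class create Foo
set obj [Foo new]
puts [getobjectname $obj]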

OpenACC: calling cuda __device__ kernel from OpenACC parallel loop

I have a simple test CUDA __device__ function in a hello.cu file:
extern "C" __device__ float radians( float f ){
return f*3.14159265;
}
And test OpenACC code in mainacc.c:
#include <stdio.h>
#include <stdlib.h>
#define N 10
#pragma acc routine seq
extern float radians( float );
int main() {
int i;
float *hptr, *dptr;
hptr = (float *) calloc(N, sizeof(float));
#pragma acc parallel loop copy(hptr[0:N])
for(i=0; i<N; i++) {
hptr[i] = radians(i*0.1f);
}
for( i=0; i< N; i++)
printf("\n %dth value : %f", i, hptr[i]);
return 0;
}
If I try to compile this code as below, I get link-time errors:
nvcc hello.cu -c
cc -hacc -hlist=a mainacc.c hello.o
nvlink error : Undefined reference to 'radians' in '/tmp/pe_20271//app_cubin_20271.omainacc_1.o__sec.cubin'
cuda_link: nvlink fatal error
I tried nvcc with the "--relocatable-device-code true" option and so on, but with no success. The loaded modules are:
craype-accel-nvidia35
cudatoolkit/6.5
PrgEnv-cray/5.2.40
Could you tell me the correct way to use a CUDA __device__ function within OpenACC?
I've been able to make this sort of mixing work with PGI, but I've not yet been able to produce a sample that works with the Cray compiler. Here's a simple example that works for PGI.
This is the file containing the CUDA.
// saxpy_cuda_device.cu
extern "C"
__device__
float saxpy_dev(float a, float x, float y)
{
return a * x + y;
}
This is the file containing OpenACC.
// openacc_cuda_device.cpp
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#pragma acc routine seq
extern "C" float saxpy_dev(float, float, float);
int main(int argc, char **argv)
{
float *x, *y, tmp;
int n = 1<<20, i;
x = (float*)malloc(n*sizeof(float));
y = (float*)malloc(n*sizeof(float));
#pragma acc data create(x[0:n]) copyout(y[0:n])
{
#pragma acc kernels
{
for( i = 0; i < n; i++)
{
x[i] = 1.0f;
y[i] = 0.0f;
}
}
#pragma acc parallel loop
for( i = 0; i < n; i++ )
{
y[i] = saxpy_dev(2.0, x[i], y[i]);
}
}
fprintf(stdout, "y[0] = %f\n",y[0]);
return 0;
}
Below is the compilation command.
$ make
nvcc -rdc true -c saxpy_cuda_device.cu
pgc++ -fast -acc -ta=nvidia:rdc,cuda7.0 -c openacc_cuda_device.cpp
pgc++ -o openacc_cuda_device -fast -acc -ta=nvidia:rdc,cuda7.0 saxpy_cuda_device.o openacc_cuda_device.o -Mcuda
You can use the -Wc command line option to add the generated ptx file to the CUDA link line. I've opened a bug to make sure we document how to do this.
nvcc hello.cu -ptx -arch=sm_35
cc -hacc -hlist=a mainacc.c -Wc,hello.ptx
One suggestion is to provide both a host and device version of the subroutine and then use the "bind" clause to indicate which version to call from a compute region. This will allow you to maintain portability with the host code.
For example:
% cat radians.cu
extern "C" __device__ float cuda_radians( float f ){
return f*3.14159265;
}
extern "C" float radians( float f ){
return f*3.14159265;
}
% cat test.c
#include <stdio.h>
#include <stdlib.h>
#define N 10
#pragma acc routine (radians) bind(cuda_radians) seq
extern float radians( float f);
int main() {
int i;
float *hptr, *dptr;
hptr = (float *) calloc(N, sizeof(float));
#pragma acc parallel loop copy(hptr[0:N])
for(i=0; i<N; i++) {
hptr[i] = radians(i*0.1f);
}
for( i=0; i< N; i++)
printf("\n %dth value : %f", i, hptr[i]);
return 0;
}
% nvcc -c radians.cu --relocatable-device-code true
% pgcc -acc -ta=tesla:cuda7.0 -Minfo=accel test.c radians.o -V15.7 -Mcuda
test.c:
main:
15, Generating copy(hptr[:10])
Accelerator kernel generated
Generating Tesla code
16, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
% a.out
0th value : 0.000000
1th value : 0.314159
2th value : 0.628319
3th value : 0.942478
4th value : 1.256637
5th value : 1.570796
6th value : 1.884956
7th value : 2.199115
8th value : 2.513274
9th value : 2.827434

CUDA function pointers

I was trying to make something like this in CUDA (actually, I need to write some integration functions):
#include <iostream>
using namespace std;
float f1(float x) {
return x * x;
}
float f2(float x) {
return x;
}
void tabulate(float p_f(float)) {
for (int i = 0; i != 10; ++i) {
std::cout << p_f(i) << ' ';
}
std::cout << std::endl;
}
int main() {
tabulate(f1);
tabulate(f2);
return 0;
}
output:
0 1 4 9 16 25 36 49 64 81
0 1 2 3 4 5 6 7 8 9
I tried the following, but only got this error:
Error: Function pointers and function template parameters are not supported in sm_1x.
float f1(float x) {
return x;
}
__global__ void tabulate(float lower, float upper, float p_function(float), float* result) {
for (lower; lower < upper; lower++) {
*result = *result + p_function(lower);
}
}
int main() {
float res;
float* dev_res;
cudaMalloc( (void**)&dev_res, sizeof(float) ) ;
tabulate<<<1,1>>>(0.0, 5.0, f1, dev_res);
cudaMemcpy(&res, dev_res, sizeof(float), cudaMemcpyDeviceToHost);
printf("%f\n", res);
/************************************************************************/
scanf("%s");
return 0;
}
To get rid of your compile error, you'll have to use -gencode arch=compute_20,code=sm_20 as a compiler argument when compiling your code. But then you'll likely have some runtime problems:
Taken from the CUDA Programming Guide (http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#functions):
Function pointers to __global__ functions are supported in host code, but not in device code.
Function pointers to __device__ functions are only supported in device code compiled for devices of compute capability 2.x and higher.
It is not allowed to take the address of a __device__ function in host code.
so you can have something like this (adapted from the "FunctionPointers" sample):
//your function pointer type - returns unsigned char, takes parameters of type unsigned char and float
typedef unsigned char(*pointFunction_t)(unsigned char, float);
//some device function to be pointed to
__device__ unsigned char
Threshold(unsigned char in, float thresh)
{
...
}
//pComputeThreshold is a device-side function pointer to your __device__ function
__device__ pointFunction_t pComputeThreshold = Threshold;
//the host-side function pointer to your __device__ function
pointFunction_t h_pointFunction;
//in host code: copy the function pointers to their host equivalent
cudaMemcpyFromSymbol(&h_pointFunction, pComputeThreshold, sizeof(pointFunction_t));
You can then pass the h_pointFunction as a parameter to your kernel, which can use it to call your __device__ function.
//your kernel taking your __device__ function pointer as a parameter
__global__ void kernel(pointFunction_t pPointOperation)
{
unsigned char tmp;
...
tmp = (*pPointOperation)(tmp, 150.0);
...
}
//invoke the kernel in host code, passing in your host-side __device__ function pointer
kernel<<<...>>>(h_pointFunction);
Hopefully that made some sense. In all, it looks like you would have to change your f1 function to be a __device__ function and follow a similar procedure (the typedefs aren't necessary, but they do make the code nicer) to get a valid function pointer on the host side to pass to your kernel. I'd also advise giving the FunctionPointers CUDA sample a look over.
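For concreteness, here is a hedged, untested sketch of what the question's code might look like after those changes, following the pattern above (compile for compute capability 2.x or higher):

#include <cstdio>

typedef float (*realFunction_t)(float);

__device__ float f1(float x) { return x; }

// device-side variable holding the pointer to f1
__device__ realFunction_t p_f1 = f1;

__global__ void tabulate(float lower, float upper, realFunction_t p_function, float *result)
{
    for (; lower < upper; lower++)
        *result += p_function(lower);
}

int main()
{
    float res = 0.0f, *dev_res;
    realFunction_t h_f1;

    cudaMalloc((void **)&dev_res, sizeof(float));
    cudaMemcpy(dev_res, &res, sizeof(float), cudaMemcpyHostToDevice);

    // copy the device-side function pointer to its host equivalent
    cudaMemcpyFromSymbol(&h_f1, p_f1, sizeof(realFunction_t));

    tabulate<<<1, 1>>>(0.0f, 5.0f, h_f1, dev_res);
    cudaMemcpy(&res, dev_res, sizeof(float), cudaMemcpyDeviceToHost);
    printf("%f\n", res);  // expected: 10.000000 (0+1+2+3+4)
    return 0;
}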
Even though you may be able to compile this code (see @Robert Crovella's answer), it will not work as written. You cannot pass function pointers directly from host code, as the host compiler has no way of figuring out the function's device address.
Here is a simple class for function pointers callable from within a kernel, which I wrote based on this question:
template <typename T>
struct cudaCallableFunctionPointer
{
public:
cudaCallableFunctionPointer(T* f_)
{
T* host_ptr = (T*)malloc(sizeof(T));
cudaMalloc((void**)&ptr, sizeof(T));
cudaMemcpyFromSymbol(host_ptr, *f_, sizeof(T));
cudaMemcpy(ptr, host_ptr, sizeof(T), cudaMemcpyHostToDevice);
free(host_ptr); /* host_ptr was allocated with malloc, so release it with free */
}
~cudaCallableFunctionPointer()
{
cudaFree(ptr);
}
T* ptr;
};
you could use it like this:
__device__ double func1(double x)
{
return x + 1.0f;
}
typedef double (*func)(double x);
__device__ func f_ = func1;
__global__ void test_kernel(func* f)
{
double x = (*f)(2.0);
printf("%g\n", x);
}
int main()
{
cudaCallableFunctionPointer<func> f(&f_);
test_kernel << < 1, 1 >> > (f.ptr);
}
output:
3