Am trying to implement the NestedHelloWorld as an example of dynamic parallelism. The environ is VS2019, must implement as Driver API interface, in C. The codes are
CUdevice* gpu_initialize(CUdevice *dev_ptr, int int_of_dev); //signature of the init function
// Host code
int main()
{
printf("Nested Parallelism : Hello World (Driver API)\n");
CUdevice device;
if (gpu_initialize(&device,0) == NULL) { //Initialize the device[0] and populate the device pointer contents
printf("Error Initializing GPU... exiting");
exit(-1);
}
CUresult result;
char* error_string;
//Create a context
CUcontext context;
unsigned int flags = CU_CTX_SCHED_YIELD; //set to mode where the GPU yields when awaiting CPU : increases latency
if ((result = cuCtxCreate(&context,flags,device)) != CUDA_SUCCESS) {
if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
printf("Error creating context : %s\n", error_string);
return -1;
}
else {
printf("Unknown error creating context\n");
return -1;
}
}
//Load the module by specifying the filename
CUmodule module;
char filename[] = "C:\\Users\\gautam\\OneDrive\\Projects\\VS 2019\\repos\\learn_cuda_nesting\\dynamic_parallelism\\x64\\Debug\\nestedHelloWorld.cu.obj"; //Use for VS 2019 compilation
if ((result = cuModuleLoad(&module,filename)) != CUDA_SUCCESS) {
if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
printf("Error loading Module using filename %s: %s \n",filename,error_string);
return -1;
}
else {
printf("Unknown error loading Module from filename %s\n",filename);
return -1;
}
}
else printf("Successfully loaded module %s\n", filename);
//Load the function from the module
CUfunction function;
char function_name[120];
strcpy_s(function_name,120,"nestedHelloWorld");
if ((result = cuModuleGetFunction(&function,module,function_name)) != CUDA_SUCCESS) {
if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
printf("Error loading function %s: %s\n",function_name, error_string);
return -1;
}
else {
printf("Unknown error loading function %s\n",function_name);
return -1;
}
}
else printf("Successfully loaded function %s\n", function_name);
//Set up kernel grid parameters
int size = 8;
int blocksize = 8; // initial block size
int igrid = 1;
int threads_per_block_x = 8;
int threads_per_block_y = 1;
int blocks_per_grid_x = (size + threads_per_block_x - 1) / threads_per_block_x;
int blocks_per_grid_y = 1;
//Launch the first function in the kernel
//Case 1 : Invoke kernel with 8 x 1 grid size
void* args[] = {(void *)&size, (void *)&igrid};
if ((result = cuLaunchKernel(function, blocks_per_grid_x, blocks_per_grid_y, 1, threads_per_block_x, threads_per_block_y, 1, 0, 0, args, 0)) != CUDA_SUCCESS) {
if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
printf("Error launching kernel %s: %s\n",function_name, error_string);
result = cuCtxDestroy(context);
return -1;
}
else {
printf("Unknown error launching kernel %s\n", function_name);
result = cuCtxDestroy(context);
return -1;
}
}
else printf("CUDA kernel launch with (%d,%d) blocks per grid, each with (%d,%d) threads per block\n", blocks_per_grid_x, blocks_per_grid_y, threads_per_block_x, threads_per_block_y);
result = cuCtxSynchronize(); //wait for kernel run to finish
result = cuCtxDestroy(context);
exit(0);
}
and the corresponding .cu file code is :
extern "C" __global__ void nestedHelloWorld(int const iSize, int iDepth)
{
int tid = threadIdx.x;
printf("Recursion=%d: Hello World from thread %d block %d\n", iDepth, tid,blockIdx.x);
// condition to stop recursive execution
if (iSize == 1) return;
// reduce block size to half
int nthreads = iSize >> 1;
// thread 0 launches child grid recursively
if(tid == 0 && nthreads > 0)
{
//nestedHelloWorld(nthreads, ++iDepth);
nestedHelloWorld<<<1, nthreads>>>(nthreads, ++iDepth);
printf("-------> nested execution depth: %d\n", iDepth);
}
}
The settings are as follows :
and finally, the gpu parameters and the error saying invalid PTX.
While this appears to be the "elementary code", the real intent is to identify what compilation parameter error(s) are being done. The compilation goes through without any errors.
All help is appreciated deeply...
Edit : Have added the successful compilation screenshot, and the screenshot of external command used in a separate attempt to compile the file and then link. The error has not changed.
We can build a complete example here by starting with the ptxjit CUDA sample code and then follow the additional instructions here. That's essentially everything you need.
Here's a full worked example on linux, including Makefile:
$ cat kernel.cu
#include <cstdio>
extern "C" __global__ void k(int N)
{
printf("kernel level %d\n", N);
if ((N > 1) && (threadIdx.x == 0)) k<<<1,1>>>(N-1);
}
$ cat ptxjit.cpp
/*
*/
// System includes
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <iostream>
// CUDA driver & runtime
#include <cuda.h>
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#define CUDA_DRIVER_API
#include <helper_cuda.h>
#include <helper_cuda_drvapi.h>
#include <helper_functions.h> // helper for shared that are common to CUDA Samples
#define PTX_FILE "kernel.ptx"
const char *sSDKname = "CDP Recursion Test (Driver API)";
bool inline findModulePath(const char *module_file, std::string &module_path,
char **argv, std::string &ptx_source) {
char *actual_path = sdkFindFilePath(module_file, argv[0]);
if (actual_path) {
module_path = actual_path;
} else {
printf("> findModulePath file not found: <%s> \n", module_file);
return false;
}
if (module_path.empty()) {
printf("> findModulePath file not found: <%s> \n", module_file);
return false;
} else {
printf("> findModulePath <%s>\n", module_path.c_str());
if (module_path.rfind(".ptx") != std::string::npos) {
FILE *fp = fopen(module_path.c_str(), "rb");
fseek(fp, 0, SEEK_END);
int file_size = ftell(fp);
char *buf = new char[file_size + 1];
fseek(fp, 0, SEEK_SET);
fread(buf, sizeof(char), file_size, fp);
fclose(fp);
buf[file_size] = '\0';
ptx_source = buf;
delete[] buf;
}
return true;
}
}
void ptxJIT(int argc, char **argv, CUmodule *phModule, CUfunction *phKernel,
CUlinkState *lState) {
const int options_num = 5;
CUjit_option options[options_num];
void *optionVals[options_num];
float walltime;
char error_log[8192], info_log[8192];
unsigned int logSize = 8192;
void *cuOut;
size_t outSize;
int myErr = 0;
std::string module_path, ptx_source;
// Setup linker options
// Return walltime from JIT compilation
options[0] = CU_JIT_WALL_TIME;
optionVals[0] = (void *)&walltime;
// Pass a buffer for info messages
options[1] = CU_JIT_INFO_LOG_BUFFER;
optionVals[1] = (void *)info_log;
// Pass the size of the info buffer
options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
optionVals[2] = (void *)(long)logSize;
// Pass a buffer for error message
options[3] = CU_JIT_ERROR_LOG_BUFFER;
optionVals[3] = (void *)error_log;
// Pass the size of the error buffer
options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
optionVals[4] = (void *)(long)logSize;
// Create a pending linker invocation
checkCudaErrors(cuLinkCreate(options_num, options, optionVals, lState));
// first search for the module path before we load the results
if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) {
printf("> findModulePath could not find <kernel> ptx\n");
exit(EXIT_FAILURE);
} else {
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
}
// Load the PTX from the ptx file
printf("Loading ptxjit_kernel[] program\n");
myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void *)ptx_source.c_str(),
strlen(ptx_source.c_str()) + 1, 0, 0, 0, 0);
if (myErr != CUDA_SUCCESS) {
// Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option
// above.
fprintf(stderr, "PTX Linker Error:\n%s\n", error_log);
}
myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "/usr/local/cuda/lib64/libcudadevrt.a", 0, NULL, NULL);
if (myErr != CUDA_SUCCESS) {
// Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option
// above.
fprintf(stderr, "Library Linker Error:\n%s\n", error_log);
}
// Complete the linker step
checkCudaErrors(cuLinkComplete(*lState, &cuOut, &outSize));
// Linker walltime and info_log were requested in options above.
printf("CUDA Link Completed in %fms. Linker Output:\n%s\n", walltime,
info_log);
// Load resulting cuBin into module
checkCudaErrors(cuModuleLoadData(phModule, cuOut));
// Locate the kernel entry poin
checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, "k"));
// Destroy the linker invocation
checkCudaErrors(cuLinkDestroy(*lState));
}
// Variables
CUcontext cuContext;
int main(int argc, char **argv) {
const unsigned int nThreads = 1;
const unsigned int nBlocks = 1;
CUmodule hModule = 0;
CUfunction hKernel = 0;
CUlinkState lState;
int cuda_device = 0;
printf("[%s] - Starting...\n", sSDKname);
// Initialize
checkCudaErrors(cuInit(0));
CUdevice dev = findCudaDeviceDRV(argc, (const char **)argv);
int driverVersion;
cudaDriverGetVersion(&driverVersion);
if (driverVersion < CUDART_VERSION) {
printf("driverVersion = %d < CUDART_VERSION = %d \n"
"Enhanced compatibility is not supported for this sample.. waving execution\n", driverVersion, CUDART_VERSION);
exit(EXIT_WAIVED);
}
// Create context
checkCudaErrors(cuCtxCreate(&cuContext, 0, dev));
// JIT Compile the Kernel from PTX and get the Handles (Driver API)
ptxJIT(argc, argv, &hModule, &hKernel, &lState);
// Set the kernel parameters (Driver API)
dim3 block(nThreads, 1, 1);
dim3 grid(nBlocks, 1, 1);
int my_N = 4;
void *args[1] = {&my_N};
// Launch the kernel (Driver API_)
checkCudaErrors(cuLaunchKernel(hKernel, grid.x, grid.y, grid.z, block.x,
block.y, block.z, 0, NULL, args, NULL));
std::cout << "CUDA kernel launched" << std::endl;
cuCtxSynchronize();
if (hModule) {
checkCudaErrors(cuModuleUnload(hModule));
hModule = 0;
}
return EXIT_SUCCESS;
}
$ cat Makefile
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I$(CUDA_PATH)/samples/common/inc
LIBRARIES :=
################################################################################
PTX_FILE := kernel.ptx
#Detect if installed version of GCC supports required C++11
ifeq ($(TARGET_OS),linux)
empty :=
space := $(empty) $(empty)
GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
#Create version number without "."
GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
# Make sure the version number has at least 3 decimals
GCCVERSION += 00
# Remove spaces from the version number
GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
#$(warning $(GCCVERSION))
IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)
ifeq ($(IS_MIN_VERSION), 1)
$(info >>> GCC Version is greater or equal to 4.7.0 <<<)
else
$(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<)
SAMPLE_ENABLED := 0
endif
endif
# Gencode arguments
SMS ?= 70
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
ifeq ($(SMS),)
# Generate PTX code from SM 35
GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35
endif
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
ifeq ($(TARGET_OS),darwin)
ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
else
ifeq ($(TARGET_ARCH),x86_64)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
ifdef TARGET_OVERRIDE
CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
endif
endif
ifeq ($(TARGET_ARCH),ppc64le)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
endif
ifeq ($(HOST_ARCH),ppc64le)
CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
endif
CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
ifeq ("$(CUDALIB)","")
$(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<)
SAMPLE_ENABLED := 0
else
CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
LIBRARIES += -L$(CUDALIB) -lcuda
endif
endif
ALL_CCFLAGS += --threads 0 --std=c++11
LIBRARIES += -lcudart_static
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= #echo "[#]"
endif
################################################################################
# Target rules
all: build
build: ptxjit $(PTX_FILE)
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
#echo "Sample will be waived due to the above missing dependencies"
else
#echo "Sample is ready - all dependencies have been met"
endif
$(PTX_FILE): kernel.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -rdc=true -o $# -ptx $<
ptxjit.o:ptxjit.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $# -c $<
ptxjit: ptxjit.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $# $+ $(LIBRARIES)
run: build
$(EXEC) ./ptxjit
clean:
rm -f ptxjit ptxjit.o $(PTX_FILE)
clobber: clean
$ make clean
>>> GCC Version is greater or equal to 4.7.0 <<<
rm -f ptxjit ptxjit.o kernel.ptx
$ make
>>> GCC Version is greater or equal to 4.7.0 <<<
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc -m64 --threads 0 --std=c++11 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o ptxjit.o -c ptxjit.cpp
/usr/local/cuda/bin/nvcc -ccbin g++ -m64 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o ptxjit ptxjit.o -L/usr/local/cuda/lib64/stubs -lcuda -lcudart_static
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc -m64 --threads 0 --std=c++11 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -rdc=true -o kernel.ptx -ptx kernel.cu
$ cuda-memcheck ./ptxjit
========= CUDA-MEMCHECK
[CDP Recursion Test (Driver API)] - Starting...
> Using CUDA Device [0]: Tesla V100-PCIE-32GB
> findModulePath <./kernel.ptx>
> initCUDA loading module: <./kernel.ptx>
Loading ptxjit_kernel[] program
CUDA Link Completed in 0.000000ms. Linker Output:
CUDA kernel launched
kernel level 4
kernel level 3
kernel level 2
kernel level 1
========= ERROR SUMMARY: 0 errors
$
On windows, you'd follow the same path, using the above files. Start with the ptxjit project. For simplicity you might want to rename the kernel so that it exactly matches the name of the kernel file used in the ptxjit project.
Here's the detailed steps I followed, using VS2019:
Open the ptxjit solution, on my machine it was here: C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit
Take the ptxjit.cpp code from the above linux version, and use it to replace the contents of ptxjit.cpp in that solution/project.
Change the define statement back to: #define PTX_FILE "ptxjit_kernel64.ptx"
Change the location of the device runtime library to match your machine. Specifically this line: myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "/usr/local/cuda/lib64/libcudadevrt.a", 0, NULL, NULL); needs to be changed to something like myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1\\lib\\x64\\cudadevrt.lib", 0, NULL, NULL);
In the ptxjit_kernel.cu file in that project, replace the contents of that file with the kernel.cu file contents from the above linux version.
In the solution explorer window, right click on the ptxjit_kernel.cu file and select "Properties". In the "Configuration Properties" pane on the left, expand the CUDA C/C++ section, and select "Common". In the pane on the right, change the "Generate Relocatable Device Code" option from No to Yes. Click "OK".
In the same solution explorer window, right click on the ptxjit project, and select properties. Go into configuration properties...CUDA Linker...General, and change "Perform Device Link" from Yes to No. Click "OK".
Select Build...Rebuild Solution
When I do that, I get build console output like this:
1>------ Rebuild All started: Project: ptxjit, Configuration: Debug x64 ------
1>Compiling CUDA source file ptxjit_kernel.cu...
1>
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\nvcc.exe" -gencode=arch=compute_35,code=\"compute_35,compute_35\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.26.28801\bin\HostX86\x64" -x cu -rdc=true -I./ -I../../common/inc -I./ -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\/include" -I../../common/inc -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include" -G --keep-dir x64\Debug -maxrregcount=0 --machine 64 -ptx -cudart static -Xcompiler "/wd 4819" -o data/ptxjit_kernel64.ptx "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\ptxjit_kernel.cu"
1>CUDACOMPILE : nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
1>ptxjit_kernel.cu
1>Done building project "ptxjit_vs2019.vcxproj".
1>ptxjit.cpp
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\ptxjit.cpp(318,41): warning C4312: 'type cast': conversion from 'long' to 'void *' of greater size
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\ptxjit.cpp(324,41): warning C4312: 'type cast': conversion from 'long' to 'void *' of greater size
1>LINK : ..\..\bin\win64\Debug\\ptxjit.exe not found or not built by the last incremental link; performing full link
1> Creating library ../../bin/win64/Debug/ptxjit.lib and object ../../bin/win64/Debug/ptxjit.exp
1>ptxjit_vs2019.vcxproj -> C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\../../bin/win64/Debug/ptxjit.exe
1>Done building project "ptxjit_vs2019.vcxproj".
========== Rebuild All: 1 succeeded, 0 failed, 0 skipped ==========
At that point we can go to the indicated location of the exe:
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\../../bin/win64/Debug/ptxjit.exe
and run it in a command console. When I do that I see output like this:
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>ptxjit.exe
[CDP Recursion Test (Driver API)] - Starting...
> Using CUDA Device [0]: Quadro P4000
sdkFindFilePath <ptxjit_kernel64.ptx> in ./
...
sdkFindFilePath <ptxjit_kernel64.ptx> in ../../../6_Advanced/ptxjit/data/
> findModulePath <../../../6_Advanced/ptxjit/data/ptxjit_kernel64.ptx>
> initCUDA loading module: <../../../6_Advanced/ptxjit/data/ptxjit_kernel64.ptx>
Loading ptxjit_kernel[] program
CUDA Link Completed in -107374176.000000ms. Linker Output:
CUDA kernel launched
kernel level 4
kernel level 3
kernel level 2
kernel level 1
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>
Notes:
The C4312 warnings in compilation above are in the original project and can be removed by switching from long to long long on the associated lines. This is not an actual problem.
The extended sequence of sdkFindFilePath messages printed at runtime can be shortened by copying the ptx file from its location to the location of the exe file. The final sdkFindFilePath output will tell you where it found the ptx file.
Responding to a question posted under my other answer:
is there a way to avoid this "jit linking during runtime" process while still being with Driver API interface
Yes. (I'm providing a separate answer because I ran into the character limit on my previous answer).
In this case, we want to create a fatbin object rather than ptx, during the compilation of the kernel code itself. This fatbin needs to be compiled with -rdc=true as you would expect for the dynamic parallelism, and also needs to be device-linked, together with the CUDA device runtime library.
The host side mechanics in this case are simpler, since we don't need any of the linking steps. The CUDA sample code that seems to be relatively close to this flow is vectorAddDrv so I will start with that code/sample project in order to demonstrate this.
Here is the linux version:
$ cat vectorAdd_kernel.cu
#include <cstdio>
extern "C" __global__ void k(int N)
{
printf("kernel level %d\n", N);
if ((N > 1) && (threadIdx.x == 0)) k<<<1,1>>>(N-1);
}
$ cat vectorAddDrv.cpp
// Includes
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <cuda.h>
// includes, project
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>
// includes, CUDA
#include <builtin_types.h>
using namespace std;
// Variables
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction vecAdd_kernel;
// Functions
bool findModulePath(const char *, string &, char **, string &);
//define input fatbin file
#ifndef FATBIN_FILE
#define FATBIN_FILE "vectorAdd_kernel64.fatbin"
#endif
// Host code
int main(int argc, char **argv)
{
printf("Linked CDP demo (Driver API)\n");
int N = 4, devID = 0;
// Initialize
checkCudaErrors(cuInit(0));
cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// Create context
checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
// first search for the module path before we load the results
string module_path;
std::ostringstream fatbin;
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin))
{
exit(EXIT_FAILURE);
}
else
{
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
}
if (!fatbin.str().size())
{
printf("fatbin file empty. exiting..\n");
exit(EXIT_FAILURE);
}
// Create module from binary file (FATBIN)
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
// Get function handle from module
checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "k"));
// Grid/Block configuration
int threadsPerBlock = 1;
int blocksPerGrid = 1;
void *args[] = { &N };
// Launch the CUDA kernel
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1,
threadsPerBlock, 1, 1,
0,
NULL, args, NULL));
checkCudaErrors(cuCtxSynchronize());
exit(EXIT_SUCCESS);
}
$ cat Makefile
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I$(CUDA_PATH)/samples/common/inc
LIBRARIES :=
################################################################################
FATBIN_FILE := vectorAdd_kernel${TARGET_SIZE}.fatbin
#Detect if installed version of GCC supports required C++11
ifeq ($(TARGET_OS),linux)
empty :=
space := $(empty) $(empty)
GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
#Create version number without "."
GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
# Make sure the version number has at least 3 decimals
GCCVERSION += 00
# Remove spaces from the version number
GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
#$(warning $(GCCVERSION))
IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)
ifeq ($(IS_MIN_VERSION), 1)
$(info >>> GCC Version is greater or equal to 4.7.0 <<<)
else
$(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<)
SAMPLE_ENABLED := 0
endif
endif
# Gencode arguments
SMS ?= 52 60 61 70 75 80 86
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
ifeq ($(SMS),)
# Generate PTX code from SM 35
GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35
endif
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
ifeq ($(TARGET_OS),darwin)
ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
else
ifeq ($(TARGET_ARCH),x86_64)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
ifdef TARGET_OVERRIDE
CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
endif
endif
ifeq ($(TARGET_ARCH),ppc64le)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
endif
ifeq ($(HOST_ARCH),ppc64le)
CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
endif
CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
ifeq ("$(CUDALIB)","")
$(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<)
SAMPLE_ENABLED := 0
else
CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
LIBRARIES += -L$(CUDALIB) -lcuda
endif
endif
ALL_CCFLAGS += --threads 0 --std=c++11
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= #echo "[#]"
endif
################################################################################
# Target rules
all: build
build: vectorAddDrv $(FATBIN_FILE)
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
#echo "Sample will be waived due to the above missing dependencies"
else
#echo "Sample is ready - all dependencies have been met"
endif
$(FATBIN_FILE): vectorAdd_kernel.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -rdc=true -lcudadevrt -dlink -o $# -fatbin $<
vectorAddDrv.o:vectorAddDrv.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $# -c $<
vectorAddDrv: vectorAddDrv.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $# $+ $(LIBRARIES)
run: build
$(EXEC) ./vectorAddDrv
clean:
rm -f vectorAddDrv vectorAddDrv.o $(FATBIN_FILE)
clobber: clean
$ make clean
>>> GCC Version is greater or equal to 4.7.0 <<<
rm -f vectorAddDrv vectorAddDrv.o vectorAdd_kernel64.fatbin
$ make
>>> GCC Version is greater or equal to 4.7.0 <<<
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc -m64 --threads 0 --std=c++11 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -o vectorAddDrv.o -c vectorAddDrv.cpp
/usr/local/cuda/bin/nvcc -ccbin g++ -m64 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -o vectorAddDrv vectorAddDrv.o -L/usr/local/cuda/lib64/stubs -lcuda
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc -m64 --threads 0 --std=c++11 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -rdc=true -lcudadevrt -dlink -o vectorAdd_kernel64.fatbin -fatbin vectorAdd_kernel.cu
$ cuda-memcheck ./vectorAddDrv
========= CUDA-MEMCHECK
Linked CDP demo (Driver API)
> Using CUDA Device [0]: Tesla V100-PCIE-32GB
> findModulePath found file at <./vectorAdd_kernel64.fatbin>
> initCUDA loading module: <./vectorAdd_kernel64.fatbin>
kernel level 4
kernel level 3
kernel level 2
kernel level 1
========= ERROR SUMMARY: 0 errors
$
On Windows/VS 2019/CUDA 11.1, I followed these steps:
Open the vectorAddDrv project/solution, on my machine it was in: C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv
Replace the code in the vectorAddDrv.cpp file with the code from the same file above in the linux example.
Replace the code in the vectorAdd_kernel.cu file with the code from the same file above in the linux example.
In the solution explorer pane on the left, right-click on the vectorAdd_kernel.cu file, and open the properties. Then in Configuration Properties...CUDA C/C++...Common change "Generate Relocatable Device Code" from No to Yes. Then in Configuration Properties...CUDA C/C++...Command Line add -dlink. Also make sure that Configuration Properties...CUDA C/C++...Device...Code Generation matches the device(s) you want to run on. Click "OK".
In the same solution explorer pane on the left, right click on the vectorAddDrv project, select Properties, then in Configuration Properties...CUDA Linker...General change "Perform Device Link" from Yes to No. Click "OK".
Select Build...Rebuild Solution.
When I do that I see console build output like this:
1>------ Rebuild All started: Project: vectorAddDrv, Configuration: Debug x64 ------
1>Compiling CUDA source file vectorAdd_kernel.cu...
1>
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\nvcc.exe" -gencode=arch=compute_61,code=sm_61 --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.26.28801\bin\HostX86\x64" -x cu -rdc=true -I./ -I../../common/inc -I./ -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\/include" -I../../common/inc -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include" -G --keep-dir x64\Debug -maxrregcount=0 --machine 64 -fatbin -cudart static -dlink -Xcompiler "/wd 4819" -o data/vectorAdd_kernel64.fatbin "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv\vectorAdd_kernel.cu"
1>vectorAdd_kernel.cu
1>vectorAddDrv.cpp
1>LINK : ..\..\bin\win64\Debug\\vectorAddDrv.exe not found or not built by the last incremental link; performing full link
1>vectorAddDrv_vs2019.vcxproj -> C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv\../../bin/win64/Debug/vectorAddDrv.exe
========== Rebuild All: 1 succeeded, 0 failed, 0 skipped ==========
If we then open a command prompt and navigate to the indicated location for the executable, and run it, I see:
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>vectorAddDrv
Linked CDP demo (Driver API)
> Using CUDA Device [0]: Quadro P4000
sdkFindFilePath <vectorAdd_kernel64.fatbin> in ./
...
sdkFindFilePath <vectorAdd_kernel64.fatbin> in ../../../0_Simple/vectorAddDrv/data/
> findModulePath found file at <../../../0_Simple/vectorAddDrv/data/vectorAdd_kernel64.fatbin>
> initCUDA loading module: <../../../0_Simple/vectorAddDrv/data/vectorAdd_kernel64.fatbin>
kernel level 4
kernel level 3
kernel level 2
kernel level 1
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>
One of the notes in the other answer applies here as well: The extended sequence of sdkFindFilePath messages printed at runtime can be shortened by copying the fatbin file from its location to the location of the exe file. The final sdkFindFilePath output will tell you where it found the fatbin file.
Related
I have to a build cuda library with meson and the flags I need to provide are:
NVCCFLAGS = -ccbin g++ --threads 0 -std=c++17 -c -arch=sm_86 -gencode arch=compute_61,code=sm_61 -m64
the meson.build is
# Builds lib-gpu as a dependency called "lib_gpu_dep".
add_languages('cuda')
subdir('src')
project_source_files = [
source_files_src,
]
build_args_common = [
'-ccbin',
'g++',
'-c',
'-arch=sm_86',
'-gencode arch=compute_61,code=sm_61',
'-m64',
]
if (get_option('unit_test'))
build_args = [build_args_common,
'-g',
'-G',
'-DDEBUG=1'
]
else
build_args = [
build_args_common,
'-src-in-ptx',
' -keep',
# '--keep-dir=build/$(BUILD_TYPE)',
' --use_fast_math',
' -lineinfo'
]
endif
cuda_src_dir = '/usr/local/cuda'
inc_dir_cuda = include_directories(cuda_src_dir + '/include')
lib_gpu_incdir = include_directories('inc', is_system : true)
lib_gpu_name = 'lib_gpu'
lib_gpu = static_library(
lib_gpu_name,
project_source_files,
cuda_args : build_args,
gnu_symbol_visibility : 'default',
include_directories : [lib_gpu_incdir, inc_dir_cuda])
lib_gpu_dep = declare_dependency(
include_directories : lib_gpu_incdir,
link_with : lib_gpu
)
in the generated ninja file I have
ARGS = -Ilib-gpu/liblib_gpu.a.p -Xcompiler=-Wall,-Winvalid-pch,-Wnon-virtual-dtor,-Wextra,-Wpedantic -Xcompiler=-fPIC -I/usr/local/cuda/include -I/usr/local/cuda/include -isystem=../lib-gpu/inc -ccbin g++ -c -arch=sm_86 '-gencode$ arch=compute_61,code=sm_61' -m64 -Wno-pedantic -g -G -DDEBUG=1 -I../lib-gpu -Ilib-gpu -Ilib-gpu/liblib_gpu.a.p
but the flags '-gencode$ arch=compute_61,code=sm_61' is not correct
and in order to compile I need to modify it with a script to
-gencode arch=compute_61,code=sm_61
is there a way to solve this problem?
the version used is
meson --version
0.59.1
instead to use
'-gencode arch=compute_61,code=sm_61',
is it possible to write
'-gencode=arch=compute_61,code=sm_61',
in this case the flag generated by meson is correct.
I have a problem with my C project built under Debian Jessie. After doing some stuffs now I need to work with MySQL so i download the library and try to update my Makefile.
This is my Makefile right now
CC = gcc
CFLAGS = -Wall -Wextra
LDFLAGS = -lbluetooth -lpthread -lmysqlclient
LFLAGS = -lm
INC = -I/usr/include/mysql
SOURCES = stb.c btscan.c and.c gima.c database.c
OBJECTS = $(SOURCES:.c=.o)
EXECUTABLE = stb
$(EXECUTABLE): $(OBJECTS)
$(CC) $(CFLAGS) $(OBJECTS) -o $# $(LFLAGS) $(LDFLAGS)
stb.o: btscan.h and.h gima.h database.h
btscan.o: btscan.h
and.o: and.h
gima.o: gima.h
database.o: database.h
clean:
#rm -f *.o *.out stb
and this is the files where I want to use the library
stb.c
#include "btscan.h"
#include "and.h"
#include "gima.h"
#include "database.h"
struct device* bt_devices;
struct device* ble_devices;
int main(void) {
//-------------------------------- DATABASE CONNECTION ----------------
//MYSQL *con = mysql_init(NULL);
....... }
and finally database.h
#ifndef _DATABASE_H
#define _DATABASE_H
#include <my_global.h>
#include <mysql.h>
#endif
When I try to make i receive "Fatal error : my_global.h No such file or directory". However if i try MySQL on a single test file and compiling it with
gcc -o test test.c -I/usr/include/mysql -lmysqlclient
it works. Where I made a mistake?
Thanks in advance
If you'd shown us the output of the compiler, or examine it yourself, you'll immediately see that what you're running on the command line is not the same at all as what make is running: in particular make is not adding the -I/usr/include/mysql flag to the command line.
That's because in your makefile you set:
INC = -I/usr/include/mysql
but nowhere in your makefile (as you've shown it) is the variable INC actually used, so this is essentially a no-op.
Since you're using the standard GNU make compilation rules, you should be setting standard GNU make variables:
CPPFLAGS = -I/usr/include/mysql
I have a large library project that contains both cpp and cu source files. I'd like to compile it in a standalone shared object, but since I have some device functions I decided to split it in a shared object containing the majority of the functions and an archive file containing the device functions only. Here's part of the Makefile I wrote for it - all the (non-template) device functions have been put in the file device.cu:
Makefile
LIB_NAME = libexample.so
CUDA = /usr/local/cuda/include
CXX = g++
CXXFLAGS = -c -O2 -fPIC -Wall -I. -I./code -I./code/header -I$(CUDA)
SOURCES = src1.cpp src2.cpp src3.cpp
OBJECTS = $(SOURCES:.cpp=.o)
NVCC = nvcc
CU_SOURCES = src1_cu.cu src2_cu.cu src3_cu.cu device.cu
CU_OBJECTS = $(CU_SOURCES:.cu=.o)
BUILDDIR = .
VPATH = code/src/common_src code/src/src_CUDA
DEVICE_LINK = dev_link.o
GENCODE_FLAGS := -gencode arch=compute_20,code=sm_20
NVCCFLAGS = -x cu -O2 --compiler-options '-fPIC' $(GENCODE_FLAGS) -I. -I./code -I./code/header -I$(CUDA) -dc
NVCCLINK = --compiler-options '-fPIC' $(GENCODE_FLAGS) -dlink
all: lib/$(LIB_NAME)
lib/$(LIB_NAME): $(OBJECTS) $(CU_OBJECTS) $(DEVICE_LINK)
$(CXX) -shared -Wl,-soname,libexample.so $^ -o $#
ar rcs lib/device.a device.o
%.o: %.cpp
$(CXX) $(CXXFLAGS) $< -o $#
%.o: %.cu
$(NVCC) $(NVCCFLAGS) $< -o $#
$(DEVICE_LINK): $(CU_OBJECTS)
$(NVCC) $(NVCCLINK) $^ -o $#
I decided to change my build system and I switched to CMake to produce both Makefiles and Visual Studio projects. It's obvious how to write a working CMakeLists.txt file without separable compilation, but I couldn't find a solution that works in my case (I read some proposed solutions here on S.O. but they don't seem to work for me!). Can you help me to write said CMakeLists.txt file? Here's what I did so far:
CMakeLists.txt
cmake_minimum_required(VERSION 2.8.10)
# Set Library & project name
set(LIB_NAME example)
project(${LIB_NAME})
message("LIBRARY ${LIB_NAME}")
enable_language(CXX)
# Check if CUDA is installed on this system
find_package(CUDA REQUIRED)
# Set source directories
set(COMMON_SRCS_DIR ${CMAKE_SOURCE_DIR}/code/src/common_src)
set(CUDA_SRCS_DIR ${CMAKE_SOURCE_DIR}/code/src/src_CUDA)
# Set source files
set(COMMON_SRCS ${COMMON_SRCS_DIR}/src1.cpp
${COMMON_SRCS_DIR}/src2.cpp
${COMMON_SRCS_DIR}/src3.cpp
)
# Set CUDA device library name
set(DEVICE_LIB "device")
# Set CUDA objects
cuda_compile(SRC1_CU_O ${CUDA_SRCS_DIR}/src1_cu.cu)
cuda_compile(SRC2_CU_O ${CUDA_SRCS_DIR}/src2_cu.cu)
cuda_compile(SRC3_CU_O ${CUDA_SRCS_DIR}/src3_cu.cu)
cuda_compile(DEVICE_CU_O ${CUDA_SRCS_DIR}/device.cu)
# Set header file directories
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${CMAKE_SOURCE_DIR}/code)
include_directories(${CMAKE_SOURCE_DIR}/code/header)
include_directories(${CUDA_INCLUDE_DIRS})
# Get CUDA compute capability - contains CUDA_GENCODE define
include(CudaParams.cmake)
# Set include stuff
cuda_include_directories(${CMAKE_SOURCE_DIR})
cuda_include_directories(${CMAKE_SOURCE_DIR}/code)
cuda_include_directories(${CMAKE_SOURCE_DIR}/code/header)
cuda_include_directories(${CUDA_INCLUDE_DIRS})
set(CUDA_SEPARABLE_COMPILATION ON)
# Set compiler flags
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_GENCODE} --compiler-options '-fPIC' -O2")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -march=native -O2")
# Generate main target
cuda_add_library(${LIB_NAME} SHARED ${COMMON_SRCS}
${SRC1_CU_O} ${SRC2_CU_O} ${SRC3_CU_O} ${DEVICE_CU_O})
cuda_add_library(${DEVICE_LIB} STATIC ${DEVICE_CU_O})
# Install instructions
INSTALL(TARGETS ${LIB_NAME}
LIBRARY DESTINATION ${CMAKE_SOURCE_DIR}/lib
ARCHIVE DESTINATION ${CMAKE_SOURCE_DIR}/lib
)
As you can see, there's no dev_link.o mention in the CMakeLists.txt file because I simply don't know where I could put it!
I try to write a GPU program using CUDA. Below is my function:
__global__ static void
histogram_gpu(int * hist_out, unsigned char * img_in, int img_size, int nbr_bin){
int i;
const int bid = blockIdx.x;
const int tid = threadIdx.x;
// for ( i = 0; i < img_size; i ++){
// hist_out[img_in[i]] ++;
// }
for (i = bid*THREAD_NUM + tid; i < img_size; i += BLOCK_NUM*THREAD_NUM) {
hist_out[img_in[i]]++;
}
}
When I call this function in the main function, there's an error occurs:
error: ‘blockIdx’ was not declared in this scope
I use the CUDA 5.0 on my MAC machine, and below is the Makefile:
OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:])
# Flags to detect 32-bit or 64-bit OS platform
OS_SIZE = $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/")
OS_ARCH = $(shell uname -m | sed -e "s/i386/i686/")
# These flags will override any settings
ifeq ($(i386),1)
OS_SIZE = 32
OS_ARCH = i686
endif
ifeq ($(x86_64),1)
OS_SIZE = 64
OS_ARCH = x86_64
endif
# Flags to detect either a Linux system (linux) or Mac OSX (darwin)
DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
# Location of the CUDA Toolkit binaries and libraries
CUDA_PATH ?= /Developer/NVIDIA/CUDA-5.0
CUDA_INC_PATH ?= $(CUDA_PATH)/include
CUDA_BIN_PATH ?= $(CUDA_PATH)/bin
ifneq ($(DARWIN),)
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib
else
ifeq ($(OS_SIZE),32)
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib
else
CUDA_LIB_PATH ?= $(CUDA_PATH)/lib64
endif
endif
# Common binaries
NVCC ?= $(CUDA_BIN_PATH)/nvcc
GCC ?= g++
# Extra user flags
EXTRA_NVCCFLAGS ?=
EXTRA_LDFLAGS ?=
# CUDA code generation flags
GENCODE_SM10 := -gencode arch=compute_10,code=sm_10
GENCODE_SM20 := -gencode arch=compute_20,code=sm_20
GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35
GENCODE_FLAGS := $(GENCODE_SM10) $(GENCODE_SM20) $(GENCODE_SM30)
GENCODE_FLAGS := $(GENCODE_SM10) $(GENCODE_SM20) $(GENCODE_SM30)
# OS-specific build flags
# ifneq ($(DARWIN),)
# LDFLAGS := -Xlinker -rpath $(CUDA_LIB_PATH) -L$(CUDA_LIB_PATH) -lcudart -lcublas -lcuda -lcufft -ltlshook
# CCFLAGS := -arch $(OS_ARCH)
# else
# ifeq ($(OS_SIZE),32)
# LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart
# CCFLAGS := -m32
# else
LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart -lcublas -lcuda -lcufft -ltlshook
CCFLAGS := -m64
# endif
# endif
# OS-architecture specific flags
ifeq ($(OS_SIZE),32)
NVCCFLAGS := -m32
else
NVCCFLAGS := -m64
endif
# Debug build flags
ifeq ($(dbg),1)
CCFLAGS += -g
NVCCFLAGS += -g -G
TARGET := debug
else
TARGET := release
endif
# Common includes and paths for CUDA
INCLUDES := -I$(CUDA_INC_PATH) -I. -I.. -I../../common/inc
# Add source files here
EXECUTABLE := 5kk70-assignment-gpu
# Cuda source files (compiled with cudacc)
CUFILES :=
# C/C++ source files (compiled with gcc / c++)
CCFILES := main.cpp histogram-equalization.cu contrast-enhancement.cu
################################################################################
# Rules and targets
# All Phony Targets
.PHONY : everything clean
# Default starting position
everything : $(EXECUTABLE)
# Common includes and paths for CUDA
# INCLUDES := -I$(CUDA_INC_PATH) -I. -I.. -I$(CUDA_INC_PATH)/samples/common/inc/
# Clean OBJECTS
clean :
rm -f $(EXECUTABLE) $(OBJ)
$(EXECUTABLE) : $(CCFILES)
$(NVCC) -o $# $^ $(INCLUDES) $(LDFLAGS) $(EXTRA_LDFLAGS) $(GENCODE_FLAGS)
What's the problem with my code?
This problem will occur when you are writing cuda code that is inside a file named .cpp, and you go to compile it. Rename the file to .cu, and the compiler will not complain at you.
In a bazel build rule, try putting the .cu.cc file in the hdrs rather than srcs.
Hi I'm developing CGI which is written in C and trying to use Mysql.
When I try to use it, I got those undefined reference errors for symbols that start with mysql_, such as those shown here:
/usr/bin/ld: Warning: type of symbol `password' changed from 2 to 1 in ../../lib /libcgi_module.a(users.o)
../../lib/libcgi_module.a(users.o): In function `save':
/home/jitcomm/intern_GUI/jit24_test_v2/cgi_src/cgi_module/users.c:18: multiple definition of `save'
../../lib/libcgi_module.a(mode.o):/home/jitcomm/intern_GUI/jit24_test_v2/cgi_src/cgi_module/mode.c:56: first defined here
../../lib/libcgi_module.a(users.o): In function `saveUser':
users.c:(.text+0x192): undefined reference to `mysql_init'
users.c:(.text+0x1e4): undefined reference to `mysql_real_connect'
users.c:(.text+0x1f5): undefined reference to `mysql_error'
users.c:(.text+0x267): undefined reference to `mysql_query'
users.c:(.text+0x278): undefined reference to `mysql_error'
users.c:(.text+0x2ab): undefined reference to `mysql_close'
So I write some sample code and I can run well when I use my program with this line
gcc -o saveUser $(mysql_config --cflags) saveUser.c $(mysql_config --libs)
It works well.
So next step, I try to put my coding in CGI.
I still got those undefined reference again.
Here is my Makefile.basic
CC=/usr/bin/gcc
#CC=powerpc-linux-gcc
CP=/usr/bin/cp
CFLAGS = -g -Wall $(mysql_config --cflags) $(mysql_config --libs)
www=/var/www
htdocs=/htdocs
cgi_bin=/cgi-bin
config=/etc/apache2/sites-available/default
What should I do ? Where it goes wrong ? Help me please. I have been searching on the internet for a week but still can't get the solution. I guess it's in makefile.basic and the linking to mysql is wrong.
Thanks
Update :
here is my top level Makefile
ROOT=.
CURDIR=$(shell /bin/pwd)
JITCOMM_INSTALL=$(ROOT)/install
include $(ROOT)/Makefile.basic
#set root directory
SUB_DIRS = cgi_src
SUB_DIRS += check_update
SUB_DIRS += loadconfig
SUB_DIRS += keepalive
SUB_DIRS += script
SUB_DIRS += server
SUB_DIRS += startproxy
SUB_DIRS += net_stats
#SUB_DIRS += ../sniffer_gui
#SUB_DIRS += java
all:
ifneq ($(SUB_DIRS), )
#for i in $(SUB_DIRS) ; do if [ ! -d $(CURDIR)/$${i} ]; then continue; fi; \
cd $(CURDIR)/$${i}; make || exit; cd $(CURDIR); done
endif
clean:
#rm -f $(ROOT)/lib/*
#rm -rf $(JITCOMM_INSTALL)
ifneq ($(SUB_DIRS), )
#for i in $(SUB_DIRS) ; do if [ ! -d $(CURDIR)/$${i} ]; then continue; fi; \
cd $(CURDIR)/$${i}; make clean || exit; cd $(CURDIR); done
endif
install: all
#rm -rf $(JITCOMM_INSTALL)
#mkdir $(JITCOMM_INSTALL)
#mkdir $(JITCOMM_INSTALL)/etc
#mkdir $(JITCOMM_INSTALL)/etc/acpro
#mkdir $(JITCOMM_INSTALL)/etc/syslog-ng
#mkdir $(JITCOMM_INSTALL)/etc/apache2
#mkdir $(JITCOMM_INSTALL)/etc/apache2/sites-available
#mkdir $(JITCOMM_INSTALL)/var
#mkdir $(JITCOMM_INSTALL)/var/www
#mkdir $(JITCOMM_INSTALL)/var/www/cgi-bin
Here is my Makefile in cgi-src folder
ROOT=../
CURDIR=$(shell /bin/pwd)
include $(ROOT)/Makefile.basic
#set root directory
SUB_DIRS = util
SUB_DIRS += cgi_util
SUB_DIRS += cgi_module
#Must be last
SUB_DIRS += cgi_main
all:
ifneq ($(SUB_DIRS), )
#for i in $(SUB_DIRS) ; do if [ ! -d $(CURDIR)/$${i} ]; then continue; fi; \
cd $(CURDIR)/$${i}; make || exit; cd $(CURDIR); done
endif
clean:
ifneq ($(SUB_DIRS), )
#for i in $(SUB_DIRS) ; do if [ ! -d $(CURDIR)/$${i} ]; then continue; fi; \
cd $(CURDIR)/$${i}; make clean || exit; cd $(CURDIR); done
endif
install:
ifneq ($(SUB_DIRS), )
#for i in $(SUB_DIRS) ; do if [ ! -d $(CURDIR)/$${i} ]; then continue; fi; \
cd $(CURDIR)/$${i}; make install || exit; cd $(CURDIR); done
endif
Here is my Makefile.basic
CC=/usr/bin/gcc
#CC=powerpc-linux-gcc
CP=/usr/bin/cp
CFLAGS=-g -Wall $(shell mysql_config --cflags) $(shell mysql_config --libs)
www=/var/www
htdocs=/htdocs
cgi_bin=/cgi-bin
config=/etc/apache2/sites-available/default
INSTALL_DIR=$(pwd)/.install
Latest Update :
Makefile in cgi_module
#set common variant
.SUFFIXES: .c .h .o
ROOT=../../
include $(ROOT)/Makefile.basic
#LINK_NAME = changepass.cgi login.cgi network.cgi reboot.cgi shutdown.cgi
LINK_NAME = login.cgi
INCS = -I../include
INCS += -I../../cgi_src/util/include
OBJES=../../lib/
TARGET = libcgi_module.a
#don't change below
LIB_OBJS = $(patsubst %.c, %.o, $(wildcard *.c))
CLEAN_OBJS = $(LIB_OBJS) $(TARGET)
INST_OBJ = $(TARGET)
%.o: %.c
$(CC) -c $(CFLAGS) $(INCS) -o $# $<
all: $(TARGET)
$(TARGET): $(LIB_OBJS)
$(AR) -vsur $# $^
#cp $(TARGET) $(OBJES)
clean:
-#rm -rf $(CLEAN_OBJS)
install:
Try this
CC=/usr/bin/gcc
#CC=powerpc-linux-gcc
CP=/usr/bin/cp
CFLAGS = -g -Wall $(shell mysql_config --cflags) $(shell mysql_config --libs)
foo:
#echo "CFLAGS are $(CFLAGS)"
and launch make foo