I'm working with dynamic parallelism (and cublas) in one of my kernels and want to provide a fallback kernel for sm_20. In maxentropy_cuda.cu I wrote both kernels and used __CUDA_ARCH__ to compile the dynamic parallelism kernel only for architectures >= 3.5. This works fine.
Part of the Makefile:
nvcc $(NVCCFLAGS) -gencode arch=compute_35,code=sm_35 -gencode arch=compute_20,code=sm_20 $(CINCL) -M maxentropy_cuda.cu -o maxentropy_cuda.d
nvcc --device-c $(NVCCFLAGS) -gencode arch=compute_35,code=sm_35 -gencode arch=compute_20,code=sm_20 -x cu maxentropy_cuda.cu -o maxentropy_cuda.o
When I link this to the kernels in another file:
nvcc --cudart static --relocatable-device-code=true -link -gencode arch=compute_35,code=sm_35 -gencode arch=compute_20,code=sm_20 $(LIBPATHS) -o main main.o selgen.o maxentropy.o maxentropy_omp.o maxentropy_cuda.o maxentropy_kernels.o $(OBJINFRA) $(LIBS) -lcublas_device -lcudadevrt
I get the following error:
nvlink fatal : could not find compatible device code in /opt/cuda/lib64/libcublas_device.a
make: *** [main] Error 255
Of course I don't need libcublas_device for the fallback kernel...
Is there a way to get both compute-capabilities in one binary? (I'm using CUDA 5.5)
EDIT: Example (haven't tested the output...):
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
__global__ void calc_dev(double* u, double* s, double* r) {
#if __CUDA_ARCH__ < 350
    printf("should not have been called - compiled for the wrong CUDA architecture");
#else
    cublasHandle_t cnpHandle;
    cublasStatus_t status = cublasCreate(&cnpHandle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("error while initializing cublas\n");
        return;
    }
    status = cublasDdot(cnpHandle, 5, u, 1, s, 1, r);
    cudaDeviceSynchronize();
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("cublas error: u x s\n");
        return;
    }
#endif
}
void calc_host(double* u, double* s, double* r) {
    cublasHandle_t cnpHandle;
    cublasStatus_t status = cublasCreate(&cnpHandle);
    cublasSetPointerMode(cnpHandle, CUBLAS_POINTER_MODE_DEVICE);
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("error while initializing cublas\n");
        return;
    }
    status = cublasDdot(cnpHandle, 5, u, 1, s, 1, r);
    cudaThreadSynchronize();
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("cublas error: u x s\n");
        return;
    }
}
int main(int argc, char** argv) {
    const int n = 5;
    double u[n] = {0, 1, 2, 4, 8};
    double s[n] = {1, 0.64570312500000004,
                   0.44203125000000004, 0.65804687500000003, 0.71976562500000008};
    double r = 0.0;
    double *dev_s, *dev_u, *dev_r;
    cudaMalloc((void**)&dev_s, sizeof(double)*n);
    cudaMalloc((void**)&dev_u, sizeof(double)*n);
    cudaMalloc((void**)&dev_r, sizeof(double));
    cudaMemcpy(dev_s, s, n*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_u, u, n*sizeof(double), cudaMemcpyHostToDevice);
#if __CUDA_ARCH__ >= 350
    calc_dev<<<1,1>>>(dev_s, dev_u, dev_r);
#else
    calc_host(dev_s, dev_u, dev_r);
#endif
    cudaMemcpy(&r, dev_r, sizeof(double), cudaMemcpyDeviceToHost);
    printf("%.3f\n", r);
    return 0;
}
Nsight build log:
make all
Building file: ../main.cu
Invoking: NVCC Compiler
/usr/local/cuda-5.5/bin/nvcc -G -g -O0 -gencode arch=compute_20,code=sm_20 -gencode arch=compute_35,code=sm_35 -odir "" -M -o "main.d" "../main.cu"
/usr/local/cuda-5.5/bin/nvcc --device-c -G -O0 -g -gencode arch=compute_20,code=sm_20 -gencode arch=compute_35,code=sm_35 -x cu -o "main.o" "../main.cu"
Finished building: ../main.cu
Building target: test_sm_compatibility
Invoking: NVCC Linker
/usr/local/cuda-5.5/bin/nvcc --cudart static --relocatable-device-code=true -gencode arch=compute_20,code=sm_20 -gencode arch=compute_35,code=sm_35 -link -o "test_sm_compatibility" ./main.o -lcublas -lcublas_device
nvlink fatal : could not find compatible device code in /usr/local/cuda-5.5/bin/../targets/x86_64-linux/lib/libcublas_device.a
make: *** [test_sm_compatibility] Error 255
This is possible now, as of CUDA 6.0.
The example you have posted compiles as-is using the compile commands you have shown.
The only difficulty is that there are a variety of nvlink warnings that will have to be ignored:
nvlink warning : SM Arch ('sm_20') not found in '/usr/local/cuda/bin/..//lib64/libcublas_device.a'
However, a proper executable is built correctly.
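One note on the example itself: __CUDA_ARCH__ is only defined while compiling device code, so the host-side #if __CUDA_ARCH__>=350 in main() always takes the #else branch and calc_dev is never launched. The usual approach is a runtime query; a minimal, untested sketch (my addition, not part of the original code, assuming device 0 is the target GPU):

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0); // compute capability of device 0
if (prop.major > 3 || (prop.major == 3 && prop.minor >= 5)) {
    calc_dev<<<1,1>>>(dev_s, dev_u, dev_r); // dynamic parallelism + device-side cublas
} else {
    calc_host(dev_s, dev_u, dev_r);         // host-side cublas fallback
}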
Related
I'm trying to use nvcc with the simplest possible example, but it doesn't work correctly. I'm compiling and executing the example from https://devblogs.nvidia.com/easy-introduction-cuda-c-and-c/, but my server can't execute the global function. I rewrote the code to get an error message, and I receive the following:
"no kernel image is available for execution on the device"
My GPU is a Quadro 6000 and the cuda version is 9.0.
#include <stdio.h>
#include <cuda_runtime.h>
__global__ void saxpy(int n, float a, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    y[i] = 10.0; //a*x[i] + y[i];
}

int main(int argc, char *argv[])
{
    int N = 120;
    int nDevices;
    float *x, *y, *d_x, *d_y;
    cudaError_t err = cudaGetDeviceCount(&nDevices);
    if (err != cudaSuccess)
        printf("%s\n", cudaGetErrorString(err));
    else
        printf("Number of devices %d\n", nDevices);
    x = (float*)malloc(N*sizeof(float));
    y = (float*)malloc(N*sizeof(float));
    cudaMalloc(&d_x, N*sizeof(float));
    cudaMalloc(&d_y, N*sizeof(float));
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }
    cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
    // Perform SAXPY on 1M elements
    saxpy<<<1, 1>>>(N, 2.0f, d_x, d_y);
    cudaDeviceSynchronize();
    err = cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
    printf("%s\n", cudaGetErrorString(err));
    cudaError_t errSync = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess)
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
    if (errAsync != cudaSuccess)
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
    cudaFree(d_x);
    cudaFree(d_y);
    free(x);
    free(y);
}
Execution command
bash-4.1$ nvcc -o sapx simples_cuda.cu
bash-4.1$ ./sapx
Number of devices 1
no error
Sync kernel error: no kernel image is available for execution on the device
GPUs of compute capability less than 2.0 are only supported by CUDA toolkits of version 6.5 and older.
GPUs of compute capability less than 3.0 (but greater than or equal to 2.0) are only supported by CUDA toolkits of version 8.0 and older.
Your Quadro 6000 is a compute capability 2.0 GPU. This can be determined programmatically with the deviceQuery CUDA sample code, or via a Google search. It is not supported by CUDA 9.0.
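If you prefer a check inside your own program, here is a minimal sketch using the runtime API (it prints the same information deviceQuery reports):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); // query device 0
    printf("Compute capability: %d.%d\n", prop.major, prop.minor);
    return 0;
}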
You should add the compute capability of your video card as a parameter to the nvcc compiler. In my case (Windows/Visual Studio 2017) I set this in the Code Generation field. So, as #einpoklum answered before, add gencode parameters like -gencode arch=${COMPUTE_CAPABILITY},code=${SM_CAPABILITY}, where ${COMPUTE_CAPABILITY} and ${SM_CAPABILITY} belong to the following pairs (you can add them all, as VS2017 does):
{COMPUTE_CAPABILITY},{SM_CAPABILITY}
compute_35,sm_35
compute_37,sm_37
compute_50,sm_50
compute_52,sm_52
compute_60,sm_60
compute_61,sm_61
compute_70,sm_70
compute_75,sm_75
compute_80,sm_80
D:\Program Files\nVidia\CUDA Samples\MySamples\IntroToCUDA_1\IntroToCUDA_1>"D:\Program Files\nVidia\GPU Computing Toolkit\CUDA\v11.0\bin\nvcc.exe" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -gencode=arch=compute_37,code=\"sm_37,compute_37\" -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_52,code=\"sm_52,compute_52\" -gencode=arch=compute_60,code=\"sm_60,compute_60\" -gencode=arch=compute_61,code=\"sm_61,compute_61\" -gencode=arch=compute_70,code=\"sm_70,compute_70\" -gencode=arch=compute_75,code=\"sm_75,compute_75\" -gencode=arch=compute_80,code=\"sm_80,compute_80\" --use-local-env -ccbin "D:\Program Files (x86)\Microsoft Visual Studio\2017\Enterprise\VC\Tools\MSVC\14.16.27023\bin\HostX86\x64" -x cu -I"D:\Program Files\nVidia\GPU Computing Toolkit\CUDA\v11.0\include" -I"D:\Program Files\nVidia\GPU Computing Toolkit\CUDA\v11.0\include" -G --keep-dir x64\Debug -maxrregcount=0 --machine 64 --compile -cudart static -g -D_DEBUG -D_CONSOLE -D_UNICODE -DUNICODE -Xcompiler "/EHsc /W3 /nologo /Od /Fdx64\Debug\vc141.pdb /FS /Zi /RTC1 /MDd " -o x64\Debug\IntroToCUDA_1.cu.obj "D:\Program Files\nVidia\CUDA Samples\MySamples\IntroToCUDA_1\IntroToCUDA_1\IntroToCUDA_1.cu"
You can check the compute capability of your video card with the deviceQuery example found in the CUDA Samples SDK.
Adding to #RobertCrovella's answer:
When compiling with nvcc, you should always set appropriate flags to generate binary kernel images for the microarchitecture / compute capability you intend to run on. For example: -gencode arch=compute_${COMPUTE_CAPABILITY},code=compute_${COMPUTE_CAPABILITY},
with, say, COMPUTE_CAPABILITY=61.
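Applied to the saxpy example above, the compile command would look something like this (a hypothetical value; the Quadro 6000 in the question would need compute capability 2.0 flags on a CUDA 8.0 or older toolkit):

nvcc -gencode arch=compute_20,code=sm_20 -o sapx simples_cuda.cu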
Read nvcc --help for more information on these flags (although, to be honest, it's a bit of a murky subject).
Related
I'm trying to run relocatable device code in two shared libraries, both using CUDA Thrust. Everything runs fine if I stop using Thrust in kernel.cu, which is not an option.
Edit: The program also works if RDC is disabled. Not an option for me either.
It compiles fine but stops with a segfault when run. gdb tells me this:
Program received signal SIGSEGV, Segmentation fault.
0x0000000000422cc8 in cudart::globalState::registerEntryFunction(void**, char const*, char*, char const*, int, uint3*, uint3*, dim3*, dim3*, int*) ()
(cuda-gdb) bt
#0 0x0000000000422cc8 in cudart::globalState::registerEntryFunction(void**, char const*, char*, char const*, int, uint3*, uint3*, dim3*, dim3*, int*) ()
#1 0x000000000040876c in __cudaRegisterFunction ()
#2 0x0000000000402b58 in __nv_cudaEntityRegisterCallback(void**) ()
#3 0x00007ffff75051a3 in __cudaRegisterLinkedBinary(__fatBinC_Wrapper_t const*, void (*)(void**), void*) ()
from /home/mindoms/rdctestmcsimple/libkernel.so
#4 0x00007ffff75050b1 in __cudaRegisterLinkedBinary_66_tmpxft_00007a5f_00000000_16_cuda_device_runtime_compute_52_cpp1_ii_8b1a5d37 () from /home/user/rdctestmcsimple/libkernel.so
#5 0x000000000045285d in __libc_csu_init ()
#6 0x00007ffff65ea50f in __libc_start_main () from /lib64/libc.so.6
Here is my stripped down example (using cmake) that shows the error.
main.cpp:
#include "kernel.cuh"
#include "kernel2.cuh"
int main(){
    Kernel k;
    k.callKernel();
    Kernel2 k2;
    k2.callKernel2();
}
kernel.cuh:
#ifndef __KERNEL_CUH__
#define __KERNEL_CUH__

class Kernel{
public:
    void callKernel();
};

#endif
kernel.cu:
#include "kernel.cuh"
#include <stdio.h>
#include <iostream>
#include <thrust/device_vector.h>
__global__
void thekernel(int *data){
    if (threadIdx.x == 0)
        printf("the kernel says hello\n");
    data[threadIdx.x] = threadIdx.x * 2;
}

void Kernel::callKernel(){
    thrust::device_vector<int> D2;
    D2.resize(11);
    int * raw_ptr = thrust::raw_pointer_cast(&D2[0]);
    printf("Kernel::callKernel called\n");
    thekernel <<< 1, 10 >>> (raw_ptr);
    cudaThreadSynchronize();
    cudaError_t code = cudaGetLastError();
    if (code != cudaSuccess) {
        std::cout << "Cuda error: " << cudaGetErrorString(code) << " after callKernel!" << std::endl;
    }
    for (int i = 0; i < D2.size(); i++)
        std::cout << "Kernel D[" << i << "]=" << D2[i] << std::endl;
}
kernel2.cuh:
#ifndef __KERNEL2_CUH__
#define __KERNEL2_CUH__

class Kernel2{
public:
    void callKernel2();
};

#endif
kernel2.cu:
#include "kernel2.cuh"
#include <stdio.h>
#include <iostream>
#include <thrust/device_vector.h>

__global__
void thekernel2(int *data2){
    if (threadIdx.x == 0)
        printf("the kernel2 says hello\n");
    data2[threadIdx.x] = threadIdx.x * 2;
}

void Kernel2::callKernel2(){
    thrust::device_vector<int> D;
    D.resize(11);
    int * raw_ptr = thrust::raw_pointer_cast(&D[0]);
    printf("Kernel2::callKernel2 called\n");
    thekernel2 <<< 1, 10 >>> (raw_ptr);
    cudaThreadSynchronize();
    cudaError_t code = cudaGetLastError();
    if (code != cudaSuccess) {
        std::cout << "Cuda error: " << cudaGetErrorString(code) << " after callKernel2!" << std::endl;
    }
    for (int i = 0; i < D.size(); i++)
        std::cout << "Kernel2 D[" << i << "]=" << D[i] << std::endl;
}
The cmake file below was used originally, but I get the same problem when I compile "by hand":
nvcc -arch=sm_35 -Xcompiler -fPIC -dc kernel2.cu
nvcc -arch=sm_35 -shared -Xcompiler -fPIC kernel2.o -o libkernel2.so
nvcc -arch=sm_35 -Xcompiler -fPIC -dc kernel.cu
nvcc -arch=sm_35 -shared -Xcompiler -fPIC kernel.o -o libkernel.so
g++ -o main main.cpp libkernel.so libkernel2.so -L/opt/cuda/current/lib64
Adding -cudart shared to every nvcc call as suggested somewhere results in a different error:
warning: Cuda API error detected: cudaFuncGetAttributes returned (0x8)
terminate called after throwing an instance of 'thrust::system::system_error'
what(): function_attributes(): after cudaFuncGetAttributes: invalid device function
Program received signal SIGABRT, Aborted.
0x000000313c432625 in raise () from /lib64/libc.so.6
(cuda-gdb) bt
#0 0x000000313c432625 in raise () from /lib64/libc.so.6
#1 0x000000313c433e05 in abort () from /lib64/libc.so.6
#2 0x00000031430bea7d in __gnu_cxx::__verbose_terminate_handler() () from /usr/lib64/libstdc++.so.6
#3 0x00000031430bcbd6 in std::set_unexpected(void (*)()) () from /usr/lib64/libstdc++.so.6
#4 0x00000031430bcc03 in std::terminate() () from /usr/lib64/libstdc++.so.6
#5 0x00000031430bcc86 in __cxa_rethrow () from /usr/lib64/libstdc++.so.6
#6 0x00007ffff7d600eb in thrust::detail::vector_base<int, thrust::device_malloc_allocator<int> >::append(unsigned long) () from ./libkernel.so
#7 0x00007ffff7d5f740 in thrust::detail::vector_base<int, thrust::device_malloc_allocator<int> >::resize(unsigned long) () from ./libkernel.so
#8 0x00007ffff7d5b19a in Kernel::callKernel() () from ./libkernel.so
#9 0x00000000004006f8 in main ()
CMakeLists.txt (please adjust to your environment):
cmake_minimum_required(VERSION 2.6.2)
project(Cuda-project)
set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/CMake/cuda" ${CMAKE_MODULE_PATH})
SET(CUDA_TOOLKIT_ROOT_DIR "/opt/cuda/current")
SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52)
find_package(CUDA REQUIRED)
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
set(CUDA_SEPARABLE_COMPILATION ON)
set(BUILD_SHARED_LIBS ON)
list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC)
CUDA_ADD_LIBRARY(kernel
    kernel.cu
)
CUDA_ADD_LIBRARY(kernel2
    kernel2.cu
)
cuda_add_executable(rdctest main.cpp)
TARGET_LINK_LIBRARIES(rdctest kernel kernel2 cudadevrt)
About my system:
Fedora 23
kernel: 4.4.2-301.fc23.x86_64
Nvidia Driver: 361.28
Nvidia Toolkit: 7.5.18
g++: g++ (GCC) 5.3.1 20151207 (Red Hat 5.3.1-2)
Reproduced on:
CentOS release 6.7 (Final)
Kernel: 2.6.32-573.8.1.el6.x86_64
Nvidia Driver: 352.55
Nvidia Toolkit: 7.5.18
g++ (GCC) 4.4.7 20120313 (Red Hat 4.4.7-16)
glibc 2.12
cmake 3.5
Apparently, this has something to do with which CUDA runtime is used: shared or static.
I slightly modified your example: Instead of building two shared libraries and linking them to the executable individually, I create two static libraries that are linked together to one shared library, and that one is linked to the executable.
Also, here is an updated CMake file that uses the new (>= 3.8) native CUDA language support.
cmake_minimum_required(VERSION 3.8)
project (CudaSharedThrust CXX CUDA)
string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_61,code=compute_61")
if(BUILD_SHARED_LIBS)
    set(CMAKE_POSITION_INDEPENDENT_CODE ON)
endif()
add_library(kernel STATIC kernel.cu)
set_target_properties(kernel PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
add_library(kernel2 STATIC kernel2.cu)
set_target_properties(kernel2 PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
add_library(allkernels empty.cu) # empty.cu is an empty file
set_target_properties(allkernels PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(allkernels kernel kernel2)
add_executable(rdctest main.cpp)
set_target_properties(rdctest PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(rdctest allkernels)
Building this without any CMake flags (static build), the build succeeds and the program works.
Building with -DBUILD_SHARED_LIBS=ON, the program compiles, but it crashes with the same error as yours.
Building with
cmake .. -DBUILD_SHARED_LIBS=ON -DCMAKE_CUDA_FLAGS:STRING="--cudart shared"
compiles, and actually makes it run! So for some reason, the shared CUDA runtime is required for this sort of thing.
Also note that the step from 2 SOs -> 2 static libs in 1 SO was necessary, because otherwise the program would crash with a thrust::system::system_error.
This, however, is expected, because NVCC actually ignores shared object files during device linking: http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#libraries
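For reference, a hand-rolled equivalent of the CMake setup above might look like the following (an untested sketch; the key points are the single nvcc -dlink step across both object files, -Xcompiler -fPIC throughout, and linking the shared CUDA runtime at the end):

nvcc -arch=sm_61 -Xcompiler -fPIC -dc kernel.cu kernel2.cu
nvcc -arch=sm_61 -Xcompiler -fPIC -dlink kernel.o kernel2.o -o dlink.o
g++ -shared -o liballkernels.so kernel.o kernel2.o dlink.o -L/opt/cuda/current/lib64 -lcudadevrt -lcudart
g++ -o rdctest main.cpp -L. -lallkernels -L/opt/cuda/current/lib64 -lcudart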
Related
I am working on a JNI-based CUDA program, for which I have a Java class containing the main function, i.e. jclass.java (which declares the native function jniEntry()), and jclass.h, which is generated from javah. I have the JNI bridge cEntry.c, which contains the native function implementation, declared as
JNIEXPORT void JNICALL Java_jclass_jniEntry(JNIEnv* env, jobject thisObj)
The above function calls the CUDA host function jniEntry() declared in cudaprog.h. The jniEntry() function then calls the device function contained in cudaprog.cu.
I can't seem to generate the .so file from the generated .o files, i.e. cudaprog.o from cudaprog.cu and cEntry.o from cEntry.c (the JNI bridge: jclass.java -> jclass.class (from javac) and jclass.h (from javah -jni)).
My makefile is:
INCLUDES := -I$(CUDASDK_PATH)/inc -I$(CUDA_PATH)/include -I$(JDK_PATH)/include -I$(JDK_PATH)/include/linux -I.
LIBRARIES := -lrt -lm -lcudart -lcufft -lcublas -L$(CUDA_PATH)/lib64 -L.
JAVASRC_PATH := ../
NATIVESRC_PATH := ./
NVCC := /opt/cuda-6.5//bin/nvcc -ccbin g++

cujni1: cEntry.o cudaprog.o makefile jclass.h cudaprog.h
    g++ $(INCLUDES) $(LIBRARIES) -v -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o

cEntry.o: cEntry.c jclass.h cudaprog.o
    gcc $(INCLUDES) -v -m64 -fPIC -o $@ cEntry.c -c

cudaprog.o: cudaprog.cu jclass.h cudaprog.h
    $(NVCC) $(INCLUDES) -v -m64 -o $@ -c cudaprog.cu

run: build
    $(EXEC) ./cujni1

jclass.h: jclass.class
    javah -jni -classpath $(JAVASRC_PATH) jclass

jclass.class:
    javac $(JAVASRC_PATH)/jclass.java
The files that are generated without errors are jclass.class, jclass.h, cudaprog.o, and cEntry.o, but libcujni1.so is not generated, as I get errors like:
/usr/bin/ld: cudaprog.o: relocation R_X86_64_32 against `.rodata' can not be used when making a shared object; recompile with -fPIC
cudaprog.o: error adding symbols: Bad value
collect2: error: ld returned 1 exit status
make: *** [cujni1] Error 1
As you can see, I am using nvcc to compile the .cu files, so I can't use the -fPIC option directly, because it returns the error "unknown option -fPIC".
For reference, if needed, I am attaching the other source files as well.
jclass.java:
public class jclass {
    static {
        System.loadLibrary("cujni1");
    }

    private native void jniEntry();

    public static void main(String[] args){
        System.out.print("1:Hello" + "JNI CUder\n");
        new jclass().jniEntry();
    }
}
cEntry.c:
#include <jni.h>
#include "jclass.h"
#include "cudaprog.h"
JNIEXPORT void JNICALL Java_jclass_jniEntry(JNIEnv* env, jobject thisObj)
{
    printf("2:cEntry.c-->Java_jclass_jniEntry!\n");
    jniEntry();
    return;
}
cudaprog.h:
#ifndef CUDAPROG_H_
#define CUDAPROG_H_
#ifdef __cplusplus
extern "C" {
#endif
void jniEntry();
#ifdef __cplusplus
}
#endif
#endif /* CUDAPROG_H_ */
cudaprog.cu:
// includes, system
#include <string.h>
#include <math.h>
#include "jclass.h"
#include "cudaprog.h"
#include <stdio.h>
#include <iostream>
#include <stdlib.h> /* srand, rand */
#include <time.h> /* time */
#include <ctime>
// CUDA runtime
#include <cuda_runtime.h>
// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#ifdef __cplusplus
extern "C"
{
#endif
#define LO -100.0f
#define HI 100.0f
#define BlockSize 16
#define VECTORLENGTH 100
#define MATRIXLENGTH 4000
__global__ void
calDistanceMatrixCUDA(float *Out, float *In)
{
    // Block index
    // int bx = blockIdx.x;
    // int by = blockIdx.y;
    // Thread index
    // int tx = threadIdx.x;
    // int ty = threadIdx.y;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < MATRIXLENGTH && j < MATRIXLENGTH)
    {
        float fDim = 0.0f;
        float fDist = 0.0f;
        float(&InM)[4000][100] = *reinterpret_cast<float(*)[4000][100]>(In);
        float(&OutM)[4000][4000] = *reinterpret_cast<float(*)[4000][4000]>(Out);
        for (int k = 0; k < VECTORLENGTH; k++){ // not blockSize because numElements = 100 < 128
            fDim = InM[i][k] - InM[j][k];
            fDim *= fDim;
            fDist += fDim;
        }
        fDist = sqrt(fDist);
        OutM[i][j] = fDist;
    }
}
#ifdef __cplusplus
}
#endif
#ifdef __cplusplus
extern "C"
{
#endif

void jniEntry()
{
    clock_t time1, time2, time3, time4;
    double tDiff1, tDiff2, tDiff3, tDiff4;
    unsigned int numElements = VECTORLENGTH; // dims
    unsigned int numVectors = MATRIXLENGTH;
    dim3 dimsVector(VECTORLENGTH, 1, 1);
    dim3 dimsVectorArray(MATRIXLENGTH, VECTORLENGTH, 1);
    dim3 dimsDistMatrix(MATRIXLENGTH, MATRIXLENGTH, 1);
    size_t sizeVector = VECTORLENGTH * sizeof(float);
    size_t sizeVectorArray = sizeVector * MATRIXLENGTH;
    size_t sizeMatrix = MATRIXLENGTH * MATRIXLENGTH * sizeof(float);
    unsigned int nSizeVector = dimsVector.x * dimsVector.y;
    unsigned int mem_SizeVector = sizeof(float) * nSizeVector;
    unsigned int nSizeVectorArray = dimsVectorArray.x * dimsVectorArray.y;
    unsigned int mem_SizeVectorArray = sizeof(float) * nSizeVectorArray;
    unsigned int nSizeDistMatrix = dimsDistMatrix.x * dimsDistMatrix.y;
    unsigned int mem_SizeDistMatrix = sizeof(float) * nSizeDistMatrix;
    float *distMatrix = (float *)malloc(mem_SizeDistMatrix); /// Destination

    /////////////////////////////////////////
    /// initialize Vector
    time1 = clock();
    float *featureV100 = (float *)malloc(mem_SizeVectorArray);
    for (int i = 0; i < nSizeVectorArray; ++i)
    {
        featureV100[i] = LO + static_cast <float> (rand()) / (static_cast <float> (RAND_MAX / (HI - LO)));
        // printf("i:%d, == %5.2f\n", i, featureV100[i]);
    }
    time2 = clock();

    ///////////////////////////
    float *d_featureV100, *d_DistMatrix;
    cudaError_t error;
    error = cudaMalloc((void **)&d_featureV100, mem_SizeVectorArray);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_featureV100 returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **)&d_DistMatrix, mem_SizeDistMatrix);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_DistMatrix returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMemcpy(d_featureV100, featureV100, mem_SizeVectorArray, cudaMemcpyHostToDevice);
    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_featureV100,featureV100) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    //////////////////////
    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start;
    error = cudaEventCreate(&start);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    cudaEvent_t stop;
    error = cudaEventCreate(&stop);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Record the start event
    error = cudaEventRecord(start, NULL);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Setup execution parameters
    // int threads = /*128*/512; // sufficient for vector of 100 elements
    dim3 threads(512); // sufficient for vector of 100 elements
    // dim3 grid(MATRIXLENGTH / threads, MATRIXLENGTH / threads);
    dim3 grid(512);
    calDistanceMatrixCUDA<<<grid, threads>>>(d_DistMatrix, d_featureV100);

    // Record the stop event
    error = cudaEventRecord(stop, NULL);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Wait for the stop event to complete
    error = cudaEventSynchronize(stop);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Compute and print the performance
    float msec = msecTotal;
    printf(
        "Performance= Time= %.3f msec, WorkgroupSize= %d,%d,%d threads/block & %d,%d,%d blocks/grid\n",
        msec,
        threads.x, threads.y, threads.z,
        grid.x, grid.y, grid.z);
    error = cudaGetLastError();
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch calDistanceMatrixCUDA (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    error = cudaMemcpy(distMatrix, d_DistMatrix, mem_SizeDistMatrix, cudaMemcpyDeviceToHost);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy d_DistMatrix from device to host distMatrix (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    cudaFree(d_featureV100);
    cudaFree(d_DistMatrix);
    free(featureV100);
    free(distMatrix);
    error = cudaDeviceReset();
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    printf("Done\n");
}

#ifdef __cplusplus
}
#endif
Needless to say, the above cudaprog.cu runs correctly without errors when run as a plain CUDA application, i.e. without JNI.
Please guide me regarding the correct options in the makefile, as I am a newbie at writing makefiles. Thanks.
Edit: After making the changes you mentioned in the answer below, the ldd command gives:
ldd libcujni1.so
linux-vdso.so.1 => (0x00007ffd919b6000)
libcudart.so.6.5 => /opt/cuda-6.5//lib64/libcudart.so.6.5 (0x00007f47bde41000)
libstdc++.so.6 => /usr/lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f47bdb3d000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f47bd778000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f47bd574000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f47bd356000)
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f47bd14e000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f47bce48000)
/lib64/ld-linux-x86-64.so.2 (0x00007f47be297000)
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007f47bcc32000)
and the make command line shows:
make all
javac ..//jclass.java
javah -jni -classpath ../ jclass
/opt/cuda-6.5//bin/nvcc -ccbin g++ -I/home/faizan/workspace/common//inc -I/opt/cuda-6.5//include -I/usr/lib/jvm/jdk1.8.0_60//include -I/usr/lib/jvm/jdk1.8.0_60//include/linux -I. -Xcompiler -fPIC -m64 -o cudaprog.o -c *.cu # -v
cudaprog.cu(89): warning: variable "time1" was set but never used
cudaprog.cu(89): warning: variable "time2" was set but never used
cudaprog.cu(89): warning: variable "time3" was declared but never referenced
cudaprog.cu(89): warning: variable "time4" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff1" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff2" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff3" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff4" was declared but never referenced
cudaprog.cu(92): warning: variable "numElements" was declared but never referenced
cudaprog.cu(93): warning: variable "numVectors" was declared but never referenced
cudaprog.cu(100): warning: variable "sizeVectorArray" was declared but never referenced
cudaprog.cu(101): warning: variable "sizeMatrix" was declared but never referenced
cudaprog.cu(104): warning: variable "mem_SizeVector" was declared but never referenced
cudaprog.cu(89): warning: variable "time1" was set but never used
cudaprog.cu(89): warning: variable "time2" was set but never used
cudaprog.cu(89): warning: variable "time3" was declared but never referenced
cudaprog.cu(89): warning: variable "time4" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff1" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff2" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff3" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff4" was declared but never referenced
cudaprog.cu(92): warning: variable "numElements" was declared but never referenced
cudaprog.cu(93): warning: variable "numVectors" was declared but never referenced
cudaprog.cu(100): warning: variable "sizeVectorArray" was declared but never referenced
cudaprog.cu(101): warning: variable "sizeMatrix" was declared but never referenced
cudaprog.cu(104): warning: variable "mem_SizeVector" was declared but never referenced
g++ -I/home/faizan/workspace/common//inc -I/opt/cuda-6.5//include -I/usr/lib/jvm/jdk1.8.0_60//include -I/usr/lib/jvm/jdk1.8.0_60//include/linux -I. -shared -fPIC -m64 -o cEntry.o cEntry.c jclass.h cudaprog.h # -shared -fPIC -Xlinker -znoexecstack -Xlinker -shared -v -g
g++ -I/home/faizan/workspace/common//inc -I/opt/cuda-6.5//include -I/usr/lib/jvm/jdk1.8.0_60//include -I/usr/lib/jvm/jdk1.8.0_60//include/linux -I. -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o -L/opt/cuda-6.5//lib64 -Wl,-rpath=/opt/cuda-6.5//lib64 -lcufft -lcublas -lcudart -lcuda -lrt -lm # -v
I also added the library folders, i.e.
LIBRARIES := -L$(CUDA_PATH)/lib64 -Wl,-rpath=$(CUDA_PATH)/lib64 -L$(CUDA_PATH)/lib64/stubs -Wl,-rpath=$(CUDA_PATH)/lib64/stubs -lcufft -lcublas -lcudart -lcuda -lrt -lm
The current error, after the print statement in jclass.java's main() runs, is:
Exception in thread "main" 1:HelloJNI CUder
java.lang.UnsatisfiedLinkError: jclass.jniEntry()V
at jclass.jniEntry(Native Method)
at jclass.main(jclass.java:22)
Posting a proper answer since comments are not meant for that...
Your first problem was that you were missing -Xcompiler -fPIC in the nvcc compiler option list.
Your second issue is that your dynamic library is linked with neither libcudart nor libcuda. That is probably a problem with the Makefile, or with the order in which the libs are linked.
I would try something like -L$(CUDA_PATH)/lib64 -Wl,-rpath=$(CUDA_PATH)/lib64 -lcufft -lcublas -lcudart -lcuda -lrt -lm as link options...
Then check with ldd libcujni1.so that libcudart and libcuda are indeed listed there.
And please post a copy of your actual link command line (the one executed when you type make) and the result of ldd libcujni1.so in your initial question.
EDIT: I think I've got it... Just reread your Makefile and you should change this:
cujni1: cEntry.o cudaprog.o makefile jclass.h cudaprog.h
g++ $(INCLUDES) $(LIBRARIES) -v -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o
into this:
cujni1: cEntry.o cudaprog.o makefile jclass.h cudaprog.h
g++ $(INCLUDES) -v -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o $(LIBRARIES)
Notice the change of place for $(LIBRARIES)... Order matters (a lot) when it comes to linking.
Related
The cublas code below gives a core dump at cublasSnrm2(handle,row,dy,incy,de). Could you give some advice?
main.cu:
#include <iostream>
#include "cublas.h"
#include "cublas_v2.h"
#include "helper_cuda.h"
using namespace std;
int main(int argc, char *args[])
{
    float y[10] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
    int dev = 0;
    checkCudaErrors(cudaSetDevice(dev));

    // cublas init
    cublasStatus stat;
    cublasInit();
    cublasHandle_t handle;
    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS)
    {
        printf("cublas handle create failed!\n");
        cublasShutdown();
    }

    float *dy, *de, *e;
    int incy = 1, ONE = 1, row = 10;
    e = (float *)malloc(sizeof(float)*ONE);
    e[0] = 0.0f;
    checkCudaErrors(cudaMalloc(&dy, sizeof(float)*row));
    checkCudaErrors(cudaMalloc(&de, sizeof(float)*ONE));
    checkCudaErrors(cudaMemcpy(dy, y, row*sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(de, e, ONE*sizeof(float), cudaMemcpyHostToDevice));

    stat = cublasSnrm2(handle, row, dy, incy, de);
    if (stat != CUBLAS_STATUS_SUCCESS)
    {
        printf("norm2 compute failed!\n");
        cublasShutdown();
    }
    checkCudaErrors(cudaMemcpy(e, de, ONE*sizeof(float), cudaMemcpyDeviceToHost));
    std::cout << e[0] << endl;
    return 0;
}
The makefile is below:
NVIDIA = $(HOME)/NVIDIA_CUDA-5.0_Samples
CUDA = /usr/local/cuda-5.0
NVIDINCADD = -I$(NVIDIA)/common/inc
CUDAINCADD = -I$(CUDA)/include
CC = -L/usr/lib64/ -lstdc++
GCCOPT = -O2 -fno-rtti -fno-exceptions
INTELOPT = -O3 -fno-rtti -xW -restrict -fno-alias
DEB = -g
NVCC = -G
ARCH = -arch=sm_35
bcg: main.cu
    nvcc $(DEB) $(NVCC) $(ARCH) $(CC) -lm $(NVIDINCADD) $(CUDAINCADD) -lcublas -I./ -o $(@) $(<)

clean:
    rm -f bcg
    rm -f hyb
My OS is Red Hat Linux 6.2, the CUDA version is 5.0, and the GPU is a K20M.
The problem is here:
cublasSnrm2(handle,row,dy,incy,de);
By default, the last parameter is a host pointer. So either pass e to the snrm2 call rather than de, or do this:
cublasSetPointerMode(handle,CUBLAS_POINTER_MODE_DEVICE);
stat = cublasSnrm2(handle,row,dy,incy,de);
The pointer mode needs to be set to device if you want to pass a device pointer to store the result.
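Alternatively, keep the default CUBLAS_POINTER_MODE_HOST and write the result straight to the host variable (a minimal sketch based on the code above; the later cudaMemcpy from de to e then becomes unnecessary):

stat = cublasSnrm2(handle, row, dy, incy, e); // e is a host pointer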
Related
I have a problem when I try to compile a simple program in which C++ and CUDA code are compiled separately.
Here's my code
main.cpp:
#include "file.cuh"
int main( void )
{
test();
return 0;
}
file.cuh:
void test( void );
file.cu:
#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdio>
#include "file.cuh"
__global__ void printId( void )
{
    printf("Hello from block %d \n", blockIdx.x);
}

__global__ void DynPara( void )
{
    dim3 grid( 2, 1, 1 );
    dim3 block( 1, 1, 1 );
    printId<<< grid, block >>>();
}

void test( void )
{
    dim3 grid( 1, 1, 1 );
    dim3 block( 1, 1, 1 );
    DynPara<<< grid, block >>>();
}
I compile with:
nvcc -arch=sm_35 -lcudadevrt -rdc=true -c file.cu
g++ file.o main.cpp -L<path> -lcudart
And here's the error when linking:
file.o: In function `__sti____cudaRegisterAll_39_tmpxft_00005b2f_00000000_6_file_cpp1_ii_99181f96()':
tmpxft_00005b2f_00000000-3_file.cudafe1.cpp:(.text+0x1cd): undefined reference to `__cudaRegisterLinkedBinary_39_tmpxft_00005b2f_00000000_6_file_cpp1_ii_99181f96'
os: Red Hat
card: K20x
Any idea?
Thanks
This question is pretty much a duplicate of this recent question.
Dynamic parallelism requires relocatable device code linking, in addition to compiling.
Your nvcc command line specifies a compile-only operation (-rdc=true -c).
g++ does not do any device code linking. So in a scenario like this, when doing the final link operation using g++, an extra device code link step is required.
Something like this:
nvcc -arch=sm_35 -rdc=true -c file.cu
nvcc -arch=sm_35 -dlink -o file_link.o file.o -lcudadevrt -lcudart
g++ file.o file_link.o main.cpp -L<path> -lcudart -lcudadevrt
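Alternatively, you can let nvcc perform the final link itself, in which case the device code link step happens implicitly (an equivalent sketch, assuming the same files):

nvcc -arch=sm_35 -rdc=true file.cu main.cpp -o main -lcudadevrt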
When using CMake, setting CUDA_SEPARABLE_COMPILATION before find_package() enables both relocatable device code compiling and linking:
SET(CUDA_SEPARABLE_COMPILATION ON)
find_package(CUDA QUIET REQUIRED)
Firstly, sorry for my low reputation; I can't comment under Robert Crovella's answer directly:
https://stackoverflow.com/a/22116121/14377278
Just like his commands, but I needed to pass the CUDA library path when using nvcc and nvlink to compile and link, like below:
nvcc -arch=sm_35 -rdc=true -c file.cu -L<path>
nvcc -arch=sm_35 -dlink -o file_link.o file.o -lcudadevrt -lcudart -L<path>
g++ file.o file_link.o main.cpp -L<path> -lcudart -lcudadevrt