NVML code doesn't compile - cuda

I am implementing an example program with nvml library as shown at https://devtalk.nvidia.com/default/topic/504951/how-to-call-nvml-apis-/
The program is as follows:
#include <stdio.h>
#include <nvidia/gdk/nvml.h>
/*
 * Map an NVML compute-mode enum value to a human-readable label.
 * Returns a pointer to a static string literal; "Unknown" for any
 * value not covered by the known enum cases.
 */
const char * convertToComputeModeString(nvmlComputeMode_t mode)
{
    const char *label;
    switch (mode)
    {
        case NVML_COMPUTEMODE_DEFAULT:           label = "Default";           break;
        case NVML_COMPUTEMODE_EXCLUSIVE_THREAD:  label = "Exclusive_Thread";  break;
        case NVML_COMPUTEMODE_PROHIBITED:        label = "Prohibited";        break;
        case NVML_COMPUTEMODE_EXCLUSIVE_PROCESS: label = "Exclusive Process"; break;
        default:                                 label = "Unknown";           break;
    }
    return label;
}
// Enumerate all NVML-visible GPUs: print each device's name and PCI bus id,
// then demonstrate modifying GPU state by switching the compute mode to
// PROHIBITED (requires root) and restoring the original mode.
// Returns 0 on success, 1 on any NVML failure.
int main()
{
nvmlReturn_t result;
unsigned int device_count, i;
// First initialize NVML library
result = nvmlInit();
if (NVML_SUCCESS != result)
{
// NVML never initialized, so no shutdown needed on this path.
printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
printf("Press ENTER to continue...\n");
getchar();
return 1;
}
result = nvmlDeviceGetCount(&device_count);
if (NVML_SUCCESS != result)
{
printf("Failed to query device count: %s\n", nvmlErrorString(result));
goto Error;
}
printf("Found %d device%s\n\n", device_count, device_count != 1 ? "s" : "");
printf("Listing devices:\n");
for (i = 0; i < device_count; i++)
{
nvmlDevice_t device;
char name[64];
nvmlPciInfo_t pci;
nvmlComputeMode_t compute_mode;
// Query for device handle to perform operations on a device
// You can also query device handle by other features like:
// nvmlDeviceGetHandleBySerial
// nvmlDeviceGetHandleByPciBusId
result = nvmlDeviceGetHandleByIndex(i, &device);
if (NVML_SUCCESS != result)
{
printf("Failed to get handle for device %i: %s\n", i, nvmlErrorString(result));
goto Error;
}
result = nvmlDeviceGetName(device, name, sizeof(name)/sizeof(name[0]));
if (NVML_SUCCESS != result)
{
printf("Failed to get name of device %i: %s\n", i, nvmlErrorString(result));
goto Error;
}
// pci.busId is very useful to know which device physically you're talking to
// Using PCI identifier you can also match nvmlDevice handle to CUDA device.
result = nvmlDeviceGetPciInfo(device, &pci);
if (NVML_SUCCESS != result)
{
printf("Failed to get pci info for device %i: %s\n", i, nvmlErrorString(result));
goto Error;
}
printf("%d. %s [%s]\n", i, name, pci.busId);
// This is a simple example on how you can modify GPU's state
result = nvmlDeviceGetComputeMode(device, &compute_mode);
if (NVML_ERROR_NOT_SUPPORTED == result)
printf("\t This is not CUDA capable device\n");
else if (NVML_SUCCESS != result)
{
printf("Failed to get compute mode for device %i: %s\n", i, nvmlErrorString(result));
goto Error;
}
else
{
// try to change compute mode
printf("\t Changing device's compute mode from '%s' to '%s'\n",
convertToComputeModeString(compute_mode),
convertToComputeModeString(NVML_COMPUTEMODE_PROHIBITED));
result = nvmlDeviceSetComputeMode(device, NVML_COMPUTEMODE_PROHIBITED);
if (NVML_ERROR_NO_PERMISSION == result)
printf("\t\t Need root privileges to do that: %s\n", nvmlErrorString(result));
else if (NVML_ERROR_NOT_SUPPORTED == result)
printf("\t\t Compute mode prohibited not supported. You might be running on\n"
"\t\t windows in WDDM driver model or on non-CUDA capable GPU.\n");
else if (NVML_SUCCESS != result)
{
printf("\t\t Failed to set compute mode for device %i: %s\n", i, nvmlErrorString(result));
goto Error;
}
else
{
// Set succeeded: put the original mode back so the program is
// side-effect free when run as root.
printf("\t Restoring device's compute mode back to '%s'\n",
convertToComputeModeString(compute_mode));
result = nvmlDeviceSetComputeMode(device, compute_mode);
if (NVML_SUCCESS != result)
{
printf("\t\t Failed to restore compute mode for device %i: %s\n", i, nvmlErrorString(result));
goto Error;
}
}
}
}
// Normal exit path: shut NVML down, report, and return success.
result = nvmlShutdown();
if (NVML_SUCCESS != result)
printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result));
printf("All done.\n");
printf("Press ENTER to continue...\n");
getchar();
return 0;
// Error path: still attempt an NVML shutdown before exiting non-zero.
Error:
result = nvmlShutdown();
if (NVML_SUCCESS != result)
printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result));
printf("Press ENTER to continue...\n");
getchar();
return 1;
}
makefile as follows:
# Select the NVML library path based on the host word size.
ARCH := $(shell getconf LONG_BIT)
ifeq (${ARCH},32)
NVML_LIB := ../lib/
else ifeq (${ARCH},64)
NVML_LIB := /usr/lib/nvidia-340/
else
$(error Unknown architecture!)
endif
CFLAGS := -I ../inc
# -L (search path) first, then the library itself.
LDFLAGS := -L $(NVML_LIB) -lnvidia-ml
# Link order matters: -lnvidia-ml must come AFTER example.o, otherwise the
# linker discards the library before it sees the object's undefined symbols
# (the cause of the "undefined reference to nvmlInit_v2 ..." errors).
# ($@ is the target name; the original text had it garbled as "$#".)
example: example.o
	$(CC) $< $(LDFLAGS) -o $@
clean:
	-rm -f example.o
	-rm -f example
And the error I get is:
cc -lnvidia-ml -L /usr/src/gdk/nvml/lib/ example.o -o example
example.o: In function `main':
example.c:(.text+0x5f): undefined reference to `nvmlInit_v2'
example.c:(.text+0x7b): undefined reference to `nvmlErrorString'
example.c:(.text+0xb5): undefined reference to `nvmlDeviceGetCount_v2'
example.c:(.text+0xd1): undefined reference to `nvmlErrorString'
example.c:(.text+0x149): undefined reference to `nvmlDeviceGetHandleByIndex_v2'
example.c:(.text+0x165): undefined reference to `nvmlErrorString'
example.c:(.text+0x19f): undefined reference to `nvmlDeviceGetName'
example.c:(.text+0x1bb): undefined reference to `nvmlErrorString'
example.c:(.text+0x1f3): undefined reference to `nvmlDeviceGetPciInfo_v2'
example.c:(.text+0x20f): undefined reference to `nvmlErrorString'
example.c:(.text+0x269): undefined reference to `nvmlDeviceGetComputeMode'
example.c:(.text+0x29d): undefined reference to `nvmlErrorString'
example.c:(.text+0x2ff): undefined reference to `nvmlDeviceSetComputeMode'
example.c:(.text+0x31b): undefined reference to `nvmlErrorString'
example.c:(.text+0x360): undefined reference to `nvmlErrorString'
example.c:(.text+0x3b5): undefined reference to `nvmlDeviceSetComputeMode'
example.c:(.text+0x3d1): undefined reference to `nvmlErrorString'
example.c:(.text+0x40c): undefined reference to `nvmlShutdown'
example.c:(.text+0x428): undefined reference to `nvmlErrorString'
example.c:(.text+0x45f): undefined reference to `nvmlShutdown'
example.c:(.text+0x47b): undefined reference to `nvmlErrorString'
collect2: error: ld returned 1 exit status
make: *** [example] Error 1
pranjal@PCL:~/nvidia$ make
cc -lnvidia-ml -L /usr/lib/nvidia-340/ example.o -o example
example.o: In function `main':
example.c:(.text+0x5f): undefined reference to `nvmlInit_v2'
example.c:(.text+0x7b): undefined reference to `nvmlErrorString'
example.c:(.text+0xb5): undefined reference to `nvmlDeviceGetCount_v2'
example.c:(.text+0xd1): undefined reference to `nvmlErrorString'
example.c:(.text+0x149): undefined reference to `nvmlDeviceGetHandleByIndex_v2'
example.c:(.text+0x165): undefined reference to `nvmlErrorString'
example.c:(.text+0x19f): undefined reference to `nvmlDeviceGetName'
example.c:(.text+0x1bb): undefined reference to `nvmlErrorString'
example.c:(.text+0x1f3): undefined reference to `nvmlDeviceGetPciInfo_v2'
example.c:(.text+0x20f): undefined reference to `nvmlErrorString'
example.c:(.text+0x269): undefined reference to `nvmlDeviceGetComputeMode'
example.c:(.text+0x29d): undefined reference to `nvmlErrorString'
example.c:(.text+0x2ff): undefined reference to `nvmlDeviceSetComputeMode'
example.c:(.text+0x31b): undefined reference to `nvmlErrorString'
example.c:(.text+0x360): undefined reference to `nvmlErrorString'
example.c:(.text+0x3b5): undefined reference to `nvmlDeviceSetComputeMode'
example.c:(.text+0x3d1): undefined reference to `nvmlErrorString'
example.c:(.text+0x40c): undefined reference to `nvmlShutdown'
example.c:(.text+0x428): undefined reference to `nvmlErrorString'
example.c:(.text+0x45f): undefined reference to `nvmlShutdown'
example.c:(.text+0x47b): undefined reference to `nvmlErrorString'
collect2: error: ld returned 1 exit status
make: *** [example] Error 1
Any help would be appreciated. Thank you.

Here's what I did on a linux CUDA 7.5 setup:
Update the GPU driver to 352.79. In my case, this was done via the runfile installer here. If you have previously installed the GPU driver via the package manager method (e.g. .deb) then you don't want to use the runfile installer method.
get the latest version of the GDK (see note below), which at this time happens to target 352.79, and includes nvml:
wget --no-check-certificate http://developer.download.nvidia.com/compute/cuda/7.5/Prod/gdk/gdk_linux_amd64_352_79_release.run
install the GDK:
sh gdk_linux_amd64_352_79_release.run
verify that the appropriate libraries were updated:
ls /usr/lib64/libnv*
(and you should see libnvidia-ml.so.352.79 etc.)
compile the example file:
g++ -I./gdk352_79/usr/include -L/usr/lib64 -lnvidia-ml example.c -o example
When I run the example executable, I get:
$ ./example
Found 2 devices
Listing devices:
0. Quadro 5000 [0000:02:00.0]
Changing device's compute mode from 'Default' to 'Prohibited'
Need root privileges to do that: Insufficient Permissions
1. GeForce GT 640 [0000:03:00.0]
Changing device's compute mode from 'Default' to 'Prohibited'
Need root privileges to do that: Insufficient Permissions
All done.
Press ENTER to continue...
$
Hopefully this will get you going. I am assuming you don't need help making any Makefile changes if needed. If your Makefile is not working, keep modifying it until you get the exact compile command I list in step 5.
NOTE: As of CUDA 8.0, the GDK is not a separate entity but is installed with CUDA 8.0 toolkit. It should not be necessary to install the GDK separately.

Related

Why is cudaPointerGetAttributes() returning invalid argument for host pointer?

I want to write a function that tells me if a pointer is a host or device pointer. This is essentially a wrapper around cudaPointerGetAttributes() that returns either 1 or 0 if the pointer is for the device or not.
What I can't understand is why cudaPointerGetAttributes fails my error checking by returning invalid argument when I'm testing a host pointer. An example is provided below.
#include <stdio.h>
#include <stdlib.h>
// Check a CUDA runtime call and abort with a diagnostic on failure.
// The do { ... } while(0) wrapper must NOT carry a trailing semicolon:
// with one, "if (c) CUDA_ERROR_CHECK(x); else ..." fails to compile
// because the macro's own ';' terminates the if before the else.
// The argument is parenthesized so any expression can be passed safely.
#define CUDA_ERROR_CHECK(fun) \
do{ \
cudaError_t err = (fun); \
if(err != cudaSuccess) \
{ \
fprintf(stderr, "Cuda error %d %s:: %s\n", __LINE__, __func__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
}while(0)
int is_device_pointer(const void *ptr)
{
int is_device_ptr = 0;
cudaPointerAttributes attributes;
CUDA_ERROR_CHECK(cudaPointerGetAttributes(&attributes, ptr));
if(attributes.devicePointer != NULL)
{
is_device_ptr = 1;
}
return is_device_ptr;
}
// Demo driver: allocates one small device buffer and classifies its pointer.
// The commented-out host-pointer call is the case that triggers the
// 'invalid argument' error discussed below the code.
int main()
{
int *host_ptr, x = 0;
int is_dev_ptr;
host_ptr = &x;
int *dev_ptr;
// NOTE(review): cudaMalloc's return value is not checked here, unlike the
// calls below that go through CUDA_ERROR_CHECK.
cudaMalloc((void **)&dev_ptr, 16);
//is_dev_ptr = is_device_pointer((const void *)host_ptr); //Causes invalid argument
is_dev_ptr = is_device_pointer((const void *)dev_ptr); //Works
if(is_dev_ptr == 1)
{
fprintf(stdout, "Device pointer\n");
}
else
{
fprintf(stdout, "Not device Pointer\n");
}
CUDA_ERROR_CHECK(cudaFree((void *)dev_ptr));
CUDA_ERROR_CHECK(cudaDeviceReset());
return EXIT_SUCCESS;
}
This is expected behavior. cudaPointerGetAttributes can only introspect pointers that have been recorded in some fashion with the CUDA runtime API. Refer to the documentation:
If pointer was not allocated in, mapped by or registered with context supporting unified addressing cudaErrorInvalidValue is returned.
What this is saying is that the pointer must have been returned or passed through an API such as cudaMalloc, cudaMallocManaged, cudaHostRegister, etc. for it to be "recognized" by cudaPointerGetAttributes. You must be in a UVA regime, and you must have acquired the pointer using an appropriate method.
In your case, passing a bare host pointer this way doesn't meet the requirements spelled out in the documentation, so the error return is expected.
This particular error return code is a "non-sticky" CUDA error, meaning it can be cleared out via cudaGetLastError(). In my view, it should be safe to interpret this error return code as "this is an ordinary host pointer". But of course, if you pass a garbage value, or an unallocated pointer, you will get the same error code.

error generating .so file from .o files using JNI for CUDA in eclipse(ubuntu)

I am working on a JNI-based CUDA program for which I have a Java class that has the main function, i.e. jclass.java (containing the native function jniEntry() declaration)
and jclass.h which is generated from javah . I have the JNI bridge cEntry.c which contains the native function implementation declared as
JNIEXPORT void JNICALL Java_jclass_jniEntry(JNIEnv* env, jobject thisObj)
the above function calls the CUDA host function i.e. jniEntry() in cudaprogram.h. The jniEntry() function then calls the device func contained in cudaprogram.cu
I can't seem to generate the .so file from the generated .o files, i.e. cudaprogram.o from cudaprogram.cu and cEntry.o from cEntry.c (which is the bridge for JNI, i.e. jclass.java -> jclass.class (from javac) & jclass.h (from javah -jni))
My makefile is :
INCLUDES := -I$(CUDASDK_PATH)/inc -I$(CUDA_PATH)/include -I$(JDK_PATH)/include -I$(JDK_PATH)/include/linux -I.
# -L search paths before the -l libraries; rpath lets the JVM resolve
# libcudart at run time without LD_LIBRARY_PATH.
LIBRARIES := -L$(CUDA_PATH)/lib64 -Wl,-rpath=$(CUDA_PATH)/lib64 -L. -lcudart -lcufft -lcublas -lrt -lm
JAVASRC_PATH := ../
NATIVESRC_PATH := ./
NVCC := /opt/cuda-6.5//bin/nvcc -ccbin g++
# Link order matters: $(LIBRARIES) must come AFTER the object files that
# reference them, otherwise the linker drops the libraries unresolved.
cujni1: cEntry.o cudaprog.o makefile jclass.h cudaprog.h
	g++ $(INCLUDES) -v -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o $(LIBRARIES)
# ($@ is the target name; the original text had it garbled as "$#".)
cEntry.o: cEntry.c jclass.h cudaprog.o
	gcc $(INCLUDES) -v -m64 -fPIC -o $@ cEntry.c -c
# Objects destined for a shared library must be position independent.
# nvcc does not accept -fPIC directly; forward it to the host compiler
# with -Xcompiler (fixes "relocation R_X86_64_32 against `.rodata'").
cudaprog.o: cudaprog.cu jclass.h cudaprog.h
	$(NVCC) $(INCLUDES) -Xcompiler -fPIC -v -m64 -o $@ -c cudaprog.cu
run: build
	$(EXEC) ./cujni1
jclass.h: jclass.class
	javah -jni -classpath $(JAVASRC_PATH) jclass
jclass.class:
	javac $(JAVASRC_PATH)/jclass.java
the files that are being generated without errors are jclass.class, jclass.h, cudaprogram.o, cEntry.o but the libcujni1.so is not getting generated as i get the errors something like
/usr/bin/ld: cudaprog.o: relocation R_X86_64_32 against `.rodata' can not be used when making a shared object; recompile with -fPIC
cudaprog.o: error adding symbols: Bad value
collect2: error: ld returned 1 exit status
make: *** [cujni1] Error 1
as you can see I am using nvcc to compile the .cu files, so I can't use the -fPIC option directly because it returns the error "unknown option -fPIC"
For reference, if needed, I am attaching the other source files as well
jclass.java:
// Java side of the JNI bridge: loads the native library and invokes the
// native entry point that runs the CUDA code (see cEntry.c / cudaprog.cu).
public class jclass {
static {
// Runs once at class-load time; maps the name "cujni1" to
// libcujni1.so found on java.library.path.
System.loadLibrary("cujni1");
}
// Implemented natively as Java_jclass_jniEntry in cEntry.c.
private native void jniEntry();
public static void main(String[] args){
System.out.print("1:Hello" + "JNI CUder\n");
new jclass().jniEntry();
}
}
cEntry.c:
#include <jni.h>
#include "jclass.h"
#include "cudaprog.h"
// JNI bridge: invoked from Java as jclass.jniEntry(); prints a trace line
// and forwards to the CUDA host function jniEntry() declared in cudaprog.h.
// 'env' and 'thisObj' are required by the JNI calling convention but unused.
JNIEXPORT void JNICALL Java_jclass_jniEntry(JNIEnv* env, jobject thisObj)
{
printf("2:cEntry.c-->Java_jclass_jniEntry!\n");
jniEntry();
return;
}
the generated jclass.h:
/* NOTE(review): the question labels this listing "the generated jclass.h",
 * but the include guard and contents are those of cudaprog.h — confirm
 * which file was actually posted. */
#ifndef CUDAPROG_H_
#define CUDAPROG_H_
#ifdef __cplusplus
extern "C" {
#endif
/* Host-side entry point implemented in cudaprog.cu; declared with C
 * linkage so the JNI bridge compiled as C can call it. */
void jniEntry();
#ifdef __cplusplus
}
#endif
#endif /* CUDAPROG_H_ */
cudaprogram.cu:
// includes, system
#include <string.h>
#include <math.h>
#include "jclass.h"
#include "cudaprog.h"
#include <stdio.h>
#include <iostream>
#include <stdlib.h> /* srand, rand */
#include <time.h> /* time */
#include <ctime>
// CUDA runtime
#include <cuda_runtime.h>
// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#ifdef __cplusplus
extern "C"
{
#endif
#define LO -100.0f
#define HI 100.0f
#define BlockSize 16
#define VECTORLENGTH 100
#define MATRIXLENGTH 4000
// Kernel: fill Out (MATRIXLENGTH x MATRIXLENGTH) with pairwise Euclidean
// distances between the MATRIXLENGTH rows of In (each row holds
// VECTORLENGTH floats). Requires a launch whose grid covers
// MATRIXLENGTH threads in BOTH x and y.
// NOTE(review): the host code launches this with 1-D grid(512)/threads(512),
// so blockIdx.y and threadIdx.y are always 0 and only the j == 0 column is
// ever written — confirm the intended 2-D launch configuration.
__global__ void
calDistanceMatrixCUDA(float *Out, float *In)
{
// Block index
// int bx = blockIdx.x;
// int by = blockIdx.y;
// Thread index
// int tx = threadIdx.x;
// int ty = threadIdx.y;
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < MATRIXLENGTH && j < MATRIXLENGTH)
{
float fDim = 0.0f;
float fDist = 0.0f;
// Reinterpret the flat device buffers as fixed-size 2-D arrays; the
// hard-coded 4000/100 dimensions must match MATRIXLENGTH/VECTORLENGTH.
float(&InM)[4000][100] = *reinterpret_cast<float(*)[4000][100]>(In);
float(&OutM)[4000][4000] = *reinterpret_cast<float(*)[4000][4000]>(Out);
for (int k = 0; k < VECTORLENGTH; k++){//not blockSize because numElements = 100 < 128
fDim = InM[i][k] - InM[j][k];
fDim *= fDim;
fDist += fDim;
}
// NOTE(review): sqrt promotes to double here; sqrtf would keep the
// computation in single precision.
fDist = sqrt(fDist);
OutM[i][j] = fDist;
}
}
#ifdef __cplusplus
}
#endif
#ifdef __cplusplus
extern "C"
{
#endif
// Host-side entry point called from the JNI bridge (cEntry.c).
// Fills a MATRIXLENGTH x VECTORLENGTH array with random floats in [LO, HI],
// copies it to the device, launches calDistanceMatrixCUDA timed with CUDA
// events, copies the distance matrix back, then frees everything and resets
// the device. The computed distance matrix is not used after the copy.
void jniEntry()
{
clock_t time1, time2, time3, time4;
double tDiff1, tDiff2, tDiff3, tDiff4;
unsigned int numElements = VECTORLENGTH;//dims
unsigned int numVectors = MATRIXLENGTH;
dim3 dimsVector(VECTORLENGTH, 1, 1);
dim3 dimsVectorArray(MATRIXLENGTH, VECTORLENGTH, 1);
dim3 dimsDistMatrix(MATRIXLENGTH, MATRIXLENGTH, 1);
size_t sizeVector = VECTORLENGTH * sizeof(float);
size_t sizeVectorArray = sizeVector * MATRIXLENGTH;
size_t sizeMatrix = MATRIXLENGTH * MATRIXLENGTH * sizeof(float);
unsigned int nSizeVector = dimsVector.x * dimsVector.y;
unsigned int mem_SizeVector = sizeof(float) * nSizeVector;
unsigned int nSizeVectorArray = dimsVectorArray.x * dimsVectorArray.y;
unsigned int mem_SizeVectorArray = sizeof(float) * nSizeVectorArray;
unsigned int nSizeDistMatrix = dimsDistMatrix.x * dimsDistMatrix.y;
unsigned int mem_SizeDistMatrix = sizeof(float) * nSizeDistMatrix;
float *distMatrix = (float *)malloc(mem_SizeDistMatrix);///Destination
/////////////////////////////////////////
///initialize Vector
time1 = clock();
float *featureV100 = (float *)malloc(mem_SizeVectorArray);
// Uniform random floats in [LO, HI]. rand() is never seeded here, so the
// generated data is identical on every run.
for (int i = 0; i < nSizeVectorArray; ++i)
{
featureV100[i] = LO + static_cast <float> (rand()) / (static_cast <float> (RAND_MAX / (HI - LO)));;
// printf("i:%d, == %5.2f\n", i, featureV100[i]);
}
time2 = clock();
///////////////////////////
float *d_featureV100, *d_DistMatrix;
cudaError_t error;
error = cudaMalloc((void **)&d_featureV100, mem_SizeVectorArray);
if (error != cudaSuccess)
{
printf("cudaMalloc d_featureV100 returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **)&d_DistMatrix, mem_SizeDistMatrix);
if (error != cudaSuccess)
{
printf("cudaMalloc d_DistMatrix returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
error = cudaMemcpy(d_featureV100, featureV100, mem_SizeVectorArray, cudaMemcpyHostToDevice);
if (error != cudaSuccess)
{
printf("cudaMemcpy (d_featureV100,featureV100) returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
//////////////////////
// Allocate CUDA events that we'll use for timing
cudaEvent_t start;
error = cudaEventCreate(&start);
if (error != cudaSuccess)
{
fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
cudaEvent_t stop;
error = cudaEventCreate(&stop);
if (error != cudaSuccess)
{
fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
// Record the start event
error = cudaEventRecord(start, NULL);
if (error != cudaSuccess)
{
fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
// Setup execution parameters
// int threads = /*128*/512; //sufficient for vector of 100 elements
// NOTE(review): the kernel indexes 2-D (i, j), but both grid and threads
// below are 1-D, so blockIdx.y/threadIdx.y are 0 inside the kernel and
// only part of the 4000x4000 output is computed — confirm intended config.
dim3 threads(512); //sufficient for vector of 100 elements
// dim3 grid(MATRIXLENGTH / threads, MATRIXLENGTH / threads);
dim3 grid(512);
calDistanceMatrixCUDA<<<grid, threads>>>(d_DistMatrix, d_featureV100);
// Record the stop event
error = cudaEventRecord(stop, NULL);
if (error != cudaSuccess)
{
fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
// Wait for the stop event to complete
error = cudaEventSynchronize(stop);
if (error != cudaSuccess)
{
fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
float msecTotal = 0.0f;
error = cudaEventElapsedTime(&msecTotal, start, stop);
if (error != cudaSuccess)
{
fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
// Compute and print the performance
float msec = msecTotal ;
printf(
"Performance= Time= %.3f msec, WorkgroupSize= %d,%d,%d threads/block & %d,%d,%d blocks/grid\n",
msec,
threads.x,threads.y,threads.z,
grid.x,grid.y,grid.z);
// NOTE(review): launch errors are only checked here, well after the
// launch; calling cudaGetLastError() immediately after the <<<...>>>
// statement would surface configuration errors sooner.
error = cudaGetLastError();
if (error != cudaSuccess)
{
fprintf(stderr, "Failed to launch calDistanceMatrixCUDA (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMemcpy(distMatrix, d_DistMatrix, mem_SizeDistMatrix, cudaMemcpyDeviceToHost);
if (error != cudaSuccess)
{
fprintf(stderr, "Failed to copy d_DistMatrix from device to host distMatrix (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
// Release device buffers, then host buffers; distMatrix is freed without
// its contents being consumed.
cudaFree(d_featureV100);
cudaFree(d_DistMatrix);
free(featureV100);
free(distMatrix);
error = cudaDeviceReset();
if (error != cudaSuccess)
{
fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
printf("Done\n");
}
#ifdef __cplusplus
}
#endif
Needless to say, the above cudaprogram.cu runs correctly without errors when run as a CUDA application, i.e. without JNI
Please guide me in regards to using the correct options in the makefile as i am a newbie in creating makefiles. Thanks.
Edit:
after the changes you mentioned below in the answer. the ldd commands gives `
ldd libcujni1.so
linux-vdso.so.1 => (0x00007ffd919b6000)
libcudart.so.6.5 => /opt/cuda-6.5//lib64/libcudart.so.6.5 (0x00007f47bde41000)
libstdc++.so.6 => /usr/lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f47bdb3d000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f47bd778000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f47bd574000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f47bd356000)
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f47bd14e000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f47bce48000)
/lib64/ld-linux-x86-64.so.2 (0x00007f47be297000)
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007f47bcc32000)
and the make command line shows
make all
javac ..//jclass.java
javah -jni -classpath ../ jclass
/opt/cuda-6.5//bin/nvcc -ccbin g++ -I/home/faizan/workspace/common//inc -I/opt/cuda-6.5//include -I/usr/lib/jvm/jdk1.8.0_60//include -I/usr/lib/jvm/jdk1.8.0_60//include/linux -I. -Xcompiler -fPIC -m64 -o cudaprog.o -c *.cu # -v
cudaprog.cu(89): warning: variable "time1" was set but never used
cudaprog.cu(89): warning: variable "time2" was set but never used
cudaprog.cu(89): warning: variable "time3" was declared but never referenced
cudaprog.cu(89): warning: variable "time4" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff1" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff2" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff3" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff4" was declared but never referenced
cudaprog.cu(92): warning: variable "numElements" was declared but never referenced
cudaprog.cu(93): warning: variable "numVectors" was declared but never referenced
cudaprog.cu(100): warning: variable "sizeVectorArray" was declared but never referenced
cudaprog.cu(101): warning: variable "sizeMatrix" was declared but never referenced
cudaprog.cu(104): warning: variable "mem_SizeVector" was declared but never referenced
cudaprog.cu(89): warning: variable "time1" was set but never used
cudaprog.cu(89): warning: variable "time2" was set but never used
cudaprog.cu(89): warning: variable "time3" was declared but never referenced
cudaprog.cu(89): warning: variable "time4" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff1" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff2" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff3" was declared but never referenced
cudaprog.cu(90): warning: variable "tDiff4" was declared but never referenced
cudaprog.cu(92): warning: variable "numElements" was declared but never referenced
cudaprog.cu(93): warning: variable "numVectors" was declared but never referenced
cudaprog.cu(100): warning: variable "sizeVectorArray" was declared but never referenced
cudaprog.cu(101): warning: variable "sizeMatrix" was declared but never referenced
cudaprog.cu(104): warning: variable "mem_SizeVector" was declared but never referenced
g++ -I/home/faizan/workspace/common//inc -I/opt/cuda-6.5//include -I/usr/lib/jvm/jdk1.8.0_60//include -I/usr/lib/jvm/jdk1.8.0_60//include/linux -I. -shared -fPIC -m64 -o cEntry.o cEntry.c jclass.h cudaprog.h # -shared -fPIC -Xlinker -znoexecstack -Xlinker -shared -v -g
g++ -I/home/faizan/workspace/common//inc -I/opt/cuda-6.5//include -I/usr/lib/jvm/jdk1.8.0_60//include -I/usr/lib/jvm/jdk1.8.0_60//include/linux -I. -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o -L/opt/cuda-6.5//lib64 -Wl,-rpath=/opt/cuda-6.5//lib64 -lcufft -lcublas -lcudart -lcuda -lrt -lm # -v
also added Library folders i.e.
LIBRARIES := -L$(CUDA_PATH)/lib64 -Wl,-rpath=$(CUDA_PATH)/lib64 -L$(CUDA_PATH)/lib64/stubs -Wl,-rpath=$(CUDA_PATH)/lib64/stubs -lcufft -lcublas -lcudart -lcuda -lrt -lm
the error currently is (after running the output command in jclass.java main()
Exception in thread "main" 1:HelloJNI CUder
java.lang.UnsatisfiedLinkError: jclass.jniEntry()V
at jclass.jniEntry(Native Method)
at jclass.main(jclass.java:22)
Posting a proper answer since comments are not meant for that...
Your first problem was that you were missing -Xcompiler -fPIC in the nvcc compiler option list.
Your second issue is that your dynamic library is linked with neither libcudart nor libcuda. That is probably a problem with the Makefile, or the order of which lib are linked.
I would try something like-L$(CUDA_PATH)/lib64 -Wl,-rpath=$(CUDA_PATH)/lib64 -lcufft -lcublas -lcudart -lcuda -lrt -lm as link option...
Then check with ldd libcujni1.so that libcudart and libcuda are indeed listed there.
And please post a copy of your actual link command line (the one executed when you type make) and the result of ldd libcujni1.so in your initial question.
EDIT: I think I've got it... Just reread your Makefile and you should change this:
cujni1: cEntry.o cudaprog.o makefile jclass.h cudaprog.h
g++ $(INCLUDES) $(LIBRARIES) -v -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o
into this:
cujni1: cEntry.o cudaprog.o makefile jclass.h cudaprog.h
g++ $(INCLUDES) -v -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o $(LIBRARIES)
Notice the change of place for $(LIBRARIES)... Order matters (a lot) when it comes to linking.

undefined reference to `htmlcxx::HTML::ParserDom::parseTree(std::string const&)'

I am using htmlcxx library for a simple program but I got stuck in a problem, I searched many other related solutions but my problem is still a problem, Hope any one can help me, Here is the code I used in Kdevelop on Ubuntu:
#include <iostream>
#include <string>
#include <htmlcxx/html/ParserDom.h>
using namespace htmlcxx;
// Minimal htmlcxx demo: parse a fixed HTML string into a DOM tree, print the
// whole tree, dump the href of every <A> tag, then print all text nodes.
int main()
{
//Parse some html code
std::string html = "<html><body>hey</body></html>";
HTML::ParserDom parser;
tree<HTML::Node> dom= parser.parseTree(html) ;
//Print whole DOM tree
std::cout << dom << std::endl;
//Dump all links in the tree
tree<HTML::Node>::iterator it = dom.begin();
tree<HTML::Node>::iterator end = dom.end();
for (; it != end; ++it)
{
// NOTE(review): the sample HTML contains no <A> tags, so this loop
// prints nothing for this input.
if (it->tagName() == "A")
{
it->parseAttributes();
std::cout << it->attribute("href").second;
}
}
//Dump all text of the document
it = dom.begin();
end = dom.end();
for (; it != end; ++it)
{
if ((!it->isTag()) && (!it->isComment()))
{
std::cout << it->text();
}
}
return 0;
}
And here is the error come when I build it in Kdevelop:
/home/ratior/projects/html/build> make -j2
Scanning dependencies of target html
[100%] Building CXX object CMakeFiles/html.dir/main.o
Linking CXX executable html
CMakeFiles/html.dir/main.o: In function `main':
/home/ratior/projects/html/main.cpp:17: undefined reference to `htmlcxx::HTML::ParserDom::parseTree(std::string const&)'
/home/ratior/projects/html/main.cpp:20: undefined reference to `htmlcxx::HTML::operator<<(std::ostream&, tree<htmlcxx::HTML::Node, std::allocator<tree_node_<htmlcxx::HTML::Node> > > const&)'
/home/ratior/projects/html/main.cpp:29: undefined reference to `htmlcxx::HTML::Node::parseAttributes()'
CMakeFiles/html.dir/main.o: In function `htmlcxx::HTML::ParserDom::ParserDom()':
/usr/local/include/htmlcxx/html/ParserDom.h:14: undefined reference to `vtable for htmlcxx::HTML::ParserDom'
CMakeFiles/html.dir/main.o: In function `htmlcxx::HTML::ParserDom::~ParserDom()':
/usr/local/include/htmlcxx/html/ParserDom.h:15: undefined reference to `vtable for htmlcxx::HTML::ParserDom'
collect2: error: ld returned 1 exit status
make[2]: *** [html] Error 1
make[1]: *** [CMakeFiles/html.dir/all] Error 2
make: *** [all] Error 2
*** Failure: Exit code 2 ***
Wrong path to the library is the reason why it's not linking the code.

CUFFT error handling

I'm using the following macro for CUFFT error handling:
// Wrapper macro: captures the call site's file and line so the helper below
// can report where the failing cuFFT call occurred.
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
// Reports the location of a failed cuFFT call, waits for a keypress, and
// exits. Does not translate the error code to a message string (see the
// discussion below for a version that does).
// NOTE(review): getch() comes from the non-standard <conio.h> (Windows);
// use getchar() on POSIX systems — confirm the target platform.
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "cufftSafeCall() CUFFT error in file <%s>, line %i.\n",
file, line);
getch(); exit(-1);
}
}
This macro does not return the message string from an error code. The book "CUDA Programming: a developer's guide to parallel computing with GPUs" suggests using the following macro
// Check a CUDA runtime API call; on failure print file/line, the numeric
// error code and its message string, reset the device, and trap via assert.
// FIX: the original format string contained five conversion specifiers
// ("'%s', line %d\n %s\nerror %d: %s") but only four arguments were passed
// (__FILE__, __LINE__, err, cudaGetErrorString(err)) — undefined behavior.
// The stray "%s" is removed so specifiers and arguments match.
#define CUDA_CALL(call) { const cudaError_t err = (call); \
if(err != cudaSuccess) \
{ \
fprintf(stderr, "CUDA error in file '%s', line %d\nerror %d: %s\nterminating!\n", \
__FILE__, __LINE__, err, \
cudaGetErrorString(err)); \
cudaDeviceReset(); assert(0); \
} }
(note: it has been somewhat customized without altering the functionalities). The book says: "This technique works for all the CUDA calls except for the invocation of kernels." However, when using CUDA_CALL on a CUFFT routine call, the compiler returns
a value of type "cufftResult" cannot be used to initialize an entity of type "const cudaError_t".
It seems then that cufftResult and cudaError_t are not immediately compatible.
Investigating a bit more, from this NVIDIA CUDA Library link, it seems that cudaGetErrorString requires a cudaError_t input type.
My questions are the following:
Is there a way to make cufftResult and cudaError_t be compatible, so that I can use CUDA_CALL on CUFFT routines and receive the message string from an error code?
Is there any technical reason why implementing a different error for the CUFFT library? :-)
Thanks.
EDIT FOLLOWING ROBERT CROVELLA'S ANSWER
I have modified the CufftSafeCall routine as
// Check a cuFFT result; on failure print the call site, the numeric
// cufftResult and its enum name (via _cudaGetErrorEnum), reset the device,
// and trap via assert.
// FIX 1: the original format string had five conversion specifiers but only
// four arguments — undefined behavior; the stray "%s" is removed.
// FIX 2: use the 'file'/'line' parameters captured by the cufftSafeCall
// macro at the call site; __FILE__/__LINE__ here would always report this
// helper's own location instead.
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\nerror %d: %s\nterminating!\n",
file, line, err,
_cudaGetErrorEnum(err));
cudaDeviceReset(); assert(0);
}
}
to return also the error type string.
cufft is not part of the cuda runtime api. cufft is a separate library of functions. Since it's separate, it makes sense not to make cufft error enums dependent on the cuda runtime api library; such linkages hamper independent development of modules, codes, and libraries.
So when the book mentions CUDA calls, they are referring to the cuda runtime api, not the cufft library api.
Since the enumerated values returned from cufft library calls are independent of (and mostly orthogonal to) the enumerated values returned from the cuda runtime api, I don't think it's possible in any straightforward way to harmonize the two sets in a single macro. And since cuda calls and cufft calls may be intermingled in any piece of code, I can't think of an environmental way to do it. Someone else may come up with a clever approach, however.
If you want a cufft error enum to string parser, there is one in /usr/local/cuda/samples/common/inc/helper_cuda.h (assuming standard linux CUDA 5 install) that may be of interest. Pasting it in here for convenience:
// Only compiled when cufft.h has already been included (it defines
// _CUFFT_H_), so this header stays usable without cuFFT.
#ifdef _CUFFT_H_
// cuFFT API errors
// Translate a cufftResult error code into the name of its enum constant.
// Returns "<unknown>" for values not covered below (codes from newer
// cuFFT versions than this table was written for).
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#endif
I use the following macro in my project:
// NOTE: include cufft to import '_cudaGetErrorEnum(cufftResult error)'
#include <cufft.h>
#include <helper_cuda.h>
// Check a cuFFT call: on failure print the numeric cufftResult, its enum
// name (via _cudaGetErrorEnum from helper_cuda.h), and the call site, then
// exit(1). Note the body is a bare block rather than do { } while(0), so
// avoid using it as the sole statement of an if/else.
#define CHECK_CUFFT_ERRORS(call) { \
cufftResult_t err; \
if ((err = (call)) != CUFFT_SUCCESS) { \
fprintf(stderr, "cuFFT error %d:%s at %s:%d\n", err, _cudaGetErrorEnum(err), \
__FILE__, __LINE__); \
exit(1); \
} \
}

cudaHostRegister fails with 'invalid argument' error even with page-aligned memory

I have allocated page-aligned memory on host using posix_memalign. The call to posix_memalign does not return any error. However, using this pointer as argument to cudaHostRegister gives me an 'invalid argument' error. What could be the issue?
CUDA API version: 4.0
gcc version: 4.4.5
GPU compute capability: 2.0
The memory allocation is done in the application code, and a pointer is passed to a library routine.
Application code snippet:
if(posix_memalign((void **)&h_A, getpagesize(), n * n * sizeof(float))) {
printf("Error allocating aligned memory for A\n");
return 1;
}
Shared library code snippet:
if((ret = cudaSetDeviceFlags(cudaDeviceMapHost)) != cudaSuccess) {
fprintf(stderr, "Error setting device flag: %s\n",
cudaGetErrorString(ret));
return NULL;
}
if((ret = cudaHostRegister(h_A, n2 * sizeof(float),
cudaHostRegisterMapped)) != cudaSuccess) {
fprintf(stderr, "Error registering page-locked memory for A: %s\n",
cudaGetErrorString(ret));
return NULL;
}
I cannot reproduce this. If I take the code snippets you supplied and make them into a minimal executable:
#include <unistd.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdio.h>
// Minimal repro: page-aligned host allocation via posix_memalign, then
// cudaHostRegister with cudaHostRegisterMapped. Returns 0 on success,
// -1 with a message on stderr/stdout on any failure.
int main(void)
{
const int n2 = 100 * 100;
float *h_A;
cudaError_t ret;
// posix_memalign returns 0 on success; aligning to the system page size
// gives cudaHostRegister a page-aligned region to pin.
if(posix_memalign((void **)&h_A, getpagesize(), n2 * sizeof(float))) {
printf("Error allocating aligned memory for A\n");
return -1;
}
// NOTE(review): presumably cudaSetDeviceFlags must run before any
// context-creating runtime call for mapped host memory — confirm.
if((ret = cudaSetDeviceFlags(cudaDeviceMapHost)) != cudaSuccess) {
fprintf(stderr, "Error setting device flag: %s\n",
cudaGetErrorString(ret));
return -1;
}
// Page-lock the existing allocation and map it into the device address
// space. (h_A is intentionally never freed — this is a short-lived repro.)
if((ret = cudaHostRegister(h_A, n2 * sizeof(float),
cudaHostRegisterMapped)) != cudaSuccess) {
fprintf(stderr, "Error registering page-locked memory for A: %s\n",
cudaGetErrorString(ret));
return -1;
}
return 0;
}
it compiles and runs without error under both CUDA 4.2 and CUDA 5.0 on a 64 bit linux host with the 304.54 driver. I would, therefore, conclude that either you have a broken CUDA installation or your code has a problem somewhere you haven't shown us.
Perhaps you can compile and run this code exactly as I posted and see what happens. If it works, it might help narrow down what it is that might be going wrong here.