undefined reference to cusolverDn - cuda

I am trying to run the cuSolver library available in cuda 7.0. I have an issue with using the cuSolver library that must be very simple to fix, but here I am asking for some help.
I have looked at quite a few examples posted around and I chose in particular this one from JackOLantern:
Parallel implementation for multiple SVDs using CUDA
I have just reduced it to a kernel_0.cu:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<iostream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include<math.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
/********/
/* MAIN */
/********/
int main(){
    // --- gesvd only supports Nrows >= Ncols
    // --- column major memory ordering
    // --- cuSOLVE input/output parameters/arrays
    int *devInfo = NULL; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));

    // --- CUDA solver initialization
    // cuSOLVER reports failures through its own status type, so check it explicitly.
    cusolverDnHandle_t solver_handle;
    cusolverStatus_t status = cusolverDnCreate(&solver_handle);
    if (status != CUSOLVER_STATUS_SUCCESS) {
        fprintf(stderr, "cusolverDnCreate failed with status %d\n", (int)status);
        cudaFree(devInfo);   // best-effort cleanup on the error path
        return EXIT_FAILURE;
    }

    // --- Release resources in reverse order of acquisition
    status = cusolverDnDestroy(solver_handle);
    gpuErrchk(cudaFree(devInfo));   // the original never released this allocation
    if (status != CUSOLVER_STATUS_SUCCESS) {
        fprintf(stderr, "cusolverDnDestroy failed with status %d\n", (int)status);
        return EXIT_FAILURE;
    }
    return 0;
}
I use the same Utilities.cuh and Utilities.cu as JackOlantern. I compile it as (to be explicit):
/usr/local/cuda-7.0/bin/nvcc kernel_0.cu Utilities.cu
And what I get is:
Utilities.cu(27): warning: conversion from a string literal to "char *" is deprecated
Utilities.cu(27): warning: conversion from a string literal to "char *" is deprecated
/tmp/tmpxft_00007e1d_00000000-22_kernel_0.o: In function `main':
tmpxft_00007e1d_00000000-4_kernel_0.cudafe1.cpp:(.text+0x3d): undefined reference to `cusolverDnCreate'
tmpxft_00007e1d_00000000-4_kernel_0.cudafe1.cpp:(.text+0x49): undefined reference to `cusolverDnDestroy'
collect2: error: ld returned 1 exit status
If I comment out the cusolverDnCreate and cusolverDnDestroy, it compiles fine, so the library is apparently well included.
What simple and basic point am I missing? I have searched around, but I could not fix it. Thanks there.

What simple and basic point am I missing?
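Including cusolverDn.h only provides the declarations, so the code compiles but fails at link time. You have to link against the cusolver library: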
/usr/local/cuda-7.0/bin/nvcc kernel_0.cu Utilities.cu -lcusolver

Related

My C code won't compile against C code in a `.cu` file

I have 2 files that form a small CUDA library from my previous program (which works well, btw) written on C++.
The header for this library is:
#ifndef __cudaLU__
#define __cudaLU__
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <cusolverSp.h>
#include <cusparse.h>
#include <cuComplex.h>
#include <stdlib.h>
void denseLS(int dim,
std::complex<float> * A,
std::complex<float> * b );
void sparseLS(int dim,
std::complex<float> *csrVal,
int *csrRowPtr,
int *csrColInd,
std::complex<float> *vecVal);
#endif
And I want to use this library in my old-as-the-hills C program just by declaring the procedure at the top of my main.c file:
extern void denseLS(int dim, float complex *A, float complex *b);
And it fails with a bunch of similar errors. Few of them are:
..NA/cudaLS.cu(115): error: namespace "std" has no member "complex"
..NA/cudaLS.cu(115): error: expected a ")"
..NA/cudaLS.cu(137): error: identifier "csrRowPtr" is undefined
..NA/cudaLS.cu(169): error: identifier "csrColInd" is undefined
..NA/cudaLS.cu(170): error: identifier "csrVal" is undefined
..NA/cudaLS.cu(171): error: identifier "vecVal" is undefined
I tried changing std::complex<float> to float complex, but it did not help: I still get the same errors (apart from the std one, of course).
The cmake instructions file
# Build script from the question: a static CUDA library ("solvers") plus a C executable ("NA") that links it.
cmake_minimum_required(VERSION 3.8)
project(NA)
# Request the C11 standard for C sources.
set(CMAKE_C_STANDARD 11)
# Both packages are REQUIRED, so configuration stops if either cannot be found.
find_package(GSL REQUIRED)
find_package(CUDA REQUIRED)
include_directories("${CUDA_INCLUDE_DIRS}")
# Static library built from the CUDA source and its header.
cuda_add_library(solvers STATIC
cudaLS.cu
cudaLS.h)
# Link the libraries reported by the CUDA package, plus cuSPARSE and cuSOLVER, into the library.
target_link_libraries(solvers ${CUDA_LIBRARIES} ${CUDA_cusparse_LIBRARY} ${CUDA_cusolver_LIBRARY})
# PUBLIC: the C++11 requirement also applies to targets that link against solvers.
target_compile_features(solvers PUBLIC cxx_std_11)
set_target_properties( solvers
PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# The executable is built from the plain C source only.
add_executable(NA main.c)
# NOTE(review): NA has no CUDA sources of its own; confirm this property is needed here.
set_target_properties(NA PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# Link GSL, the math library and the CUDA solvers library into the executable.
target_link_libraries(NA PRIVATE GSL::gsl m solvers)
What am I doing wrong pals?
UPD:
g++/gcc - 7.3
Linux
Well, I found exactly what I was doing wrong.
CMake is OK, but the declarations in the .h file have to be modified to
extern "C" void denseLS(int dim, cuComplex *A, cuComplex *b );
The CUDA functions have to be declared in the .c file (at the top, or in a separate .h file) as
void denseLS(int dim, float complex *A, float complex *b);

error: calling a __host__ function from a __global__ function is not allowed

I have written a CUDA function for dense sampling of feature points, but I am getting an error. My CUDA code is given below. I am using the CUDA 7.5 toolkit.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include "opencv2/imgproc/imgproc.hpp"
#include <opencv2/gpu/gpu.hpp>
#include <opencv2/opencv.hpp>
using namespace cv::gpu;
using namespace cv;
using namespace std;
// Kernel from the question: each thread takes one point, floors its coordinates,
// divides them by min_distance, and increments the counter at index y*width+x.
// NOTE(review): as posted this does not compile. std::vector::operator[], the
// cv::Point_ constructor and cvFloor are host functions, and nvcc refuses to call
// them from a __global__ function.
// d_x_max and d_y_max are only referenced by the commented-out bounds check below.
__global__ void densefun(std::vector<int>* d_counters,std::vector<Point2f>* d_points,int d_x_max,int d_y_max,int width, int min_distance)
{
// Flat thread index; used directly as the point index (there is no bounds check on i).
int i = blockDim.x * blockIdx.x + threadIdx.x;
Point2f point = (*d_points)[i];
int x = cvFloor(point.x);
int y = cvFloor(point.y);
//if(x >= d_x_max || y >= d_y_max)
//continue;
// Integer division maps the floored coordinates onto the counter grid.
x /= min_distance;
y /= min_distance;
// NOTE(review): plain (non-atomic) increment; threads whose points map to the same
// index would race on this counter.
(*d_counters)[y*width+x]++;
}
// Host wrapper from the question: allocates device buffers, copies the points over,
// launches densefun with a single block of points.size() threads, copies the counters
// back and frees the device buffers. x_max, y_max and width are forwarded to the kernel.
// NOTE(review): none of the CUDA API return values are checked, and the launch is not
// followed by an error query.
void dense(std::vector<int>& counters,std::vector<Point2f>& points,int x_max,int y_max,int width)
{
std::vector<int>* d_counters;
std::vector<Point2f>* d_points;
int min_distance=5;
// NOTE(review): cudaMalloc/cudaMemcpy sizes are byte counts, but element counts are
// passed here (no multiplication by the element size).
cudaMalloc(&d_counters,counters.size());
cudaMalloc(&d_points,points.size());
// NOTE(review): &points is the address of the std::vector object itself, not of its
// element storage.
cudaMemcpy(d_points, &points, points.size(), cudaMemcpyHostToDevice);
// NOTE(review): d_counters is never initialised on the device before the kernel
// increments it.
densefun<<<1,points.size()>>>(d_counters,d_points,x_max,y_max,width,min_distance);
// NOTE(review): &counters likewise addresses the vector object, so this copy writes
// over the host vector object rather than its elements.
cudaMemcpy(&counters, d_counters, counters.size(), cudaMemcpyDeviceToHost);
cudaFree(d_counters);
cudaFree(d_points);
}
Output:
/home/supriya/Desktop/5Dec/CalculateFV_merged_gpu_old/build/denseCuda.cu(28):
error: calling a host function("cv::Point_ ::Point_") from
a global function("densefun") is not allowed
/home/supriya/Desktop/5Dec/CalculateFV_merged_gpu_old/build/denseCuda.cu(28):
error: calling a host function("std::vector ,
std::allocator > > ::operator []") from a global
function("densefun") is not allowed
/home/supriya/Desktop/5Dec/CalculateFV_merged_gpu_old/build/denseCuda.cu(29)
(col. 7): error: calling a host function("cvFloor") from a
global function("densefun") is not allowed
/home/supriya/Desktop/5Dec/CalculateFV_merged_gpu_old/build/denseCuda.cu(30)
(col. 7): error: calling a host function("cvFloor") from a
global function("densefun") is not allowed
/home/supriya/Desktop/5Dec/CalculateFV_merged_gpu_old/build/denseCuda.cu(35):
error: calling a host function("std::vector > ::operator []") from a global
function("densefun") is not allowed
5 errors detected in the compilation of
"/tmp/tmpxft_00000c85_00000000-7_denseCuda.cpp1.ii". CMake Error at
testVideo_generated_denseCuda.cu.o.cmake:260 (message): Error
generating file
/home/supriya/Desktop/5Dec/CalculateFV_merged_gpu_old/build/CMakeFiles/testVideo.dir//./testVideo_generated_denseCuda.cu.o
CMakeFiles/testVideo.dir/build.make:392: recipe for target
'CMakeFiles/testVideo.dir/./testVideo_generated_denseCuda.cu.o' failed
make[2]: *
[CMakeFiles/testVideo.dir/./testVideo_generated_denseCuda.cu.o] Error
1 CMakeFiles/Makefile2:130: recipe for target
'CMakeFiles/testVideo.dir/all' failed make[1]: *
[CMakeFiles/testVideo.dir/all] Error 2 Makefile:76: recipe for target
'all' failed make: *** [all] Error 2
You cannot use C++ standard library, OpenCV or any other non-CUDA specific library inside a CUDA kernel.
Instead of std::vector you need to use raw pointers to arrays allocated on the device, instead of Point2f you need to use the CUDA-specific vector type float2, instead of cvFloor you need to use the __device__ function floorf(), and so on.

Why is this not copying from device to host in Cuda?

I'm working through the examples of the "CUDA by Example" book. The following code doesn't work as it should and doesn't give me an answer. Where's the mistake?
Will appreciate your help and answers.
I get an output,which reads
Calculation done on GPU yields the answer: &d
Press enter to stop
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
using namespace std;
// Kernel: adds the two integer arguments and stores the sum in the int that c points to.
// Every launched thread performs the same single store.
__global__ void add_integers_cuda(int a, int b, int *c)
{
    const int sum = a + b;
    c[0] = sum;
}
// Host driver: runs add_integers_cuda on the GPU with the inputs 2 and 7, copies the
// result back to the host and prints a message, then waits for the user to press enter.
// NOTE(review): the return values of the CUDA calls are not checked.
int main(void)
{
int c;
int *dev_ptr;
cudaMalloc((void **)&dev_ptr, sizeof(int)); //allocate sizeof(int) bytes of contiguous memory in the gpu device and return the address of first byte to dev_ptr.
// call the kernel
add_integers_cuda <<<1,1>>>(2,7,dev_ptr);
// Copy the single int result from the device allocation into the host variable c.
cudaMemcpy(&c, dev_ptr, sizeof(int), cudaMemcpyDeviceToHost);
// NOTE(review): the format string contains "&d", not a conversion specifier, so it is
// printed literally and c is never formatted.
printf("Calculation done on GPU yields the answer: &d\n",c );
cudaFree(dev_ptr);
printf("Press enter to stop.");
// Block until the user presses enter (discards up to 255 characters or through the newline).
cin.ignore(255, '\n');
return 0;
}
"
&d is not a correct printf formatting character here:
printf("Calculation done on GPU yields the answer: &d\n",c );
You won't get the output you are expecting.
You should use %d instead:
printf("Calculation done on GPU yields the answer: %d\n",c );
This particular issue has nothing to do with CUDA of course.
You may also want to run CUDA codes with cuda-memcheck and/or use proper CUDA error checking if you are just learning and having trouble. Neither of those would have pointed out the above error, however.

cuda Texture declaration compile-time error

I'm trying to compile the following piece of code:
#include <stdio.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
texture<float, 2, cudaReadModeElementType> tex;
int main () { ... }
yet, nvcc gives me the following error:
main.c:6:8: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘<’ token
I'm pretty new to CUDA, so I suppose I'm missing something here.
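You can only use CUDA syntax in .cu files. nvcc hands a .c file straight to the host C compiler, which does not understand the texture<...> declaration; rename main.c to main.cu (or compile it with nvcc -x cu).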

cudaMemset fails on __device__ variable

I am having trouble using cudaMemset on a device variable. Is it possible to use the address of the device variable for cudaMemset, or is it just a matter of missing compiler flags or libraries? I am using CUDA 4.1, and
NVRM version: NVIDIA UNIX x86_64 Kernel Module 285.05.33 Thu Jan 19
14:07:02 PST 2012
This is my sample code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// device variable and kernel
__device__ float d_test;
// Minimal reproduction from the question: tries to zero the __device__ variable d_test
// by passing its address, taken in host code, straight to cudaMemset.
int main() {
// The call does not return cudaSuccess here, so the program prints "Error!".
if (cudaMemset(&d_test,0,sizeof(float)) !=cudaSuccess)
printf("Error!\n");
}
which outputs:
Error!
Your problem is that d_test (as it appears in the host symbol table) isn't a valid device address and the runtime cannot access it directly. The solution is to use the cudaGetSymbolAddress API function to read the address of the device symbol from the context at runtime. Here is a slightly expanded version of your demonstration case which should work correctly:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// device variable and kernel
__device__ float d_test;
// Reports a failed CUDA runtime call on stderr and optionally terminates the program.
//   code  - status returned by the CUDA runtime call being checked
//   file  - source file of the call site (gpuErrchk passes __FILE__)
//   line  - source line of the call site (gpuErrchk passes __LINE__)
//   Abort - when true (the default), exit using the error code as the process status
// `file` is const-qualified because __FILE__ expands to a string literal; binding a
// literal to a plain `char *` is a deprecated conversion that compilers warn about.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool Abort=true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (Abort) exit(code);
    }
}
// Wraps a CUDA runtime call and checks its status at the call site.
// The do { } while (0) form makes `gpuErrchk(x);` a single statement, so it is safe
// inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
// Zeroes the __device__ variable d_test from the host by first looking up its device
// address and then passing that address to cudaMemset. Every call is checked with gpuErrchk.
int main()
{
// Host-side pointer that receives the device address of d_test.
float * _d_test;
// NOTE(review): presumably issued to force CUDA context creation before the symbol
// lookup; confirm.
gpuErrchk( cudaFree(0) );
// Read the address of the device symbol; the symbol is named by a string in this form of the call.
gpuErrchk( cudaGetSymbolAddress((void **)&_d_test, "d_test") );
// _d_test now holds a device address, so it can be handed to host-side API functions.
gpuErrchk( cudaMemset(_d_test,0,sizeof(float)) );
// NOTE(review): cudaThreadExit is a deprecated name in later toolkits; cudaDeviceReset
// is its documented replacement.
gpuErrchk( cudaThreadExit() );
return 0;
}
Here, we read the address of the device symbol d_test from the context into a host pointer _d_test. This can then be passed to host side API functions like cudaMemset, cudaMemcpy, etc.
Edit to note that the form of cudaGetSymbolAddress shown in this answer has been deprecated and removed from the CUDA runtime API. For modern CUDA, the call would be:
gpuErrchk( cudaGetSymbolAddress((void **)&_d_test, d_test) );
I believe you can also use cudaMemcpyFromSymbol:
A function, such as the following kernel, can change the value of the variable declared in global memory (outside of the main function)
// Kernel that writes 1.0 into the __device__ variable d_test; the f suffix keeps the literal single precision.
__global__ void kernel1() { d_test = 1.0f; }
Inside your main, you can obtain the value using cudaMemcpyFromSymbol
cudaMemcpyFromSymbol(&h_test,"d_test",sizeof(float),0,cudaMemcpyDeviceToHost);
Of course, there is also cudaMemcpyToSymbol to change the value of the global variable.
The idea came from here: Having problem assigning a device variable in CUDA