CURAND Library - Compiling Error - Undefined reference to functions - cuda

I have the following code which I am trying to compile using nvcc.
Code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>
/* Abort with a diagnostic when a CUDA runtime call does not return cudaSuccess. */
#define CUDA_CALL(call)                                                    \
    do {                                                                   \
        cudaError_t cuda_status_ = (call);                                 \
        if (cuda_status_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(cuda_status_));           \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/* Abort with a diagnostic when a cuRAND call does not return CURAND_STATUS_SUCCESS. */
#define CURAND_CALL(call)                                                  \
    do {                                                                   \
        curandStatus_t curand_status_ = (call);                            \
        if (curand_status_ != CURAND_STATUS_SUCCESS) {                     \
            fprintf(stderr, "cuRAND error %d at %s:%d\n",                  \
                    (int)curand_status_, __FILE__, __LINE__);              \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Generates n pseudo-random 32-bit values on the device with the MRG32K3A
 * generator, copies them to the host and prints them on one line.
 *
 * Build note: the cuRAND symbols live in a separate library, so the program
 * must be linked with -lcurand (e.g. `nvcc -o RNG7 RNG7.cu -lcurand`).
 */
int main(void)
{
    const size_t n = 100;
    size_t i;
    /* curandGenerate() fills an unsigned int buffer, so the host copy is
       unsigned as well and is printed with %u rather than %d. */
    unsigned int *hostData;
    unsigned int *devData;
    curandGenerator_t gen;

    hostData = (unsigned int *)calloc(n, sizeof(unsigned int));
    if (hostData == NULL) {
        fprintf(stderr, "Host allocation of %lu elements failed\n",
                (unsigned long)n);
        return EXIT_FAILURE;
    }

    CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MRG32K3A));
    CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 12345));
    CUDA_CALL(cudaMalloc((void **)&devData, n * sizeof(unsigned int)));
    CURAND_CALL(curandGenerate(gen, devData, n));
    /* Blocking copy: also waits for the generation above to finish. */
    CUDA_CALL(cudaMemcpy(hostData, devData, n * sizeof(unsigned int),
                         cudaMemcpyDeviceToHost));

    for (i = 0; i < n; i++)
    {
        printf("%u ", hostData[i]);
    }
    printf("\n");

    CURAND_CALL(curandDestroyGenerator(gen));
    CUDA_CALL(cudaFree(devData));
    free(hostData);
    return 0;
}
This is the output I receive:
$ nvcc -o RNG7 RNG7.cu
/tmp/tmpxft_00005da4_00000000-13_RNG7.o: In function `main':
tmpxft_00005da4_00000000-1_RNG7.cudafe1.cpp:(.text+0x6c): undefined reference to `curandCreateGenerator'
tmpxft_00005da4_00000000-1_RNG7.cudafe1.cpp:(.text+0x7a): undefined reference to `curandSetPseudoRandomGeneratorSeed'
tmpxft_00005da4_00000000-1_RNG7.cudafe1.cpp:(.text+0xa0): undefined reference to `curandGenerate'
tmpxft_00005da4_00000000-1_RNG7.cudafe1.cpp:(.text+0x107): undefined reference to `curandDestroyGenerator'
collect2: ld returned 1 exit status
My initial guess is that for some reason the CURAND Library is not properly installed or that it cannot find the curand.h header file.
Please let me know what I should look for or how to solve my problem.
Thanks!

@Wilo Maldonado: just add the linker flag -lcurand, and
additionally -L/path/to/cuda/libs if that directory is not already on your linker search path.

The problem is not the header file, otherwise you would have got a compile error. You have a linker error. You will need to tell your linker where to find the object or library file that contains those functions.

Related

How to use nvcc with gprbuild?

I have a code in Ada that must use CUDA without using the Ada binding. So I made an interface that allows the Ada program to call C code. Now I want to compile it.
How can I tell gprbuild to compile .cu files with nvcc instead of gcc? If that's not possible, maybe I have to generate the objects using nvcc and then link them with the Ada code? How would you do it?
EDIT: Using the link given by Simon Wright, I made this gpr file:
-- GPRbuild project that builds the Ada main program cuda_interface.adb
-- together with sources of a second language named "Cuda".
project Cuda_Interface is
-- Both languages are compiled from the same source directory.
for Languages use ("Ada", "Cuda");
for Source_Dirs use ("src");
for Object_Dir use "obj";
for Exec_Dir use ".";
for Main use ("cuda_interface.adb");
for Create_Missing_Dirs use "True";
-- File-name conventions for the "Cuda" language: .cu bodies, .cuh specs.
package Naming is
for Body_Suffix("Cuda") use ".cu";
for Spec_Suffix("Cuda") use ".cuh";
end Naming;
-- "Cuda" sources are handed to nvcc; "-c" is always passed first.
package Compiler is
for Driver("Cuda") use "nvcc";
for Leading_Required_Switches("Cuda") use ("-c");
end Compiler;
-- Libraries and library search path passed when linking the Ada main.
package Linker is
for Default_Switches("Ada") use ("-L/usr/local/cuda/lib64", "-lcuda", "-lcudart", "-lm");
end Linker;
end Cuda_Interface;
The compilation works well but the linker returns this error:
/usr/bin/ld : cuda_interface.o : in the function « _ada_cuda_interface » :
cuda_interface.adb:(.text+0x3a5) : undefined reference to « inter_add_two »
collect2: error: ld returned 1 exit status
gprbuild: link of cuda_interface.adb failed
cuda_interface.adb:
with Ada.Text_IO; use Ada.Text_IO;
--  Prints a five-element array, passes it to the imported C routine
--  "inter_add_two", then prints the array again.
procedure Cuda_Interface is
   type Index is range 1 .. 5;
   type Element_Type is new Natural;
   type Array_Type is array (Index) of Element_Type;

   --  Binding to the external symbol "inter_add_two" (C convention).
   --  NOTE(review): the C side declares both parameters in terms of
   --  unsigned int; confirm that Index and Element_Type are passed with a
   --  compatible representation.
   procedure Inter_Add_Two (Arr : in out Array_Type; Length : Index)
     with
       Import        => True,
       Convention    => C,
       External_Name => "inter_add_two";

   A : Array_Type := (1, 2, 3, 4, 5);

   --  Writes one "Value at <index> is <element>" line per array element.
   procedure Show (Values : Array_Type) is
   begin
      for Pos in Values'Range loop
         Put_Line ("Value at "
                   & Index'Image (Pos)
                   & " is "
                   & Element_Type'Image (Values (Pos)));
      end loop;
   end Show;
begin
   Show (A);
   New_Line;
   Inter_Add_Two (A, Index'Last);
   Show (A);
end Cuda_Interface;
kernel.cuh
// Declarations for the kernel and its host-side launcher defined in kernel.cu.
#ifndef __KERNEL_CUH__
#define __KERNEL_CUH__
#include <cuda.h>
// Device kernel operating on `length` elements of `a`.
__global__ void kernel_add_two(unsigned int *a, unsigned int length);
// Host entry point that the Ada code imports under the name "inter_add_two".
// NOTE(review): declared without extern "C", so the symbol presumably gets a
// C++-mangled name and cannot be resolved by a C-convention import - confirm.
void inter_add_two(unsigned int *a, unsigned int length);
#endif // __KERNEL_CUH__
kernel.cu
#include "kernel.cuh"
#include <math.h>
#define THREADS_PER_BLOCK (1024)
// Adds 2 to each of the first `length` elements of `a`, one element per thread.
// Threads whose global index falls at or beyond `length` do nothing.
__global__ void kernel_add_two(unsigned int *a, unsigned int length)
{
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= length)
        return;
    a[idx] += 2;
}
// Host wrapper around kernel_add_two: copies `length` elements of the host
// array `a` to the device, launches the kernel on them and copies the result
// back into `a`.
//
// a      - host array of at least `length` elements, updated in place
// length - number of elements to process
//
// The function returns void, so failures cannot be reported to the caller;
// on any CUDA error the remaining steps are skipped, the device buffer is
// released and `a` is left as it was at the point of failure.
void inter_add_two(unsigned int *a, unsigned int length)
{
    // Nothing to do; a launch with zero blocks would be an invalid configuration.
    if (!a || length == 0)
        return;

    const size_t bytes = sizeof(unsigned int) * (size_t)length;
    // Integer ceiling division: exact for every length, unlike the float
    // ceil() form, and free of overflow near the top of the unsigned range.
    const unsigned int block_number =
        length / THREADS_PER_BLOCK + (length % THREADS_PER_BLOCK != 0 ? 1u : 0u);

    unsigned int *d_a = 0;
    if (cudaMalloc((void**)&d_a, bytes) != cudaSuccess)
        return;

    if (cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice) == cudaSuccess) {
        kernel_add_two<<<block_number, THREADS_PER_BLOCK>>>(d_a, length);
        // Only copy back when the launch itself was accepted.
        if (cudaGetLastError() == cudaSuccess)
            cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost);
    }
    cudaFree(d_a);
}
Thanks to the comments, I successfully compiled and ran an Ada program calling C code which calls CUDA code. These are the files I edited :
kernel.cuh
// Declarations for the kernel and its host-side launcher defined in kernel.cu,
// exposed with C linkage.
#ifndef __KERNEL_CUH__
#define __KERNEL_CUH__
#include <cuda.h>
// NOTE(review): defines a symbol with the name of the C++ runtime's exception
// personality routine - presumably a workaround for linking without the C++
// runtime library; confirm. Being a definition in a header, it would be
// duplicated if this header were included from more than one source file.
void *__gxx_personality_v0;
// C linkage keeps the symbol names unmangled.
extern "C"
{
__global__ void kernel_add_two(unsigned int *a, unsigned int length);
void inter_add_two(unsigned int *a, unsigned int length);
}
#endif // __KERNEL_CUH__
cuda_interface.gpr
-- GPRbuild project that builds the Ada main program cuda_interface.adb
-- together with sources of a second language named "Cuda".
project Cuda_Interface is
-- Both languages are compiled from the same source directory.
for Languages use ("Ada", "Cuda");
for Source_Dirs use ("src");
for Object_Dir use "obj";
for Exec_Dir use ".";
for Main use ("cuda_interface.adb");
for Create_Missing_Dirs use "True";
-- File-name conventions for the "Cuda" language: .cu bodies, .cuh specs.
package Naming is
for Body_Suffix("Cuda") use ".cu";
for Spec_Suffix("Cuda") use ".cuh";
end Naming;
-- "Cuda" sources are handed to nvcc; "-c" is always passed first.
package Compiler is
for Driver("Cuda") use "nvcc";
for Leading_Required_Switches("Cuda") use ("-c");
end Compiler;
-- Libraries and library search path passed when linking the Ada main;
-- this version additionally links cudadevrt.
package Linker is
for Default_Switches("Ada") use ("-L/usr/local/cuda/lib64", "-lcuda", "-lcudart", "-lcudadevrt", "-lm");
end Linker;
end Cuda_Interface;

Increment and access global counter CUDA

I need to make my kernel communicate with the host. I tried to use a global counter (better ways are well accepted), but the following code prints always 0. What am I doing wrong? (I tried both commented and uncommented ways).
#include <stdio.h>
#include <cuda_runtime.h>
// Alternative array form of the counter, kept commented out by the author.
//__device__ int count[1] = {0};
// Counter in device memory, initialised to 0.
__device__ int count = 0;
// Kernel: every launched thread adds 1 to `count` atomically.
__global__ void inc() {
//count[0]++;
atomicAdd(&count, 1);
}
// Launches inc with 1 block of 10 threads, then tries to read `count` back
// into the host variable `c` and prints it.
int main(void) {
inc<<<1,10>>>();
// Wait for the kernel to finish before reading the counter.
cudaDeviceSynchronize();
//int *c;
int c;
// NOTE: the 4th parameter of cudaMemcpyFromSymbol is the offset, not the copy
// direction, so cudaMemcpyDeviceToHost is used as an offset here. The return
// value is not checked, so a failure of this call goes unnoticed.
cudaMemcpyFromSymbol(&c, count, sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\n", c);
return 0;
}
Anytime you are having trouble with a CUDA code, I strongly encourage you to use proper CUDA error checking and run your code with cuda-memcheck, before asking others for help. Even if you don't understand the error output, providing it in your question will be useful for those trying to help you.
If you had done so, you would have received a report that cudaMemcpyFromSymbol is throwing an invalid argument error.
If you study the documentation for that function call, you will see that the 4th parameter is not the direction parameter but the offset parameter, so passing cudaMemcpyDeviceToHost there is incorrect. The direction is a later parameter that defaults to a device->host transfer, which is exactly what you want here, so specifying it is redundant and unnecessary. Your code works correctly for me simply by eliminating that argument:
$ cat t1414.cu
#include <stdio.h>
#include <cuda_runtime.h>
// Alternative array form of the counter (not used):
//__device__ int count[1] = {0};
// Counter in device memory, initialised to 0.
__device__ int count = 0;

// Kernel: each launched thread atomically adds 1 to `count`.
// (The non-atomic array form would have been: count[0]++;)
__global__ void inc() {
    int *counter = &count;
    atomicAdd(counter, 1);
}
// Launches inc with 1 block of 10 threads, reads the device counter back and
// prints it. Returns 0 on success; on any CUDA error prints the error string
// to stderr and returns 1 instead of printing an undefined value.
int main(void) {
    int c = 0;

    inc<<<1,10>>>();
    // Launch-configuration errors are only visible through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    // Errors raised while the kernel runs surface at the next synchronisation.
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    // Offset and direction are left at their defaults.
    if (err == cudaSuccess)
        err = cudaMemcpyFromSymbol(&c, count, sizeof(int));

    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("%d\n", c);
    return 0;
}
$ nvcc -o t1414 t1414.cu
$ cuda-memcheck ./t1414
========= CUDA-MEMCHECK
10
========= ERROR SUMMARY: 0 errors
$

Why is this not copying from device to host in Cuda?

I'm working through the examples in the "CUDA by Example" book. The following code doesn't give me the answer and doesn't work as it should. Where's the mistake?
Will appreciate your help and answers.
I get an output which reads:
Calculation done on GPU yields the answer: &d
Press enter to stop
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
using namespace std;
// Kernel: stores the sum of `a` and `b` in the int that `c` points to.
__global__ void add_integers_cuda(int a, int b, int *c)
{
    const int sum = a + b;
    c[0] = sum;
}
// Runs add_integers_cuda(2, 7) with a single thread, copies the result back
// to the host, prints a message and waits for a line of input.
int main(void)
{
int c;
int *dev_ptr;
cudaMalloc((void **)&dev_ptr, sizeof(int)); //allocate sizeof(int) bytes of contiguous memory in the gpu device and return the address of first byte to dev_ptr.
// call the kernel
add_integers_cuda <<<1,1>>>(2,7,dev_ptr);
// Copy the single int result from device memory into c.
cudaMemcpy(&c, dev_ptr, sizeof(int), cudaMemcpyDeviceToHost);
// NOTE: the format string contains "&d", which is not a conversion
// specifier, so it is printed literally and the value of c is not shown.
printf("Calculation done on GPU yields the answer: &d\n",c );
cudaFree(dev_ptr);
printf("Press enter to stop.");
// Discard up to 255 characters of input, stopping at the newline.
cin.ignore(255, '\n');
return 0;
}
"
&d is not a valid printf conversion specifier here:
printf("Calculation done on GPU yields the answer: &d\n",c );
You won't get the output you are expecting.
You should use %d instead:
printf("Calculation done on GPU yields the answer: %d\n",c );
This particular issue has nothing to do with CUDA of course.
You may also want to run CUDA codes with cuda-memcheck and/or use proper CUDA error checking if you are just learning and having trouble. Neither of those would have pointed out the above error, however.

Matrix not copied back from device to host successfully in CUDA

I am new to CUDA. I wrote a kernel (GPUsetIdentity) to create an identity matrix of dimension size x size. Inside a function, GPUfunctioncall, I call my kernel. The identity matrix should be stored in dDataInv, but when I copy it back to dataOut (size x size), all the values are zero. I know I am doing something very stupid somewhere, but I couldn't find it. As I am new to CUDA, I would appreciate it if anyone could point out my mistake. Thanks.
#include <stdio.h>
#include <malloc.h>
#include <memory.h>
#include <math.h>
#include <stdlib.h>
#include <iostream>
#include <stdlib.h>
#include <string>
#include <fstream>
#include <iterator>
#include <sstream>
#include <vector>
#include <cstring>
#include <cstdlib>
#include <ctime>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "cuda.h"
#define BLOCKSIZE 16
using namespace std;
// Kernel: each thread writes 1 to one diagonal element of `matrix`, using
// `width` as the row stride.
// NOTE: there is no bounds check, so every launched thread writes, whatever
// the value of `width`.
__global__ void GPUsetIdentity (float* matrix, int width)
{
int tx = threadIdx.x;
int bx = blockIdx.x;
// Global thread index; assumes the block size equals BLOCKSIZE.
int offset = bx * BLOCKSIZE + tx;
// Element (offset, offset) of a matrix with `width` elements per row.
matrix[offset + width * offset] = 1;
}
// Prints a host matrix to std::cout, one row per line with a space after
// every element, followed by a blank line.
//
// A          - matrix data, read as row-major (row i starts at A[i * nr_cols_A])
// nr_rows_A  - number of rows
// nr_cols_A  - number of columns
//
// The row stride is the number of columns; using the number of rows as the
// stride is only correct for square matrices.
void print_matrix_host(float* A , int nr_rows_A, int nr_cols_A) {
    for (int i = 0; i < nr_rows_A; ++i) {
        const float* row = A + static_cast<size_t>(i) * nr_cols_A;
        for (int j = 0; j < nr_cols_A; ++j) {
            std::cout << row[j] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
// Allocates a device buffer, zeroes it, launches GPUsetIdentity on it and
// copies the buffer back into hDataOut. Always returns 0; no CUDA return
// value is checked.
int GPUfunctioncall (float* hDataOut, int size){
float *dDataInv;
// NOTE: `size` is passed as the byte count here and in the cudaMemset and
// cudaMemcpy calls below, i.e. `size` bytes rather than size*size floats.
cudaMalloc ((void **) &dDataInv, size);
cudaMemset ((void *) dDataInv, 0, size);
dim3 idyThreads (BLOCKSIZE);
// NOTE: integer division - the block count is 0 whenever size < BLOCKSIZE.
dim3 idyBlocks (size / BLOCKSIZE);
GPUsetIdentity <<< idyBlocks, idyThreads >>> (dDataInv, size);
// Wait for the kernel before copying the result back.
cudaThreadSynchronize ();
cudaMemcpy ((void *) hDataOut, (void *) dDataInv, size, cudaMemcpyDeviceToHost);
cudaFree (dDataInv);
return 0;
}
// Builds a size x size matrix through GPUfunctioncall and prints it.
int main()
{
int size = 4;
float* dataOut;
// Host buffer for size*size floats; its contents are not initialised here.
dataOut = new float[size*size];
GPUfunctioncall(dataOut, size);
print_matrix_host(dataOut, size, size);
}
Any time you are having trouble with a CUDA code, it's good practice to use proper cuda error checking. You can also run your code with cuda-memcheck to get a quick read on whether there are any errors.
Using either of these methods, you would have discovered an "invalid configuration error" on your kernel launch. This usually means that the parameters in the <<< >>> syntax are incorrect. When you run into this type of error, simply printing out those values may indicate the problem.
In your case, this line of code:
dim3 idyBlocks (size / BLOCKSIZE);
results in a value of 0 for idyBlocks when size is 4 and BLOCKSIZE is 16. So you are requesting a kernel launch of 0 blocks which is illegal. Therefore your kernel is not running and your results are not what you expect.
There are a variety of ways to solve this, many of them involving detecting this condition and adding an "extra block" when size is not evenly divisible by BLOCKSIZE. Using this approach, we may be launching "extra threads", so we must include a "thread check" in the kernel to prevent those extra threads from doing anything (such as accessing arrays out of bounds). For this, we often need to know the intended size in the kernel, and we can pass this value as an extra kernel parameter.
You've also made some errors in your handling of device variables. The following code:
dataOut = new float[size*size];
allocates enough space for a square matrix of dimension size. But the following code:
cudaMalloc ((void **) &dDataInv, size);
only allocates enough space for size bytes. You want size*size*sizeof(float) instead of size here, and likewise you want it in the following cudaMemset and cudaMemcpy operations. cudaMalloc, cudaMemset and cudaMemcpy require a size parameter in bytes, just like malloc, memset, and memcpy. This error is found in your usage of cudaMemset and cudaMemcpy as well.
The following code has those modifications, and seems to work correctly for me:
$ cat t580.cu
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#define BLOCKSIZE 16
using namespace std;
// Kernel: writes 1 to the diagonal of a square matrix stored with `width`
// elements per row. Intended for a 1D launch with one thread per diagonal
// element; the grid may contain more threads than diagonal elements.
//
// matrix - device buffer holding at least width * width floats
// width  - row stride of the matrix, in elements
// size   - number of diagonal elements to set
//
// The guard tests the global index (not the index within the block), so
// surplus threads in every block - not only the first - are prevented from
// writing out of bounds.
__global__ void GPUsetIdentity (float* matrix, int width, int size)
{
    const int offset = blockIdx.x * blockDim.x + threadIdx.x;
    if (offset < size && offset < width)
        matrix[static_cast<size_t>(offset) * width + offset] = 1.0f;
}
// Prints a host matrix to std::cout, one row per line with a space after
// every element, followed by a blank line.
//
// A          - matrix data, read as row-major (row i starts at A[i * nr_cols_A])
// nr_rows_A  - number of rows
// nr_cols_A  - number of columns
//
// The row stride is the number of columns; using the number of rows as the
// stride is only correct for square matrices.
void print_matrix_host(float* A , int nr_rows_A, int nr_cols_A) {
    for (int i = 0; i < nr_rows_A; ++i) {
        const float* row = A + static_cast<size_t>(i) * nr_cols_A;
        for (int j = 0; j < nr_cols_A; ++j) {
            std::cout << row[j] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
// Creates a size x size identity matrix on the device and copies it into
// hDataOut.
//
// hDataOut - host buffer with room for size * size floats
// size     - matrix dimension (rows == columns)
//
// Returns 0 on success. Returns 1 when the arguments are invalid or any CUDA
// call fails; in the latter case the CUDA error string is written to stderr.
int GPUfunctioncall (float* hDataOut, int size){
    if (hDataOut == NULL || size <= 0)
        return 1;

    // All CUDA memory calls take byte counts.
    const size_t bytes = static_cast<size_t>(size) * size * sizeof(float);
    float *dDataInv = NULL;

    cudaError_t err = cudaMalloc ((void **) &dDataInv, bytes);
    if (err == cudaSuccess)
        err = cudaMemset ((void *) dDataInv, 0, bytes);

    if (err == cudaSuccess) {
        dim3 idyThreads (BLOCKSIZE);
        // Ceiling division. The earlier form
        //   size/BLOCKSIZE + (size%BLOCKSIZE)?1:0
        // parses as (size/BLOCKSIZE + size%BLOCKSIZE) ? 1 : 0 and therefore
        // never yields more than one block.
        int num_blocks = (size + BLOCKSIZE - 1) / BLOCKSIZE;
        dim3 idyBlocks (num_blocks);
        GPUsetIdentity <<< idyBlocks, idyThreads >>> (dDataInv, size, size);
        // Launch-configuration errors are only reported here.
        err = cudaGetLastError ();
    }
    // Replaces the deprecated cudaThreadSynchronize(); reports errors raised
    // while the kernel was running.
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize ();
    if (err == cudaSuccess)
        err = cudaMemcpy ((void *) hDataOut, (void *) dDataInv, bytes, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
        fprintf (stderr, "GPUfunctioncall: %s\n", cudaGetErrorString (err));

    cudaFree (dDataInv);
    return err == cudaSuccess ? 0 : 1;
}
// Builds a 4 x 4 identity matrix through GPUfunctioncall and prints it.
// The matrix is printed only when GPUfunctioncall reports success (0);
// its status is used as the process exit code.
int main()
{
    const int size = 4;
    float* dataOut = new float[size*size];

    const int status = GPUfunctioncall(dataOut, size);
    if (status == 0)
        print_matrix_host(dataOut, size, size);

    // Release the host buffer (previously leaked).
    delete[] dataOut;
    return status;
}
$ nvcc -arch=sm_20 -o t580 t580.cu
$ cuda-memcheck ./t580
========= CUDA-MEMCHECK
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1
========= ERROR SUMMARY: 0 errors
$
Note that it may be redundant to pass size twice to the kernel. For this particular example, we could have easily used the width parameter to do our kernel "thread check". But for educational purposes, I chose to call it out as a separate parameter, because in the general case you will often pass it as a separate parameter to other kernels that you write.
Finally, note that cudaThreadSynchronize() is deprecated and should be replaced with cudaDeviceSynchronize(). In this particular example, neither is actually necessary, as the next cudaMemcpy operation will force the same kind of synchronization, but you may want to keep it if you decide to add CUDA error checking to your code (recommended).

CUDA Compiling Error - Need to add "include" to the linker

I am trying to compile this code using the CUDA Compiler:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>
/* Abort with a diagnostic when a CUDA runtime call does not return cudaSuccess. */
#define CUDA_CALL(call)                                                    \
    do {                                                                   \
        cudaError_t cuda_status_ = (call);                                 \
        if (cuda_status_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(cuda_status_));           \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/* Abort with a diagnostic when a cuRAND call does not return CURAND_STATUS_SUCCESS. */
#define CURAND_CALL(call)                                                  \
    do {                                                                   \
        curandStatus_t curand_status_ = (call);                            \
        if (curand_status_ != CURAND_STATUS_SUCCESS) {                     \
            fprintf(stderr, "cuRAND error %d at %s:%d\n",                  \
                    (int)curand_status_, __FILE__, __LINE__);              \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Generates n pseudo-random 32-bit values on the device with the default
 * pseudo-random generator, copies them to the host and prints them on one
 * line.
 *
 * Build note: the cuRAND symbols live in a separate library, so the program
 * must be linked with -lcurand (e.g. `nvcc -o RNG7 RNG7.cu -lcurand`).
 */
int main(void)
{
    const size_t n = 100;
    size_t i;
    /* curandGenerate() fills an unsigned int buffer, so the host copy is
       unsigned as well and is printed with %u rather than %d. */
    unsigned int *hostData;
    unsigned int *devData;
    curandGenerator_t gen;

    hostData = (unsigned int *)calloc(n, sizeof(unsigned int));
    if (hostData == NULL) {
        fprintf(stderr, "Host allocation of %lu elements failed\n",
                (unsigned long)n);
        return EXIT_FAILURE;
    }

    CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
    CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 12345));
    CUDA_CALL(cudaMalloc((void **)&devData, n * sizeof(unsigned int)));
    CURAND_CALL(curandGenerate(gen, devData, n));
    /* Blocking copy: also waits for the generation above to finish. */
    CUDA_CALL(cudaMemcpy(hostData, devData, n * sizeof(unsigned int),
                         cudaMemcpyDeviceToHost));

    for (i = 0; i < n; i++)
    {
        printf("%u ", hostData[i]);
    }
    printf("\n");

    CURAND_CALL(curandDestroyGenerator(gen));
    CUDA_CALL(cudaFree(devData));
    free(hostData);
    return 0;
}
By using this command:
nvcc -o RNG7 RNG7.cu
This is the output I receive:
[root@client2 CUDA]$ nvcc -o RNG7 RNG7.cu
/tmp/tmpxft_00001ed1_00000000-13_RNG7.o: In function `main':
tmpxft_00001ed1_00000000-1_RNG7.cudafe1.cpp:(.text+0x6c): undefined reference to `curandCreateGenerator'
tmpxft_00001ed1_00000000-1_RNG7.cudafe1.cpp:(.text+0x7a): undefined reference to `curandSetPseudoRandomGeneratorSeed'
tmpxft_00001ed1_00000000-1_RNG7.cudafe1.cpp:(.text+0xa0): undefined reference to `curandGenerate'
tmpxft_00001ed1_00000000-1_RNG7.cudafe1.cpp:(.text+0x107): undefined reference to `curandDestroyGenerator'
collect2: ld returned 1 exit status
In another discussion they stated that this problem could be related to a linker problem or something, that I need to manually link the library in the compiler command to include the ones stated on my code.
I have no idea how to achieve this. Can someone please help me with it?
Thanks!
Use the following options.
nvcc -o RNG7 RNG7.cu -lcurand -Xlinker=-rpath,/usr/local/cuda/lib
It will work like a charm.