I am trying to compile this code using the CUDA Compiler:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>
int main(void)
{
size_t n = 100;
size_t i;
int *hostData;
unsigned int *devData;
hostData = (int *)calloc(n, sizeof(int));
curandGenerator_t gen;
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, 12345);
cudaMalloc((void **)&devData, n * sizeof(int));
curandGenerate(gen, devData, n);
cudaMemcpy(hostData, devData, n * sizeof(int), cudaMemcpyDeviceToHost);
for(i = 0; i < n; i++)
{
printf("%d ", hostData[i]);
}
printf("\n");
curandDestroyGenerator (gen);
cudaFree ( devData );
free ( hostData );
return 0;
}
By using this command:
nvcc -o RNG RNG7.cu
This is the output I receive:
[root#client2 CUDA]$ nvcc -o RNG7 RNG7.cu
/tmp/tmpxft_00001ed1_00000000-13_RNG7.o: In function `main':
tmpxft_00001ed1_00000000-1_RNG7.cudafe1.cpp:(.text+0x6c): undefined reference to `curandCreateGenerator'
tmpxft_00001ed1_00000000-1_RNG7.cudafe1.cpp:(.text+0x7a): undefined reference to `curandSetPseudoRandomGeneratorSeed'
tmpxft_00001ed1_00000000-1_RNG7.cudafe1.cpp:(.text+0xa0): undefined reference to `curandGenerate'
tmpxft_00001ed1_00000000-1_RNG7.cudafe1.cpp:(.text+0x107): undefined reference to `curandDestroyGenerator'
collect2: ld returned 1 exit status
In another discussion they stated that this problem could be related to a linker problem or something, that I need to manually link the library in the compiler command to include the ones stated on my code.
I have no idea to achieve this, can someone please help with this?
Thanks!
Use the following options.
nvcc -o RNG7 RNG7.cu -lcurand -Xlinker=-rpath,/usr/local/cuda/lib
it will work like charm.
Related
I need to make my kernel communicate with the host. I tried to use a global counter (better ways are well accepted), but the following code prints always 0. What am I doing wrong? (I tried both commented and uncommented ways).
#include <stdio.h>
#include <cuda_runtime.h>
//__device__ int count[1] = {0};
__device__ int count = 0;
__global__ void inc() {
//count[0]++;
atomicAdd(&count, 1);
}
int main(void) {
inc<<<1,10>>>();
cudaDeviceSynchronize();
//int *c;
int c;
cudaMemcpyFromSymbol(&c, count, sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\n", c);
return 0;
}
Anytime you are having trouble with a CUDA code, I strongly encourage you to use proper CUDA error checking and run your code with cuda-memcheck, before asking others for help. Even if you don't understand the error output, providing it in your question will be useful for those trying to help you.
If you had done so, you would have received a report that cudaMemcpyFromSymbol is throwing an invalid argument error.
If you study the documentation for that function call, you will see that the 4th parameter is not the direction parameter, but is the offset parameter. So providing cudaMemcpyDeviceToHost is incorrect for the offset parameter. Since cudaMemcpyFromSymbol is always a device->host transfer, providing the direction argument is redundant, and since it is provided a default, is unnecessary. Your code works correctly for me simply by eliminating that:
$ cat t1414.cu
#include <stdio.h>
#include <cuda_runtime.h>
//__device__ int count[1] = {0};
__device__ int count = 0;
__global__ void inc() {
//count[0]++;
atomicAdd(&count, 1);
}
int main(void) {
inc<<<1,10>>>();
cudaDeviceSynchronize();
//int *c;
int c;
cudaMemcpyFromSymbol(&c, count, sizeof(int));
printf("%d\n", c);
return 0;
}
$ nvcc -o t1414 t1414.cu
$ cuda-memcheck ./t1414
========= CUDA-MEMCHECK
10
========= ERROR SUMMARY: 0 errors
$
I'm working through the examples of the "CUDA by Example" book. The following code doesn't give me an answer and work as it should. Where's the mistake?
Will appreciate your help and answers.
I get an output,which reads
Calculation done on GPU yields the answer: &d
Press enter to stop
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
using namespace std;
__global__ void add_integers_cuda(int a, int b, int *c)
{
*c = a + b;
}
int main(void)
{
int c;
int *dev_ptr;
cudaMalloc((void **)&dev_ptr, sizeof(int)); //allocate sizeof(int) bytes of contiguous memory in the gpu device and return the address of first byte to dev_ptr.
// call the kernel
add_integers_cuda <<<1,1>>>(2,7,dev_ptr);
cudaMemcpy(&c, dev_ptr, sizeof(int), cudaMemcpyDeviceToHost);
printf("Calculation done on GPU yields the answer: &d\n",c );
cudaFree(dev_ptr);
printf("Press enter to stop.");
cin.ignore(255, '\n');
return 0;
}
"
&d is not a correct printf formatting character here:
printf("Calculation done on GPU yields the answer: &d\n",c );
You won't get the output you are expecting.
You should use %d instead:
printf("Calculation done on GPU yields the answer: %d\n",c );
This particular issue has nothing to do with CUDA of course.
You may also want to run CUDA codes with cuda-memcheck and/or use proper CUDA error checking if you are just learning and having trouble. Neither of those would have pointed out the above error, however.
I am new to cuda. I wrote a kernel to create an identity matrix(GPUsetIdentity) of dimension sizeXsize. Further inside a function GPUfunctioncall, I called my kernel. The identity matrix should be stored in dDataInv. But when I copy it back to dataOut sizexsize , all the values are zero. I know, I am doing something very stupid somewhere, but couldnt get it, I am new to cuda, if anyone can point my mistake. Thanks.
#include <stdio.h>
#include <malloc.h>
#include <memory.h>
#include <math.h>
#include <stdlib.h>
#include <iostream>
#include <stdlib.h>
#include <string>
#include <fstream>
#include <iterator>
#include <sstream>
#include <vector>
#include <cstring>
#include <cstdlib>
#include <ctime>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "cuda.h"
#define BLOCKSIZE 16
using namespace std;
__global__ void GPUsetIdentity (float* matrix, int width)
{
int tx = threadIdx.x;
int bx = blockIdx.x;
int offset = bx * BLOCKSIZE + tx;
matrix[offset + width * offset] = 1;
}
void print_matrix_host(float* A , int nr_rows_A, int nr_cols_A) {
for(int i = 0; i < nr_rows_A; ++i){
for(int j = 0; j < nr_cols_A; ++j){
std::cout << A[i * nr_rows_A + j ] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
int GPUfunctioncall (float* hDataOut, int size){
float *dDataInv;
cudaMalloc ((void **) &dDataInv, size);
cudaMemset ((void *) dDataInv, 0, size);
dim3 idyThreads (BLOCKSIZE);
dim3 idyBlocks (size / BLOCKSIZE);
GPUsetIdentity <<< idyBlocks, idyThreads >>> (dDataInv, size);
cudaThreadSynchronize ();
cudaMemcpy ((void *) hDataOut, (void *) dDataInv, size, cudaMemcpyDeviceToHost);
cudaFree (dDataInv);
return 0;
}
int main()
{
int size = 4;
float* dataOut;
dataOut = new float[size*size];
GPUfunctioncall(dataOut, size);
print_matrix_host(dataOut, size, size);
}
Any time you are having trouble with a CUDA code, it's good practice to use proper cuda error checking. You can also run your code with cuda-memcheck to get a quick read on whether there are any errors.
Using either of these methods, you would have discovered an "invalid configuration error" on your kernel launch. This usually means that the parameters in the <<< >>> syntax are incorrect. When you run into this type of error, simply printing out those values may indicate the problem.
In your case, this line of code:
dim3 idyBlocks (size / BLOCKSIZE);
results in a value of 0 for idyBlocks when size is 4 and BLOCKSIZE is 16. So you are requesting a kernel launch of 0 blocks which is illegal. Therefore your kernel is not running and your results are not what you expect.
There are a variety of ways to solve this, many of them involving detecting this condition and adding an "extra block" when size is not evenly divisible by BLOCKSIZE. Using this approach, we may be launching "extra threads", so we must include a "thread check" in the kernel to prevent those extra threads from doing anything (such as accessing arrays out of bounds). For this, we often need to know the intended size in the kernel, and we can pass this value as an extra kernel parameter.
You've also made some errors in your handling of device variables. The following code:
dataOut = new float[size*size];
allocates enough space for a square matrix of dimension size. But the following code:
cudaMalloc ((void **) &dDataInv, size);
only allocates enough space for size bytes. You want size*size*sizeof(float) instead of size here, and likewise you want it in the following cudaMemset and cudaMemcpy operations. cudaMalloc, cudaMemset and cudaMemcpy require a size parameter in bytes, just like malloc, memset, and memcpy. This error is found in your usage of cudaMemset and cudaMemcpy as well.
The following code has those modifications, and seems to work correctly for me:
$ cat t580.cu
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#define BLOCKSIZE 16
using namespace std;
__global__ void GPUsetIdentity (float* matrix, int width, int size)
{
int tx = threadIdx.x;
int bx = blockIdx.x;
int offset = bx * BLOCKSIZE + tx;
if (tx < size)
matrix[offset + width * offset] = 1;
}
void print_matrix_host(float* A , int nr_rows_A, int nr_cols_A) {
for(int i = 0; i < nr_rows_A; ++i){
for(int j = 0; j < nr_cols_A; ++j){
std::cout << A[i * nr_rows_A + j ] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
int GPUfunctioncall (float* hDataOut, int size){
float *dDataInv;
cudaMalloc ((void **) &dDataInv, size*size*sizeof(float));
cudaMemset ((void *) dDataInv, 0, size*size*sizeof(float));
dim3 idyThreads (BLOCKSIZE);
int num_blocks = size/BLOCKSIZE + (size%BLOCKSIZE)?1:0;
dim3 idyBlocks (num_blocks);
GPUsetIdentity <<< idyBlocks, idyThreads >>> (dDataInv, size, size);
cudaThreadSynchronize ();
cudaMemcpy ((void *) hDataOut, (void *) dDataInv, size*size*sizeof(float), cudaMemcpyDeviceToHost);
cudaFree (dDataInv);
return 0;
}
int main()
{
int size = 4;
float* dataOut;
dataOut = new float[size*size];
GPUfunctioncall(dataOut, size);
print_matrix_host(dataOut, size, size);
}
$ nvcc -arch=sm_20 -o t580 t580.cu
$ cuda-memcheck ./t580
========= CUDA-MEMCHECK
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1
========= ERROR SUMMARY: 0 errors
$
Note that it may be redundant to pass size twice to the kernel. For this particular example, we could have easily used the width parameter to do our kernel "thread check". But for educational purposes, I chose to call it out as a separate parameter, because in the general case you will often pass it as a separate parameter to other kernels that you write.
Finally, note that cudaThreadSynchronize() is deprecated and should be replaced with cudaDeviceSynchronize() instead. In this particular example, niether are actually necessary, as the next cudaMemcpy operation will force the same kind of synchronization, but you may use it if you decide to add cuda error checking to your code (recommended).
I'm attempting to write a basic matrix multiplication program using CUDA and C. The code itself doesn't really do anything right now, but should at least compile. After some research on the issue, I've determined that the issue is failure to include CUDA header files, indicating an issue with my Makefile. I'm extremely inexperienced with CUDA (and C for that matter), so any help would be greatly appreciated.
Output on command: make matrixMult1
c99 -I. -I/usr/local/cuda/include -c matrixMult1.c -o matrixMult1.o
matrixMult1.c: In function 'main':
matrixMult1.c:77: warning: implicit declaration of function 'cudaMalloc'
matrixMult1.c:82: warning: implicit declaration of function 'cudaMemcpy'
matrixMult1.c:83: error: 'cudaMemcpyHostToDevice' undeclared (first use in this
function)
matrixMult1.c:83: error: (Each undeclared identifier is reported only once
matrixMult1.c:83: error: for each function it appears in.)
matrixMult1.c:106: warning: implicit declaration of function 'cudaFree'
make: *** [matrixMult1.o] Error 1
Makefile:
GCC = c99
CUDA_INSTALL_PATH := /usr/local/cuda
INCLUDES := -I. -I$(CUDA_INSTALL_PATH)/include
CUDA_LIBS := -L$(CUDA_INSTALL_PATH)/lib -lcudart
matrixMult1.o: matrixMult1.c
$(GCC) $(INCLUDES) -c matrixMult1.c -o $#
matrixMult1: matrixMult1.o
$(GCC) -o $# matrixMult1.o $(CUDA_LIBS)
C Program:
//********************************************************************
// matrixMult1.c
//
// A basic matrix multiplication program.
//********************************************************************
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include "cuda.h"
#define WA 3
#define HA 3
#define WB 3
#define HB WA
#define WC WB
#define HC HA
void initMatrix(float * matrix, int numIndices);
//*************
// Main Program
//*************
int main(int argc, char** argv) {
/* Set random seed */
srand(2013);
/* Compute memory sizes for matrices A, B, and C */
unsigned int sizeA = WA * HA;
unsigned int sizeB = WB * HB;
unsigned int sizeC = WC * HC;
unsigned int memoryA = sizeof(float) * sizeA;
unsigned int memoryB = sizeof(float) * sizeB;
unsigned int memoryC = sizeof(float) * sizeC;
/* Allocate memory for matrices A, B, and C */
float * matrixA = (float *) malloc(memoryA);
float * matrixB = (float *) malloc(memoryB);
float * matrixC = (float *) malloc(memoryC);
/* Initialize matrices A and B */
initMatrix(matrixA, sizeA);
initMatrix(matrixB, sizeB);
/* Print matrix A */
printf("\nMatrix A:\n");
for (int i = 0; i < sizeA; i++) {
printf("%f ", matrixA[i]);
if (((i + 1) % WA) == 0) {
printf("\n");
} else {
printf(" | ");
}
}
/* Print matrix B */
printf("\nMatrix B:\n");
for (int i = 0; i < sizeB; i++) {
printf("%f ", matrixB[i]);
if (((i + 1) % WA) == 0) {
printf("\n");
} else {
printf(" | ");
}
}
/* Allocate device memory */
float* deviceMemA;
float* deviceMemB;
float* deviceMemC;
cudaMalloc((void**) &deviceMemA, memoryA);
cudaMalloc((void**) &deviceMemB, memoryB);
cudaMalloc((void**) &deviceMemC, memoryC);
/* Copy host memory to device */
cudaMemcpy(deviceMemA, matrixA, memoryA,
cudaMemcpyHostToDevice);
cudaMemcpy(deviceMemB, matrixB, memoryB,
cudaMemcpyHostToDevice);
cudaMemcpy(deviceMemC, matrixC, memoryC,
cudaMemcpyHostToDevice);
/* Print matrix C */
printf("\nMatrix C:\n");
for (int i = 0; i < sizeC; i++) {
printf("%f ", matrixC[i]);
if (((i + 1) % WC) == 0) {
printf("\n");
} else {
printf(" | ");
}
}
printf("\n");
/* Free up memory */
free(matrixA);
free(matrixB);
free(matrixC);
cudaFree(deviceMemA);
cudaFree(deviceMemB);
cudaFree(deviceMemC);
}
//--------------------------------------------------------------------
// initMatrix - Assigns a random float value to each indice of the
// matrix.
//
// PRE: matrix is a pointer to a block of bytes in memory; numIndices
// is the number of indicies in the matrix being instantiated.
// POST: Each index of the matrix has been instantiated with a random
// float value.
//--------------------------------------------------------------------
void initMatrix(float * matrix, int numIndices) {
/*
Loop through the block of bytes, assigning a random float
for each index of the matrix
*/
for (int i = 0; i < numIndices; ++i) {
/* Assign a random float between 0 and 1 at this byte */
matrix[i] = rand() / (float)RAND_MAX;
}
}
CUDA programs need to be compiled by nvcc. While your program does not yet contain any CUDA kernel yet, I believe that is what you want to achieve.
Rename your file from matrixMult1.c to matrixMult1.cu, remove the #include "cuda.h" line (programs compiled with nvcc don't need any CUDA-specific includes) and compile with nvcc instead of gcc (e.g. by setting GCC = nvcc at the beginning of the Makefile).
Two problems here:
You were not including the appropriate header into your code (which you fixed)
Your Makefile is, in fact, broken. It should look something like:
GCC = c99
CUDA_INSTALL_PATH := /usr/local/cuda
INCLUDES := -I. -I$(CUDA_INSTALL_PATH)/include
CUDA_LIBS := -L$(CUDA_INSTALL_PATH)/lib -lcudart
matrixMult1.o: matrixMult1.c
$(GCC) $(INCLUDES) -c matrixMult1.c -o $#
matrixMult1: matrixMult1.o
$(GCC) -o $# matrixMult1.o $(CUDA_LIBS)
[Disclaimer: not tested, use at own risk]
The current problem is that the include path was only specified at the linkage phase of the build.
Note that these changes also preempt the missing symbols error you will get during linkage from not linking with the CUDA runtime library. Note that depending on whether you are using a 32 or 64 bit host OS, you may need to change the library path to $(CUDA_INSTALL_PATH)/lib64 for the linkage to work correctly.
I have the following code which I am trying to compile using nvcc.
Code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>
int main(void)
{
size_t n = 100;
size_t i;
int *hostData;
unsigned int *devData;
hostData = (int *)calloc(n, sizeof(int));
curandGenerator_t gen;
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MRG32K3A);
curandSetPseudoRandomGeneratorSeed(gen, 12345);
cudaMalloc((void **)&devData, n * sizeof(int));
curandGenerate(gen, devData, n);
cudaMemcpy(hostData, devData, n * sizeof(int), cudaMemcpyDeviceToHost);
for(i = 0; i < n; i++)
{
printf("%d ", hostData[i]);
}
printf("\n");
curandDestroyGenerator (gen);
cudaFree ( devData );
free ( hostData );
return 0;
}
This is the output I receive:
$ nvcc -o RNG7 RNG7.cu
/tmp/tmpxft_00005da4_00000000-13_RNG7.o: In function `main':
tmpxft_00005da4_00000000-1_RNG7.cudafe1.cpp:(.text+0x6c): undefined reference to `curandCreateGenerator'
tmpxft_00005da4_00000000-1_RNG7.cudafe1.cpp:(.text+0x7a): undefined reference to `curandSetPseudoRandomGeneratorSeed'
tmpxft_00005da4_00000000-1_RNG7.cudafe1.cpp:(.text+0xa0): undefined reference to `curandGenerate'
tmpxft_00005da4_00000000-1_RNG7.cudafe1.cpp:(.text+0x107): undefined reference to `curandDestroyGenerator'
collect2: ld returned 1 exit status
My initial guess is that for some reason the CURAND Library is not properly installed or that it cannot find the curand.h header file.
Please let me know what I should look for or how to solve my problem.
Thanks!
#Wilo Maldonado: just use a linker flag -lcurand and
additionally -L/path/to/cuda/libs if you do not have it already
The problem is not the header file, otherwise you would have got a compile error. You have a linker error. You will need to tell your linker where to find the object or library file that contains those functions.