Update project from older CUDA version - cuda

In my older CUDA project I had the globals:
__device__ uint8_t dev_intersect
__constant__ uint8_t dev_flags
... and used them this way:
cudaGetSymbolAddress((void**)&ptr_dev_intersect,"dev_intersect")
cudaMemcpyToSymbol("dev_flags",&flags,sizeof(flags))
Now, since CUDA 5.0 (and newer) the symbols must be passed directly (without string), so I define the globals this way:
__device__ uint8_t *dev_intersect
__constant__ uint8_t *dev_flags
...and call the functions this way:
cudaGetSymbolAddress((void**)&ptr_dev_intersect,dev_intersect)
cudaMemcpyToSymbol(dev_flags,&flags,sizeof(flags))
Am I doing it right so far? I'm asking you, because when I update the code, I start getting other errors, which makes me kinda suspicious. Thanks for any help.

Switching from a POD variable to a pointer is probably not what you want.
If you didn't make changes elsewhere in your code to account for that difference, I would expect things to break.
To update your cuda function calls, leave your variables as-is:
__device__ uint8_t dev_intersect;
__constant__ uint8_t dev_flags;
And just drop the quotes from your cuda API functions that use those variables:
cudaGetSymbolAddress((void**)&ptr_dev_intersect,dev_intersect);
cudaMemcpyToSymbol(dev_flags,&flags,sizeof(flags));
Here is a complete worked example:
$ cat t524.cu
#include <stdio.h>
typedef unsigned char uint8_t;
__device__ uint8_t dev_intersect;
__constant__ uint8_t dev_flags;
__global__ void mykernel(uint8_t *d1_ptr){
printf("data 1 = %c\n", *d1_ptr);
printf("dev_flags = %c\n", dev_flags);
}
int main(){
uint8_t *ptr_dev_intersect;
uint8_t flags = 'X';
uint8_t dev_intersect_data = 'Y';
cudaGetSymbolAddress((void**)&ptr_dev_intersect,dev_intersect);
cudaMemcpyToSymbol(dev_flags,&flags,sizeof(flags));
cudaMemcpyToSymbol(dev_intersect,&dev_intersect_data,sizeof(dev_intersect_data));
mykernel<<<1,1>>>(ptr_dev_intersect);
cudaDeviceSynchronize();
return 0;
}
$ nvcc -arch=sm_20 -o t524 t524.cu
$ cuda-memcheck ./t524
========= CUDA-MEMCHECK
data 1 = Y
dev_flags = X
========= ERROR SUMMARY: 0 errors
$

Related

Writing on a register in parallel in CUDA C++

Given a 64-bit variable in a register in a kernel, let's say std::uint64_t var, I want to do some calculations and set each bit of this variable using 64 different threads separately but in parallel. Is it possible to write on a shared variable in parallel?
__shared__ std::uint64_t var = 0
in each thread (tid = 0 to 63):
do some calculations
if we should set the bit with index = tid then:
var |= ((std::uint64_t) 1 << tid)
I also realized that using atomicOr does not benefit us as it only works on integers.
You can do this using a 64-bit shared atomicOr():
$ cat t2082.cu
#include <cstdio>
#include <cstdint>
__global__ void k(){
__shared__ unsigned long long var;
if (!threadIdx.x) var = 0;
__syncthreads();
atomicOr(&var, 1<<threadIdx.x);
__syncthreads();
if (!threadIdx.x) printf("0x%lx\n", var);
}
int main(){
k<<<1,64>>>();
cudaDeviceSynchronize();
}
$ nvcc -o t2082 t2082.cu
$ compute-sanitizer ./t2082
========= COMPUTE-SANITIZER
0xffffffffffffffff
========= ERROR SUMMARY: 0 errors
$
The 64-bit shared atomicOr is supported on devices of cc3.5 or greater, which is the same device footprint supported by CUDA 11.
If you were, for example, on CUDA 10.x and using a cc3.0 device, you could do this with two 32-bit variables:
$ cat t2082.cu
#include <cstdio>
#include <cstdint>
__global__ void k(){
__shared__ unsigned var[2];
if (!threadIdx.x) {var[0] = 0; var[1] = 0;}
__syncthreads();
if (threadIdx.x < 32)
atomicOr(&(var[0]), 1<<threadIdx.x);
else
atomicOr(&(var[1]), 1<<(threadIdx.x-32));
__syncthreads();
unsigned long long var64 = (((unsigned long long)var[1])<<32) + var[0];
if (!threadIdx.x) printf("0x%lx\n", var64);
}
int main(){
k<<<1,64>>>();
cudaDeviceSynchronize();
}
$ nvcc -o t2082 t2082.cu
$ compute-sanitizer ./t2082
========= COMPUTE-SANITIZER
0xffffffffffffffff
========= ERROR SUMMARY: 0 errors
$

Static __device__ variable and kernels in separate file

I want statically declare a global variable with __device__ qualifier. In the same time I want to store functions intended to GPU in a separate file.
However, if I do so, the variable value is not transferred to GPU -- there are no errors in compilation or execution time, but memcpy functions do nothing.
When I move kernel function into the file with the host code, everything works.
I am sure, that it should be possible to split host and device functions into separate files in this case, but how to do this? I have seen just examples, when kernels and host code are in the same file.
I would be also very thankful, if somebody explained, why does it behaves so.
A sample code is listed below.
Thank you in advance.
Working directory:
$ ls
functionsGPU.cu functionsGPU.cuh staticGlobalMemory.cu
staticGlobalMemory.cu:
#include "functionsGPU.cuh"
#if VARIANT == 2
__global__ void checkGlobalVariable(){
printf("Old value (dev): %f\n", devData);
devData += 2.0f;
printf("New value (dev): %f\n", devData);
}
#endif
int main(int argc, char **argv){
int dev = 0;
float val = 3.2;
cudaSetDevice(dev);
printf("---------\nVARIANT %i\n---------\n", VARIANT);
printf("Old value (host): %f\n", val);
cudaMemcpyToSymbol(devData, &val, sizeof(float));
checkGlobalVariable <<<1, 1>>> ();
cudaMemcpyFromSymbol(&val, devData, sizeof(float));
printf("New value (host): %f\n", val);
cudaDeviceReset();
return 0;
}
functionsGPU.cuh:
#ifndef FUNCTIONSGPU_CUH
#define FUNCTIONSGPU_CUH
#include <cuda_runtime.h>
#include <stdio.h>
#define VARIANT 1
__device__ float devData;
#if VARIANT == 1
__global__ void checkGlobalVariable();
#endif
#endif
functionsGPU.cu:
#include "functionsGPU.cuh"
#if VARIANT == 1
__global__ void checkGlobalVariable(){
printf("Old value (dev): %f\n", devData);
devData += 2.0f;
printf("New value (dev): %f\n", devData);
}
#endif
This is compiled as
$ nvcc -arch=sm_61 staticGlobalMemory.cu functionsGPU.cu -o staticGlobalMemory
Output if the kernel and host code are in separate files (incorrect):
---------
VARIANT 1
---------
Old value (host): 3.200000
Old value (dev): 0.000000
New value (dev): 2.000000
New value (host): 3.200000
Output if the kernel and host code are in the same file (correct):
---------
VARIANT 2
---------
Old value (host): 3.200000
Old value (dev): 3.200000
New value (dev): 5.200000
New value (host): 5.200000
Your code structure, where device code in one compilation unit references device code or device entities in another compilation unit, will require CUDA relocatable device code compilation and linking.
In the case of __device__ variables such as what you have here:
Add -rdc=true to enable this, to your nvcc compilation command line
Add extern in front of the definition of devData, in functionsGPU.cuh
Add __device__ float devData; to staticGlobalMemory.cu
In the case of linking to a __device__ function in a separate file, along with providing the prototype typically via a header file like you would with any function in C++, you also need to add -rdc=true to your nvcc compilation command line, to enable device code linking. Steps 2 and 3 above are not needed.
That should fix the issue. Step 1 provides the necessary cross-module linkage, and steps 2 and 3 will fix the duplicate definition problem you would have, since you are including the same variable via a header file in separate compilation units.
For a reference of how to do the device code compilation setting in windows visual studio, see here.

Increment and access global counter CUDA

I need to make my kernel communicate with the host. I tried to use a global counter (better ways are well accepted), but the following code prints always 0. What am I doing wrong? (I tried both commented and uncommented ways).
#include <stdio.h>
#include <cuda_runtime.h>
//__device__ int count[1] = {0};
__device__ int count = 0;
__global__ void inc() {
//count[0]++;
atomicAdd(&count, 1);
}
int main(void) {
inc<<<1,10>>>();
cudaDeviceSynchronize();
//int *c;
int c;
cudaMemcpyFromSymbol(&c, count, sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\n", c);
return 0;
}
Anytime you are having trouble with a CUDA code, I strongly encourage you to use proper CUDA error checking and run your code with cuda-memcheck, before asking others for help. Even if you don't understand the error output, providing it in your question will be useful for those trying to help you.
If you had done so, you would have received a report that cudaMemcpyFromSymbol is throwing an invalid argument error.
If you study the documentation for that function call, you will see that the 4th parameter is not the direction parameter, but is the offset parameter. So providing cudaMemcpyDeviceToHost is incorrect for the offset parameter. Since cudaMemcpyFromSymbol is always a device->host transfer, providing the direction argument is redundant, and since it is provided a default, is unnecessary. Your code works correctly for me simply by eliminating that:
$ cat t1414.cu
#include <stdio.h>
#include <cuda_runtime.h>
//__device__ int count[1] = {0};
__device__ int count = 0;
__global__ void inc() {
//count[0]++;
atomicAdd(&count, 1);
}
int main(void) {
inc<<<1,10>>>();
cudaDeviceSynchronize();
//int *c;
int c;
cudaMemcpyFromSymbol(&c, count, sizeof(int));
printf("%d\n", c);
return 0;
}
$ nvcc -o t1414 t1414.cu
$ cuda-memcheck ./t1414
========= CUDA-MEMCHECK
10
========= ERROR SUMMARY: 0 errors
$

Loading multiple modules in JCuda is not working

In jCuda one can load cuda files as PTX or CUBIN format and call(launch) __global__ functions (kernels) from Java.
With keeping that in mind, I want to develop a framework with JCuda that gets user's __device__ function in a .cu file at run-time, loads and runs it.
And I have already implemented a __global__ function, in which each thread finds out the start point of its related data, perform some computation, initialization and then call user's __device__ function.
Here is my kernel pseudo code:
extern "C" __device__ void userFunc(args);
extern "C" __global__ void kernel(){
// initialize
userFunc(args);
// rest of the kernel
}
And user's __device__ function:
extern "C" __device__ void userFunc(args){
// do something
}
And in Java side, here is the part that I load the modules(modules are made from ptx files which are successfully created from cuda files with this command: nvcc -m64 -ptx path/to/cudaFile -o cudaFile.ptx)
CUmodule kernelModule = new CUmodule(); // 1
CUmodule userFuncModule = new CUmodule(); // 2
cuModuleLoad(kernelModule, ptxKernelFileName); // 3
cuModuleLoad(userFuncModule, ptxUserFuncFileName); // 4
When I try to run it I got error at line 3 : CUDA_ERROR_NO_BINARY_FOR_GPU. After some searching I get that my ptx file has some syntax error. After running this suggested command:
ptxas -arch=sm_30 kernel.ptx
I got:
ptxas fatal : Unresolved extern function 'userFunc'
Even when I replace line 3 with 4 to load userFunc before kernel I get this error. I got stuck at this phase. Is this the correct way to load multiple modules that need to be linked together in JCuda? Or is it even possible?
Edit:
Second part of the question is here
The really short answer is: No, you can't load multiple modules into a context in the runtime API.
You can do what you want, but it requires explicit setup and execution of a JIT linking call. I have no idea how (or even whether) that has been implemented in JCUDA, but I can show you how to do it with the standard driver API. Hold on...
If you have a device function in one file, and a kernel in another, for example:
// test_function.cu
#include <math.h>
__device__ float mathop(float &x, float &y, float &z)
{
float res = sin(x) + cos(y) + sqrt(z);
return res;
}
and
// test_kernel.cu
extern __device__ float mathop(float & x, float & y, float & z);
__global__ void kernel(float *xvals, float * yvals, float * zvals, float *res)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
res[tid] = mathop(xvals[tid], yvals[tid], zvals[tid]);
}
You can compile them to PTX as usual:
$ nvcc -arch=sm_30 -ptx test_function.cu
$ nvcc -arch=sm_30 -ptx test_kernel.cu
$ head -14 test_kernel.ptx
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19324607
// Cuda compilation tools, release 7.0, V7.0.27
// Based on LLVM 3.4svn
//
.version 4.2
.target sm_30
.address_size 64
// .globl _Z6kernelPfS_S_S_
.extern .func (.param .b32 func_retval0) _Z6mathopRfS_S_
At runtime, your code must create a JIT link session, add each PTX to the linker session, then finalise the linker session. This will give you a handle to a compiled cubin image which can be loaded as a module as usual. The simplest possible driver API code to put this together looks like this:
#include <cstdio>
#include <cuda.h>
#define drvErrChk(ans) { drvAssert(ans, __FILE__, __LINE__); }
inline void drvAssert(CUresult code, const char *file, int line, bool abort=true)
{
if (code != CUDA_SUCCESS) {
fprintf(stderr, "Driver API Error %04d at %s %d\n", int(code), file, line);
exit(-1);
}
}
int main()
{
cuInit(0);
CUdevice device;
drvErrChk( cuDeviceGet(&device, 0) );
CUcontext context;
drvErrChk( cuCtxCreate(&context, 0, device) );
CUlinkState state;
drvErrChk( cuLinkCreate(0, 0, 0, &state) );
drvErrChk( cuLinkAddFile(state, CU_JIT_INPUT_PTX, "test_function.ptx", 0, 0, 0) );
drvErrChk( cuLinkAddFile(state, CU_JIT_INPUT_PTX, "test_kernel.ptx" , 0, 0, 0) );
size_t sz;
char * image;
drvErrChk( cuLinkComplete(state, (void **)&image, &sz) );
CUmodule module;
drvErrChk( cuModuleLoadData(&module, image) );
drvErrChk( cuLinkDestroy(state) );
CUfunction function;
drvErrChk( cuModuleGetFunction(&function, module, "_Z6kernelPfS_S_S_") );
return 0;
}
You should be able to compile and run this as posted and verify it works OK. It should serve as a template for a JCUDA implementation, if they have JIT linking support implemented.

cudaMemcpyFromSymbol on a __device__ variable

I am trying to apply a kernel function on a __device__ variable, which, according to the specs, resides "in global memory"
#include <stdio.h>
#include "sys_data.h"
#include "my_helper.cuh"
#include "helper_cuda.h"
#include <cuda_runtime.h>
double X[10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y[10] = {0};
__device__ double DEV_X[10];
int main(void) {
checkCudaErrors(cudaMemcpyToSymbol(DEV_X, X,10*sizeof(double)));
vector_projection<double><<<1,10>>>(DEV_X, 10);
getLastCudaError("oops");
checkCudaErrors(cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double)));
return 0;
}
The kernel function vector_projection is defined in my_helper.cuh as follows:
template<typename T> __global__ void vector_projection(T *dx, int n) {
int tid;
tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < n) {
if (dx[tid] < 0)
dx[tid] = (T) 0;
}
}
As you can see, I use cudaMemcpyToSymbol and cudaMemcpyFromSymbol to transfer data to and from the device. However, I'm getting the following error:
CUDA error at ../src/vectorAdd.cu:19 code=4(cudaErrorLaunchFailure)
"cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double))"
Footnote: I can of course avoid to use __device__ variables and go for something like this which works fine; I just want to see how to do the same thing (if possible) with __device__ variables.
Update: The output of cuda-memcheck can be found at http://pastebin.com/AW9vmjFs. The error messages I get are as follows:
========= Invalid __global__ read of size 8
========= at 0x000000c8 in /home/ubuntu/Test0001/Debug/../src/my_helper.cuh:75:void vector_projection<double>(double*, int)
========= by thread (9,0,0) in block (0,0,0)
========= Address 0x000370e8 is out of bounds
The root of the problem is that you are not allowed to take the address of a device variable in ordinary host code:
vector_projection<double><<<1,10>>>(DEV_X, 10);
^
Although this seems to compile correctly, the actual address passed is garbage.
To take the address of a device variable in host code, we can use cudaGetSymbolAddress
Here is a worked example that compiles and runs correctly for me:
$ cat t577.cu
#include <stdio.h>
double X[10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y[10] = {0};
__device__ double DEV_X[10];
template<typename T> __global__ void vector_projection(T *dx, int n) {
int tid;
tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < n) {
if (dx[tid] < 0)
dx[tid] = (T) 0;
}
}
int main(void) {
cudaMemcpyToSymbol(DEV_X, X,10*sizeof(double));
double *my_dx;
cudaGetSymbolAddress((void **)&my_dx, DEV_X);
vector_projection<double><<<1,10>>>(my_dx, 10);
cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double));
for (int i = 0; i < 10; i++)
printf("%d: %f\n", i, Y[i]);
return 0;
}
$ nvcc -arch=sm_35 -o t577 t577.cu
$ cuda-memcheck ./t577
========= CUDA-MEMCHECK
0: 1.000000
1: 0.000000
2: 3.000000
3: 0.000000
4: 5.000000
5: 0.000000
6: 7.000000
7: 0.000000
8: 9.000000
9: 0.000000
========= ERROR SUMMARY: 0 errors
$
This is not the only way to address this. It is legal to take the address of a device variable in device code, so you could modify your kernel with a line something like this:
T *dx = DEV_X;
and forgo passing of the device variable as a kernel parameter. As suggested in the comments, you could also modify your code to use Unified Memory.
Regarding error checking, if you deviate from proper cuda error checking and are not careful in your deviations, the results may be confusing. Most cuda API calls can, in addition to errors arising from their own behavior, return an error that resulted from some previous CUDA asynchronous activity (usually kernel calls).