I'm transporting data to a specific CUDA symbol. My CUDA version is 10.1, the GPU is a Tesla K80. I compiled the code in VS2017, with code generated for compute_35 & sm_35. When I wrote my code like this,
<.h>
#include <cuda_runtime.h>
__device__ __constant__ float scoreRatio;
<.cpp>
const float ScoreRatio;
cudaErr=cudaMemcpyToSymbol(&scoreRatio,&ScoreRatio,sizeof(ScoreRatio));
printf("%d: %s.\n",cudaErr,cudaGetErorString(cudaErr));
it compiled well but got cudaErrorInvalidSymbol when I ran the program,
13: Invalid device symbol
If I modified my code like this,
<.h>
#include <cuda_runtime.h>
__device__ __constant__ float scoreRatio;
<.cpp>
const float ScoreRatio;
cudaErr=cudaMemcpyToSymbol(scoreRatio,&ScoreRatio,sizeof(ScoreRatio));
then the compile would fail due to incompatible parameter type as the first parameter is FLOAT while function asks VOID*, here I found the function definition in cuda_runtime_api.h,
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice));
Could anyone please give some advice, much appreciated.
This:
<.h>
#include <cuda_runtime.h>
__device__ __constant__ float scoreRatio;
<.cpp>
const float ScoreRatio;
cudaErr=cudaMemcpyToSymbol(&scoreRatio,&ScoreRatio,sizeof(ScoreRatio));
printf("%d: %s.\n",cudaErr,cudaGetErorString(cudaErr));
is illegal/wrong in two ways. You must use nvcc to compile the code using a device code aware trajectory, and the first argument of the cudaMemcpyToSymbol call is incorrect. If you simply rename your .cpp source file to have a .cu file extension and change the contents so that it looks like this:
<.cu>
#include <.h>
....
const float ScoreRatio;
cudaErr=cudaMemcpyToSymbol(scoreRatio, &ScoreRatio, sizeof(ScoreRatio));
printf("%d: %s.\n", cudaErr, cudaGetErrorString(cudaErr));
it will both compile and run correctly. See here for an explanation of why it is necessary to change the first argument of the cudaMemcpyToSymbol call.
Related
From the documentation:
I.4.20.4. Constexpr functions and function templates
By default, a
constexpr function cannot be called from a function with incompatible
execution space. The experimental nvcc flag --expt-relaxed-constexpr
removes this restriction. When this flag is specified, host code can
invoke a __device__ constexpr function and device code can invoke a
__host__ constexpr function.
I read it, but I don't understand what it means - device code can invoke a host constexpr function? Here is my test:
constexpr int bar(int i)
{
#ifdef __CUDA_ARCH__
return i;
#else
return 555;
#endif
}
__global__ void kernel()
{
int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
printf("%i\n", bar(tid));
}
int main(int argc, char *[])
{
static_assert(bar(5) > 0);
// static_assert(bar(argc) > 0); // compile error
cout << bar(argc) << endl;
kernel<<<2, 2>>>();
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
return 0;
}
It prints:
555
0
1
2
3
According to my understanding, the host invokes the host function, while the device invokes the device function. I.e. it behaves the same as if I declare bar with both __host__ and __device__ attributes. Adding a single attribute (__host__ or __device__) doesn't make any difference.
As a comparison, the documentation for std::initializer_list is much clearer:
I.4.20.2. std::initializer_list
By default, the CUDA compiler will
implicitly consider the member functions of std::initializer_list to
have __host__ __device__ execution space specifiers, and therefore
they can be invoked directly from device code.
Here I don't have any questions.
What does the documentation mean exactly?
Consider the following code.
#include <algorithm> //std::max
__global__ void kernel(int *array, int n) {
array[0] = std::max(array[1], array[2]);
}
This code will not compile by default.
error: calling a constexpr __host__ function("max") from a __global__ function("kernel") is not allowed. The experimental flag '--expt-relaxed-constexpr' can be used to allow this.
std::max is a standard host function without __device__ execution space specifiers and thus cannot be called from device code.
However, when the compiler flag --expt-relaxed-constexpr is specified, the code compiles nonetheless. I cannot give you any details about how this is achieved internally
I'm relatively new to cuda programming and can't find a solution to my problem.
I'm trying to have a shared library, lets call it func.so, that defines a device function
__device__ void hello(){ printf("hello"); }
I then want to be able to access that library via dlopen, and use that function in my programm. I tried something along the following lines:
func.cu
#include <stdio.h>
typedef void(*pFCN)();
__device__ void dhello(){
printf("hello\n");
}
__device__ pFCN ptest = dhello;
pFCN h_pFCN;
extern "C" pFCN getpointer(){
cudaMemcpyFromSymbol(&h_pFCN, ptest, sizeof(pFCN));
return h_pFCN;
}
main.cu
#include <dlfcn.h>
#include <stdio.h>
typedef void (*fcn)();
typedef fcn (*retpt)();
retpt hfcnpt;
fcn hfcn;
__device__ fcn dfcn;
__global__ void foo(){
(*dfcn)();
}
int main() {
void * m_handle = dlopen("gputest.so", RTLD_NOW);
hfcnpt = (retpt) dlsym( m_handle, "getpointer");
hfcn = (*hfcnpt)();
cudaMemcpyToSymbol(dfcn, &hfcn, sizeof(fcn), 0, cudaMemcpyHostToDevice);
foo<<<1,1>>>();
cudaThreadSynchronize();
return 0;
}
But this way I get the following error when debugging with cuda-gdb:
CUDA Exception: Warp Illegal Instruction
Program received signal CUDA_EXCEPTION_4, Warp Illegal Instruction.
0x0000000000806b30 in dtest () at func.cu:5
I appreciate any help you all can give me! :)
Calling a __device__ function in one compilation unit from device code in another compilation unit requires separate compilation with device linking usage of nvcc.
However, such usage with libraries only works with static libraries.
Therefore if the target __device__ function is in the .so library, and the calling code is outside of the .so library, your approach cannot work, with the current nvcc toolchain.
The only "workarounds" I can suggest would be to put the desired target function in a static library, or else put both caller and target inside the same .so library. There are a number of questions/answers on the cuda tag which give examples of these alternate approaches.
I am using CUDA 5.0. I noticed that the compiler will allow me to use host-declared int constants within kernels. However, it refuses to compile any kernels that use host-declared float constants. Does anyone know the reason for this seeming discrepancy?
For example, the following code runs just fine as is, but it will not compile if the final line in the kernel is uncommented.
#include <cstdio>
#include <cuda_runtime.h>
static int __constant__ DEV_INT_CONSTANT = 1;
static float __constant__ DEV_FLOAT_CONSTANT = 2.0f;
static int const HST_INT_CONSTANT = 3;
static float const HST_FLOAT_CONSTANT = 4.0f;
__global__ void uselessKernel(float * val)
{
*val = 0.0f;
// Use device int and float constants
*val += DEV_INT_CONSTANT;
*val += DEV_FLOAT_CONSTANT;
// Use host int and float constants
*val += HST_INT_CONSTANT;
//*val += HST_FLOAT_CONSTANT; // won't compile if uncommented
}
int main(void)
{
float * d_val;
cudaMalloc((void **)&d_val, sizeof(float));
uselessKernel<<<1, 1>>>(d_val);
cudaFree(d_val);
}
Adding a const number in the device code is OK, but adding a number stored on the host memory in the device code is NOT.
Every reference of the static const int in your code can be replaced with the value 3 by the compiler/optimizer when the addr of that variable is never referenced. In this case, it is like #define HST_INT_CONSTANT 3, and no host memory is allocated for this variable.
But for float var, the host memory is always allocated even it is of static const float. Since the kernel can not access the host memory directly, your code with static const float won't be compiled.
For C/C++, int can be optimized more aggressively than float.
That your code compiles and runs at all when the host const int line is used can be seen as a bug of CUDA C, I think. The static const int is a host-side thing, and should not be accessible to the device directly.
I've ported a cuda project from linux to windows (basically just added few defines and typedefs in the header file). I'm using visual studio 2008, and the cuda runtime api custom build rules from the SDK. The code is c, not c++ (and I'm compiling /TC not /TP)
I'm having scope issues that I didn't have in linux. Global variables in my header file aren't shared between the .c files and .cu files.
I've created a simplified project, and here is all of the code:
main.h:
#ifndef MAIN_H
#define MAIN_H
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
cudaEvent_t cudaEventStart;
#if defined __cplusplus
extern "C" void func(void);
#else
extern void func(void);
#endif
#endif
main.c:
#include "main.h"
int main(void)
{
int iDevice = 0;
cudaSetDevice(iDevice);
cudaFree(0);
cudaGetDevice(&iDevice);
printf("device: %d\n", iDevice);
cudaEventCreate(&cudaEventStart);
printf("create event: %d\n", (int) cudaEventStart);
func();
cudaEventDestroy(cudaEventStart);
printf("destroy event: %d\n", (int) cudaEventStart);
return cudaThreadExit();
}
kernel.cu:
#include "main.h"
void func()
{
printf("event in cu: %d\n", (int) cudaEventStart);
}
output:
device: 0
create event: 44199920
event in cu: 0
event destroy: 441999920
Any ideas about what I am doing wrong here? How do I need to change my setup so that it works in visual studio? Ideally, I'd like a setup that works multi-platform.
CUDA 3.2, GTX 480, 64-bit Win7, 263.06 general
What you are trying to do would not work even without CUDA -- try renaming kernel.cu to kernel.c and recompile. You will get a linker error because cudaEventStart will be multiply defined -- once in each compilation unit (.c file) that includes the header. You would need to make the variable static, and initialize it in only one compilation unit.
It compiles in CUDA because CUDA does not have a linker, and therefore code in compilation units compiled by nvcc (.cu files) cannot reference symbols in other compilation units. CUDA doesn't support static global variables currently. In the future CUDA will have a linker, but currently it does not.
What is happening is each compilation unit is getting its own, non-conflicting instance of cudaEventStart.
What you can do is get rid of the global variable (make it a local variable in main()), add cudaEvent_t parameters to the functions that need to use the event, and then pass the event variable around.
BTW, in your second post, you have circular #includes...
I modified my simplified example (with success) by including the .cu file in the header and removing the forward declarations of the .cu file function.
main.h:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "kernel.cu"
cudaEvent_t cudaEventStart;
main.c:
#include "main.h"
int main(void)
{
int iDevice = 0;
cudaSetDevice(iDevice);
cudaFree(0);
cudaGetDevice(&iDevice);
printf("device: %d\n", iDevice);
cudaEventCreate(&cudaEventStart);
printf("create event: %d\n", (int) cudaEventStart);
func();
cudaEventDestroy(cudaEventStart);
printf("destroy event: %d\n", (int) cudaEventStart);
return cudaThreadExit();
}
kernel.cu:
#ifndef KERNEL_CU
#define KERNEL_CU
#include "main.h"
void func(void);
void func()
{
printf("event in cu: %d\n", (int) cudaEventStart);
}
#endif
output:
device: 0
create event: 42784024
event in cu: 42784024
event destroy: 42784024
About to see if it works in my real project, and whether the solution is portable back to linux.
This is my first question. I gave up and will use a hand rolled functor for this, but I am curious as to how it is supposed to be done. The contrived example below is intended to resize all of the vectors in a vector to be of size 9, by filling them with nulls. The indicated line causes MinGW GCC 4.5.0 to spew a lot of template errors. I've tried several different permutations, but only posted the code that I consider to be "closest to correct" below. How should it be written? Note, I want to retain the two-argument version of resize.
#include <vector>
using std::vector;
#include <algorithm>
using std::for_each;
#include <tr1/functional>
using std::tr1::bind;
using std::tr1::placeholders::_1;
int main() {
vector<vector<void *> > stacked_vector(20);
for_each(stacked_vector.begin(),stacked_vector.end(),
bind(&std::vector<void *>::resize,_1,9,0/*NULL*/)); // voluminous error output
return 0;
}
Thank you very much for your input.
It's hard to say without seeing the error output (and frankly, even with it). However, try passing the NULL as a void* type: static_cast<void*>(0). Otherwise the object returned by bind tries to give an int value as the second parameter to resize.
Try this.
#include <functional>
#include <algorithm>
#include <iostream>
#include <vector>
int main()
{
typedef std::vector<int> vec_int;
typedef std::vector<vec_int> vec_vec_int;
// Do this to make the _1 work
using namespace std::placeholders;
static const int FIRST_DIM = 5;
static const int SECOND_DIM = 10;
static const int DEFAULT_VALUE = 66;
vec_vec_int v(FIRST_DIM);
std::for_each(v.begin(), v.end(),
std::bind(&vec_int::resize, _1, SECOND_DIM, DEFAULT_VALUE));
std::cout << v[4][9];
return (0);
}
If you do not want to specify the default value, you do not need to.