Allocate constant memory - cuda

I'm trying to set my simulation params in constant memory but without luck (CUDA.NET).
cudaMemcpyToSymbol function returns cudaErrorInvalidSymbol. The first parameter in cudaMemcpyToSymbol is string... Is it symbol name? actualy I don't understand how it could be resolved. Any help appreciated.
//init, load .cubin
float[] arr = new float[1];
arr[0] = 0.0f;
int size = Marshal.SizeOf(arr[0]) * arr.Length;
IntPtr ptr = Marshal.AllocHGlobal(size);
Marshal.Copy(arr, 0, ptr, arr.Length);
var error = CUDARuntime.cudaMemcpyToSymbol("param", ptr, 4, 0, cudaMemcpyKind.cudaMemcpyHostToDevice);
my .cu file contain
__constant__ float param;
Working solution
cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "name.cubin"));
simParams = cuda.GetModuleGlobal("params");
float[] parameters = new float[N]{...}
cuda.CopyHostToDevice<float>(simParams, parameters);

Unfortunately the __ constant __ must be in the same file scope as the memcpy to the symbol, and in your case your __ constant __ is in a separate .cu file.
The simple way around this is to provide a wrapper function in your .cu file, for example:
__constant__ float param;
// Host function to set the constant
void setParam(float value)
{
cudaMemcpyToSymbol("param", ptr, 4, 0, cudaMemcpyHostToDevice);
}
// etc.
__global__ void ...

If this question is actual you can use cuModuleGetGlobal and next cudaMemcpy like this:
private bool setValueToSymbol(CUmodule module, string symbol, int value)
{
CUdeviceptr devPtr = new CUdeviceptr();
uint lenBytes = 0;
CUResult result = CUDADriver.cuModuleGetGlobal(ref devPtr, ref lenBytes, module, symbol);
if (result == CUResult.Success)
{
int[] src = new int[] { value };
cudaError error = CUDARuntime.cudaMemcpy(devPtr, src, lenBytes, cudaMemcpyKind.cudaMemcpyHostToDevice);
if (error == cudaError.cudaSuccess)
return true;
else
return false;
}
else
{
return false;
}
}
where CUmodule module = cuda.LoadModule("MyCode.cubin");
This code works with NVIDIA GPU Computing SDK 3.1 and CUDA.NET 3.0.

constant memory has implicit local scope linkage.
make sure declaration is in the same file where you use it. it sounds like you have two files.
may also have to declare param to array (or maybe not)

Related

implementing random generator in cuda

i want to implement this function in cuda as device/global function so as to obtain random numbers which are in gaussian distribution.
double gasdev2() {
double ran3n(long *seed);
// double genrand64_real3();
static int iset=0;
static double gcos;
double tmp1,tmp2;
if (iset==0) {
tmp1=sqrt(-2*log(ran3n(&seed)));
tmp2=pi2*ran3n(&seed);
// tmp1=sqrt(-2*log(genrand64_real3()));
// tmp2=pi2*genrand64_real3();
gcos=tmp1*cos(tmp2);
iset=1;
return tmp1*sin(tmp2);
//return 1;
}else{
iset=0;
return gcos;
//return 1;
}
}
this function will be basically used in these function calls and in serial code these are like this
for(int i=0;i<NTO;i++){
Frdx[j]=gasdev2()*ranm[j]*tconst;
Frdy[j]=gasdev2()*ranm[j]*tconst;
Frdz[j]=gasdev2()*ranm[j]*tconst;
}
I'd suggest not implementing it yourself but using the random algorithms provided by Thrust:
uint32_t seed = 1234;
thrust::default_random_engine rng(seed);
thrust::uniform_real_distribution<float> dist(0.0f, 1.0f);
float random_value_1 = dist(rng);
float random_value_2 = dist(rng);
You can use this both in host and device code.
Have a look at the Thrust examples.

passing 2 parameters of undefined type to a constructor (of 2 expected)

I have a constructor where as first 2 parameteres I would like to pass:
ID3D11ShaderResourceView* as a steady downloaded texture OR
const CHAR* as a filename to downlowd this texture and assign it to a class ID3D11ShaderResourceView* member (with following release on demand), but I can not understand the way I should do it correctly. it looks this way:
class {button
public
button() {};
button(data1 (or texture or filname), data2 (or texture or filname), rest data....);
...
~button();
}
so I tried:
templates but failed, may be cause of knowledge lack, templates
define one type while I need a choice of 2. Varradic templates, or I didnt get them right but they mean undetermined amount of variables when I need to differ only 2 first.
Unions but it had conflict with class variable set - said could not match const char [amount] with const char* and unions do not work with std::string.
tried void* with typeid.name() but it always showed me "void *"
I don't want to overload constructors, becuase this will create 4+ of them barely differing one from another. Do you think boost::variant helps me in this case? Is there any smooth and effective way to build that kind of constructor? My c++ knowledge is on beginning level, sorry if its a duplicate topic, read all it suggested to me while creating it but didn't seem to find out anything closely similar, thanks:)
Update:
Applied boost::any, got next results:
class button : public frame {
public:
button() {};
button(boost::any _texture,
boost::any _hover_texture,
...
};
if (_texture.type() == typeid(ID3D11ShaderResourceView*)) texture = boost::any_cast<ID3D11ShaderResourceView*>(_texture);
if (_texture.type() == typeid(const char*))
{
if ( FAILED(D3DX11CreateShaderResourceViewFromFile(gvDevice,boost::any_cast<const char*>(_texture), NULL, NULL, &texture, NULL )) )
mboxout( "loading texture failed.", "UI texture load error", true );
};
if (_hover_texture.type() == typeid(ID3D11ShaderResourceView*)) hover_texture = boost::any_cast<ID3D11ShaderResourceView*>(_hover_texture);
if (_hover_texture.type() == typeid(const char*))
{
if ( FAILED(D3DX11CreateShaderResourceViewFromFile(gvDevice, boost::any_cast<const char*>(_hover_texture), NULL, NULL, &hover_texture, NULL )) )
mboxout( "loading texture failed.", "UI texture load error", true );
};
Is it the only possible decision because this one seems akward for me? Thanks :)
As always, when you have combinatoric explosion/tedious repetition, refactor your code into reusable units.
In this case, your constructor could be just
template <typename T1, typename T2>
button(T1 const& texture, T2 const& hover_texture)
: _texture(load(texture)),
_hover_texture(load(hover_texture))
{
};
And all the loading logic would be inside... you guess it the load function. A sample implementation of that:
static ID3D11ShaderResourceView* load(ID3D11ShaderResourceView* v){
return v; // just return the resources passed in
}
static ID3D11ShaderResourceView* load(char const* fname) {
ID3D11ShaderResourceView* p = NULL;
if (FAILED(D3DX11CreateShaderResourceViewFromFile(gvDevice, fname, NULL, NULL, &p, NULL)))
throw std::runtime_error(std::string("loading texture failed (") + fname + ")");
return p;
}
Note: while we were at it we separated UI from business logic. You do not want to display messageboxes from inside constructors. Ever. You just want to notify the caller of the problem and the caller decides what to do (use another resource, try a different path, retry a download, write a warning message to the log, shut down etc.)
Full Demo
Live On Coliru
#include <iostream>
#include <stdexcept>
///////////////////////////////////////////////////////
// mocking ID3D*
struct ID3D11ShaderResourceView;
enum ERROR_CODE { ERR_OK };
#define FAILED(e) (ERR_OK != (e))
static int gvDevice = 42;
ERROR_CODE D3DX11CreateShaderResourceViewFromFile(int, char const* fname, void*, void*, ID3D11ShaderResourceView**, void*) {
std::cout << "Loaded from " << fname << "\n";
return ERR_OK;
}
//
///////////////////////////////////////////////////////
struct frame{ virtual ~frame() = default; };
class button : public frame {
public:
button() {};
template <typename T1, typename T2>
button(T1 const& texture, T2 const& hover_texture)
: _texture(load(texture)),
_hover_texture(load(hover_texture))
{
};
private:
// TODO Rule-Of-Three constructor/destructorl
// SUGGEST: Rule-Of-Zero using shared pointers instead
ID3D11ShaderResourceView* _texture;
ID3D11ShaderResourceView* _hover_texture;
static ID3D11ShaderResourceView* load(ID3D11ShaderResourceView* v) { return v; }
static ID3D11ShaderResourceView* load(char const* fname) {
ID3D11ShaderResourceView* p = NULL;
if (FAILED(D3DX11CreateShaderResourceViewFromFile(gvDevice, fname, NULL, NULL, &p, NULL)))
throw std::runtime_error(std::string("loading texture failed (") + fname + ")");
return p;
}
};
#include <cassert>
int main() {
ID3D11ShaderResourceView* default_texture = NULL;
assert(!FAILED( D3DX11CreateShaderResourceViewFromFile(gvDevice, "default_texture.bin", NULL, NULL, &default_texture, NULL)));
try {
button button1("t1.bin", "hover1.bin");
button button2(default_texture, "hover2.bin");
button button3("t3.bin", default_texture);
button button4(default_texture, default_texture);
} catch(std::exception const& e) {
std::cout << "Oops: " << e.what() << "\n";
}
}
Prints:
Loaded from default_texture.bin
Loaded from t1.bin
Loaded from hover1.bin
Loaded from hover2.bin
Loaded from t3.bin
There's still a lot to be improved (see the comments, e.g.) but this is a start.

thrust::device_vector in constant memory

I have a float array that needs to be referenced many times on the device, so I believe the best place to store it is in __ constant __ memory (using this reference). The array (or vector) will need to be written once at run-time when initializing, but read by multiple different functions many millions of times, so constant copying to the kernel each function call seems like A Bad Idea.
const int n = 32;
__constant__ float dev_x[n]; //the array in question
struct struct_max : public thrust::unary_function<float,float> {
float C;
struct_max(float _C) : C(_C) {}
__host__ __device__ float operator()(const float& x) const { return fmax(x,C);}
};
void foo(const thrust::host_vector<float> &, const float &);
int main() {
thrust::host_vector<float> x(n);
//magic happens populate x
cudaMemcpyToSymbol(dev_x,x.data(),n*sizeof(float));
foo(x,0.0);
return(0);
}
void foo(const thrust::host_vector<float> &input_host_x, const float &x0) {
thrust::device_vector<float> dev_sol(n);
thrust::host_vector<float> host_sol(n);
//this method works fine, but the memory transfer is unacceptable
thrust::device_vector<float> input_dev_vec(n);
input_dev_vec = input_host_x; //I want to avoid this
thrust::transform(input_dev_vec.begin(),input_dev_vec.end(),dev_sol.begin(),struct_max(x0));
host_sol = dev_sol; //this memory transfer for debugging
//this method compiles fine, but crashes at runtime
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(dev_x);
thrust::transform(dev_ptr,dev_ptr+n,dev_sol.begin(),struct_max(x0));
host_sol = dev_sol; //this line crashes
}
I tried adding a global thrust::device_vector dev_x(n), but that also crashed at run-time, and would be in __ global __ memory rather than __ constant__ memory
This can all be made to work if I just discard the thrust library, but is there a way to use the thrust library with globals and device constant memory?
Good question! You can't cast a __constant__ array as if it's a regular device pointer.
I will answer your question (after the line below), but first: this is a bad use of __constant__, and it isn't really what you want. The constant cache in CUDA is optimized for uniform access across threads in a warp. That means all threads in the warp access the same location at the same time. If each thread of the warp accesses a different constant memory location, then the accesses get serialized. So your access pattern, where consecutive threads access consecutive memory locations, will be 32 times slower than a uniform access. You should really just use device memory. If you need to write the data once, but read it many times, then just use a device_vector: initialize it once, and then read it many times.
To do what you asked, you can use a thrust::counting_iterator as the input to thrust::transform to generate a range of indices into your __constant__ array. Then your functor's operator() takes an int index operand rather than a float value operand, and does the lookup into constant memory.
(Note that this means your functor is now __device__ code only. You could easily overload the operator to take a float and call it differently on host data if you need portability.)
I modified your example to initialize the data and print the result to verify that it is correct.
#include <stdio.h>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
const int n = 32;
__constant__ float dev_x[n]; //the array in question
struct struct_max : public thrust::unary_function<float,float> {
float C;
struct_max(float _C) : C(_C) {}
// only works as a device function
__device__ float operator()(const int& i) const {
// use index into constant array
return fmax(dev_x[i],C);
}
};
void foo(const thrust::host_vector<float> &input_host_x, const float &x0) {
thrust::device_vector<float> dev_sol(n);
thrust::host_vector<float> host_sol(n);
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(dev_x);
thrust::transform(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(n),
dev_sol.begin(),
struct_max(x0));
host_sol = dev_sol; //this line crashes
for (int i = 0; i < n; i++)
printf("%f\n", host_sol[i]);
}
int main() {
thrust::host_vector<float> x(n);
//magic happens populate x
for (int i = 0; i < n; i++) x[i] = rand() / (float)RAND_MAX;
cudaMemcpyToSymbol(dev_x,x.data(),n*sizeof(float));
foo(x, 0.5);
return(0);
}

CUDA program giving garbage value

I really do not understand why the output for the below code is not a and b.
#include<cutil.h>
#include<iostream>
__global__ void p(unsigned char **a){
unsigned char temp[2];
temp[0] = 'a';
temp[1] = 'b';
a[0] = temp;
}
void main(){
unsigned char **a ;
cudaMalloc((void**)&a, sizeof(unsigned char*));
p<<<1,1>>>(a);
unsigned char **c;
unsigned char b[2];
cudaMemcpy(c, a, sizeof(unsigned char *), cudaMemcpyDeviceToHost);
cudaMemcpy(b, c[0], 2*sizeof(unsigned char), cudaMemcpyDeviceToHost);
for( int i=0 ; i < 2; i++){
printf("%c\n", b[i]);
}
getchar();
}
what is wrong with my logic?
Let's leave out CUDA for now. Let's just make a function that writes data to a user-provided array. The user passes the array via a pointer:
void fill_me_up(int * dst)
{
// We sure hope that `dst` points to a large enough area of memory!
dst[0] = 28;
dst[1] = 75;
}
Now, what you're doing with the local variable doesn't make sense, because you want to use the address of a local variable, which becomes invalid after you leave the function scope. The next best thing you could do is memcpy(), or some equivalent C++ algorithm:
void fill_me_up_again(int * dst)
{
int temp[] = { 28, 75 };
memcpy((void *)dst, (const void *)temp, sizeof(temp));
}
OK, now on to calling that function: We first must provide the target memory, and then pass a pointer:
int main()
{
int my_memory[2]; // here's our memory -- automatic local storage
fill_me_up(my_memory); // OK, array decays to pointer-to-beginning
fill_me_up(&my_memory[0]); // A bit more explicit
int * your_memory = malloc(sizeof(int) * 2); // more memory, this time dynamic
fill_me_up_again(your_memory);
/* ... */
free(your_memory);
}
(In C++ you would probably have uses new int[2] and delete your_memory instead, but by using C malloc() the connection to CUDA hopefully becomes clear.)
When you're moving fill_me_up to the CUDA device, you have to give it a device pointer rather than a host pointer, so you have to set that one up first and afterwards copy the results back out, but that's about the only change.

mips _Unwind_Backtrace on SIGSEGV

On a mips platform, I am trying to get Unwind work. Currently if I issue print_trace manually stack trace is correctly shown as below:
backtrace_helper 0x4b6958
backtrace_helper 0x4b6ab4
backtrace_helper 0x2ac2f628
Obtained 3 stack frames.
./v(print_trace+0x38) [0x4b6958]
./v(main+0x90) [0x4b6ab4]
/lib/libc.so.0(__uClibc_main+0x24c) [0x2ac2f628]
But when a SIGSEGV occurs, stack trace does not show correct function call sequence.
backtrace_helper 0x4b7a74
backtrace_helper 0x2ab9b84c
Obtained 2 stack frames.
./v(getLineIDByPhyIdx+0x3d8) [0x4b7a74]
/lib/libpthread.so.0(__new_sem_post+0x2c8) [0x2ab9b84c]
I am compiling with -g -fexceptions -rdynamic. Also I have seen How to generate a stacktrace when my gcc C++ app crashes in which 2nd answer mentiones about wrong address but when I set as he does but it only changes 2nd frame and rest is the same, code snippet is below:
caller_address = (void *) uc->uc_mcontext.gregs[30]; // Frame pointer (from wikipedia here)
fprintf(stderr, "signal %d (%s), address is %p from %p\n",
sig_num, strsignal(sig_num), info->si_addr,
(void *)caller_address);
size = backtrace(array, 50);
/* overwrite sigaction with caller's address */
array[1] = caller_address;
messages = backtrace_symbols(array, size);
Code:
int main(int argc, char *argv[]) {
registerSignalHandler(signalHandler);
print_trace();
{
// Seg Fault
int *p = NULL;
*p = 54;
}
}
void print_trace(void) {
void *array[10];
size_t size;
char **strings;
size_t i;
/* Get the address at the time the signal was raised from the EIP (x86) */
size = backtrace(array, 10);
strings = backtrace_symbols(array, size);
printf("Obtained %zd stack frames.\n", size);
for (i = 0; i < size; i++)
printf("%s\n", strings[i]);
free(strings);
}
static _Unwind_Reason_Code
backtrace_helper (struct _Unwind_Context *ctx, void *a)
{
struct trace_arg *arg = a;
assert (unwind_getip != NULL);
/* We are first called with address in the __backtrace function. Skip it. */
if (arg->cnt != -1) {
arg->array[arg->cnt] = (void *) unwind_getip (ctx);
printf("backtrace_helper %p \n", arg->array[arg->cnt]);
}
if (++arg->cnt == arg->size)
return _URC_END_OF_STACK;
return _URC_NO_REASON;
}
/*
* Perform stack unwinding by using the _Unwind_Backtrace.
*
* User application that wants to use backtrace needs to be
* compiled with -fexceptions option and -rdynamic to get full
* symbols printed.
*/
int backtrace (void **array, int size)
{
struct trace_arg arg = { .array = array, .size = size, .cnt = -1 };
if (unwind_backtrace == NULL)
backtrace_init();
if (size >= 1)
unwind_backtrace (backtrace_helper, &arg);
return arg.cnt != -1 ? arg.cnt : 0;
}
void signalHandler( int sig, siginfo_t* siginfo, void* notused)
{
/* Print out the signal info */
signalInfo(sig, siginfo);
switch (sig) {
case SIGSEGV:
{
print_trace();
abort();
}
}
}
Frame pointer is practically never used on MIPS and obtaining backtrace without digging into symbols requires some heuristics.
Typical approach is to analyze code preceding current instruction pointer and try finding function prologue that adjusts SP. Using that info one can figure out location of preceding frame, etc.
See these slides for some of the gory details:
http://elinux.org/images/0/07/Intricacies_of_a_MIPS_Stack_Backtrace_Implementation.pdf