Implementing a random number generator in CUDA

I want to implement this function in CUDA as a device/global function so as to obtain random numbers drawn from a Gaussian distribution.
// Box-Muller transform: each pass through the "iset == 0" branch produces two
// Gaussian deviates from two uniform deviates; one is returned and the other
// (gcos) is cached for the next call. `seed` and `pi2` (presumably 2*pi) are
// globals defined elsewhere.
double gasdev2() {
    double ran3n(long *seed);      // uniform RNG in (0,1), defined elsewhere
    // double genrand64_real3();   // alternative 64-bit generator
    static int iset = 0;
    static double gcos;
    double tmp1, tmp2;
    if (iset == 0) {
        tmp1 = sqrt(-2*log(ran3n(&seed)));
        tmp2 = pi2*ran3n(&seed);
        // tmp1 = sqrt(-2*log(genrand64_real3()));
        // tmp2 = pi2*genrand64_real3();
        gcos = tmp1*cos(tmp2);     // cache the second deviate for the next call
        iset = 1;
        return tmp1*sin(tmp2);
        //return 1;
    } else {
        iset = 0;
        return gcos;
        //return 1;
    }
}
This function will basically be used in the following function calls; in the serial code they look like this:
for (int j = 0; j < NTO; j++) {  // the original mixed i and j; the loop and array index must match
    Frdx[j] = gasdev2()*ranm[j]*tconst;
    Frdy[j] = gasdev2()*ranm[j]*tconst;
    Frdz[j] = gasdev2()*ranm[j]*tconst;
}

I'd suggest not implementing it yourself but using the random algorithms provided by Thrust:
uint32_t seed = 1234;
thrust::default_random_engine rng(seed);
thrust::uniform_real_distribution<float> dist(0.0f, 1.0f);
float random_value_1 = dist(rng);
float random_value_2 = dist(rng);
You can use this both in host and device code.
Have a look at the Thrust examples.
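Since you specifically need Gaussian deviates, note that Thrust also provides thrust::normal_distribution in <thrust/random.h>, so you don't have to rebuild Box-Muller on top of the uniform distribution. Below is a minimal sketch of filling one force array in parallel; the functor name and the per-element discard() seeding scheme are illustrative choices, not from the original code:
#include <thrust/random.h>
#include <thrust/transform.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>

struct gaussian_deviate
{
    unsigned int seed;
    gaussian_deviate(unsigned int s) : seed(s) {}
    __host__ __device__
    float operator()(unsigned int i) const
    {
        thrust::default_random_engine rng(seed);
        rng.discard(i); // give each element its own subsequence
        thrust::normal_distribution<float> dist(0.0f, 1.0f);
        return dist(rng);
    }
};

int main()
{
    const unsigned int NTO = 1024;
    thrust::device_vector<float> Frdx(NTO);
    thrust::transform(thrust::make_counting_iterator(0u),
                      thrust::make_counting_iterator(NTO),
                      Frdx.begin(),
                      gaussian_deviate(1234));
    // scaling by ranm[j]*tconst can be fused into the functor or done
    // with a second thrust::transform
    return 0;
}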

Related

How do I properly implement an inline device function that returns a vector to another device function?

I want to properly implement an inlined device function that fills a vector of dynamic size and returns it, like:
__device__ inline thrust::device_vector<double> make_array(double zeta, int l)
{
    thrust::device_vector<double> ret;
    int N = (int)(5*l + zeta); // the size of the array will depend on l and zeta, in a complex way...
    // Make sure of sufficient memory allocation
    ret.reserve(N);
    // Resize array
    ret.resize(N);
    // fill it:
    // for(int i=0;i<N;i++)
    //     ...;
    return ret;
}
My goal is to use the content of the returned vector in another device function like:
__device__ inline double use_array(double zeta, int l)
{
    thrust::device_vector<double> array = make_array(zeta, l);
    double result = 0;
    for (int i = 0; i < array.size(); i++)
        result += array[i];
    return result;
}
How can I do it properly? My feeling is that a Thrust vector is designed for this type of task, but I want to do it properly. What is the standard CUDA approach to this task?
thrust::device_vector is not usable in device code.
However you can return a pointer to a dynamically allocated area, like so:
#include <assert.h>

template <typename T>
__device__ T* make_array(T zeta, int l)
{
    int N = (int)(5*l + zeta); // the size of the array will depend on l and zeta, in a complex way...
    T *ret = (T *)malloc(N * sizeof(T));
    assert(ret != NULL); // error checking
    // fill it:
    // for(int i=0;i<N;i++)
    //     ret[i] = ...;
    return ret;
}
The inline keyword should not be necessary. The compiler will aggressively inline functions wherever possible.
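One caveat to add: device-side malloc allocates from the device heap (sized with cudaDeviceSetLimit(cudaLimitMallocHeapSize, ...)) and must be paired with device-side free. A sketch of the consuming function under that assumption, recomputing N the same way make_array does:
__device__ double use_array(double zeta, int l)
{
    int N = (int)(5*l + zeta); // must match the size logic in make_array
    double *arr = make_array(zeta, l); // assumes make_array filled the buffer
    double result = 0.0;
    for (int i = 0; i < N; i++)
        result += arr[i];
    free(arr); // device-side free pairs with device-side malloc
    return result;
}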

CUDA Thrust: Finding the index of the first element in a vector satisfying a predicate (e.g., zero or negative) [Matlab's syntax min(find(x<=0))]

I am attempting to find the index of the first zero or negative value of an array using CUDA Thrust. The serial CPU code I am attempting to write using CUDA Thrust is the following:
for (int i = StartIndex; i <= ArrayLimitIndex; i++)
{
    if (Array[i] <= 0) { DesiredIndex = i; break; }
}
I am thinking that the easiest way to do this on the GPU will be using the find_if function within the Thrust library.
The array is already on the GPU and I am attempting to search for the index on this array using Thrust as such:
struct less_than_or_eq_zero
{
    __host__ __device__
    bool operator() (double x)
    {
        return x <= 0;
    }
};

thrust::device_vector<double>::iterator iter;
thrust::device_ptr<double> dev_ptr_Col46 = thrust::device_pointer_cast(dev_Col46);
iter = thrust::find_if(thrust::device, dev_ptr_Col46, dev_ptr_Col46 + size, less_than_or_eq_zero());
Now I would like to use the value of iter as an argument for my next kernel:
newKernel<<<size, 1>>>(*dev_array, iter)
where the newKernel definition is of the form:
__global__ void newKernel(double *dev_array, iter)
{
    int x = blockIdx.x;
    if (x <= iter)
    {
        // process data here...
    }
}
I know that the code I have here is incorrect, and I have a few questions regarding the use of iter. First, iter is a device_vector iterator. Is there any way I can make iter just one value and not a vector? Also, once I have executed the find_if, how can I use the value of iter in my next kernel call?
Any help with this would be greatly appreciated.
Thanks
I'm summarizing the comments by talonmies and Jared Hoberock above, as well as the answer by Sebastian Dressler, in a fully compilable and executable example. The code uses CUDA Thrust to calculate the index of the first element of a vector satisfying a predicate (x <= 0. in this case); I hope it will be helpful for future readers.
#include <thrust/device_vector.h>
#include <stdio.h>

struct less_than_or_eq_zero
{
    // take float to match the vector's element type
    __host__ __device__ bool operator() (float x) { return x <= 0.f; }
};

int main(void)
{
    int N = 6;
    thrust::device_vector<float> D(N);
    D[0] =  3.;
    D[1] =  2.3;
    D[2] = -1.3;
    D[3] =  0.;
    D[4] =  3.;
    D[5] = -44.;
    thrust::device_vector<float>::iterator iter1 = D.begin();
    thrust::device_vector<float>::iterator iter2 = thrust::find_if(D.begin(), D.begin() + N, less_than_or_eq_zero());
    int d = thrust::distance(iter1, iter2);
    printf("Index = %i\n", d);
    getchar();
    return 0;
}
As you do not use a device_vector in your kernel but a raw array, you have to pass it an index, not an iterator. You can obtain the index by using thrust::distance to calculate the distance between dev_ptr_Col46 and iter.
You'll also want to read the Thrust iterator documentation, where distance is documented.
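Applied to the code in the question, that approach looks roughly like this (a sketch reusing the question's names; the kernel would then take an int rather than an iterator):
thrust::device_ptr<double> dev_ptr_Col46 = thrust::device_pointer_cast(dev_Col46);
thrust::device_ptr<double> iter = thrust::find_if(dev_ptr_Col46, dev_ptr_Col46 + size, less_than_or_eq_zero());
int index = thrust::distance(dev_ptr_Col46, iter); // host-side index of the first match
newKernel<<<size, 1>>>(dev_array, index);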
Try this:
thrust::device_ptr<double> val_ptr = thrust::find_if(dev_ptr_Col46, dev_ptr_Col46 + size, less_than_or_eq_zero());
double *val = thrust::raw_pointer_cast(val_ptr);
newKernel<<<size, 1>>>(dev_array, val);
Your kernel will then have to have the signature
__global__ void newKernel(double * dev_array, double * val)
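With that signature the kernel can recover the cutoff by pointer comparison, for example (a sketch, assuming dev_array and dev_Col46 refer to the same buffer, and mirroring the question's x <= iter test):
__global__ void newKernel(double *dev_array, double *val)
{
    int x = blockIdx.x;
    if (dev_array + x <= val) // elements up to and including the first match
    {
        // process data here...
    }
}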

thrust::device_vector in constant memory

I have a float array that needs to be referenced many times on the device, so I believe the best place to store it is in __constant__ memory (using this reference). The array (or vector) will need to be written once at run-time when initializing, but read by multiple different functions many millions of times, so copying it to the kernel on each function call seems like A Bad Idea.
const int n = 32;
__constant__ float dev_x[n]; // the array in question

struct struct_max : public thrust::unary_function<float,float> {
    float C;
    struct_max(float _C) : C(_C) {}
    __host__ __device__ float operator()(const float& x) const { return fmax(x, C); }
};

void foo(const thrust::host_vector<float> &, const float &);

int main() {
    thrust::host_vector<float> x(n);
    // magic happens: populate x
    cudaMemcpyToSymbol(dev_x, x.data(), n*sizeof(float));
    foo(x, 0.0);
    return(0);
}
void foo(const thrust::host_vector<float> &input_host_x, const float &x0) {
    thrust::device_vector<float> dev_sol(n);
    thrust::host_vector<float> host_sol(n);

    // this method works fine, but the memory transfer is unacceptable
    thrust::device_vector<float> input_dev_vec(n);
    input_dev_vec = input_host_x; // I want to avoid this
    thrust::transform(input_dev_vec.begin(), input_dev_vec.end(), dev_sol.begin(), struct_max(x0));
    host_sol = dev_sol; // this memory transfer for debugging

    // this method compiles fine, but crashes at runtime
    thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(dev_x);
    thrust::transform(dev_ptr, dev_ptr + n, dev_sol.begin(), struct_max(x0));
    host_sol = dev_sol; // this line crashes
}
I tried adding a global thrust::device_vector<float> dev_x(n), but that also crashed at run-time, and it would live in global memory rather than __constant__ memory.
This can all be made to work if I just discard the thrust library, but is there a way to use the thrust library with globals and device constant memory?
Good question! You can't cast a __constant__ array as if it's a regular device pointer.
I will answer your question (after the line below), but first: this is a bad use of __constant__, and it isn't really what you want. The constant cache in CUDA is optimized for uniform access across threads in a warp. That means all threads in the warp access the same location at the same time. If each thread of the warp accesses a different constant memory location, then the accesses get serialized. So your access pattern, where consecutive threads access consecutive memory locations, will be 32 times slower than a uniform access. You should really just use device memory. If you need to write the data once, but read it many times, then just use a device_vector: initialize it once, and then read it many times.
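To make the access-pattern point concrete, here is a sketch with two hypothetical kernels (not from the question):
__constant__ float coeff[32];

// uniform: every thread in a warp reads the same element -> one cached fetch
__global__ void uniform_access(float *out, int k)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    out[tid] = coeff[k];
}

// divergent: each thread reads a different element -> up to 32 serialized
// fetches per warp
__global__ void divergent_access(float *out)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    out[tid] = coeff[tid % 32];
}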
To do what you asked, you can use a thrust::counting_iterator as the input to thrust::transform to generate a range of indices into your __constant__ array. Then your functor's operator() takes an int index operand rather than a float value operand, and does the lookup into constant memory.
(Note that this means your functor is now __device__ code only. You could easily overload the operator to take a float and call it differently on host data if you need portability.)
I modified your example to initialize the data and print the result to verify that it is correct.
#include <stdio.h>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>

const int n = 32;
__constant__ float dev_x[n]; // the array in question

struct struct_max : public thrust::unary_function<int,float> {
    float C;
    struct_max(float _C) : C(_C) {}

    // only works as a device function
    __device__ float operator()(const int& i) const {
        // use index into constant array
        return fmax(dev_x[i], C);
    }
};

void foo(const thrust::host_vector<float> &input_host_x, const float &x0) {
    thrust::device_vector<float> dev_sol(n);
    thrust::host_vector<float> host_sol(n);

    thrust::transform(thrust::make_counting_iterator(0),
                      thrust::make_counting_iterator(n),
                      dev_sol.begin(),
                      struct_max(x0));
    host_sol = dev_sol; // copy the result back to verify it

    for (int i = 0; i < n; i++)
        printf("%f\n", host_sol[i]);
}

int main() {
    thrust::host_vector<float> x(n);
    // populate x
    for (int i = 0; i < n; i++) x[i] = rand() / (float)RAND_MAX;
    cudaMemcpyToSymbol(dev_x, x.data(), n*sizeof(float));
    foo(x, 0.5);
    return(0);
}
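And if you take the advice above and drop __constant__ entirely, the write-once/read-many pattern is just a device_vector initialized once and reused across calls. A sketch, using the original value-taking functor from the question:
// one-time initialization: a single host->device copy
thrust::device_vector<float> dev_x_vec = x;

// many subsequent reads, no further host transfers
thrust::transform(dev_x_vec.begin(), dev_x_vec.end(), dev_sol.begin(), struct_max(x0));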

Thrust reduce not working with non equal input/output types

I'm attempting to reduce the min and max of an array of values using Thrust and I seem to be stuck. Given an array of floats, what I would like is to reduce their min and max values in one pass, but using Thrust's reduce method I instead get the mother (or at least auntie) of all template compile errors.
My original code contains 5 lists of values spread over 2 float4 arrays that I want reduced, but I've boiled it down to this short example.
struct ReduceMinMax {
    __host__ __device__
    float2 operator()(float lhs, float rhs) {
        return make_float2(Min(lhs, rhs), Max(lhs, rhs));
    }
};

int main(int argc, char *argv[]) {
    thrust::device_vector<float> hat(4);
    hat[0] = 3;
    hat[1] = 5;
    hat[2] = 6;
    hat[3] = 1;
    ReduceMinMax binary_op_of_dooooom;
    thrust::reduce(hat.begin(), hat.end(), 4.0f, binary_op_of_dooooom);
}
If I split it into 2 reductions instead, it of course works. My question is then: is it possible to reduce both the min and max in one pass with Thrust, and how? If not, what is the most efficient way of achieving said reduction? Will a transform iterator help me (and if so, will the reduction then be a one-pass reduction)?
Some additional info:
I'm using Thrust 1.5 (as supplied by CUDA 4.2.7)
My actual code is using reduce_by_key, not just reduce.
I found transform_reduce while writing this question, but that one doesn't take keys into account.
As talonmies notes, your reduction does not compile because thrust::reduce expects the binary operator's argument types to match its result type, but ReduceMinMax's argument type is float, while its result type is float2.
thrust::minmax_element implements this operation directly, but if necessary you could instead implement your reduction with thrust::inner_product, which generalizes thrust::reduce:
#include <thrust/inner_product.h>
#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <cassert>

struct minmax_float
{
    __host__ __device__
    float2 operator()(float lhs, float rhs)
    {
        return make_float2(thrust::min(lhs, rhs), thrust::max(lhs, rhs));
    }
};

struct minmax_float2
{
    __host__ __device__
    float2 operator()(float2 lhs, float2 rhs)
    {
        return make_float2(thrust::min(lhs.x, rhs.x), thrust::max(lhs.y, rhs.y));
    }
};

float2 minmax1(const thrust::device_vector<float> &x)
{
    // the initial value (4,4) happens to lie between the true min and max of
    // the test data; in general it must not clip the result
    return thrust::inner_product(x.begin(), x.end(), x.begin(), make_float2(4.0f, 4.0f), minmax_float2(), minmax_float());
}

float2 minmax2(const thrust::device_vector<float> &x)
{
    using namespace thrust;
    pair<device_vector<float>::const_iterator, device_vector<float>::const_iterator> ptr_to_result;
    ptr_to_result = minmax_element(x.begin(), x.end());
    return make_float2(*ptr_to_result.first, *ptr_to_result.second);
}

int main()
{
    thrust::device_vector<float> hat(4);
    hat[0] = 3;
    hat[1] = 5;
    hat[2] = 6;
    hat[3] = 1;
    float2 result1 = minmax1(hat);
    float2 result2 = minmax2(hat);
    assert(result1.x == result2.x);
    assert(result1.y == result2.y);
}
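Since the question mentions transform_reduce: the same one-pass reduction can also be phrased that way, reusing minmax_float2 from above (a sketch; the to_float2 helper and the FLT_MAX initializer are additions, not from the original answer):
#include <thrust/transform_reduce.h>
#include <cfloat>

struct to_float2
{
    __host__ __device__ float2 operator()(float x) const { return make_float2(x, x); }
};

float2 minmax3(const thrust::device_vector<float> &x)
{
    // neutral initial value: FLT_MAX for the min slot, -FLT_MAX for the max slot
    return thrust::transform_reduce(x.begin(), x.end(), to_float2(),
                                    make_float2(FLT_MAX, -FLT_MAX), minmax_float2());
}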

Allocate constant memory

I'm trying to set my simulation params in constant memory but without luck (CUDA.NET).
The cudaMemcpyToSymbol function returns cudaErrorInvalidSymbol. The first parameter of cudaMemcpyToSymbol is a string... Is it the symbol name? Actually, I don't understand how it is resolved. Any help appreciated.
//init, load .cubin
float[] arr = new float[1];
arr[0] = 0.0f;
int size = Marshal.SizeOf(arr[0]) * arr.Length;
IntPtr ptr = Marshal.AllocHGlobal(size);
Marshal.Copy(arr, 0, ptr, arr.Length);
var error = CUDARuntime.cudaMemcpyToSymbol("param", ptr, 4, 0, cudaMemcpyKind.cudaMemcpyHostToDevice);
My .cu file contains
__constant__ float param;
Working solution:
cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "name.cubin"));
simParams = cuda.GetModuleGlobal("params");
float[] parameters = new float[N] {...};
cuda.CopyHostToDevice<float>(simParams, parameters);
Unfortunately the __constant__ must be in the same file scope as the memcpy to the symbol, and in your case your __constant__ is in a separate .cu file.
The simple way around this is to provide a wrapper function in your .cu file, for example:
__constant__ float param;

// Host function to set the constant
void setParam(float value)
{
    cudaMemcpyToSymbol("param", &value, sizeof(float), 0, cudaMemcpyHostToDevice);
}

// etc.
__global__ void ...
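Note that the string-based cudaMemcpyToSymbol overload was deprecated in later CUDA toolkits and eventually removed; on a modern toolkit the wrapper would pass the symbol itself. A sketch with minimal error checking added:
#include <assert.h>

void setParam(float value)
{
    cudaError_t err = cudaMemcpyToSymbol(param, &value, sizeof(float), 0, cudaMemcpyHostToDevice);
    assert(err == cudaSuccess);
}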
If this question is still current, you can use cuModuleGetGlobal and then cudaMemcpy, like this:
private bool setValueToSymbol(CUmodule module, string symbol, int value)
{
    CUdeviceptr devPtr = new CUdeviceptr();
    uint lenBytes = 0;
    CUResult result = CUDADriver.cuModuleGetGlobal(ref devPtr, ref lenBytes, module, symbol);
    if (result == CUResult.Success)
    {
        int[] src = new int[] { value };
        cudaError error = CUDARuntime.cudaMemcpy(devPtr, src, lenBytes, cudaMemcpyKind.cudaMemcpyHostToDevice);
        return error == cudaError.cudaSuccess;
    }
    else
    {
        return false;
    }
}
where CUmodule module = cuda.LoadModule("MyCode.cubin");
This code works with NVIDIA GPU Computing SDK 3.1 and CUDA.NET 3.0.
Constant memory has implicit local (translation-unit) scope linkage.
Make sure the declaration is in the same file where you use it; it sounds like you have two files.
You may also have to declare param as an array (or maybe not).