CUDA (from C++) Hyperbolic Trig Functions Calculate Different Results in Different Locations

I'm running into an odd issue with a simulation I wrote. I recently restructured my code to make things cleaner and more organized. Among other things, I moved (essentially copy-pasted) the CUDA function in question to another file. This function uses asinh to compute something, as well as sinh and cosh. What I've noticed is that before the move, the function produced expected results consistent with hand-calculated values (in Excel). After the move, the hyperbolic functions are fed the same inputs, yet the results are significantly different (up to 10% for asinh, 0.5% for sinh). This effectively breaks my simulation. I am confident in the rest of the function.
EDIT:
Upon further testing, I've found that hard-coding values for the angle (lambdaDegrees) in question - namely double x{ asinh(sqrt(3.0) * sin(lambdaDegrees * 3.1415927 / 180.0)) }; - produces the (good) expected results. Measuring the angle before and after the equation is executed shows the angle is unchanged, yet without hard-coding the value it produces the wrong results. The weirdest part is that simply adding another diagnostic printf call caused the function to produce yet another (wrong) result. I'm wondering if it has anything to do with the way I've set up a callback function on the GPU... maybe multiple threads using the function at the same time, leading to some (consistent) undefined behavior?
After a bit of screwing around with the code, I reproduced the error. Expected value of x within getSAtLambda (the printf statement) is 1.268... Result is 1.768... Let me know what you think.
main.cu
//CUDA includes
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_profiler_api.h"
typedef double(*callbackFcn)(double*, int, double, double, int);
//on GPU global variables
extern __device__ double* fieldConstArray_GPU;
extern __device__ int arraySize_GPU;
extern __device__ callbackFcn callback_GPU;
__host__ __device__ double BFieldAtS(double* consts, int arrayLength, double s, double simtime, int thdInd);
__host__ __device__ double gradBAtS(double* consts, int arrayLength, double s, double simtime, int thdInd);
__global__ void setupEnvironmentGPU(double* constArrayPtr);
__global__ void execute()
{
int thdInd{ blockIdx.x * blockDim.x + threadIdx.x };
callback_GPU(fieldConstArray_GPU, arraySize_GPU, (thdInd == 31487) ? 1233005.097 : ((115200 - thdInd) / 50000.0 * 6.371e6), 0.0, thdInd ); //3rd-argument values are examples
}
void setupEnvironment()
{// consts: [ B0, ILATDeg, L, L_norm, s_max ]
double fieldConstArray_h[]{ 3.12e-5, 72.0, 66717978.17, 10.47213595, 85670894.1 };
double* fieldConstants_d{ nullptr };
cudaMalloc((void **)&fieldConstants_d, 5 * sizeof(double));
cudaMemcpy(fieldConstants_d, fieldConstArray_h, 5 * sizeof(double), cudaMemcpyHostToDevice);
setupEnvironmentGPU <<< 1, 1 >>> (fieldConstants_d);
}
int main()
{
setupEnvironment();
int loops{ 0 };
while (loops < 3)
{
execute <<< 115200 / 256, 256 >>> ();
cudaDeviceSynchronize();
loops++;
}
return 0;
}
otherfunctions.cu
#include <cmath>
#include <iostream>
//CUDA includes
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_profiler_api.h"
typedef double(*callbackFcn)(double*, int, double, double, int);
__device__ double* fieldConstArray_GPU{ nullptr };
__device__ int arraySize_GPU{ 7 };
__device__ callbackFcn callback_GPU{ nullptr };
__host__ __device__ double getSAtLambda(double* consts, int arrayLength, double lambdaDegrees, double simtime, int thdInd)
{//returns s in units of L
double x{ asinh(sqrt(3.0) * sin(lambdaDegrees * 3.1415927 / 180.0)) };
if (simtime == 0.0 && thdInd == 31487) { printf("\n\ngetSAtLambda: %f, %f\n\n", lambdaDegrees, x); }
return (0.5 * consts[2] / sqrt(3.0)) * (x + sinh(x) * cosh(x));
}
__host__ __device__ double getLambdaAtS(double* consts, int arrayLength, double s, double simtime, int thdInd)
{// consts: [ B0, ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
double lambda_tmp{ (-consts[1] / consts[4]) * s + consts[1] }; //-ILAT / s_max * s + ILAT
double s_tmp{ consts[4] - getSAtLambda(consts, arrayLength, lambda_tmp, simtime, thdInd) };
double dlambda{ 1.0 };
bool over{ 0 };
while (abs((s_tmp - s) / s) > 1e-4) //errorTolerance
{
while (1)
{
over = (s_tmp >= s);
if (over)
{
lambda_tmp += dlambda;
s_tmp = consts[4] - getSAtLambda(consts, arrayLength, lambda_tmp, simtime, 0);
if (s_tmp < s)
break;
}
else
{
lambda_tmp -= dlambda;
s_tmp = consts[4] - getSAtLambda(consts, arrayLength, lambda_tmp, simtime, 0);
if (s_tmp >= s)
break;
}
}
if (dlambda < 1e-4 / 100.0) //errorTolerance
break;
dlambda /= 5.0; //through trial and error, this reduces the number of calculations usually (compared with 2, 2.5, 3, 4, 10)
}
return lambda_tmp;
}
__host__ __device__ double BFieldAtS(double* consts, int arrayLength, double s, double simtime, int thdInd)
{// consts: [ B0, ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
double lambda_deg{ getLambdaAtS(consts, arrayLength, s, simtime, thdInd) };
double lambda_rad{ lambda_deg * 3.1415927 / 180.0 };
double rnorm{ consts[3] * pow(cos(lambda_rad), 2) };
return -consts[0] / pow(rnorm, 3) * sqrt(1.0 + 3 * pow(sin(lambda_rad), 2));
}
__host__ __device__ double gradBAtS(double* consts, int arrayLength, double s, double simtime, int thdInd)
{
return (BFieldAtS(consts, arrayLength, s + consts[5], simtime, thdInd) - BFieldAtS(consts, arrayLength, s - consts[5], simtime, thdInd)) / (2 * consts[5]);
}
__global__ void setupEnvironmentGPU(double* constArrayPtr)
{
callback_GPU = gradBAtS; //sets pointer to callback function
arraySize_GPU = 7;
fieldConstArray_GPU = constArrayPtr;
}

A summary of my findings:
On CUDA 8.0, correct results are produced when the code above:
- is compiled as debug instead of release (except for -O1)
- uses a trig identity of asinh instead of the actual asinh function (see the sketch below)
- has the argument for asinh hardcoded
- is built with -O1 instead of -O2, for both release and debug
- (paradoxically) calls the function getSAtLambda directly instead of through a function pointer
Incorrect results are produced for asinh(x) when the code is:
- compiled as release with -O2, with a non-hardcoded value, through a function pointer
Updating to CUDA 9.1 fixed the issue.
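For reference, the "trig identity" workaround mentioned in the findings is just the logarithmic form of asinh. A minimal sketch (the helper name asinh_log is mine, not from the original code):

// asinh(x) expressed through its logarithmic identity log(x + sqrt(x*x + 1)),
// as described in the findings above
__host__ __device__ double asinh_log(double arg)
{
    return log(arg + sqrt(arg * arg + 1.0));
}

// used in getSAtLambda in place of the direct asinh() call:
// double x{ asinh_log(sqrt(3.0) * sin(lambdaDegrees * 3.1415927 / 180.0)) };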

Related

Mathematical function in device to improve results (CUDA)

Is it worthwhile to execute the mathematical function pow() on the device (GPU) in order to improve the execution time of the code?
I found the function __powf() in the CUDA Toolkit documentation:
http://docs.nvidia.com/cuda/cuda-c-programming-guide/#intrinsic-functions
So I replaced the pow() function calls with __powf() and used the -use_fast_math compiler option, but I got "nan" results instead of double-precision numbers. What should I change in my code to achieve the above?
Libraries of my code.cu:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/time.h> // for gettimeofday()
#include <time.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cublas.h>
#include "magma.h"
Part of my code.cu:
void function(double *cx, double *cy, double *R, int var, double pts[][2], int e) {
magma_trans_t my_trans = MagmaNoTrans;
magma_int_t info;
magma_int_t M, C;
magma_int_t ldda, lddb;
C = 3;
M = var;
int i;
double Q[M];
double a[3];
int ret;
double A[3][M];
double pts_x[M], pts_y[M];
double *dev_pts_x, *dev_pts_y, *devA, *devB, *pWork, lWorkQuery[1];
/* Allocate device memory for the matrix (column-major) */
ldda = ((M + 31) / 32) * 32;
lddb = ldda;
cudaMalloc((void **)&devA, (ldda * C) * sizeof(double));
cudaMalloc((void **)&devB, (M) * sizeof(double));
for (i = 0; i < M; i++) {
pts_x[i] = pts[i][0];
pts_y[i] = pts[i][1];
A[0][i] = pts[i][0];
A[1][i] = pts[i][1];
A[2][i] = 1.0;
}
cudaMalloc((void **)&dev_pts_x, (M) * sizeof(double));
cudaMemcpy(dev_pts_x, pts_x, M * sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc((void **)&dev_pts_y, (M) * sizeof(double));
cudaMemcpy(dev_pts_y, pts_y, M * sizeof(double), cudaMemcpyHostToDevice);
// Kernel invocation
dim3 threadsPerBlock(1, 1);
dim3 numBlocks(M / threadsPerBlock.x, M / threadsPerBlock.y);
call <<< numBlocks, threadsPerBlock >>> (var, dev_pts_x, dev_pts_y, devB);
cublasSetMatrix(M, C, sizeof(double), A, M, devA, ldda);
// cublasSetMatrix(M, 1, sizeof(double), B, M, devB, M);
/* Resolve the LLSP using MAGMA */
ret = magma_dgels_gpu(my_trans, M, C, 1 , devA, ldda, devB, M, lWorkQuery, -1, &info);
int lwork = (int)lWorkQuery[0];
//printf("Optimal work space %d\n", lwork);
pWork = (double*)malloc((lwork) * sizeof(double));
ret = magma_dgels_gpu(my_trans, M, C, 1, devA, ldda, devB, M, pWork, lwork, &info);
magma_dgetmatrix(M, 1, devB, lddb, Q, M);
a[2] = Q[2];
*cx = Q[0];
*cy = Q[1];
*R = sqrt((pow(*cx, 2)+pow(*cy, 2)) - a[2]);
}
__global__ void call(int v, double *pts_x, double *pts_y, double *B) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < v) {
B[i] = -(pow(pts_x[i], 2.0) + pow(pts_y[i], 2.0));
}
}
You use pow to square numbers, which is very inefficient. Use multiplication with an inline function:
static inline double square(double x) { return x * x; }
You might be getting NaN values because the number passed to __powf is negative. That is not a problem for pow, but the CUDA fast-math intrinsic __powf (which -use_fast_math also substitutes for powf) is implemented via exp2f/log2f and does not support negative bases; it is also single precision only.
Also note that computing the Euclidean distance between two points can be done more directly with the hypot() function:
double hypot(double x, double y);
Finally, as Weather Vane underlined, you might not need to take the square root at all if all you are interested in is comparing with another distance computed the same way.
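As a concrete example, the question's call() kernel only squares values, so under the advice above it could be rewritten roughly as follows (a sketch, not the original code), which also sidesteps the __powf/NaN issue:

// squaring by multiplication instead of pow()/__powf()
__device__ __forceinline__ double square(double x) { return x * x; }

__global__ void call(int v, double *pts_x, double *pts_y, double *B) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < v) {
        B[i] = -(square(pts_x[i]) + square(pts_y[i]));
    }
}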

is there a way to do "saypx" in cuBLAS?

cublasSaxpy computes y' = a * x + y, where x and y are vectors and a is scalar.
It turns out I need to compute y' = a * y + x instead. I'm not seeing how to twist the cuBLAS library into doing that.
(Of course, I could compute y' = a * y, then y' = y' + x, but y' is read too often in that case. And I could write my own CUDA code to do it, but then it's likely not anywhere near as fast as the cuBLAS code. I'm just surprised there's no apparent way to do "saypx" directly.)
[Added] There are functions similar to "saxpby" in Intel's version of cblas, which would do what I need. But oddly enough, that's not in cuBLAS.
[Added #2] It looks like I can use the cudnnAddTensor function, with some aliasing of descriptors (I have a FilterDescriptor that points to the tensor, which AddTensor won't accept, but I should be able to alias a TensorDescriptor to the same memory and shape.)
There isn't a way I am aware of to do what you are asking in CUBLAS, nor in standard BLAS. What you have found in MKL is an extension added by Intel, but I don't recall seeing something similar in other host and accelerator BLAS implementations.
The good news is that your assertion that "I could write my own CUDA code to do it, but then it's likely not anywhere near as fast as the cuBLAS code" is untrue, at least for an operation as trivial as saxpy. Even a naïve implementation of saxpy will get very close to CUBLAS because there really aren't that many ways to read two arrays, perform an FMAD and write back the result. As long as you get memory coalescing correct, it is pretty simple to write performant code. For example:
#include <vector>
#include <algorithm>
#include <cassert>
#include <iostream>
#include <cmath>
#include "cublas_v2.h"
typedef enum
{
AXPY = 0,
AXPBY = 1
} saxpy_op_t;
__device__ __host__ __inline__
float axpby_op(float y, float x, float a)
{
return a * y + x;
}
__device__ __host__ __inline__
float axpy_op(float y, float x, float a)
{
return y + a * x;
}
template<typename T>
class pitched_accessor
{
T * p;
size_t pitch;
public:
__host__ __device__
pitched_accessor(T *p_, size_t pitch_) : p(p_), pitch(pitch_) {};
__host__ __device__
T& operator[](size_t idx) { return p[pitch*idx]; };
__host__ __device__
const T& operator[](size_t idx) const { return p[pitch*idx]; };
};
template<saxpy_op_t op>
__global__
void saxpy_kernel(pitched_accessor<float> y, pitched_accessor<float> x,
const float a, const unsigned int N1)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int stride = gridDim.x * blockDim.x;
#pragma unroll 8
for(; idx < N1; idx += stride) {
switch (op) {
case AXPY:
y[idx] = axpy_op(y[idx], x[idx], a);
break;
case AXPBY:
y[idx] = axpby_op(y[idx], x[idx], a);
break;
}
}
}
__host__ void saxby(const unsigned int N, const float a,
float *x, int xinc, float *y, int yinc)
{
int gridsize, blocksize;
cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPBY>);
saxpy_kernel<AXPBY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc),
pitched_accessor<float>(x, xinc), a, N);
}
__host__ void saxpy(const unsigned int N, const float a,
float *x, int xinc, float *y, int yinc)
{
int gridsize, blocksize;
cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPY>);
saxpy_kernel<AXPY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc),
pitched_accessor<float>(x, xinc), a, N);
}
void check_result(std::vector<float> &yhat, float result, float tolerance=1e-5f)
{
auto it = yhat.begin();
for(; it != yhat.end(); ++it) {
float err = std::fabs(*it - result);
assert( err < tolerance );
}
}
int main()
{
const int N = 1<<22;
std::vector<float> x_h(N);
std::vector<float> y_h(N);
const float a = 2.f, y0 = 1234.f, x0 = 532.f;
std::fill(y_h.begin(), y_h.end(), y0);
std::fill(x_h.begin(), x_h.end(), x0);
float *x_d, *y_d;
size_t sz = sizeof(float) * size_t(N);
cudaMalloc((void **)&x_d, sz);
cudaMalloc((void **)&y_d, sz);
cudaMemcpy(x_d, &x_h[0], sz, cudaMemcpyHostToDevice);
{
cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
saxby(N, a, x_d, 1, y_d, 1);
std::vector<float> yhat(N);
cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
check_result(yhat, axpby_op(y0, x0, a));
}
{
cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
saxpy(N, a, x_d, 1, y_d, 1);
std::vector<float> yhat(N);
cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
check_result(yhat, axpy_op(y0, x0, a));
}
{
cublasHandle_t handle;
cublasCreate(&handle);
cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
cublasSaxpy(handle, N, &a, x_d, 1, y_d, 1);
std::vector<float> yhat(N);
cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
check_result(yhat, axpy_op(y0, x0, a));
cublasDestroy(handle);
}
return int(cudaDeviceReset());
}
This demonstrates that a very simple axpy kernel can be easily adapted to perform both the standard operation and the version you want, and run within 10% of the runtime of CUBLAS on the compute 5.2 device I tested it on:
$ nvcc -std=c++11 -arch=sm_52 -Xptxas="-v" -o saxby saxby.cu -lcublas
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
$ nvprof ./saxby
==26806== NVPROF is profiling process 26806, command: ./saxby
==26806== Profiling application: ./saxby
==26806== Profiling result:
Time(%) Time Calls Avg Min Max Name
54.06% 11.190ms 5 2.2381ms 960ns 2.9094ms [CUDA memcpy HtoD]
40.89% 8.4641ms 3 2.8214ms 2.8039ms 2.8310ms [CUDA memcpy DtoH]
1.73% 357.59us 1 357.59us 357.59us 357.59us void saxpy_kernel<saxpy_op_t=1>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
1.72% 355.15us 1 355.15us 355.15us 355.15us void saxpy_kernel<saxpy_op_t=0>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
1.60% 332.21us 1 332.21us 332.21us 332.21us void axpy_kernel_val<float, int=0>(cublasAxpyParamsVal<float>)

Simpson's method to integrate real valued functions with CUDA

I'm trying to code integration by Simpson's method in CUDA.
This is the formula for (composite) Simpson's rule:
∫_a^b f(x) dx ≈ (h/3) * [ f(x_0) + 4 f(x_1) + 2 f(x_2) + 4 f(x_3) + ... + 2 f(x_{n-2}) + 4 f(x_{n-1}) + f(x_n) ]
where x_k = a + k*h and h = (b - a)/n.
Here's my code
__device__ void initThreadBounds(int *n_start, int *n_end, int n,
int totalBlocks, int blockWidth)
{
int threadId = blockWidth * blockIdx.x + threadIdx.x;
int nextThreadId = threadId + 1;
int threads = blockWidth * totalBlocks;
*n_start = (threadId * n)/ threads;
*n_end = (nextThreadId * n)/ threads;
}
__device__ float reg_func (float x)
{
return x;
}
typedef float (*p_func) (float);
__device__ p_func integrale_f = reg_func;
__device__ void integralSimpsonMethod(int totalBlocks, int totalThreads,
double a, double b, int n, float p_function(float), float* result)
{
*result = 0;
float h = (b - a)/n;
//*result = p_function(a)+p_function(a + h * n);
//parallel
int idx_start;
int idx_end;
initThreadBounds(&idx_start, &idx_end, n-1, totalBlocks, totalThreads);
//parallel_ends
for (int i = idx_start; i < idx_end; i+=2) {
*result += ( p_function(a + h*(i-1)) +
4 * p_function(a + h*(i)) +
p_function(a + h*(i+1)) ) * h/3;
}
}
__global__ void integralSimpson(int totalBlocks, int totalThreads, float* result)
{
float res = 0;
integralSimpsonMethod(totalBlocks, totalThreads, 0, 10, 1000, integrale_f, &res);
result[(blockIdx.x*totalThreads + threadIdx.x)] = res;
//printf ("Simpson method\n");
}
__host__ void inttest()
{
const int blocksNum = 32;
const int threadNum = 32;
float *device_resultf;
float host_resultf[threadNum*blocksNum]={0};
cudaMalloc((void**) &device_resultf, sizeof(float)*threadNum*blocksNum);
integralSimpson<<<blocksNum, threadNum>>>(blocksNum, threadNum, device_resultf);
cudaThreadSynchronize();
cudaMemcpy(host_resultf, device_resultf, sizeof(float) *threadNum*blocksNum,
cudaMemcpyDeviceToHost);
float sum = 0;
for (int i = 0; i != blocksNum*threadNum; ++i) {
sum += host_resultf[i];
// printf ("result in %i cell = %f \n", i, host_resultf[i]);
}
printf ("sum = %f \n", sum);
cudaFree(device_resultf);
}
int main(int argc, char* argv[])
{
inttest();
int i;
scanf ("%d",&i);
}
The problem is that it works incorrectly when n is lower than 100000. For an integral from 0 to 10, the result is ~99, but when n = 100000 or larger it works fine and the result is ~50.
What's wrong, guys?
The basic problem here is that you don't understand your own algorithm.
Your integralSimpsonMethod() function is designed such that each thread samples at least 3 quadrature points per sub-interval in the integration domain. Therefore, if you choose n so that it is less than four times the number of threads in the kernel call, it is inevitable that the per-thread sub-intervals will overlap and the resulting integral will be incorrect. You need to make sure that the code checks and scales the thread count or n so that they don't produce overlap when the integral is computed.
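A minimal host-side sketch of such a check, reusing the kernel's launch parameters (the helper name is my own, not from the original code):

__host__ bool simpsonLaunchIsSafe(int totalBlocks, int totalThreads, int n)
{
    // following the bound above: n must be at least four times the total
    // thread count, otherwise the per-thread sub-intervals overlap
    int threads = totalBlocks * totalThreads;
    return n >= 4 * threads;
}

inttest() could call something like this before launching integralSimpson, and either reduce blocksNum/threadNum or increase n when it returns false.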
If you are doing this for anything other than self-edification, then I recommend you look up the composite version of Simpson's rule. This is much better suited to parallel implementation and will be considerably more performant if implemented correctly.
I would propose an approach to Simpson's integration by using CUDA Thrust. You basically need five steps:
Generate the Simpson's quadrature weights;
Generate the function sampling points;
Generate the function values;
Calculate the elementwise product between the quadrature weights and the function values;
Sum the above products.
Step #1 requires creating an array with elements repeated many times, namely, 1 4 2 4 2 4 ... 1 for the Simpson's case. This can be accomplished by borrowing Robert Crovella's approach in cuda thrust library repeat vector multiple times.
Step #2 can be accomplished by using counting_iterators and borrowing talonmies' approach in Purpose and usage of counting_iterators in CUDA Thrust library.
Step #3 is an application of thrust::transform.
Steps #4 and #5 can be accomplished together by thrust::inner_product.
This approach can also be used when other quadrature integration rules are of interest.
Here is the code
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <thrust/copy.h>
#include <ostream>
#define STRIDE 2
#define N 100
#define pi_f 3.14159265358979f // Greek pi in single precision
struct sin_functor
{
__host__ __device__
float operator()(float x) const
{
return sin(2.f*pi_f*x);
}
};
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
int main(void)
{
// --- Generate the integration coefficients
thrust::host_vector<float> h_coefficients(STRIDE);
h_coefficients[0] = 4.f;
h_coefficients[1] = 2.f;
thrust::device_vector<float> d_coefficients(N);
typedef thrust::device_vector<float>::iterator Iterator;
strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-2, STRIDE);
strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);
thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);
d_coefficients[0] = 1.f;
d_coefficients[N-1] = 1.f;
// print the generated d_coefficients
std::cout << "d_coefficients: ";
thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " ")); std::cout << std::endl;
// --- Generate sampling points
float a = 0.f;
float b = .5f;
float Dx = (b-a)/(float)(N-1);
thrust::device_vector<float> d_x(N);
thrust::transform(thrust::make_counting_iterator(a/Dx),
thrust::make_counting_iterator((b+1.f)/Dx),
thrust::make_constant_iterator(Dx),
d_x.begin(),
thrust::multiplies<float>());
// --- Calculate function values
thrust::device_vector<float> d_y(N);
thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());
// --- Calculate integral
float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);
printf("The integral is = %f\n", integral);
getchar();
return 0;
}

CUDA function pointers

I was trying to make something like this (actually I need to write some integration functions) in CUDA
#include <iostream>
using namespace std;
float f1(float x) {
return x * x;
}
float f2(float x) {
return x;
}
void tabulate(float p_f(float)) {
for (int i = 0; i != 10; ++i) {
std::cout << p_f(i) << ' ';
}
std::cout << std::endl;
}
int main() {
tabulate(f1);
tabulate(f2);
return 0;
}
output:
0 1 4 9 16 25 36 49 64 81
0 1 2 3 4 5 6 7 8 9
I tried the following but only got the error
Error: Function pointers and function template parameters are not supported in sm_1x.
float f1(float x) {
return x;
}
__global__ void tabulate(float lower, float upper, float p_function(float), float* result) {
for (lower; lower < upper; lower++) {
*result = *result + p_function(lower);
}
}
int main() {
float res;
float* dev_res;
cudaMalloc( (void**)&dev_res, sizeof(float) ) ;
tabulate<<<1,1>>>(0.0, 5.0, f1, dev_res);
cudaMemcpy(&res, dev_res, sizeof(float), cudaMemcpyDeviceToHost);
printf("%f\n", res);
/************************************************************************/
scanf("%s");
return 0;
}
To get rid of your compile error, you'll have to use -gencode arch=compute_20,code=sm_20 as a compiler argument when compiling your code. But then you'll likely have some runtime problems:
Taken from the CUDA Programming Guide http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#functions
Function pointers to __global__ functions are supported in host code, but not in device code.
Function pointers to __device__ functions are only supported in device code compiled for devices of compute capability 2.x and higher.
It is not allowed to take the address of a __device__ function in host code.
so you can have something like this (adapted from the "FunctionPointers" sample):
//your function pointer type - returns unsigned char, takes parameters of type unsigned char and float
typedef unsigned char(*pointFunction_t)(unsigned char, float);
//some device function to be pointed to
__device__ unsigned char
Threshold(unsigned char in, float thresh)
{
...
}
//pComputeThreshold is a device-side function pointer to your __device__ function
__device__ pointFunction_t pComputeThreshold = Threshold;
//the host-side function pointer to your __device__ function
pointFunction_t h_pointFunction;
//in host code: copy the function pointers to their host equivalent
cudaMemcpyFromSymbol(&h_pointFunction, pComputeThreshold, sizeof(pointFunction_t));
You can then pass the h_pointFunction as a parameter to your kernel, which can use it to call your __device__ function.
//your kernel taking your __device__ function pointer as a parameter
__global__ void kernel(pointFunction_t pPointOperation)
{
unsigned char tmp;
...
tmp = (*pPointOperation)(tmp, 150.0);
...
}
//invoke the kernel in host code, passing in your host-side __device__ function pointer
kernel<<<...>>>(h_pointFunction);
Hopefully that made some sense. In all, it looks like you would have to change your f1 function to be a __device__ function and follow a similar procedure (the typedefs aren't necessary, but they do make the code nicer) to get a valid function pointer on the host side to pass to your kernel. I'd also advise giving the FunctionPointers CUDA sample a look.
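Applied to the question's f1/tabulate example, that procedure might look roughly like this (a sketch; the names dev_f1_ptr and h_f1_ptr are mine):

typedef float (*func_t)(float);

// f1 becomes a __device__ function so its address is valid on the GPU
__device__ float f1(float x) { return x; }

// a __device__ variable holding a pointer to f1, initialized in device code
__device__ func_t dev_f1_ptr = f1;

__global__ void tabulate(float lower, float upper, func_t p_function, float* result)
{
    for (; lower < upper; lower++) {
        *result += p_function(lower);
    }
}

// host code: fetch the device-side pointer value, then pass it to the kernel
// func_t h_f1_ptr;
// cudaMemcpyFromSymbol(&h_f1_ptr, dev_f1_ptr, sizeof(func_t));
// tabulate<<<1, 1>>>(0.0f, 5.0f, h_f1_ptr, dev_res);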
Even though you may be able to compile this code (see @Robert Crovella's answer), it will not work. You cannot pass function pointers from host code, as the host compiler has no way of figuring out the function's device address.
Here is a simple class I wrote, based on this question, for function pointers that are callable from within a kernel:
template <typename T>
struct cudaCallableFunctionPointer
{
public:
cudaCallableFunctionPointer(T* f_)
{
T* host_ptr = (T*)malloc(sizeof(T));
cudaMalloc((void**)&ptr, sizeof(T));
cudaMemcpyFromSymbol(host_ptr, *f_, sizeof(T));
cudaMemcpy(ptr, host_ptr, sizeof(T), cudaMemcpyHostToDevice);
free(host_ptr);
}
~cudaCallableFunctionPointer()
{
cudaFree(ptr);
}
T* ptr;
};
you could use it like this:
__device__ double func1(double x)
{
return x + 1.0f;
}
typedef double (*func)(double x);
__device__ func f_ = func1;
__global__ void test_kernel(func* f)
{
double x = (*f)(2.0);
printf("%g\n", x);
}
int main()
{
cudaCallableFunctionPointer<func> f(&f_);
test_kernel << < 1, 1 >> > (f.ptr);
}
output:
3

CUDA random number generating

I would like to generate random numbers in my __device__ function and keep them in my int Board[500]. I found some examples, but they use a type named curandState. I only need a function like rand() in C++.
Here is my code: I have an N3[40000] array in device memory, I generate some random numbers in my kernel running on one thread (I mean this: "kernel <<<1,1>>> ..."), then I copy it to N2[40000] on the CPU and print it. So here is the code:
#include <iostream>
#include <Cuda.h>
#include<curand.h>
#include<curand_kernel.h>
int n = 200;
using namespace std;
__device__ float generate( curandState* globalState, int ind )
{
//int ind = threadIdx.x;
curandState localState = globalState[ind];
float RANDOM = curand_uniform( &localState );
globalState[ind] = localState;
return RANDOM;
}
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
int id = threadIdx.x;
curand_init ( seed, id, 0, &state[id] );
}
__global__ void kernel(float* N, curandState* globalState, int n)
{
// generate random numbers
for(int i=0;i<40000;i++)
{
int k = generate(globalState, i) * 100000;
while(k > n*n-1)
{
k-=(n*n-1);
}
N[i] = k;
}
}
int main()
{
int N=40000;
curandState* devStates;
cudaMalloc ( &devStates, N*sizeof( curandState ) );
// setup seeds
setup_kernel <<< 1, N >>> ( devStates,unsigned(time(NULL)) );
float N2[40000];
float* N3;
cudaMalloc((void**) &N3, sizeof(float)*N);
kernel<<<1,1>>> (N3, devStates, n);
cudaMemcpy(N2, N3, sizeof(float)*N, cudaMemcpyDeviceToHost);
for(int i=0;i<N;i++)
{
cout<<N2[i]<<endl;
}
return 0;
}
You may use the cuRAND host API to generate random numbers directly in device memory and then run your kernel, without even having to copy those values to the host.
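A minimal sketch of that host-API approach (the wrapper name fillDeviceRandom is illustrative, not from the question):

#include <curand.h>

// fills n floats in device memory with uniform random numbers in (0, 1],
// so no curandState setup kernel and no host round trip are needed
void fillDeviceRandom(float* d_data, size_t n, unsigned long long seed)
{
    curandGenerator_t gen;
    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen, seed);
    curandGenerateUniform(gen, d_data, n);
    curandDestroyGenerator(gen);
}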