Using both CUB and Thrust for parallel sum scan - cuda

I am trying to do a parallel sum scan on a test vector, using both the Thrust and CUB libraries:
struct CustomSum
{
    template <typename T>
    CUB_RUNTIME_FUNCTION __forceinline__
    T operator()(const T &a, const T &b) const {
        return a + b;
    }
};

// 2d array stored in row-major order [(0,0), (0,1), (0,2), ... ]
thrust::host_vector<int> hVec_I1(SIZE_IMG, 1);
thrust::host_vector<int> hVec_I2(SIZE_IMG, 1);
thrust::host_vector<int> h_out(SIZE_IMG, 1);

CustomSum sum_op;

// Initialize vector with synthetic image:
initialize(N, N, hVec_I1, hVec_I2);

// Compute Integral Image M1 and M2
thrust::device_vector<int> dVec_M1 = hVec_I1;
thrust::device_vector<int> dVec_M2 = hVec_I2;
thrust::device_vector<int> d_o = h_out;

//thrust::device_ptr<double> d_in = dVec_M1.data();
//thrust::device_ptr<double> d_out1 = d_out.data();
int* d_in  = thrust::raw_pointer_cast(&dVec_M1[0]);
int* d_out = thrust::raw_pointer_cast(&d_o[0]);
//d_in = thrust::raw_pointer_cast(dVec_M2.data());
//thrust::device_vector<int> d_out;
//int *d_out = thrust::raw_pointer_cast(dVec_M1.data());

void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;

// Determine temporary device storage requirements for the inclusive prefix scan
cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, sum_op, SIZE_IMG);
// Allocate temporary storage for inclusive prefix scan
cudaMalloc(&d_temp_storage, temp_storage_bytes);
// Run inclusive prefix sum-scan
cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, sum_op, SIZE_IMG);
The error I am getting is
Error 43 error : calling a __host__ function("CustomSum::operator ()<int> ") from a __device__ function("cub::TilePrefixCallbackOp<int, CustomSum, cub::ScanTileState<int, (bool)1> > ::operator ()") is not allowed c:\users\asu_cuda_laptop\documents\visual studio 2013\projects\stats_kernel\cub\agent\single_pass_scan_operators.cuh 747 1 stats_kernel
I could not interpret the error correctly and I am sure there is a problem with the way I am handling raw pointers. Any help is appreciated.
Related link: How to use CUB and Thrust in one CUDA code

Try defining CustomSum::operator() as a __device__ function. More on __host__ vs __device__ functions in the CUDA C programming guide.
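For example, a minimal sketch of the fix, explicitly marking the call operator as callable from both host and device code (the rest of the code in the question stays the same):

struct CustomSum
{
    template <typename T>
    __host__ __device__ __forceinline__
    T operator()(const T &a, const T &b) const {
        return a + b;
    }
};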


creating 16-bit input to cufftXtMakePlanMany and workSize for 1 GPU

I need to compute an FFT on unsigned 8-bit integer data. Previously I was using cufftPlanMany, with cufftReal input and cufftComplex output, and I cast the data before and after the FFT to convert from unsigned 8-bit to cufftReal and then from cufftComplex to signed 8-bit.
It came to my attention that cuFFT has a nice option to run the FFT on half-precision data, which I hope will improve the running time. According to the documentation it currently doesn't support all of cudaDataType (it would be wonderful if it did in the future), but at least I can run it with 16-bit float (half precision) using the following signature:
cufftResult
cufftXtMakePlanMany(cufftHandle plan, int rank, long long int *n, long long int *inembed,
                    long long int istride, long long int idist, cudaDataType inputtype,
                    long long int *onembed, long long int ostride, long long int odist,
                    cudaDataType outputtype, long long int batch, size_t *workSize,
                    cudaDataType executiontype);
with data types for input, output and execution respectively as CUDA_R_16F, CUDA_C_16F and CUDA_C_16F. It would be ideal if I could feed this cuFFT with my U8 data directly; is there any way of doing so? Otherwise, if the first cast from U8 to cufftReal is necessary, how can I convert my data from cufftReal to CUDA_R_16F, and then back from CUDA_C_16F? Is CUDA smart enough to cast the input from float to half precision, given that cufftExecR2C would ultimately be the same and there is no other function to call for half precision?
The other question is about workSize, which is designed for multi-GPU cases. Any idea how this size has to be calculated? (I have just 1 GPU.) Am I responsible for managing that buffer?
TL;DR: I can see two possible approaches here, one using a half-precision transform and one using a single-precision transform (perhaps with CUFFT callbacks). The choice between them may depend on a number of factors, such as the size of your transform, your control over the scope of the input data, and the GPU you are running on.
I'm not going to try to address the processing of the output data that you indicate here:
then from cufftComplex to signed 8bit.
since I don't know how to do that without more information. However, the processing of the input data in each case should illustrate how you could process the output data.
Using half-precision transforms
A few things to note here: you cannot (currently) use callbacks with half-precision transforms, and half-precision transforms can be more sensitive to input data characteristics (e.g. DC offset, transform size, etc.) than single- or double-precision transforms. Also, half-precision transforms for the most part require a Pascal or newer GPU (ignoring the Jetson family).
Because half-precision transforms don't support callbacks, we'll use "ordinary" host code to process the input data; you could also do this processing on the device prior to the transform, and the provided code outlines both possibilities. My "preprocessing" here is mostly just designed to prevent the 16-bit transform from overflowing. If you play around with this code you'll quickly see what an overflow looks like (inf and/or nan in the output).
$ cat t1961.cu
#include <cufft.h>
#include <stdio.h>
#include <stdlib.h>
#include <cufftXt.h>
#include <cuda_fp16.h>
#include <assert.h>
#include <iostream>
typedef half2 ctype;
typedef half rtype;
typedef unsigned char dtype;
long long sig_size = 1<<18;
const int amplitude = 127;
const float ramplitude = 1/(float)(4*amplitude);
__host__ __device__ half convert(int val){
return __float2half_rn((val - amplitude)*ramplitude);
}
__global__ void dev_convert(rtype *out, dtype *in, int sz){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < sz)
out[idx] = convert(in[idx]);
}
int main(){
//put 4x sine waves into a U8 array
dtype *my_data = (dtype *)malloc(sig_size*sizeof(dtype));
for (int i = 0; i < sig_size; i++) my_data[i] = amplitude*(sin((i*8*3.141592654f)/sig_size)+1.0);
rtype *d_idata;
ctype *d_odata;
cudaMalloc(&d_idata, sizeof(rtype)*sig_size);
#ifdef USE_HOST
rtype *h_idata = (rtype *)malloc(sig_size*sizeof(rtype));
// convert to 16 bit float non-offset suitable for cufft
for (int i = 0; i < sig_size; i++) h_idata[i] = convert(my_data[i]);
cudaMemcpy(d_idata, h_idata, sig_size*sizeof(rtype), cudaMemcpyHostToDevice);
#else
const int bs = 256;
dtype *d_mydata;
cudaMalloc(&d_mydata, sig_size*sizeof(dtype));
cudaMemcpy(d_mydata, my_data, sig_size*sizeof(dtype), cudaMemcpyHostToDevice);
dev_convert<<<(sig_size+bs-1)/bs, bs>>>(d_idata, d_mydata, sig_size);
#endif
cudaMalloc(&d_odata, sizeof(ctype)*(sig_size/2+1));
cufftHandle plan;
cufftResult r;
r = cufftCreate(&plan);
assert(r == CUFFT_SUCCESS);
size_t ws = 0;
r = cufftXtMakePlanMany(plan, 1, &sig_size, NULL, 1, 1, CUDA_R_16F, NULL, 1, 1, CUDA_C_16F, 1, &ws, CUDA_C_16F);
assert(r == CUFFT_SUCCESS);
r = cufftXtExec(plan, d_idata, d_odata, CUFFT_FORWARD); // warm-up
assert(r == CUFFT_SUCCESS);
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
cudaEventRecord(start);
r = cufftXtExec(plan, d_idata, d_odata, CUFFT_FORWARD);
assert(r == CUFFT_SUCCESS);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float et;
cudaEventElapsedTime(&et, start, stop);
printf("forward FFT time for %ld samples: %fms\n", sig_size, et);
ctype *h_odata = (ctype *)malloc((sig_size/2+1)*sizeof(ctype));
cudaMemcpy(h_odata, d_odata, (sig_size/2+1)*sizeof(ctype), cudaMemcpyDeviceToHost);
for (int i = 0; i < 8; i++)
std::cout << __half2float(h_odata[i].x) << " + " << __half2float(h_odata[i].y) << "i" << std::endl;
return 0;
}
$ nvcc -o t1961 t1961.cu -lcufft
$ ./t1961
forward FFT time for 262144 samples: 0.027520ms
-258 + 0i
0.00349998 + 0.00127506i
-0.000146866 + -0.000833511i
0.00140095 + -0.00501251i
-1.57031 + -32752i
-0.00198174 + 0.00856018i
0.00474548 + 0.00359917i
-0.00226784 + 0.00987244i
$
Using a single precision transform with a load callback
This in my view has a few benefits. It is not as subject to the overflow phenomenon as the half precision transforms are, and the (load) callback routine allows us to still operate on U8 input data.
$ cat t1962.cu
#include <cufft.h>
#include <stdio.h>
#include <stdlib.h>
#include <cufftXt.h>
#include <cuda_fp16.h>
#include <assert.h>
#include <iostream>
typedef cufftComplex ctype;
typedef cufftReal rtype;
typedef unsigned char dtype;
long long sig_size = 1<<18;
const int amplitude = 127;
const cufftReal ramplitude = 1/(float)(4*amplitude);
__device__ rtype convert(int val){
return (val - amplitude)*ramplitude;
}
__device__ rtype myOwnCallback(void *dataIn,
size_t offset,
void *callerInfo,
void *sharedPtr) {
rtype ret;
ret = convert(((dtype *)dataIn)[offset]);
return ret;
}
__device__ cufftCallbackLoadR myOwnCallbackPtr = myOwnCallback;
int main(){
cufftCallbackLoadR hostCopyOfCallbackPtr;
cudaMemcpyFromSymbol(&hostCopyOfCallbackPtr,
myOwnCallbackPtr,
sizeof(hostCopyOfCallbackPtr));
//put 4x sine waves into a U8 array
dtype *my_data = (dtype *)malloc(sig_size*sizeof(dtype));
for (int i = 0; i < sig_size; i++) my_data[i] = amplitude*(sin((i*8*3.141592654f)/sig_size)+1.0);
ctype *d_odata;
dtype *d_mydata;
cudaMalloc(&d_mydata, sig_size*sizeof(dtype));
cudaMemcpy(d_mydata, my_data, sig_size*sizeof(dtype), cudaMemcpyHostToDevice);
cudaMalloc(&d_odata, sizeof(ctype)*(sig_size/2+1));
cufftHandle plan;
cufftResult r;
r = cufftCreate(&plan);
assert(r == CUFFT_SUCCESS);
size_t ws = 0;
r = cufftXtMakePlanMany(plan, 1, &sig_size, NULL, 1, 1, CUDA_R_32F, NULL, 1, 1, CUDA_C_32F, 1, &ws, CUDA_C_32F);
assert(r == CUFFT_SUCCESS);
void *rps[] = {(void *)hostCopyOfCallbackPtr};
r = cufftXtSetCallback(plan, rps, CUFFT_CB_LD_REAL, NULL);
assert(r == CUFFT_SUCCESS);
r = cufftXtExec(plan, (cufftReal *)d_mydata, d_odata, CUFFT_FORWARD); // warm-up
assert(r == CUFFT_SUCCESS);
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
cudaEventRecord(start);
r = cufftXtExec(plan, (cufftReal *)d_mydata, d_odata, CUFFT_FORWARD);
assert(r == CUFFT_SUCCESS);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float et;
cudaEventElapsedTime(&et, start, stop);
printf("forward FFT time for %ld samples: %fms\n", sig_size, et);
ctype *h_odata = (ctype *)malloc((sig_size/2+1)*sizeof(ctype));
cudaMemcpy(h_odata, d_odata, (sig_size/2+1)*sizeof(ctype), cudaMemcpyDeviceToHost);
for (int i = 0; i < 8; i++)
std::cout << h_odata[i].x << " + " << h_odata[i].y << "i" << std::endl;
return 0;
}
$ nvcc -o t1962 t1962.cu -rdc=true -lcufft_static -lculibos
$ ./t1962
forward FFT time for 262144 samples: 0.031488ms
-257.969 + 0i
0.00344251 + 0.00137726i
-3.96543e-05 + -0.00106905i
0.0013994 + -0.00490045i
0.0331312 + -32759.4i
-0.00190887 + 0.00865401i
0.00454092 + 0.00368094i
-0.00219025 + 0.00983646i
$
Yes, the results are not numerically identical between the two transform types. It's not reasonable to expect 16-bit and 32-bit floating-point calculations to be identical; in all probability the 32-bit calculations are "more accurate". For this sine-wave case, the terms I consider most important are the DC term and the magnitude spike at the fundamental, and those are numerically close to each other. The other terms are "in the noise". The timing results are not exactly comparable either, as the 16-bit case omits the cost of the kernel call to convert the data from U8 to F16. You can use a profiler or just refactor the code to get more comparable timings.
workSize can be ignored in the single-GPU case when using cufftXtMakePlanMany; otherwise, use the provided routines to determine workSize.
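For reference, a minimal single-GPU sketch (reusing the sig_size and plan variables from the listings above): the value written into ws can simply be ignored, because cuFFT allocates and manages its own work area for the plan unless auto-allocation is disabled with cufftSetAutoAllocation.

size_t ws = 0;   // filled in by cuFFT, but not needed on a single GPU
// cuFFT allocates its own work area here; only if cufftSetAutoAllocation(plan, 0)
// had been called first would you need to cudaMalloc ws bytes yourself and
// attach them with cufftSetWorkArea(plan, ptr).
cufftXtMakePlanMany(plan, 1, &sig_size, NULL, 1, 1, CUDA_R_32F,
                    NULL, 1, 1, CUDA_C_32F, 1, &ws, CUDA_C_32F);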

Cublas - Column/Row wise operations

I am looking for a way to perform operations over the columns of a matrix.
I have an MxN matrix and I want to apply a cuBLAS function (for example nrm2) to each column.
The result I expect to get is M x 1.
How can I do that?
CUBLAS has no batched Level 1 routines, so there is no direct way to compute the column or row norms in a single call. You can do it by calling nrm2 many times in a loop over all the rows or columns of the matrix, for example:
#include <cublas_v2.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <iostream>
struct prg
{
float a, b;
__host__ __device__
prg(float _a=0.f, float _b=1.f) : a(_a), b(_b) {};
__host__ __device__
float operator()(const unsigned int n) const
{
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> dist(a, b);
rng.discard(n);
return dist(rng);
}
};
int main(void)
{
const int M = 1024, N = M;
const int num = N * M;
thrust::device_vector<float> matrix(num);
thrust::device_vector<float> vector(N, -1.0f);
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
thrust::transform(index_sequence_begin,
index_sequence_begin + num,
matrix.begin(),
prg(1.f,2.f));
float* m_d = thrust::raw_pointer_cast(matrix.data());
float* v_d = thrust::raw_pointer_cast(vector.data());
cudaStream_t stream;
cudaStreamCreate(&stream);
cublasHandle_t handle;
cublasCreate(&handle);
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
cublasSetStream(handle, stream);
for(int col=0; col < N; col++) {
cublasSnrm2(handle, M, m_d + col*M, 1, v_d + col);
}
cudaDeviceSynchronize();
for(auto x : vector) {
float normval = x;
std::cout << normval << std::endl;
}
return 0;
}
Unless you have very large rows or columns, there is little scope to exploit streams to run simultaneous kernels and reduce the overall runtime, because each nrm2 call will be too short. The launch latency of running lots of individual kernels will negatively affect performance.
A much better alternative would be to write your own kernel to do this.
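As an illustration only (a sketch, not part of the original answer), such a kernel could assign one block per column of the column-major MxN matrix used above and do a shared-memory reduction over that column's M elements:

// Hypothetical column-norm kernel: one block per column, column-major storage.
__global__ void col_norms(const float* __restrict__ A, float* __restrict__ out, int M)
{
    extern __shared__ float sdata[];
    const float* col = A + blockIdx.x * M;          // start of this block's column
    float acc = 0.f;
    for (int i = threadIdx.x; i < M; i += blockDim.x)
        acc += col[i] * col[i];                     // per-thread sum of squares
    sdata[threadIdx.x] = acc;
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {  // tree reduction (blockDim.x a power of two)
        if (threadIdx.x < s) sdata[threadIdx.x] += sdata[threadIdx.x + s];
        __syncthreads();
    }
    if (threadIdx.x == 0) out[blockIdx.x] = sqrtf(sdata[0]);
}

// launch, using the raw pointers from the Thrust example above:
// col_norms<<<N, 256, 256 * sizeof(float)>>>(m_d, v_d, M);

One kernel launch replaces the N separate cublasSnrm2 calls, which removes the per-call launch latency discussed above.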

Optimizing memory access for complex numbers

I have a kernel that operates on complex numbers, and I am loading the values like this:
thrust::complex<float> x = X[tIdx];
where X is in global memory. When I profile this kernel with nvvp, I find that it is memory bandwidth-limited and the profiler suggests that I improve the memory access pattern:
Global Load L2 Transactions/Access=8, Ideal Transactions/Access=4
The disassembly confirms that this line is indeed split into two 32-bit loads, producing a strided access pattern:
LDG.E R9, [R16];
LDG.E R11, [R16+0x4];
How can I get this to compile into a single 64-bit load?
Potential solutions
I realize this is pretty closely related to this earlier question but the proposed solutions (change the global memory layout or use shared memory) seem less ideal than a 64-bit load.
The NVIDIA developer blog suggests a reinterpret_cast to a vector data type such as float2, but I'm a little hazy about how this fits in with pointer-aliasing rules.
I must also confess that this is somewhat of a theoretical question. For this particular kernel I'm limited by the device memory bandwidth, so halving the number of L2 transactions shouldn't significantly improve the overall performance. But I anticipate working with more complex numbers in the future, and if there's a simple solution then I'd like to start using it now.
The basic problem here is that the compiler seems to need explicit alignment specifications for a type before it will generate vector load and store instructions. Consider the following trivial example:
class __align__(8) cplx0
{
public:
__device__ __host__ cplx0(float _re, float _img) : re(_re), img(_img) {};
float re, img;
};
class cplx1
{
public:
__device__ __host__ cplx1(float _re, float _img) : re(_re), img(_img) {};
float re, img;
};
template<typename T>
__global__ void memsetkernel(T* out, const T val, int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll 8
for(; tid < N; tid += stride) out[tid] = val;
}
template<typename T>
__global__ void memcpykernel(const T* __restrict__ in, T* __restrict__ out, int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll 8
for(; tid < N; tid += stride) out[tid] = in[tid];
}
template<typename T>
void memcpy(const T* in, T* out, int Nitems)
{
int nthreads = 1024;
int nblocks = 13 * 2; // GTX 970 with 13 SM
memcpykernel<T><<<nblocks, nthreads>>>(in, out, Nitems);
cudaDeviceSynchronize();
}
template<typename T>
void memset(T* in, const T value, int Nitems)
{
int nthreads = 1024;
int nblocks = 13 * 2; // GTX 970 with 13 SM
memsetkernel<T><<<nblocks, nthreads>>>(in, value, Nitems);
cudaDeviceSynchronize();
}
int main(void)
{
const int Nitems = 1 << 24;
typedef cplx0 fcomplex0;
typedef cplx1 fcomplex1;
{
fcomplex0* in;
fcomplex0* out;
cudaMalloc((void **)&in, Nitems * sizeof(fcomplex0));
cudaMalloc((void **)&out, Nitems * sizeof(fcomplex1));
for(int i=0; i<10; i++) {
memset<fcomplex0>(in, fcomplex0(1.0f,1.0f), Nitems);
memcpy<fcomplex0>(in, out, Nitems);
}
cudaFree(in);
cudaFree(out);
}
{
fcomplex1* in;
fcomplex1* out;
cudaMalloc((void **)&in, Nitems * sizeof(fcomplex1));
cudaMalloc((void **)&out, Nitems * sizeof(fcomplex1));
for(int i=0; i<10; i++) {
memset<fcomplex1>(in, fcomplex1(1.0f,1.0f), Nitems);
memcpy<fcomplex1>(in, out, Nitems);
cudaDeviceSynchronize();
}
cudaFree(in);
cudaFree(out);
}
cudaDeviceReset();
return 0;
}
Here we have two home-baked complex types, one with an explicit alignment specification and one without; otherwise they are identical. Putting them through naïve memcpy and memset kernels in this test harness lets us inspect the code-generation behaviour of the toolchain for each type and benchmark the performance.
Firstly, the code. For the cplx0 class, which has an explicit 8-byte alignment, the compiler emits vectorized loads and stores in both kernels:
memcpykernel
ld.global.nc.v2.f32 {%f5, %f6}, [%rd17];
st.global.v2.f32 [%rd18], {%f5, %f6};
memsetkernel
st.global.v2.f32 [%rd11], {%f1, %f2};
whereas for the cplx1 case, it does not:
memcpykernel
ld.global.nc.f32 %f1, [%rd16];
ld.global.nc.f32 %f2, [%rd16+4];
st.global.f32 [%rd15+4], %f2;
st.global.f32 [%rd15], %f1;
memsetkernel
st.global.f32 [%rd11+4], %f2;
st.global.f32 [%rd11], %f1;
Looking at performance, there is a non-trivial difference in the memset case (CUDA 8 release toolkit, GTX 970 with Linux 367.48 driver):
$ nvprof ./complex_types
==29074== NVPROF is profiling process 29074, command: ./complex_types
==29074== Profiling application: ./complex_types
==29074== Profiling result:
Time(%) Time Calls Avg Min Max Name
33.04% 19.264ms 10 1.9264ms 1.9238ms 1.9303ms void memcpykernel<cplx1>(cplx1 const *, cplx1*, int)
32.72% 19.080ms 10 1.9080ms 1.9055ms 1.9106ms void memcpykernel<cplx0>(cplx0 const *, cplx0*, int)
19.15% 11.165ms 10 1.1165ms 1.1120ms 1.1217ms void memsetkernel<cplx1>(cplx1*, cplx1, int)
15.09% 8.7985ms 10 879.85us 877.67us 884.13us void memsetkernel<cplx0>(cplx0*, cplx0, int)
The Thrust templated complex type does not have an explicit alignment definition (it potentially could via specialization, although that would somewhat defeat the purpose). So your only choice here is to either make your own version of the Thrust type with explicit alignment, or use another complex type which has one (like the cuComplex type which CUBLAS and CUFFT use).
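For completeness, the reinterpret_cast route mentioned in the question would look something like the sketch below (my illustration, not part of the original answer). It relies on thrust::complex<float> and float2 sharing the same two-float layout, and on the array being 8-byte aligned (true for anything returned by cudaMalloc):

#include <thrust/complex.h>

// Hypothetical kernel: load each complex value through an aligned float2 so the
// compiler can emit one 64-bit load instead of two 32-bit loads.
__global__ void scale(thrust::complex<float>* X, float s, int n)
{
    int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
    if (tIdx < n) {
        float2 v = reinterpret_cast<float2*>(X)[tIdx];   // single 64-bit load
        thrust::complex<float> x(v.x, v.y);
        // ... work with x as before ...
        reinterpret_cast<float2*>(X)[tIdx] = make_float2(x.real() * s, x.imag() * s);
    }
}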

is there a way to do "saypx" in cuBLAS?

cublasSaxpy computes y' = a * x + y, where x and y are vectors and a is scalar.
It turns out I need to compute y' = a * y + x instead. I'm not seeing how to twist the cuBLAS library into doing that.
(Of course, I could compute y' = a * y, then y' = y' + x, but y' is read too often in that case. And I could write my own CUDA code to do it, but then it's likely not anywhere near as fast as the cuBLAS code. I'm just surprised there's no apparent way to do "saypx" directly.)
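For reference, that two-call workaround would look something like this (a sketch, assuming a cuBLAS handle and device vectors x and y of length n already exist):

const float one = 1.0f;
cublasSscal(handle, n, &a, y, 1);          // y = a * y   (first pass over y)
cublasSaxpy(handle, n, &one, x, 1, y, 1);  // y = x + y   (second pass over y)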
[Added] There are functions similar to "saxpby" in Intel's version of cblas, which would do what I need. But oddly enough, that's not in cuBLAS.
[Added #2] It looks like I can use the cudnnAddTensor function, with some aliasing of descriptors (I have a FilterDescriptor that points to the tensor, which AddTensor won't accept, but I should be able to alias a TensorDescriptor to the same memory and shape.)
There isn't a way I am aware of to do what you are asking in CUBLAS, nor in standard BLAS. What you have found in MKL is an extension added by Intel, but I don't recall seeing something similar in other host and accelerator BLAS implementations.
The good news is that your assertion "I could write my own CUDA code to do it, but then it's likely not anywhere near as fast as the cuBLAS code" is untrue, at least for an operation as trivial as saxpy. Even a naïve implementation will get very close to CUBLAS, because there really aren't that many ways to read two arrays, perform an FMAD and write back the result. As long as you get the memory coalescing correct, it is pretty simple to write performant code. For example:
#include <vector>
#include <algorithm>
#include <cassert>
#include <iostream>
#include <cmath>
#include "cublas_v2.h"
typedef enum
{
AXPY = 0,
AXPBY = 1
} saxpy_op_t;
__device__ __host__ __inline__
float axpby_op(float y, float x, float a)
{
return a * y + x;
}
__device__ __host__ __inline__
float axpy_op(float y, float x, float a)
{
return y + a * x;
}
template<typename T>
class pitched_accessor
{
T * p;
size_t pitch;
public:
__host__ __device__
pitched_accessor(T *p_, size_t pitch_) : p(p_), pitch(pitch_) {};
__host__ __device__
T& operator[](size_t idx) { return p[pitch*idx]; };
__host__ __device__
const T& operator[](size_t idx) const { return p[pitch*idx]; };
};
template<saxpy_op_t op>
__global__
void saxpy_kernel(pitched_accessor<float> y, pitched_accessor<float> x,
const float a, const unsigned int N1)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int stride = gridDim.x * blockDim.x;
#pragma unroll 8
for(; idx < N1; idx += stride) {
switch (op) {
case AXPY:
y[idx] = axpy_op(y[idx], x[idx], a);
break;
case AXPBY:
y[idx] = axpby_op(y[idx], x[idx], a);
break;
}
}
}
__host__ void saxby(const unsigned int N, const float a,
float *x, int xinc, float *y, int yinc)
{
int gridsize, blocksize;
cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPBY>);
saxpy_kernel<AXPBY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc),
pitched_accessor<float>(x, xinc), a, N);
}
__host__ void saxpy(const unsigned int N, const float a,
float *x, int xinc, float *y, int yinc)
{
int gridsize, blocksize;
cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPY>);
saxpy_kernel<AXPY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc),
pitched_accessor<float>(x, xinc), a, N);
}
void check_result(std::vector<float> &yhat, float result, float tolerance=1e-5f)
{
auto it = yhat.begin();
for(; it != yhat.end(); ++it) {
float err = std::fabs(*it - result);
assert( err < tolerance );
}
}
int main()
{
const int N = 1<<22;
std::vector<float> x_h(N);
std::vector<float> y_h(N);
const float a = 2.f, y0 = 1234.f, x0 = 532.f;
std::fill(y_h.begin(), y_h.end(), y0);
std::fill(x_h.begin(), x_h.end(), x0);
float *x_d, *y_d;
size_t sz = sizeof(float) * size_t(N);
cudaMalloc((void **)&x_d, sz);
cudaMalloc((void **)&y_d, sz);
cudaMemcpy(x_d, &x_h[0], sz, cudaMemcpyHostToDevice);
{
cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
saxby(N, a, x_d, 1, y_d, 1);
std::vector<float> yhat(N);
cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
check_result(yhat, axpby_op(y0, x0, a));
}
{
cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
saxpy(N, a, x_d, 1, y_d, 1);
std::vector<float> yhat(N);
cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
check_result(yhat, axpy_op(y0, x0, a));
}
{
cublasHandle_t handle;
cublasCreate(&handle);
cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
cublasSaxpy(handle, N, &a, x_d, 1, y_d, 1);
std::vector<float> yhat(N);
cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
check_result(yhat, axpy_op(y0, x0, a));
cublasDestroy(handle);
}
return int(cudaDeviceReset());
}
This demonstrates that a very simple axpy kernel can be easily adapted to perform both the standard operation and the version you want, and run within 10% of the runtime of CUBLAS on the compute 5.2 device I tested it on:
$ nvcc -std=c++11 -arch=sm_52 -Xptxas="-v" -o saxby saxby.cu -lcublas
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
$ nvprof ./saxby
==26806== NVPROF is profiling process 26806, command: ./saxby
==26806== Profiling application: ./saxby
==26806== Profiling result:
Time(%) Time Calls Avg Min Max Name
54.06% 11.190ms 5 2.2381ms 960ns 2.9094ms [CUDA memcpy HtoD]
40.89% 8.4641ms 3 2.8214ms 2.8039ms 2.8310ms [CUDA memcpy DtoH]
1.73% 357.59us 1 357.59us 357.59us 357.59us void saxpy_kernel<saxpy_op_t=1>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
1.72% 355.15us 1 355.15us 355.15us 355.15us void saxpy_kernel<saxpy_op_t=0>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
1.60% 332.21us 1 332.21us 332.21us 332.21us void axpy_kernel_val<float, int=0>(cublasAxpyParamsVal<float>)

Simpson's method to integrate real valued functions with CUDA

I'm trying to code integration by Simpson's method in CUDA.
This is the formula for Simpson's rule:
I ≈ (h/3) * [ f(x_0) + 4 f(x_1) + 2 f(x_2) + 4 f(x_3) + ... + 2 f(x_{n-2}) + 4 f(x_{n-1}) + f(x_n) ]
where x_k = a + k*h and h = (b - a)/n.
Here's my code
__device__ void initThreadBounds(int *n_start, int *n_end, int n,
int totalBlocks, int blockWidth)
{
int threadId = blockWidth * blockIdx.x + threadIdx.x;
int nextThreadId = threadId + 1;
int threads = blockWidth * totalBlocks;
*n_start = (threadId * n)/ threads;
*n_end = (nextThreadId * n)/ threads;
}
__device__ float reg_func (float x)
{
return x;
}
typedef float (*p_func) (float);
__device__ p_func integrale_f = reg_func;
__device__ void integralSimpsonMethod(int totalBlocks, int totalThreads,
double a, double b, int n, float p_function(float), float* result)
{
*result = 0;
float h = (b - a)/n;
//*result = p_function(a)+p_function(a + h * n);
//parallel
int idx_start;
int idx_end;
initThreadBounds(&idx_start, &idx_end, n-1, totalBlocks, totalThreads);
//parallel_ends
for (int i = idx_start; i < idx_end; i+=2) {
*result += ( p_function(a + h*(i-1)) +
4 * p_function(a + h*(i)) +
p_function(a + h*(i+1)) ) * h/3;
}
}
__global__ void integralSimpson(int totalBlocks, int totalThreads, float* result)
{
float res = 0;
integralSimpsonMethod(totalBlocks, totalThreads, 0, 10, 1000, integrale_f, &res);
result[(blockIdx.x*totalThreads + threadIdx.x)] = res;
//printf ("Simpson method\n");
}
__host__ void inttest()
{
const int blocksNum = 32;
const int threadNum = 32;
float *device_resultf;
float host_resultf[threadNum*blocksNum]={0};
cudaMalloc((void**) &device_resultf, sizeof(float)*threadNum*blocksNum);
integralSimpson<<<blocksNum, threadNum>>>(blocksNum, threadNum, device_resultf);
cudaThreadSynchronize();
cudaMemcpy(host_resultf, device_resultf, sizeof(float) *threadNum*blocksNum,
cudaMemcpyDeviceToHost);
float sum = 0;
for (int i = 0; i != blocksNum*threadNum; ++i) {
sum += host_resultf[i];
// printf ("result in %i cell = %f \n", i, host_resultf[i]);
}
printf ("sum = %f \n", sum);
cudaFree(device_resultf);
}
int main(int argc, char* argv[])
{
inttest();
int i;
scanf ("%d",&i);
}
The problem is that it gives the wrong result when n is lower than 100000. For an integral from 0 to 10 the result is ~99, but when n = 100000 or larger it works fine and the result is ~50.
What's wrong, guys?
The basic problem here is that you don't understand your own algorithm.
Your integralSimpsonMethod() function is designed such that each thread samples at least 3 quadrature points per sub-interval in the integration domain. Therefore, if you choose n so that it is less than four times the number of threads in the kernel call, it is inevitable that adjacent sub-intervals will overlap and the resulting integral will be incorrect. You need to make sure that the code checks and scales the thread count or n so that they don't produce overlap when the integral is computed.
If you are doing this for anything other than self-edification, then I recommend you look up the composite version of Simpson's rule. This is much better suited to parallel implementation and will be considerably more performant if implemented correctly.
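A plain-CUDA sketch of the composite rule, for illustration only (it assumes n is even, that the result is zeroed on the device before launch, and uses a per-thread atomicAdd for brevity; a shared-memory reduction would be faster):

__device__ float f(float x) { return x; }   // integrand, as in the question

__global__ void composite_simpson(float a, float h, int n, float *result)
{
    float local = 0.f;
    // grid-stride loop over the n+1 sample points x_k = a + k*h
    for (int k = blockIdx.x * blockDim.x + threadIdx.x; k <= n;
         k += gridDim.x * blockDim.x) {
        float w = (k == 0 || k == n) ? 1.f : ((k & 1) ? 4.f : 2.f);  // weights 1,4,2,4,...,1
        local += w * f(a + k * h);
    }
    atomicAdd(result, local * h / 3.f);   // accumulate each thread's partial sum
}

// e.g. composite_simpson<<<32, 256>>>(0.f, (10.f - 0.f) / n, n, d_result);

Every sample point is visited exactly once regardless of how many threads are launched, which removes the overlap problem described above.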
I would propose an approach to Simpson's integration by using CUDA Thrust. You basically need five steps:
Generate the Simpson's quadrature weights;
Generate the function sampling points;
Generate the function values;
Calculate the elementwise product between the quadrature weights and the function values;
Sum the above products.
Step #1 requires creating an array with elements repeated many times, namely 1 4 2 4 2 4 ... 1 for the Simpson's case. This can be accomplished by borrowing Robert Crovella's approach in cuda thrust library repeat vector multiple times.
Step #2 can be accomplished by using counting_iterators and borrowing talonmies' approach in Purpose and usage of counting_iterators in CUDA Thrust library.
Step #3 is an application of thrust::transform.
Steps #4 and #5 can be accomplished together by thrust::inner_product.
This approach can also be used when other quadrature rules are of interest.
Here is the code
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <iostream>
#include <iterator>
#include <cstdio>
#define STRIDE 2
#define N 100
#define pi_f 3.14159265358979f // Greek pi in single precision
struct sin_functor
{
__host__ __device__
float operator()(float x) const
{
return sin(2.f*pi_f*x);
}
};
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
int main(void)
{
// --- Generate the integration coefficients
thrust::host_vector<float> h_coefficients(STRIDE);
h_coefficients[0] = 4.f;
h_coefficients[1] = 2.f;
thrust::device_vector<float> d_coefficients(N);
typedef thrust::device_vector<float>::iterator Iterator;
strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-2, STRIDE);
strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);
thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);
d_coefficients[0] = 1.f;
d_coefficients[N-1] = 1.f;
// print the generated d_coefficients
std::cout << "d_coefficients: ";
thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " ")); std::cout << std::endl;
// --- Generate sampling points
float a = 0.f;
float b = .5f;
float Dx = (b-a)/(float)(N-1);
thrust::device_vector<float> d_x(N);
thrust::transform(thrust::make_counting_iterator(a/Dx),
thrust::make_counting_iterator((b+1.f)/Dx),
thrust::make_constant_iterator(Dx),
d_x.begin(),
thrust::multiplies<float>());
// --- Calculate function values
thrust::device_vector<float> d_y(N);
thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());
// --- Calculate integral
float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);
printf("The integral is = %f\n", integral);
getchar();
return 0;
}