Shuffling for Differential Evolutionary Algorithm in CUDA

I would like to implement a Differential Evolutionary Algorithm in CUDA.
How can I get two random vectors from a matrix, knowing either that they cannot be accessed again or, conversely, that they can? Is there an easy way of shuffling vectors in matrices?
I would also need to compute something using the values of such a vector, and put new values in the bottom cell of each vector. Is that easy to do? How can I do it?
Maybe there is something like a stack implementation library (get by id, peek by id, ...)?

Maybe you should have a look at the Thrust library, which is a sort of C++ STL equivalent for CUDA. It has been integrated into the latest releases of the CUDA toolkit, but if you have an older version of CUDA you can still download it for free at: http://code.google.com/p/thrust/
In this library, you'll find easy ways to handle vectors and to generate random numbers.
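For instance, shuffling the rows (the "vectors") of an Np x D population matrix can be done by shuffling a vector of row indices and then gathering the rows through it. Below is a minimal sketch assuming a recent Thrust (thrust::shuffle was added in Thrust 1.9.9 / CUDA 11); with older versions, a random permutation can be built by sorting random keys instead.
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/shuffle.h>
#include <thrust/random.h>
#include <thrust/copy.h>
int main() {
const int Np = 8, D = 4; // --- population size and member dimension
thrust::device_vector<float> pop(Np * D); // --- row-major Np x D matrix
thrust::sequence(pop.begin(), pop.end()); // --- dummy data
// --- Build a random permutation of the row indices
thrust::device_vector<int> perm(Np);
thrust::sequence(perm.begin(), perm.end());
thrust::default_random_engine rng(1234);
thrust::shuffle(perm.begin(), perm.end(), rng);
// --- Gather the rows in shuffled order (a thrust::gather with a row-expanding permutation iterator would avoid the host loop)
thrust::device_vector<float> shuffled(Np * D);
for (int j = 0; j < Np; j++) thrust::copy_n(pop.begin() + perm[j] * D, D, shuffled.begin() + j * D);
return 0;
}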

Concerning the implementation of the Differential Evolutionary Algorithm in CUDA as proposed in
R. Storn and K. Price, "Differential evolution: a simple and efficient heuristic for global optimization over continuous spaces," Journal of Global Optimization, vol. 11, no. 4, pp. 341-359, 1997,
you seem to be concerned with the crossover operation. The only significant CUDA implementation of the Differential Evolutionary Algorithm I'm aware of is that in
L.P. de Veronese, R.A. Krohling, "Differential evolution algorithm on the GPU with C-CUDA," Proc. of the IEEE Congress on Evolutionary Computation, Barcelona, Spain, July 18-23, 2010, pp. 1-7.
Below, I'm showing a full CUDA code based on the implementation suggested in the latter paper. The listing relies on the Utilities.cuh header mentioned in the other answers (it provides the gpuErrchk and iDivUp helpers used throughout).
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <curand.h>
#include <curand_kernel.h>
using namespace thrust;
#include <stdio.h>
#include <time.h>
#include <fstream>
#include "Utilities.cuh"
#define pi 3.14159265358979f
#define BLOCK_SIZE_POP 32
#define BLOCK_SIZE_RAND 64
#define BLOCK_SIZE_UNKN 8
#define BLOCK_SIZE 256
//#define DEBUG
// --- REFERENCES
// [1] R. Storn and K. Price, “Differential evolution – a simple and efficient heuristic for global optimization over continuous spaces,”
// Journal of Global Optimization, vol. 11, no. 4, pp. 341–359, 1997
// [2] Lucas de P. Veronese and Renato A. Krohling, “Differential Evolution Algorithm on the GPU with C-CUDA,”
// Proc. of the IEEE Congress on Evolutionary Computation, Barcelona, Spain, Jul. 18-23, 2010, pp. 1-7.
// Conventions: the index j addresses the population member while the index i addresses the member component
// the homologous host and device variables have the same name with a "h_" or "d_" prefix, respectively
// the __host__ and __device__ functions pointer parameters have the same name for comparison purposes. It is up to the caller to use
// host or device pointers, as appropriate
/***********************/
/* CURAND SETUP KERNEL */
/***********************/
__global__ void curand_setup_kernel(curandState * __restrict state, const unsigned long int seed, const int N)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
// --- Guard against the extra threads of the last block, which would otherwise write past the end of the state array
if (tid < N) curand_init(seed, tid, 0, &state[tid]);
}
/********************************/
/* INITIALIZE POPULATION ON GPU */
/********************************/
__global__ void initialize_population_GPU(float * __restrict pop, const float * __restrict minima, const float * __restrict maxima,
curandState * __restrict state, const int D, const int Np) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
if ((i < D) && (j < Np)) pop[j*D+i] = (maxima[i] - minima[i]) * curand_uniform(&state[j*D+i]) + minima[i];
}
/****************************************/
/* EVALUATION OF THE OBJECTIVE FUNCTION */
/****************************************/
__host__ __device__ float functional(const float * __restrict x, const int D) {
float sum = 0.f;
// --- De Jong function
//for (int i=0; i<D; i++) sum = sum + x[i] * x[i];
// --- Rosenbrock's saddle
sum = 0.f;
for (int i=1; i<D; i++) sum = sum + 100.f * (x[i] - x[i-1] * x[i-1]) * (x[i] - x[i-1] * x[i-1]) + (x[i-1] - 1.f) * (x[i-1] - 1.f);
return sum;
}
/********************************/
/* POPULATION EVALUATION ON GPU */
/********************************/
__global__ void evaluation_GPU(const int Np, const int D, const float * __restrict pop, float * __restrict fobj) {
int j = threadIdx.x + blockIdx.x * blockDim.x;
if (j < Np) fobj[j] = functional(&pop[j*D], D);
}
/**********************************************************/
/* GENERATE MUTATION INDICES AND CROSS-OVER VALUES ON GPU */
/**********************************************************/
__global__ void generate_mutation_indices_and_crossover_values_GPU(float * __restrict Rand, int * __restrict mutation, const int Np, const int D,
curandState * __restrict state) {
int j = threadIdx.x + blockIdx.x * blockDim.x;
int a, b, c;
if (j < Np) {
// --- curand_uniform() returns values in (0, 1], so the scaled index is clamped to Np - 1 to avoid an out-of-range index
do a=min((int)(Np*curand_uniform(&state[j*D])),Np-1); while(a==j);
do b=min((int)(Np*curand_uniform(&state[j*D])),Np-1); while(b==j||b==a);
do c=min((int)(Np*curand_uniform(&state[j*D])),Np-1); while(c==j||c==a||c==b);
mutation[j*3]=a;
mutation[j*3+1]=b;
mutation[j*3+2]=c;
Rand[j]=curand_uniform(&state[j*D]);
}
}
/**********************************/
/* GENERATION OF A NEW POPULATION */
/**********************************/
__global__ void generation_new_population_GPU(const float * __restrict pop, const int NP, const int D, float * __restrict npop, const float F,
const float CR, const float * __restrict rand, const int * __restrict mutation,
const float * __restrict minimum, const float * __restrict maximum) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
if ((i < D) && (j < NP)) {
// --- Mutation indices
int a=mutation[j*3];
int b=mutation[j*3+1];
int c=mutation[j*3+2];
// --- Mutation and crossover
// --- One of the best strategies. Try F = 0.7 and CR = 0.5 as a first guess.
if(rand[j]<CR) npop[j*D+i] = pop[a*D+i]+F*(pop[b*D+i]-pop[c*D+i]);
else npop[j*D+i] = pop[j*D+i];
// --- Other possible approaches to mutation and crossover
// --- Not bad, but found several optimization problems where misconvergence occurs.
//npop[j*D+i] = pop[best_old_gen_ind*D+i] + F*(pop[b*D+i]-pop_old[c*D+i]);
// --- One of the best strategies. Try F = 0.85 and CR = 1. In case of misconvergence, try to increase NP. If this doesn't help,
// play around with all the control variables.
//npop[j*D+i] = pop[j*D+i] + F*(pop[best_old_gen_ind*D+i] - pop[j*D+i]) + F*(pop[a*D+i]-pop[b*D+i]);
// --- Powerful strategy worth trying.
//npop[j*D+i] = pop[best_old_gen_ind*D+i] + (pop[a*D+i]+pop[b*D+i]-pop[c*D+i]-pop[d*D+i])*F;
// --- Robust optimizer for many functions.
//npop[j*D+i] = pop[e*D+i] + (pop[a*D+i]+pop[b*D+i]-pop[c*D+i]-pop[d*D+i])*F;
// --- Saturation due to constraints on the unknown parameters
if (npop[j*D+i]>maximum[i]) npop[j*D+i]=maximum[i];
else if (npop[j*D+i]<minimum[i]) npop[j*D+i]=minimum[i];
}
}
/*******************************/
/* POPULATION SELECTION ON GPU */
/*******************************/
// Assumption: all the optimization variables are associated to the same thread block
__global__ void selection_and_evaluation_GPU(const int Np, const int D, float * __restrict pop, const float * __restrict npop, float * __restrict fobj) {
int i = threadIdx.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
if ((i < D) && (j < Np)) {
float nfobj = functional(&npop[j*D], D);
float temp = fobj[j];
if (nfobj < temp) {
pop[j*D+i] = npop[j*D+i];
fobj[j] = nfobj;
}
}
}
/***********************/
/* FIND MINIMUM ON GPU */
/***********************/
void find_minimum_GPU(const int N, float *t, float * __restrict minval, int * __restrict index) {
// --- Wrap raw pointer with a device_ptr
device_ptr<float> dev_ptr = device_pointer_cast(t);
// --- Use device_ptr in thrust min_element
device_ptr<float> min_ptr = thrust::min_element(dev_ptr, dev_ptr + N);
index[0] = &min_ptr[0] - &dev_ptr[0];
minval[0] = min_ptr[0];
}
/********/
/* MAIN */
/********/
int main()
{
// --- Number of individuals in the population (Np >=4 for mutation purposes)
int Np = 80;
// --- Dimensionality of each individual (number of unknowns)
int D = 5;
// --- Mutation factor (0 < F <= 2). Typically chosen in [0.5, 1], see Ref. [1]
float F = 0.7f;
// --- Maximum number of generations
int Gmax = 2000;
// --- Crossover constant (0 < CR <= 1)
float CR = 0.4f;
// --- Mutually different random integer indices selected from {0, 1, ..., Np-1}
int *d_mutation, // --- Device side mutation vector
*d_best_index, // --- Device side current optimal member index
*h_best_index_dev; // --- Host side current optimal member index of device side
float *d_pop, // --- Device side population
*d_npop, // --- Device side new population (trial vectors)
*d_Rand, // --- Device side crossover rand vector (uniformly distributed in (0,1))
*d_fobj, // --- Device side objective function value
*d_maxima, // --- Device side maximum constraints vector
*d_minima, // --- Device side minimum constraints vector
*h_pop_dev_res, // --- Host side population result of GPU computations
*h_best_dev, // --- Host side population best value history of device side
*h_maxima, // --- Host side maximum constraints vector
*h_minima; // --- Host side minimum constraints vector
curandState *devState; // --- Device side random generator state vector
// --- Device side memory allocations
gpuErrchk(cudaMalloc((void**)&d_pop,D*Np*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_npop,D*Np*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_Rand,Np*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_fobj,Np*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_mutation,3*Np*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_maxima,D*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_minima,D*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&devState, D*Np*sizeof(curandState)));
// --- Host side memory allocations
h_pop_dev_res = (float*)malloc(D*Np*sizeof(float));
h_best_dev = (float*)malloc(Gmax*sizeof(float));
h_best_index_dev = (int*)malloc(Gmax*sizeof(int));
h_maxima = (float*)malloc(D*sizeof(float));
h_minima = (float*)malloc(D*sizeof(float));
// --- Define grid sizes
int Num_Blocks_Pop = iDivUp(Np,BLOCK_SIZE_POP);
int Num_Blocks_Rand2 = iDivUp(Np,BLOCK_SIZE_RAND);
dim3 Grid(iDivUp(D,BLOCK_SIZE_UNKN),iDivUp(Np,BLOCK_SIZE_POP));
dim3 Block(BLOCK_SIZE_UNKN,BLOCK_SIZE_POP);
// --- Set maxima and minima
for (int i=0; i<D; i++) {
h_maxima[i] = 2.;
h_minima[i] = -2.;
}
gpuErrchk(cudaMemcpy(d_maxima, h_maxima, D*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_minima, h_minima, D*sizeof(float), cudaMemcpyHostToDevice));
// --- Initialize cuRAND states
curand_setup_kernel<<<iDivUp(D*Np, BLOCK_SIZE), BLOCK_SIZE>>>(devState, time(NULL), D*Np);
// --- Initialize population
initialize_population_GPU<<<Grid, Block>>>(d_pop, d_minima, d_maxima, devState, D, Np);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
// --- Evaluate population
evaluation_GPU<<<iDivUp(Np, BLOCK_SIZE), BLOCK_SIZE>>>(Np, D, d_pop, d_fobj);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
int a, b, c;
for(int i=0;i<Gmax;i++) {
// --- Generate mutation indices and cross-over uniformly distributed random vector
generate_mutation_indices_and_crossover_values_GPU<<<Num_Blocks_Rand2,BLOCK_SIZE_RAND>>>(d_Rand, d_mutation, Np, D, devState);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
// --- Generate new population
generation_new_population_GPU<<<Grid,Block>>>(d_pop, Np, D, d_npop, F, CR, d_Rand, d_mutation, d_minima, d_maxima);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
// --- Select new population and evaluate it
selection_and_evaluation_GPU<<<Grid,Block>>>(Np, D, d_pop, d_npop, d_fobj);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
find_minimum_GPU(Np, d_fobj, &h_best_dev[i], &h_best_index_dev[i]);
printf("Iteration: %i; best member value: %f: best member index: %i\n", i, h_best_dev[i], h_best_index_dev[i]);
}
// --- Copy the whole final population (D * Np elements), since the best member is indexed below
gpuErrchk(cudaMemcpy(h_pop_dev_res, d_pop, D*Np*sizeof(float), cudaMemcpyDeviceToHost));
for (int i=0; i<D; i++) printf("Variable nr. %i = %f\n", i, h_pop_dev_res[h_best_index_dev[Gmax-1]*D+i]);
return 0;
}


CUDA dynamic parallelism is computing sequentially

I need to write an application that computes some matrices from other matrices. In general, it sums the outer products of the rows of an initial matrix E and multiplies them by some numbers calculated from v and t for each t in a given range. I am a newbie to CUDA, so there might be some very wrong ideas in the implementation. Here is my code, with some explanation in the comments:
#include <cupy/complex.cuh>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
const int BLOCK_SIZE = 16;
const int DIM_SIZE = 16;
const double d_hbar=1.0545718176461565e-34;
extern "C"
struct sum_functor { // sum functor for thrust::transform; sums an array of matrices
int N;
complex<float> *arr;
complex<float> *result;
__host__ __device__ sum_functor(int _N, complex<float>* _arr, complex<float>* _result) : N(_N), arr(_arr), result(_result) {};
__host__ __device__ complex<float> operator()(int n){
complex<float> sum = result[n];
for (int i = 0; i < BLOCK_SIZE; i++) {
sum += arr[N * N * i + n];
}
return sum;
}
};
extern "C" //outer product multiplied by given exp and rho
__global__ void outer(const complex<float>* E, int size, complex<float>* blockResult,
int m, int n, complex<float> exp, complex<float> rho) {
int col = blockIdx.y*blockDim.y+threadIdx.y;
int row = blockIdx.x*blockDim.x+threadIdx.x;
if (row < size && col < size) {
blockResult[row * size + col] = exp * rho * E[m * size + row] * E[n * size + col];
}
}
//compute constants and launch outer product kernels
//outer products are stored in blockResult, i.e. 16 matrices one after another
extern "C"
__global__ void calcBlock(const complex<float>* v, const complex<float>* E, int size, double t,
int rhoind, complex<float>* blockResult, int blockInd) {
int i = threadIdx.x;
int j = i + blockInd;
int m = j / size;
int n = j % size;
if (m < size && n < size) {
const complex<float>hbar(d_hbar);
complex<float> exp = thrust::exp(complex<float>(0, -1)*(v[m] - v[n]) * complex<float>(t)/hbar);
complex<float> rho = E[m * size + rhoind] * E[n * size + rhoind];
dim3 dimGrid((size - 1)/DIM_SIZE + 1, (size - 1) / DIM_SIZE + 1, 1);
dim3 dimBlock(DIM_SIZE, DIM_SIZE, 1);
outer<<<dimGrid, dimBlock>>>(E, size, blockResult + i * size * size, m, n, exp, rho);
}
}
//launch the block calculation, then sum the all matrices in block and add it to the result
//repeat block by block until all size*size matrices in total are summed
extern "C"
__global__ void calcSum(const complex<float>* v, const complex<float>* E, int size, double t, int ind,
int rhoind, complex<float>* blockResult, complex<float>* result, int* resultIndexes) {
for (int i = 0; i < size * size; i += BLOCK_SIZE) {
calcBlock<<<1, BLOCK_SIZE>>>(v, E, size, t, rhoind, blockResult, i);
cudaDeviceSynchronize();
thrust::transform(thrust::device, resultIndexes,
resultIndexes + size * size,
result + ind * size * size, sum_functor(size, blockResult, result + ind * size * size));
}
}
//launch calcSum in parallel for every t in range
extern "C"
__global__ void eigenMethod(const complex<float>* v, const complex<float>* E, int size, const double* t, int t_size,
int rhoind, complex<float>* blockResult, complex<float>* result, int* resultIndexes) {
int i = threadIdx.x;
if (i < t_size) {
calcSum<<<1, 1>>>(v, E, size, t[i], i, rhoind, blockResult + i * BLOCK_SIZE * size * size, result, resultIndexes);
}
}
//main is simplified because I am using CuPy
int main() {
*Calculate E(size * size), v(size)*
*t is a vector of size t_size*
*Initialize blockResult(t_size*BLOCK_SIZE*size*size)*
*resultIndexes(size*size) is an enumeration from 0 to size*size*
*result(t_size*size*size) is filled with zeros*
eigenMethod<<<1, t_size>>>(v, E, size, t, t_size, 0, blockResult, result, resultIndexes);
}
The overall idea might be strange and stupid, but it is working. The problem I've encountered is that all the calcSum kernels called from eigenMethod are scheduled one after another.
The calcSum function and everything above it works fast enough for the purposes for which it was created. The main problem arises when I try to call multiple of these in the eigenMethod function. I have tried benchmarking it and got a linear dependence between runtime and the number of calls. For example, the eigenMethod function with t_size = 32 works almost two times faster than with t_size = 64.
Also, I have tried profiling it, but did not get the information that I wanted, since Nsight Systems does not support CDP (CUDA Dynamic Parallelism) according to the topics I saw. I think that accessing the same part of global memory (the arrays E and v are the same pointers for all functions I call) might be a problem. As a hotfix, I have created individual copies for every calcSum function, but it did not help. Is there a way to compute multiple calcSum kernels in parallel? The benchmark results are listed below (matrix size is 128x128):
t_size | time, s
1      | 0.32
4      | 1.036
8      | 1.9
16     | 3.6
By not specifying a stream you are using the default stream, which is shared among the threads of the same block. Therefore all launches of calcSum go into the same stream and have to be executed one after another. This can be fixed by using explicit streams instead.
"[A]ccessing the same part of global memory" from multiple kernels has nothing to do with this. As long as the kernels are only reading from the same locations and not writing to them, this is not problematic at all. Writing to the same locations would cause race conditions and therefore potentially non-deterministic output, but the CUDA runtime cannot detect this and won't "sequentialize" your kernels to get around it.
As discussed in the comments, I don't think CDP is needed here at all; it is potentially expensive and, in this form, not future-proof, so performance will most probably not be ideal.
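A minimal sketch of the per-thread stream fix, reusing the kernel names from the question (in device code, streams must be created with the cudaStreamNonBlocking flag):
// --- Give every thread its own non-blocking stream, so that the child calcSum grids no longer serialize on the per-block default stream
extern "C"
__global__ void eigenMethod(const complex<float>* v, const complex<float>* E, int size, const double* t, int t_size,
int rhoind, complex<float>* blockResult, complex<float>* result, int* resultIndexes) {
int i = threadIdx.x;
if (i < t_size) {
cudaStream_t s;
cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); // --- device-side stream creation
calcSum<<<1, 1, 0, s>>>(v, E, size, t[i], i, rhoind, blockResult + i * BLOCK_SIZE * size * size, result, resultIndexes);
cudaStreamDestroy(s); // --- safe: the stream is only released after the child grid completes
}
}
The same reasoning applies one level down: the calcBlock launches inside calcSum also go into a default stream, so a similar change may be needed there.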

Computing all-pairs distances between points in different sets with CUDA

I am trying to implement a brute force distance computation algorithm in CUDA.
#define VECTOR_DIM 128
thrust::device_vector<float> feature_data_1;
feature_data_1.resize(VECTOR_DIM * 1000); // 1000 128 dimensional points
thrust::device_vector<float> feature_data_2;
feature_data_2.resize(VECTOR_DIM * 2000); // 2000 128 dimensional points
Now what I would like to do is to compute the L2 distances (sum of the squared differences) from every vector in the first matrix to every vector in the second matrix.
So, if array 1 is of size 1000 and array 2 is of size 2000, the result would be a floating point matrix of 1000*2000 in size.
I was wondering if there is a way to achieve this using Thrust algorithms alone.
Calculating the all-pairs distances between points in two different sets in CUDA can be solved by observing that
||x-y||^2=||x||^2+||y||^2-2*<x,y>
where || || is the l2 norm and <x,y> denotes the scalar product between x and y.
The norms ||x|| and ||y|| can be calculated by approaches inspired by Reduce matrix rows with CUDA, while the scalar products <x,y> can then be calculated as the matrix-matrix multiplication X*Y^T using cublas<t>gemm().
Below is a fully worked out implementation. Please note that for the calculation of the norms || ||, two approaches are reported, one using cuBLAS cublas<t>gemv and one using Thrust's transform. For the problem size of your interest, I have experienced the following timings on my GT540M card:
Approach nr. 1 0.12ms
Approach nr. 2 0.59ms
#include <cublas_v2.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>
#include <thrust/sequence.h>
#include <stdio.h>
#include <iostream>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16
/***********************************************************/
/* SQUARED ABSOLUTE VALUE FUNCTOR - NEEDED FOR APPROACH #1 */
/***********************************************************/
struct abs2 {
__host__ __device__ double operator()(const float &x) const { return x * x; }
};
// --- Required for approach #2
__device__ float *vals;
/******************************************/
/* ROW_REDUCTION - NEEDED FOR APPROACH #2 */
/******************************************/
struct row_reduction {
const int Ncols; // --- Number of columns
row_reduction(int _Ncols) : Ncols(_Ncols) {}
__device__ float operator()(float& x, int& y ) {
float temp = 0.f;
for (int i = 0; i<Ncols; i++)
temp += vals[i + (y*Ncols)] * vals[i + (y*Ncols)];
return temp;
}
};
/************************************************/
/* KERNEL FUNCTION TO ASSEMBLE THE FINAL RESULT */
/************************************************/
__global__ void assemble_final_result(const float * __restrict__ d_norms_x_2, const float * __restrict__ d_norms_y_2, float * __restrict__ d_dots,
const int NX, const int NY) {
const int i = threadIdx.x + blockIdx.x * blockDim.x;
const int j = threadIdx.y + blockIdx.y * blockDim.y;
// --- i spans the NX points of X (x grid dimension), j the NY points of Y (y grid dimension)
if ((i < NX) && (j < NY)) d_dots[j * NX + i] = d_norms_x_2[i] + d_norms_y_2[j] - 2 * d_dots[j * NX + i];
}
/********/
/* MAIN */
/********/
int main()
{
//const int Ndims = 128; // --- Number of rows
//const int NX = 1000; // --- Number of columns
//const int NY = 2000; // --- Number of columns
const int Ndims = 3; // --- Number of rows
const int NX = 4; // --- Number of columns
const int NY = 5; // --- Number of columns
// --- Random uniform integer distribution between 10 and 99
thrust::default_random_engine rng;
thrust::uniform_int_distribution<int> dist(10, 99);
// --- Matrices allocation and initialization
thrust::device_vector<float> d_X(Ndims * NX);
thrust::device_vector<float> d_Y(Ndims * NY);
for (size_t i = 0; i < d_X.size(); i++) d_X[i] = (float)dist(rng);
for (size_t i = 0; i < d_Y.size(); i++) d_Y[i] = (float)dist(rng);
TimingGPU timerGPU;
// --- cuBLAS handle creation
cublasHandle_t handle;
cublasSafeCall(cublasCreate(&handle));
/**********************************************/
/* CALCULATING THE NORMS OF THE ELEMENTS OF X */
/**********************************************/
thrust::device_vector<float> d_norms_x_2(NX);
// --- Approach nr. 1
//timerGPU.StartCounter();
thrust::device_vector<float> d_X_2(Ndims * NX);
thrust::transform(d_X.begin(), d_X.end(), d_X_2.begin(), abs2());
thrust::device_vector<float> d_ones(Ndims, 1.f);
float alpha = 1.f;
float beta = 0.f;
cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Ndims, NX, &alpha, thrust::raw_pointer_cast(d_X_2.data()), Ndims,
thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_norms_x_2.data()), 1));
//printf("Timing for approach #1 = %f\n", timerGPU.GetCounter());
// --- Approach nr. 2
//timerGPU.StartCounter();
// float *s_vals = thrust::raw_pointer_cast(&d_X[0]);
// gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *)));
// thrust::transform(d_norms_x_2.begin(), d_norms_x_2.end(), thrust::counting_iterator<int>(0), d_norms_x_2.begin(), row_reduction(Ndims));
//printf("Timing for approach #2 = %f\n", timerGPU.GetCounter());
/**********************************************/
/* CALCULATING THE NORMS OF THE ELEMENTS OF Y */
/**********************************************/
thrust::device_vector<float> d_norms_y_2(NY);
thrust::device_vector<float> d_Y_2(Ndims * NY);
thrust::transform(d_Y.begin(), d_Y.end(), d_Y_2.begin(), abs2());
cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Ndims, NY, &alpha, thrust::raw_pointer_cast(d_Y_2.data()), Ndims,
thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_norms_y_2.data()), 1));
/***********************************/
/* CALCULATING THE SCALAR PRODUCTS */
/***********************************/
thrust::device_vector<float> d_dots(NX * NY);
cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, NX, NY, Ndims, &alpha,
thrust::raw_pointer_cast(d_X.data()), Ndims, thrust::raw_pointer_cast(d_Y.data()), Ndims, &beta,
thrust::raw_pointer_cast(d_dots.data()), NX));
/*****************************/
/* ASSEMBLE THE FINAL RESULT */
/*****************************/
dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);
dim3 dimGrid(iDivUp(NX, BLOCK_SIZE_X), iDivUp(NY, BLOCK_SIZE_Y));
assemble_final_result<<<dimGrid, dimBlock>>>(thrust::raw_pointer_cast(d_norms_x_2.data()), thrust::raw_pointer_cast(d_norms_y_2.data()),
thrust::raw_pointer_cast(d_dots.data()), NX, NY);
for(int i = 0; i < NX * NY; i++) std::cout << d_dots[i] << "\n";
return 0;
}
The Utilities.cu and Utilities.cuh files are maintained here and omitted. The TimingGPU.cu and TimingGPU.cuh files are maintained here and are omitted as well.

Cholesky decomposition with CUDA

I am trying to implement Cholesky decomposition using the cuSOLVER library. I am a beginner CUDA programmer and I have always specified block sizes and grid sizes, but I am not able to find out how these can be set explicitly by the programmer with cuSOLVER functions.
Here is the documentation: http://docs.nvidia.com/cuda/cusolver/index.html#introduction
The QR decomposition is implemented using the cuSOLVER library (see the example here: http://docs.nvidia.com/cuda/cusolver/index.html#ormqr-example1) and even there the above two parameters are not set.
To summarize, I have the following questions:
How can the parameters block-size and grid-size be set with the cuSOLVER library?
How is the same done in the QR example code mentioned in the NVIDIA documentation?
Robert Crovella has already answered this question. Here, I'm just providing a full example showing how Cholesky decomposition can be easily performed using the potrf function provided by the cuSOLVER library.
The Utilities.cu and Utilities.cuh files are maintained at this page and omitted here. The example implements the CPU as well as the GPU approach.
#include "cuda_runtime.h"
#include "device_launch_paraMeters.h"
#include<iostream>
#include <fstream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include <cusolverDn.h>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
#define prec_save 10
/******************************************/
/* SET HERMITIAN POSITIVE DEFINITE MATRIX */
/******************************************/
// --- Credit to: https://math.stackexchange.com/questions/357980/how-to-generate-random-symmetric-positive-definite-matrices-using-matlab
void setPDMatrix(double * __restrict h_A, const int N) {
// --- Initialize random seed
srand(time(NULL));
double *h_A_temp = (double *)malloc(N * N * sizeof(double));
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A_temp[i * N + j] = (float)rand() / (float)RAND_MAX;
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A[i * N + j] = 0.5 * (h_A_temp[i * N + j] + h_A_temp[j * N + i]);
for (int i = 0; i < N; i++) h_A[i * N + i] = h_A[i * N + i] + N;
}
/************************************/
/* SAVE REAL ARRAY FROM CPU TO FILE */
/************************************/
template <class T>
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();
}
/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
T *h_in = (T *)malloc(M * sizeof(T));
gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();
}
/********/
/* MAIN */
/********/
int main(){
const int N = 1000;
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle;
cusolveSafeCall(cusolverDnCreate(&solver_handle));
// --- CUBLAS initialization
cublasHandle_t cublas_handle;
cublasSafeCall(cublasCreate(&cublas_handle));
/***********************/
/* SETTING THE PROBLEM */
/***********************/
// --- Setting the host, N x N matrix
double *h_A = (double *)malloc(N * N * sizeof(double));
setPDMatrix(h_A, N);
// --- Allocate device space for the input matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * sizeof(double)));
// --- Move the relevant matrix from host to device
gpuErrchk(cudaMemcpy(d_A, h_A, N * N * sizeof(double), cudaMemcpyHostToDevice));
/****************************************/
/* COMPUTING THE CHOLESKY DECOMPOSITION */
/****************************************/
// --- cuSOLVE input/output parameters/arrays
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
// --- CUDA CHOLESKY initialization
cusolveSafeCall(cusolverDnDpotrf_bufferSize(solver_handle, CUBLAS_FILL_MODE_LOWER, N, d_A, N, &work_size));
// --- CUDA POTRF execution
double *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));
cusolveSafeCall(cusolverDnDpotrf(solver_handle, CUBLAS_FILL_MODE_LOWER, N, d_A, N, work, work_size, devInfo));
int devInfo_h = 0; gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
if (devInfo_h != 0) std::cout << "Unsuccessful potrf execution\n\n" << "devInfo = " << devInfo_h << "\n\n";
// --- At this point, the lower triangular part of A contains the elements of L.
/***************************************/
/* CHECKING THE CHOLESKY DECOMPOSITION */
/***************************************/
saveCPUrealtxt(h_A, "D:\\Project\\solveSquareLinearSystemCholeskyCUDA\\solveSquareLinearSystemCholeskyCUDA\\h_A.txt", N * N);
saveGPUrealtxt(d_A, "D:\\Project\\solveSquareLinearSystemCholeskyCUDA\\solveSquareLinearSystemCholeskyCUDA\\d_A.txt", N * N);
cusolveSafeCall(cusolverDnDestroy(solver_handle));
return 0;
}
EDIT
Cholesky decomposition requires that the relevant matrix is Hermitian and positive definite. Symmetric and positive definite matrices can be generated by the approach in How to generate random symmetric positive definite matrices using MATLAB?.
The following Matlab code can be used for checking the results
clear all
close all
clc
warning off
N = 1000;
% --- Setting the problem solution
x = ones(N, 1);
load h_A.txt
A = reshape(h_A, N, N);
yMatlab = A * x;
Lmatlab = chol(A, 'lower');
xprime = inv(Lmatlab) * yMatlab;
xMatlab = inv(Lmatlab') * xprime;
fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));
load d_A.txt
LCUDA = tril(reshape(d_A, N, N));
fprintf('Percentage rms of Cholesky decompositions in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(Lmatlab - LCUDA).^2)) / sum(sum(abs(Lmatlab).^2))));
load xCUDA.txt
fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xCUDA - x).^2)) / sum(sum(abs(x).^2))));
You don't set block sizes (or grid sizes) explicitly when using a library like cuSOLVER, cuBLAS, or cuSPARSE.
The library chooses these when it actually runs device code internal to the library.

Solving tridiagonal linear systems in CUDA

I am trying to implement a tridiagonal system solver based on the Cyclic Reduction method on my GTS450.
Cyclic Reduction is illustrated in this paper
Y. Zhang, J. Cohen, J.D. Owens, "Fast Tridiagonal Solvers on GPU"
However, whatever I do, my CUDA code is far slower than the sequential counterpart. My result for a total of 512 x 512 points is 7 ms, while the sequential code on my i7 3.4 GHz takes 5 ms. The GPU is not accelerating!
Which could be the problem?
#include "cutrid.cuh"
__global__ void cutrid_RC_1b(double *a,double *b,double *c,double *d,double *x)
{
int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
int idx=threadIdx.x;
__shared__ double asub[512];
__shared__ double bsub[512];
__shared__ double csub[512];
__shared__ double dsub[512];
double at=0;
double bt=0;
double ct=0;
double dt=0;
asub[idx]=a[idx_global];
bsub[idx]=b[idx_global];
csub[idx]=c[idx_global];
dsub[idx]=d[idx_global];
for(int stride=1;stride<N;stride*=2)
{
int margin_left,margin_right;
margin_left=idx-stride;
margin_right=idx+stride;
at=(margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f;
bt=bsub[idx]+((margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
-((margin_right<512)?asub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f);
ct=(margin_right<512)?(-csub[idx+stride]*asub[idx]/bsub[idx+stride]):0.f;
dt=dsub[idx]+((margin_left>=0)?(-dsub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
-((margin_right<512)?dsub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f);
__syncthreads();
asub[idx]=at;
bsub[idx]=bt;
csub[idx]=ct;
dsub[idx]=dt;
__syncthreads();
}
x[idx_global]=dsub[idx]/bsub[idx];
}/*}}}*/
I launched this kernel as cutrid_RC_1b<<<512,512>>>(d_a,d_b,d_c,d_d,d_x), and reached 100% device occupancy. This result has puzzled me for days.
There is an improved version of my code:
#include "cutrid.cuh"
__global__ void cutrid_RC_1b(float *a,float *b,float *c,float *d,float *x)
{/*{{{*/
int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
int idx=threadIdx.x;
__shared__ float asub[512];
__shared__ float bsub[512];
__shared__ float csub[512];
__shared__ float dsub[512];
asub[idx]=a[idx_global];
bsub[idx]=b[idx_global];
csub[idx]=c[idx_global];
dsub[idx]=d[idx_global];
__syncthreads();
//Reduction
for(int stride=1;stride<512;stride*=2)
{
int margin_left=(idx-stride);
int margin_right=(idx+stride);
if(margin_left<0) margin_left=0;
if(margin_right>=512) margin_right=511;
float tmp1 = asub[idx] / bsub[margin_left];
float tmp2 = csub[idx] / bsub[margin_right];
float tmp3 = dsub[margin_right];
float tmp4 = dsub[margin_left];
__syncthreads();
dsub[idx] = dsub[idx] - tmp4*tmp1-tmp3*tmp2;
bsub[idx] = bsub[idx]-csub[margin_left]*tmp1-asub[margin_right]*tmp2;
tmp3 = -csub[margin_right];
tmp4 = -asub[margin_left];
__syncthreads();
asub[idx] = tmp3*tmp1;
csub[idx] = tmp4*tmp2;
__syncthreads();
}
x[idx_global]=dsub[idx]/bsub[idx];
}/*}}}*/
The speed improved to 0.73 ms on a Quadro K4000 for a 512 x 512 system; however, the code in the mentioned paper runs in 0.5 ms on a GTX 280.
Solving a tridiagonal system of equations is a challenging parallel problem since the classical solution scheme, i.e., Gaussian elimination, is inherently sequential.
Cyclic Reduction consists of two phases:
Forward Reduction. The original system is split into two independent tridiagonal systems for two sets of unknowns, the ones with odd index and the ones with even index. Such systems can be solved independently and this step can be seen as the first of a divide et impera scheme. The two smaller systems are split again in the same way into two subsystems each, and the process is repeated until a system of only 2 equations is reached.
Backward Substitution. The system of 2 equations is solved first. Then, the divide et impera structure is climbed up by solving the sub-systems independently on different cores.
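In formulas, writing equation i of the system as a(i)*x(i-s) + b(i)*x(i) + c(i)*x(i+s) = d(i), one reduction step with stride s eliminates x(i-s) and x(i+s) from equation i by means of the factors k1 = a(i)/b(i-s) and k2 = c(i)/b(i+s):
a'(i) = -a(i-s)*k1
b'(i) = b(i) - c(i-s)*k1 - a(i+s)*k2
c'(i) = -c(i+s)*k2
d'(i) = d(i) - d(i-s)*k1 - d(i+s)*k2
These are exactly the updates computed through tmp1 and tmp2 in the forward elimination loop of the code reported below.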
I'm not sure (but correct me if I'm wrong) that your code will return consistent results. N does not appear to be defined. Also, you are accessing csub[idx-stride], but I'm not sure what that means when idx==0 and stride>1. Furthermore, you are using several conditional statements, essentially for boundary checking. Finally, your code lacks a proper thread structure capable of dealing with the mentioned divide et impera scheme, conceptually pretty much like the one used in the CUDA SDK reduction samples.
As mentioned in one of my comments above, I remembered that at tridiagonalsolvers you can find an implementation of the Cyclic Reduction scheme for solving tridiagonal equation systems. Browsing the related Google pages, it seems to me that the code is maintained, among others, by the first author of the above paper (Yao Zhang). The code is copied and pasted below. Note that the boundary check is done only once (if (iRight >= systemSize) iRight = systemSize - 1;), thus limiting the number of conditional statements involved. Note also the thread structure, capable of dealing with the divide et impera scheme.
The code by Zhang, Cohen and Owens
__global__ void crKernel(T *d_a, T *d_b, T *d_c, T *d_d, T *d_x)
{
int thid = threadIdx.x;
int blid = blockIdx.x;
int stride = 1;
int numThreads = blockDim.x;
const unsigned int systemSize = blockDim.x * 2;
int iteration = (int)log2(T(systemSize/2));
#ifdef GPU_PRINTF
if (thid == 0 && blid == 0) printf("iteration = %d\n", iteration);
#endif
__syncthreads();
extern __shared__ char shared[];
T* a = (T*)shared;
T* b = (T*)&a[systemSize];
T* c = (T*)&b[systemSize];
T* d = (T*)&c[systemSize];
T* x = (T*)&d[systemSize];
a[thid] = d_a[thid + blid * systemSize];
a[thid + blockDim.x] = d_a[thid + blockDim.x + blid * systemSize];
b[thid] = d_b[thid + blid * systemSize];
b[thid + blockDim.x] = d_b[thid + blockDim.x + blid * systemSize];
c[thid] = d_c[thid + blid * systemSize];
c[thid + blockDim.x] = d_c[thid + blockDim.x + blid * systemSize];
d[thid] = d_d[thid + blid * systemSize];
d[thid + blockDim.x] = d_d[thid + blockDim.x + blid * systemSize];
__syncthreads();
//forward elimination
for (int j = 0; j <iteration; j++)
{
__syncthreads();
stride *= 2;
int delta = stride/2;
if (threadIdx.x < numThreads)
{
int i = stride * threadIdx.x + stride - 1;
int iLeft = i - delta;
int iRight = i + delta;
if (iRight >= systemSize) iRight = systemSize - 1;
T tmp1 = a[i] / b[iLeft];
T tmp2 = c[i] / b[iRight];
b[i] = b[i] - c[iLeft] * tmp1 - a[iRight] * tmp2;
d[i] = d[i] - d[iLeft] * tmp1 - d[iRight] * tmp2;
a[i] = -a[iLeft] * tmp1;
c[i] = -c[iRight] * tmp2;
}
numThreads /= 2;
}
if (thid < 2)
{
int addr1 = stride - 1;
int addr2 = 2 * stride - 1;
T tmp3 = b[addr2]*b[addr1]-c[addr1]*a[addr2];
x[addr1] = (b[addr2]*d[addr1]-c[addr1]*d[addr2])/tmp3;
x[addr2] = (d[addr2]*b[addr1]-d[addr1]*a[addr2])/tmp3;
}
// backward substitution
numThreads = 2;
for (int j = 0; j <iteration; j++)
{
int delta = stride/2;
__syncthreads();
if (thid < numThreads)
{
int i = stride * thid + stride/2 - 1;
if(i == delta - 1)
x[i] = (d[i] - c[i]*x[i+delta])/b[i];
else
x[i] = (d[i] - a[i]*x[i-delta] - c[i]*x[i+delta])/b[i];
}
stride /= 2;
numThreads *= 2;
}
__syncthreads();
d_x[thid + blid * systemSize] = x[thid];
d_x[thid + blockDim.x + blid * systemSize] = x[thid + blockDim.x];
}
I want to add a further answer to mention that tridiagonal systems can be easily solved in the framework of the cuSPARSE library by means of the function
cusparse<t>gtsv()
cuSPARSE also provides
cusparse<t>gtsv_nopivot()
which, at variance with the first mentioned routine, does not perform pivoting. Both the above functions solve the same linear system with multiple right-hand sides. A batched routine
cusparse<t>gtsvStridedBatch()
also exists which solves multiple linear systems.
For all the above routines, the system matrix is specified by simply providing the lower diagonal, the main diagonal and the upper diagonal.
Below, I'm reporting a fully worked out example using cusparse<t>gtsv() to solve a tridiagonal linear system.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if(CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs",__FILE__, __LINE__,err, \
_cusparseGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
const int N = 5; // --- Size of the linear system
// --- Lower diagonal, diagonal and upper diagonal of the system matrix
double *h_ld = (double*)malloc(N * sizeof(double));
double *h_d = (double*)malloc(N * sizeof(double));
double *h_ud = (double*)malloc(N * sizeof(double));
h_ld[0] = 0.;
h_ud[N-1] = 0.;
for (int k = 0; k < N - 1; k++) {
h_ld[k + 1] = -1.;
h_ud[k] = -1.;
}
for (int k = 0; k < N; k++) h_d[k] = 2.;
double *d_ld; gpuErrchk(cudaMalloc(&d_ld, N * sizeof(double)));
double *d_d; gpuErrchk(cudaMalloc(&d_d, N * sizeof(double)));
double *d_ud; gpuErrchk(cudaMalloc(&d_ud, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_ld, h_ld, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_d, h_d, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_ud, h_ud, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating and defining dense host and device data vectors
double *h_x = (double *)malloc(N * sizeof(double));
h_x[0] = 100.0; h_x[1] = 200.0; h_x[2] = 400.0; h_x[3] = 500.0; h_x[4] = 300.0;
double *d_x; gpuErrchk(cudaMalloc(&d_x, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_x, h_x, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating the host and device side result vector (note: cusparse<t>gtsv() overwrites d_x with the solution in place)
double *h_y = (double *)malloc(N * sizeof(double));
double *d_y; gpuErrchk(cudaMalloc(&d_y, N * sizeof(double)));
cusparseSafeCall(cusparseDgtsv(handle, N, 1, d_ld, d_d, d_ud, d_x, N));
gpuErrchk(cudaMemcpy(h_x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost));
for (int k=0; k<N; k++) printf("%f\n", h_x[k]);
}
At this GitHub repository, a comparison of different CUDA routines available in the cuSOLVER library for the solution of tridiagonal linear systems is reported.
Things I see:
The first __syncthreads() seems redundant.
There are repetitive sets of operations, such as (-csub[idx-stride]*asub[idx]/bsub[idx-stride]), in your code. Use intermediate variables to hold the result and reuse them, instead of making the GPU calculate those sets each time; see the sketch below.
Use the NVIDIA profiler to see where the issues are.
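For instance, the loop body of the first kernel can be rewritten with the repeated guarded products hoisted into temporaries (same expressions and guards as the original):
// --- Each guarded product is evaluated once and reused, instead of being recomputed inside at, bt and dt
double fl = (margin_left>=0) ? (-csub[idx-stride]*asub[idx]/bsub[idx-stride]) : 0.;
double gl = (margin_left>=0) ? (-dsub[idx-stride]*asub[idx]/bsub[idx-stride]) : 0.;
double fr = (margin_right<512) ? (asub[idx+stride]*csub[idx]/bsub[idx+stride]) : 0.;
double gr = (margin_right<512) ? (dsub[idx+stride]*csub[idx]/bsub[idx+stride]) : 0.;
at = fl;
bt = bsub[idx] + fl - fr;
ct = (margin_right<512) ? (-csub[idx+stride]*asub[idx]/bsub[idx+stride]) : 0.;
dt = dsub[idx] + gl - gr;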

Performing several 1D moving averages in parallel using CUDA Thrust

I'm not a programmer with any abilities. Just someone curious about CUDA, and so I'm doing a little reading. I ran across an example of using Thrust to do a moving average:
Simple Moving Average Thrust Example
The example, such as it is, runs and mostly works correctly. However, it's trivial in the sense that it only does one moving average operation.
How would I do, say, 352 of these moving average operations in parallel, all operating on the same data stream? In my mind the program flow might be:
Generate the data and send it to one CUDA core (same as the existing code, but think lengths of 1000 or 10000 instead of 30).
Copy it from the CUDA core it's in to all of the other 351 CUDA cores in my GTX 465.
Tell each CUDA core what number of data items to average over (4, 5, 6, ..., 352, 353, 354).
Tell the device to run the average in each core in parallel.
Read back the results from each core.
I get that this code
// compute SMA using standard summation
simple_moving_average(data, w, averages);
makes it all happen, but how to I get Thrust to do many of these in parallel?
My interest here is about something like stock data. If I'm looking at GOOG prices I'd put that in the GPU using all cores and leave it there. I'd then be free to do lots of processing without loading the data anymore and just reading back results from each core. NOTE: I might not want to use GOOG in all cores. Some cores might be GOOG, others with some other symbol, but I'll get there later. I'm just thinking I don't want the stock data in global memory if there's enough room in each core.
I assume this is pretty straightforward for CUDA & Thrust?
Here is a possible way to do this with ArrayFire. Note that I am NOT affiliated with this library whatsoever. I am pretty sure this can also be done with Thrust, but I found it a lot simpler with ArrayFire; and if the library is free, why can't I use it instead of Thrust?
In ArrayFire you can use a matrix to run several SMA operations in parallel:
unsigned n_SMAs = 1000; // # of SMA indicators to evaluate
unsigned len = 2000; // # of stock prices per indicator
unsigned w = 6; // window size
// generate stock prices: [0..10]
af::array data = af::randu(n_SMAs, len) * 10;
// compute inclusive prefix sums along columns of the matrix
af::array s = af::accum(data, 1);
// compute the average
af::array avg = (s.cols(w, af::end) - s.cols(0, af::end - w)) / w;
af::eval(avg);
std::cout << avg.dims() << "\n" << avg << "\n";
Let me know if that's what you are looking for. This is how I understood your question: compute several SMA indicators in parallel.
My understanding is that you are interested in the following two situations:
You have a long sequence of items and you want to calculate a certain number of averages, by averaging over different numbers of items, i.e., using different lengths for the moving average window. This is what I understand from your original question.
You have a series of sequences, stored consecutively in memory, and you want to average them in parallel with a fixed averaging window of size 2 * RADIUS + 1. This is what the ArrayFire code proposed by #asm does - you have accepted it.
Instead of using CUDA Thrust, I think it would be easier to write your own CUDA kernel to do the above operations. Below is a fully worked example that operates in the same way as the ArrayFire code proposed by #asm, thus covering case #2. Modifying it to cover case #1 would be straightforward; a sketch is given after the example.
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCK_SIZE_X 8
#define BLOCK_SIZE_Y 8
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, unsigned int M, unsigned int N) {
__shared__ unsigned int temp[BLOCK_SIZE_Y][BLOCK_SIZE_X + 2 * RADIUS];
unsigned int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int gindexy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int gindex = gindexy * N + gindexx;
unsigned int lindexx = threadIdx.x + RADIUS;
unsigned int lindexy = threadIdx.y;
// --- Read input elements into shared memory
temp[lindexy][lindexx] = ((gindexx < N)&&(gindexy < M))? in[gindex] : 0;
if (threadIdx.x < RADIUS) {
temp[lindexy][threadIdx.x] = ((gindexx >= RADIUS)&&(gindexx < (N + RADIUS))&&(gindexy < M)) ? in[gindex - RADIUS] : 0;
temp[lindexy][threadIdx.x + (RADIUS + min(BLOCK_SIZE_X, N - blockIdx.x * BLOCK_SIZE_X))] = (((gindexx + min(BLOCK_SIZE_X, N - blockIdx.x * BLOCK_SIZE_X)) < N)&&(gindexy < M))? in[gindexy * N + gindexx + min(BLOCK_SIZE_X, N - blockIdx.x * BLOCK_SIZE_X)] : 0;
if ((threadIdx.y == 0)&&(gindexy < M)&&((gindexx + BLOCK_SIZE_X) < N)&&(gindexy < M)) printf("Inside 2 - tidx = %i; bidx = %i; tidy = %i; bidy = %i; lindexx = %i; temp = %i\n", threadIdx.x, blockIdx.x, threadIdx.y, blockIdx.y, threadIdx.x + (RADIUS + BLOCK_SIZE_X), temp[lindexy][threadIdx.x + (RADIUS + BLOCK_SIZE_X)]);
}
__syncthreads();
// --- Apply the stencil
unsigned int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
result += temp[lindexy][lindexx + offset];
}
// --- Store the result
out[gindexy * N + gindexx] = result;
}
/********/
/* MAIN */
/********/
int main() {
const unsigned int M = 2;
const unsigned int N = 4 + 2 * RADIUS;
const unsigned int constant = 3;
thrust::device_vector<unsigned int> d_in(M * N, constant);
thrust::device_vector<unsigned int> d_out(M * N);
dim3 GridSize(iDivUp(N, BLOCK_SIZE_X), iDivUp(M, BLOCK_SIZE_Y));
dim3 BlockSize(BLOCK_SIZE_X, BLOCK_SIZE_Y);
moving_average<<<GridSize, BlockSize>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), M, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<unsigned int> h_out = d_out;
for (int j=0; j<M; j++) {
for (int i=0; i<N; i++)
printf("Element j = %i; i = %i; h_out = %i\n", j, i, h_out[N*j+i]);
}
return 0;
}
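Finally, a minimal sketch covering case #1 (all names here are hypothetical): one grid row per window length, with each thread averaging win[j] consecutive samples of the same input sequence, mirroring the windows of 4, 5, ..., 354 items mentioned in the question.
#include <thrust/device_vector.h>
// --- out[j * N + i] is the average of the win[j] samples in[i], ..., in[i + win[j] - 1]
__global__ void multi_window_moving_average(const float * __restrict__ in, float * __restrict__ out, const int * __restrict__ win,
const int numWin, const int N) {
int i = threadIdx.x + blockIdx.x * blockDim.x; // --- position along the sequence
int j = blockIdx.y; // --- which window length
if ((j < numWin) && (i + win[j] <= N)) {
float sum = 0.f;
for (int k = 0; k < win[j]; k++) sum += in[i + k];
out[j * N + i] = sum / win[j];
}
}
int main() {
const int N = 1024, numWin = 4;
const int h_win[numWin] = {4, 5, 6, 7}; // --- window lengths, one output row each
thrust::device_vector<float> d_in(N, 1.f), d_out(numWin * N, 0.f);
thrust::device_vector<int> d_win(h_win, h_win + numWin);
dim3 block(256, 1), grid((N + block.x - 1) / block.x, numWin);
multi_window_moving_average<<<grid, block>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), thrust::raw_pointer_cast(d_win.data()), numWin, N);
cudaDeviceSynchronize();
return 0;
}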