CUDA dynamic parallelism is computing sequentially - cuda

I need to write an application that computes some matrices from other matrices. In general, it sums outer products of rows of initial matrix E and multiplies it by some numbers calculated from v and t for each t in a given range. I am a newbie to CUDA, so there might be some very wrong ideas in the implementation. So, there is my code and some explanation in comments:
#include <cupy/complex.cuh>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
const int BLOCK_SIZE = 16;
const int DIM_SIZE = 16;
const double d_hbar=1.0545718176461565e-34;
extern "C"
struct sum_functor { //sum functor for thrust::transform, summs array of matrices
int N;
complex<float> *arr;
complex<float> *result;
__host__ __device__ sum_functor(int _N, complex<float>* _arr, complex<float>* _result) : N(_N), arr(_arr), result(_result) {};
__host__ __device__ complex<float> operator()(int n){
complex<float> sum = result[n];
for (int i = 0; i < BLOCK_SIZE; i++) {
sum += arr[N * N * i + n];
}
return sum;
}
};
extern "C" //outer product multiplied by given exp and rho
__global__ void outer(const complex<float>* E, int size, complex<float>* blockResult,
int m, int n, complex<float> exp, complex<float> rho) {
int col = blockIdx.y*blockDim.y+threadIdx.y;
int row = blockIdx.x*blockDim.x+threadIdx.x;
if (row < size && col < size) {
blockResult[row * size + col] = exp * rho * E[m * size + row] * E[n * size + col];
}
}
//compute constants and launch outer product kernels
//outer products are stored in blockResult, i.e. 16 matrices one after another
extern "C"
__global__ void calcBlock(const complex<float>* v, const complex<float>* E, int size, double t,
int rhoind, complex<float>* blockResult, int blockInd) {
int i = threadIdx.x;
int j = i + blockInd;
int m = j / size;
int n = j % size;
if (m < size && n < size) {
const complex<float>hbar(d_hbar);
complex<float> exp = thrust::exp(complex<float>(0, -1)*(v[m] - v[n]) * complex<float>(t)/hbar);
complex<float> rho = E[m * size + rhoind] * E[n * size + rhoind];
dim3 dimGrid((size - 1)/DIM_SIZE + 1, (size - 1) / DIM_SIZE + 1, 1);
dim3 dimBlock(DIM_SIZE, DIM_SIZE, 1);
outer<<<dimGrid, dimBlock>>>(E, size, blockResult + i * size * size, m, n, exp, rho);
}
}
//launch the block calculation, then sum the all matrices in block and add it to the result
//repeat block by block until all size*size matrices in total are summed
extern "C"
__global__ void calcSum(const complex<float>* v, const complex<float>* E, int size, double t, int ind,
int rhoind, complex<float>* blockResult, complex<float>* result, int* resultIndexes) {
for (int i = 0; i < size * size; i += BLOCK_SIZE) {
calcBlock<<<1, BLOCK_SIZE>>>(v, E, size, t, rhoind, blockResult, i);
cudaDeviceSynchronize();
thrust::transform(thrust::device, resultIndexes,
resultIndexes + size * size,
result + ind * size * size, sum_functor(size, blockResult, result + ind * size * size));
}
}
//launch calcSum in parallel for every t in range
extern "C"
__global__ void eigenMethod(const complex<float>* v, const complex<float>* E, int size, const double* t, int t_size,
int rhoind, complex<float>* blockResult, complex<float>* result, int* resultIndexes) {
int i = threadIdx.x;
if (i < t_size) {
calcSum<<<1, 1>>>(v, E, size, t[i], i, rhoind, blockResult + i * BLOCK_SIZE * size * size, result, resultIndexes);
}
}
//main is simplified cause I am using CuPy
int main() {
*Calculate E(size * size), v(size)*
*t is vector of size t_size*
*Initialize blockResult(t_size*BLOCK_SIZE*size*size)*
*resultIndexes(size*size) is enumerate from 0 to size * size)*
*result(t_size*size*size) filled with zeros*
eigenMetod<<<1, t_size>>>(v, E, size, t, t_size, 0, blockResult, result, resultIndexes)
}
The overall idea might be strange and stupid, but it is working. Thus, the problem I've encountered is that all calcSum kernels that are called from eigenMethod are scheduled one after another.
The calcSum function and everything above works fast enough for the purposes for which it was created. The main problem is that when I am trying to call multiple of these in the eigenMethod function. I have tried benchmarking it and got a linear dependence between runtime and the number of calls. For example, the eigenMethod function with t_size = 32 works almost two times faster than with t_size = 64.
Also, I have tried profiling it, but did not get the information that I wanted since Nsight Systems does not support CDP (CUDA Dynamic Parallelism) according to the the topics I saw. I think that accessing the same part of global memory (arrays E and v are the same pointer for all functions I call) might be a problem. As a hotfix, I have created individual copies for every calcSum function, but it did not help. Is there a way to compute multiple calcSum kernels in parallel? The benchmark results are listed below (matrix size is 128x128):
t_size
time, s
1
0.32
4
1.036
8
1.9
16
3.6

By not specifying a stream you are using the default stream which is shared among threads of the same block. Therefore all launches of calcSum go into the same stream and have to be executed after another. This can be fixed by using explicit streams instead.
"[A]ccessing the same part of global memory" from multiple kernels has nothing to do with this. As long as the kernels are only reading from the same locations and not writing to them, this is not problematic at all. Writing to the same locations would cause race conditions and therefore potentially non-deterministic output, but the CUDA runtime can not detect this and wont "sequentialize" your kernels to get around it.
As discussed in the comments, I don't think CDP is needed here at all and it is potentially expensive and in this form not future-proof. So performance will most probably not be ideal.

Related

Wrong sizes for block reduction in CUDA?

I was checking out this sum_reduction.cu example and tutorial and noticed that for certain problem sizes it doesn't work e.g. it works with problem size n=2000 but not with n=3000. Apparently it always work with problem sizes that are multiple of the block size but neither the tutorial nor the example code states so. The question is, does this reduction algorithm only works for certain problem sizes? the example they chose N=256k which is even, a power of two and also multiple of the block size 512.
For self containment I paste the most important bits of (a template version of) the code here:
template<typename T>
__global__ void kernelSum(const T* __restrict__ input, T* __restrict__ per_block_results, const size_t n) {
extern __shared__ T sdata[];
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
// load input into __shared__ memory
T x = 0.0;
if (tid < n) {
x = input[tid];
}
sdata[threadIdx.x] = x;
__syncthreads();
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
if(threadIdx.x < offset) {
// add a partial sum upstream to our own
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
// wait until all threads in the block have
// updated their partial sums
__syncthreads();
}
// thread 0 writes the final result
if(threadIdx.x == 0) {
per_block_results[blockIdx.x] = sdata[0];
}
}
and to invoke the kernel:
// launch one kernel to compute, per-block, a partial sum
block_sum<double> <<<num_blocks,block_size,block_size * sizeof(double)>>>(d_input, d_partial_sums_and_total, num_elements);
// launch a single block to compute the sum of the partial sums
block_sum<double> <<<1,num_blocks,num_blocks * sizeof(double)>>>(d_partial_sums_and_total, d_partial_sums_and_total + num_blocks, num_blocks);
To my understanding if the problem size is smaller than the block reduction this statement T x = 0.0; ensures that the element is zeroed out and thus should work but it doesn't?
UPDATE: I am sorry the float/double thing was a typo while preparing the question and not the real problem.
The code you have posted is not consistent, as your templated kernel
is called kernelSum but you are invoking something called
block_sum.
Furthermore, I don't believe your usage of the templated kernel
function could possibly be correct as written:
block_sum<double> <<<num_blocks,block_size,block_size * sizeof(float)>>>(d_input, d_partial_sums_and_total, num_elements);
^ ^
| these types are required to match |
The kernel template is being instantiated with type double. Therefore it is expecting enough shared memory to store block_size double quantities, based on this line:
extern __shared__ T sdata[];
But you are only passing half of the required storage:
block_size * sizeof(float)
I believe that's going to give you unexpected results.
The reduction as written does expect that the block
dimension is a power of 2, due to this loop:
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
This is not likely to be an issue on the first kernel call, because you are probably choosing a power of two for the number of threads per block (block_size):
block_sum<double> <<<num_blocks,block_size,...
However, for the second kernel call, this will depend on whether num_blocks is a power of two, which depends on your grid calculations, which you haven't shown:
block_sum<double> <<<1,num_blocks,...
Finally, the first kernel launch will fail if num_blocks exceeds the limit for your device. This may happen for very large data sets but probably not for size 3000, and it depends on your grid calculations which you haven't shown.
Item 3 above is a difficult requirement to satisfy on the fly for arbitrary vector sizes. Therefore I would suggest an alternate reduction strategy to handle arbitrary sized vectors. For this I would suggest that you study the CUDA reduction sample code and presentation.
Here's a complete program, mostly based on the code you have shown, that has the above issues addressed, and seems to work for me for a size of 3000:
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 3000
#define nTPB 256
template<typename T>
__global__ void block_sum(const T* __restrict__ input, T* __restrict__ per_block_results, const size_t n) {
extern __shared__ T sdata[];
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
// load input into __shared__ memory
T x = 0.0;
if (tid < n) {
x = input[tid];
}
sdata[threadIdx.x] = x;
__syncthreads();
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
if(threadIdx.x < offset) {
// add a partial sum upstream to our own
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
// wait until all threads in the block have
// updated their partial sums
__syncthreads();
}
// thread 0 writes the final result
if(threadIdx.x == 0) {
per_block_results[blockIdx.x] = sdata[0];
}
}
int main(){
double *d_input, *d_partial_sums_and_total, *h_input, *h_partial_sums_and_total;
int num_elements=DSIZE;
int block_size = nTPB;
int num_blocks = (num_elements + block_size -1)/block_size;
// bump num_blocks up to the next power of 2
int done = 0;
int test_val = 1;
while (!done){
if (test_val >= num_blocks){
num_blocks = test_val;
done = 1;}
else test_val *= 2;
if (test_val > 65535) {printf("blocks failure\n"); exit(1);}
}
h_input = (double *)malloc(num_elements * sizeof(double));
h_partial_sums_and_total = (double *)malloc((num_blocks+1)*sizeof(double));
cudaMalloc((void **)&d_input, num_elements * sizeof(double));
cudaMalloc((void **)&d_partial_sums_and_total, (num_blocks+1)*sizeof(double));
double h_result = 0.0;
for (int i = 0; i < num_elements; i++) {
h_input[i] = rand()/(double)RAND_MAX;
h_result += h_input[i];}
cudaMemcpy(d_input, h_input, num_elements*sizeof(double), cudaMemcpyHostToDevice);
cudaMemset(d_partial_sums_and_total, 0, (num_blocks+1)*sizeof(double));
// launch one kernel to compute, per-block, a partial sum
block_sum<double> <<<num_blocks,block_size,block_size * sizeof(double)>>>(d_input, d_partial_sums_and_total, num_elements);
// launch a single block to compute the sum of the partial sums
block_sum<double> <<<1,num_blocks,num_blocks * sizeof(double)>>>(d_partial_sums_and_total, d_partial_sums_and_total + num_blocks, num_blocks);
cudaMemcpy(h_partial_sums_and_total, d_partial_sums_and_total, (num_blocks+1)*sizeof(double), cudaMemcpyDeviceToHost);
printf("host result = %lf\n", h_result);
printf("device result = %lf\n", h_partial_sums_and_total[num_blocks]);
}
For brevity/readability, I have dispensed with error checking in the above code. When having difficulty with a cuda code, you should always do proper cuda error checking.
Also, in the future, you will make it easier for others to help you if you post a complete code to demonstrate what you are doing, as I have done above.

Solving tridiagonal linear systems in CUDA

I am trying to implement a tridiagonal system solver based on the Cyclic Reduction method on my GTS450.
Cyclic Reduction is illustrated in this paper
Y. Zhang, J. Cohen, J.D. Owens, "Fast Tridiagonal Solvers on GPU"
However, whatever I do, my CUDA code is far slower than the sequential counterpart. My result for a total of 512 x 512 points is 7ms, however on my i7 3.4GHz it is 5ms. The GPU is not accelerating!
Which could be the problem?
#include "cutrid.cuh"
__global__ void cutrid_RC_1b(double *a,double *b,double *c,double *d,double *x)
{
int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
int idx=threadIdx.x;
__shared__ double asub[512];
__shared__ double bsub[512];
__shared__ double csub[512];
__shared__ double dsub[512];
double at=0;
double bt=0;
double ct=0;
double dt=0;
asub[idx]=a[idx_global];
bsub[idx]=b[idx_global];
csub[idx]=c[idx_global];
dsub[idx]=d[idx_global];
for(int stride=1;stride<N;stride*=2)
{
int margin_left,margin_right;
margin_left=idx-stride;
margin_right=idx+stride;
at=(margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f;
bt=bsub[idx]+((margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
-((margin_right<512)?asub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f);
ct=(margin_right<512)?(-csub[idx+stride]*asub[idx]/bsub[idx+stride]):0.f;
dt=dsub[idx]+((margin_left>=0)?(-dsub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
-((margin_right<512)?dsub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f);
__syncthreads();
asub[idx]=at;
bsub[idx]=bt;
csub[idx]=ct;
dsub[idx]=dt;
__syncthreads();
}
x[idx_global]=dsub[idx]/bsub[idx];
}/*}}}*/
I launched this kernel by cutrid_RC_1b<<<512,512>>>(d_a,d_b,d_c,d_d,d_x), and reached 100% device occupancy. This result has puzzled me for days.
There is an improved version of my code:
#include "cutrid.cuh"
__global__ void cutrid_RC_1b(float *a,float *b,float *c,float *d,float *x)
{/*{{{*/
int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
int idx=threadIdx.x;
__shared__ float asub[512];
__shared__ float bsub[512];
__shared__ float csub[512];
__shared__ float dsub[512];
asub[idx]=a[idx_global];
bsub[idx]=b[idx_global];
csub[idx]=c[idx_global];
dsub[idx]=d[idx_global];
__syncthreads();
//Reduction
for(int stride=1;stride<512;stride*=2)
{
int margin_left=(idx-stride);
int margin_right=(idx+stride);
if(margin_left<0) margin_left=0;
if(margin_right>=512) margin_right=511;
float tmp1 = asub[idx] / bsub[margin_left];
float tmp2 = csub[idx] / bsub[margin_right];
float tmp3 = dsub[margin_right];
float tmp4 = dsub[margin_left];
__syncthreads();
dsub[idx] = dsub[idx] - tmp4*tmp1-tmp3*tmp2;
bsub[idx] = bsub[idx]-csub[margin_left]*tmp1-asub[margin_right]*tmp2;
tmp3 = -csub[margin_right];
tmp4 = -asub[margin_left];
__syncthreads();
asub[idx] = tmp3*tmp1;
csub[idx] = tmp4*tmp2;
__syncthreads();
}
x[idx_global]=dsub[idx]/bsub[idx];
}/*}}}*/
The speed is improved to 0.73ms on a Quadro k4000 for 512 x 512 system, however the code in the mentioned paper runs in 0.5ms on a GTX280.
Solving a tridiagonal system of equations is a challenging parallel problem since the classical solution scheme, i.e., Gaussian elimination, is inherently sequential.
Cyclic Reduction consists of two phases:
Forward Reduction. The original system is split in two independent tridiagonal systems for two sets of unknowns, the ones with odd index and the ones with even index. Such systems can be solved independently and this step can be seen as the first of a divide et impera scheme. The two smaller systems are split again in the same way in two subsystems and the process is repeated until a system of only 2 equations is reached.
Backward Substitution. The system of 2 equations is solved first. Then, the divide et impera structure is climbed up by solving the sub-systems independently on different cores.
I'm not sure (but correct me if I'm wrong) that your code will return consistent results. N does not appear to be defined. Also, you are accessing csub[idx-stride], but I'm not sure what does it mean when idx==0 and stride>1. Furthermore, you are using several conditional statements, essentially for boundary checkings. Finally, your code lacks a proper thread structure capable to deal with the mentioned divide et impera scheme, conceptually pretty much like the one used in the CUDA SDK reduction samples.
As mentioned in one of my comments above, I remembered that at tridiagonalsolvers you can find an implementation of the Cyclic Reduction scheme for solving tridiagonal equation systems. Browsing the related google pages, it seems to me that the code is mantained, among others, by the first Author of the above paper (Yao Zhang). The code is copied and pasted below. Note that the boundary check is done only once (if (iRight >= systemSize) iRight = systemSize - 1;), thus limiting the number of conditional statements involved. Note also the thread structure capable to deal with a divide et impera scheme.
The code by Zhang, Cohen and Owens
__global__ void crKernel(T *d_a, T *d_b, T *d_c, T *d_d, T *d_x)
{
int thid = threadIdx.x;
int blid = blockIdx.x;
int stride = 1;
int numThreads = blockDim.x;
const unsigned int systemSize = blockDim.x * 2;
int iteration = (int)log2(T(systemSize/2));
#ifdef GPU_PRINTF
if (thid == 0 && blid == 0) printf("iteration = %d\n", iteration);
#endif
__syncthreads();
extern __shared__ char shared[];
T* a = (T*)shared;
T* b = (T*)&a[systemSize];
T* c = (T*)&b[systemSize];
T* d = (T*)&c[systemSize];
T* x = (T*)&d[systemSize];
a[thid] = d_a[thid + blid * systemSize];
a[thid + blockDim.x] = d_a[thid + blockDim.x + blid * systemSize];
b[thid] = d_b[thid + blid * systemSize];
b[thid + blockDim.x] = d_b[thid + blockDim.x + blid * systemSize];
c[thid] = d_c[thid + blid * systemSize];
c[thid + blockDim.x] = d_c[thid + blockDim.x + blid * systemSize];
d[thid] = d_d[thid + blid * systemSize];
d[thid + blockDim.x] = d_d[thid + blockDim.x + blid * systemSize];
__syncthreads();
//forward elimination
for (int j = 0; j <iteration; j++)
{
__syncthreads();
stride *= 2;
int delta = stride/2;
if (threadIdx.x < numThreads)
{
int i = stride * threadIdx.x + stride - 1;
int iLeft = i - delta;
int iRight = i + delta;
if (iRight >= systemSize) iRight = systemSize - 1;
T tmp1 = a[i] / b[iLeft];
T tmp2 = c[i] / b[iRight];
b[i] = b[i] - c[iLeft] * tmp1 - a[iRight] * tmp2;
d[i] = d[i] - d[iLeft] * tmp1 - d[iRight] * tmp2;
a[i] = -a[iLeft] * tmp1;
c[i] = -c[iRight] * tmp2;
}
numThreads /= 2;
}
if (thid < 2)
{
int addr1 = stride - 1;
int addr2 = 2 * stride - 1;
T tmp3 = b[addr2]*b[addr1]-c[addr1]*a[addr2];
x[addr1] = (b[addr2]*d[addr1]-c[addr1]*d[addr2])/tmp3;
x[addr2] = (d[addr2]*b[addr1]-d[addr1]*a[addr2])/tmp3;
}
// backward substitution
numThreads = 2;
for (int j = 0; j <iteration; j++)
{
int delta = stride/2;
__syncthreads();
if (thid < numThreads)
{
int i = stride * thid + stride/2 - 1;
if(i == delta - 1)
x[i] = (d[i] - c[i]*x[i+delta])/b[i];
else
x[i] = (d[i] - a[i]*x[i-delta] - c[i]*x[i+delta])/b[i];
}
stride /= 2;
numThreads *= 2;
}
__syncthreads();
d_x[thid + blid * systemSize] = x[thid];
d_x[thid + blockDim.x + blid * systemSize] = x[thid + blockDim.x];
}
I want to add a further answer to mention that tridiagonal systems can be easily solved in the framework of the cuSPARSE library by aid of the function
cusparse<t>gtsv()
cuSPARSE also provides
cusparse<t>gtsv_nopivot()
which, at variance with the first mentioned routine, does not perform pivoting. Both the above functions solve the same linear system with multiple right hand sides. A batched routine
cusparse<t>gtsvStridedBatch()
also exists which solves multiple linear systems.
For all the above routines, the system matrix is fixed by simply specifying the lower diagonal, the main diagonal and the upper diagonal.
Below, I'm reporting a fully worked out example using cusparse<t>gtsv() to solve a tridiagonal linear system.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if(CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs",__FILE__, __LINE__,err, \
_cusparseGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
const int N = 5; // --- Size of the linear system
// --- Lower diagonal, diagonal and upper diagonal of the system matrix
double *h_ld = (double*)malloc(N * sizeof(double));
double *h_d = (double*)malloc(N * sizeof(double));
double *h_ud = (double*)malloc(N * sizeof(double));
h_ld[0] = 0.;
h_ud[N-1] = 0.;
for (int k = 0; k < N - 1; k++) {
h_ld[k + 1] = -1.;
h_ud[k] = -1.;
}
for (int k = 0; k < N; k++) h_d[k] = 2.;
double *d_ld; gpuErrchk(cudaMalloc(&d_ld, N * sizeof(double)));
double *d_d; gpuErrchk(cudaMalloc(&d_d, N * sizeof(double)));
double *d_ud; gpuErrchk(cudaMalloc(&d_ud, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_ld, h_ld, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_d, h_d, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_ud, h_ud, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating and defining dense host and device data vectors
double *h_x = (double *)malloc(N * sizeof(double));
h_x[0] = 100.0; h_x[1] = 200.0; h_x[2] = 400.0; h_x[3] = 500.0; h_x[4] = 300.0;
double *d_x; gpuErrchk(cudaMalloc(&d_x, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_x, h_x, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating the host and device side result vector
double *h_y = (double *)malloc(N * sizeof(double));
double *d_y; gpuErrchk(cudaMalloc(&d_y, N * sizeof(double)));
cusparseSafeCall(cusparseDgtsv(handle, N, 1, d_ld, d_d, d_ud, d_x, N));
cudaMemcpy(h_x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost);
for (int k=0; k<N; k++) printf("%f\n", h_x[k]);
}
At this gitHub repository, a comparison of different CUDA routines available in the cuSOLVER library for the solution of tridiagonal linear systems is reported.
Things I see:
1st __syncthreads() seems redundant.
There are repetitive sets of operations such as (-csub[idx-stride]*asub[idx]/bsub[idx-stride]) in your code. Use intermediate variables to hold the result and reuse them instead of making GPU calculate those sets each time.
Use NVIDIA profiler to see where issues are.

Performing several 1D moving averages in parallel using CUDA Thrust

I'm not a programmer with any abilities. Just someone curious about CUDA and so I'm doing a little reading. I ran across an example of using Thrust to do a moving average:
Simple Moving Average Thrust Example
The example, such as it is, runs and mostly works correctly. However it's trivial in the sense that it only does one moving average operation.
How I would do say 352 of these moving average operations in parallel, all operating on the same data stream? In my mind the program flow might be:
Generate the data & send it to one CUDA core. (Same as existing code
but think lengths of 1000 or 10000 instead of 30)
Copy it from the CUDA core it's in to all of the the other 351 CUDA
cores in my GTX 465
Tell each CUDA core what number of data items to average over.
(4, 5, 6,..., 352, 353, 354)
Tell the device to run the average in each core in parallel
Read back the results from each core
I get that this code
// compute SMA using standard summation
simple_moving_average(data, w, averages);
makes it all happen, but how to I get Thrust to do many of these in parallel?
My interest here is about something like stock data. If I'm looking at GOOG prices I'd put that in the GPU using all cores and leave it there. I'd then be free to do lots of processing without loading the data anymore and just reading back results from each core. NOTE: I might not want to use GOOG in all cores. Some cores might be GOOG, others with some other symbol, but I'll get there later. I'm just thinking I don't want the stock data in global memory if there's enough room in each core.
I assume this is pretty straightforward for CUDA & Thrust?
Here is the possible way how to do this with arrayfire:
Note that I am NOT affiliated with this library whatsoever.
I am pretty sure this can also be done with thrust
but I found this one a lot simpler with arrayfire.
And if the library is free why can't I use it instead of thrust ?
In arrayfire you can use matrix to run several SMA operations in parallel:
unsigned n_SMAs = 1000; // # of SMA indicators to evaluate
unsigned len = 2000; // # of stock prices per indicator
unsigned w = 6; // window size
// generate stock prices: [0..10]
af::array data = af::randu(n_SMAs, len) * 10;
// compute inclusive prefix sums along colums of the matrix
af::array s = af::accum(data, 1);
// compute the average
af::array avg = (s.cols(w, af::end) - s.cols(0, af::end - w)) / w;
af::eval(avg);
std::cout << avg.dims() << "\n" << avg << "\n";
let me know if that's what you are looking for. This is how I understood your question: compute several SMA indicators in parallel
My understanding is that you are interested into the following two situations:
You have a long sequence of items and you want to calculate a certain number of averages, by averaging on different numbers of items, i.e., using different lengths for the moving average window. This is what I understand from your original question.
You have a series of sequences, stored consecutively in memory, and you want to average them in parallel with a fixed averaging window of size 2 * RADIUS + 1. This is what the ArrayFire code proposed by #asm does - you have accepted it.
Instead of using CUDA Thrust, I think it would be easier to write your own CUDA kernel to do the above operations. Below, a fully worked example that operates in the same way as the ArrayFire code proposed by #asm, thus covering case #2. Modifying it to cover case #1 would be straightforward.
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCK_SIZE_X 8
#define BLOCK_SIZE_Y 8
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, unsigned int M, unsigned int N) {
__shared__ unsigned int temp[BLOCK_SIZE_Y][BLOCK_SIZE_X + 2 * RADIUS];
unsigned int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int gindexy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int gindex = gindexy * N + gindexx;
unsigned int lindexx = threadIdx.x + RADIUS;
unsigned int lindexy = threadIdx.y;
// --- Read input elements into shared memory
temp[lindexy][lindexx] = ((gindexx < N)&&(gindexy < M))? in[gindex] : 0;
if (threadIdx.x < RADIUS) {
temp[lindexy][threadIdx.x] = ((gindexx >= RADIUS)&&(gindexx < (N + RADIUS))&&(gindexy < M)) ? in[gindex - RADIUS] : 0;
temp[lindexy][threadIdx.x + (RADIUS + min(BLOCK_SIZE_X, N - blockIdx.x * BLOCK_SIZE_X))] = (((gindexx + min(BLOCK_SIZE_X, N - blockIdx.x * BLOCK_SIZE_X)) < N)&&(gindexy < M))? in[gindexy * N + gindexx + min(BLOCK_SIZE_X, N - blockIdx.x * BLOCK_SIZE_X)] : 0;
if ((threadIdx.y == 0)&&(gindexy < M)&&((gindexx + BLOCK_SIZE_X) < N)&&(gindexy < M)) printf("Inside 2 - tidx = %i; bidx = %i; tidy = %i; bidy = %i; lindexx = %i; temp = %i\n", threadIdx.x, blockIdx.x, threadIdx.y, blockIdx.y, threadIdx.x + (RADIUS + BLOCK_SIZE_X), temp[lindexy][threadIdx.x + (RADIUS + BLOCK_SIZE_X)]);
}
__syncthreads();
// --- Apply the stencil
unsigned int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
result += temp[lindexy][lindexx + offset];
}
// --- Store the result
out[gindexy * N + gindexx] = result;
}
/********/
/* MAIN */
/********/
int main() {
const unsigned int M = 2;
const unsigned int N = 4 + 2 * RADIUS;
const unsigned int constant = 3;
thrust::device_vector<unsigned int> d_in(M * N, constant);
thrust::device_vector<unsigned int> d_out(M * N);
dim3 GridSize(iDivUp(N, BLOCK_SIZE_X), iDivUp(M, BLOCK_SIZE_Y));
dim3 BlockSize(BLOCK_SIZE_X, BLOCK_SIZE_Y);
moving_average<<<GridSize, BlockSize>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), M, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<unsigned int> h_out = d_out;
for (int j=0; j<M; j++) {
for (int i=0; i<N; i++)
printf("Element j = %i; i = %i; h_out = %i\n", j, i, h_out[N*j+i]);
}
return 0;
}

cuda multiplication

Serial code snippet looks like this:
int i, j;
for(j=0; j<ny; j++)
{
for(i=0; i<nx; i++)
{
x[i + j*nx] *= y[i];
}
}
I converted this to CUDA using this kernel:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int i,j;
for(tid = 0; tid <nx*ny; tid++)
{
j = tid/nx;
i = tid - j*nx;
x[tid] *= y[i];
}
However the GPU kernel does not give any speedup improvement? Any suggestions on a better solution?? Thanks in advance
If this is the serial code:
int i, j;
for(j=0; j<ny; j++)
{
for(i=0; i<nx; i++)
{
x[i + j*nx] *= y[i];
}
}
then you should be doing this:
__global__ void fn(float *x, int nx)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int j = tid/nx, i = tid - j * nx;
x[tid] *= y[i];
}
fn<<<nx*ny/B, B>>>(x, nx); // with B = 256, 512, etc.
What you're doing is fairly bizarre: you're instructing each thread of the CUDA kernel to iterate over all values of tid between 0 and nx*ny, and compute the same function as your CPU version! Moreover, instead of just iterating over the indices, you're actually doing the loop less efficiently than you did for the CPU version; in other words, you do the same thing in each thread, just less efficiently, than you are doing in 1 thread on the CPU. It's no wonder that this is slower; it should be much, much slower. Your CUDA kernel is:
int **tid** = blockIdx.x * blockDim.x + threadIdx.x;
int i,j;
for(**tid** = 0; **tid** <nx*ny; **tid**++)
{
j = tid/nx;
i = tid - j*nx;
x[tid] *= y[i];
}
This does nx*ny iterations, same as your host code, for each thread; you lose all benefit of the parallelism, since each thread is doing the same thing; you would get the same performance using one thread on the GPU, and the same result!
If this is the verbatim code from your CUDA source file, you need to change it and redo the comparison; if this is code you have written to help explain what your code is doing for a lay non-CUDA audience, then you need to present your actual CUDA code so that we can see what's going on... as it is, the performance analysis I have done - the trivial one - is all you can expect.
Given your comment to this answer:
the nx * ny = 2205; so I used no. of blocks =
(nx*ny+(threads-1))/threads and threads = 64.
is implying you are intending to launch one thread per computation, the correct CUDA implementation would just be:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int j = tid/nx;
int i = tid - j*nx;
if (tid < (nx*ny))
x[tid] *= y[i];
If you were intending for each thread to compute more than one computation per kernel launch, then you would size the grid to "fill" each of the SM on the target GPU, not use the same number of threads as the input size, and then do something like:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int gsize = blockDim.x * gridDim.x;
int i,j;
for(; tid <nx*ny; tid+=gsize)
{
j = tid/nx;
i = tid - j*nx;
x[tid] *= y[i];
}
That would get you at least coalesced reads and writes to x, and remove the enormous number of redundant calculations in your posted version. There are a number of further optimizations that could be made, but it would require more information about the problem than has been supplied in the question and subsequent comments. Your indexing scheme contains an integer division and then an integer multiply-add per calculation. That is a lot of overhead for a single FLOP per input value. However, having said all of that, if the problem size I quoted is that actual problem size you are interested in, the GPU will never be faster than even a modest host CPU. You would require many orders of magnitude larger problems to realize useful speed up using the GPU for this sort low arithmetic intensity operation.
How big is the block? it may be that the time needed to copy a small amount of data to the GPU and setup the envirnoment is much longer than the calculation time.
Remember also that CUDA does a jit compile on the first run so to get accurate benchmarking you need to run it many times.
Try this using shared memory. One of the best implementations around:
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
typedef struct {
int width;
int height;
int stride; // In number of elements
float *elements;
} Matrix;
// Thread block size
#define BLOCK_SIZE 16
// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col)
{
return A.elements[row * A.stride + col];
}
// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col, float value)
{
A.elements[row * A.stride + col] = value;
}
// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
Matrix Asub;
Asub.width = BLOCK_SIZE; Asub.height = BLOCK_SIZE;
Asub.stride = A.stride;
Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row +
BLOCK_SIZE * col];
return Asub;
}
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
// Same as in previous example, except the followings:
// d_A.width = d_A.stride = A.width;
// d_B.width = d_B.stride = B.width;
// d_C.width = d_C.stride = C.width;
}
// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
// Block row and column
int blockRow = blockIdx.y;
int blockCol = blockIdx.x;
// Each thread block computes one sub-matrix Csub of C
Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
// Each thread computes one element of Csub
// by accumulating results into Cvalue
float Cvalue = 0;
// Thread row and column within Csub
int row = threadIdx.y;
int col = threadIdx.x;
// Loop over all the sub-matrices of A and B that are
// required to compute Csub
// Multiply each pair of sub-matrices together
// and accumulate the results
for (int m = 0; m < (A.width / BLOCK_SIZE); ++m)
{
// Get sub-matrix Asub of A and Bsub of B
Matrix Asub = GetSubMatrix(A, blockRow, m);
Matrix Bsub = GetSubMatrix(B, m, blockCol);
// Shared memory used to store Asub and Bsub respectively
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load Asub and Bsub from device memory to shared memory
// Each thread loads one element of each sub-matrix
As[row][col] = GetElement(Asub, row, col);
Bs[row][col] = GetElement(Bsub, row, col);
// Synchronize to make sure the sub-matrices are loaded
// before starting the computation
__syncthreads();
// Multiply Asub and Bsub together
for (int e = 0; e < BLOCK_SIZE; ++e)
Cvalue += As[row][e] * Bs[e][col];
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write Csub to device memory
// Each thread writes one element
SetElement(Csub, row, col, Cvalue);
}

Shufflling for Differential Evolutionary Algorithm in CUDA

I would like to implement a Differential Evolutionary Algorithm in CUDA.
How can I get two random vectors from matrix, knowing that they cannot be accessed again or, conversely, that they can? Is there an easy way of shuffling vectors in matrices?
I would also need to compute something using values from such a vector, and put new values in the bottom cell of each vector. It is easy to do? How to do it?
Maybe there is something like a stack implementation library (get by id, peek by id, ...)?
Maybe you should have a look at thrust library, which is sort of a C++ STL equivalent for CUDA. It has been integrated in the latest release of the CUDA toolkit, but if you have an older version of CUDA you can still download it for free at: http://code.google.com/p/thrust/
in this library, you'll find easy ways to handle vectors and to generate random numbers.
Concerning the implementation of the Differential Evolutionary Algorithm in CUDA as proposed in
R. Storn and K. Price, "Differential evolution: a simple and efficient heuristic for global optimization over continuous spaces," Journal of Global Optimization, vol. 11, no. 4, pp. 341-359, 1997
you seem to be concerned with the crossover operation. The only significant CUDA implementation of the Differential Evolutionary Algorithm in CUDA I'm aware is that in
L.P. de Veronese, R.A. Krohling, "Differential evolution algorithm on the GPU with C-CUDA," Proc. of the IEEE Congress on Evolutionary Computation, Barcelona, Spain, July 18-23, 2010, pp. 1-7.
Below, I'm showing a full CUDA code based on the implementation suggested in the latter paper.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <curand.h>
#include <curand_kernel.h>
using namespace thrust;
#include <stdio.h>
#include <time.h>
#include <fstream>
#include "Utilities.cuh"
#define pi 3.14159265358979f
#define BLOCK_SIZE_POP 32
#define BLOCK_SIZE_RAND 64
#define BLOCK_SIZE_UNKN 8
#define BLOCK_SIZE 256
//#define DEBUG
// --- REFERENCES
// [1] R. Storn and K. Price, “Differential evolution – a simple and efficient heuristic for global optimization over continuous spaces,”
// Journal of Global Optimization, vol. 11, no. 4, pp. 341–359, 1997
// [2] Lucas de P. Veronese and Renato A. Krohling, “Differential Evolution Algorithm on the GPU with C-CUDA,”
// Proc. of the IEEE Congress on Evolutionary Computation, Barcelona, Spain, Jul. 18-23, 2010, pp. 1-7.
// Conventions: the index j addresses the population member while the index i addresses the member component
// the homologous host and device variables have the same name with a "h_" or "d_" prefix, respectively
// the __host__ and __device__ functions pointer parameters have the same name for comparison purposes. it is up to the caller to use
// host or device pointers, as appropriate
/****************************************/
/* EVALUATION OF THE OBJECTIVE FUNCTION */
/****************************************/
__global__ void curand_setup_kernel(curandState * __restrict state, const unsigned long int seed)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
curand_init(seed, tid, 0, &state[tid]);
}
/********************************/
/* INITIALIZE POPULATION ON GPU */
/********************************/
__global__ void initialize_population_GPU(float * __restrict pop, const float * __restrict minima, const float * __restrict maxima,
curandState * __restrict state, const int D, const int Np) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
if ((i < D) && (j < Np)) pop[j*D+i] = (maxima[i] - minima[i]) * curand_uniform(&state[j*D+i]) + minima[i];
}
/****************************************/
/* EVALUATION OF THE OBJECTIVE FUNCTION */
/****************************************/
__host__ __device__ float functional(const float * __restrict x, const int D) {
float sum = 0.f;
// --- De Jong function
//for (int i=0; i<D; i++) sum = sum + x[i] * x[i];
// --- Rosenbrock's saddle
sum = 0.f;
for (int i=1; i<D; i++) sum = sum + 100.f * (x[i] - x[i-1] * x[i-1]) * (x[i] - x[i-1] * x[i-1]) + (x[i-1] - 1.f) * (x[i-1] - 1.f);
return sum;
}
/********************************/
/* POPULATION EVALUATION ON GPU */
/********************************/
__global__ void evaluation_GPU(const int Np, const int D, const float * __restrict pop, float * __restrict fobj) {
int j = threadIdx.x + blockIdx.x * blockDim.x;
if (j < Np) fobj[j] = functional(&pop[j*D], D);
}
/**********************************************************/
/* GENERATE MUTATION INDICES AND CROSS-OVER VALUES ON GPU */
/**********************************************************/
__global__ void generate_mutation_indices_and_crossover_values_GPU(float * __restrict Rand, int * __restrict mutation, const int Np, const int D,
curandState * __restrict state) {
int j = threadIdx.x + blockIdx.x * blockDim.x;
int a, b, c;
if (j < Np) {
do a=Np*(curand_uniform(&state[j*D])); while(a==j);
do b=Np*(curand_uniform(&state[j*D])); while(b==j||b==a);
do c=Np*(curand_uniform(&state[j*D])); while(c==j||c==a||c==b);
mutation[j*3]=a;
mutation[j*3+1]=b;
mutation[j*3+2]=c;
Rand[j]=curand_uniform(&state[j*D]);
}
}
/**********************************/
/* GENERATION OF A NEW POPULATION */
/**********************************/
__global__ void generation_new_population_GPU(const float * __restrict pop, const int NP, const int D, float * __restrict npop, const float F,
const float CR, const float * __restrict rand, const int * __restrict mutation,
const float * __restrict minimum, const float * __restrict maximum) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
if ((i < D) && (j < NP)) {
// --- Mutation indices
int a=mutation[j*3];
int b=mutation[j*3+1];
int c=mutation[j*3+2];
// --- Mutation and crossover
// --- One of the best strategies. Try F = 0.7 and CR = 0.5 as a first guess.
if(rand[j]<CR) npop[j*D+i] = pop[a*D+i]+F*(pop[b*D+i]-pop[c*D+i]);
else npop[j*D+i] = pop[j*D+i];
// --- Other possible approaches to mutation and crossover
// --- Not bad, but found several optimization problems where misconvergence occurs.
//npop[j*D+i] = pop[best_old_gen_ind*D+i] + F*(pop[b*D+i]-pop_old[c*D+i]);
// --- One of the best strategies. Try F = 0.85 and CR = 1. In case of misconvergence, try to increase NP. If this doesn't help,
// play around with all the control variables.
//npop[j*D+i] = pop[j*D+i] + F*(pop[best_old_gen_ind*D+i] - pop[j*D+i]) + F*(pop[a*D+i]-pop[b*D+i]);
// --- Powerful strategy worth trying.
//npop[j*D+i] = pop[best_old_gen_ind*D+i] + (pop[a*D+i]+pop[b*D+i]-pop[c*D+i]-pop[d*D+i])*F;
// --- Robust optimizer for many functions.
//npop[j*D+i] = pop[e*D+i] + (pop[a*D+i]+pop[b*D+i]-pop[c*D+i]-pop[d*D+i])*F;
// --- Saturation due to constraints on the unknown parameters
if (npop[j*D+i]>maximum[i]) npop[j*D+i]=maximum[i];
else if (npop[j*D+i]<minimum[i]) npop[j*D+i]=minimum[i];
}
}
/*******************************/
/* POPULATION SELECTION ON GPU */
/*******************************/
// Assumption: all the optimization variables are associated to the same thread block
__global__ void selection_and_evaluation_GPU(const int Np, const int D, float * __restrict pop, const float * __restrict npop, float * __restrict fobj) {
int i = threadIdx.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
if ((i < D) && (j < Np)) {
float nfobj = functional(&npop[j*D], D);
float temp = fobj[j];
if (nfobj < temp) {
pop[j*D+i] = npop[j*D+i];
fobj[j] = nfobj;
}
}
}
/***********************/
/* FIND MINIMUM ON GPU */
/***********************/
void find_minimum_GPU(const int N, float *t, float * __restrict minval, int * __restrict index) {
// --- Wrap raw pointer with a device_ptr
device_ptr<float> dev_ptr = device_pointer_cast(t);
// --- Use device_ptr in thrust min_element
device_ptr<float> min_ptr = thrust::min_element(dev_ptr, dev_ptr + N);
index[0] = &min_ptr[0] - &dev_ptr[0];
minval[0] = min_ptr[0];;
}
/********/
/* MAIN */
/********/
int main()
{
// --- Number of individuals in the population (Np >=4 for mutation purposes)
int Np = 80;
// --- Dimensionality of each individual (number of unknowns)
int D = 5;
// --- Mutation factor (0 < F <= 2). Typically chosen in [0.5, 1], see Ref. [1]
float F = 0.7f;
// --- Maximum number of generations
int Gmax = 2000;
// --- Crossover constant (0 < CR <= 1)
float CR = 0.4f;
// --- Mutually different random integer indices selected from {1, 2, … ,Np}
int *d_mutation, // --- Device side mutation vector
*d_best_index, // --- Device side current optimal member index
*h_best_index_dev; // --- Host side current optimal member index of device side
float *d_pop, // --- Device side population
*d_npop, // --- Device side new population (trial vectors)
*d_Rand, // --- Device side crossover rand vector (uniformly distributed in (0,1))
*d_fobj, // --- Device side objective function value
*d_maxima, // --- Device side maximum constraints vector
*d_minima, // --- Device side minimum constraints vector
*h_pop_dev_res, // --- Host side population result of GPU computations
*h_best_dev, // --- Host side population best value history of device side
*h_maxima, // --- Host side maximum constraints vector
*h_minima; // --- Host side minimum constraints vector
curandState *devState; // --- Device side random generator state vector
// --- Device side memory allocations
gpuErrchk(cudaMalloc((void**)&d_pop,D*Np*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_npop,D*Np*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_Rand,Np*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_fobj,Np*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_mutation,3*Np*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_maxima,D*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_minima,D*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&devState, D*Np*sizeof(curandState)));
// --- Host side memory allocations
h_pop_dev_res = (float*)malloc(D*Np*sizeof(float));
h_best_dev = (float*)malloc(Gmax*sizeof(float));
h_best_index_dev = (int*)malloc(Gmax*sizeof(int));
h_maxima = (float*)malloc(D*sizeof(float));
h_minima = (float*)malloc(D*sizeof(float));
// --- Define grid sizes
int Num_Blocks_Pop = iDivUp(Np,BLOCK_SIZE_POP);
int Num_Blocks_Rand2 = iDivUp(Np,BLOCK_SIZE_RAND);
dim3 Grid(iDivUp(D,BLOCK_SIZE_UNKN),iDivUp(Np,BLOCK_SIZE_POP));
dim3 Block(BLOCK_SIZE_UNKN,BLOCK_SIZE_POP);
// --- Set maxima and minima
for (int i=0; i<D; i++) {
h_maxima[i] = 2.;
h_minima[i] = -2.;
}
gpuErrchk(cudaMemcpy(d_maxima, h_maxima, D*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_minima, h_minima, D*sizeof(float), cudaMemcpyHostToDevice));
// --- Initialize cuRAND states
curand_setup_kernel<<<iDivUp(D*Np, BLOCK_SIZE), BLOCK_SIZE>>>(devState, time(NULL));
// --- Initialize popultion
initialize_population_GPU<<<Grid, Block>>>(d_pop, d_minima, d_maxima, devState, D, Np);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
// --- Evaluate population
evaluation_GPU<<<iDivUp(Np, BLOCK_SIZE), BLOCK_SIZE>>>(Np, D, d_pop, d_fobj);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
int a, b, c;
for(int i=0;i<Gmax;i++) {
// --- Generate mutation indices and cross-over uniformly distributed random vector
generate_mutation_indices_and_crossover_values_GPU<<<Num_Blocks_Rand2,BLOCK_SIZE_RAND>>>(d_Rand, d_mutation, Np, D, devState);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
// --- Generate new population
generation_new_population_GPU<<<Grid,Block>>>(d_pop, Np, D, d_npop, F, CR, d_Rand, d_mutation, d_minima, d_maxima);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
// --- Select new population and evaluate it
selection_and_evaluation_GPU<<<Grid,Block>>>(Np, D, d_pop, d_npop, d_fobj);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
find_minimum_GPU(Np, d_fobj, &h_best_dev[i], &h_best_index_dev[i]);
printf("Iteration: %i; best member value: %f: best member index: %i\n", i, h_best_dev[i], h_best_index_dev[i]);
}
gpuErrchk(cudaMemcpy(h_pop_dev_res, d_pop, Np*sizeof(float), cudaMemcpyDeviceToHost));
for (int i=0; i<D; i++) printf("Variable nr. %i = %f\n", i, h_pop_dev_res[h_best_index_dev[Gmax-1]*D+i]);
return 0;
}