Let's say I have two arrays
A = {1, 2, 3}
and
B = {10,20,30,40,50}
I want to generate a new array which would have a size of
sizeof(A) * sizeof(B)
I want to replicate B sizeof(A) times, and on each repetition i, the resultant array should have A[i] added to it. So the result would be something like
{11,21,31,41,51,12,22,32,42,52,13,23,33,43,53}
This task can be interpreted as a 2-dimensional problem where the output array can be treated as a matrix of dimensions sizeof(A) times sizeof(B). In this way, we can use 2D CUDA indexing to achieve the desired functionality. A sample CUDA C++ code of this 2D implementation is shown below:
#include <iostream>
#include <cuda_runtime.h>
#include <cassert>
using namespace std;
__global__ void kernel_replicate(int* a, int* b, int* c, int alen, int blen, int clen)
{
const int ai = blockIdx.x * blockDim.x + threadIdx.x;
const int bi = blockIdx.y * blockDim.y + threadIdx.y;
if(ai<alen && bi<blen)
{
const int ci = ai * blen + bi;
c[ci] = a[ai] + b[bi];
}
}
void replicate_device(int* a, int* b, int* c, int alen, int blen, int clen)
{
dim3 block(16,16);
dim3 grid;
grid.x = (alen + block.x - 1) / block.x;
grid.y = (blen + block.y - 1) / block.y;
kernel_replicate<<<grid, block>>>(a,b,c,alen,blen,clen);
assert(cudaSuccess == cudaDeviceSynchronize());
}
void replicate(int* a, int* b, int* c, int alen, int blen, int clen)
{
int *ad, *bd, *cd;
size_t abytes = alen * sizeof(int);
size_t bbytes = blen * sizeof(int);
size_t cbytes = clen * sizeof(int);
cudaMalloc(&ad, abytes);
cudaMalloc(&bd, bbytes);
cudaMalloc(&cd, cbytes);
cudaMemcpy(ad,a, abytes, cudaMemcpyHostToDevice);
cudaMemcpy(bd,b, bbytes, cudaMemcpyHostToDevice);
replicate_device(ad,bd,cd, alen,blen,clen);
cudaMemcpy(c,cd, cbytes, cudaMemcpyDeviceToHost);
cudaFree(ad);
cudaFree(bd);
cudaFree(cd);
}
int main()
{
const int alen = 3;
const int blen = 5;
const int clen = alen * blen;
int A[alen] = {1,2,3};
int B[blen] = {10,20,30,40,50};
int C[clen] = {0};
replicate(A,B,C,alen, blen, clen);
for(int i=0; i<alen; i++)
{
cout<<A[i]<<" ";
}
cout<<endl;
for(int i=0; i<blen; i++)
{
cout<<B[i]<<" ";
}
cout<<endl;
for(int i=0; i<clen; i++)
{
cout<<C[i]<<" ";
}
cout<<endl;
return 0;
}
I want to write a prefix scan for large arrays using the instruction in GPUgem, It's a homework for my parallel class. I did follow all the steps in the book but still my code's not working. I got it to work for array size 4096 but it's not working for larger arrays. Here is my code :
#include <stdio.h>
#include <sys/time.h>
#define THREADS 1024
typedef int mytype;
__global__ void phaseI(mytype *g_odata, mytype *g_idata, int n, mytype *aux)
{
__shared__ mytype temp[THREADS];
const int tid1 = threadIdx.x;
int offset = 1;
temp[2*tid1] = g_idata[2*tid1]; // load input into shared memory
temp[2*tid1+1] = g_idata[2*tid1+1];
for (int d = THREADS>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
__syncthreads();
if (tid1 == 0) {
aux[blockIdx.x] = temp[THREADS - 1];
temp[THREADS - 1] = 0;
}
for (int d = 1; d < THREADS; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
mytype t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
__global__ void phaseII(mytype *g_odata, mytype *aux, int n)
{
const int tid1 = threadIdx.x;
const int B = (n / THREADS);
int offset = 1;
for (int d = B>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
__syncthreads();
if (tid1 == 0 && blockIdx.x == 0) {
aux[B - 1] = 0;
}
for (int d = 1; d < B; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
mytype t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] += aux[blockIdx.x];
g_odata[2*thid+1] += aux[blockIdx.x];
}
int main(int argc, char *argv[])
{
if (argc != 2) {
printf("usage: %s n\n", argv[0]);
return -1;
}
const int n = atoi(argv[1]);
mytype *h_i, *d_i, *h_o, *d_o, *d_temp;
const int size = n * sizeof(mytype);
h_i = (mytype *)malloc(size);
h_o = (mytype *)malloc(size);
if ((h_i == NULL) || (h_o == NULL)) {
printf("malloc failed\n");
return -1;
}
for (int i = 0; i < n; i++) {
h_i[i] = i;
h_o[i] = 0;
}
cudaMalloc(&d_i, size);
cudaMalloc(&d_temp, (n / THREADS) );
cudaMalloc(&d_o, size);
cudaMemset(d_o, 0, size);
cudaMemset(d_temp, 0, (n / THREADS));
cudaMemcpy(d_i, h_i, size, cudaMemcpyHostToDevice);
int blocks = n / THREADS;
phaseI<<<blocks, THREADS / 2 >>>(d_o, d_i, n, d_temp);
phaseII<<<blocks, THREADS / 2>>>(d_o, d_temp, n);
cudaThreadSynchronize();
cudaMemcpy(h_o, d_o, size, cudaMemcpyDeviceToHost);
printf("\n");
for (int i = 0; i < n ; i++) {
printf(" %d", h_o[i]);
}
printf("\n\n");
return 0;
}
Does anyone have any idea what I'm doing wrong?
One possible error I see in your code is here:
aux[thid] = temp[THREADS];
If your temp array is temp[1024], as you say, and each block has 1024 threads, as you say, then if THREADS is 1024, temp[THREADS] will access your shared memory array out-of-bounds (one past the end.) An array of 1024 elements only has valid indices from 0 to 1023.
Beyond that, it seems like you're asking how to take the last element out of a shared memory array (temp) and place it in a position in a (presumably global) aux array, which has one element for each block.
Here's a fully worked example:
$ cat t831.cu
#include <stdio.h>
#define THREADS 1024
#define BLOCKS 20
__global__ void kernel(int *aux){
__shared__ int temp[THREADS];
temp[threadIdx.x] = threadIdx.x + blockIdx.x;
__syncthreads();
if (threadIdx.x == 0)
aux[blockIdx.x] = temp[THREADS-1];
}
int main(){
int *h_data, *d_data;
const int dsize = BLOCKS*sizeof(int);
h_data=(int *)malloc(dsize);
cudaMalloc(&d_data, dsize);
memset(h_data, 0, dsize);
cudaMemset(d_data, 0, dsize);
kernel<<<BLOCKS, THREADS>>>(d_data);
cudaMemcpy(h_data, d_data, dsize, cudaMemcpyDeviceToHost);
for (int i = 0; i < BLOCKS; i++) printf("%d, ", h_data[i]);
printf("\n");
return 0;
}
$ nvcc -o t831 t831.cu
$ cuda-memcheck ./t831
========= CUDA-MEMCHECK
1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042,
========= ERROR SUMMARY: 0 errors
$
I want to compute 'out = alpha * px + beta * py','px' and 'py' is array.*
I have a simple kernel:
__global__
void saxpyGPU2( float *out, const float *px, const float *py, size_t N, float alpha,float beta )
{
size_t i = blockDim.x*blockIdx.x + threadIdx.x;
while (i < N)
{
out[i] = alpha * px[i] + beta * py[i];
i += blockDim.x*gridDim.x;
}
}
It works, so I want to loop unroll.
The code in cuda-handbook is:
template<const int n>
__device__
void saxpy_unrolled(float *out, const float *px, const float *py, size_t N, float alpha,float beta)
{
float x[n], y[n];
size_t i;
for ( i = n*blockIdx.x*blockDim.x+threadIdx.x; i < N-n*blockDim.x*gridDim.x; i += n*blockDim.x*gridDim.x ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
x[j] = px[index];
y[j] = py[index];
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
out[index] = alpha*x[j]+beta* y[j];
}
}
// to avoid the (index<N) conditional in the inner loop,
// we left off some work at the end
for ( int j = 0; j < n; j++ ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) {
x[j] = px[index];
y[j] = py[index];
}
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) out[index] = alpha*x[j]+beta* y[j];
}
}
}
__global__
void saxpyGPU( float *out, const float *px, const float *py, size_t N, float alpha,float beta )
{
saxpy_unrolled<4>( out, px, py, N, alpha ,beta);
}
I don't understand in the second branch when i > N-n*blockDim.x*gridDim.x. why use a outer loop
for ( int j = 0; j < n; j++ ) {
for ( int j = 0; j < n; j++ )....}
And I test those two kernel , first one is OK, but second one I copy from the book is incorrect.
I initial two array while(i<1024) a[i] = i; b[i] = 10*i;i++, and I want to compute the c = alpha*a + beta*b use the two kernels above, but the result in the loop unrolled kernel is 4.3e8 for all element in c.
This my test code:
int main(){
int arraySize = 1024;
float* a =new float[arraySize];
float* b =new float[arraySize];
float* c =new float[arraySize];
for (int i =0;i<arraySize;i++)
{
a[i] = 1.0* i;
b[i] = 10.0*i;
c[i] = 0.0;
}
float* d_a;
float* d_b;
float* d_c;
cudaMalloc((void**)&d_a,sizeof(float)*arraySize);
cudaMemcpy(d_a,a,sizeof(float)*arraySize,cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_b,sizeof(float)*arraySize);
cudaMemcpy(d_b,b,sizeof(float)*arraySize,cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_c,sizeof(float)*arraySize);
for (int i=0;i<arraySize;i++)
{
c[i] = a[i] + b[i];
}
dim3 block_size(256,1,1);
dim3 grid_size((arraySize -1)/block_size.x+1,1,1);
float alpha = 1.0;
float beta = 1.0;
bool flag = true;
if(flag)
{
saxpyGPU<<<grid_size,block_size>>>(d_c,d_a,d_b,arraySize,alpha,beta);
float* temp = new float[arraySize];
cudaMemcpy(temp,d_c,arraySize*sizeof(float),cudaMemcpyDeviceToHost);
for (int i = 0;i<arraySize;i++)
{
cout<<(temp[i] - c[i])<<",";
}
}
else
{
saxpyGPU2<<<grid_size,block_size>>>(d_c,d_a,d_b,arraySize,alpha,beta);
cudaMemcpy(temp,d_c,arraySize*sizeof(float),cudaMemcpyDeviceToHost);
for (int i = 0;i<arraySize;i++)
{
cout<<(temp[i] - c[i])<<",";
}
Those two kernel show different result
The kernel code you posted is perfectly correct and produces the expected results. This can be demonstrated using the following code:
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
#include <vector>
#include <algorithm>
#include <cmath>
template<const int n>
__device__
void saxpy_unrolled(float *out, const float *px, const float *py,
size_t N, float alpha,float beta) {
float x[n], y[n];
size_t i;
for ( i = n*blockIdx.x*blockDim.x+threadIdx.x;
i < N-n*blockDim.x*gridDim.x;
i += n*blockDim.x*gridDim.x ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
x[j] = px[index];
y[j] = py[index];
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
out[index] = alpha*x[j]+beta* y[j];
}
}
for ( int j = 0; j < n; j++ ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) {
x[j] = px[index];
y[j] = py[index];
}
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) {
out[index] = alpha*x[j] + beta*y[j];
}
}
}
}
__global__
void saxpyGPU( float *out, const float *px, const float *py,
size_t N, float alpha,float beta ) {
saxpy_unrolled<4>( out, px, py, N, alpha ,beta);
}
struct prg {
float a, b;
__host__ __device__
prg(float _a=0.f, float _b=1.f) : a(_a), b(_b) {};
__host__ __device__
float operator()(const unsigned int n) const {
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> dist(a, b);
rng.discard(n);
return dist(rng);
}
};
int main(void) {
const int N = 100000;
const float alpha = 0.12345f, beta = 0.9876f;
prg gen(1.f, 2.f);
thrust::device_vector<float> x(N), y(N), z(N);
thrust::counting_iterator<unsigned int> iseqx(0);
thrust::counting_iterator<unsigned int> iseqy(N);
thrust::transform(iseqx, iseqx + N, x.begin(), gen);
thrust::transform(iseqy, iseqy + N, y.begin(), gen);
float *xp = thrust::raw_pointer_cast(&x[0]);
float *yp = thrust::raw_pointer_cast(&y[0]);
float *zp = thrust::raw_pointer_cast(&z[0]);
dim3 blockdim(128);
dim3 griddim(16);
saxpyGPU<<<griddim, blockdim>>>(zp, xp, yp, N, alpha, beta);
cudaDeviceSynchronize();
std::vector<float> xh(N), yh(N), zh(N);
thrust::copy(x.begin(), x.end(), xh.begin());
thrust::copy(y.begin(), y.end(), yh.begin());
thrust::copy(z.begin(), z.end(), zh.begin());
float maxabserr = -1.f, maxrelerr = -1.f;
for(int i=0; i<N; i++) {
float saxpyval = alpha * xh[i] + beta * yh[i];
float abserr = fabs(zh[i]-saxpyval);
float relerr = abserr / fmaxf(fabs(zh[i]), fabs(saxpyval));
maxabserr = fmaxf(abserr, maxabserr);
maxrelerr = fmaxf(relerr, maxrelerr);
}
std::cout.precision(10);
std::cout << "Maximum absolute error = " << maxabserr << std::endl;
std::cout << "Maximum relative error = " << maxrelerr << std::endl;
return 0;
}
which gives me the following:
$ nvcc -arch=sm_30 -o unrolled_saxpy unrolled_saxpy.cu
$ ./unrolled_saxpy
Maximum absolute error = 2.384185791e-07
Maximum relative error = 1.1920676e-07
If you (still) do not understand why the kernel is written as it is, follow what I showed you in your previous question and manually unroll the saxpy function. Start with n=1 and confirm it is functionally the same as the unrolled equivalent, and then try n=2, n=4, etc. to see what the action of loop unrolling is.
I'm trying to do differential evolution on CUDA, but the problem is that kernel which is responsible for "Mutation, Crossover, Evaluation, Selection" never gets launched.
Any help?
Here's the entire code:
#include <iostream>
#include <curand_kernel.h>
using namespace std;
/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
system("pause");
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;
/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {
float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
+ (a[1] - b[1]) * (a[1] - b[1])
+ (a[2] - b[2]) * (a[2] - b[2]));
float distance6 = distance * distance * distance
* distance * distance * distance;
float distance12 = distance6 * distance6;
return 1/distance12 - 2/distance6;
}
/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
curand_init(seed, id, 0,&state[id]);
}
/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
{
//init, just populating array with some specific numbers
population[D*id]=0;
population[D*id+N]=0;
population[D*id +2*N]=0;
population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
population[D*id+N+1]=0;
population[D*id +2*N+1]=0;
for(int i=2; i<N; i++){
float min= -4 - 1/4*abs((int)((i-4)/3));
float max= 4 + 1/4*abs((int)((i-4)/3));
if(i==2)
{
population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+2]=0;
}
else
{
population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
}
}
//eval
float e=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
e += lennardJones(a,b);
}
}
population[D*id + D-1]=e;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
while(a == id){a = rndInt(globalState, id, NP);}
while(b == a && b==id){b=rndInt(globalState, id, NP);}
while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
mutation[D*id+0]=a;
mutation[D*id+1]=b;
mutation[D*id+2]=c;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a=mutation[D*id+0], b=mutation[D*id+1], c=mutation[D*id+2];
//DE mutation and crossover
int j=rndInt(globalState, id, NP);
for(int i=0; i<D-1; i++)
{
//DE mutation
pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
//DE crossover
if(Cr > rndFloat(globalState, id) && i!= j)
pop[D*id+i]=population[D*id +i];
}
// Eval
pop[D*id+D-1]=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
pop[D*id+D-1] += lennardJones(a,b);
}
}
__syncthreads();
//DE selection
if(pop[D*id+D-1] < population[D*id +D-1])
{
for(int i=0; i<D; i++)
population[D*id +i]=pop[D*id+i];
}
}
}
void getBestScore(float *hpopulation)
{
int max=0;
for(int i=1; i<hNP; i++)
{
if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
max=i;
}
for(int j=0; j<hN; j++)
cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}
int main()
{
cudaEvent_t start,stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start,0));
int device, st=100;
float hCr=0.6f, hF=0.8f;
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDevice(&device));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
// int SN = prop.maxThreadsPerBlock; //512 threads per block
//int SB = (hNP+(SN-1))/SN;
//constants NP, D, N, Cr, F
HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));
//seeds
curandState* devStates;
HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
setup_kernel <<< 1, hNP>>> (devStates, 50);
//population
float *population, *pop;
float hpopulation[hNP*hD];
HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));
//mutation
int *mutation, *mutation1;
int *hmutation;
HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));
//stream
cudaStream_t stream_i, stream_j;
HANDLE_ERROR(cudaStreamCreate(&stream_i));
HANDLE_ERROR(cudaStreamCreate(&stream_j));
kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);
while(st != 0)
{
/*** COPYING MUTATION INDICES***/
HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));
/**** CALLING KERNELS****/
kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);
st--;
//HANDLE_ERROR(cudaStreamSynchronize(stream_i));
//HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
//getBestScore(hpopulation);
//cin.get();
}
HANDLE_ERROR(cudaStreamSynchronize(stream_i));
HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
getBestScore(hpopulation);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float time;
HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaStreamDestroy(stream_i));
HANDLE_ERROR(cudaStreamDestroy(stream_j));
HANDLE_ERROR(cudaFree(population));
HANDLE_ERROR(cudaFree(pop));
HANDLE_ERROR(cudaFreeHost(hmutation));
HANDLE_ERROR(cudaFree(mutation1));
HANDLE_ERROR(cudaFree(devStates));
system("pause");
return 0;
}
UPDATE - Solution:
#include <iostream>
#include <curand_kernel.h>
using namespace std;
/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
system("pause");
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;
/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {
float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
+ (a[1] - b[1]) * (a[1] - b[1])
+ (a[2] - b[2]) * (a[2] - b[2]));
float distance6 = distance * distance * distance
* distance * distance * distance;
float distance12 = distance6 * distance6;
return 1/distance12 - 2/distance6;
}
/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
curand_init(seed, id, 0,&state[id]);
}
/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
{
//init, just populating array with some specific numbers
population[D*id]=0;
population[D*id+N]=0;
population[D*id +2*N]=0;
population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
population[D*id+N+1]=0;
population[D*id +2*N+1]=0;
for(int i=2; i<N; i++){
float min= -4 - 1/4*abs((int)((i-4)/3));
float max= 4 + 1/4*abs((int)((i-4)/3));
if(i==2)
{
population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+2]=0;
}
else
{
population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
}
}
//eval
float e=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
e += lennardJones(a,b);
}
}
population[D*id + D-1]=e;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
while(a == id){a = rndInt(globalState, id, NP);}
while(b == a && b==id){b=rndInt(globalState, id, NP);}
while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
mutation[3*id+0]=a;
mutation[3*id+1]=b;
mutation[3*id+2]=c;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];
//DE mutation and crossover
int j=rndInt(globalState, id, NP);
for(int i=0; i<D-1; i++)
{
//DE mutation
pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
//DE crossover
if(Cr > rndFloat(globalState, id) && i!= j)
pop[D*id+i]=population[D*id +i];
}
// Eval
pop[D*id+D-1]=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
pop[D*id+D-1] += lennardJones(a,b);
}
}
__syncthreads();
//DE selection
if(pop[D*id+D-1] < population[D*id +D-1])
{
for(int i=0; i<D; i++)
population[D*id +i]=pop[D*id+i];
}
}
}
void getBestScore(float *hpopulation)
{
int max=0;
for(int i=1; i<hNP; i++)
{
if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
max=i;
}
for(int j=0; j<hN; j++)
cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}
int main()
{
cudaEvent_t start,stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start,0));
int device, st=100;
float hCr=0.6f, hF=0.8f;
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDevice(&device));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
// int SN = prop.maxThreadsPerBlock; //512 threads per block
//int SB = (hNP+(SN-1))/SN;
//constants NP, D, N, Cr, F
HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));
//seeds
curandState* devStates;
HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
setup_kernel <<< 1, hNP>>> (devStates, 50);
//population
float *population, *pop;
float hpopulation[hNP*hD];
HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));
//mutation
int *mutation, *mutation1;
int *hmutation;
HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));
//stream
cudaStream_t stream_i, stream_j;
HANDLE_ERROR(cudaStreamCreate(&stream_i));
HANDLE_ERROR(cudaStreamCreate(&stream_j));
kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);
while(st != 0)
{
/*** COPYING MUTATION INDICES***/
HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));
/**** CALLING KERNELS****/
kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);
st--;
//HANDLE_ERROR(cudaStreamSynchronize(stream_i));
//HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
//getBestScore(hpopulation);
//cin.get();
}
HANDLE_ERROR(cudaStreamSynchronize(stream_i));
HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
getBestScore(hpopulation);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float time;
HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaStreamDestroy(stream_i));
HANDLE_ERROR(cudaStreamDestroy(stream_j));
HANDLE_ERROR(cudaFree(population));
HANDLE_ERROR(cudaFree(pop));
HANDLE_ERROR(cudaFreeHost(hmutation));
HANDLE_ERROR(cudaFree(mutation1));
HANDLE_ERROR(cudaFree(devStates));
system("pause");
return 0;
}
Solution:
#include <iostream>
#include <curand_kernel.h>
using namespace std;
/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
system("pause");
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;
/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {
float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
+ (a[1] - b[1]) * (a[1] - b[1])
+ (a[2] - b[2]) * (a[2] - b[2]));
float distance6 = distance * distance * distance
* distance * distance * distance;
float distance12 = distance6 * distance6;
return 1/distance12 - 2/distance6;
}
/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
curand_init(seed, id, 0,&state[id]);
}
/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
{
//init, just populating array with some specific numbers
population[D*id]=0;
population[D*id+N]=0;
population[D*id +2*N]=0;
population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
population[D*id+N+1]=0;
population[D*id +2*N+1]=0;
for(int i=2; i<N; i++){
float min= -4 - 1/4*abs((int)((i-4)/3));
float max= 4 + 1/4*abs((int)((i-4)/3));
if(i==2)
{
population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+2]=0;
}
else
{
population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
}
}
//eval
float e=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
e += lennardJones(a,b);
}
}
population[D*id + D-1]=e;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
while(a == id){a = rndInt(globalState, id, NP);}
while(b == a && b==id){b=rndInt(globalState, id, NP);}
while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
mutation[3*id+0]=a;
mutation[3*id+1]=b;
mutation[3*id+2]=c;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];
//DE mutation and crossover
int j=rndInt(globalState, id, NP);
for(int i=0; i<D-1; i++)
{
//DE mutation
pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
//DE crossover
if(Cr > rndFloat(globalState, id) && i!= j)
pop[D*id+i]=population[D*id +i];
}
// Eval
pop[D*id+D-1]=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
pop[D*id+D-1] += lennardJones(a,b);
}
}
__syncthreads();
//DE selection
if(pop[D*id+D-1] < population[D*id +D-1])
{
for(int i=0; i<D; i++)
population[D*id +i]=pop[D*id+i];
}
}
}
void getBestScore(float *hpopulation)
{
int max=0;
for(int i=1; i<hNP; i++)
{
if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
max=i;
}
for(int j=0; j<hN; j++)
cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}
int main()
{
cudaEvent_t start,stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start,0));
int device, st=100;
float hCr=0.6f, hF=0.8f;
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDevice(&device));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
// int SN = prop.maxThreadsPerBlock; //512 threads per block
//int SB = (hNP+(SN-1))/SN;
//constants NP, D, N, Cr, F
HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));
//seeds
curandState* devStates;
HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
setup_kernel <<< 1, hNP>>> (devStates, 50);
//population
float *population, *pop;
float hpopulation[hNP*hD];
HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));
//mutation
int *mutation, *mutation1;
int *hmutation;
HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));
//stream
cudaStream_t stream_i, stream_j;
HANDLE_ERROR(cudaStreamCreate(&stream_i));
HANDLE_ERROR(cudaStreamCreate(&stream_j));
kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);
while(st != 0)
{
/*** COPYING MUTATION INDICES***/
HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));
/**** CALLING KERNELS****/
kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);
st--;
//HANDLE_ERROR(cudaStreamSynchronize(stream_i));
//HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
//getBestScore(hpopulation);
//cin.get();
}
HANDLE_ERROR(cudaStreamSynchronize(stream_i));
HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
getBestScore(hpopulation);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float time;
HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaStreamDestroy(stream_i));
HANDLE_ERROR(cudaStreamDestroy(stream_j));
HANDLE_ERROR(cudaFree(population));
HANDLE_ERROR(cudaFree(pop));
HANDLE_ERROR(cudaFreeHost(hmutation));
HANDLE_ERROR(cudaFree(mutation1));
HANDLE_ERROR(cudaFree(devStates));
system("pause");
return 0;
}