Line detection on CUDA

Line detection on CUDA - cuda

I am trying to do real time line detection using CUDA. I have calculated the hough transform along with the min, max line coordinates of each bin. For getting the line segments I am tracing (using Bresenham's line algorithm) through the min to max point and get the line segments on each bin. When the hough threshold is low and when lot of lines are there in the image trace_lines takes lot of time to complete.
hough transform (hough_line_transform) computation takes around 5-10ms per frame(1280x720) on a GTX 660 (observed to be 10 times faster than CPU implementation). But tracing the line segments from the min, max points takes 1ms-15ms.
I have two questions on line detection
Does there exist a better algorithm to get the line segments from the min, max points of the hough bins?
Is it possible to optimize hought_line_transform (see the code below) further? I am using atomic operations. Is it possible to avoid atomics.
I am attaching the code below.
class Header
#ifndef _HOUGH_LINES_H_
#define _HOUGH_LINES_H_
#include <cuda_gl_interop.h>
#include <thrust/device_vector.h>
union Pos;
struct Line;
struct Hough_params
{
int w;
int h;
int r;
};
class Hough_lines
{
public:
enum Type {INT, SHORT_INT, FLOAT};
Hough_lines(int _w, int _h);
~Hough_lines();
public:
bool init();
bool detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Type type, int& count);
protected:
void get_edges(thrust::device_vector<Pos>& d_coords, int& size);
void get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size);
void get_lines(int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count);
void trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count);
static void compute_trig_funcs();
protected:
Hough_params params;
thrust::device_vector<Hough_params> d_param;
static bool trig_init;
};
#endif
Body
#include <hough_lines.h>
#include <math.h>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_gl_interop.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/scan.h>
#define ANGLE_SIZE 360
#define MAX_LINE_PER_THREAD 10
union Pos
{
struct
{
uint16_t x;
uint16_t y;
};
uint32_t value;
};
struct Hough_info
{
Pos end;
Pos start;
int count;
};
struct Line
{
Pos start;
Pos end;
};
struct Line_info
{
int line_count;
Line line[MAX_LINE_PER_THREAD];
};
__constant__ float dev_sint[ANGLE_SIZE];
__constant__ float dev_cost[ANGLE_SIZE];
texture<uint8_t, 2, cudaReadModeElementType> luma_tex;
bool Hough_lines::trig_init = false;
__global__ void mark_edges(const Hough_params* param, int* edge)
{
int x = (blockIdx.x*blockDim.x+threadIdx.x);
int y = (blockIdx.y*blockDim.y+threadIdx.y);
int pos = x+(param->w*y);
edge[pos] = (255 == tex2D(luma_tex, x, y))?1:0;
}
__global__ void get_coords(const Hough_params* param, int* edge, Pos* coord)
{
int index;
int x = (blockIdx.x*blockDim.x+threadIdx.x);
int y = (blockIdx.y*blockDim.y+threadIdx.y);
int pos = x+(param->w*y);
if (255 == tex2D(luma_tex, x, y))
{
index = edge[pos];
coord[index].y = y;
coord[index].x = x;
}
}
__global__ void hough_line_transform(const Hough_params* param, int size, const Pos* coord, int threshold, int *mark, Hough_info* out)
{
int i;
int angle;
int rdata;
__shared__ Hough_info sh_rho_data[1001];
i = threadIdx.x;
while (i < param->r)
{
sh_rho_data[i].end.value = 0x0;
sh_rho_data[i].start.value = 0xFFFFFFFF;
sh_rho_data[i].count = 0;
i += blockDim.x;
}
__syncthreads();
i = threadIdx.x;
angle = blockIdx.x;
const float cos_angle = dev_cost[angle];
const float sin_angle = dev_sint[angle];
while (i < size)
{
rdata = (int)ceil(((float)(coord[i].x-(param->w>>1))*cos_angle)+((float)((param->h>>1)-coord[i].y)*sin_angle));
if (rdata >= 0)
{
atomicMax(&sh_rho_data[rdata].end.value, coord[i].value);
atomicMin(&sh_rho_data[rdata].start.value, coord[i].value);
atomicAdd(&sh_rho_data[rdata].count, 1);
}
i += blockDim.x;
}
__syncthreads();
i = threadIdx.x;
rdata = (angle*param->r);
while (i < param->r)
{
memcpy(&out[rdata+i], &sh_rho_data[i], sizeof(Hough_info));
mark[rdata+i] = (sh_rho_data[i].count >= threshold)?1:0;
i += blockDim.x;
}
}
__global__ void get_lines(const Hough_params* param, int threshold, Hough_info* hdata, int* mark, Line* lines)
{
int pos;
int i = threadIdx.x;
int offset = (blockIdx.x*param->r);
while (i < param->r)
{
if (hdata[offset+i].count >= threshold)
{
pos = mark[offset+i];
lines[pos].start.value = hdata[offset+i].start.value;
lines[pos].end.value = hdata[offset+i].end.value;
}
i += blockDim.x;
}
}
__device__ void add_line(int xs, int ys, int xe, int ye, int min_len, Line_info* line)
{
int d = abs(xe-xs)+abs(ye-ys);
if ((d >= min_len) && (line->line_count < MAX_LINE_PER_THREAD))
{
line->line[line->line_count].start.x = xs;
line->line[line->line_count].start.y = ys;
line->line[line->line_count].end.x = xe;
line->line[line->line_count].end.y = ye;
++line->line_count;
//printf("\n(%d %d) (%d %d) %d", xs, ys, xe, ye, d);
}
}
__global__ void trace_lines(const Line* input, int inp_size, int min_len, int min_gap, Line_info* line_info, int* mark)
{
int d;
int dsub;
int dstep;
int xstep;
int ystep;
int xs, ys, xe, ye;
int i = (blockIdx.x*blockDim.x+threadIdx.x);
if (i >= inp_size)
{
return;
}
xs = input[i].start.x;
ys = input[i].start.y;
xe = input[i].end.x;
ye = input[i].end.y;
line_info[i].line_count = 0;
int dx = abs(xe-xs);
int dy = abs(ye-ys);
int xinc = (xe > xs)?1:-1;
int yinc = (ye > ys)?1:-1;
int gap = 0;
bool sflag;
int s_x, s_y, e_x, e_y;
if (dx > dy)
{
dsub = (dx<<1);
dstep = (dy<<1);
d = dstep-dx;
xstep = xinc;
ystep = 0;
xinc = 0;
}
else
{
dsub = (dy<<1);
dstep = (dx<<1);
d = dstep-dy;
xstep = 0;
ystep = yinc;
yinc = 0;
}
sflag = true;
s_x = xs;
s_y = ys;
e_x = xs;
e_y = ys;
int x = xs;
int y = ys;
while ((abs(x-xs) <= dx) && (abs(y-ys) <= dy))
{
x += xstep;
y += ystep;
if (d > 0)
{
x += xinc;
y += yinc;
d -= dsub;
}
d += dstep;
if (255 == tex2D(luma_tex, x, y))
{
e_x = x;
e_y = y;
gap = 0;
if (!sflag)
{
s_x = x;
s_y = y;
sflag = true;
}
}
else if (sflag)
{
++gap;
if (gap >= min_gap)
{
sflag = false;
add_line(s_x, s_y, e_x, e_y, min_len, &line_info[i]);
}
}
}
if (sflag)
{
add_line(s_x, s_y, xe, ye, min_len, &line_info[i]);
}
mark[i] = line_info[i].line_count;
}
__global__ void copy_line_coords(const Hough_params* param, Line_info* line, int size, int* mark, int* coords, int* count)
{
int index = (blockIdx.x*blockDim.x+threadIdx.x);
if (index >= size)
{
return;
}
int pos;
int start = 4*mark[index];
Line* line_data = &line[index].line[0];
for (int i = 0; i < line[index].line_count; i++)
{
pos = start+(4*i);
coords[pos] = line_data[i].start.x-(param->w>>1);
coords[pos+1] = (param->h>>1)-line_data[i].start.y;
coords[pos+2] = line_data[i].end.x-(param->w>>1);
coords[pos+3] = (param->h>>1)-line_data[i].end.y;
}
if ((index+1) == size)
{
*count = mark[index];
}
}
Hough_lines::Hough_lines(int _w, int _h)
:d_param(1)
{
params.w = _w;
params.h = _h;
params.r = (int)ceil(0.5*sqrt((_w*_w)+(_h*_h)));
thrust::copy_n(&params, 1, d_param.begin());
}
Hough_lines::~Hough_lines()
{
}
bool Hough_lines::init()
{
if (false == trig_init)
{
trig_init = true;
compute_trig_funcs();
}
return true;
}
void Hough_lines::compute_trig_funcs()
{
float theta;
cudaError_t err = cudaSuccess;
static float sint[ANGLE_SIZE];
static float cost[ANGLE_SIZE];
for (int i = 0; i < ANGLE_SIZE; i++)
{
theta = (M_PI*(float)i)/180.0;
sint[i] = sin(theta);
cost[i] = cos(theta);
}
err = cudaMemcpyToSymbol(dev_sint, sint, ANGLE_SIZE*sizeof(float));
err = (cudaSuccess == err) ? cudaMemcpyToSymbol(dev_cost, cost, ANGLE_SIZE*sizeof(float)):err;
if (cudaSuccess != err)
{
printf("\n%s", cudaGetErrorString(cudaGetLastError()));
}
}
void Hough_lines::get_edges(thrust::device_vector<Pos>& d_coords, int& size)
{
dim3 bsize(16, 16);
dim3 gsize(params.w/bsize.x, params.h/bsize.y);
thrust::device_vector<int> d_mark(params.w*params.h);
size = 0;
mark_edges<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
thrust::raw_pointer_cast(d_mark.data()));
thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
get_coords<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
thrust::raw_pointer_cast(d_mark.data()),
thrust::raw_pointer_cast(d_coords.data()));
thrust::copy_n(d_mark.begin()+d_mark.size()-1, 1, &size);
}
void Hough_lines::get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size)
{
int edge_count = 0;
thrust::device_vector<Pos> d_coords(params.w*params.h);
get_edges(d_coords, edge_count);
thrust::device_vector<int> d_mark(params.r*360);
thrust::device_vector<Hough_info> d_hough_data(params.r*360);
hough_line_transform<<<360, 256>>>(thrust::raw_pointer_cast(d_param.data()),
edge_count,
thrust::raw_pointer_cast(d_coords.data()), threshold,
thrust::raw_pointer_cast(d_mark.data()),
thrust::raw_pointer_cast(d_hough_data.data()));
thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
::get_lines<<<360, 256>>>(thrust::raw_pointer_cast(d_param.data()),
threshold,
thrust::raw_pointer_cast(d_hough_data.data()),
thrust::raw_pointer_cast(d_mark.data()),
thrust::raw_pointer_cast(d_lines.data()));
thrust::copy_n(d_mark.begin()+d_mark.size()-1, 1, &size);
}
void Hough_lines::trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count)
{
thrust::device_vector<int> d_mark_line(size);
thrust::device_vector<Line_info> d_nlines(size);
trace_lines<<<(1+(size/512)), 512>>>(thrust::raw_pointer_cast(d_lines.data()),
size, min_len, min_gap, thrust::raw_pointer_cast(d_nlines.data()),
thrust::raw_pointer_cast(d_mark_line.data()));
thrust::exclusive_scan(d_mark_line.begin(), d_mark_line.end(), d_mark_line.begin());
thrust::device_vector<int> d_count(1);
copy_line_coords<<<(1+(size/512)), 512>>>(thrust::raw_pointer_cast(d_param.data()),
thrust::raw_pointer_cast(d_nlines.data()), size,
thrust::raw_pointer_cast(d_mark_line.data()), d_line_coord,
thrust::raw_pointer_cast(d_count.data()));
thrust::copy(d_count.begin(), d_count.end(), &count);
//printf("\nLine count: %d", count);
}
void Hough_lines::get_lines(int threshold, int min_len, int min_gap, GLuint line, Hough_lines::Type type, int& count)
{
int* d_line_coord = 0;
cudaGLRegisterBufferObject(line);
cudaGLMapBufferObject((void **)&d_line_coord, line);
int size = 0;
thrust::device_vector<Line> d_lines(params.r*360);
get_hough_lines(threshold, d_lines, size);
//printf("\nget_hough_lines: %d", size);
trace_all_lines(min_len, min_gap, d_lines, size, d_line_coord, count);
cudaGLUnmapBufferObject(line);
cudaGLUnregisterBufferObject(line);
}
bool Hough_lines::detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count)
{
cudaError_t err;
cudaArray* array_edge;
cudaGraphicsResource* res_edge;
err = cudaGraphicsGLRegisterImage(&res_edge, tex_edge, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly);
if (err != cudaSuccess)
{
printf("cudaGraphicsGLRegisterImage Failed: %s", cudaGetErrorString(cudaGetLastError()));
exit(0);
}
cudaGraphicsMapResources(1, &res_edge);
cudaChannelFormatDesc chan_desc = cudaCreateChannelDesc<uint8_t>();
err = cudaGraphicsSubResourceGetMappedArray(&array_edge, res_edge, 0, 0);
if (err != cudaSuccess)
{
printf("cudaGraphicsSubResourceGetMappedArray Failed: %s", cudaGetErrorString(cudaGetLastError()));
exit(0);
}
if (cudaBindTextureToArray(&luma_tex, array_edge, &chan_desc) != cudaSuccess)
{
printf("Failed to bind texture - %s\n", cudaGetErrorString(cudaGetLastError()));
exit(0);
}
float time = 0.0;
//static float max = 0.0;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
count = 0;
get_lines(threshold, min_length, min_gap, line, type, count);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
//static int frame = 0;
//frame++;
//if (time > max)
{
//max = time;
printf("\nElpased time: %f ms", time);
}
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaUnbindTexture(luma_tex);
cudaGraphicsUnmapResources(1, &res_edge);
cudaGraphicsUnregisterResource(res_edge);
return true;
}

In "Prefix sums and their applications", Guy Blelloch described a parallel line-drawing algorithm using parallel prefix sum. See page 55 of that paper, it might give you ideas.
Regarding how to optimize hough_line_transfer, I think the key is to eliminate shared memory atomics in your loop. Where you use them you are effectively doing keyed reductions. Thrust provides a reduce_by_key function, but that is only callable from the host. The device-library counterpart to Thrust is CUB, but it does not have reduce_by_key. I've asked the CUB authors for ideas here and if we come up with anything I'll update this answer.
You could write your own keyed reduction but it would be more productive and robust to rely on a library if possible.

Related

Cuda Implementation of Partitioned Subgroup

is there a more efficient way to implement the "Partitioned Subgroup" functions of Vulkan/OpenGL, which do not have to loop over all elements in the subgroup? My current implementation just uses a loop from 0 to WARP_SIZE.
References:
(slide 37+38) https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9909-nvidia-vulkan-features-update.pdf
https://github.com/KhronosGroup/GLSL/blob/master/extensions/nv/GL_NV_shader_subgroup_partitioned.txt
Simple Implementation:
__device__ uint32_t subgroupPartitionNV(ivec2 p)
{
uint32_t result = 0;
for (int i = 0; i < 32; ++i)
{
int x = __shfl_sync(0xFFFFFFFF, p(0), i);
int y = __shfl_sync(0xFFFFFFFF, p(1), i);
uint32_t b = __ballot_sync(0xFFFFFFFF, p(0) == x && p(1) == y);
if (i == threadIdx.x & 31) result = b;
}
return result;
}
__device__ uint32_t subgroupPartitionedAddNV(float value, uint32_t ballot)
{
float result = 0;
for ( unsigned int i = 0; i < 32; ++i)
{
float other_value = __shfl_sync(0xFFFFFFFF, value, i);
if ((1U << i) & ballot) result += other_value;
}
return result;
}

Thanks to the hint of Abator I came up with a more efficient solution. It's a little ugly because labeled_partition is only implemented for int but works quite well.
template <int GROUP_SIZE = 32>
__device__ cooperative_groups::coalesced_group subgroupPartitionNV(ivec2 p)
{
using namespace cooperative_groups;
thread_block block = this_thread_block();
thread_block_tile<GROUP_SIZE> tile32 = tiled_partition<GROUP_SIZE>(block);
coalesced_group g1 = labeled_partition(tile32, p(0));
coalesced_group g2 = labeled_partition(tile32, p(1));
details::_coalesced_group_data_access acc;
return acc.construct_from_mask<coalesced_group>(acc.get_mask(g1) & acc.get_mask(g2));
}
template <typename T, int GROUP_SIZE = 32>
__device__ T subgroupPartitionedAddNV(T value, cooperative_groups::coalesced_group group)
{
int s = group.size();
int r = group.thread_rank();
for (int offset = GROUP_SIZE / 2; offset > 0; offset /= 2)
{
auto v = group.template shfl_down(value, offset);
if (r + offset < s) value += v;
}
return value;
}

Thrust/CUDA replicate an array multiple times combined with the values of another array

Let's say I have two arrays
A = {1, 2, 3}
and
B = {10,20,30,40,50}
I want to generate a new array which would have a size of
sizeof(A) * sizeof(B)
I want to replicate B sizeof(A) times, and on each repetition i, the resultant array should have A[i] added to it. So the result would be something like
{11,21,31,41,51,12,22,32,42,52,13,23,33,43,53}

This task can be interpreted as a 2-dimensional problem where the output array can be treated as a matrix of dimensions sizeof(A) times sizeof(B). In this way, we can use 2D CUDA indexing to achieve the desired functionality. A sample CUDA C++ code of this 2D implementation is shown below:
#include <iostream>
#include <cuda_runtime.h>
#include <cassert>
using namespace std;
__global__ void kernel_replicate(int* a, int* b, int* c, int alen, int blen, int clen)
{
const int ai = blockIdx.x * blockDim.x + threadIdx.x;
const int bi = blockIdx.y * blockDim.y + threadIdx.y;
if(ai<alen && bi<blen)
{
const int ci = ai * blen + bi;
c[ci] = a[ai] + b[bi];
}
}
void replicate_device(int* a, int* b, int* c, int alen, int blen, int clen)
{
dim3 block(16,16);
dim3 grid;
grid.x = (alen + block.x - 1) / block.x;
grid.y = (blen + block.y - 1) / block.y;
kernel_replicate<<<grid, block>>>(a,b,c,alen,blen,clen);
assert(cudaSuccess == cudaDeviceSynchronize());
}
void replicate(int* a, int* b, int* c, int alen, int blen, int clen)
{
int *ad, *bd, *cd;
size_t abytes = alen * sizeof(int);
size_t bbytes = blen * sizeof(int);
size_t cbytes = clen * sizeof(int);
cudaMalloc(&ad, abytes);
cudaMalloc(&bd, bbytes);
cudaMalloc(&cd, cbytes);
cudaMemcpy(ad,a, abytes, cudaMemcpyHostToDevice);
cudaMemcpy(bd,b, bbytes, cudaMemcpyHostToDevice);
replicate_device(ad,bd,cd, alen,blen,clen);
cudaMemcpy(c,cd, cbytes, cudaMemcpyDeviceToHost);
cudaFree(ad);
cudaFree(bd);
cudaFree(cd);
}
int main()
{
const int alen = 3;
const int blen = 5;
const int clen = alen * blen;
int A[alen] = {1,2,3};
int B[blen] = {10,20,30,40,50};
int C[clen] = {0};
replicate(A,B,C,alen, blen, clen);
for(int i=0; i<alen; i++)
{
cout<<A[i]<<" ";
}
cout<<endl;
for(int i=0; i<blen; i++)
{
cout<<B[i]<<" ";
}
cout<<endl;
for(int i=0; i<clen; i++)
{
cout<<C[i]<<" ";
}
cout<<endl;
return 0;
}

prefix scan for large arrays

I want to write a prefix scan for large arrays using the instruction in GPUgem, It's a homework for my parallel class. I did follow all the steps in the book but still my code's not working. I got it to work for array size 4096 but it's not working for larger arrays. Here is my code :
#include <stdio.h>
#include <sys/time.h>
#define THREADS 1024
typedef int mytype;
__global__ void phaseI(mytype *g_odata, mytype *g_idata, int n, mytype *aux)
{
__shared__ mytype temp[THREADS];
const int tid1 = threadIdx.x;
int offset = 1;
temp[2*tid1] = g_idata[2*tid1]; // load input into shared memory
temp[2*tid1+1] = g_idata[2*tid1+1];
for (int d = THREADS>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
__syncthreads();
if (tid1 == 0) {
aux[blockIdx.x] = temp[THREADS - 1];
temp[THREADS - 1] = 0;
}
for (int d = 1; d < THREADS; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
mytype t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
__global__ void phaseII(mytype *g_odata, mytype *aux, int n)
{
const int tid1 = threadIdx.x;
const int B = (n / THREADS);
int offset = 1;
for (int d = B>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
__syncthreads();
if (tid1 == 0 && blockIdx.x == 0) {
aux[B - 1] = 0;
}
for (int d = 1; d < B; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
mytype t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] += aux[blockIdx.x];
g_odata[2*thid+1] += aux[blockIdx.x];
}
int main(int argc, char *argv[])
{
if (argc != 2) {
printf("usage: %s n\n", argv[0]);
return -1;
}
const int n = atoi(argv[1]);
mytype *h_i, *d_i, *h_o, *d_o, *d_temp;
const int size = n * sizeof(mytype);
h_i = (mytype *)malloc(size);
h_o = (mytype *)malloc(size);
if ((h_i == NULL) || (h_o == NULL)) {
printf("malloc failed\n");
return -1;
}
for (int i = 0; i < n; i++) {
h_i[i] = i;
h_o[i] = 0;
}
cudaMalloc(&d_i, size);
cudaMalloc(&d_temp, (n / THREADS) );
cudaMalloc(&d_o, size);
cudaMemset(d_o, 0, size);
cudaMemset(d_temp, 0, (n / THREADS));
cudaMemcpy(d_i, h_i, size, cudaMemcpyHostToDevice);
int blocks = n / THREADS;
phaseI<<<blocks, THREADS / 2 >>>(d_o, d_i, n, d_temp);
phaseII<<<blocks, THREADS / 2>>>(d_o, d_temp, n);
cudaThreadSynchronize();
cudaMemcpy(h_o, d_o, size, cudaMemcpyDeviceToHost);
printf("\n");
for (int i = 0; i < n ; i++) {
printf(" %d", h_o[i]);
}
printf("\n\n");
return 0;
}
Does anyone have any idea what I'm doing wrong?

One possible error I see in your code is here:
aux[thid] = temp[THREADS];
If your temp array is temp[1024], as you say, and each block has 1024 threads, as you say, then if THREADS is 1024, temp[THREADS] will access your shared memory array out-of-bounds (one past the end.) An array of 1024 elements only has valid indices from 0 to 1023.
Beyond that, it seems like you're asking how to take the last element out of a shared memory array (temp) and place it in a position in a (presumably global) aux array, which has one element for each block.
Here's a fully worked example:
$ cat t831.cu
#include <stdio.h>
#define THREADS 1024
#define BLOCKS 20
__global__ void kernel(int *aux){
__shared__ int temp[THREADS];
temp[threadIdx.x] = threadIdx.x + blockIdx.x;
__syncthreads();
if (threadIdx.x == 0)
aux[blockIdx.x] = temp[THREADS-1];
}
int main(){
int *h_data, *d_data;
const int dsize = BLOCKS*sizeof(int);
h_data=(int *)malloc(dsize);
cudaMalloc(&d_data, dsize);
memset(h_data, 0, dsize);
cudaMemset(d_data, 0, dsize);
kernel<<<BLOCKS, THREADS>>>(d_data);
cudaMemcpy(h_data, d_data, dsize, cudaMemcpyDeviceToHost);
for (int i = 0; i < BLOCKS; i++) printf("%d, ", h_data[i]);
printf("\n");
return 0;
}
$ nvcc -o t831 t831.cu
$ cuda-memcheck ./t831
========= CUDA-MEMCHECK
1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042,
========= ERROR SUMMARY: 0 errors
$

CUDA loop unroll in array add

I want to compute 'out = alpha * px + beta * py','px' and 'py' is array.*
I have a simple kernel:
__global__
void saxpyGPU2( float *out, const float *px, const float *py, size_t N, float alpha,float beta )
{
size_t i = blockDim.x*blockIdx.x + threadIdx.x;
while (i < N)
{
out[i] = alpha * px[i] + beta * py[i];
i += blockDim.x*gridDim.x;
}
}
It works, so I want to loop unroll.
The code in cuda-handbook is:
template<const int n>
__device__
void saxpy_unrolled(float *out, const float *px, const float *py, size_t N, float alpha,float beta)
{
float x[n], y[n];
size_t i;
for ( i = n*blockIdx.x*blockDim.x+threadIdx.x; i < N-n*blockDim.x*gridDim.x; i += n*blockDim.x*gridDim.x ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
x[j] = px[index];
y[j] = py[index];
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
out[index] = alpha*x[j]+beta* y[j];
}
}
// to avoid the (index<N) conditional in the inner loop,
// we left off some work at the end
for ( int j = 0; j < n; j++ ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) {
x[j] = px[index];
y[j] = py[index];
}
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) out[index] = alpha*x[j]+beta* y[j];
}
}
}
__global__
void saxpyGPU( float *out, const float *px, const float *py, size_t N, float alpha,float beta )
{
saxpy_unrolled<4>( out, px, py, N, alpha ,beta);
}
I don't understand in the second branch when i > N-n*blockDim.x*gridDim.x. why use a outer loop
for ( int j = 0; j < n; j++ ) {
for ( int j = 0; j < n; j++ )....}
And I test those two kernel , first one is OK, but second one I copy from the book is incorrect.
I initial two array while(i<1024) a[i] = i; b[i] = 10*i;i++, and I want to compute the c = alpha*a + beta*b use the two kernels above, but the result in the loop unrolled kernel is 4.3e8 for all element in c.
This my test code:
int main(){
int arraySize = 1024;
float* a =new float[arraySize];
float* b =new float[arraySize];
float* c =new float[arraySize];
for (int i =0;i<arraySize;i++)
{
a[i] = 1.0* i;
b[i] = 10.0*i;
c[i] = 0.0;
}
float* d_a;
float* d_b;
float* d_c;
cudaMalloc((void**)&d_a,sizeof(float)*arraySize);
cudaMemcpy(d_a,a,sizeof(float)*arraySize,cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_b,sizeof(float)*arraySize);
cudaMemcpy(d_b,b,sizeof(float)*arraySize,cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_c,sizeof(float)*arraySize);
for (int i=0;i<arraySize;i++)
{
c[i] = a[i] + b[i];
}
dim3 block_size(256,1,1);
dim3 grid_size((arraySize -1)/block_size.x+1,1,1);
float alpha = 1.0;
float beta = 1.0;
bool flag = true;
if(flag)
{
saxpyGPU<<<grid_size,block_size>>>(d_c,d_a,d_b,arraySize,alpha,beta);
float* temp = new float[arraySize];
cudaMemcpy(temp,d_c,arraySize*sizeof(float),cudaMemcpyDeviceToHost);
for (int i = 0;i<arraySize;i++)
{
cout<<(temp[i] - c[i])<<",";
}
}
else
{
saxpyGPU2<<<grid_size,block_size>>>(d_c,d_a,d_b,arraySize,alpha,beta);
cudaMemcpy(temp,d_c,arraySize*sizeof(float),cudaMemcpyDeviceToHost);
for (int i = 0;i<arraySize;i++)
{
cout<<(temp[i] - c[i])<<",";
}
Those two kernel show different result

The kernel code you posted is perfectly correct and produces the expected results. This can be demonstrated using the following code:
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
#include <vector>
#include <algorithm>
#include <cmath>
template<const int n>
__device__
void saxpy_unrolled(float *out, const float *px, const float *py,
size_t N, float alpha,float beta) {
float x[n], y[n];
size_t i;
for ( i = n*blockIdx.x*blockDim.x+threadIdx.x;
i < N-n*blockDim.x*gridDim.x;
i += n*blockDim.x*gridDim.x ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
x[j] = px[index];
y[j] = py[index];
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
out[index] = alpha*x[j]+beta* y[j];
}
}
for ( int j = 0; j < n; j++ ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) {
x[j] = px[index];
y[j] = py[index];
}
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) {
out[index] = alpha*x[j] + beta*y[j];
}
}
}
}
__global__
void saxpyGPU( float *out, const float *px, const float *py,
size_t N, float alpha,float beta ) {
saxpy_unrolled<4>( out, px, py, N, alpha ,beta);
}
struct prg {
float a, b;
__host__ __device__
prg(float _a=0.f, float _b=1.f) : a(_a), b(_b) {};
__host__ __device__
float operator()(const unsigned int n) const {
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> dist(a, b);
rng.discard(n);
return dist(rng);
}
};
int main(void) {
const int N = 100000;
const float alpha = 0.12345f, beta = 0.9876f;
prg gen(1.f, 2.f);
thrust::device_vector<float> x(N), y(N), z(N);
thrust::counting_iterator<unsigned int> iseqx(0);
thrust::counting_iterator<unsigned int> iseqy(N);
thrust::transform(iseqx, iseqx + N, x.begin(), gen);
thrust::transform(iseqy, iseqy + N, y.begin(), gen);
float *xp = thrust::raw_pointer_cast(&x[0]);
float *yp = thrust::raw_pointer_cast(&y[0]);
float *zp = thrust::raw_pointer_cast(&z[0]);
dim3 blockdim(128);
dim3 griddim(16);
saxpyGPU<<<griddim, blockdim>>>(zp, xp, yp, N, alpha, beta);
cudaDeviceSynchronize();
std::vector<float> xh(N), yh(N), zh(N);
thrust::copy(x.begin(), x.end(), xh.begin());
thrust::copy(y.begin(), y.end(), yh.begin());
thrust::copy(z.begin(), z.end(), zh.begin());
float maxabserr = -1.f, maxrelerr = -1.f;
for(int i=0; i<N; i++) {
float saxpyval = alpha * xh[i] + beta * yh[i];
float abserr = fabs(zh[i]-saxpyval);
float relerr = abserr / fmaxf(fabs(zh[i]), fabs(saxpyval));
maxabserr = fmaxf(abserr, maxabserr);
maxrelerr = fmaxf(relerr, maxrelerr);
}
std::cout.precision(10);
std::cout << "Maximum absolute error = " << maxabserr << std::endl;
std::cout << "Maximum relative error = " << maxrelerr << std::endl;
return 0;
}
which gives me the following:
$ nvcc -arch=sm_30 -o unrolled_saxpy unrolled_saxpy.cu
$ ./unrolled_saxpy
Maximum absolute error = 2.384185791e-07
Maximum relative error = 1.1920676e-07
If you (still) do not understand why the kernel is written as it is, follow what I showed you in your previous question and manually unroll the saxpy function. Start with n=1 and confirm it is functionally the same as the unrolled equivalent, and then try n=2, n=4, etc. to see what the action of loop unrolling is.

CUDA DE kernel not launching

I'm trying to do differential evolution on CUDA, but the problem is that kernel which is responsible for "Mutation, Crossover, Evaluation, Selection" never gets launched.
Any help?
Here's the entire code:
#include <iostream>
#include <curand_kernel.h>
using namespace std;
/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
system("pause");
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;
/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {
float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
+ (a[1] - b[1]) * (a[1] - b[1])
+ (a[2] - b[2]) * (a[2] - b[2]));
float distance6 = distance * distance * distance
* distance * distance * distance;
float distance12 = distance6 * distance6;
return 1/distance12 - 2/distance6;
}
/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
curand_init(seed, id, 0,&state[id]);
}
/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
{
//init, just populating array with some specific numbers
population[D*id]=0;
population[D*id+N]=0;
population[D*id +2*N]=0;
population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
population[D*id+N+1]=0;
population[D*id +2*N+1]=0;
for(int i=2; i<N; i++){
float min= -4 - 1/4*abs((int)((i-4)/3));
float max= 4 + 1/4*abs((int)((i-4)/3));
if(i==2)
{
population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+2]=0;
}
else
{
population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
}
}
//eval
float e=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
e += lennardJones(a,b);
}
}
population[D*id + D-1]=e;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
while(a == id){a = rndInt(globalState, id, NP);}
while(b == a && b==id){b=rndInt(globalState, id, NP);}
while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
mutation[D*id+0]=a;
mutation[D*id+1]=b;
mutation[D*id+2]=c;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a=mutation[D*id+0], b=mutation[D*id+1], c=mutation[D*id+2];
//DE mutation and crossover
int j=rndInt(globalState, id, NP);
for(int i=0; i<D-1; i++)
{
//DE mutation
pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
//DE crossover
if(Cr > rndFloat(globalState, id) && i!= j)
pop[D*id+i]=population[D*id +i];
}
// Eval
pop[D*id+D-1]=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
pop[D*id+D-1] += lennardJones(a,b);
}
}
__syncthreads();
//DE selection
if(pop[D*id+D-1] < population[D*id +D-1])
{
for(int i=0; i<D; i++)
population[D*id +i]=pop[D*id+i];
}
}
}
void getBestScore(float *hpopulation)
{
int max=0;
for(int i=1; i<hNP; i++)
{
if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
max=i;
}
for(int j=0; j<hN; j++)
cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}
int main()
{
cudaEvent_t start,stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start,0));
int device, st=100;
float hCr=0.6f, hF=0.8f;
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDevice(&device));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
// int SN = prop.maxThreadsPerBlock; //512 threads per block
//int SB = (hNP+(SN-1))/SN;
//constants NP, D, N, Cr, F
HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));
//seeds
curandState* devStates;
HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
setup_kernel <<< 1, hNP>>> (devStates, 50);
//population
float *population, *pop;
float hpopulation[hNP*hD];
HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));
//mutation
int *mutation, *mutation1;
int *hmutation;
HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));
//stream
cudaStream_t stream_i, stream_j;
HANDLE_ERROR(cudaStreamCreate(&stream_i));
HANDLE_ERROR(cudaStreamCreate(&stream_j));
kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);
while(st != 0)
{
/*** COPYING MUTATION INDICES***/
HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));
/**** CALLING KERNELS****/
kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);
st--;
//HANDLE_ERROR(cudaStreamSynchronize(stream_i));
//HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
//getBestScore(hpopulation);
//cin.get();
}
HANDLE_ERROR(cudaStreamSynchronize(stream_i));
HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
getBestScore(hpopulation);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float time;
HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaStreamDestroy(stream_i));
HANDLE_ERROR(cudaStreamDestroy(stream_j));
HANDLE_ERROR(cudaFree(population));
HANDLE_ERROR(cudaFree(pop));
HANDLE_ERROR(cudaFreeHost(hmutation));
HANDLE_ERROR(cudaFree(mutation1));
HANDLE_ERROR(cudaFree(devStates));
system("pause");
return 0;
}
UPDATE - Solution:
#include <iostream>
#include <curand_kernel.h>
using namespace std;
/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
system("pause");
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;
/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {
float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
+ (a[1] - b[1]) * (a[1] - b[1])
+ (a[2] - b[2]) * (a[2] - b[2]));
float distance6 = distance * distance * distance
* distance * distance * distance;
float distance12 = distance6 * distance6;
return 1/distance12 - 2/distance6;
}
/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
curand_init(seed, id, 0,&state[id]);
}
/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
{
//init, just populating array with some specific numbers
population[D*id]=0;
population[D*id+N]=0;
population[D*id +2*N]=0;
population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
population[D*id+N+1]=0;
population[D*id +2*N+1]=0;
for(int i=2; i<N; i++){
float min= -4 - 1/4*abs((int)((i-4)/3));
float max= 4 + 1/4*abs((int)((i-4)/3));
if(i==2)
{
population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+2]=0;
}
else
{
population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
}
}
//eval
float e=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
e += lennardJones(a,b);
}
}
population[D*id + D-1]=e;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
while(a == id){a = rndInt(globalState, id, NP);}
while(b == a && b==id){b=rndInt(globalState, id, NP);}
while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
mutation[3*id+0]=a;
mutation[3*id+1]=b;
mutation[3*id+2]=c;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];
//DE mutation and crossover
int j=rndInt(globalState, id, NP);
for(int i=0; i<D-1; i++)
{
//DE mutation
pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
//DE crossover
if(Cr > rndFloat(globalState, id) && i!= j)
pop[D*id+i]=population[D*id +i];
}
// Eval
pop[D*id+D-1]=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
pop[D*id+D-1] += lennardJones(a,b);
}
}
__syncthreads();
//DE selection
if(pop[D*id+D-1] < population[D*id +D-1])
{
for(int i=0; i<D; i++)
population[D*id +i]=pop[D*id+i];
}
}
}
void getBestScore(float *hpopulation)
{
int max=0;
for(int i=1; i<hNP; i++)
{
if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
max=i;
}
for(int j=0; j<hN; j++)
cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}
int main()
{
cudaEvent_t start,stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start,0));
int device, st=100;
float hCr=0.6f, hF=0.8f;
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDevice(&device));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
// int SN = prop.maxThreadsPerBlock; //512 threads per block
//int SB = (hNP+(SN-1))/SN;
//constants NP, D, N, Cr, F
HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));
//seeds
curandState* devStates;
HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
setup_kernel <<< 1, hNP>>> (devStates, 50);
//population
float *population, *pop;
float hpopulation[hNP*hD];
HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));
//mutation
int *mutation, *mutation1;
int *hmutation;
HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));
//stream
cudaStream_t stream_i, stream_j;
HANDLE_ERROR(cudaStreamCreate(&stream_i));
HANDLE_ERROR(cudaStreamCreate(&stream_j));
kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);
while(st != 0)
{
/*** COPYING MUTATION INDICES***/
HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));
/**** CALLING KERNELS****/
kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);
st--;
//HANDLE_ERROR(cudaStreamSynchronize(stream_i));
//HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
//getBestScore(hpopulation);
//cin.get();
}
HANDLE_ERROR(cudaStreamSynchronize(stream_i));
HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
getBestScore(hpopulation);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float time;
HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaStreamDestroy(stream_i));
HANDLE_ERROR(cudaStreamDestroy(stream_j));
HANDLE_ERROR(cudaFree(population));
HANDLE_ERROR(cudaFree(pop));
HANDLE_ERROR(cudaFreeHost(hmutation));
HANDLE_ERROR(cudaFree(mutation1));
HANDLE_ERROR(cudaFree(devStates));
system("pause");
return 0;
}

Solution:
#include <iostream>
#include <curand_kernel.h>
using namespace std;
/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
system("pause");
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;
/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {
float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
+ (a[1] - b[1]) * (a[1] - b[1])
+ (a[2] - b[2]) * (a[2] - b[2]));
float distance6 = distance * distance * distance
* distance * distance * distance;
float distance12 = distance6 * distance6;
return 1/distance12 - 2/distance6;
}
/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
curand_init(seed, id, 0,&state[id]);
}
/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
{
//init, just populating array with some specific numbers
population[D*id]=0;
population[D*id+N]=0;
population[D*id +2*N]=0;
population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
population[D*id+N+1]=0;
population[D*id +2*N+1]=0;
for(int i=2; i<N; i++){
float min= -4 - 1/4*abs((int)((i-4)/3));
float max= 4 + 1/4*abs((int)((i-4)/3));
if(i==2)
{
population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+2]=0;
}
else
{
population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
}
}
//eval
float e=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
e += lennardJones(a,b);
}
}
population[D*id + D-1]=e;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
while(a == id){a = rndInt(globalState, id, NP);}
while(b == a && b==id){b=rndInt(globalState, id, NP);}
while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
mutation[3*id+0]=a;
mutation[3*id+1]=b;
mutation[3*id+2]=c;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];
//DE mutation and crossover
int j=rndInt(globalState, id, NP);
for(int i=0; i<D-1; i++)
{
//DE mutation
pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
//DE crossover
if(Cr > rndFloat(globalState, id) && i!= j)
pop[D*id+i]=population[D*id +i];
}
// Eval
pop[D*id+D-1]=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
pop[D*id+D-1] += lennardJones(a,b);
}
}
__syncthreads();
//DE selection
if(pop[D*id+D-1] < population[D*id +D-1])
{
for(int i=0; i<D; i++)
population[D*id +i]=pop[D*id+i];
}
}
}
void getBestScore(float *hpopulation)
{
int max=0;
for(int i=1; i<hNP; i++)
{
if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
max=i;
}
for(int j=0; j<hN; j++)
cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}
int main()
{
cudaEvent_t start,stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start,0));
int device, st=100;
float hCr=0.6f, hF=0.8f;
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDevice(&device));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
// int SN = prop.maxThreadsPerBlock; //512 threads per block
//int SB = (hNP+(SN-1))/SN;
//constants NP, D, N, Cr, F
HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));
//seeds
curandState* devStates;
HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
setup_kernel <<< 1, hNP>>> (devStates, 50);
//population
float *population, *pop;
float hpopulation[hNP*hD];
HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));
//mutation
int *mutation, *mutation1;
int *hmutation;
HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));
//stream
cudaStream_t stream_i, stream_j;
HANDLE_ERROR(cudaStreamCreate(&stream_i));
HANDLE_ERROR(cudaStreamCreate(&stream_j));
kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);
while(st != 0)
{
/*** COPYING MUTATION INDICES***/
HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));
/**** CALLING KERNELS****/
kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);
st--;
//HANDLE_ERROR(cudaStreamSynchronize(stream_i));
//HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
//getBestScore(hpopulation);
//cin.get();
}
HANDLE_ERROR(cudaStreamSynchronize(stream_i));
HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
getBestScore(hpopulation);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float time;
HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaStreamDestroy(stream_i));
HANDLE_ERROR(cudaStreamDestroy(stream_j));
HANDLE_ERROR(cudaFree(population));
HANDLE_ERROR(cudaFree(pop));
HANDLE_ERROR(cudaFreeHost(hmutation));
HANDLE_ERROR(cudaFree(mutation1));
HANDLE_ERROR(cudaFree(devStates));
system("pause");
return 0;
}

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008

Line detection on CUDA - cuda

Related

Cuda Implementation of Partitioned Subgroup

Thrust/CUDA replicate an array multiple times combined with the values of another array

prefix scan for large arrays

CUDA loop unroll in array add

CUDA DE kernel not launching

Categories

Resources