Can I use FFTW to make transform along first and last axis of 3D array?

Can I use FFTW to make transform along first and last axis of 3D array? - fft

Using fftw_plan_many_dft I can do transforms along x,y and y,z axis:
vector<complex<float_type>> yz_fft(vector<complex<float_type>> input, int N_X, int N_Y, int N_Z){
vector<complex<float_type>> result(input.size());
int rank = 2;
int n[] = {N_Y,N_Z};
int *inembed = n;
int *onembed = n;
int istride = 1;
int ostride = 1;
int idist = N_Y*N_Z;
int odist = N_Y*N_Z;
int howmany = N_X;
fftw_plan plan = fftw_plan_many_dft(
rank,
n,
howmany,
reinterpret_cast<fftw_complex *>(input.data()),
inembed,
istride,
idist,
reinterpret_cast<fftw_complex *>(result.data()),
onembed,
ostride,
odist,
FFTW_FORWARD,
FFTW_ESTIMATE);
fftw_execute(plan);
return result;
}
vector<complex<float_type>> xy_fft(vector<complex<float_type>> input, int N_X, int N_Y, int N_Z){
vector<complex<float_type>> result(input.size());
int rank = 2;
int n[] = {N_X,N_Y};
int *inembed = n;
int *onembed = n;
int istride = N_Z;
int ostride = N_Z;
int idist = 1;
int odist = 1;
int howmany = N_Z;
fftw_plan plan = fftw_plan_many_dft(
rank,
n,
howmany,
reinterpret_cast<fftw_complex *>(input.data()),
inembed,
istride,
idist,
reinterpret_cast<fftw_complex *>(result.data()),
onembed,
ostride,
odist,
FFTW_FORWARD,
FFTW_ESTIMATE);
fftw_execute(plan);
return result;
}
but I can't figure out how to do x,z transform. How do I do this?

So there is a way to use fftw_plan_many_dft to do xz transform. Downvotes may suggest that people are not interested in that but I decided to share it anyway. For solutnion check struct xz_fft_many below.
#include <iostream>
#include <numeric>
#include <complex>
#include <fftw3.h>
#include <benchmark/benchmark.h>
using namespace std;
using float_type = double;
using index_type = unsigned long;
vector<complex<float_type>> get_data(index_type N){
std::vector<complex<float_type>> data(N);
iota(data.begin(), data.end(),0);
return data;
}
void print(vector<complex<float_type>> data,index_type N_X,index_type N_Y,index_type N_Z){
for(int i=0; i!=N_X; ++i){
for(int j=0; j!=N_Y; ++j){
for(int k=0; k!=N_Z; ++k){
index_type idx = i*(N_Y*N_Z)+j*N_Z+k;
cout<<"[ "<<i<<", "<<j<<", "<<k<<" ] = "<<data.data()[idx]<<endl;
}
}
}
}
struct x_fft {
vector<complex<float_type>>& data;
vector<complex<float_type>> result;
fftw_plan fft_plan;
index_type N_X;
index_type N_Y;
index_type N_Z;
x_fft(vector<complex<float_type>>& data,index_type N_X,index_type N_Y,index_type N_Z)
: data(data), N_X(N_X), N_Y(N_Y), N_Z(N_Z)
{
result = vector<complex<float_type>>(data.size());
int rank = 1;
int n[] = {(int)N_X};
int *inembed = n;
int *onembed = n;
int istride = N_Y*N_Z;
int ostride = istride;
int idist = 1;
int odist = idist;
int howmany = N_Y*N_Z;
fft_plan = fftw_plan_many_dft(
rank,
n,
howmany,
reinterpret_cast<fftw_complex *>(data.data()),
inembed,
istride,
idist,
reinterpret_cast<fftw_complex *>(result.data()),
onembed,
ostride,
odist,
FFTW_FORWARD,
FFTW_MEASURE);
}
const vector<complex<float_type>> &getResult() const {
return result;
}
vector<complex<float_type>>& run(){
fftw_execute(fft_plan);
return result;
}
};
struct z_fft {
vector<complex<float_type>>& data;
vector<complex<float_type>> result;
fftw_plan fft_plan;
index_type N_X;
index_type N_Y;
index_type N_Z;
z_fft(vector<complex<float_type>>& data,index_type N_X,index_type N_Y,index_type N_Z)
: data(data), N_X(N_X), N_Y(N_Y), N_Z(N_Z)
{
result = vector<complex<float_type>>(data.size());
int rank = 1;
int n[] = {(int)N_Z};
int *inembed = n;
int *onembed = n;
int istride = 1;
int ostride = istride;
int idist = N_Z;
int odist = idist;
int howmany = N_X*N_Y;
fft_plan = fftw_plan_many_dft(
rank,
n,
howmany,
reinterpret_cast<fftw_complex *>(data.data()),
inembed,
istride,
idist,
reinterpret_cast<fftw_complex *>(result.data()),
onembed,
ostride,
odist,
FFTW_FORWARD,
FFTW_MEASURE);
}
vector<complex<float_type>>& run(){
fftw_execute(fft_plan);
return result;
}
};
struct xz_fft_many {
vector<complex<float_type>>& data;
vector<complex<float_type>> result;
fftw_plan fft_plan;
index_type N_X;
index_type N_Y;
index_type N_Z;
xz_fft_many(vector<complex<float_type>>& data,index_type N_X,index_type N_Y,index_type N_Z)
: data(data), N_X(N_X), N_Y(N_Y), N_Z(N_Z)
{
result = vector<complex<float_type>>(data.size());
int rank = 2;
int n[] = {(int) N_X, (int) N_Z};
int inembed[] = {(int) N_X, (int) (N_Z*N_Y)};
int *onembed = inembed;
int istride = 1;
int ostride = 1;
int idist = N_Z;
int odist = N_Z;
int howmany = N_Y;
fft_plan = fftw_plan_many_dft(
rank,
n,
howmany,
reinterpret_cast<fftw_complex *>(data.data()),
inembed,
istride,
idist,
reinterpret_cast<fftw_complex *>(result.data()),
onembed,
ostride,
odist,
FFTW_FORWARD,FFTW_MEASURE);
}
vector<complex<float_type>>& run(){
fftw_execute(fft_plan);
return result;
}
};
struct xz_fft_composition {
vector<complex<float_type>>& data;
index_type N_X;
index_type N_Y;
index_type N_Z;
x_fft* xFft;
z_fft* zFft;
xz_fft_composition(vector<complex<float_type>>& data,index_type N_X,index_type N_Y,index_type N_Z)
: data(data), N_X(N_X), N_Y(N_Y), N_Z(N_Z)
{
xFft = new x_fft(data,N_X,N_Y,N_Z);
zFft = new z_fft(xFft->result,N_X,N_Y,N_Z);
}
vector<complex<float_type>>& run(){
xFft->run();
return zFft->run();
}
};
struct TestData{
index_type N_X = 512;
index_type N_Y = 16;
index_type N_Z = 16;
index_type ARRAY_SIZE = N_X * N_Y * N_Z;
std::vector<complex<float_type>> data = get_data(ARRAY_SIZE);
TestData() {
// print(data,N_X,N_Y,N_Z);
}
};
TestData testData;
struct SanityTest{
SanityTest() {
xz_fft_many fft_many(testData.data, testData.N_X, testData.N_Y, testData.N_Z);
xz_fft_composition fft_composition(testData.data, testData.N_X, testData.N_Y, testData.N_Z);
std::vector<complex<float_type>> fft_many_result = fft_many.run();
std::vector<complex<float_type>> fft_composition_result = fft_composition.run();
bool equal = std::equal(fft_composition_result.begin(), fft_composition_result.end(), fft_many_result.begin());
assert(equal);
if(equal){
cout << "ok" << endl;
}
}
};
SanityTest sanityTest;
static void XZ_test_many(benchmark::State& state) {
xz_fft_many fft(testData.data, testData.N_X, testData.N_Y, testData.N_Z);
for (auto _ : state) {
auto result = fft.run();
}
}
static void XZ_test_composition(benchmark::State& state) {
xz_fft_composition fft(testData.data, testData.N_X, testData.N_Y, testData.N_Z);
for (auto _ : state) {
auto result = fft.run();
}
}
BENCHMARK(XZ_test_many)->Iterations(1000);
BENCHMARK(XZ_test_composition)->Iterations(1000);
BENCHMARK_MAIN();
If I done benchmarks correctly there are some significant differences beetwen fftw_plan_many_dft and composition approaches for different N_X, N_Y, N_Z combinations. For example using
index_type N_X = 512;
index_type N_Y = 16;
index_type N_Z = 16;
I've got almost two times difference in favour of fftw_plan_many_dft but for other sets of input parameters I've often found composition aproach to be faster but not that much.
------------------------------------------------------------------------------
Benchmark Time CPU Iterations
------------------------------------------------------------------------------
XZ_test_many/iterations:1000 1412647 ns 1364813 ns 1000
XZ_test_composition/iterations:1000 2619807 ns 2542472 ns 1000

Related

prefix scan for large arrays

I want to write a prefix scan for large arrays using the instruction in GPUgem, It's a homework for my parallel class. I did follow all the steps in the book but still my code's not working. I got it to work for array size 4096 but it's not working for larger arrays. Here is my code :
#include <stdio.h>
#include <sys/time.h>
#define THREADS 1024
typedef int mytype;
__global__ void phaseI(mytype *g_odata, mytype *g_idata, int n, mytype *aux)
{
__shared__ mytype temp[THREADS];
const int tid1 = threadIdx.x;
int offset = 1;
temp[2*tid1] = g_idata[2*tid1]; // load input into shared memory
temp[2*tid1+1] = g_idata[2*tid1+1];
for (int d = THREADS>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
__syncthreads();
if (tid1 == 0) {
aux[blockIdx.x] = temp[THREADS - 1];
temp[THREADS - 1] = 0;
}
for (int d = 1; d < THREADS; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
mytype t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
__global__ void phaseII(mytype *g_odata, mytype *aux, int n)
{
const int tid1 = threadIdx.x;
const int B = (n / THREADS);
int offset = 1;
for (int d = B>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
__syncthreads();
if (tid1 == 0 && blockIdx.x == 0) {
aux[B - 1] = 0;
}
for (int d = 1; d < B; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
mytype t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] += aux[blockIdx.x];
g_odata[2*thid+1] += aux[blockIdx.x];
}
int main(int argc, char *argv[])
{
if (argc != 2) {
printf("usage: %s n\n", argv[0]);
return -1;
}
const int n = atoi(argv[1]);
mytype *h_i, *d_i, *h_o, *d_o, *d_temp;
const int size = n * sizeof(mytype);
h_i = (mytype *)malloc(size);
h_o = (mytype *)malloc(size);
if ((h_i == NULL) || (h_o == NULL)) {
printf("malloc failed\n");
return -1;
}
for (int i = 0; i < n; i++) {
h_i[i] = i;
h_o[i] = 0;
}
cudaMalloc(&d_i, size);
cudaMalloc(&d_temp, (n / THREADS) );
cudaMalloc(&d_o, size);
cudaMemset(d_o, 0, size);
cudaMemset(d_temp, 0, (n / THREADS));
cudaMemcpy(d_i, h_i, size, cudaMemcpyHostToDevice);
int blocks = n / THREADS;
phaseI<<<blocks, THREADS / 2 >>>(d_o, d_i, n, d_temp);
phaseII<<<blocks, THREADS / 2>>>(d_o, d_temp, n);
cudaThreadSynchronize();
cudaMemcpy(h_o, d_o, size, cudaMemcpyDeviceToHost);
printf("\n");
for (int i = 0; i < n ; i++) {
printf(" %d", h_o[i]);
}
printf("\n\n");
return 0;
}
Does anyone have any idea what I'm doing wrong?

One possible error I see in your code is here:
aux[thid] = temp[THREADS];
If your temp array is temp[1024], as you say, and each block has 1024 threads, as you say, then if THREADS is 1024, temp[THREADS] will access your shared memory array out-of-bounds (one past the end.) An array of 1024 elements only has valid indices from 0 to 1023.
Beyond that, it seems like you're asking how to take the last element out of a shared memory array (temp) and place it in a position in a (presumably global) aux array, which has one element for each block.
Here's a fully worked example:
$ cat t831.cu
#include <stdio.h>
#define THREADS 1024
#define BLOCKS 20
__global__ void kernel(int *aux){
__shared__ int temp[THREADS];
temp[threadIdx.x] = threadIdx.x + blockIdx.x;
__syncthreads();
if (threadIdx.x == 0)
aux[blockIdx.x] = temp[THREADS-1];
}
int main(){
int *h_data, *d_data;
const int dsize = BLOCKS*sizeof(int);
h_data=(int *)malloc(dsize);
cudaMalloc(&d_data, dsize);
memset(h_data, 0, dsize);
cudaMemset(d_data, 0, dsize);
kernel<<<BLOCKS, THREADS>>>(d_data);
cudaMemcpy(h_data, d_data, dsize, cudaMemcpyDeviceToHost);
for (int i = 0; i < BLOCKS; i++) printf("%d, ", h_data[i]);
printf("\n");
return 0;
}
$ nvcc -o t831 t831.cu
$ cuda-memcheck ./t831
========= CUDA-MEMCHECK
1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042,
========= ERROR SUMMARY: 0 errors
$

cuda calc distance of two points

Here I want to calculate the distance of each two points, and decide if they are neighbours. here is my simple code in cuda.
__global__ void calcNeighbors(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dis = 0.0f;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=0; i<N; i++) {
DataPoint p2 = points[i];
dis = 0;
dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
(p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
(p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[tid*N+i] = true;
} else {
neighbors[tid*N+i] = false;
}
}
tid += blockDim.x * gridDim.x;
}
}
The DataPoint is a struct is
typedef struct DataPoint {
float pfDimens[3];
} DataPoint;
so here i want to reduce the time, How can i do? I have tried to use memory coalesing and share memory, but i didn't get a good speed up?
===============use share memory==============
__global__ void calcNeighbors2(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
__shared__ DataPoint sharedpoints[threadsPerBlock];
int start = blockIdx.x * blockDim.x;
int len = start+threadIdx.x;
if (len < N) {
sharedpoints[threadIdx.x] = points[len];
}
len = imin(N, blockDim.x + start);
__syncthreads();
int tid = threadIdx.x;
float dis;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=start; i<len; i++) {
dis = 0;
dis += (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) * (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) +
(p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) * (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) +
(p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]) * (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[i*N+tid] = true;
} else {
neighbors[i*N+tid] = false;
}
}
tid += blockDim.x;
}
}
Here i changed the neighbors[tid*N+i] to neighbors[i*N+tid], it give me amlost 8x speed up on Tesla K10.G2.8GB. But when i use share memory to store some points, it is no use?

There are at least 4 ideas, some of which have already been stated in the comments:
Transform your point distance storage from AoS format:
struct DataPoint {
float pfDimens[3];
};
to SoA format:
struct DataPoint {
float pfDimens_x[NPTS];
float pfDimens_y[NPTS];
float pfDimens_z[NPTS];
};
this will enable full coalescing on loading of the data. In fact, to help with point 4 below, I would just switch to using 3 bare arrays, rather than a structure.
reduce the computation to (slightly less than) half:
for (int i=N-1; i>tid; i--) {
then, either in the thread code itself, or in the host, you can populate the other "half" of the output matrix by copying data.
Transpose the storage in your output matrix, so that you can write a storage operation like this:
neighbors[i*N+tid] = true;
which will nicely coalesce, as opposed to this:
neighbors[tid*N+i] = true;
which will not.
Since your input point data is read only, mark the kernel parameter appropriately:
const float * __restrict__ points_x, const float * __restrict__ points_y, const float * __restrict__ points_z
in some cases, and on some GPUs, this will often lead to a speed-up due to use of the read-only cache. If you really want to get aggressive with caching, and your data array is small enough (4K or less float points), you could put a copy of the point data in global memory as well as a copy in __constant__ memory, and load the "uniform" load you are doing here through constant memory:
DataPoint p2 = c_points[i];
thus you could perform the coalesced load through the read-only cache, the uniform load through the constant cache, and the coalesced store going to ordinary global memory.
On a K40c, on linux/CUDA 7, for N = 4096, the net effect of these changes appears to be about a 3.5x speedup, at the kernel level:
$ cat t749.cu
#include <stdio.h>
#define N 4096
// if N is 16K/3 or less, we can use constant
#define USE_CONSTANT
#define THRESH 0.2f
#define nTPB 256
#define nBLK (N/nTPB+1)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
struct DataPoint {
float pfDimens[3];
};
__global__ void calcNeighbors(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dis = 0.0f;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=0; i<N; i++) {
DataPoint p2 = points[i];
dis = 0;
dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
(p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
(p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[tid*N+i] = true;
} else {
neighbors[tid*N+i] = false;
}
}
tid += blockDim.x * gridDim.x;
}
}
#ifdef USE_CONSTANT
__constant__ float cpx[N];
__constant__ float cpy[N];
__constant__ float cpz[N];
#endif
__global__ void calcNeighbors2(const float * __restrict__ pts_x, const float * __restrict__ pts_y, const float * __restrict__ pts_z, const float doubleRadius, bool * __restrict__ neighbors) {
int tid = threadIdx.x+blockDim.x*blockIdx.x;
while (tid < N) {
float p1x = pts_x[tid];
float p1y = pts_y[tid];
float p1z = pts_z[tid];
for (int i = N-1; i > tid; i--){
float p2x, p2y, p2z;
#ifdef USE_CONSTANT
p2x = cpx[i];
p2y = cpy[i];
p2z = cpz[i];
#else
p2x = pts_x[i];
p2y = pts_y[i];
p2z = pts_z[i];
#endif
float dis = ((p1x-p2x)*(p1x-p2x)) + ((p1y-p2y)*(p1y-p2y)) + ((p1z-p2z)*(p1z-p2z));
neighbors[i*N+tid] = (dis <= doubleRadius);
}
tid += blockDim.x * gridDim.x;
}
}
int main(){
float *dx, *dy, *dz, *hx, *hy, *hz;
DataPoint *dp, *hp;
bool *dn, *hn1, *hn2;
hx =(float *)malloc(N*sizeof(float));
hy =(float *)malloc(N*sizeof(float));
hz =(float *)malloc(N*sizeof(float));
hp =(DataPoint *)malloc(N*sizeof(DataPoint));
hn1=(bool *)malloc(N*N*sizeof(bool));
hn2=(bool *)malloc(N*N*sizeof(bool));
cudaMalloc(&dx, N*sizeof(float));
cudaMalloc(&dy, N*sizeof(float));
cudaMalloc(&dz, N*sizeof(float));
cudaMalloc(&dp, N*sizeof(DataPoint));
cudaMalloc(&dn, N*N*sizeof(bool));
for (int i =0; i < N; i++){
hx[i] = rand()/(float)RAND_MAX;
hy[i] = rand()/(float)RAND_MAX;
hz[i] = rand()/(float)RAND_MAX;
hp[i].pfDimens[0] = hx[i];
hp[i].pfDimens[1] = hy[i];
hp[i].pfDimens[2] = hz[i];}
cudaMemcpy(dx, hx, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dy, hy, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dz, hz, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dp, hp, N*sizeof(DataPoint), cudaMemcpyHostToDevice);
// warm-up
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t1 = dtime_usec(0);
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 1 error");
t1 = dtime_usec(t1);
cudaMemcpy(hn1, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
// warm-up
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t2 = dtime_usec(0);
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 2 error");
t2 = dtime_usec(t2);
cudaMemcpy(hn2, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
cudaCheckErrors("some error");
printf("t1: %fs, t2: %fs\n", t1/(float)USECPSEC, t2/(float)USECPSEC);
// results validation
for (int i = 0; i < N; i++)
for (int j = i+1; j < N; j++)
if (hn1[i*N+j] != hn2[j*N+i]) {printf("mismatch at %d, %d, was: %d, should be: %d\n", i, j, hn2[j*N+i], hn1[i*N+j]); return 1;}
return 0;
}
$ nvcc -arch=sm_35 -o t749 t749.cu
$ ./t749
t1: 0.004903s, t2: 0.001395s
$
In the case of K40c, the limited number of blocks being launched above (16) is a significant impediment to performance, due to latency. If we comment out the USE_CONSTANT define, and change N to 16384, we observe an even higher speedup with the improved kernel:
$ ./t749
t1: 0.267107s, t2: 0.008209s
$
the resultant ~48 blocks being enough to approximately "fill" the K40c which has 15 SMs.
EDIT: now that you've posted a shared memory kernel, I added it to my test case as calcNeighbors3 and compared it's timing performance (as t3). It is almost as fast as my kernel, and it seems to provide the correct result (matches your original kernel) so I'm not sure what your concerns are.
Here's the updated code and test case:
$ cat t749.cu
#include <stdio.h>
#include <math.h>
#define imin(X,Y) ((X)<(Y))?(X):(Y)
#define N 32768
// if N is 16K/3 or less, we can use constant
// #define USE_CONSTANT
#define THRESH 0.2f
#define nTPB 256
#define nBLK (N/nTPB+1)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
struct DataPoint {
float pfDimens[3];
};
__global__ void calcNeighbors(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dis = 0.0f;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=0; i<N; i++) {
DataPoint p2 = points[i];
dis = 0;
dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
(p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
(p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[tid*N+i] = true;
} else {
neighbors[tid*N+i] = false;
}
}
tid += blockDim.x * gridDim.x;
}
}
#ifdef USE_CONSTANT
__constant__ float cpx[N];
__constant__ float cpy[N];
__constant__ float cpz[N];
#endif
__global__ void calcNeighbors2(const float * __restrict__ pts_x, const float * __restrict__ pts_y, const float * __restrict__ pts_z, const float doubleRadius, bool * __restrict__ neighbors) {
int tid = threadIdx.x+blockDim.x*blockIdx.x;
while (tid < N) {
float p1x = pts_x[tid];
float p1y = pts_y[tid];
float p1z = pts_z[tid];
for (int i = N-1; i > tid; i--){
float p2x, p2y, p2z;
#ifdef USE_CONSTANT
p2x = cpx[i];
p2y = cpy[i];
p2z = cpz[i];
#else
p2x = pts_x[i];
p2y = pts_y[i];
p2z = pts_z[i];
#endif
float dis = ((p1x-p2x)*(p1x-p2x)) + ((p1y-p2y)*(p1y-p2y)) + ((p1z-p2z)*(p1z-p2z));
neighbors[i*N+tid] = (dis <= doubleRadius);
}
tid += blockDim.x * gridDim.x;
}
}
__global__ void calcNeighbors3(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
__shared__ DataPoint sharedpoints[nTPB];
int start = blockIdx.x * blockDim.x;
int len = start+threadIdx.x;
if (len < N) {
sharedpoints[threadIdx.x] = points[len];
}
len = imin(N, blockDim.x + start);
__syncthreads();
int tid = threadIdx.x;
float dis;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=start; i<len; i++) {
dis = 0;
dis += (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) * (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) +
(p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) * (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) +
(p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]) * (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[i*N+tid] = true;
} else {
neighbors[i*N+tid] = false;
}
}
tid += blockDim.x;
}
}
int main(){
float *dx, *dy, *dz, *hx, *hy, *hz;
DataPoint *dp, *hp;
bool *dn, *hn1, *hn2, *hn3;
hx =(float *)malloc(N*sizeof(float));
hy =(float *)malloc(N*sizeof(float));
hz =(float *)malloc(N*sizeof(float));
hp =(DataPoint *)malloc(N*sizeof(DataPoint));
hn1=(bool *)malloc(N*N*sizeof(bool));
hn2=(bool *)malloc(N*N*sizeof(bool));
hn3=(bool *)malloc(N*N*sizeof(bool));
cudaMalloc(&dx, N*sizeof(float));
cudaMalloc(&dy, N*sizeof(float));
cudaMalloc(&dz, N*sizeof(float));
cudaMalloc(&dp, N*sizeof(DataPoint));
cudaMalloc(&dn, N*N*sizeof(bool));
for (int i =0; i < N; i++){
hx[i] = rand()/(float)RAND_MAX;
hy[i] = rand()/(float)RAND_MAX;
hz[i] = rand()/(float)RAND_MAX;
hp[i].pfDimens[0] = hx[i];
hp[i].pfDimens[1] = hy[i];
hp[i].pfDimens[2] = hz[i];}
cudaMemcpy(dx, hx, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dy, hy, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dz, hz, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dp, hp, N*sizeof(DataPoint), cudaMemcpyHostToDevice);
#ifdef USE_CONSTANT
cudaMemcpyToSymbol(cpx, hx, N*sizeof(float));
cudaMemcpyToSymbol(cpy, hy, N*sizeof(float));
cudaMemcpyToSymbol(cpz, hz, N*sizeof(float));
#endif
// warm-up
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t1 = dtime_usec(0);
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 1 error");
t1 = dtime_usec(t1);
cudaMemcpy(hn1, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
// warm-up
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t2 = dtime_usec(0);
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 2 error");
t2 = dtime_usec(t2);
cudaMemcpy(hn2, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
// warm-up
calcNeighbors3<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t3 = dtime_usec(0);
calcNeighbors3<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 3 error");
t3 = dtime_usec(t3);
cudaMemcpy(hn3, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
cudaCheckErrors("some error");
printf("t1: %fs, t2: %fs, t3: %fs\n", t1/(float)USECPSEC, t2/(float)USECPSEC, t3/(float)USECPSEC);
// results validation
for (int i = 0; i < N; i++)
for (int j = i+1; j < N; j++)
if (hn1[i*N+j] != hn2[j*N+i]) {printf("1:2 mismatch at %d, %d, was: %d, should be: %d\n", i, j, hn2[j*N+i], hn1[i*N+j]); return 1;}
for (int i = 0; i < N*N; i++)
if (hn1[i] != hn3[i]) {printf("1:3 mismatch at %d, was: %d, should be: %d\n", i, hn1[i], hn3[i]); return 1;}
return 0;
}
$ nvcc -arch=sm_35 -o t749 t749.cu
$ ./t749
t1: 1.260010s, t2: 0.022661s, t3: 0.029632s
$
For this test, I have changed the data set size to 32768 since that is closer to the range you care about. Your shared memory kernel shows about a 42x speedup over your original kernel, and my kernel shows about a 55x speedup, on my K40c.

From given vector find max value and its index by reduction method in CUDA

I am new to CUDA, for the vector to find max value and its index I use CUDA
here its my code:
#include < cuda.h >
#include < stdio.h >
#include < time.h >
#include <iostream>
using namespace std;
#define tbp 256
#define nblocks 1
__global__ void kernel_max(int *a, int *d, int *index,int *idx)
{
__shared__ int sdata[tbp]; //"static" shared memory
int tid = threadIdx.x;
int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = a[i];
index[tid] = i;
__syncthreads();
for(int s=tbp/2 ; s >= 1 ; s=s/2)
{
if(tid < s)
{
if(sdata[tid] < sdata[tid + s])
{
sdata[tid] = sdata[tid + s];
index[tid] = index[tid+s];
__syncthreads();
}
__syncthreads();
}
__syncthreads();
}
__syncthreads();
if(tid == 0 )
{
d[blockIdx.x] = sdata[0];
idx[blockIdx.x] = index[0];
}
__syncthreads();
}
int main()
{
int i;
const int N=tbp*nblocks;
srand(time(NULL));
int *a;
a = (int*)malloc(N * sizeof(int));
int *d;
d = (int*)malloc(nblocks * sizeof(int));
int *index;
index = (int*)malloc(N * sizeof(int));
int *idx;
idx = (int*)malloc(nblocks * sizeof(int));
int *dev_a, *dev_d, *dev_index,*dev_idx;
cudaMalloc((void **) &dev_a, N*sizeof(int));
cudaMalloc((void **) &dev_d, nblocks*sizeof(int));
cudaMalloc((void **) &dev_index, N*sizeof(int));
cudaMalloc((void **) &dev_idx, nblocks*sizeof(int));
int mmm=0;
int ddd=0;
for( i = 0 ; i < N ; i++)
{
a[i] = rand()% 100 + 5;
index[i]=i;
//printf("%d\n",a[i]);
if(mmm<a[i])
{
mmm=a[i];
ddd=i;
}
}
printf("");
printf("");
printf("");
printf("");
cudaMemcpy(dev_a , a, N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(dev_index , index, N*sizeof(int),cudaMemcpyHostToDevice);
kernel_max <<< nblocks,tbp >>>(dev_a,dev_d,dev_index,dev_idx);
cudaMemcpy(d, dev_d, nblocks*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(index, dev_index, N*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(idx, dev_idx, nblocks*sizeof(int),cudaMemcpyDeviceToHost);
printf("cpu max= %d, gpu_max = %d ,cpu index: %d, gpu index: %d",mmm,d[0],ddd,idx[0]);
printf("\n");
if(ddd!=idx[0])
{
cout<<"index mismatch!damn!!"<<endl;
}
else
{
cout<<"congratulations!!"<<endl;
}
/*
for(i=0;i<N;i++)
cout<<*(index+i)<<endl;
*/
cudaFree(dev_a);
cudaFree(dev_d);
cudaFree(dev_index);
cudaFree(dev_idx);
free(a);
free(d);
free(index);
free(idx);
return 0;
}
The problem is that for the tbp < 128 it can get correct result both in value and index
when increase to 256,512,1024, the result will sometimes go wrong.
Can anyone given a explanation for this situation?Thanks.

Use another loop to deal with the index to avoid same max value with different index problem in this computation
int temp=0;
for(i=0;i<tbp;i++)
{
if(d[blockIdx.x]==a[i] && temp==0)
{temp = i;}
}
idx[0] = temp;

you need set int temp= -1 instead 0 to avoid the case of maximum value lcoated at 0.

Line detection on CUDA

I am trying to do real time line detection using CUDA. I have calculated the hough transform along with the min, max line coordinates of each bin. For getting the line segments I am tracing (using Bresenham's line algorithm) through the min to max point and get the line segments on each bin. When the hough threshold is low and when lot of lines are there in the image trace_lines takes lot of time to complete.
hough transform (hough_line_transform) computation takes around 5-10ms per frame(1280x720) on a GTX 660 (observed to be 10 times faster than CPU implementation). But tracing the line segments from the min, max points takes 1ms-15ms.
I have two questions on line detection
Does there exist a better algorithm to get the line segments from the min, max points of the hough bins?
Is it possible to optimize hought_line_transform (see the code below) further? I am using atomic operations. Is it possible to avoid atomics.
I am attaching the code below.
class Header
#ifndef _HOUGH_LINES_H_
#define _HOUGH_LINES_H_
#include <cuda_gl_interop.h>
#include <thrust/device_vector.h>
union Pos;
struct Line;
struct Hough_params
{
int w;
int h;
int r;
};
class Hough_lines
{
public:
enum Type {INT, SHORT_INT, FLOAT};
Hough_lines(int _w, int _h);
~Hough_lines();
public:
bool init();
bool detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Type type, int& count);
protected:
void get_edges(thrust::device_vector<Pos>& d_coords, int& size);
void get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size);
void get_lines(int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count);
void trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count);
static void compute_trig_funcs();
protected:
Hough_params params;
thrust::device_vector<Hough_params> d_param;
static bool trig_init;
};
#endif
Body
#include <hough_lines.h>
#include <math.h>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_gl_interop.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/scan.h>
#define ANGLE_SIZE 360
#define MAX_LINE_PER_THREAD 10
union Pos
{
struct
{
uint16_t x;
uint16_t y;
};
uint32_t value;
};
struct Hough_info
{
Pos end;
Pos start;
int count;
};
struct Line
{
Pos start;
Pos end;
};
struct Line_info
{
int line_count;
Line line[MAX_LINE_PER_THREAD];
};
__constant__ float dev_sint[ANGLE_SIZE];
__constant__ float dev_cost[ANGLE_SIZE];
texture<uint8_t, 2, cudaReadModeElementType> luma_tex;
bool Hough_lines::trig_init = false;
__global__ void mark_edges(const Hough_params* param, int* edge)
{
int x = (blockIdx.x*blockDim.x+threadIdx.x);
int y = (blockIdx.y*blockDim.y+threadIdx.y);
int pos = x+(param->w*y);
edge[pos] = (255 == tex2D(luma_tex, x, y))?1:0;
}
__global__ void get_coords(const Hough_params* param, int* edge, Pos* coord)
{
int index;
int x = (blockIdx.x*blockDim.x+threadIdx.x);
int y = (blockIdx.y*blockDim.y+threadIdx.y);
int pos = x+(param->w*y);
if (255 == tex2D(luma_tex, x, y))
{
index = edge[pos];
coord[index].y = y;
coord[index].x = x;
}
}
__global__ void hough_line_transform(const Hough_params* param, int size, const Pos* coord, int threshold, int *mark, Hough_info* out)
{
int i;
int angle;
int rdata;
__shared__ Hough_info sh_rho_data[1001];
i = threadIdx.x;
while (i < param->r)
{
sh_rho_data[i].end.value = 0x0;
sh_rho_data[i].start.value = 0xFFFFFFFF;
sh_rho_data[i].count = 0;
i += blockDim.x;
}
__syncthreads();
i = threadIdx.x;
angle = blockIdx.x;
const float cos_angle = dev_cost[angle];
const float sin_angle = dev_sint[angle];
while (i < size)
{
rdata = (int)ceil(((float)(coord[i].x-(param->w>>1))*cos_angle)+((float)((param->h>>1)-coord[i].y)*sin_angle));
if (rdata >= 0)
{
atomicMax(&sh_rho_data[rdata].end.value, coord[i].value);
atomicMin(&sh_rho_data[rdata].start.value, coord[i].value);
atomicAdd(&sh_rho_data[rdata].count, 1);
}
i += blockDim.x;
}
__syncthreads();
i = threadIdx.x;
rdata = (angle*param->r);
while (i < param->r)
{
memcpy(&out[rdata+i], &sh_rho_data[i], sizeof(Hough_info));
mark[rdata+i] = (sh_rho_data[i].count >= threshold)?1:0;
i += blockDim.x;
}
}
__global__ void get_lines(const Hough_params* param, int threshold, Hough_info* hdata, int* mark, Line* lines)
{
int pos;
int i = threadIdx.x;
int offset = (blockIdx.x*param->r);
while (i < param->r)
{
if (hdata[offset+i].count >= threshold)
{
pos = mark[offset+i];
lines[pos].start.value = hdata[offset+i].start.value;
lines[pos].end.value = hdata[offset+i].end.value;
}
i += blockDim.x;
}
}
__device__ void add_line(int xs, int ys, int xe, int ye, int min_len, Line_info* line)
{
int d = abs(xe-xs)+abs(ye-ys);
if ((d >= min_len) && (line->line_count < MAX_LINE_PER_THREAD))
{
line->line[line->line_count].start.x = xs;
line->line[line->line_count].start.y = ys;
line->line[line->line_count].end.x = xe;
line->line[line->line_count].end.y = ye;
++line->line_count;
//printf("\n(%d %d) (%d %d) %d", xs, ys, xe, ye, d);
}
}
__global__ void trace_lines(const Line* input, int inp_size, int min_len, int min_gap, Line_info* line_info, int* mark)
{
int d;
int dsub;
int dstep;
int xstep;
int ystep;
int xs, ys, xe, ye;
int i = (blockIdx.x*blockDim.x+threadIdx.x);
if (i >= inp_size)
{
return;
}
xs = input[i].start.x;
ys = input[i].start.y;
xe = input[i].end.x;
ye = input[i].end.y;
line_info[i].line_count = 0;
int dx = abs(xe-xs);
int dy = abs(ye-ys);
int xinc = (xe > xs)?1:-1;
int yinc = (ye > ys)?1:-1;
int gap = 0;
bool sflag;
int s_x, s_y, e_x, e_y;
if (dx > dy)
{
dsub = (dx<<1);
dstep = (dy<<1);
d = dstep-dx;
xstep = xinc;
ystep = 0;
xinc = 0;
}
else
{
dsub = (dy<<1);
dstep = (dx<<1);
d = dstep-dy;
xstep = 0;
ystep = yinc;
yinc = 0;
}
sflag = true;
s_x = xs;
s_y = ys;
e_x = xs;
e_y = ys;
int x = xs;
int y = ys;
while ((abs(x-xs) <= dx) && (abs(y-ys) <= dy))
{
x += xstep;
y += ystep;
if (d > 0)
{
x += xinc;
y += yinc;
d -= dsub;
}
d += dstep;
if (255 == tex2D(luma_tex, x, y))
{
e_x = x;
e_y = y;
gap = 0;
if (!sflag)
{
s_x = x;
s_y = y;
sflag = true;
}
}
else if (sflag)
{
++gap;
if (gap >= min_gap)
{
sflag = false;
add_line(s_x, s_y, e_x, e_y, min_len, &line_info[i]);
}
}
}
if (sflag)
{
add_line(s_x, s_y, xe, ye, min_len, &line_info[i]);
}
mark[i] = line_info[i].line_count;
}
__global__ void copy_line_coords(const Hough_params* param, Line_info* line, int size, int* mark, int* coords, int* count)
{
int index = (blockIdx.x*blockDim.x+threadIdx.x);
if (index >= size)
{
return;
}
int pos;
int start = 4*mark[index];
Line* line_data = &line[index].line[0];
for (int i = 0; i < line[index].line_count; i++)
{
pos = start+(4*i);
coords[pos] = line_data[i].start.x-(param->w>>1);
coords[pos+1] = (param->h>>1)-line_data[i].start.y;
coords[pos+2] = line_data[i].end.x-(param->w>>1);
coords[pos+3] = (param->h>>1)-line_data[i].end.y;
}
if ((index+1) == size)
{
*count = mark[index];
}
}
Hough_lines::Hough_lines(int _w, int _h)
:d_param(1)
{
params.w = _w;
params.h = _h;
params.r = (int)ceil(0.5*sqrt((_w*_w)+(_h*_h)));
thrust::copy_n(&params, 1, d_param.begin());
}
Hough_lines::~Hough_lines()
{
}
bool Hough_lines::init()
{
if (false == trig_init)
{
trig_init = true;
compute_trig_funcs();
}
return true;
}
void Hough_lines::compute_trig_funcs()
{
float theta;
cudaError_t err = cudaSuccess;
static float sint[ANGLE_SIZE];
static float cost[ANGLE_SIZE];
for (int i = 0; i < ANGLE_SIZE; i++)
{
theta = (M_PI*(float)i)/180.0;
sint[i] = sin(theta);
cost[i] = cos(theta);
}
err = cudaMemcpyToSymbol(dev_sint, sint, ANGLE_SIZE*sizeof(float));
err = (cudaSuccess == err) ? cudaMemcpyToSymbol(dev_cost, cost, ANGLE_SIZE*sizeof(float)):err;
if (cudaSuccess != err)
{
printf("\n%s", cudaGetErrorString(cudaGetLastError()));
}
}
void Hough_lines::get_edges(thrust::device_vector<Pos>& d_coords, int& size)
{
dim3 bsize(16, 16);
dim3 gsize(params.w/bsize.x, params.h/bsize.y);
thrust::device_vector<int> d_mark(params.w*params.h);
size = 0;
mark_edges<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
thrust::raw_pointer_cast(d_mark.data()));
thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
get_coords<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
thrust::raw_pointer_cast(d_mark.data()),
thrust::raw_pointer_cast(d_coords.data()));
thrust::copy_n(d_mark.begin()+d_mark.size()-1, 1, &size);
}
void Hough_lines::get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size)
{
int edge_count = 0;
thrust::device_vector<Pos> d_coords(params.w*params.h);
get_edges(d_coords, edge_count);
thrust::device_vector<int> d_mark(params.r*360);
thrust::device_vector<Hough_info> d_hough_data(params.r*360);
hough_line_transform<<<360, 256>>>(thrust::raw_pointer_cast(d_param.data()),
edge_count,
thrust::raw_pointer_cast(d_coords.data()), threshold,
thrust::raw_pointer_cast(d_mark.data()),
thrust::raw_pointer_cast(d_hough_data.data()));
thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
::get_lines<<<360, 256>>>(thrust::raw_pointer_cast(d_param.data()),
threshold,
thrust::raw_pointer_cast(d_hough_data.data()),
thrust::raw_pointer_cast(d_mark.data()),
thrust::raw_pointer_cast(d_lines.data()));
thrust::copy_n(d_mark.begin()+d_mark.size()-1, 1, &size);
}
void Hough_lines::trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count)
{
thrust::device_vector<int> d_mark_line(size);
thrust::device_vector<Line_info> d_nlines(size);
trace_lines<<<(1+(size/512)), 512>>>(thrust::raw_pointer_cast(d_lines.data()),
size, min_len, min_gap, thrust::raw_pointer_cast(d_nlines.data()),
thrust::raw_pointer_cast(d_mark_line.data()));
thrust::exclusive_scan(d_mark_line.begin(), d_mark_line.end(), d_mark_line.begin());
thrust::device_vector<int> d_count(1);
copy_line_coords<<<(1+(size/512)), 512>>>(thrust::raw_pointer_cast(d_param.data()),
thrust::raw_pointer_cast(d_nlines.data()), size,
thrust::raw_pointer_cast(d_mark_line.data()), d_line_coord,
thrust::raw_pointer_cast(d_count.data()));
thrust::copy(d_count.begin(), d_count.end(), &count);
//printf("\nLine count: %d", count);
}
void Hough_lines::get_lines(int threshold, int min_len, int min_gap, GLuint line, Hough_lines::Type type, int& count)
{
int* d_line_coord = 0;
cudaGLRegisterBufferObject(line);
cudaGLMapBufferObject((void **)&d_line_coord, line);
int size = 0;
thrust::device_vector<Line> d_lines(params.r*360);
get_hough_lines(threshold, d_lines, size);
//printf("\nget_hough_lines: %d", size);
trace_all_lines(min_len, min_gap, d_lines, size, d_line_coord, count);
cudaGLUnmapBufferObject(line);
cudaGLUnregisterBufferObject(line);
}
bool Hough_lines::detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count)
{
cudaError_t err;
cudaArray* array_edge;
cudaGraphicsResource* res_edge;
err = cudaGraphicsGLRegisterImage(&res_edge, tex_edge, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly);
if (err != cudaSuccess)
{
printf("cudaGraphicsGLRegisterImage Failed: %s", cudaGetErrorString(cudaGetLastError()));
exit(0);
}
cudaGraphicsMapResources(1, &res_edge);
cudaChannelFormatDesc chan_desc = cudaCreateChannelDesc<uint8_t>();
err = cudaGraphicsSubResourceGetMappedArray(&array_edge, res_edge, 0, 0);
if (err != cudaSuccess)
{
printf("cudaGraphicsSubResourceGetMappedArray Failed: %s", cudaGetErrorString(cudaGetLastError()));
exit(0);
}
if (cudaBindTextureToArray(&luma_tex, array_edge, &chan_desc) != cudaSuccess)
{
printf("Failed to bind texture - %s\n", cudaGetErrorString(cudaGetLastError()));
exit(0);
}
float time = 0.0;
//static float max = 0.0;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
count = 0;
get_lines(threshold, min_length, min_gap, line, type, count);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
//static int frame = 0;
//frame++;
//if (time > max)
{
//max = time;
printf("\nElpased time: %f ms", time);
}
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaUnbindTexture(luma_tex);
cudaGraphicsUnmapResources(1, &res_edge);
cudaGraphicsUnregisterResource(res_edge);
return true;
}

In "Prefix sums and their applications", Guy Blelloch described a parallel line-drawing algorithm using parallel prefix sum. See page 55 of that paper, it might give you ideas.
Regarding how to optimize hough_line_transfer, I think the key is to eliminate shared memory atomics in your loop. Where you use them you are effectively doing keyed reductions. Thrust provides a reduce_by_key function, but that is only callable from the host. The device-library counterpart to Thrust is CUB, but it does not have reduce_by_key. I've asked the CUB authors for ideas here and if we come up with anything I'll update this answer.
You could write your own keyed reduction but it would be more productive and robust to rely on a library if possible.

CUDA DE kernel not launching

I'm trying to do differential evolution on CUDA, but the problem is that kernel which is responsible for "Mutation, Crossover, Evaluation, Selection" never gets launched.
Any help?
Here's the entire code:
#include <iostream>
#include <curand_kernel.h>
using namespace std;
/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
system("pause");
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;
/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {
float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
+ (a[1] - b[1]) * (a[1] - b[1])
+ (a[2] - b[2]) * (a[2] - b[2]));
float distance6 = distance * distance * distance
* distance * distance * distance;
float distance12 = distance6 * distance6;
return 1/distance12 - 2/distance6;
}
/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
curand_init(seed, id, 0,&state[id]);
}
/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
{
//init, just populating array with some specific numbers
population[D*id]=0;
population[D*id+N]=0;
population[D*id +2*N]=0;
population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
population[D*id+N+1]=0;
population[D*id +2*N+1]=0;
for(int i=2; i<N; i++){
float min= -4 - 1/4*abs((int)((i-4)/3));
float max= 4 + 1/4*abs((int)((i-4)/3));
if(i==2)
{
population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+2]=0;
}
else
{
population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
}
}
//eval
float e=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
e += lennardJones(a,b);
}
}
population[D*id + D-1]=e;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
while(a == id){a = rndInt(globalState, id, NP);}
while(b == a && b==id){b=rndInt(globalState, id, NP);}
while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
mutation[D*id+0]=a;
mutation[D*id+1]=b;
mutation[D*id+2]=c;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a=mutation[D*id+0], b=mutation[D*id+1], c=mutation[D*id+2];
//DE mutation and crossover
int j=rndInt(globalState, id, NP);
for(int i=0; i<D-1; i++)
{
//DE mutation
pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
//DE crossover
if(Cr > rndFloat(globalState, id) && i!= j)
pop[D*id+i]=population[D*id +i];
}
// Eval
pop[D*id+D-1]=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
pop[D*id+D-1] += lennardJones(a,b);
}
}
__syncthreads();
//DE selection
if(pop[D*id+D-1] < population[D*id +D-1])
{
for(int i=0; i<D; i++)
population[D*id +i]=pop[D*id+i];
}
}
}
void getBestScore(float *hpopulation)
{
int max=0;
for(int i=1; i<hNP; i++)
{
if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
max=i;
}
for(int j=0; j<hN; j++)
cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}
int main()
{
cudaEvent_t start,stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start,0));
int device, st=100;
float hCr=0.6f, hF=0.8f;
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDevice(&device));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
// int SN = prop.maxThreadsPerBlock; //512 threads per block
//int SB = (hNP+(SN-1))/SN;
//constants NP, D, N, Cr, F
HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));
//seeds
curandState* devStates;
HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
setup_kernel <<< 1, hNP>>> (devStates, 50);
//population
float *population, *pop;
float hpopulation[hNP*hD];
HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));
//mutation
int *mutation, *mutation1;
int *hmutation;
HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));
//stream
cudaStream_t stream_i, stream_j;
HANDLE_ERROR(cudaStreamCreate(&stream_i));
HANDLE_ERROR(cudaStreamCreate(&stream_j));
kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);
while(st != 0)
{
/*** COPYING MUTATION INDICES***/
HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));
/**** CALLING KERNELS****/
kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);
st--;
//HANDLE_ERROR(cudaStreamSynchronize(stream_i));
//HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
//getBestScore(hpopulation);
//cin.get();
}
HANDLE_ERROR(cudaStreamSynchronize(stream_i));
HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
getBestScore(hpopulation);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float time;
HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaStreamDestroy(stream_i));
HANDLE_ERROR(cudaStreamDestroy(stream_j));
HANDLE_ERROR(cudaFree(population));
HANDLE_ERROR(cudaFree(pop));
HANDLE_ERROR(cudaFreeHost(hmutation));
HANDLE_ERROR(cudaFree(mutation1));
HANDLE_ERROR(cudaFree(devStates));
system("pause");
return 0;
}
UPDATE - Solution:
#include <iostream>
#include <curand_kernel.h>
using namespace std;
/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
system("pause");
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;
/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {
float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
+ (a[1] - b[1]) * (a[1] - b[1])
+ (a[2] - b[2]) * (a[2] - b[2]));
float distance6 = distance * distance * distance
* distance * distance * distance;
float distance12 = distance6 * distance6;
return 1/distance12 - 2/distance6;
}
/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
curand_init(seed, id, 0,&state[id]);
}
/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
{
//init, just populating array with some specific numbers
population[D*id]=0;
population[D*id+N]=0;
population[D*id +2*N]=0;
population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
population[D*id+N+1]=0;
population[D*id +2*N+1]=0;
for(int i=2; i<N; i++){
float min= -4 - 1/4*abs((int)((i-4)/3));
float max= 4 + 1/4*abs((int)((i-4)/3));
if(i==2)
{
population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+2]=0;
}
else
{
population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
}
}
//eval
float e=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
e += lennardJones(a,b);
}
}
population[D*id + D-1]=e;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
while(a == id){a = rndInt(globalState, id, NP);}
while(b == a && b==id){b=rndInt(globalState, id, NP);}
while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
mutation[3*id+0]=a;
mutation[3*id+1]=b;
mutation[3*id+2]=c;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];
//DE mutation and crossover
int j=rndInt(globalState, id, NP);
for(int i=0; i<D-1; i++)
{
//DE mutation
pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
//DE crossover
if(Cr > rndFloat(globalState, id) && i!= j)
pop[D*id+i]=population[D*id +i];
}
// Eval
pop[D*id+D-1]=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
pop[D*id+D-1] += lennardJones(a,b);
}
}
__syncthreads();
//DE selection
if(pop[D*id+D-1] < population[D*id +D-1])
{
for(int i=0; i<D; i++)
population[D*id +i]=pop[D*id+i];
}
}
}
void getBestScore(float *hpopulation)
{
int max=0;
for(int i=1; i<hNP; i++)
{
if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
max=i;
}
for(int j=0; j<hN; j++)
cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}
int main()
{
cudaEvent_t start,stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start,0));
int device, st=100;
float hCr=0.6f, hF=0.8f;
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDevice(&device));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
// int SN = prop.maxThreadsPerBlock; //512 threads per block
//int SB = (hNP+(SN-1))/SN;
//constants NP, D, N, Cr, F
HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));
//seeds
curandState* devStates;
HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
setup_kernel <<< 1, hNP>>> (devStates, 50);
//population
float *population, *pop;
float hpopulation[hNP*hD];
HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));
//mutation
int *mutation, *mutation1;
int *hmutation;
HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));
//stream
cudaStream_t stream_i, stream_j;
HANDLE_ERROR(cudaStreamCreate(&stream_i));
HANDLE_ERROR(cudaStreamCreate(&stream_j));
kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);
while(st != 0)
{
/*** COPYING MUTATION INDICES***/
HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));
/**** CALLING KERNELS****/
kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);
st--;
//HANDLE_ERROR(cudaStreamSynchronize(stream_i));
//HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
//getBestScore(hpopulation);
//cin.get();
}
HANDLE_ERROR(cudaStreamSynchronize(stream_i));
HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
getBestScore(hpopulation);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float time;
HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaStreamDestroy(stream_i));
HANDLE_ERROR(cudaStreamDestroy(stream_j));
HANDLE_ERROR(cudaFree(population));
HANDLE_ERROR(cudaFree(pop));
HANDLE_ERROR(cudaFreeHost(hmutation));
HANDLE_ERROR(cudaFree(mutation1));
HANDLE_ERROR(cudaFree(devStates));
system("pause");
return 0;
}

Solution:
#include <iostream>
#include <curand_kernel.h>
using namespace std;
/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
system("pause");
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;
/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {
float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
+ (a[1] - b[1]) * (a[1] - b[1])
+ (a[2] - b[2]) * (a[2] - b[2]));
float distance6 = distance * distance * distance
* distance * distance * distance;
float distance12 = distance6 * distance6;
return 1/distance12 - 2/distance6;
}
/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
curand_init(seed, id, 0,&state[id]);
}
/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
{
//init, just populating array with some specific numbers
population[D*id]=0;
population[D*id+N]=0;
population[D*id +2*N]=0;
population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
population[D*id+N+1]=0;
population[D*id +2*N+1]=0;
for(int i=2; i<N; i++){
float min= -4 - 1/4*abs((int)((i-4)/3));
float max= 4 + 1/4*abs((int)((i-4)/3));
if(i==2)
{
population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+2]=0;
}
else
{
population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
}
}
//eval
float e=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
e += lennardJones(a,b);
}
}
population[D*id + D-1]=e;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
while(a == id){a = rndInt(globalState, id, NP);}
while(b == a && b==id){b=rndInt(globalState, id, NP);}
while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
mutation[3*id+0]=a;
mutation[3*id+1]=b;
mutation[3*id+2]=c;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];
//DE mutation and crossover
int j=rndInt(globalState, id, NP);
for(int i=0; i<D-1; i++)
{
//DE mutation
pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
//DE crossover
if(Cr > rndFloat(globalState, id) && i!= j)
pop[D*id+i]=population[D*id +i];
}
// Eval
pop[D*id+D-1]=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
pop[D*id+D-1] += lennardJones(a,b);
}
}
__syncthreads();
//DE selection
if(pop[D*id+D-1] < population[D*id +D-1])
{
for(int i=0; i<D; i++)
population[D*id +i]=pop[D*id+i];
}
}
}
void getBestScore(float *hpopulation)
{
int max=0;
for(int i=1; i<hNP; i++)
{
if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
max=i;
}
for(int j=0; j<hN; j++)
cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}
int main()
{
cudaEvent_t start,stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start,0));
int device, st=100;
float hCr=0.6f, hF=0.8f;
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDevice(&device));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
// int SN = prop.maxThreadsPerBlock; //512 threads per block
//int SB = (hNP+(SN-1))/SN;
//constants NP, D, N, Cr, F
HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));
//seeds
curandState* devStates;
HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
setup_kernel <<< 1, hNP>>> (devStates, 50);
//population
float *population, *pop;
float hpopulation[hNP*hD];
HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));
//mutation
int *mutation, *mutation1;
int *hmutation;
HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));
//stream
cudaStream_t stream_i, stream_j;
HANDLE_ERROR(cudaStreamCreate(&stream_i));
HANDLE_ERROR(cudaStreamCreate(&stream_j));
kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);
while(st != 0)
{
/*** COPYING MUTATION INDICES***/
HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));
/**** CALLING KERNELS****/
kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);
st--;
//HANDLE_ERROR(cudaStreamSynchronize(stream_i));
//HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
//getBestScore(hpopulation);
//cin.get();
}
HANDLE_ERROR(cudaStreamSynchronize(stream_i));
HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
getBestScore(hpopulation);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float time;
HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaStreamDestroy(stream_i));
HANDLE_ERROR(cudaStreamDestroy(stream_j));
HANDLE_ERROR(cudaFree(population));
HANDLE_ERROR(cudaFree(pop));
HANDLE_ERROR(cudaFreeHost(hmutation));
HANDLE_ERROR(cudaFree(mutation1));
HANDLE_ERROR(cudaFree(devStates));
system("pause");
return 0;
}

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008

Can I use FFTW to make transform along first and last axis of 3D array? - fft

Related

prefix scan for large arrays

cuda calc distance of two points

From given vector find max value and its index by reduction method in CUDA

Line detection on CUDA

CUDA DE kernel not launching

Categories

Resources