Finding the local minima of a sampled function by CUDA Thrust - cuda

I want to write program using Thrust which is supposed to calculate local minima
of a given functions, f.i. sin(x). I have done this by approximating the function derivative by finite differences and then searching for those abscissas where the derivative changes sign. I now want to collect the local minima. I have marked local minima with "1"
and the other points with "0". I have done an inclusive_scan (for calculating places in new tab).
My problem is now gathering the local minima with gather_if (condition stencil, map minima),
but the code does not compile and I do not know why.
Could someone explain why?
/**
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*/
#include <stdio.h>
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/remove.h>
#include <thrust/functional.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/scan.h>
#include <sys/time.h>
__host__ __device__ unsigned int bitreverse(unsigned int number) {
number = ((0xf0f0f0f0 & number) >> 4) | ((0x0f0f0f0f & number) << 4);
number = ((0xcccccccc & number) >> 2) | ((0x33333333 & number) << 2);
number = ((0xaaaaaaaa & number) >> 1) | ((0x55555555 & number) << 1);
return number;
}
struct is_even
{
__host__ __device__
bool operator()(const int x) {
return (x % 2) == 0;
}
};
struct select_mine
{
__host__ __device__
float operator()(const float x) {
return (x < 0) ? 1.0f : 0.0f;
}
};
struct bitreverse_functor
{
__host__ __device__ unsigned int operator()(const unsigned int &x) {
return bitreverse(x);
}
};
struct sign
{
__host__ __device__ float operator()(const float x) {
if (x > 0.0f)
return 1.0f;
if (x < 0.0f)
return -1.0f;
return 0.0f;
}
};
struct sine: public thrust::unary_function<float, float>
{
__host__ __device__
float operator()(float x) {
return sinf(x);
}
};
struct absolute: public thrust::unary_function<float, float>
{
__host__ __device__
float operator()(float x) {
if (x < 0.0f)
x = -x;
return x;
}
};
struct lokalne_minimum : public thrust::binary_function<float,float,float>
{
__host__ __device__
float operator()(float x, float y)
{
if (x > 0 && y < 0)
return 1.0f;
return 0.0f;
}
};
struct conv : public thrust::unary_function<float,int>
{
__host__ __device__
int operator()(float x)
{
return (int)(x);
}
};
using namespace thrust;
void help(char *arg) {
fprintf(stderr,
"Nieprawidlowe uzycie: %s [x1] [x2] [n]\nx1 - zakres od\nx2 - zakres do\nn - liczba podzialow zakresu\n",
arg);
}
int main(int argc, char **argv) {
if (argc != 4) {
help(argv[0]);
return 1;
}
int n = atoi(argv[3]);
float x1 = (float) atof(argv[1]);
float x2 = (float) atof(argv[2]);
if (n < 0 || x2 < x1) {
help(argv[0]);
return 1;
}
float step = (x2 - x1) / n;
fprintf(stderr, "Step: %f\n", step);
thrust::device_vector<float> oxdata(n);
thrust::device_vector<float> oydata(n);
thrust::device_vector<float> diff(n);
thrust::host_vector<float> ixdata(n);
// FIXME change it
for (int i = 0; i < n; i++)
ixdata[i] = x1 + i * step;
thrust::copy(ixdata.begin(), ixdata.end(), oxdata.begin());
thrust::transform(oxdata.begin(),oxdata.end(),oydata.begin(),sine());
thrust::transform(oydata.begin() + 1, oydata.end(), oydata.begin(),
diff.begin()+1, thrust::minus<float>());
thrust::copy(diff.begin(), diff.end(), ixdata.begin());
for (int i = 0; i < n; i++)
printf ("%f, ", ixdata[i]);
printf ("\n");
thrust::transform(diff.begin()+1,diff.end(), diff.begin(),diff.begin(),lokalne_minimum());
for (int i = 0; i < n; i++)
printf ("%f, ", ixdata[i]);
printf ("\n");
thrust::copy(oydata.begin(), oydata.end(), ixdata.begin());
for (int i = 0; i < n; i++)
printf ("%f, ", ixdata[i]);
printf ("\n");
thrust::copy(diff.begin(), diff.end(), ixdata.begin());
for (int i = 0; i < n; i++)
printf ("%f, ", ixdata[i]);
printf ("\n");
//thrust::inclusive_scan(diff.begin(),diff.end(),diff.begin());
thrust::copy(diff.begin(), diff.end(), ixdata.begin());
for (int i = 0; i < n; i++)
printf ("%f, ", ixdata[i]);
printf ("\n");
thrust::device_vector<int> minima(n);
thrust::device_vector<int> stencil(n);
thrust::host_vector<int> hminima(n);
thrust::transform(diff.begin(),diff.end(),minima.begin(),conv());
thrust::copy(minima.begin(),minima.end(),hminima.begin());
thrust::copy(minima.begin(),minima.end(),stencil.begin());
for (int i = 0; i < n; i++)
printf ("%d, ", hminima[i]);
printf ("\n");
thrust::inclusive_scan(minima.begin(), minima.end(),minima.begin());
thrust::copy(minima.begin(),minima.end(),hminima.begin());
for (int i = 0; i < n; i++)
printf ("%d, ", hminima[i]);
printf ("\n");
//thrust::gather_if(minima.begin(),minima.end(),stencil.begin(),ixdata.begin(),ixdata.begin());
return 0;
}

This is a very late answer provided to remove this question from the unanswered list. I'm profiting of Robert Crovella's comment and showing below a full working code to find local minima of a sampled function with CUDA Thrust. The rationale of the code is as follows
The function derivative is approximated by central differences as an application of thrust::transform;
The function sampling points are marked by "1" as an application of thrust::transform by seeking the sign changes of the derivative via the predicate local_minima_check();
The number of local minima is counted as an application of thrust::count;
The local minima are isolated as an application of thrust::copy_if.
#include <stdio.h>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
/**************/
/* COS STRUCT */
/**************/
struct cosine: public thrust::unary_function<float, float>
{
__host__ __device__ float operator()(float h_x) { return cosf(h_x); }
};
/******************************************/
/* SECOND ORDER CENTRAL DIFFERENCE STRUCT */
/******************************************/
struct second_order_central_difference
{
__host__ __device__ float operator()(thrust::tuple<float,float,float> t)
{
float f_1, f0, f1;
thrust::tie(f_1,f0,f1) = t;
return f_1 - 2.0f * f0 + f1;
}
};
/******************************/
/* LOCAL MINIMUM CHECK STRUCT */
/******************************/
struct local_minimum_check:public thrust::binary_function<float,float,float>
{
__host__ __device__ float operator()(float x, float y)
{
if (x < 0 && y > 0) return 1.0f;
return 0.0f;
}
};
/****************************************/
/* LOCAL MINIMUM PREDICATE CHECK STRUCT */
/****************************************/
struct pred
{
__host__ __device__ bool operator()(const int d_x) { return (d_x == 1.f); }
};
void main() {
// --- Input parameters
int n = 100; // Number of sampling points
float x1 = 3.14f / 2.f; // (x1,x2) is the sampling interval
float x2 = 1.5f * 3.14f;
// --- Calculating the sampling points x
thrust::host_vector<float> h_x(n);
float step = (x2 - x1) / n;
for (int i = 0; i < n; i++) h_x[i] = x1 + (float)i * step;
thrust::device_vector<float> d_x = h_x;
// --- Evaluating the function values y = f(x)
thrust::device_vector<float> d_y(n);
thrust::transform(d_x.begin(),d_x.end(),d_y.begin(),cosine());
// --- Computing first order central finite differences
// In Matlab's notation, it calculates d_diff1(1:n-2) = d_y(3:n,:) - d_y(1:n-2,:);
thrust::device_vector<float> d_diff1(n-2);
thrust::transform(d_y.begin() + 2, d_y.end(), d_y.begin(), d_diff1.begin(), thrust::minus<float>());
// --- Computing second order central finite differences
// In Matlab's notation, it calculates d_diff2(1:n-2) = d_y(3:n) - 2. * d_y(2:n-1) + d_y(1:n-2);
thrust::device_vector<float> d_diff2(n-2);
thrust::transform(thrust::make_zip_iterator(
thrust::make_tuple(d_y.begin(), d_y.begin() + 1, d_y.begin() + 2)),
thrust::make_zip_iterator(
thrust::make_tuple(d_y.end() - 2, d_y.end() - 1, d_y.end())),
d_diff2.begin(),second_order_central_difference());
// --- Setting a flag for all those points for which the derivative changes sign from negative to positive
thrust::device_vector<float> d_fo_derivative(n-3);
thrust::transform(d_diff1.begin(), d_diff1.end() - 1, d_diff1.begin() + 1, d_fo_derivative.begin(), local_minimum_check());
// --- Counting the number of local minima and selecting the local minima coordinates
int min_number = thrust::count(d_fo_derivative.begin(), d_fo_derivative.end(), 1.f);
thrust::device_vector<float> d_x_minima(min_number);
thrust::copy_if(d_x.begin() + 1, d_x.end() - 1, d_fo_derivative.begin(), d_x_minima.begin(), pred());
for (int i = 0; i < d_x_minima.size(); i++) {
printf ("Local minimum # %i = %f\n ", i+1, (float)d_x_minima[i]);
}
getchar();
}

Related

deep copy of structs to and from device memory

I am having trouble with the deep copy of an array of structs with dynamically allocated member variables in this cuda code. I think it is occurring because &deviceHistogram points to an address on the host instead of an address on the device. I tried making an intermediate pointer variable as in here, but that did not work; how do I properly copy this entire array of structs so I can modify it from the makeHistogram function?
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
typedef struct histogramBin {
int* items;
int count;
} histogramBin;
__host__ __device__ void outputHistogram(histogramBin* histogram, int size) {
for (int i = 0; i < size; i++) {
printf("%d: ", i);
if (!histogram[i].count) {
printf("EMPTY");
} else {
for (int j = 0; j < histogram[i].count; j++) {
printf("%d ", histogram[i].items[j]);
}
}
printf("\n");
}
}
// This function embeds PTX code of CUDA to extract bit field from x.
__device__ uint bfe(uint x, uint start, uint nbits) {
uint bits;
asm("bfe.u32 %0, %1, %2, %3;"
: "=r"(bits)
: "r"(x), "r"(start), "r"(nbits));
return bits;
}
__global__ void makeHistogram(histogramBin** histogram, int* rH, int rSize, int bit) {
for (int r = 0; r < rSize; r++) {
int thisBin = bfe(rH[r], bit, 1);
int position = (*histogram)[thisBin].count; // **** out of memory access here****
(*histogram)[thisBin].items[position] = rH[r];
(*histogram)[thisBin].count++;
}
}
void histogramDriver(histogramBin* histogram, int* rH, int rSize, int bit) {
int n = 8;
int* deviceRH;
histogramBin* deviceHistogram;
cudaMalloc((void**)&deviceRH, rSize * sizeof(int));
cudaMemcpy(deviceRH, rH, rSize * sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void**)&deviceHistogram, n * sizeof(histogramBin));
cudaMemcpy(deviceHistogram, histogram, n * sizeof(histogramBin), cudaMemcpyHostToDevice);
int* tempData[n];
for (int i = 0; i < n; i++) {
cudaMalloc(&(tempData[i]), rSize * sizeof(int));
}
for (int i = 0; i < n; i++) {
cudaMemcpy(&(deviceHistogram[i].items), &(tempData[i]), sizeof(int*), cudaMemcpyHostToDevice);
}
for (int i = 0; i < n; i++) {
cudaMemcpy(tempData[i], histogram[i].items, rSize * sizeof(int), cudaMemcpyHostToDevice);
}
makeHistogram<<<1, 1>>>(&deviceHistogram, deviceRH, rSize, bit);
cudaDeviceSynchronize();
}
int main(){
int rSize = 5;
int rH[rSize] = {1, 2, 3, 4, 5};
histogramBin * histogram = (histogramBin*)malloc(sizeof(histogramBin) * 8);
for(int i = 0; i < 8; i++){
histogram[i].items = (int*)calloc(sizeof(int), rSize);
histogram[i].count = 0;
}
histogramDriver(histogram, rH, rSize, 0);
return 0;
}
Once it has been copied properly to the device, how do I get it back on the host? For example, if I call outputHistogram(histogram, 5); from inside makeHistogram, I see the following:
0: 2 4
1: 1 3 5
2: EMPTY
3: EMPTY
4: EMPTY
5: EMPTY
6: EMPTY
7: EMPTY
Which is the output I am expecting.
When I call outputHistogram(histogram, 8) from histogramDriver (after the cudaDeviceSynchronize()) I see the following:
0: EMPTY
1: EMPTY
2: EMPTY
3: EMPTY
4: EMPTY
5: EMPTY
6: EMPTY
7: EMPTY
Clearly I am not properly copying the values back from the device to the host.
I have tried copying by doing the reverse procedure from the one in histogramDriver:
for(int i = 0; i < n; i++){
cudaMemcpy(&(tempData[i]), &(deviceHistogram[i].items), sizeof(int*), cudaMemcpyDeviceToHost);
}
for (int i = 0; i < n; i++) {
cudaMemcpy(histogram[i].items, tempData[i], rSize * sizeof(int), cudaMemcpyDeviceToHost);
}
But the output from the outputHistogram call in histogramDriver remains unchanged.
As #talonmies indicated, the biggest problem here is the design of your kernel. There is no reason/need to use a double-pointer for histogram (and indeed, the first iteration of the code you posted did not have that in the kernel prototype, although it was incomplete).
By removing the double-pointer aspect, your code runs without any runtime errors.
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
typedef struct histogramBin {
int* items;
int count;
} histogramBin;
// This function embeds PTX code of CUDA to extract bit field from x.
__device__ uint bfe(uint x, uint start, uint nbits) {
uint bits;
asm("bfe.u32 %0, %1, %2, %3;"
: "=r"(bits)
: "r"(x), "r"(start), "r"(nbits));
return bits;
}
__global__ void makeHistogram(histogramBin* histogram, int* rH, int rSize, int bit) {
for (int r = 0; r < rSize; r++) {
int thisBin = bfe(rH[r], bit, 1);
int position = histogram[thisBin].count;
histogram[thisBin].items[position] = rH[r];
histogram[thisBin].count++;
}
}
void histogramDriver(histogramBin* histogram, int* rH, int rSize, int bit) {
int n = 8;
int* deviceRH;
histogramBin* deviceHistogram;
cudaMalloc((void**)&deviceRH, rSize * sizeof(int));
cudaMemcpy(deviceRH, rH, rSize * sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void**)&deviceHistogram, n * sizeof(histogramBin));
cudaMemcpy(deviceHistogram, histogram, n * sizeof(histogramBin), cudaMemcpyHostToDevice);
int* tempData[n];
for (int i = 0; i < n; i++) {
cudaMalloc(&(tempData[i]), rSize * sizeof(int));
}
for (int i = 0; i < n; i++) {
cudaMemcpy(&(deviceHistogram[i].items), &(tempData[i]), sizeof(int*), cudaMemcpyHostToDevice);
}
for (int i = 0; i < n; i++) {
cudaMemcpy(tempData[i], histogram[i].items, rSize * sizeof(int), cudaMemcpyHostToDevice);
}
makeHistogram<<<1, 1>>>(deviceHistogram, deviceRH, rSize, bit);
cudaDeviceSynchronize();
}
int main(){
const int rSize = 5;
int rH[rSize] = {1, 2, 3, 4, 5};
histogramBin * histogram = (histogramBin*)malloc(sizeof(histogramBin) * 8);
for(int i = 0; i < 8; i++){
histogram[i].items = (int*)calloc(sizeof(int), rSize);
histogram[i].count = 0;
}
histogramDriver(histogram, rH, rSize, 0);
return 0;
}
$ nvcc t1452.cu -o t1452
$ cuda-memcheck ./t1452
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
Note that the only changes here are to the kernel code itself, plus removal of the ampersand on kernel call, plus I added const to the definition of rSize to get things to compile.
I have no idea if it produces correct output, because you've included no way to inspect the output, nor indicated what you expect the output to be. If you are interested in that, those would be good things to include in your MVE.

Can't get matrix*vector multiplication to go faster in CUDA than in CPU

#include <iostream>
#include <assert.h>
#include <sys/time.h>
#define BLOCK_SIZE 32 // CUDA block size
__device__ inline int getValFromMatrix(int* matrix, int row, int col,int matSize) {
if (row<matSize && col<matSize) {return matrix[row*matSize + col];}
return 0;
}
__device__ inline int getValFromVector(int* vector, int row, int matSize) {
if (row<matSize) {return vector[row];}
return 0;
}
__global__ void matVecMultCUDAKernel(int* aOnGPU, int* bOnGPU, int* cOnGPU, int matSize) {
__shared__ int aRowShared[BLOCK_SIZE];
__shared__ int bShared[BLOCK_SIZE];
__shared__ int myRow;
__shared__ double rowSum;
int myIndexInBlock = threadIdx.x;
myRow = blockIdx.x;
rowSum = 0;
for (int m = 0; m < (matSize / BLOCK_SIZE + 1);m++) {
aRowShared[myIndexInBlock] = getValFromMatrix(aOnGPU,myRow,m*BLOCK_SIZE+myIndexInBlock,matSize);
bShared[myIndexInBlock] = getValFromVector(bOnGPU,m*BLOCK_SIZE+myIndexInBlock,matSize);
__syncthreads(); // Sync threads to make sure all fields have been written by all threads in the block to cShared and xShared
if (myIndexInBlock==0) {
for (int k=0;k<BLOCK_SIZE;k++) {
rowSum += aRowShared[k] * bShared[k];
}
}
}
if (myIndexInBlock==0) {cOnGPU[myRow] = rowSum;}
}
static inline void cudaCheckReturn(cudaError_t result) {
if (result != cudaSuccess) {
std::cerr <<"CUDA Runtime Error: " << cudaGetErrorString(result) << std::endl;
assert(result == cudaSuccess);
}
}
static void matVecMultCUDA(int* aOnGPU,int* bOnGPU, int* cOnGPU, int* c, int sizeOfc, int matSize) {
matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize); // Launch 1 block per row
cudaCheckReturn(cudaMemcpy(c,cOnGPU,sizeOfc,cudaMemcpyDeviceToHost));
}
static void matVecMult(int** A,int* b, int* c, int matSize) {
// Sequential implementation:
for (int i=0;i<matSize;i++) {
c[i]=0;
for (int j=0;j<matSize;j++) {
c[i]+=(A[i][j] * b[j]);
}
}
}
int main() {
int matSize = 1000;
int** A,* b,* c;
int* aOnGPU,* bOnGPU,* cOnGPU;
A = new int*[matSize];
for (int i = 0; i < matSize;i++) {A[i] = new int[matSize]();}
b = new int[matSize]();
c = new int[matSize]();
int aSizeOnGPU = matSize * matSize * sizeof(int), bcSizeOnGPU = matSize * sizeof(int);
cudaCheckReturn(cudaMalloc(&aOnGPU,aSizeOnGPU)); // cudaMallocPitch?
cudaCheckReturn(cudaMalloc(&bOnGPU,bcSizeOnGPU));
cudaCheckReturn(cudaMalloc(&cOnGPU,bcSizeOnGPU));
srand(time(NULL));
for (int i=0;i<matSize;i++) {
b[i] = rand()%100;
for (int j=0;j<matSize;j++) {
A[i][j] = rand()%100;
}
}
for (int i=0;i<matSize;i++) {cudaCheckReturn(cudaMemcpy((aOnGPU+i*matSize),A[i],bcSizeOnGPU,cudaMemcpyHostToDevice));}
cudaCheckReturn(cudaMemcpy(bOnGPU,b,bcSizeOnGPU,cudaMemcpyHostToDevice));
int iters=1;
timeval start,end;
// Sequential run:
gettimeofday(&start,NULL);
for (int i=0;i<iters;i++) {matVecMult(A,b,c,matSize);}
gettimeofday(&end,NULL);
std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;
// CUDA run:
gettimeofday(&start,NULL);
for (int i=0;i<iters;i++) {matVecMultCUDA(aOnGPU,bOnGPU,cOnGPU,c,bcSizeOnGPU,matSize);}
gettimeofday(&end,NULL);
std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;
cudaCheckReturn(cudaFree(aOnGPU));
cudaCheckReturn(cudaFree(bOnGPU));
cudaCheckReturn(cudaFree(cOnGPU));
for (int i = 0; i < matSize; ++i) {
delete[] A[i];
}
delete[] A;
delete[] b;
delete[] c;
}
Gives:
267171
580253
I've followed the guide on http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory, on how to do a matrix multiplication. I used shared memory for both the matrix (A) and the vector (B), but no matter what matrix size (100*100-20000*20000) or block size (32-1024) i choose, the sequential implementation always outperforms the CUDA implementation in terms of speed, it is about twice as fast.
Since I'm using matrix*vector multiplication, the shared arrays and blocks are handled a bit different; I'm using one block per row of the matrix instead of a 2D block over a part of the matrix.
Is my implementation wrong, or is simply CUDA not faster than the CPU?
First item: You perform checks on boundaries in the cuda implementation where you don't on CPU. Branching are really expensive on a GPU.
Second : You count the cudamemcpy in the cuda performance. It's very uncommon to perform only one multiplication before having to get the result back to cpu.
Usually (on CG for example), you perform several hundreds of multiplication on GPU before having to copy back.
Third: Dont try to implement that (except for educational purposes) and use vendor libraries (like CUBLAS, which ships with every CUDA release), which are extremely hard to outperform.

Replicate a vector multiple times using CUDA Thrust

I am trying to solve a problem using CUDA Thrust.
I have a host array with 3 elements. Is it possible, using Thrust, to create a device array of 384 elements in which the 3 elements in my host array is repeated 128 times (128 x 3 = 384)?
Generally speaking, starting from an array of 3 elements, how can I use Thrust to generate a device array of size X, where X = Y x 3, i.e. Y is the number of repetitions?
One possible approach:
create a device vector of appropriate size
create 3 strided ranges, one for each of the element positions {1, 2, 3} in the final output (device) vector
use thrust::fill to fill each of the 3 strided ranges with the appropriate (host vector) element {1, 2, 3}
This code is a trivial modification of the strided range example to demonstrate. You can change the REPS define to 128 to see the full expansion to 384 output elements:
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <thrust/copy.h>
#include <ostream>
#define STRIDE 3
#define REPS 15 // change to 128 if you like
#define DSIZE (STRIDE*REPS)
// this example illustrates how to make strided access to a range of values
// examples:
// strided_range([0, 1, 2, 3, 4, 5, 6], 1) -> [0, 1, 2, 3, 4, 5, 6]
// strided_range([0, 1, 2, 3, 4, 5, 6], 2) -> [0, 2, 4, 6]
// strided_range([0, 1, 2, 3, 4, 5, 6], 3) -> [0, 3, 6]
// ...
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
int main(void)
{
thrust::host_vector<int> h_data(STRIDE);
h_data[0] = 1;
h_data[1] = 2;
h_data[2] = 3;
thrust::device_vector<int> data(DSIZE);
typedef thrust::device_vector<int>::iterator Iterator;
strided_range<Iterator> pos1(data.begin(), data.end(), STRIDE);
strided_range<Iterator> pos2(data.begin()+1, data.end(), STRIDE);
strided_range<Iterator> pos3(data.begin()+2, data.end(), STRIDE);
thrust::fill(pos1.begin(), pos1.end(), h_data[0]);
thrust::fill(pos2.begin(), pos2.end(), h_data[1]);
thrust::fill(pos3.begin(), pos3.end(), h_data[2]);
// print the generated data
std::cout << "data: ";
thrust::copy(data.begin(), data.end(), std::ostream_iterator<int>(std::cout, " ")); std::cout << std::endl;
return 0;
}
Robert Crovella has already answered this question using strided ranges. He has also pointed out the possibility of using the expand operator.
Below, I'm providing a worked example using the expand operator. Opposite to the use of strided ranges, it avoids the need of for loops.
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/sequence.h>
#include <stdio.h>
using namespace thrust::placeholders;
/*************************************/
/* CONVERT LINEAR INDEX TO ROW INDEX */
/*************************************/
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T> {
T Ncols; // --- Number of columns
__host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}
__host__ __device__ T operator()(T i) { return i / Ncols; }
};
/*******************/
/* EXPAND OPERATOR */
/*******************/
template <typename InputIterator1, typename InputIterator2, typename OutputIterator>
OutputIterator expand(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
OutputIterator output)
{
typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;
difference_type input_size = thrust::distance(first1, last1);
difference_type output_size = thrust::reduce(first1, last1);
// scan the counts to obtain output offsets for each input element
thrust::device_vector<difference_type> output_offsets(input_size, 0);
thrust::exclusive_scan(first1, last1, output_offsets.begin());
// scatter the nonzero counts into their corresponding output positions
thrust::device_vector<difference_type> output_indices(output_size, 0);
thrust::scatter_if(thrust::counting_iterator<difference_type>(0), thrust::counting_iterator<difference_type>(input_size),
output_offsets.begin(), first1, output_indices.begin());
// compute max-scan over the output indices, filling in the holes
thrust::inclusive_scan(output_indices.begin(), output_indices.end(), output_indices.begin(), thrust::maximum<difference_type>());
// gather input values according to index array (output = first2[output_indices])
OutputIterator output_end = output; thrust::advance(output_end, output_size);
thrust::gather(output_indices.begin(), output_indices.end(), first2, output);
// return output + output_size
thrust::advance(output, output_size);
return output;
}
/**************************/
/* STRIDED RANGE OPERATOR */
/**************************/
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
/********/
/* MAIN */
/********/
int main(){
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
const int Nrows = 10; // --- Number of objects
const int Ncols = 3; // --- Number of centroids
thrust::device_vector<int> d_sequence(Nrows * Ncols);
thrust::device_vector<int> d_counts(Ncols, Nrows);
thrust::sequence(d_sequence.begin(), d_sequence.begin() + Ncols);
expand(d_counts.begin(), d_counts.end(), d_sequence.begin(),
thrust::make_permutation_iterator(
d_sequence.begin(),
thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)));
printf("\n\nCentroid indices\n");
for(int i = 0; i < Nrows; i++) {
std::cout << " [ ";
for(int j = 0; j < Ncols; j++)
std::cout << d_sequence[i * Ncols + j] << " ";
std::cout << "]\n";
}
return 0;
}
As an apparently simpler alternative to using CUDA Thrust, I'm posting below a worked example implementing in CUDA the classical Matlab's meshgrid function.
In Matlab
x = [1 2 3];
y = [4 5 6 7];
[X, Y] = meshgrid(x, y);
produces
X =
1 2 3
1 2 3
1 2 3
1 2 3
and
Y =
4 4 4
5 5 5
6 6 6
7 7 7
X is exactly the four-fold replication of the x array, which is the OP's question and first guess of Robert Crovella's answer, while Y is the three-fold consecutive replication of each element of the y array, which is the second guess of Robert Crovella's answer.
Here is the code:
#include <cstdio>
#include <thrust/pair.h>
#include "Utilities.cuh"
#define BLOCKSIZE_MESHGRID_X 16
#define BLOCKSIZE_MESHGRID_Y 16
#define DEBUG
/*******************/
/* MESHGRID KERNEL */
/*******************/
template <class T>
__global__ void meshgrid_kernel(const T * __restrict__ x, size_t Nx, const float * __restrict__ y, size_t Ny, T * __restrict__ X, T * __restrict__ Y)
{
unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int tidy = blockIdx.y * blockDim.y + threadIdx.y;
if ((tidx < Nx) && (tidy < Ny)) {
X[tidy * Nx + tidx] = x[tidx];
Y[tidy * Nx + tidx] = y[tidy];
}
}
/************/
/* MESHGRID */
/************/
template <class T>
thrust::pair<T *,T *> meshgrid(const T *x, const unsigned int Nx, const T *y, const unsigned int Ny) {
T *X; gpuErrchk(cudaMalloc((void**)&X, Nx * Ny * sizeof(T)));
T *Y; gpuErrchk(cudaMalloc((void**)&Y, Nx * Ny * sizeof(T)));
dim3 BlockSize(BLOCKSIZE_MESHGRID_X, BLOCKSIZE_MESHGRID_Y);
dim3 GridSize (iDivUp(Nx, BLOCKSIZE_MESHGRID_X), iDivUp(BLOCKSIZE_MESHGRID_Y, BLOCKSIZE_MESHGRID_Y));
meshgrid_kernel<<<GridSize, BlockSize>>>(x, Nx, y, Ny, X, Y);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
return thrust::make_pair(X, Y);
}
/********/
/* MAIN */
/********/
int main()
{
const int Nx = 3;
const int Ny = 4;
float *h_x = (float *)malloc(Nx * sizeof(float));
float *h_y = (float *)malloc(Ny * sizeof(float));
float *h_X = (float *)malloc(Nx * Ny * sizeof(float));
float *h_Y = (float *)malloc(Nx * Ny * sizeof(float));
for (int i = 0; i < Nx; i++) h_x[i] = i;
for (int i = 0; i < Ny; i++) h_y[i] = i + 4.f;
float *d_x; gpuErrchk(cudaMalloc(&d_x, Nx * sizeof(float)));
float *d_y; gpuErrchk(cudaMalloc(&d_y, Ny * sizeof(float)));
gpuErrchk(cudaMemcpy(d_x, h_x, Nx * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_y, h_y, Ny * sizeof(float), cudaMemcpyHostToDevice));
thrust::pair<float *, float *> meshgrid_pointers = meshgrid(d_x, Nx, d_y, Ny);
float *d_X = (float *)meshgrid_pointers.first;
float *d_Y = (float *)meshgrid_pointers.second;
gpuErrchk(cudaMemcpy(h_X, d_X, Nx * Ny * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_Y, d_Y, Nx * Ny * sizeof(float), cudaMemcpyDeviceToHost));
for (int j = 0; j < Ny; j++) {
for (int i = 0; i < Nx; i++) {
printf("i = %i; j = %i; x = %f; y = %f\n", i, j, h_X[j * Nx + i], h_Y[j * Nx + i]);
}
}
return 0;
}

Simpson's method to integrate real valued functions with CUDA

I'm trying to code integration by Simpson's method in CUDA.
This is the formula for Simpson's rule
where x_k = a + k*h.
Here's my code
__device__ void initThreadBounds(int *n_start, int *n_end, int n,
int totalBlocks, int blockWidth)
{
int threadId = blockWidth * blockIdx.x + threadIdx.x;
int nextThreadId = threadId + 1;
int threads = blockWidth * totalBlocks;
*n_start = (threadId * n)/ threads;
*n_end = (nextThreadId * n)/ threads;
}
__device__ float reg_func (float x)
{
return x;
}
typedef float (*p_func) (float);
__device__ p_func integrale_f = reg_func;
__device__ void integralSimpsonMethod(int totalBlocks, int totalThreads,
double a, double b, int n, float p_function(float), float* result)
{
*result = 0;
float h = (b - a)/n;
//*result = p_function(a)+p_function(a + h * n);
//parallel
int idx_start;
int idx_end;
initThreadBounds(&idx_start, &idx_end, n-1, totalBlocks, totalThreads);
//parallel_ends
for (int i = idx_start; i < idx_end; i+=2) {
*result += ( p_function(a + h*(i-1)) +
4 * p_function(a + h*(i)) +
p_function(a + h*(i+1)) ) * h/3;
}
}
__global__ void integralSimpson(int totalBlocks, int totalThreads, float* result)
{
float res = 0;
integralSimpsonMethod(totalBlocks, totalThreads, 0, 10, 1000, integrale_f, &res);
result[(blockIdx.x*totalThreads + threadIdx.x)] = res;
//printf ("Simpson method\n");
}
__host__ void inttest()
{
const int blocksNum = 32;
const int threadNum = 32;
float *device_resultf;
float host_resultf[threadNum*blocksNum]={0};
cudaMalloc((void**) &device_resultf, sizeof(float)*threadNum*blocksNum);
integralSimpson<<<blocksNum, threadNum>>>(blocksNum, threadNum, device_resultf);
cudaThreadSynchronize();
cudaMemcpy(host_resultf, device_resultf, sizeof(float) *threadNum*blocksNum,
cudaMemcpyDeviceToHost);
float sum = 0;
for (int i = 0; i != blocksNum*threadNum; ++i) {
sum += host_resultf[i];
// printf ("result in %i cell = %f \n", i, host_resultf[i]);
}
printf ("sum = %f \n", sum);
cudaFree(device_resultf);
}
int main(int argc, char* argv[])
{
inttest();
int i;
scanf ("%d",&i);
}
The problem is: it works wrong when n is lower than 100000. For an integral from 0 to 10, the result is ~99, but when n = 100000 or larger it works fine and the result is ~50.
What's wrong, guys?
The basic problem here is that you don't understand your own algorithm.
Your integralSimpsonMethod() function is designed such that each thread is sampling at least 3 quadrature points per sub-interval in the integral domain. Therefore, if you choose n so that it is less than four times the number of threads in the kernel call, it is inevitable that each sub interval will overlap and the resulting integral will be incorrect. You need to make sure that the code checks and scales the thread count or n so that they don't produce overlap when the integral is computed.
If you are doing this for anything other than self-edification, then I recommend you look up the composite version of Simpson's rule. This is much better suited to parallel implementation and will be considerably more performant if implemented correctly.
I would propose an approach to Simpson's integration by using CUDA Thrust. You basically need five steps:
Generate the Simpson's quadrature weights;
Generate the function sampling points;
Generate the function values;
Calculate the elementwise product between the quadrature weights and the function values;
Sum the above products.
Step #1 requires creating an array with elements repeated many times, namely, 1 4 2 4 2 4 ... 1 for the Simpson's case. This can be accomplished by borrowing Robert Crovella's approach in cuda thrust library repeat vector multiple times.
Step #2 can be accomplished by using couting_iterators and borrowing talonmies approach in Purpose and usage of counting_iterators in CUDA Thrust library.
Step #3 is an application of thrust::transform.
Steps #4 and #5 can be accomplished together by thrust::inner_product.
This approach can be exploited also for use when other quadrature integration rules are of interest.
Here is the code
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <thrust/copy.h>
#include <ostream>
#define STRIDE 2
#define N 100
#define pi_f 3.14159265358979f // Greek pi in single precision
struct sin_functor
{
__host__ __device__
float operator()(float x) const
{
return sin(2.f*pi_f*x);
}
};
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
int main(void)
{
// --- Generate the integration coefficients
thrust::host_vector<float> h_coefficients(STRIDE);
h_coefficients[0] = 4.f;
h_coefficients[1] = 2.f;
thrust::device_vector<float> d_coefficients(N);
typedef thrust::device_vector<float>::iterator Iterator;
strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-2, STRIDE);
strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);
thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);
d_coefficients[0] = 1.f;
d_coefficients[N-1] = 1.f;
// print the generated d_coefficients
std::cout << "d_coefficients: ";
thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " ")); std::cout << std::endl;
// --- Generate sampling points
float a = 0.f;
float b = .5f;
float Dx = (b-a)/(float)(N-1);
thrust::device_vector<float> d_x(N);
thrust::transform(thrust::make_counting_iterator(a/Dx),
thrust::make_counting_iterator((b+1.f)/Dx),
thrust::make_constant_iterator(Dx),
d_x.begin(),
thrust::multiplies<float>());
// --- Calculate function values
thrust::device_vector<float> d_y(N);
thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());
// --- Calculate integral
float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);
printf("The integral is = %f\n", integral);
getchar();
return 0;
}

CUDA Thrust reduction by key with a tuple key

I've two vectors, and after creating a tuple (with zip_iterator) I would order them with sort_by_key and then apply reduce_by_key.
But the reduction by key doesn't work well since it creates an incorrect vector counter. May someone help me? Here is my relevant code snippet.
...
typedef thrust::device_vector<int>::iterator IntIterator;
typedef thrust::tuple<IntIterator, IntIterator> IteratorTuple;
typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
typedef thrust::tuple<int, int> tupla;
...
thrust::device_vector <int> documenti(n);
thrust::device_vector <int> strip(n);
...
ZipIterator bufferBegin (thrust::make_tuple(documenti.begin(),strip.begin()));
ZipIterator bufferEnd (thrust::make_tuple(documenti.end(),strip.end()));
...
thrust::sort_by_key(bufferBegin,bufferEnd, counter.begin());
thrust::device_vector <tupla> example(n);
thrust::reduce_by_key(bufferBegin,bufferEnd, counter.begin(), example.begin(), counter.begin());
thrust::sort_by_key(counter.begin(), counter.begin()+n, example.begin(),thrust::greater <int>());
I'm providing an answer to this question just to remove it from the unanswered list.
Your question appears not much clear to me. From the code snippet you posted, my understanding is that you are interested into a reduction by key with tuple keys.
Below you can find a full worked example. I hope that it could be helpful to future users.
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
// --- Defining key tuple type
typedef thrust::tuple<int,int> Tuple;
typedef thrust::host_vector<Tuple>::iterator dIter1;
typedef thrust::host_vector<float>::iterator dIter2;
/************************************/
/* EQUALITY OPERATOR BETWEEN TUPLES */
/************************************/
struct BinaryPredicate
{
__host__ __device__ bool operator ()
(const Tuple& lhs, const Tuple& rhs)
{
return (thrust::get<0>(lhs) == thrust::get<0>(rhs)) && (thrust::get<1>(lhs) == thrust::get<1>(rhs));
}
};
/********/
/* MAIN */
/********/
int main()
{
const int N = 7;
thrust::host_vector<Tuple> keys_input(N);
thrust::host_vector<float> values_input(N);
int keys1_input[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys 1
int keys2_input[N] = {1, 5, 3, 8, 2, 2, 1}; // input keys 2
float input_values[N] = {9., 8., 7., 6., 5., 4., 3.}; // input values
for (int i=0; i<N; i++) {
keys_input[i] = thrust::make_tuple(keys1_input[i], keys2_input[i]);
values_input[i] = input_values[i];
}
for (int i=0; i<N; i++) printf("%i %i\n", thrust::get<0>(keys_input[i]), thrust::get<1>(keys_input[i]));
thrust::host_vector<Tuple> keys_output(N);
thrust::host_vector<float> values_output(N);
thrust::pair<dIter1, dIter2> new_end;
new_end = thrust::reduce_by_key(keys_input.begin(),
keys_input.end(),
values_input.begin(),
keys_output.begin(),
values_output.begin(),
BinaryPredicate(),
thrust::plus<float>());
int Nkeys = new_end.first - keys_output.begin();
printf("\n\n");
for (int i = 0; i < Nkeys; i++) printf("%i; %f\n", i, values_output[i]);
printf("\n\n");
for (int i = 0; i < Nkeys; i++) printf("%i %i\n", thrust::get<0>(keys_output[i]), thrust::get<1>(keys_output[i]));
return 0;
}
EDIT
The above worked example referred to host_vector's. Below, a fully worked example considering the case when key and value vectors are regular cudaMalloc'ed arrays.
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include "Utilities.cuh"
// --- Defining key tuple type
typedef thrust::tuple<int, int> Tuple;
typedef thrust::device_vector<Tuple>::iterator dIter1;
typedef thrust::device_vector<float>::iterator dIter2;
/************************************/
/* EQUALITY OPERATOR BETWEEN TUPLES */
/************************************/
struct BinaryPredicate
{
__host__ __device__ bool operator ()
(const Tuple& lhs, const Tuple& rhs)
{
return (thrust::get<0>(lhs) == thrust::get<0>(rhs)) && (thrust::get<1>(lhs) == thrust::get<1>(rhs));
}
};
/********/
/* MAIN */
/********/
int main()
{
const int N = 7;
// --- Keys and input values on the host: allocation and definition
int h_keys1_input[N] = { 1, 3, 3, 3, 2, 2, 1 }; // --- Input keys 1 - host side
int h_keys2_input[N] = { 1, 5, 3, 8, 2, 2, 1 }; // --- Input keys 2 - host side
float h_input_values[N] = { 9., 8., 7., 6., 5., 4., 3. }; // --- Input values - host side
// --- Keys and input values on the device: allocation
int *d_keys1_input; gpuErrchk(cudaMalloc(&d_keys1_input, N * sizeof(int))); // --- Input keys 1 - device side
int *d_keys2_input; gpuErrchk(cudaMalloc(&d_keys2_input, N * sizeof(int))); // --- Input keys 2 - device side
float *d_input_values; gpuErrchk(cudaMalloc(&d_input_values, N * sizeof(float))); // --- Input values - device side
// --- Keys and input values: host -> device
gpuErrchk(cudaMemcpy(d_keys1_input, h_keys1_input, N * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_keys2_input, h_keys2_input, N * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_input_values, h_input_values, N * sizeof(float), cudaMemcpyHostToDevice));
// --- From raw pointers to device_ptr
thrust::device_ptr<int> dev_ptr_keys1 = thrust::device_pointer_cast(d_keys1_input);
thrust::device_ptr<int> dev_ptr_keys2 = thrust::device_pointer_cast(d_keys2_input);
thrust::device_ptr<float> dev_ptr_values = thrust::device_pointer_cast(d_input_values);
// --- Declare outputs
thrust::device_vector<Tuple> d_keys_output(N);
thrust::device_vector<float> d_values_output(N);
thrust::pair<dIter1, dIter2> new_end;
auto begin = thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_keys1, dev_ptr_keys2));
auto end = thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_keys1 + N, dev_ptr_keys2 + N));
new_end = thrust::reduce_by_key(begin,
end,
dev_ptr_values,
d_keys_output.begin(),
d_values_output.begin(),
BinaryPredicate(),
thrust::plus<float>());
int Nkeys = new_end.first - d_keys_output.begin();
printf("\n\n");
for (int i = 0; i < Nkeys; i++) {
float output = d_values_output[i];
printf("%i; %f\n", i, output);
}
thrust::host_vector<Tuple> h_keys_output(d_keys_output);
printf("\n\n");
for (int i = 0; i < Nkeys; i++) {
int key1 = thrust::get<0>(h_keys_output[i]);
int key2 = thrust::get<1>(h_keys_output[i]);
printf("%i %i\n", key1, key2);
}
return 0;
}