How to use function pointers in polymorphism? - function

I want to sort an array using function pointers as a form of polymorphism. I am only doing this to learn how things work.

Here's a simple generic sorting interface, an insertion sort implemented through that interface, and some test code that demonstrates its use:
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
// Generic sorting interface: the sort routine never touches the elements
// directly; it only calls compare() and swap() with element indices.
struct sort_interface {
// number of elements in the sequence being sorted
size_t nmemb;
// opaque pointer passed through unchanged as 'arg' to compare() and swap()
void *arg;
// compares the elements at 'i' and 'j'; insertion_sort() treats a result
// <= 0 as "already in order" and a result > 0 as "must be swapped"
int (*compare)(void *arg, size_t i, size_t j);
// swaps the elements at 'i' and 'j'
void (*swap)(void *arg, size_t i, size_t j);
};
/*
 * Insertion sort driven entirely through the callbacks in 'iface'.
 * Each element is moved towards the front by swapping it with its left
 * neighbour for as long as compare(left, right) reports a value > 0.
 */
static void insertion_sort (struct sort_interface iface)
{
    size_t sorted_end;

    /* A prefix of length 1 is trivially sorted, so start at index 1. */
    for (sorted_end = 1; sorted_end < iface.nmemb; sorted_end++) {
        size_t pos;

        for (pos = sorted_end;
             pos > 0 && iface.compare(iface.arg, pos - 1, pos) > 0;
             pos--) {
            iface.swap(iface.arg, pos - 1, pos);
        }
    }
}
/*
 * compare() callback for an int array passed as 'arg'.
 * Returns -1, 0 or 1 when element i is less than, equal to or greater
 * than element j.
 */
static int func_comparator (void *arg, size_t i, size_t j)
{
    const int *values = arg;
    const int lhs = values[i];
    const int rhs = values[j];

    /* (lhs > rhs) and (lhs < rhs) are each 0 or 1, so this yields -1/0/1. */
    return (lhs > rhs) - (lhs < rhs);
}
/*
 * swap() callback for an int array passed as 'arg': exchanges the
 * elements at indices i and j.
 */
static void func_swap (void *arg, size_t i, size_t j)
{
    int *values = arg;
    const int first = values[i];
    const int second = values[j];

    values[i] = second;
    values[j] = first;
}
int main (int argc, char *argv[])
{
int arr[] = {7, 6, 8, 2, 9, 1, 156, 1, 62, 1671, 15};
size_t count = sizeof(arr) / sizeof(arr[0]);
struct sort_interface iface;
iface.nmemb = count;
iface.arg = arr;
iface.compare = func_comparator;
iface.swap = func_swap;
insertion_sort(iface);
for (size_t i = 0; i < count; i++) {
printf("%d ", arr[i]);
}
printf("\n");
return 0;
}
You might also want to take a look at the qsort() function of the C standard library, which also uses a function pointer comparator, but is somewhat limited compared to the above. In particular, it assumes you're sorting a contiguous array, and if you have pointers to elements or their members, those will be broken (but the above interface allows you to fix pointers in swap()).
Here's an example for how to use the qsort() interface, and also an insertion sort implementation that uses the same interface as qsort():
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Insertion sort with the same interface as qsort().
 *
 *   base   - first element of a contiguous array
 *   nmemb  - number of elements
 *   size   - size of one element in bytes
 *   compar - returns <0, 0 or >0 when its first argument sorts before,
 *            equal to or after its second argument
 *
 * Elements are exchanged byte by byte in place. This avoids a
 * variable-length scratch array, which is undefined for size == 0 and can
 * overflow the stack for large elements.
 */
static void insertion_sort (void *base, size_t nmemb, size_t size, int(*compar)(const void *, const void *))
{
    unsigned char *bytes = base;

    /* Zero-sized elements carry no data, so there is nothing to reorder. */
    if (size == 0) {
        return;
    }

    for (size_t i = 1; i < nmemb; i++) {
        for (size_t j = i; j > 0; j--) {
            unsigned char *x = bytes + (j - 1) * size;
            unsigned char *y = bytes + j * size;

            /* Stop as soon as the left neighbour is not greater. */
            if (compar(x, y) <= 0) {
                break;
            }

            for (size_t k = 0; k < size; k++) {
                unsigned char tmp = x[k];
                x[k] = y[k];
                y[k] = tmp;
            }
        }
    }
}
/*
 * qsort()-style comparator for int elements.
 * Returns -1, 0 or 1 when *ve1 is less than, equal to or greater than *ve2.
 */
static int int_comparator (const void *ve1, const void *ve2)
{
    const int lhs = *(const int *)ve1;
    const int rhs = *(const int *)ve2;

    return (lhs > rhs) - (lhs < rhs);
}
/* Demo: sorts a fixed int array with qsort() and prints the result. */
int main (int argc, char *argv[])
{
    int arr[] = {7, 6, 8, 2, 9, 1, 156, 1, 62, 1671, 15};
    const size_t count = sizeof(arr) / sizeof(arr[0]);
    size_t idx = 0;

    qsort(arr, count, sizeof(arr[0]), int_comparator); // or insertion_sort()

    while (idx < count) {
        printf("%d ", arr[idx]);
        idx++;
    }
    printf("\n");
    return 0;
}

Related

deep copy of structs to and from device memory

I am having trouble with the deep copy of an array of structs with dynamically allocated member variables in this cuda code. I think it is occurring because &deviceHistogram points to an address on the host instead of an address on the device. I tried making an intermediate pointer variable as in here, but that did not work; how do I properly copy this entire array of structs so I can modify it from the makeHistogram function?
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
// One histogram bin: a buffer of values plus the number of entries in use.
typedef struct histogramBin {
// values stored in this bin; the buffer's capacity is not recorded here
int* items;
// number of entries of 'items' currently in use
int count;
} histogramBin;
// Prints bins 0..size-1, one per line, as "<index>: <items...>".
// A bin whose count is zero is printed as "EMPTY".
__host__ __device__ void outputHistogram(histogramBin* histogram, int size) {
    for (int bin = 0; bin < size; ++bin) {
        const histogramBin* current = &histogram[bin];
        printf("%d: ", bin);
        if (current->count == 0) {
            printf("EMPTY");
        } else {
            int item = 0;
            while (item < current->count) {
                printf("%d ", current->items[item]);
                ++item;
            }
        }
        printf("\n");
    }
}
// Extracts a bit field from x using the PTX instruction bfe.u32, embedded as
// inline assembly.
//   x     - value to extract from
//   start - passed as the instruction's bit-position operand
//   nbits - passed as the instruction's field-length operand
// Returns the instruction's result. The callers in this file pass nbits == 1
// to read a single bit.
__device__ uint bfe(uint x, uint start, uint nbits) {
uint bits;
asm("bfe.u32 %0, %1, %2, %3;"
: "=r"(bits)
: "r"(x), "r"(start), "r"(nbits));
return bits;
}
// Kernel: walks rH[0..rSize) sequentially and appends each value to the bin
// selected by bfe(value, bit, 1), then increments that bin's count.
// 'histogram' is dereferenced on the device ((*histogram)[...]), so the
// pointer it holds must itself be readable from device code.
// NOTE(review): the loop does no per-thread indexing; the caller in this
// file launches it as <<<1, 1>>>.
__global__ void makeHistogram(histogramBin** histogram, int* rH, int rSize, int bit) {
for (int r = 0; r < rSize; r++) {
int thisBin = bfe(rH[r], bit, 1);
int position = (*histogram)[thisBin].count; // **** out of memory access here****
(*histogram)[thisBin].items[position] = rH[r];
(*histogram)[thisBin].count++;
}
}
// Host driver: copies the rSize input values and the 8 histogram bins to the
// device, then runs makeHistogram on a single thread.
//   histogram - host array of 8 bins; rSize ints are copied from each
//               bin's 'items' buffer
//   rH        - host input values (rSize ints)
//   bit       - bit index forwarded to the kernel
// NOTE(review): no CUDA return code is checked, nothing is copied back to the
// host, and no device allocation is freed.
void histogramDriver(histogramBin* histogram, int* rH, int rSize, int bit) {
int n = 8;
int* deviceRH;
histogramBin* deviceHistogram;
cudaMalloc((void**)&deviceRH, rSize * sizeof(int));
cudaMemcpy(deviceRH, rH, rSize * sizeof(int), cudaMemcpyHostToDevice);
// shallow copy of the bin array; the 'items' members still hold host addresses here
cudaMalloc((void**)&deviceHistogram, n * sizeof(histogramBin));
cudaMemcpy(deviceHistogram, histogram, n * sizeof(histogramBin), cudaMemcpyHostToDevice);
// one device buffer per bin for the item data
int* tempData[n];
for (int i = 0; i < n; i++) {
cudaMalloc(&(tempData[i]), rSize * sizeof(int));
}
// patch each device-side 'items' member with the address of its device buffer
for (int i = 0; i < n; i++) {
cudaMemcpy(&(deviceHistogram[i].items), &(tempData[i]), sizeof(int*), cudaMemcpyHostToDevice);
}
// copy the host item data into the device buffers
for (int i = 0; i < n; i++) {
cudaMemcpy(tempData[i], histogram[i].items, rSize * sizeof(int), cudaMemcpyHostToDevice);
}
// NOTE(review): deviceHistogram is a host local variable, so &deviceHistogram
// is a host address; the kernel dereferences this pointer on the device.
makeHistogram<<<1, 1>>>(&deviceHistogram, deviceRH, rSize, bit);
cudaDeviceSynchronize();
}
// Builds 8 empty host bins (rSize zeroed ints each) and runs the histogram
// driver on bit 0 of the five input values.
int main(){
// NOTE(review): rSize is not const here while rH below is an initialized
// array sized by it; the answer's version of this program declares it const.
int rSize = 5;
int rH[rSize] = {1, 2, 3, 4, 5};
histogramBin * histogram = (histogramBin*)malloc(sizeof(histogramBin) * 8);
for(int i = 0; i < 8; i++){
// calloc zero-fills; the total is sizeof(int) * rSize bytes per bin
histogram[i].items = (int*)calloc(sizeof(int), rSize);
histogram[i].count = 0;
}
histogramDriver(histogram, rH, rSize, 0);
// NOTE(review): the host allocations are not freed before returning.
return 0;
}
Once it has been copied properly to the device, how do I get it back on the host? For example, if I call outputHistogram(histogram, 5); from inside makeHistogram, I see the following:
0: 2 4
1: 1 3 5
2: EMPTY
3: EMPTY
4: EMPTY
5: EMPTY
6: EMPTY
7: EMPTY
Which is the output I am expecting.
When I call outputHistogram(histogram, 8) from histogramDriver (after the cudaDeviceSynchronize()) I see the following:
0: EMPTY
1: EMPTY
2: EMPTY
3: EMPTY
4: EMPTY
5: EMPTY
6: EMPTY
7: EMPTY
Clearly I am not properly copying the values back from the device to the host.
I have tried copying by doing the reverse procedure from the one in histogramDriver:
// Attempted copy-back (uses tempData and deviceHistogram from histogramDriver).
// Step 1: read each bin's device-side 'items' pointer back into tempData[i].
for(int i = 0; i < n; i++){
cudaMemcpy(&(tempData[i]), &(deviceHistogram[i].items), sizeof(int*), cudaMemcpyDeviceToHost);
}
// Step 2: copy each bin's item data from the device into the host buffers.
// NOTE(review): nothing here copies 'count' back. main() set the host counts
// to 0 and outputHistogram() prints EMPTY for a zero count, which would
// explain the unchanged output -- confirm.
for (int i = 0; i < n; i++) {
cudaMemcpy(histogram[i].items, tempData[i], rSize * sizeof(int), cudaMemcpyDeviceToHost);
}
But the output from the outputHistogram call in histogramDriver remains unchanged.
As @talonmies indicated, the biggest problem here is the design of your kernel. There is no reason or need to use a double pointer for histogram (and indeed, the first iteration of the code you posted did not have that in the kernel prototype, although it was incomplete).
By removing the double-pointer aspect, your code runs without any runtime errors.
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
// One histogram bin: a buffer of values plus the number of entries in use.
typedef struct histogramBin {
// values stored in this bin; the buffer's capacity is not recorded here
int* items;
// number of entries of 'items' currently in use
int count;
} histogramBin;
// Extracts a bit field from x using the PTX instruction bfe.u32, embedded as
// inline assembly.
//   x     - value to extract from
//   start - passed as the instruction's bit-position operand
//   nbits - passed as the instruction's field-length operand
// Returns the instruction's result. makeHistogram passes nbits == 1 to read
// a single bit.
__device__ uint bfe(uint x, uint start, uint nbits) {
uint bits;
asm("bfe.u32 %0, %1, %2, %3;"
: "=r"(bits)
: "r"(x), "r"(start), "r"(nbits));
return bits;
}
// Kernel: appends every value of rH[0..rSize) to the bin selected by
// bfe(value, bit, 1) and bumps that bin's count.
// The loop is sequential and does no per-thread indexing; the driver in this
// file launches it as <<<1, 1>>>.
__global__ void makeHistogram(histogramBin* histogram, int* rH, int rSize, int bit) {
    int idx = 0;
    while (idx < rSize) {
        const int value = rH[idx];
        const int binIndex = bfe(value, bit, 1);
        histogramBin* target = &histogram[binIndex];
        // the current count is the next free slot in this bin
        target->items[target->count] = value;
        target->count += 1;
        ++idx;
    }
}
// Aborts with a diagnostic when a CUDA runtime call did not succeed.
static void histogramCheck(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver: deep-copies the bins and input values to the device, runs
// makeHistogram on one thread, copies the filled bins back into 'histogram'
// and releases every device allocation.
//   histogram - host array of 8 bins; each 'items' buffer must hold at least
//               rSize ints (assumed from the caller in this file). On return
//               'count' and 'items' contain the device results.
//   rH        - host input values (rSize ints)
//   rSize     - number of input values; nothing is done when <= 0
//   bit       - index of the bit that selects the bin
void histogramDriver(histogramBin* histogram, int* rH, int rSize, int bit) {
    const int n = 8;  // number of bins processed
    if (rSize <= 0) {
        return;
    }
    const size_t itemBytes = (size_t)rSize * sizeof(int);
    const size_t binBytes = n * sizeof(histogramBin);

    int* deviceRH = NULL;
    histogramBin* deviceHistogram = NULL;
    int* deviceItems[n];        // device buffer backing each bin's items
    histogramBin stagedBins[n]; // host copy of the bins holding device pointers

    histogramCheck(cudaMalloc((void**)&deviceRH, itemBytes), "cudaMalloc(deviceRH)");
    histogramCheck(cudaMemcpy(deviceRH, rH, itemBytes, cudaMemcpyHostToDevice), "copy of rH to device");
    histogramCheck(cudaMalloc((void**)&deviceHistogram, binBytes), "cudaMalloc(deviceHistogram)");

    // Deep copy: give every bin its own device buffer and stage a struct that
    // points at it, so the bin array can be uploaded in a single transfer.
    for (int i = 0; i < n; i++) {
        histogramCheck(cudaMalloc((void**)&deviceItems[i], itemBytes), "cudaMalloc(bin items)");
        histogramCheck(cudaMemcpy(deviceItems[i], histogram[i].items, itemBytes, cudaMemcpyHostToDevice),
                       "copy of bin items to device");
        stagedBins[i].items = deviceItems[i];
        stagedBins[i].count = histogram[i].count;
    }
    histogramCheck(cudaMemcpy(deviceHistogram, stagedBins, binBytes, cudaMemcpyHostToDevice),
                   "copy of bins to device");

    makeHistogram<<<1, 1>>>(deviceHistogram, deviceRH, rSize, bit);
    histogramCheck(cudaGetLastError(), "makeHistogram launch");
    histogramCheck(cudaDeviceSynchronize(), "makeHistogram execution");

    // Copy back. The device structs carry device 'items' pointers, so only
    // 'count' is taken from them; the item data goes into the host buffers.
    histogramCheck(cudaMemcpy(stagedBins, deviceHistogram, binBytes, cudaMemcpyDeviceToHost),
                   "copy of bins to host");
    for (int i = 0; i < n; i++) {
        histogram[i].count = stagedBins[i].count;
        histogramCheck(cudaMemcpy(histogram[i].items, deviceItems[i], itemBytes, cudaMemcpyDeviceToHost),
                       "copy of bin items to host");
    }

    for (int i = 0; i < n; i++) {
        histogramCheck(cudaFree(deviceItems[i]), "cudaFree(bin items)");
    }
    histogramCheck(cudaFree(deviceHistogram), "cudaFree(deviceHistogram)");
    histogramCheck(cudaFree(deviceRH), "cudaFree(deviceRH)");
}
// Builds 8 empty host bins (rSize zeroed ints each), runs the histogram
// driver on bit 0 of the input values and releases the host memory.
// Returns 0 on success, 1 when a host allocation fails.
int main(){
    const int rSize = 5;
    const int numBins = 8;  // histogramDriver() processes exactly 8 bins
    int rH[rSize] = {1, 2, 3, 4, 5};
    int status = 0;

    histogramBin * histogram = (histogramBin*)malloc(sizeof(histogramBin) * numBins);
    if (histogram == NULL) {
        fprintf(stderr, "failed to allocate histogram bins\n");
        return 1;
    }

    // 'allocated' counts the bins whose items buffer exists, so cleanup
    // stays correct after a partial failure.
    int allocated = 0;
    for (; allocated < numBins; allocated++) {
        histogram[allocated].items = (int*)calloc(rSize, sizeof(int));
        histogram[allocated].count = 0;
        if (histogram[allocated].items == NULL) {
            fprintf(stderr, "failed to allocate items for bin %d\n", allocated);
            status = 1;
            break;
        }
    }

    if (status == 0) {
        histogramDriver(histogram, rH, rSize, 0);
    }

    for (int i = 0; i < allocated; i++) {
        free(histogram[i].items);
    }
    free(histogram);
    return status;
}
$ nvcc t1452.cu -o t1452
$ cuda-memcheck ./t1452
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
Note that the only changes here are to the kernel code itself, plus removal of the ampersand on kernel call, plus I added const to the definition of rSize to get things to compile.
I have no idea if it produces correct output, because you've included no way to inspect the output, nor indicated what you expect the output to be. If you are interested in that, those would be good things to include in your MVE.

Error memcpy in device code

I wrote code that takes the first _var entries of a vector of possibilities (i.e., a _size*_var matrix with _var=3 and _size=27), and I call this function in my kernel (32 threads, i.e., each thread has its own object), but the function never seems to return anything, not even the NULL pointer.
The program exits without error, but the printf lines in the kernel are not executed or displayed (even when compiled for sm_20 or higher), as if the program stopped earlier.
dataIntern.h:
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define _MIN -1
#define _MAX 1
#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif
/*
 * dataIntern<a_type>: table of every combination of the values _MIN.._MAX
 * (-1, 0, 1) over _var variables, stored row-major as _size rows of _var
 * entries. Rows can be filtered with retify() and popped with unstack().
 *
 * NOTE(review): the class owns 'possibilities' but defines no copy
 * constructor or copy assignment, so a copy shares the buffer with its
 * source and both destructors free it. Construct objects in place rather
 * than assigning from a temporary.
 */
template <class a_type>
class dataIntern{
private:
    a_type *possibilities;  // row-major table: _size rows of _var entries
    int _assign;            // 1 while the table is usable, 0 otherwise
    int _size;              // number of rows currently held
    int _var;               // entries per row
    int _maxsize;           // number of entries allocated by the constructor

    // Returns 3^exponent using integer arithmetic (0 for a negative exponent,
    // matching truncation of the fractional result). Avoids casting a
    // floating-point pow() result, which can truncate one too low.
    CUDA_CALLABLE_MEMBER static int pow3(int exponent){
        if(exponent < 0){
            return 0;
        }
        int result = 1;
        for(int e = 0; e < exponent; e++){
            result *= 3;
        }
        return result;
    }

public:
    // Deliberately leaves every member uninitialized.
    // NOTE(review): destroying an object built this way frees an
    // indeterminate pointer; the body is kept empty because a kernel in this
    // file declares a __shared__ array of this type -- confirm before changing.
    CUDA_CALLABLE_MEMBER dataIntern(){
    }

    // Builds the full table of 3^var rows for 'var' variables.
    // Calls exit(1) when the allocation fails.
    CUDA_CALLABLE_MEMBER dataIntern(int var){
        _var = var;
        _size = pow3(_var);
        _maxsize = _size * _var;
        _assign = 1;
        possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
        if(!possibilities){
            exit(1);
        }
        createTable();
    }

    // Fills the table column by column: column 'col' repeats each value
    // 3^(_var-col-1) times, cycling _MIN.._MAX.
    CUDA_CALLABLE_MEMBER void createTable(){
        if(_assign != 1){
            return;
        }
        for(int col = 0; col < _var; col++){
            const int limit = pow3(_var - col - 1);
            a_type value = (a_type)_MIN;
            int run = 0;  // entries written with the current value
            for(int row = 0; row < _size; row++){
                if(run >= limit){
                    value++;
                    if(value > _MAX){
                        value = (a_type)_MIN;
                    }
                    run = 0;
                }
                possibilities[_var*row + col] = value;
                run++;
            }
        }
    }

    // Prints the table, one row per line. Entries are printed with "%d".
    CUDA_CALLABLE_MEMBER void print(){
        printf("Printing.\n");
        if(_assign != 1){
            printf("Not assigned.\n");
            return;
        }
        for(int i = 0; i < _size*_var; i++){
            printf("%d ", possibilities[i]);
            if(i%_var == _var-1){
                printf("\n");
            }
        }
    }

    // Keeps only the rows whose entry at column 'posChanged' equals
    // 'valueRetified'. When no row matches, the table is left unchanged and
    // the object is marked unassigned.
    // Matching rows are compacted towards the front of the existing buffer,
    // so no temporary allocation is made and nothing can leak.
    CUDA_CALLABLE_MEMBER void retify(int posChanged, a_type valueRetified){
        if(_assign != 1){
            return;
        }
        int kept = 0;
        for(int row = 0; row < _size; row++){
            a_type *src = &possibilities[row*_var];
            if(src[posChanged] == valueRetified){
                // the destination never lies after the source, so rows that
                // have not been examined yet are never overwritten
                if(kept != row){
                    a_type *dst = &possibilities[kept*_var];
                    for(int c = 0; c < _var; c++){
                        dst[c] = src[c];
                    }
                }
                kept++;
            }
        }
        if(kept > 0){
            _size = kept;
        }
        else{
            _assign = 0;
        }
    }

    // Removes the first row and returns it in a newly malloc'ed array of _var
    // entries that the caller must free().
    // Returns NULL when the object is unassigned, the table is empty, or the
    // allocation fails (the table is left unchanged in that case).
    CUDA_CALLABLE_MEMBER a_type* unstack(){
        if(_assign != 1 || _size <= 0){
            return NULL;
        }
        a_type *solution = (a_type*)malloc(_var*sizeof(a_type));
        if(solution == NULL){
            printf("Erro to alloc solution pointer on unstack function in data intern\n");
            return NULL;
        }
        for(int c = 0; c < _var; c++){
            solution[c] = possibilities[c];
        }
        _size--;
        // shift the remaining rows forward by one row, in place
        const int remaining = _size*_var;
        for(int k = 0; k < remaining; k++){
            possibilities[k] = possibilities[k + _var];
        }
        return solution;
    }

    // Returns the number of rows currently held.
    CUDA_CALLABLE_MEMBER int get_size(){
        return _size;
    }

    // Marks the object unassigned and frees the table.
    CUDA_CALLABLE_MEMBER ~dataIntern(){
        _assign = 0;
        if(possibilities)
            free(possibilities);
    }
};
deviceCode.h:
#ifndef DEVICECODE_H
#define DEVICECODE_H
void CallingInMain();
__global__ void kernel();
#endif
deviceCode.cu:
#include "deviceCode.h"
#include "dataIntern.h"
#include <iostream>
#include <stdio.h>
//I declared like this to my kernel:
// Kernel: each thread builds a dataIntern<int> for 3 variables in its slot of
// a shared array, then pops rows with unstack() and prints them until
// unstack() returns NULL.
__global__ void kernel(){
__shared__ dataIntern<int> data[32];
int *vetor;
vetor = NULL;
// NOTE(review): this assigns from a temporary; dataIntern defines no copy
// assignment, and its destructor frees 'possibilities', so the slot keeps a
// pointer that the temporary's destructor has already freed.
data[threadIdx.x] = dataIntern<int>(3);
//_var == 3 in the class above
// NOTE(review): this allocation is overwritten by the next line without
// being freed.
vetor = (int*)malloc(sizeof(int)*3);
vetor = data[threadIdx.x].unstack();
while(vetor!=NULL){
//never past here
printf("%d %d %d %d\n", threadIdx.x, vetor[0], vetor[1], vetor[2]);
// NOTE(review): the previous row returned by unstack() is not freed before
// vetor is reassigned.
vetor = data[threadIdx.x].unstack();
}
//neither here in if or else
if(vetor)
printf("Not null\n");
else
printf("Null final\n");
// vetor is NULL whenever the loop above has exited
free(vetor);
}
// Launches kernel() with one block of 32 threads and waits for it to finish.
// Launch and execution failures are reported on stderr instead of being
// dropped, so a kernel that aborts no longer looks like a clean run.
void CallingInMain(){
    kernel<<<1, 32>>>();
    // launch-configuration errors surface here
    cudaError_t status = cudaGetLastError();
    if(status == cudaSuccess){
        // faults raised while the kernel runs surface at the synchronization
        status = cudaDeviceSynchronize();
    }
    if(status != cudaSuccess){
        fprintf(stderr, "kernel failed: %s\n", cudaGetErrorString(status));
    }
}
main.cu:
#include <iostream>
#include <stdio.h>
#ifndef deviceCode_H
#include "deviceCode.h"
#endif
// Program entry point: runs the device test and reports success.
// The command-line arguments are not used.
int main(int argc, char* argv[]){
    (void)argc;
    (void)argv;
    CallingInMain();
    return 0;
}
Some colleagues pointed out to me that your code seems to have an error in it.
Consider this line in your kernel:
data[threadIdx.x] = dataIntern<int>(3);
This line instantiates a temporary dataIntern<int> object, runs the constructor with a value of 3 on it, and then does a copy from that object to the storage in data[threadIdx.x]. Note that the constructor performs a malloc operation:
CUDA_CALLABLE_MEMBER dataIntern(int var){
...
possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
But since the original object is temporary, the C++ standard allows the object to be deleted at the termination of the statement, i.e. at the semicolon here:
data[threadIdx.x] = dataIntern<int>(3);
^
after the copy-construction process is complete. But the deletion of the object triggers the destructor, which does a free operation on possibilities:
CUDA_CALLABLE_MEMBER ~dataIntern(){
_assign = 0;
if(possibilities)
free(possibilities);
}
Therefore usage of the pointer so allocated subsequent to this line of code:
data[threadIdx.x] = dataIntern<int>(3);
such as in unstack here:
vetor = data[threadIdx.x].unstack();
will be invalid.
This is a violation of C++ programming rules, and the error is not specific to CUDA.

CUDA - unexpected result with float array

I faced an issue I do not understand. I am trying to set values of an array in the device. With int array I am doing this this way:
#define VECTOR_SIZE 8
// Copies eight ints to the device, runs kernel() with one thread per element
// and copies the result back into output_h.
int main()
{
    int input_h[VECTOR_SIZE] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    int output_h[VECTOR_SIZE];
    int *input_d;
    int *output_d;
    const int size = VECTOR_SIZE*sizeof(int);  // bytes per buffer

    printf("Start\n");

    cudaMalloc(&input_d, size);
    cudaMalloc(&output_d, size);
    cudaMemcpy(input_d, input_h, size, cudaMemcpyHostToDevice);

    kernel<<<1, VECTOR_SIZE>>>(input_d, output_d);

    // blocking copy on the default stream; it waits for the kernel
    cudaMemcpy(output_h, output_d, size, cudaMemcpyDeviceToHost);

    cudaFree(input_d);
    cudaFree(output_d);
    return 0;
}
The kernel looks like:
// Each thread writes its own global index into output. 'input' is unused.
// There is no bounds check: the launch must not create more threads than
// output has elements.
__global__ void kernel(int* input, int* output)
{
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
    output[globalIdx] = globalIdx;
}
The output (output_h) is just like I expected {0, 1, 2, 3, 4, 5, 6, 7}. Now when I am trying do the same on float array:
#define VECTOR_SIZE 8
// Copies eight floats to the device, runs kernel() with one thread per
// element and copies the result back into output_h.
int main()
{
    float input_h[VECTOR_SIZE] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    float output_h[VECTOR_SIZE];
    float *input_d;
    float *output_d;
    const int size = VECTOR_SIZE*sizeof(float);  // bytes per buffer

    printf("Start\n");

    cudaMalloc(&input_d, size);
    cudaMalloc(&output_d, size);
    cudaMemcpy(input_d, input_h, size, cudaMemcpyHostToDevice);

    kernel<<<1, VECTOR_SIZE>>>(input_d, output_d);

    // blocking copy on the default stream; it waits for the kernel
    cudaMemcpy(output_h, output_d, size, cudaMemcpyDeviceToHost);

    cudaFree(input_d);
    cudaFree(output_d);
    return 0;
}
with kernel:
// Each thread stores its own global index, converted to float, into output.
// 'input' is unused. There is no bounds check: the launch must not create
// more threads than output has elements.
__global__ void kernel(float* input, float* output)
{
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
    output[globalIdx] = globalIdx;
}
I am receiving zero array on the device in output_h variable.
The full code for handling float arrays:
#include "cuda_runtime.h"
#include <stdio.h>
#define VECTOR_SIZE 8
// Each thread stores its own global index, converted to float, into output.
// 'input' is unused. There is no bounds check: the launch must not create
// more threads than output has elements.
__global__ void kernel(float* input, float* output)//, int halfSize)
{
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
    output[globalIdx] = globalIdx;
}
// Runs the float kernel over eight elements, copies the result back and
// prints it.
int main()
{
printf("Start\n");
float *input_d;
float *output_d;
float output_h[VECTOR_SIZE];
float input_h[VECTOR_SIZE] = { 1, 2, 3, 4, 5, 6, 7, 8 };
// size of one buffer in bytes
int size = VECTOR_SIZE*sizeof(float);
cudaMalloc(&input_d,size);
cudaMalloc(&output_d,size);
cudaMemcpy(input_d,input_h,size,cudaMemcpyHostToDevice);
// one block with one thread per element
kernel<<<1,VECTOR_SIZE>>>(input_d,output_d);
cudaMemcpy(output_h,output_d,size, cudaMemcpyDeviceToHost);
cudaFree(input_d);
cudaFree(output_d);
int i;
for (i=1; i<=VECTOR_SIZE; i++)
{
// NOTE(review): output_h holds floats but the format is %d; floating point
// values need %f.
printf("%d, ", output_h[i-1]);
}
// waits for a key press before exiting
getchar();
return 0;
}
Both the integer and floating point versions of CUDA code you have posted work perfectly. The only mistake is how you are printing out the values returned by the kernel in the case of the floating point code:
int i;
for (i=1; i<=VECTOR_SIZE; i++)
{
printf("%d, ", output_h[i-1]);
}
should be changed to
int i;
for (i=0; i<VECTOR_SIZE; i++)
{
printf("%f, ", output_h[i]);
}
(note that the %f format is required for printing floating point numbers).
Given that CUDA uses a C++ compiler for host code by default, you should probably prefer iostream to printf - it will work irrespective of the type of the output and not cause the error you are seeing. If I were to write a "universal" version of your example it would look like this:
#include <iostream>
// Each thread stores its own global index, converted to T, into output.
// There is no bounds check: the launch must not create more threads than
// output has elements.
template<typename T>
__global__ void kernel(T* output)
{
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
    output[globalIdx] = globalIdx;
}
// Runs kernel<T> with one block of VECTOR_SIZE threads and prints the
// VECTOR_SIZE results, one per line.
// Every CUDA call is checked. On failure the error string is written to
// std::cerr and nothing is printed, so stale host data is never shown as if
// it were a result.
template<typename T, int VECTOR_SIZE>
void do_run(void)
{
    T *output_d = NULL;
    T output_h[VECTOR_SIZE] = { 999 };
    size_t size = sizeof(output_h);

    cudaError_t status = cudaMalloc(&output_d, size);
    if (status == cudaSuccess) {
        kernel<T><<<1,VECTOR_SIZE>>>(output_d);
        status = cudaGetLastError();  // launch-configuration errors
    }
    if (status == cudaSuccess) {
        // blocking copy; also reports faults raised while the kernel ran
        status = cudaMemcpy(output_h, output_d, size, cudaMemcpyDeviceToHost);
    }

    if (status == cudaSuccess) {
        for(int i=0; i<VECTOR_SIZE; i++)
            std::cout << output_h[i] << std::endl;
    } else {
        std::cerr << "CUDA error: " << cudaGetErrorString(status) << std::endl;
    }

    // freeing a null pointer is a no-op, so this is safe after a failed cudaMalloc
    cudaFree(output_d);
}
// Runs the same example once for int and once for float.
int main()
{
    const char *integerLabel = "Integer version";
    const char *floatLabel = "floating point version";

    std::cout << integerLabel << std::endl;
    do_run<int, 8>();

    std::cout << floatLabel << std::endl;
    do_run<float, 8>();

    return 0;
}
Note that the output code can be used unchanged for both int and float versions, eliminating the possibility of the mistake you made here.

Finding the local minima of a sampled function by CUDA Thrust

I want to write program using Thrust which is supposed to calculate local minima
of a given functions, f.i. sin(x). I have done this by approximating the function derivative by finite differences and then searching for those abscissas where the derivative changes sign. I now want to collect the local minima. I have marked local minima with "1"
and the other points with "0". I have done an inclusive_scan (for calculating places in new tab).
My problem is now gathering the local minima with gather_if (condition stencil, map minima),
but the code does not compile and I do not know why.
Could someone explain why?
/**
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*/
#include <stdio.h>
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/remove.h>
#include <thrust/functional.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/scan.h>
#include <sys/time.h>
// Reverses the bit order inside each byte of 'number': swaps the two nibbles
// of every byte, then the bit pairs of every nibble, then the bits of every
// pair. The order of the bytes themselves is not changed.
__host__ __device__ unsigned int bitreverse(unsigned int number) {
    const unsigned int nibbles = ((number & 0xf0f0f0f0) >> 4) | ((number & 0x0f0f0f0f) << 4);
    const unsigned int pairs = ((nibbles & 0xcccccccc) >> 2) | ((nibbles & 0x33333333) << 2);
    return ((pairs & 0xaaaaaaaa) >> 1) | ((pairs & 0x55555555) << 1);
}
// Predicate: true when x is even.
struct is_even
{
    __host__ __device__
    bool operator()(const int x) {
        const int remainder = x % 2;
        return remainder == 0;
    }
};
// Maps a negative x to 1.0f and anything else to 0.0f.
struct select_mine
{
    __host__ __device__
    float operator()(const float x) {
        if (x < 0) {
            return 1.0f;
        }
        return 0.0f;
    }
};
// Function object wrapper around bitreverse().
struct bitreverse_functor
{
    __host__ __device__ unsigned int operator()(const unsigned int &x) {
        const unsigned int reversed = bitreverse(x);
        return reversed;
    }
};
// Sign of x as a float: 1.0f for positive, -1.0f for negative, 0.0f otherwise.
struct sign
{
    __host__ __device__ float operator()(const float x) {
        return (x > 0.0f) ? 1.0f
             : (x < 0.0f) ? -1.0f
             : 0.0f;
    }
};
// Unary function object returning sinf(x).
struct sine: public thrust::unary_function<float, float>
{
    __host__ __device__
    float operator()(float x) {
        const float result = sinf(x);
        return result;
    }
};
// Unary function object returning x with a negative sign removed.
struct absolute: public thrust::unary_function<float, float>
{
    __host__ __device__
    float operator()(float x) {
        return (x < 0.0f) ? -x : x;
    }
};
// Local-minimum marker ("lokalne minimum" is Polish for "local minimum"):
// returns 1.0f when x is positive and y is negative, 0.0f otherwise.
struct lokalne_minimum : public thrust::binary_function<float,float,float>
{
    __host__ __device__
    float operator()(float x, float y)
    {
        const bool signChange = (x > 0) && (y < 0);
        return signChange ? 1.0f : 0.0f;
    }
};
// Converts a float to int by truncation towards zero.
struct conv : public thrust::unary_function<float,int>
{
    __host__ __device__
    int operator()(float x)
    {
        return static_cast<int>(x);
    }
};
using namespace thrust;
// Prints the usage message to stderr; 'arg' is inserted as the program name.
// (The text is Polish: "Invalid usage: <prog> [x1] [x2] [n]", where x1 and x2
// are the range bounds and n is the number of subdivisions.)
void help(char *arg) {
    const char *usage =
            "Nieprawidlowe uzycie: %s [x1] [x2] [n]\nx1 - zakres od\nx2 - zakres do\nn - liczba podzialow zakresu\n";
    fprintf(stderr, usage, arg);
}
// Prints the first n entries of 'values' as "%f, " followed by a newline.
static void print_floats(const thrust::host_vector<float> &values, int n) {
    for (int i = 0; i < n; i++)
        printf ("%f, ", values[i]);
    printf ("\n");
}

// Prints the first n entries of 'values' as "%d, " followed by a newline.
static void print_ints(const thrust::host_vector<int> &values, int n) {
    for (int i = 0; i < n; i++)
        printf ("%d, ", values[i]);
    printf ("\n");
}

// Samples sin(x) at n points starting at x1 with step (x2 - x1) / n, marks
// the points where the backward difference changes sign from negative to
// positive, and prints the intermediate arrays.
// Usage: <prog> x1 x2 n. Returns 1 on invalid arguments, 0 otherwise.
int main(int argc, char **argv) {
    if (argc != 4) {
        help(argv[0]);
        return 1;
    }
    int n = atoi(argv[3]);
    float x1 = (float) atof(argv[1]);
    float x2 = (float) atof(argv[2]);
    // n == 0 is rejected as well: it would divide by zero below and make
    // begin() + 1 run past end() on the empty vectors.
    if (n <= 0 || x2 < x1) {
        help(argv[0]);
        return 1;
    }
    float step = (x2 - x1) / n;
    fprintf(stderr, "Step: %f\n", step);

    thrust::device_vector<float> oxdata(n);
    thrust::device_vector<float> oydata(n);
    thrust::device_vector<float> diff(n);
    thrust::host_vector<float> ixdata(n);
    // FIXME change it
    for (int i = 0; i < n; i++)
        ixdata[i] = x1 + i * step;
    thrust::copy(ixdata.begin(), ixdata.end(), oxdata.begin());

    // y = sin(x)
    thrust::transform(oxdata.begin(),oxdata.end(),oydata.begin(),sine());
    // diff[i] = y[i] - y[i-1] for i >= 1; diff[0] keeps its initial value
    thrust::transform(oydata.begin() + 1, oydata.end(), oydata.begin(),
            diff.begin()+1, thrust::minus<float>());
    thrust::copy(diff.begin(), diff.end(), ixdata.begin());
    print_floats(ixdata, n);

    // Mark sign changes. The marks go into a separate vector first, because
    // element i is computed from diff[i+1] and diff[i]; writing diff[i] in
    // place would overlap the shifted input range that other elements read.
    thrust::device_vector<float> marks(n);
    thrust::transform(diff.begin()+1, diff.end(), diff.begin(), marks.begin(), lokalne_minimum());
    // Only the first n-1 entries are marks; diff[n-1] keeps its difference
    // value, exactly as the in-place form left it.
    thrust::copy(marks.begin(), marks.end() - 1, diff.begin());
    // ixdata has not been refreshed here, so this prints the differences again
    print_floats(ixdata, n);

    thrust::copy(oydata.begin(), oydata.end(), ixdata.begin());
    print_floats(ixdata, n);
    thrust::copy(diff.begin(), diff.end(), ixdata.begin());
    print_floats(ixdata, n);
    //thrust::inclusive_scan(diff.begin(),diff.end(),diff.begin());
    thrust::copy(diff.begin(), diff.end(), ixdata.begin());
    print_floats(ixdata, n);

    thrust::device_vector<int> minima(n);
    thrust::device_vector<int> stencil(n);
    thrust::host_vector<int> hminima(n);
    // integer copy of the marks; 'stencil' keeps the unscanned version
    thrust::transform(diff.begin(),diff.end(),minima.begin(),conv());
    thrust::copy(minima.begin(),minima.end(),hminima.begin());
    thrust::copy(minima.begin(),minima.end(),stencil.begin());
    print_ints(hminima, n);

    // running count of the marks
    thrust::inclusive_scan(minima.begin(), minima.end(),minima.begin());
    thrust::copy(minima.begin(),minima.end(),hminima.begin());
    print_ints(hminima, n);
    //thrust::gather_if(minima.begin(),minima.end(),stencil.begin(),ixdata.begin(),ixdata.begin());
    return 0;
}
This is a very late answer, provided to remove this question from the unanswered list. Building on Robert Crovella's comment, I show below a complete working code that finds the local minima of a sampled function with CUDA Thrust. The rationale of the code is as follows:
The function derivative is approximated by central differences as an application of thrust::transform;
The function sampling points at which the derivative changes sign are marked with "1" by an application of thrust::transform with the functor local_minimum_check();
The number of local minima is counted as an application of thrust::count;
The local minima are isolated as an application of thrust::copy_if.
#include <stdio.h>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
/**************/
/* COS STRUCT */
/**************/
// Unary function object returning cosf(h_x).
struct cosine: public thrust::unary_function<float, float>
{
    __host__ __device__ float operator()(float h_x)
    {
        const float value = cosf(h_x);
        return value;
    }
};
/******************************************/
/* SECOND ORDER CENTRAL DIFFERENCE STRUCT */
/******************************************/
// For a tuple (f_1, f0, f1) returns f_1 - 2*f0 + f1.
struct second_order_central_difference
{
    __host__ __device__ float operator()(thrust::tuple<float,float,float> t)
    {
        const float previous = thrust::get<0>(t);
        const float centre = thrust::get<1>(t);
        const float next = thrust::get<2>(t);
        return previous - 2.0f * centre + next;
    }
};
/******************************/
/* LOCAL MINIMUM CHECK STRUCT */
/******************************/
// Returns 1.0f when x is negative and y is positive, 0.0f otherwise.
struct local_minimum_check:public thrust::binary_function<float,float,float>
{
    __host__ __device__ float operator()(float x, float y)
    {
        const bool negativeToPositive = (x < 0) && (y > 0);
        return negativeToPositive ? 1.0f : 0.0f;
    }
};
/****************************************/
/* LOCAL MINIMUM PREDICATE CHECK STRUCT */
/****************************************/
// Predicate: true when the flag equals 1.
struct pred
{
    __host__ __device__ bool operator()(const int d_x)
    {
        // comparing the int with 1.f holds exactly when the int is 1
        return d_x == 1;
    }
};
// Samples cos(x) at n points, flags the samples where the central difference
// changes sign from negative to positive, and prints the abscissas of the
// flagged samples. Waits for a key press before exiting.
// NOTE: the vector sizes n-2 and n-3 below require n >= 4.
int main() {

    // --- Input parameters
    int n = 100; // Number of sampling points
    float x1 = 3.14f / 2.f; // (x1,x2) is the sampling interval
    float x2 = 1.5f * 3.14f;

    // --- Calculating the sampling points x
    thrust::host_vector<float> h_x(n);
    float step = (x2 - x1) / n;
    for (int i = 0; i < n; i++) h_x[i] = x1 + (float)i * step;
    thrust::device_vector<float> d_x = h_x;

    // --- Evaluating the function values y = f(x)
    thrust::device_vector<float> d_y(n);
    thrust::transform(d_x.begin(),d_x.end(),d_y.begin(),cosine());

    // --- Computing first order central finite differences
    // In Matlab's notation, it calculates d_diff1(1:n-2) = d_y(3:n,:) - d_y(1:n-2,:);
    thrust::device_vector<float> d_diff1(n-2);
    thrust::transform(d_y.begin() + 2, d_y.end(), d_y.begin(), d_diff1.begin(), thrust::minus<float>());

    // --- Computing second order central finite differences
    // In Matlab's notation, it calculates d_diff2(1:n-2) = d_y(3:n) - 2. * d_y(2:n-1) + d_y(1:n-2);
    thrust::device_vector<float> d_diff2(n-2);
    thrust::transform(thrust::make_zip_iterator(
                          thrust::make_tuple(d_y.begin(), d_y.begin() + 1, d_y.begin() + 2)),
                      thrust::make_zip_iterator(
                          thrust::make_tuple(d_y.end() - 2, d_y.end() - 1, d_y.end())),
                      d_diff2.begin(),second_order_central_difference());

    // --- Setting a flag for all those points for which the derivative changes sign from negative to positive
    thrust::device_vector<float> d_fo_derivative(n-3);
    thrust::transform(d_diff1.begin(), d_diff1.end() - 1, d_diff1.begin() + 1, d_fo_derivative.begin(), local_minimum_check());

    // --- Counting the number of local minima and selecting the local minima coordinates
    int min_number = thrust::count(d_fo_derivative.begin(), d_fo_derivative.end(), 1.f);
    thrust::device_vector<float> d_x_minima(min_number);
    // The stencil holds n-3 flags, so the input range must be exactly that
    // long; a longer range would read past the end of the stencil.
    thrust::copy_if(d_x.begin() + 1, d_x.begin() + 1 + d_fo_derivative.size(),
                    d_fo_derivative.begin(), d_x_minima.begin(), pred());

    for (int i = 0; i < (int)d_x_minima.size(); i++) {
        printf ("Local minimum # %i = %f\n ", i+1, (float)d_x_minima[i]);
    }

    getchar();
    return 0;
}

CUDA Thrust reduction by key with a tuple key

I've two vectors, and after creating a tuple (with zip_iterator) I would order them with sort_by_key and then apply reduce_by_key.
But the reduction by key doesn't work well since it creates an incorrect vector counter. May someone help me? Here is my relevant code snippet.
...
typedef thrust::device_vector<int>::iterator IntIterator;
typedef thrust::tuple<IntIterator, IntIterator> IteratorTuple;
typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
typedef thrust::tuple<int, int> tupla;
...
thrust::device_vector <int> documenti(n);
thrust::device_vector <int> strip(n);
...
ZipIterator bufferBegin (thrust::make_tuple(documenti.begin(),strip.begin()));
ZipIterator bufferEnd (thrust::make_tuple(documenti.end(),strip.end()));
...
thrust::sort_by_key(bufferBegin,bufferEnd, counter.begin());
thrust::device_vector <tupla> example(n);
thrust::reduce_by_key(bufferBegin,bufferEnd, counter.begin(), example.begin(), counter.begin());
thrust::sort_by_key(counter.begin(), counter.begin()+n, example.begin(),thrust::greater <int>());
I'm providing an answer to this question just to remove it from the unanswered list.
Your question is not entirely clear to me. From the code snippet you posted, my understanding is that you are interested in a reduction by key with tuple keys.
Below you can find a full worked example. I hope that it could be helpful to future users.
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
// --- Defining key tuple type
typedef thrust::tuple<int,int> Tuple;
typedef thrust::host_vector<Tuple>::iterator dIter1;
typedef thrust::host_vector<float>::iterator dIter2;
/************************************/
/* EQUALITY OPERATOR BETWEEN TUPLES */
/************************************/
// Equality test for two (int, int) key tuples: true when both components match.
struct BinaryPredicate
{
    __host__ __device__ bool operator ()
        (const Tuple& lhs, const Tuple& rhs)
    {
        if (thrust::get<0>(lhs) != thrust::get<0>(rhs)) {
            return false;
        }
        return thrust::get<1>(lhs) == thrust::get<1>(rhs);
    }
};
/********/
/* MAIN */
/********/
// Host-side example: sums the values of runs of consecutive equal (int, int)
// keys with thrust::reduce_by_key and prints the input keys, the reduced
// values and the reduced keys.
int main()
{
    const int N = 7;

    int keys1_input[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys 1
    int keys2_input[N] = {1, 5, 3, 8, 2, 2, 1}; // input keys 2
    float input_values[N] = {9., 8., 7., 6., 5., 4., 3.}; // input values

    // Pack the two key arrays into tuples and copy the values.
    thrust::host_vector<Tuple> keys_input(N);
    thrust::host_vector<float> values_input(N);
    for (int k = 0; k < N; ++k) {
        keys_input[k] = thrust::make_tuple(keys1_input[k], keys2_input[k]);
        values_input[k] = input_values[k];
    }

    for (int k = 0; k < N; ++k) {
        printf("%i %i\n", thrust::get<0>(keys_input[k]), thrust::get<1>(keys_input[k]));
    }

    thrust::host_vector<Tuple> keys_output(N);
    thrust::host_vector<float> values_output(N);

    thrust::pair<dIter1, dIter2> new_end;
    new_end = thrust::reduce_by_key(keys_input.begin(), keys_input.end(),
                                    values_input.begin(),
                                    keys_output.begin(), values_output.begin(),
                                    BinaryPredicate(), thrust::plus<float>());

    // number of reduced keys actually written
    int Nkeys = new_end.first - keys_output.begin();

    printf("\n\n");
    for (int k = 0; k < Nkeys; ++k) {
        printf("%i; %f\n", k, values_output[k]);
    }

    printf("\n\n");
    for (int k = 0; k < Nkeys; ++k) {
        printf("%i %i\n", thrust::get<0>(keys_output[k]), thrust::get<1>(keys_output[k]));
    }

    return 0;
}
EDIT
The above worked example referred to host_vector's. Below, a fully worked example considering the case when key and value vectors are regular cudaMalloc'ed arrays.
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include "Utilities.cuh"
// --- Defining key tuple type
typedef thrust::tuple<int, int> Tuple;
typedef thrust::device_vector<Tuple>::iterator dIter1;
typedef thrust::device_vector<float>::iterator dIter2;
/************************************/
/* EQUALITY OPERATOR BETWEEN TUPLES */
/************************************/
// Equality test for two (int, int) key tuples: true when both components match.
struct BinaryPredicate
{
    __host__ __device__ bool operator ()
        (const Tuple& lhs, const Tuple& rhs)
    {
        if (thrust::get<0>(lhs) != thrust::get<0>(rhs)) {
            return false;
        }
        return thrust::get<1>(lhs) == thrust::get<1>(rhs);
    }
};
/********/
/* MAIN */
/********/
// Device-side example: the keys and values live in cudaMalloc'ed arrays that
// are wrapped in thrust::device_ptr, zipped into (int, int) keys and reduced
// with thrust::reduce_by_key. Prints the reduced values and keys, then frees
// the raw device arrays.
int main()
{
    const int N = 7;

    // --- Keys and input values on the host: allocation and definition
    int h_keys1_input[N] = { 1, 3, 3, 3, 2, 2, 1 }; // --- Input keys 1 - host side
    int h_keys2_input[N] = { 1, 5, 3, 8, 2, 2, 1 }; // --- Input keys 2 - host side
    float h_input_values[N] = { 9., 8., 7., 6., 5., 4., 3. }; // --- Input values - host side

    // --- Keys and input values on the device: allocation
    int *d_keys1_input; gpuErrchk(cudaMalloc(&d_keys1_input, N * sizeof(int))); // --- Input keys 1 - device side
    int *d_keys2_input; gpuErrchk(cudaMalloc(&d_keys2_input, N * sizeof(int))); // --- Input keys 2 - device side
    float *d_input_values; gpuErrchk(cudaMalloc(&d_input_values, N * sizeof(float))); // --- Input values - device side

    // --- Keys and input values: host -> device
    gpuErrchk(cudaMemcpy(d_keys1_input, h_keys1_input, N * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_keys2_input, h_keys2_input, N * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_input_values, h_input_values, N * sizeof(float), cudaMemcpyHostToDevice));

    // --- From raw pointers to device_ptr
    thrust::device_ptr<int> dev_ptr_keys1 = thrust::device_pointer_cast(d_keys1_input);
    thrust::device_ptr<int> dev_ptr_keys2 = thrust::device_pointer_cast(d_keys2_input);
    thrust::device_ptr<float> dev_ptr_values = thrust::device_pointer_cast(d_input_values);

    // --- Declare outputs
    thrust::device_vector<Tuple> d_keys_output(N);
    thrust::device_vector<float> d_values_output(N);

    thrust::pair<dIter1, dIter2> new_end;

    // zip the two key arrays so that each element is seen as an (int, int) tuple
    auto begin = thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_keys1, dev_ptr_keys2));
    auto end = thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_keys1 + N, dev_ptr_keys2 + N));

    new_end = thrust::reduce_by_key(begin,
                                    end,
                                    dev_ptr_values,
                                    d_keys_output.begin(),
                                    d_values_output.begin(),
                                    BinaryPredicate(),
                                    thrust::plus<float>());

    // number of reduced keys actually written
    int Nkeys = new_end.first - d_keys_output.begin();

    printf("\n\n");
    for (int i = 0; i < Nkeys; i++) {
        float output = d_values_output[i];
        printf("%i; %f\n", i, output);
    }

    thrust::host_vector<Tuple> h_keys_output(d_keys_output);
    printf("\n\n");
    for (int i = 0; i < Nkeys; i++) {
        int key1 = thrust::get<0>(h_keys_output[i]);
        int key2 = thrust::get<1>(h_keys_output[i]);
        printf("%i %i\n", key1, key2);
    }

    // --- Release the raw device arrays; the thrust vectors free themselves
    gpuErrchk(cudaFree(d_keys1_input));
    gpuErrchk(cudaFree(d_keys2_input));
    gpuErrchk(cudaFree(d_input_values));

    return 0;
}