Elementwise power operation using CUDA Thrust - cuda

Is there a way of transforming a thrust vector with a pow function? In other words, I want to transform each element x of a vector to pow(x,a), with a a constant.

Please refer to Section Transformations in Thrust Quict Start Guide for how to write a functor with initialized parameters.
struct saxpy_functor
{
const float a;
saxpy_functor(float _a) : a(_a) {}
__host__ __device__
float operator()(const float& x, const float& y) const {
return a * x + y;
}
};

Here is a full example. As #Eric has mentioned, all what is needed is defining your own power functor and using thrust::transform.
#include <thrust/sequence.h>
#include <thrust/device_vector.h>
class power_functor {
double a;
public:
power_functor(double a_) { a = a_; }
__host__ __device__ double operator()(double x) const
{
return pow(x,a);
}
};
void main() {
int N = 20;
thrust::device_vector<double> d_n(N);
thrust::sequence(d_n.begin(), d_n.end());
thrust::transform(d_n.begin(),d_n.end(),d_n.begin(),power_functor(2.));
for (int i=0; i<N; i++) {
double val = d_n[i];
printf("Device vector element number %i equal to %f\n",i,val);
}
getchar();
}

Related

Thrust Gathering/Filtering

What I am trying to do is create a filter on a vector so it removes elements that do not pass a predicate test; but not too sure how I go about it.
I evaluate each element in my inputer vector against the predicate, for example in my code the is_even functor, in a device_vector vector. It is true if it passes the test and false if it's not.
Now I am stuck because I now have this bool vector and I want to gather the elements that passed this predicate test. I store it in a bool vector because I want to keep the result to filter other vectors.
#include ...
template<typename T>
struct is_even : thrust::unary_function<T, bool>
{
__host__ __device__
bool operator()(const T &x)
{
return (x%2)==0;
}
};
int main(void)
{
std::cout << "Loading test!" << std::endl;
const int N = 1000000;
thrust::device_vector<int> col1(N);
thrust::device_vector<float> col2(N, 1);
thrust::sequence(col1.begin(), col1.end());
thrust::device_vector<bool> filter(N);
thrust::transform(col1.begin(), col1.end(), filter.begin(), is_even<int>());
// filter col1 and col2 based on filter
return 0;
}
Within the stream compaction group you may be interested in thrust::copy_if
We can select the even elements into a new vector directly using your defined predicate without making an intermediate filter vector:
thrust::copy_if(col1.begin(), col1.end(), result.begin(), is_even<int>());
(result should be a vector of identical type to col1, and already defined to be a length equal to or greater than col1, since it's unknown how many elements will pass the predicate test.)
If you want to work off of the filter vector you have created, use the stencil version of copy_if instead.
Here's a worked example using the stencil method based on your comments:
$ cat t267.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
template<typename T>
struct is_even : thrust::unary_function<T, bool>
{
__host__ __device__
bool operator()(const T &x)
{
return (x%2)==0;
}
};
struct is_true : thrust::unary_function<bool, bool>
{
__host__ __device__
bool operator()(const bool &x)
{
return x;
}
};
int main(void)
{
std::cout << "Loading test!" << std::endl;
const int N = 1000000;
thrust::device_vector<int> col1(N);
thrust::device_vector<float> col2(N, 1);
thrust::sequence(col1.begin(), col1.end());
thrust::device_vector<bool> filter(N);
thrust::device_vector<int> result(N);
thrust::transform(col1.begin(), col1.end(), filter.begin(), is_even<int>());
// filter col1 based on filter
thrust::device_vector<int>::iterator end = thrust::copy_if(col1.begin(), col1.end(), filter.begin(), result.begin(), is_true());
int len = end - result.begin();
thrust::host_vector<int> h_result(len);
thrust::copy_n(result.begin(), len, h_result.begin());
thrust::copy_n(h_result.begin(), 10, std::ostream_iterator<int>(std::cout, "\n"));
return 0;
}
$ nvcc -arch=sm_20 -o t267 t267.cu
$ ./t267
Loading test!
0
2
4
6
8
10
12
14
16
18
$

Simpson's method to integrate real valued functions with CUDA

I'm trying to code integration by Simpson's method in CUDA.
This is the formula for Simpson's rule
where x_k = a + k*h.
Here's my code
__device__ void initThreadBounds(int *n_start, int *n_end, int n,
int totalBlocks, int blockWidth)
{
int threadId = blockWidth * blockIdx.x + threadIdx.x;
int nextThreadId = threadId + 1;
int threads = blockWidth * totalBlocks;
*n_start = (threadId * n)/ threads;
*n_end = (nextThreadId * n)/ threads;
}
__device__ float reg_func (float x)
{
return x;
}
typedef float (*p_func) (float);
__device__ p_func integrale_f = reg_func;
__device__ void integralSimpsonMethod(int totalBlocks, int totalThreads,
double a, double b, int n, float p_function(float), float* result)
{
*result = 0;
float h = (b - a)/n;
//*result = p_function(a)+p_function(a + h * n);
//parallel
int idx_start;
int idx_end;
initThreadBounds(&idx_start, &idx_end, n-1, totalBlocks, totalThreads);
//parallel_ends
for (int i = idx_start; i < idx_end; i+=2) {
*result += ( p_function(a + h*(i-1)) +
4 * p_function(a + h*(i)) +
p_function(a + h*(i+1)) ) * h/3;
}
}
__global__ void integralSimpson(int totalBlocks, int totalThreads, float* result)
{
float res = 0;
integralSimpsonMethod(totalBlocks, totalThreads, 0, 10, 1000, integrale_f, &res);
result[(blockIdx.x*totalThreads + threadIdx.x)] = res;
//printf ("Simpson method\n");
}
__host__ void inttest()
{
const int blocksNum = 32;
const int threadNum = 32;
float *device_resultf;
float host_resultf[threadNum*blocksNum]={0};
cudaMalloc((void**) &device_resultf, sizeof(float)*threadNum*blocksNum);
integralSimpson<<<blocksNum, threadNum>>>(blocksNum, threadNum, device_resultf);
cudaThreadSynchronize();
cudaMemcpy(host_resultf, device_resultf, sizeof(float) *threadNum*blocksNum,
cudaMemcpyDeviceToHost);
float sum = 0;
for (int i = 0; i != blocksNum*threadNum; ++i) {
sum += host_resultf[i];
// printf ("result in %i cell = %f \n", i, host_resultf[i]);
}
printf ("sum = %f \n", sum);
cudaFree(device_resultf);
}
int main(int argc, char* argv[])
{
inttest();
int i;
scanf ("%d",&i);
}
The problem is: it works wrong when n is lower than 100000. For an integral from 0 to 10, the result is ~99, but when n = 100000 or larger it works fine and the result is ~50.
What's wrong, guys?
The basic problem here is that you don't understand your own algorithm.
Your integralSimpsonMethod() function is designed such that each thread is sampling at least 3 quadrature points per sub-interval in the integral domain. Therefore, if you choose n so that it is less than four times the number of threads in the kernel call, it is inevitable that each sub interval will overlap and the resulting integral will be incorrect. You need to make sure that the code checks and scales the thread count or n so that they don't produce overlap when the integral is computed.
If you are doing this for anything other than self-edification, then I recommend you look up the composite version of Simpson's rule. This is much better suited to parallel implementation and will be considerably more performant if implemented correctly.
I would propose an approach to Simpson's integration by using CUDA Thrust. You basically need five steps:
Generate the Simpson's quadrature weights;
Generate the function sampling points;
Generate the function values;
Calculate the elementwise product between the quadrature weights and the function values;
Sum the above products.
Step #1 requires creating an array with elements repeated many times, namely, 1 4 2 4 2 4 ... 1 for the Simpson's case. This can be accomplished by borrowing Robert Crovella's approach in cuda thrust library repeat vector multiple times.
Step #2 can be accomplished by using couting_iterators and borrowing talonmies approach in Purpose and usage of counting_iterators in CUDA Thrust library.
Step #3 is an application of thrust::transform.
Steps #4 and #5 can be accomplished together by thrust::inner_product.
This approach can be exploited also for use when other quadrature integration rules are of interest.
Here is the code
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <thrust/copy.h>
#include <ostream>
#define STRIDE 2
#define N 100
#define pi_f 3.14159265358979f // Greek pi in single precision
struct sin_functor
{
__host__ __device__
float operator()(float x) const
{
return sin(2.f*pi_f*x);
}
};
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
int main(void)
{
// --- Generate the integration coefficients
thrust::host_vector<float> h_coefficients(STRIDE);
h_coefficients[0] = 4.f;
h_coefficients[1] = 2.f;
thrust::device_vector<float> d_coefficients(N);
typedef thrust::device_vector<float>::iterator Iterator;
strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-2, STRIDE);
strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);
thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);
d_coefficients[0] = 1.f;
d_coefficients[N-1] = 1.f;
// print the generated d_coefficients
std::cout << "d_coefficients: ";
thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " ")); std::cout << std::endl;
// --- Generate sampling points
float a = 0.f;
float b = .5f;
float Dx = (b-a)/(float)(N-1);
thrust::device_vector<float> d_x(N);
thrust::transform(thrust::make_counting_iterator(a/Dx),
thrust::make_counting_iterator((b+1.f)/Dx),
thrust::make_constant_iterator(Dx),
d_x.begin(),
thrust::multiplies<float>());
// --- Calculate function values
thrust::device_vector<float> d_y(N);
thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());
// --- Calculate integral
float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);
printf("The integral is = %f\n", integral);
getchar();
return 0;
}

CUDA function pointers

I was trying to make somtehing like this (actually I need to write some integration functions) in CUDA
#include <iostream>
using namespace std;
float f1(float x) {
return x * x;
}
float f2(float x) {
return x;
}
void tabulate(float p_f(float)) {
for (int i = 0; i != 10; ++i) {
std::cout << p_f(i) << ' ';
}
std::cout << std::endl;
}
int main() {
tabulate(f1);
tabulate(f2);
return 0;
}
output:
0 1 4 9 16 25 36 49 64 81
0 1 2 3 4 5 6 7 8 9
I tried the following but only got the error
Error: Function pointers and function template parameters are not supported in sm_1x.
float f1(float x) {
return x;
}
__global__ void tabulate(float lower, float upper, float p_function(float), float* result) {
for (lower; lower < upper; lower++) {
*result = *result + p_function(lower);
}
}
int main() {
float res;
float* dev_res;
cudaMalloc( (void**)&dev_res, sizeof(float) ) ;
tabulate<<<1,1>>>(0.0, 5.0, f1, dev_res);
cudaMemcpy(&res, dev_res, sizeof(float), cudaMemcpyDeviceToHost);
printf("%f\n", res);
/************************************************************************/
scanf("%s");
return 0;
}
To get rid of your compile error, you'll have to use -gencode arch=compute_20,code=sm_20 as a compiler argument when compiling your code. But then you'll likely have some runtime problems:
Taken from the CUDA Programming Guide http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#functions
Function pointers to __global__ functions are supported in host code, but not in device code.
Function pointers to __device__ functions are only supported in device code compiled for devices of compute capability 2.x and higher.
It is not allowed to take the address of a __device__ function in host code.
so you can have something like this (adapted from the "FunctionPointers" sample):
//your function pointer type - returns unsigned char, takes parameters of type unsigned char and float
typedef unsigned char(*pointFunction_t)(unsigned char, float);
//some device function to be pointed to
__device__ unsigned char
Threshold(unsigned char in, float thresh)
{
...
}
//pComputeThreshold is a device-side function pointer to your __device__ function
__device__ pointFunction_t pComputeThreshold = Threshold;
//the host-side function pointer to your __device__ function
pointFunction_t h_pointFunction;
//in host code: copy the function pointers to their host equivalent
cudaMemcpyFromSymbol(&h_pointFunction, pComputeThreshold, sizeof(pointFunction_t))
You can then pass the h_pointFunction as a parameter to your kernel, which can use it to call your __device__ function.
//your kernel taking your __device__ function pointer as a parameter
__global__ void kernel(pointFunction_t pPointOperation)
{
unsigned char tmp;
...
tmp = (*pPointOperation)(tmp, 150.0)
...
}
//invoke the kernel in host code, passing in your host-side __device__ function pointer
kernel<<<...>>>(h_pointFunction);
Hopefully that made some sense. In all, it looks like you would have to change your f1 function to be a __device__ function and follow a similar procedure (the typedefs aren't necessary, but they do make the code nicer) to get it as a valid function pointer on the host-side to pass to your kernel. I'd also advise giving the FunctionPointers CUDA sample a look over
Even though you may be able to compile this code (see #Robert Crovella's answer) this code will not work. You cannot pass function pointers from host code as the host compiler has no way of figuring out the function address.
Here is a simple class for function pointers that are callable from within a kernel I wrote based on this question:
template <typename T>
struct cudaCallableFunctionPointer
{
public:
cudaCallableFunctionPointer(T* f_)
{
T* host_ptr = (T*)malloc(sizeof(T));
cudaMalloc((void**)&ptr, sizeof(T));
cudaMemcpyFromSymbol(host_ptr, *f_, sizeof(T));
cudaMemcpy(ptr, host_ptr, sizeof(T), cudaMemcpyHostToDevice);
cudaFree(host_ptr)
}
~cudaCallableFunctionPointer()
{
cudaFree(ptr);
}
T* ptr;
};
you could use it like this:
__device__ double func1(double x)
{
return x + 1.0f;
}
typedef double (*func)(double x);
__device__ func f_ = func1;
__global__ void test_kernel(func* f)
{
double x = (*f)(2.0);
printf("%g\n", x);
}
int main()
{
cudaCallableFunctionPointer<func> f(&f_);
test_kernel << < 1, 1 >> > (f.ptr);
}
output:
3

Thrust reduce not working with non equal input/output types

I'm attempting to reduce the min and max of an array of values using Thrust and I seem to be stuck. Given an array of floats what I would like is to reduce their min and max values in one pass, but using thrust's reduce method I instead get the mother (or at least auntie) of all template compile errors.
My original code contains 5 lists of values spread over 2 float4 arrays that I want reduced, but I've boiled it down to this short example.
struct ReduceMinMax {
__host__ __device__
float2 operator()(float lhs, float rhs) {
return make_float2(Min(lhs, rhs), Max(lhs, rhs));
}
};
int main(int argc, char *argv[]){
thrust::device_vector<float> hat(4);
hat[0] = 3;
hat[1] = 5;
hat[2] = 6;
hat[3] = 1;
ReduceMinMax binary_op_of_dooooom;
thrust::reduce(hat.begin(), hat.end(), 4.0f, binary_op_of_dooooom);
}
If I split it into 2 reductions instead it of course works. My question is then: Is it possible to reduce both the min and max in one pass with thrust and how? If not then what is the most efficient way of achieving said reduction? Will a transform iterator help me (and if so, will the reduction then be a one pass reduction?)
Some additional info:
I'm using Thrust 1.5 (as supplied by CUDA 4.2.7)
My actual code is using reduce_by_key, not just reduce.
I found transform_reduce while writing this question, but that one doesn't take keys into account.
As talonmies notes, your reduction does not compile because thrust::reduce expects the binary operator's argument types to match its result type, but ReduceMinMax's argument type is float, while its result type is float2.
thrust::minmax_element implements this operation directly, but if necessary you could instead implement your reduction with thrust::inner_product, which generalizes thrust::reduce:
#include <thrust/inner_product.h>
#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <cassert>
struct minmax_float
{
__host__ __device__
float2 operator()(float lhs, float rhs)
{
return make_float2(thrust::min(lhs, rhs), thrust::max(lhs, rhs));
}
};
struct minmax_float2
{
__host__ __device__
float2 operator()(float2 lhs, float2 rhs)
{
return make_float2(thrust::min(lhs.x, rhs.x), thrust::max(lhs.y, rhs.y));
}
};
float2 minmax1(const thrust::device_vector<float> &x)
{
return thrust::inner_product(x.begin(), x.end(), x.begin(), make_float2(4.0, 4.0f), minmax_float2(), minmax_float());
}
float2 minmax2(const thrust::device_vector<float> &x)
{
using namespace thrust;
pair<device_vector<float>::const_iterator, device_vector<float>::const_iterator> ptr_to_result;
ptr_to_result = minmax_element(x.begin(), x.end());
return make_float2(*ptr_to_result.first, *ptr_to_result.second);
}
int main()
{
thrust::device_vector<float> hat(4);
hat[0] = 3;
hat[1] = 5;
hat[2] = 6;
hat[3] = 1;
float2 result1 = minmax1(hat);
float2 result2 = minmax2(hat);
assert(result1.x == result2.x);
assert(result1.y == result2.y);
}

thrust::device_reference can't be used with printf?

I am using the thrust partition function to partition array into even and odd numbers. However, when i try to display the device vector, it shows random values. Please let me know where is the error. I think i have done everything correct.
#include<stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include<thrust/partition.h>
struct is_even
{
//const int toCom;
//is_even(int val):toCom(val){}
__device__
bool operator()(const int &x)
{
return x%2;
}
};
void main(){
thrust::host_vector<int> H(6);
for(int i =0 ; i<H.size();i++){
H[i] = i+1;
}
thrust::device_vector<int> D = H;
thrust::partition(D.begin(),D.end(),is_even());
for(int i =0 ;i< D.size();i++){
printf("%d,",D[i]);
}
getchar();
}
You can't send a thrust::device_reference (i.e., the result of D[i]) through printf's ellipsis because it is not a POD type. See the documentation. Your code will produce a compiler warning to this effect.
Cast to int first:
for(int i = 0; i < D.size(); ++i)
{
printf("%d,", (int) D[i]);
}