thrust::device_reference can't be used with printf?

I am using the thrust partition function to partition an array into even and odd numbers. However, when I try to display the device vector, it shows random values. Please let me know where the error is; I think I have done everything correctly.
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/partition.h>

struct is_even
{
    //const int toCom;
    //is_even(int val):toCom(val){}
    __device__
    bool operator()(const int &x)
    {
        return x % 2;
    }
};

void main(){
    thrust::host_vector<int> H(6);
    for(int i = 0; i < H.size(); i++){
        H[i] = i + 1;
    }
    thrust::device_vector<int> D = H;
    thrust::partition(D.begin(), D.end(), is_even());
    for(int i = 0; i < D.size(); i++){
        printf("%d,", D[i]);
    }
    getchar();
}

You can't send a thrust::device_reference (i.e., the result of D[i]) through printf's ellipsis because it is not a POD type. See the documentation. Your code will produce a compiler warning to this effect.
Cast to int first:
for(int i = 0; i < D.size(); ++i)
{
    printf("%d,", (int) D[i]);
}
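Alternatively, you can sidestep the varargs issue entirely. A short sketch (assuming the same vector D as above; std::cout works here because thrust::device_reference converts implicitly to its value type, a conversion printf's ellipsis never performs):
#include <iostream>

// Option 1: stream the elements; each D[i] converts to int on the fly
for (int i = 0; i < D.size(); ++i)
    std::cout << D[i] << ",";

// Option 2: copy the vector back to the host once, then print plain
// ints (one transfer instead of one per element)
thrust::host_vector<int> H_out = D;
for (int i = 0; i < H_out.size(); ++i)
    printf("%d,", H_out[i]);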

Related

In C programming, how can I use the value of an array outside the function where it is assigned?

In my program I used a function called
void printBoard(char board[26][26], int n);
This is my actual code:
#include <stdio.h>
#include <stdlib.h>

void printBoard(char board[26][26], int n);

int main(void) {
    //variable 'n' determines board dimension
    int n;
    printf("Enter the board dimension: ");
    scanf("%d", &n);
    char board[n][n];
    printBoard(board, n);
    printf("\n\n");
    for(int i = 0; i < n; i++){
        for(int j = 0; j < n; j++){
            printf("%c", board[i][j]);
        }
        printf("\n");
    }
    return (EXIT_SUCCESS);
}

void printBoard(char board[26][26], int n){
    printf(" ");
    for(int i = 0; i < n; i++){
        printf("%c", 97 + i);
    }
    printf("\n");
    for(int i = 0; i < n; i++){
        for(int j = 0; j < n; j++){
            board[i][j] = 'U';
        }
    }
    board[(n/2)-1][(n/2)-1] = 'W';
    board[n/2][n/2] = 'W';
    board[(n/2)-1][n/2] = 'B';
    board[n/2][(n/2)-1] = 'B';
    for(int i = 0; i < n; i++){
        printf("%c ", 97 + i);
        for(int j = 0; j < n; j++){
            printf("%c", board[i][j]);
        }
        printf("\n");
    }
}
The value of the array 'board[26][26]' gets assigned inside the function 'printBoard'. So the value gets lost once I am outside of the scope of this function. But I need to use the value of this array outside of this function. How can I do that?
Thank you
You're not passing the array into printBoard by value; you're just passing a pointer to its first row. The changes made inside the function persist after it returns because you're modifying the same memory, not a copy, so there is no scope problem here. If you're seeing unexpected output, it's not because of a scope issue with changes made by printBoard. One likely culprit: main declares a variable-length array char board[n][n], while printBoard is declared to take char board[26][26] (which decays to char (*)[26]), so whenever n != 26 the caller and callee disagree about the row stride and the behavior is undefined.
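As a minimal sketch of the pointer-decay point (my example, not the asker's code; it sticks to a fixed 26x26 array so the parameter type and the argument type agree):
#include <stdio.h>

/* board decays to char (*)[26]: the function writes the caller's memory */
void fillBoard(char board[26][26], int n) {
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            board[i][j] = 'U';
}

int main(void) {
    char board[26][26];
    fillBoard(board, 26);
    printf("%c\n", board[0][0]); /* prints 'U': the changes persisted */
    return 0;
}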

cuda atomicAdd example fails to yield correct output

The following code was written with the goal of incrementing a 100-element array of floats by 1, ten times. In the output I expected a value of 10.0f for each element; instead, I get random values. Can you please point out my error here?
__global__ void testAdd(float *a)
{
    float temp;
    for (int i = 0; i < 100; i++)
    {
        a[i] = atomicAdd(&a[i], 1.0f);
    }
}

void cuTestAtomicAdd(float *a)
{
    testAdd<<<1, 10>>>(a);
}
My goal is to understand the workings of atomic operations, so as to apply them elsewhere.
That's not how we do an atomicAdd operation.
Just do it like this:
atomicAdd(&a[i], 1.0f);
and the variable in question (a[i]) will be updated.
The return value from an atomic function is generally the old value that was in the variable before the atomic update. So doing this:
a[i] = atomicAdd(&a[i], 1.0f);
will atomically update the variable a[i], and then (non-atomically) assign the old value back to a[i]. That's almost certainly not what you want.
Read the documentation:
The function returns old.
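The returned old value is what you use when a thread needs to know where its atomic update landed. As a short sketch (my illustration, with hypothetical names, not code from this question), this is how a kernel reserves one unique output slot per thread:
__global__ void enqueue(int *queue, int *count, int value)
{
    // atomicAdd bumps the shared counter and hands back the *old* value,
    // which serves as this thread's private, collision-free write index
    int slot = atomicAdd(count, 1);
    queue[slot] = value;
}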
The following complete code demonstrates correct usage (ten threads each add 1.0f to all 100 elements, so every element ends up at 10.0f):
#include <cstdio>
#include <iostream>

__global__ void testAdd(float *a)
{
    for (int i = 0; i < 100; i++)
    {
        atomicAdd(&a[i], 1.0f);
    }
}

void cuTestAtomicAdd(float *a)
{
    testAdd<<<1, 10>>>(a);
}

int main(){
    float *d_data, *h_data;
    h_data = (float *) malloc(100*sizeof(float));
    cudaMalloc((void **)&d_data, 100*sizeof(float));
    cudaMemset(d_data, 0, 100*sizeof(float));
    cuTestAtomicAdd(d_data);
    cudaMemcpy(h_data, d_data, 100*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 100; i++)
        if (h_data[i] != 10.0f) {printf("mismatch at %d, was %f, should be %f\n", i, h_data[i], 10.0f); return 1;}
    printf("Success\n");
    return 0;
}

Thrust: selectively move elements to another vector

I'm trying to figure out the best way to do the following using Thrust: vector A has a million floats in some particular order. I want to move to vector B every element x of A for which x > 7.0, such that the order of elements is maintained in both A and B. Importantly, only a tiny fraction of the elements need to be moved. Efficiency is more important for my code than elegance.
My idea was to use thrust::copy_if from A to B and then thrust::remove_if on A. But I don't know the exact number of elements to be copied, and since the memory for B apparently must be allocated in advance, another counting operation is necessary. An inelegant way to skip the counting operation is to pre-allocate "enough" memory for vector B.
Using thrust::remove_copy_if has much the same problem: you need to allocate memory for B in advance, and it doesn't actually remove anything from A, so another thrust::remove_if is required anyway.
Another idea I had was to use thrust::stable_sort with a custom comparison functor to push all the elements I want out to the end of A, and then somehow figure out how many there are and thrust::copy them to B. This also looks pretty inelegant...
You're on the right track with thrust::copy_if. Just allocate two more buffers of the same size as the first one. Then copy_if > 7.0f to the first one and copy_if <= 7.0f to the second one. Allocating buffers of the same size as the original buffer is fine as long as you know there's room, and 1 million floats only takes up 4MB.
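A minimal sketch of that approach (assuming A is a thrust::device_vector<float>; is_high mirrors the functor used in the benchmark below):
struct is_high
{
    __host__ __device__ bool operator()(const float x) { return x > 7.0f; }
};

// Pre-allocate the destination at the full size of A ("enough" memory)
thrust::device_vector<float> B(A.size());

// copy_if is stable and returns an iterator one past the last element
// written, so B can be trimmed to its true size afterwards
thrust::device_vector<float>::iterator b_end =
    thrust::copy_if(A.begin(), A.end(), B.begin(), is_high());
B.resize(b_end - B.begin());

// A second copy_if with the complementary predicate (x <= 7.0f) produces
// the elements that stay, again in their original order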
Edit:
I did a performance comparison of the copy_if and stable_partition approaches. On my card, a GTX660, stable_partition took around 150% as long as copy_if for "split" values of 0.1f, 0.5f and 0.9f. I added tests to ensure that both methods are stable (maintain the order of the values).
#include <cuda.h>
#include <curand.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/partition.h>
#include <iostream>
#include <cstdio>
#include <cassert>

#define CHECK_CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    return EXIT_FAILURE;}} while(0)
#define CHECK_CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    return EXIT_FAILURE;}} while(0)

#define SPLIT 0.1f

struct is_low
{
    __host__ __device__ bool operator()(const float x)
    {
        return x <= SPLIT;
    }
};

struct is_high
{
    __host__ __device__ bool operator()(const float x)
    {
        return x > SPLIT;
    }
};

class EventTimer {
public:
    EventTimer() : mStarted(false), mStopped(false) {
        cudaEventCreate(&mStart);
        cudaEventCreate(&mStop);
    }
    ~EventTimer() {
        cudaEventDestroy(mStart);
        cudaEventDestroy(mStop);
    }
    void start(cudaStream_t s = 0) {
        cudaEventRecord(mStart, s);
        mStarted = true;
        mStopped = false;
    }
    void stop(cudaStream_t s = 0) {
        assert(mStarted);
        cudaEventRecord(mStop, s);
        mStarted = false;
        mStopped = true;
    }
    float elapsed() {
        assert(mStopped);
        if (!mStopped) return 0;
        cudaEventSynchronize(mStop);
        float elapsed = 0;
        cudaEventElapsedTime(&elapsed, mStart, mStop);
        return elapsed;
    }
private:
    bool mStarted, mStopped;
    cudaEvent_t mStart, mStop;
};

int main(int argc, char *argv[])
{
    const size_t n = 1024 * 1024 * 50;

    // Create prng
    curandGenerator_t gen;
    CHECK_CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
    // Set seed
    CHECK_CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));

    // Generate n floats on device
    thrust::device_vector<float> vec_rnd_d(n);
    float* ptr_rnd_d = thrust::raw_pointer_cast(vec_rnd_d.data());
    CHECK_CURAND_CALL(curandGenerateUniform(gen, ptr_rnd_d, n));

    thrust::device_vector<float> vec_low_d(n);
    thrust::device_vector<float> vec_high_d(n);

    for (int i = 0; i < 5; ++i) {
        EventTimer timer;
        timer.start();
        thrust::device_vector<float>::iterator iter_end;
        iter_end = thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_low_d.begin(), is_low());
        thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_high_d.begin(), is_high());
        timer.stop();
        std::cout << "copy_if: " << timer.elapsed() << "ms" << std::endl;

        // check result
        thrust::host_vector<float> vec_rnd_h = vec_rnd_d;
        thrust::host_vector<float> vec_low_h = vec_low_d;
        thrust::host_vector<float> vec_high_h = vec_high_d;
        thrust::host_vector<float>::iterator low_iter_h = vec_low_h.begin();
        thrust::host_vector<float>::iterator high_iter_h = vec_high_h.begin();
        for (thrust::host_vector<float>::iterator rnd_iter_h = vec_rnd_h.begin();
             rnd_iter_h != vec_rnd_h.end(); ++rnd_iter_h) {
            if (*rnd_iter_h <= SPLIT) {
                assert(*low_iter_h == *rnd_iter_h);
                ++low_iter_h;
            }
            else {
                assert(*high_iter_h == *rnd_iter_h);
                ++high_iter_h;
            }
        }
    }

    for (int i = 0; i < 5; ++i) {
        thrust::device_vector<float> vec_rnd_copy = vec_rnd_d;
        EventTimer timer;
        timer.start();
        thrust::device_vector<float>::iterator iter_split =
            thrust::stable_partition(vec_rnd_copy.begin(), vec_rnd_copy.end(), is_low());
        timer.stop();
        size_t n_low = iter_split - vec_rnd_copy.begin();
        std::cout << "stable_partition: " << timer.elapsed() << "ms" << std::endl;

        // check result
        thrust::host_vector<float> vec_rnd_h = vec_rnd_d;
        thrust::host_vector<float> vec_partitioned_h = vec_rnd_copy;
        thrust::host_vector<float>::iterator low_iter_h = vec_partitioned_h.begin();
        thrust::host_vector<float>::iterator high_iter_h = vec_partitioned_h.begin() + n_low;
        for (thrust::host_vector<float>::iterator rnd_iter_h = vec_rnd_h.begin();
             rnd_iter_h != vec_rnd_h.end(); ++rnd_iter_h) {
            if (*rnd_iter_h <= SPLIT) {
                assert(*low_iter_h == *rnd_iter_h);
                ++low_iter_h;
            }
            else {
                assert(*high_iter_h == *rnd_iter_h);
                ++high_iter_h;
            }
        }
    }

    CHECK_CURAND_CALL(curandDestroyGenerator(gen));
    return EXIT_SUCCESS;
}
Output:
C:\rd\projects\cpp\test_cuda\Release>test_cuda.exe
copy_if: 40.2919ms
copy_if: 38.0157ms
copy_if: 38.5036ms
copy_if: 37.6751ms
copy_if: 38.1054ms
stable_partition: 59.5473ms
stable_partition: 61.4016ms
stable_partition: 59.1854ms
stable_partition: 61.3195ms
stable_partition: 59.1205ms
To answer my own question, I finally found thrust::stable_partition, which is more efficient and elegant than all the "copy_if" alternatives. It simply moves all elements that fail to satisfy a predicate to the end of the array and returns the start of the second sequence. Pointer arithmetic gives the size of B, though in fact it's no longer necessary:
thrust::device_vector<float>::iterator iter = thrust::stable_partition(A.begin(), A.end(), pred);
thrust::device_vector<float> B(iter, A.end());
A.erase(iter, A.end());

Simpson's method to integrate real valued functions with CUDA

I'm trying to code integration by Simpson's method in CUDA.
This is the formula for the composite Simpson's rule:
$$\int_a^b f(x)\,dx \approx \frac{h}{3}\Big[f(x_0) + 4f(x_1) + 2f(x_2) + 4f(x_3) + \cdots + 2f(x_{n-2}) + 4f(x_{n-1}) + f(x_n)\Big],$$
where $h = (b-a)/n$ and $x_k = a + k\,h$.
Here's my code
__device__ void initThreadBounds(int *n_start, int *n_end, int n,
                                 int totalBlocks, int blockWidth)
{
    int threadId = blockWidth * blockIdx.x + threadIdx.x;
    int nextThreadId = threadId + 1;
    int threads = blockWidth * totalBlocks;
    *n_start = (threadId * n) / threads;
    *n_end = (nextThreadId * n) / threads;
}

__device__ float reg_func (float x)
{
    return x;
}

typedef float (*p_func) (float);

__device__ p_func integrale_f = reg_func;

__device__ void integralSimpsonMethod(int totalBlocks, int totalThreads,
                                      double a, double b, int n, float p_function(float), float* result)
{
    *result = 0;
    float h = (b - a) / n;
    //*result = p_function(a) + p_function(a + h * n);
    //parallel
    int idx_start;
    int idx_end;
    initThreadBounds(&idx_start, &idx_end, n - 1, totalBlocks, totalThreads);
    //parallel_ends
    for (int i = idx_start; i < idx_end; i += 2) {
        *result += ( p_function(a + h*(i-1)) +
                     4 * p_function(a + h*(i)) +
                     p_function(a + h*(i+1)) ) * h/3;
    }
}

__global__ void integralSimpson(int totalBlocks, int totalThreads, float* result)
{
    float res = 0;
    integralSimpsonMethod(totalBlocks, totalThreads, 0, 10, 1000, integrale_f, &res);
    result[(blockIdx.x*totalThreads + threadIdx.x)] = res;
    //printf ("Simpson method\n");
}

__host__ void inttest()
{
    const int blocksNum = 32;
    const int threadNum = 32;

    float *device_resultf;
    float host_resultf[threadNum*blocksNum] = {0};

    cudaMalloc((void**) &device_resultf, sizeof(float)*threadNum*blocksNum);
    integralSimpson<<<blocksNum, threadNum>>>(blocksNum, threadNum, device_resultf);
    cudaThreadSynchronize();
    cudaMemcpy(host_resultf, device_resultf, sizeof(float)*threadNum*blocksNum,
               cudaMemcpyDeviceToHost);

    float sum = 0;
    for (int i = 0; i != blocksNum*threadNum; ++i) {
        sum += host_resultf[i];
        // printf ("result in %i cell = %f \n", i, host_resultf[i]);
    }
    printf ("sum = %f \n", sum);
    cudaFree(device_resultf);
}

int main(int argc, char* argv[])
{
    inttest();

    int i;
    scanf ("%d", &i);
}
The problem is that it gives the wrong result when n is lower than 100000. For an integral from 0 to 10, the result is ~99, but when n = 100000 or greater it works fine and the result is ~50.
What's wrong, guys?
The basic problem here is that you don't understand your own algorithm.
Your integralSimpsonMethod() function is designed such that each thread samples at least 3 quadrature points per sub-interval in the integration domain. Therefore, if you choose n so that it is less than four times the number of threads in the kernel call, it is inevitable that the sub-intervals will overlap and the resulting integral will be incorrect. You need to make sure that the code checks and scales the thread count or n so that no overlap is produced when the integral is computed.
If you are doing this for anything other than self-edification, then I recommend you look up the composite version of Simpson's rule. This is much better suited to parallel implementation and will be considerably more performant if implemented correctly.
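As a sketch of one non-overlapping split (my illustration, not talonmies' code; it assumes n is even and reuses the question's integrand as a device function f): each Simpson panel is centred on an odd index k and covers [x_{k-1}, x_{k+1}], so handing out the odd k values round-robin in steps of 2 * (total threads) guarantees that no two threads ever touch the same panel.
__device__ float f(float x) { return x; }  // the question's reg_func

__global__ void integralSimpsonDisjoint(float a, float b, int n, float *result)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int nthreads = gridDim.x * blockDim.x;
    float h = (b - a) / n;
    float sum = 0.0f;
    // odd panel centres k = 1, 3, 5, ... are strided across the threads,
    // so the sub-intervals stay disjoint for any n and thread count
    for (int k = 2 * tid + 1; k < n; k += 2 * nthreads) {
        sum += ( f(a + h*(k-1)) + 4.0f*f(a + h*k) + f(a + h*(k+1)) ) * h / 3.0f;
    }
    result[tid] = sum;  // one partial sum per thread; reduce these afterwards
}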
I would propose an approach to Simpson's integration by using CUDA Thrust. You basically need five steps:
Generate the Simpson's quadrature weights;
Generate the function sampling points;
Generate the function values;
Calculate the elementwise product between the quadrature weights and the function values;
Sum the above products.
Step #1 requires creating an array with elements repeated many times, namely, 1 4 2 4 2 4 ... 1 for the Simpson's case. This can be accomplished by borrowing Robert Crovella's approach in cuda thrust library repeat vector multiple times.
Step #2 can be accomplished by using counting_iterators and borrowing talonmies' approach in Purpose and usage of counting_iterators in CUDA Thrust library.
Step #3 is an application of thrust::transform.
Steps #4 and #5 can be accomplished together by thrust::inner_product.
This approach can also be exploited when other quadrature integration rules are of interest.
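For instance (a sketch of that generality, using the names N and Dx from the code below; not part of the original answer): the trapezoidal rule changes only step #1, since its weights are 1 2 2 ... 2 1 and the final scale factor becomes Dx/2 instead of Dx/3.
// Hypothetical variation for the trapezoidal rule: fill every weight
// with 2, then overwrite the two endpoints with 1
thrust::device_vector<float> d_coefficients(N, 2.f);
d_coefficients[0]     = 1.f;
d_coefficients[N - 1] = 1.f;
// ... and later: integral = (Dx / 2.f) * thrust::inner_product(...)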
Here is the code
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <thrust/copy.h>
#include <iostream>
#include <iterator>
#include <cstdio>
#include <cmath>

#define STRIDE 2
#define N 101 // must be odd: Simpson's rule needs an even number of intervals

#define pi_f 3.14159265358979f // Greek pi in single precision

struct sin_functor
{
    __host__ __device__
    float operator()(float x) const
    {
        return sin(2.f*pi_f*x);
    }
};

template <typename Iterator>
class strided_range
{
public:
    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type stride;

        stride_functor(difference_type stride)
            : stride(stride) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type> CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
        : first(first), last(last), stride(stride) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

protected:
    Iterator first;
    Iterator last;
    difference_type stride;
};

int main(void)
{
    // --- Generate the integration coefficients 1 4 2 4 2 ... 2 4 1
    thrust::host_vector<float> h_coefficients(STRIDE);
    h_coefficients[0] = 4.f;
    h_coefficients[1] = 2.f;

    thrust::device_vector<float> d_coefficients(N);

    typedef thrust::device_vector<float>::iterator Iterator;
    // 4's at the odd interior indices 1, 3, ..., N-2
    strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-1, STRIDE);
    // 2's at the even interior indices 2, 4, ..., N-3
    strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);

    thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
    thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);

    d_coefficients[0]   = 1.f;
    d_coefficients[N-1] = 1.f;

    // print the generated d_coefficients
    std::cout << "d_coefficients: ";
    thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " "));
    std::cout << std::endl;

    // --- Generate the sampling points x_k = a + k*Dx, k = 0 .. N-1
    float a = 0.f;
    float b = .5f;
    float Dx = (b-a)/(float)(N-1);
    thrust::device_vector<float> d_x(N);
    thrust::transform(thrust::make_counting_iterator(a/Dx),
                      thrust::make_counting_iterator(b/Dx + 1.f), // bound chosen so exactly N points are generated
                      thrust::make_constant_iterator(Dx),
                      d_x.begin(),
                      thrust::multiplies<float>());

    // --- Calculate the function values
    thrust::device_vector<float> d_y(N);
    thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());

    // --- Calculate the integral
    float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);
    printf("The integral is = %f\n", integral);
    getchar();

    return 0;
}

Elementwise power operation using CUDA Thrust

Is there a way of transforming a Thrust vector with a pow function? In other words, I want to transform each element x of a vector into pow(x,a), where a is a constant.
Please refer to the Transformations section of the Thrust Quick Start Guide for how to write a functor with initialized parameters.
struct saxpy_functor
{
    const float a;

    saxpy_functor(float _a) : a(_a) {}

    __host__ __device__
    float operator()(const float& x, const float& y) const {
        return a * x + y;
    }
};
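A functor like this is then handed to thrust::transform; for the saxpy case the call looks like the following (assuming X and Y are thrust::device_vector<float>, as in the guide):
// Y <- a * X + Y, applied elementwise
thrust::transform(X.begin(), X.end(), Y.begin(), Y.begin(), saxpy_functor(a));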
Here is a full example. As @Eric has mentioned, all that is needed is defining your own power functor and using thrust::transform.
#include <cstdio>
#include <thrust/sequence.h>
#include <thrust/device_vector.h>

class power_functor {
    double a;
public:
    power_functor(double a_) { a = a_; }

    __host__ __device__ double operator()(double x) const
    {
        return pow(x, a);
    }
};

int main() {
    int N = 20;
    thrust::device_vector<double> d_n(N);
    thrust::sequence(d_n.begin(), d_n.end());
    thrust::transform(d_n.begin(), d_n.end(), d_n.begin(), power_functor(2.));
    for (int i = 0; i < N; i++) {
        double val = d_n[i];
        printf("Device vector element number %i equal to %f\n", i, val);
    }
    getchar();
    return 0;
}