Thrust: selectively move elements to another vector - stl

I'm trying to figure out the best way to do the following using Thrust: vector A has a million floats, they have some particular order. I want to move to vector B every element x in A for which x>7.0 such that the order of elements is maintain in both vectors A and B. Importantly, only a tiny fraction of elements need be moved. Efficiency is more important for my code than elegance.
My idea was to use thrust::copy_if from A to B and then thrust::remove_if on A. But I don't know the exact number of elements to be copy, and since apparently the memory for B must be allocated in advance, another counting operation is necessary. An inelegant way to skip the counting operation is to pre-allocate "enough" memory for vector B.
Using thrust::remove_copy_if has much the same problems: you need to allocate memory for B in advance, and also it doesn't actually remove anything from A so another thrust::remove_if is required anyway.
Another idea I had was to use thrust::stable_sort with some custom-made comparison functor, to push all elements I want out to the end of A, and then somehow figure out how many there are and thrust::copy them to B. This also looks pretty inelegant...

You're on the right track with thrust::copy_if. Just allocate two more buffers of the same size as the first one. Then copy_if > 7.0f to the first one and copy_if <= 7.0f to the second one. Allocating buffers of the same size as the original buffer is fine as long as you know there's room, and 1 million floats only takes up 4MB.
Edit:
I did a performance comparison of the copy_if and stable_partition approaches. On my card, a GTX660, stable_partition took around 150% as long as copy_if for "split" values of 0.1f, 0.5f and 0.9f. I added tests to ensure that both methods are stable (maintain the order of the values).
#include <cuda.h>
#include <curand.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/partition.h>
#include <iostream>
#include <cassert>
#define CHECK_CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
return EXIT_FAILURE;}} while(0)
#define CHECK_CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
return EXIT_FAILURE;}} while(0)
#define SPLIT 0.1f
struct is_low
{
__host__ __device__ bool operator()(const float x)
{
return x <= SPLIT;
}
};
struct is_high
{
__host__ __device__ bool operator()(const float x)
{
return x > SPLIT;
}
};
class EventTimer {
public:
EventTimer() : mStarted(false), mStopped(false) {
cudaEventCreate(&mStart);
cudaEventCreate(&mStop);
}
~EventTimer() {
cudaEventDestroy(mStart);
cudaEventDestroy(mStop);
}
void start(cudaStream_t s = 0) {
cudaEventRecord(mStart, s);
mStarted = true;
mStopped = false;
}
void stop(cudaStream_t s = 0) {
assert(mStarted);
cudaEventRecord(mStop, s);
mStarted = false;
mStopped = true;
}
float elapsed() {
assert(mStopped);
if (!mStopped) return 0;
cudaEventSynchronize(mStop);
float elapsed = 0;
cudaEventElapsedTime(&elapsed, mStart, mStop);
return elapsed;
}
private:
bool mStarted, mStopped;
cudaEvent_t mStart, mStop;
};
int main(int argc, char *argv[])
{
const size_t n = 1024 * 1024 * 50;
// Create prng
curandGenerator_t gen;
CHECK_CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
// Set seed
CHECK_CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
// Generate n floats on device
thrust::device_vector<float> vec_rnd_d(n);
float* ptr_rnd_d = thrust::raw_pointer_cast(vec_rnd_d.data());
CHECK_CURAND_CALL(curandGenerateUniform(gen, ptr_rnd_d, n));
thrust::device_vector<float> vec_low_d(n);
thrust::device_vector<float> vec_high_d(n);
for (int i = 0; i < 5; ++i) {
EventTimer timer;
timer.start();
thrust::device_vector<float>::iterator iter_end;
iter_end = thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_low_d.begin(), is_low());
thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_high_d.begin(), is_high());
timer.stop();
std::cout << "copy_if: " << timer.elapsed() << "ms" << std::endl;
// check result
thrust::host_vector<float> vec_rnd_h = vec_rnd_d;
thrust::host_vector<float> vec_low_h = vec_low_d;
thrust::host_vector<float> vec_high_h = vec_high_d;
thrust::host_vector<float>::iterator low_iter_h = vec_low_h.begin();
thrust::host_vector<float>::iterator high_iter_h = vec_high_h.begin();
for (thrust::host_vector<float>::iterator rnd_iter_h = vec_rnd_h.begin();
rnd_iter_h != vec_rnd_h.end(); ++rnd_iter_h) {
if (*rnd_iter_h <= SPLIT) {
assert(*low_iter_h == *rnd_iter_h);
++low_iter_h;
}
else {
assert(*high_iter_h == *rnd_iter_h);
++high_iter_h;
}
}
}
for (int i = 0; i < 5; ++i) {
thrust::device_vector<float> vec_rnd_copy = vec_rnd_d;
EventTimer timer;
timer.start();
thrust::device_vector<float>::iterator iter_split =
thrust::stable_partition(vec_rnd_copy.begin(), vec_rnd_copy.end(), is_low());
timer.stop();
size_t n_low = iter_split - vec_rnd_copy.begin();
std::cout << "stable_partition: " << timer.elapsed() << "ms" << std::endl;
// check result
thrust::host_vector<float> vec_rnd_h = vec_rnd_d;
thrust::host_vector<float> vec_partitioned_h = vec_rnd_copy;
thrust::host_vector<float>::iterator low_iter_h = vec_partitioned_h.begin();
thrust::host_vector<float>::iterator high_iter_h = vec_partitioned_h.begin() + n_low;
for (thrust::host_vector<float>::iterator rnd_iter_h = vec_rnd_h.begin();
rnd_iter_h != vec_rnd_h.end(); ++rnd_iter_h) {
if (*rnd_iter_h <= SPLIT) {
assert(*low_iter_h == *rnd_iter_h);
++low_iter_h;
}
else {
assert(*high_iter_h == *rnd_iter_h);
++high_iter_h;
}
}
}
CHECK_CURAND_CALL(curandDestroyGenerator(gen));
return EXIT_SUCCESS;
}
Output:
C:\rd\projects\cpp\test_cuda\Release>test_cuda.exe
copy_if: 40.2919ms
copy_if: 38.0157ms
copy_if: 38.5036ms
copy_if: 37.6751ms
copy_if: 38.1054ms
stable_partition: 59.5473ms
stable_partition: 61.4016ms
stable_partition: 59.1854ms
stable_partition: 61.3195ms
stable_partition: 59.1205ms

To answer my own question, I finally found thrust::stable_partition, which is more efficient and elegant than all "copy_if"-alternatives. It just moves all elements that fail to satisfy a predicate to the end of the array and returns the start of the second sequence. Pointer arithmetic gives the size of B, but in fact it's not necessary anymore:
thrust::device_vector<float>::iterator iter = thrust::stable_partition(A.begin(), A.end(), pred)
thrust::device_vector<float> B(iter, A.end())
A.erase(iter, A.end());

Related

Cublas - Column/Row wise operations

I am looking for a way to perform operations over columns .
I have MxN matrix, i want to activate cublas function (for example nrm2) over each column.
The result i expect to get is : M x 1
How can I do that?
CUBLAS has no batched Level 1 routines, so there is no direct way to compute the column or row norms in a single call. You can do it by calling nrm2 many times in a loop over all the rows or columns of the matrix, for example:
#include <cublas_v2.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <iostream>
struct prg
{
float a, b;
__host__ __device__
prg(float _a=0.f, float _b=1.f) : a(_a), b(_b) {};
__host__ __device__
float operator()(const unsigned int n) const
{
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> dist(a, b);
rng.discard(n);
return dist(rng);
}
};
int main(void)
{
const int M = 1024, N = M;
const int num = N * M;
thrust::device_vector<float> matrix(num);
thrust::device_vector<float> vector(N, -1.0f);
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
thrust::transform(index_sequence_begin,
index_sequence_begin + num,
matrix.begin(),
prg(1.f,2.f));
float* m_d = thrust::raw_pointer_cast(matrix.data());
float* v_d = thrust::raw_pointer_cast(vector.data());
cudaStream_t stream;
cudaStreamCreate(&stream);
cublasHandle_t handle;
cublasCreate(&handle);
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
cublasSetStream(handle, stream);
for(int col=0; col < N; col++) {
cublasSnrm2(handle, M, m_d + col*M, 1, v_d + col);
}
cudaDeviceSynchronize();
for(auto x : vector) {
float normval = x;
std::cout << normval << std::endl;
}
return 0;
}
Unless you have very large rows or columns, there is little scope to exploit streams to run simultaneous kernels and reduce the overall runtime because each nrm2 call will be too short. So there is a lot of latency in running lots of individual kernels, which will negatively effect performance.
A much better alternative would be to write your own kernel to do this.

Does bool variable in kernel need to be synchronized

I have a kernel consisting of a for loop that searches through an array for a specific int value. I'm using a grid block of 256 threads to do this. However, when one thread finds the value, I want to let the other threads know to exit. Currently I'm using a boolean flag, but I'm not sure if its working properly. My concern is synchronization.
__device__ bool found;
__global__
void search()
{
for(int i = threadIdx.x; i<1000000; i += stride)
{
if(found == true)
{
break;
}
else if(arr[i] = x)
{
found = true;
break;
}
}
}
int main()
{
bool flag = false;
cudaMemcpyToSymbol(found, &flag, sizeof(bool), 0,cudaMemcpyHostToDevice);
}
As pointed out in comments, you can probably achieve what you want by declaring the global device flag to be volatile, which will inhibit caching, and by using a memory fence function. There really isn't a global synchronization primitive which would do want you want other than the new grid synchronization mechanism introduced in CUDA 9 and new hardware, but that probably isn't necessary in this case. Turning your pseudocode into a toy example:
#include <iostream>
#include <thrust/device_vector.h>
__device__ volatile bool found;
__device__ volatile size_t idx;
template<bool docheck>
__global__
void search(const int* arr, int x, size_t N)
{
size_t i = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for(; (i<N) && (!found); i += stride)
{
if(arr[i] == x)
{
if (docheck) found = true;
idx = i;
__threadfence();
break;
}
}
}
int main()
{
const size_t N = 1 << 24;
const size_t findidx = 280270;
const int findval = 0xdeadbeef;
thrust::device_vector<int> data(N,1);
data[findidx] = findval;
bool flag = false;
size_t zero = 0;
{
cudaMemcpyToSymbol(found, &flag, sizeof(bool));
cudaMemcpyToSymbol(idx, &zero, sizeof(size_t));
int blocks, threads;
cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<false>);
search<false><<<blocks, threads>>>(thrust::raw_pointer_cast(data.data()), findval, N);
cudaDeviceSynchronize();
size_t result = 0;
cudaMemcpyFromSymbol(&result, idx, sizeof(size_t));
std::cout << "result = " << result << std::endl;
}
{
cudaMemcpyToSymbol(found, &flag, sizeof(bool));
cudaMemcpyToSymbol(idx, &zero, sizeof(size_t));
int blocks, threads;
cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<true>);
search<true><<<blocks, threads>>>(thrust::raw_pointer_cast(data.data()), findval, N);
cudaDeviceSynchronize();
size_t result = 0;
cudaMemcpyFromSymbol(&result, idx, sizeof(size_t));
std::cout << "result = " << result << std::endl;
}
return 0;
}
and profiling it gives the following:
$ nvcc -arch=sm_52 -o notify notify.cu
$ nvprof ./notify
==3916== NVPROF is profiling process 3916, command: ./notify
result = 280270
result = 280270
==3916== Profiling application: ./notify
==3916== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 78.00% 1.6773ms 1 1.6773ms 1.6773ms 1.6773ms void search<bool=0>(int const *, int, unsigned long)
19.93% 428.63us 1 428.63us 428.63us 428.63us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
1.82% 39.199us 1 39.199us 39.199us 39.199us void search<bool=1>(int const *, int, unsigned long)
As you can see, the version which sets the found flag completes the search in 40 microseconds, whereas the version which does not set the flag takes 1.7 milliseconds. Given that the kernel is run with the maximum number of resident blocks in both cases, we can conclude that the early exit mechanism worked correctly and running blocks detected that the required value had been found.

Can't get matrix*vector multiplication to go faster in CUDA than in CPU

#include <iostream>
#include <assert.h>
#include <sys/time.h>
#define BLOCK_SIZE 32 // CUDA block size
__device__ inline int getValFromMatrix(int* matrix, int row, int col,int matSize) {
if (row<matSize && col<matSize) {return matrix[row*matSize + col];}
return 0;
}
__device__ inline int getValFromVector(int* vector, int row, int matSize) {
if (row<matSize) {return vector[row];}
return 0;
}
__global__ void matVecMultCUDAKernel(int* aOnGPU, int* bOnGPU, int* cOnGPU, int matSize) {
__shared__ int aRowShared[BLOCK_SIZE];
__shared__ int bShared[BLOCK_SIZE];
__shared__ int myRow;
__shared__ double rowSum;
int myIndexInBlock = threadIdx.x;
myRow = blockIdx.x;
rowSum = 0;
for (int m = 0; m < (matSize / BLOCK_SIZE + 1);m++) {
aRowShared[myIndexInBlock] = getValFromMatrix(aOnGPU,myRow,m*BLOCK_SIZE+myIndexInBlock,matSize);
bShared[myIndexInBlock] = getValFromVector(bOnGPU,m*BLOCK_SIZE+myIndexInBlock,matSize);
__syncthreads(); // Sync threads to make sure all fields have been written by all threads in the block to cShared and xShared
if (myIndexInBlock==0) {
for (int k=0;k<BLOCK_SIZE;k++) {
rowSum += aRowShared[k] * bShared[k];
}
}
}
if (myIndexInBlock==0) {cOnGPU[myRow] = rowSum;}
}
static inline void cudaCheckReturn(cudaError_t result) {
if (result != cudaSuccess) {
std::cerr <<"CUDA Runtime Error: " << cudaGetErrorString(result) << std::endl;
assert(result == cudaSuccess);
}
}
static void matVecMultCUDA(int* aOnGPU,int* bOnGPU, int* cOnGPU, int* c, int sizeOfc, int matSize) {
matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize); // Launch 1 block per row
cudaCheckReturn(cudaMemcpy(c,cOnGPU,sizeOfc,cudaMemcpyDeviceToHost));
}
static void matVecMult(int** A,int* b, int* c, int matSize) {
// Sequential implementation:
for (int i=0;i<matSize;i++) {
c[i]=0;
for (int j=0;j<matSize;j++) {
c[i]+=(A[i][j] * b[j]);
}
}
}
int main() {
int matSize = 1000;
int** A,* b,* c;
int* aOnGPU,* bOnGPU,* cOnGPU;
A = new int*[matSize];
for (int i = 0; i < matSize;i++) {A[i] = new int[matSize]();}
b = new int[matSize]();
c = new int[matSize]();
int aSizeOnGPU = matSize * matSize * sizeof(int), bcSizeOnGPU = matSize * sizeof(int);
cudaCheckReturn(cudaMalloc(&aOnGPU,aSizeOnGPU)); // cudaMallocPitch?
cudaCheckReturn(cudaMalloc(&bOnGPU,bcSizeOnGPU));
cudaCheckReturn(cudaMalloc(&cOnGPU,bcSizeOnGPU));
srand(time(NULL));
for (int i=0;i<matSize;i++) {
b[i] = rand()%100;
for (int j=0;j<matSize;j++) {
A[i][j] = rand()%100;
}
}
for (int i=0;i<matSize;i++) {cudaCheckReturn(cudaMemcpy((aOnGPU+i*matSize),A[i],bcSizeOnGPU,cudaMemcpyHostToDevice));}
cudaCheckReturn(cudaMemcpy(bOnGPU,b,bcSizeOnGPU,cudaMemcpyHostToDevice));
int iters=1;
timeval start,end;
// Sequential run:
gettimeofday(&start,NULL);
for (int i=0;i<iters;i++) {matVecMult(A,b,c,matSize);}
gettimeofday(&end,NULL);
std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;
// CUDA run:
gettimeofday(&start,NULL);
for (int i=0;i<iters;i++) {matVecMultCUDA(aOnGPU,bOnGPU,cOnGPU,c,bcSizeOnGPU,matSize);}
gettimeofday(&end,NULL);
std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;
cudaCheckReturn(cudaFree(aOnGPU));
cudaCheckReturn(cudaFree(bOnGPU));
cudaCheckReturn(cudaFree(cOnGPU));
for (int i = 0; i < matSize; ++i) {
delete[] A[i];
}
delete[] A;
delete[] b;
delete[] c;
}
Gives:
267171
580253
I've followed the guide on http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory, on how to do a matrix multiplication. I used shared memory for both the matrix (A) and the vector (B), but no matter what matrix size (100*100-20000*20000) or block size (32-1024) i choose, the sequential implementation always outperforms the CUDA implementation in terms of speed, it is about twice as fast.
Since I'm using matrix*vector multiplication, the shared arrays and blocks are handled a bit different; I'm using one block per row of the matrix instead of a 2D block over a part of the matrix.
Is my implementation wrong, or is simply CUDA not faster than the CPU?
First item: You perform checks on boundaries in the cuda implementation where you don't on CPU. Branching are really expensive on a GPU.
Second : You count the cudamemcpy in the cuda performance. It's very uncommon to perform only one multiplication before having to get the result back to cpu.
Usually (on CG for example), you perform several hundreds of multiplication on GPU before having to copy back.
Third: Dont try to implement that (except for educational purposes) and use vendor libraries (like CUBLAS, which ships with every CUDA release), which are extremely hard to outperform.

Simpson's method to integrate real valued functions with CUDA

I'm trying to code integration by Simpson's method in CUDA.
This is the formula for Simpson's rule
where x_k = a + k*h.
Here's my code
__device__ void initThreadBounds(int *n_start, int *n_end, int n,
int totalBlocks, int blockWidth)
{
int threadId = blockWidth * blockIdx.x + threadIdx.x;
int nextThreadId = threadId + 1;
int threads = blockWidth * totalBlocks;
*n_start = (threadId * n)/ threads;
*n_end = (nextThreadId * n)/ threads;
}
__device__ float reg_func (float x)
{
return x;
}
typedef float (*p_func) (float);
__device__ p_func integrale_f = reg_func;
__device__ void integralSimpsonMethod(int totalBlocks, int totalThreads,
double a, double b, int n, float p_function(float), float* result)
{
*result = 0;
float h = (b - a)/n;
//*result = p_function(a)+p_function(a + h * n);
//parallel
int idx_start;
int idx_end;
initThreadBounds(&idx_start, &idx_end, n-1, totalBlocks, totalThreads);
//parallel_ends
for (int i = idx_start; i < idx_end; i+=2) {
*result += ( p_function(a + h*(i-1)) +
4 * p_function(a + h*(i)) +
p_function(a + h*(i+1)) ) * h/3;
}
}
__global__ void integralSimpson(int totalBlocks, int totalThreads, float* result)
{
float res = 0;
integralSimpsonMethod(totalBlocks, totalThreads, 0, 10, 1000, integrale_f, &res);
result[(blockIdx.x*totalThreads + threadIdx.x)] = res;
//printf ("Simpson method\n");
}
__host__ void inttest()
{
const int blocksNum = 32;
const int threadNum = 32;
float *device_resultf;
float host_resultf[threadNum*blocksNum]={0};
cudaMalloc((void**) &device_resultf, sizeof(float)*threadNum*blocksNum);
integralSimpson<<<blocksNum, threadNum>>>(blocksNum, threadNum, device_resultf);
cudaThreadSynchronize();
cudaMemcpy(host_resultf, device_resultf, sizeof(float) *threadNum*blocksNum,
cudaMemcpyDeviceToHost);
float sum = 0;
for (int i = 0; i != blocksNum*threadNum; ++i) {
sum += host_resultf[i];
// printf ("result in %i cell = %f \n", i, host_resultf[i]);
}
printf ("sum = %f \n", sum);
cudaFree(device_resultf);
}
int main(int argc, char* argv[])
{
inttest();
int i;
scanf ("%d",&i);
}
The problem is: it works wrong when n is lower than 100000. For an integral from 0 to 10, the result is ~99, but when n = 100000 or larger it works fine and the result is ~50.
What's wrong, guys?
The basic problem here is that you don't understand your own algorithm.
Your integralSimpsonMethod() function is designed such that each thread is sampling at least 3 quadrature points per sub-interval in the integral domain. Therefore, if you choose n so that it is less than four times the number of threads in the kernel call, it is inevitable that each sub interval will overlap and the resulting integral will be incorrect. You need to make sure that the code checks and scales the thread count or n so that they don't produce overlap when the integral is computed.
If you are doing this for anything other than self-edification, then I recommend you look up the composite version of Simpson's rule. This is much better suited to parallel implementation and will be considerably more performant if implemented correctly.
I would propose an approach to Simpson's integration by using CUDA Thrust. You basically need five steps:
Generate the Simpson's quadrature weights;
Generate the function sampling points;
Generate the function values;
Calculate the elementwise product between the quadrature weights and the function values;
Sum the above products.
Step #1 requires creating an array with elements repeated many times, namely, 1 4 2 4 2 4 ... 1 for the Simpson's case. This can be accomplished by borrowing Robert Crovella's approach in cuda thrust library repeat vector multiple times.
Step #2 can be accomplished by using couting_iterators and borrowing talonmies approach in Purpose and usage of counting_iterators in CUDA Thrust library.
Step #3 is an application of thrust::transform.
Steps #4 and #5 can be accomplished together by thrust::inner_product.
This approach can be exploited also for use when other quadrature integration rules are of interest.
Here is the code
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <thrust/copy.h>
#include <ostream>
#define STRIDE 2
#define N 100
#define pi_f 3.14159265358979f // Greek pi in single precision
struct sin_functor
{
__host__ __device__
float operator()(float x) const
{
return sin(2.f*pi_f*x);
}
};
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
int main(void)
{
// --- Generate the integration coefficients
thrust::host_vector<float> h_coefficients(STRIDE);
h_coefficients[0] = 4.f;
h_coefficients[1] = 2.f;
thrust::device_vector<float> d_coefficients(N);
typedef thrust::device_vector<float>::iterator Iterator;
strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-2, STRIDE);
strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);
thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);
d_coefficients[0] = 1.f;
d_coefficients[N-1] = 1.f;
// print the generated d_coefficients
std::cout << "d_coefficients: ";
thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " ")); std::cout << std::endl;
// --- Generate sampling points
float a = 0.f;
float b = .5f;
float Dx = (b-a)/(float)(N-1);
thrust::device_vector<float> d_x(N);
thrust::transform(thrust::make_counting_iterator(a/Dx),
thrust::make_counting_iterator((b+1.f)/Dx),
thrust::make_constant_iterator(Dx),
d_x.begin(),
thrust::multiplies<float>());
// --- Calculate function values
thrust::device_vector<float> d_y(N);
thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());
// --- Calculate integral
float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);
printf("The integral is = %f\n", integral);
getchar();
return 0;
}

thrust::device_reference can't be used with printf?

I am using the thrust partition function to partition array into even and odd numbers. However, when i try to display the device vector, it shows random values. Please let me know where is the error. I think i have done everything correct.
#include<stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include<thrust/partition.h>
struct is_even
{
//const int toCom;
//is_even(int val):toCom(val){}
__device__
bool operator()(const int &x)
{
return x%2;
}
};
void main(){
thrust::host_vector<int> H(6);
for(int i =0 ; i<H.size();i++){
H[i] = i+1;
}
thrust::device_vector<int> D = H;
thrust::partition(D.begin(),D.end(),is_even());
for(int i =0 ;i< D.size();i++){
printf("%d,",D[i]);
}
getchar();
}
You can't send a thrust::device_reference (i.e., the result of D[i]) through printf's ellipsis because it is not a POD type. See the documentation. Your code will produce a compiler warning to this effect.
Cast to int first:
for(int i = 0; i < D.size(); ++i)
{
printf("%d,", (int) D[i]);
}