Error when getting a vector value in Thrust

In the Thrust examples there is a program called expand.cu. But when I try to get the value back, I get an error like this:
error: expression must be a modifiable lvalue
on the line
SX = d_output;
Where is the mistake? The code is as follows:
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/gather.h>
#include <thrust/scan.h>
#include <thrust/fill.h>
#include <thrust/copy.h>
#include <iterator>
#include <iostream>
template <typename InputIterator1,
typename InputIterator2,
typename OutputIterator>
OutputIterator expand(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
OutputIterator output)
{
typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;
difference_type input_size = thrust::distance(first1, last1);
difference_type output_size = thrust::reduce(first1, last1);
// scan the counts to obtain output offsets for each input element
thrust::device_vector<difference_type> output_offsets(input_size, 0);
thrust::exclusive_scan(first1, last1, output_offsets.begin());
// scatter the nonzero counts into their corresponding output positions
thrust::device_vector<difference_type> output_indices(output_size, 0);
thrust::scatter_if
(thrust::counting_iterator<difference_type>(0),
thrust::counting_iterator<difference_type>(input_size),
output_offsets.begin(),
first1,
output_indices.begin());
// compute max-scan over the output indices, filling in the holes
thrust::inclusive_scan
(output_indices.begin(),
output_indices.end(),
output_indices.begin(),
thrust::maximum<difference_type>());
// gather input values according to index array (output = first2[output_indices])
OutputIterator output_end = output; thrust::advance(output_end, output_size);
thrust::gather(output_indices.begin(),
output_indices.end(),
first2,
output);
// return output + output_size
thrust::advance(output, output_size);
return output;
}
int main(void)
{
int counts[] = {3,5,2,0,1,3,4,2,4};
int values[] = {1,2,3,4,5,6,7,8,9};
size_t input_size = sizeof(counts) / sizeof(int);
size_t output_size = thrust::reduce(counts, counts + input_size);
// copy inputs to device
thrust::device_vector<int> d_counts(counts, counts + input_size);
thrust::device_vector<int> d_values(values, values + input_size);
thrust::device_vector<int> d_output(output_size);
// expand values according to counts
expand(d_counts.begin(), d_counts.end(),
d_values.begin(),
d_output.begin());
std::cout << "d_output: " ;
for(int i=0; i<output_size; i++)
std::cout << d_output[i] << " , ";
//d::cout << d_output[i] << " , ";
thrust::device_vector<int> SX[output_size];
//SX = d_output;
thrust::copy(d_output.begin() , d_output.end(), SX.begin());
return 0;
}

It should be a device vector of length output_size:
thrust::device_vector<int> SX(output_size);
Using [] declares an array of device vectors; an array is not a modifiable lvalue, which is why the assignment fails.
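With that change, the end of main() could look like this (a sketch based on the code above; once SX is a single vector, either the assignment or the explicit copy compiles):
thrust::device_vector<int> SX(output_size);
SX = d_output;  // plain vector assignment now works
// or, equivalently:
// thrust::copy(d_output.begin(), d_output.end(), SX.begin());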

Related

CUDA Thrust copy transformed result only if it satisfies a predicate

I want to perform a transformation on an input thrust::device_vector and only copy the result to the output vector if the result satisfies a predicate. So the number of results could be less than the size of the input device_vector (similar to the output vector of thrust::copy_if). I have not found a way to do this with thrust::transform_if. Currently I can do this with thrust::transform and thrust::remove_if as shown in the example below:
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <iostream>
__host__ __device__ unsigned int hash(unsigned int a) {
a = (a+0x7ed55d16) + (a<<12);
a = (a^0xc761c23c) ^ (a>>19);
a = (a+0x165667b1) + (a<<5);
a = (a+0xd3a2646c) ^ (a<<9);
a = (a+0xfd7046c5) + (a<<3);
a = (a^0xb55a4f09) ^ (a>>16);
return a;
};
struct add_random {
__host__ __device__ add_random() {}
__device__ int operator()(const int n, const int x) const {
thrust::default_random_engine rng(hash(n));
thrust::uniform_int_distribution<int> uniform(0, 11);
return uniform(rng)+x;
}
};
struct is_greater {
__host__ __device__ bool operator()(const int x) {
return x > 6 ;
}
};
int main(void) {
int x[5] = {10, 2, 5, 3, 0};
thrust::device_vector<int> d_x(x, x+5);
thrust::transform(
thrust::counting_iterator<int>(0),
thrust::counting_iterator<int>(5),
d_x.begin(),
d_x.begin(),
add_random());
std::cout << "after adding random number:" << std::endl;
std::ostream_iterator<int> o(std::cout, " ");
thrust::copy(d_x.begin(), d_x.end(), o);
std::cout << std::endl;
thrust::device_vector<int>::iterator new_end(thrust::remove_if(d_x.begin(), d_x.end(), is_greater()));
std::cout << "after removing values greater than 6:" << std::endl;
thrust::copy(d_x.begin(), new_end, o);
std::cout << std::endl;
return 0;
}
Which gives the output:
after adding random number:
18 4 8 7 11
after removing values greater than 6:
4
I would like to avoid copying the results to memory twice, first by thrust::transform and then by thrust::remove_if in the above example. Is it possible to get the above output with a single transformation function? How can I do this? My biggest concern is the computational cost, so any optimized solution, even if it doesn't use the Thrust library would be great.
Welcome to the world of thrust fancy iterators. You can get a quick overview of some fancy iterator types by looking at the thrust quick start guide. In particular, a thrust transform iterator can frequently be used to replace a thrust transform operation that is applied to the input of another thrust algorithm, "fusing" the two algorithms into a single operation.
Here's a worked example applied to your case:
$ cat t1254.cu
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <iostream>
__host__ __device__ unsigned int hash(unsigned int a) {
a = (a+0x7ed55d16) + (a<<12);
a = (a^0xc761c23c) ^ (a>>19);
a = (a+0x165667b1) + (a<<5);
a = (a+0xd3a2646c) ^ (a<<9);
a = (a+0xfd7046c5) + (a<<3);
a = (a^0xb55a4f09) ^ (a>>16);
return a;
};
struct add_random : public thrust::unary_function<thrust::tuple<int, int>, int> {
__host__ __device__ int operator()(thrust::tuple<int, int> t) const {
int n = thrust::get<0>(t);
int x = thrust::get<1>(t);
thrust::default_random_engine rng(hash(n));
thrust::uniform_int_distribution<int> uniform(0, 11);
return uniform(rng)+x;
}
};
struct is_greater {
__host__ __device__ bool operator()(const int x) {
return x < 6 ;
}
};
int main(void) {
int x[5] = {10, 2, 5, 3, 0};
thrust::device_vector<int> d_x(x, x+5);
thrust::device_vector<int> d_r(5);
int rsize = thrust::copy_if(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(0), d_x.begin())), add_random()), thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(5), d_x.end())), add_random()), d_r.begin(), is_greater())- d_r.begin();
std::cout << "after removing values greater than 6:" << std::endl;
thrust::copy_n(d_r.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
return 0;
}
$ nvcc -o t1254 t1254.cu
$ ./t1254
after removing values greater than 6:
4
$
We've replaced your transform operation with a transform iterator applied to the same two inputs. Since you have two inputs to your transform operation, we're using a zip iterator to combine them, and the transform functor has been reworked slightly to accept that tuple as its input.
We also converted your remove_if to a copy_if so that it can take the transform iterator as input. This requires a slight change in the logic of the copy predicate.
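For readability, the single fused copy_if call above can be decomposed into named iterators; this is the same expression, just split up (a sketch, assuming C++11 or later for auto, which recent nvcc versions enable by default):
auto zip_begin = thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(0), d_x.begin()));
auto zip_end = thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(5), d_x.end()));
auto t_begin = thrust::make_transform_iterator(zip_begin, add_random());
auto t_end = thrust::make_transform_iterator(zip_end, add_random());
int rsize = thrust::copy_if(t_begin, t_end, d_r.begin(), is_greater()) - d_r.begin();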

How to use Thrust to implement reduce by key when keys are strings or char array

input:
BC
BD
BC
BC
BD
CD
output:
BC 3
BD 2
CD 1
If I use a char as the key it works, but Thrust does not seem to support a string as a key.
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/reduce.h>
#include <string>
int main(void)
{
std::string data = "aaabbbbbcddeeeeeeeeeff";
size_t N = data.size();
thrust::device_vector<char> input(data.begin(), data.end());
thrust::device_vector<char> output(N);
thrust::device_vector<int> lengths(N);
size_t num_runs =
thrust::reduce_by_key(input.begin(), input.end(),
thrust::constant_iterator<int>(1),
output.begin(),
lengths.begin()
).first - output.begin();
return 0;
}
How to implement it using Thrust?
With apologies to @AngryLettuce, here are two possible approaches:
Method 1:
1. Create a struct to hold your keys. The struct will include one char item for each character in your key.
2. Sort the keys to bring like keys together. It appears that what you want is really just a count of each key type, regardless of where it appears in the sequence. To facilitate this with reduce_by_key, it's necessary to first group like keys together. Otherwise, reduce_by_key will treat like keys that are separated by different intervening keys as distinct key sequences. It's evident from your desired input and output that this is not what you want.
3. Use reduce_by_key on the sorted keys to count like keys.
Step 2 requires (for this method) a functor to sort keys, and step 3 requires a functor to identify the meaning of "equal" keys, which reduce_by_key needs.
Method 2:
1. Create two separate char device_vector(s), one to hold the first letter of each key, the other to hold the second letter of each key. We will then use a zip_iterator throughout the remainder of the code to treat these two vectors as a unified "key" vector.
2. Sort the zipped key vector. In this situation, Thrust knows how to sort a zipped vector of basic types, and requires no separate sorting functor.
3. Perform reduce_by_key on the zipped (and sorted) key vector. This once again requires no separate equality functor; Thrust knows how to determine equality of zipped vectors of basic types.
This second method, in addition to not requiring any functor definitions, probably would also be faster, as zip_iterator tends to improve data access as compared to the AoS (array of structures) present in the first method.
Here's a worked example demonstrating both methods:
$ cat t1004.cu
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/iterator/constant_iterator.h>
#include <iostream>
#include <thrust/iterator/zip_iterator.h>
struct key {
char k1;
char k2;
};
struct sort_functor{
__host__ __device__ bool operator()(key &k1, key &k2){
if (k1.k1 < k2.k1) return true;
if (k1.k1 > k2.k1) return false;
if (k1.k2 < k2.k2) return true;
return false;}
};
struct equal_key{
__host__ __device__ bool operator()(key k1, key k2){
if ((k1.k1 == k2.k1)&&(k1.k2 == k2.k2)) return true;
return false;}
};
int main(){
key data[] = {{'B','C'},{'B','D'},{'B','C'},{'B','C'},{'B','D'},{'C','D'}};
size_t dsize = sizeof(data)/sizeof(key);
//method 1
thrust::device_vector<key> keys(data, data+dsize);
thrust::device_vector<key> keys_out(dsize);
thrust::device_vector<int> lengths(dsize);
thrust::sort(keys.begin(), keys.end(), sort_functor());
int rsize = thrust::reduce_by_key(keys.begin(), keys.end(), thrust::constant_iterator<int>(1), keys_out.begin(), lengths.begin(),equal_key()).first - keys_out.begin();
std::cout << "Method1:" << std::endl;
for (int i = 0; i < rsize; i++){
key temp = keys_out[i];
int len = lengths[i];
std::cout << " " << temp.k1 << temp.k2 << " " << len << std::endl;}
//method 2
//get the key data into 2 separate vectors.
//there are more efficient ways to do this
//but this is not the crux of your question
thrust::device_vector<char> k1;
thrust::device_vector<char> k2;
for (int i = 0; i < dsize; i++){
k1.push_back(data[i].k1);
k2.push_back(data[i].k2);}
thrust::sort(thrust::make_zip_iterator(thrust::make_tuple(k1.begin(), k2.begin())), thrust::make_zip_iterator(thrust::make_tuple(k1.end(), k2.end())));
thrust::device_vector<char> k1r(dsize);
thrust::device_vector<char> k2r(dsize);
rsize = thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(k1.begin(), k2.begin())), thrust::make_zip_iterator(thrust::make_tuple(k1.end(), k2.end())), thrust::constant_iterator<int>(1), thrust::make_zip_iterator(thrust::make_tuple(k1r.begin(), k2r.begin())), lengths.begin()).first - thrust::make_zip_iterator(thrust::make_tuple(k1r.begin(),k2r.begin()));
std::cout << "Method2:" << std::endl;
for (int i = 0; i < rsize; i++){
char c1 = k1r[i];
char c2 = k2r[i];
int len = lengths[i];
std::cout << " " << c1 << c2 << " " << len << std::endl;}
return 0;
}
$ nvcc -o t1004 t1004.cu
$ ./t1004
Method1:
BC 3
BD 2
CD 1
Method2:
BC 3
BD 2
CD 1
$
Here's an improved version of method 2. You should be able to use a string/char array directly, and this version can also be fairly easily modified to accommodate a key length from 2 to 10 characters. This method uses a strided range iterator to pull the individual key characters directly from the data array:
$ cat t1004.cu
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/iterator/constant_iterator.h>
#include <iostream>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
typedef thrust::device_vector<char>::iterator cIterator;
int main(){
//method 2
//get the key data into separate vectors, one per character in key.
#define KEYLEN 2
const char data[] = "BCBDBCBCBDCD";
size_t dsize = sizeof(data)/sizeof(char);
size_t numkeys = dsize/KEYLEN;
thrust::device_vector<char> keys(data, data+dsize);
strided_range<cIterator> *str_k[KEYLEN];
for (int i = 0; i < KEYLEN; i++)
str_k[i] = new strided_range<cIterator>(keys.begin()+i, keys.end(), KEYLEN);
//modify this line also if KEYLEN changes (max 10)
auto my_z = thrust::make_zip_iterator(thrust::make_tuple((*str_k[0]).begin(), (*str_k[1]).begin()));
thrust::sort(my_z, my_z+numkeys);
thrust::device_vector<char> kr[KEYLEN];
for (int i = 0; i < KEYLEN; i++)
kr[i].resize(numkeys);
//modify this line also if KEYLEN changes (max 10)
auto my_zr = thrust::make_zip_iterator(thrust::make_tuple(kr[0].begin(), kr[1].begin()));
thrust::device_vector<int> lengths(numkeys);
size_t rsize = thrust::reduce_by_key(my_z, my_z + numkeys, thrust::constant_iterator<int>(1), my_zr, lengths.begin()).first - my_zr;
std::cout << "Method2:" << std::endl;
for (int i = 0; i < rsize; i++){
std::cout << " ";
for (int j = 0; j < KEYLEN; j++){
char c = kr[j][i];
std::cout << c; }
int len = lengths[i];
std::cout <<" " << len << std::endl;}
return 0;
}
$ nvcc -std=c++11 t1004.cu -o t1004
$ ./t1004
Method2:
BC 3
BD 2
CD 1
$
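As a concrete illustration of the "modify this line also if KEYLEN changes" comments above, a hypothetical 3-character key would set KEYLEN to 3 and grow each zip to three tuple elements (a sketch only, not compiled here):
#define KEYLEN 3
// ...
auto my_z = thrust::make_zip_iterator(thrust::make_tuple((*str_k[0]).begin(), (*str_k[1]).begin(), (*str_k[2]).begin()));
// ...
auto my_zr = thrust::make_zip_iterator(thrust::make_tuple(kr[0].begin(), kr[1].begin(), kr[2].begin()));
The data array would then, of course, have to contain 3-character keys.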

CUDA Thrust: reduce_by_key on only some values in an array, based off values in a "key" array

Let's say I have two device_vector<byte> arrays, d_keys and d_data.
If d_data is, for example, a flattened 2D 3x5 array ( e.g. { 1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 6, 5, 4, 3 } ) and d_keys is a 1D array of size 5 ( e.g. { 1, 0, 0, 1, 1 } ), how can I do a reduction such that I'd end up only adding values on a per-row basis if the corresponding d_keys value is one ( e.g. ending up with a result of { 10, 23, 14 } )?
The sum_rows.cu example allows me to add every value in d_data, but that's not quite right.
Alternatively, I can, on a per-row basis, use a zip_iterator and combine d_keys with one row of d_data at a time, and do a transform_reduce, adding only if the key value is one, but then I'd have to loop through the d_data array.
What I really need is some sort of transform_reduce_by_key functionality that isn't built-in, but surely there must be a way to make it!
Based on the additional comment that instead of 3 rows there are thousands of rows, we can write a transform functor that sums an entire row. Based on the fact that there are thousands of rows, this should keep the machine pretty busy:
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>
#define ROW 20
#define COL 10
__device__ int *vals;
__device__ int *keys;
struct test_functor
{
const int a;
test_functor(int _a) : a(_a) {}
__device__
int operator()(int& x, int& y ) {
int temp = 0;
for (int i = 0; i<a; i++)
temp += vals[i + (y*a)] * keys[i];
return temp;
}
};
int main(){
int *s_vals, *s_keys;
thrust::host_vector<int> h_vals(ROW*COL);
thrust::host_vector<int> h_keys(COL);
thrust::sequence(h_vals.begin(), h_vals.end());
thrust::fill(h_keys.begin(), h_keys.end(), 1);
h_keys[0] = 0;
thrust::device_vector<int> d_vals = h_vals;
thrust::device_vector<int> d_keys = h_keys;
thrust::device_vector<int> d_sums(ROW);
thrust::fill(d_sums.begin(), d_sums.end(), 0);
s_vals = thrust::raw_pointer_cast(&d_vals[0]);
s_keys = thrust::raw_pointer_cast(&d_keys[0]);
cudaMemcpyToSymbol(vals, &s_vals, sizeof(int *));
cudaMemcpyToSymbol(keys, &s_keys, sizeof(int *));
thrust::device_vector<int> d_idx(ROW);
thrust::sequence(d_idx.begin(), d_idx.end());
thrust::transform(d_sums.begin(), d_sums.end(), d_idx.begin(), d_sums.begin(), test_functor(COL));
thrust::host_vector<int> h_sums = d_sums;
std::cout << "Results :" << std::endl;
for (unsigned i = 0; i<ROW; i++)
std::cout<<"h_sums["<<i<<"] = " << h_sums[i] << std::endl;
return 0;
}
This approach has the drawback that in general accesses to the vals array will not be coalesced. However for a few thousand rows the cache may offer significant relief. We can fix this problem by re-ordering the data to be stored in column-major form in the flattened array, and change our indexing method in the loop in the functor to be like this:
for (int i=0; i<a; i++)
temp += vals[(i*ROW)+y]*keys[i];
If preferred, you can pass ROW as an additional parameter to the functor.
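A sketch of that variant, with both COL and ROW passed to the functor and the column-major indexing from above (untested, and it assumes the data has been reordered accordingly):
struct test_functor
{
const int a;    // number of columns (COL)
const int rows; // number of rows, passed in instead of relying on the ROW macro
test_functor(int _a, int _rows) : a(_a), rows(_rows) {}
__device__
int operator()(int& x, int& y) {
int temp = 0;
for (int i = 0; i < a; i++)
temp += vals[(i*rows) + y] * keys[i];  // column-major access
return temp;
}
};
// and the call becomes:
// thrust::transform(d_sums.begin(), d_sums.end(), d_idx.begin(), d_sums.begin(), test_functor(COL, ROW));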
Here is some sample code that does something like what you are after, using the approach I outlined in my comment below your question. In fact we want to use 4-tuples, to pick up your key value. Reproducing the suitably modified comment here:
You could make a zip iterator that zips your 3 rows together plus the key "row" and passes a 4-tuple to a special functor. Your special functor would then do a reduction on the array of 3-tuples (using the key also) and return a result that is a 4-tuple. The thrust dot product example may give you some ideas.
This is one possible approach:
#include <thrust/host_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>
#include <thrust/tuple.h>
#define N 30 // make this evenly divisible by 3 for this example
typedef thrust::tuple<int, int, int, int> tpl4int;
typedef thrust::host_vector<int>::iterator intiter;
typedef thrust::tuple<intiter, intiter, intiter, intiter> tpl4intiter;
typedef thrust::zip_iterator<tpl4intiter> int4zip;
struct r3key_unary_op : public thrust::unary_function<tpl4int, tpl4int>
{
__host__ __device__
tpl4int operator()(const tpl4int& x) const
{
tpl4int result;
thrust::get<0>(result) = x.get<0>()*x.get<3>();
thrust::get<1>(result) = x.get<1>()*x.get<3>();
thrust::get<2>(result) = x.get<2>()*x.get<3>();
thrust::get<3>(result) = 1;
return result;
}
};
struct r3key_binary_op : public thrust::binary_function<tpl4int, tpl4int, tpl4int>
{
__host__ __device__
tpl4int operator()(const tpl4int& x, const tpl4int& y) const
{
tpl4int result;
thrust::get<0>(result) = x.get<0>()*x.get<3>() + y.get<0>()*y.get<3>();
thrust::get<1>(result) = x.get<1>()*x.get<3>() + y.get<1>()*y.get<3>();
thrust::get<2>(result) = x.get<2>()*x.get<3>() + y.get<2>()*y.get<3>();
thrust::get<3>(result) = 1;
return result;
}
};
int main() {
thrust::host_vector<int> A(N); // values, in 3 "rows" flattened
thrust::sequence(A.begin(), A.end());
thrust::host_vector<int> K(N/3); // keys in one row
thrust::fill(K.begin(), K.end(), 1); // set some keys to 1
K[9] = 0; // set some keys to zero
int4zip first = thrust::make_zip_iterator(thrust::make_tuple(A.begin(), A.begin() + N/3, A.begin() + 2*N/3, K.begin()));
int4zip last = thrust::make_zip_iterator(thrust::make_tuple(A.begin() + N/3, A.begin() + 2*N/3, A.end(), K.end()));
r3key_unary_op my_unary_op;
r3key_binary_op my_binary_op;
tpl4int init = my_unary_op(*first);
// init = thrust::make_tuple((int) 0, (int) 0, (int) 0, (int) 0);
tpl4int result = thrust::transform_reduce(first, last, my_unary_op, init, my_binary_op);
std::cout << "row 0 = " << result.get<0>() << std::endl;
std::cout << "row 1 = " << result.get<1>() << std::endl;
std::cout << "row 2 = " << result.get<2>() << std::endl;
return 0;
}
Notes:
This is just using host_vector. Extending it to work with device_vector, or templatizing it to work with something other than int should be straightforward.
For completeness, I am using the unary functor to provide an init value other than zero for the sum reduction of each row. You might want to change the init value to zero (a 4-tuple of zeros).

Simpson's method to integrate real valued functions with CUDA

I'm trying to code integration by Simpson's method in CUDA.
This is the formula for (composite) Simpson's rule:
∫_a^b f(x) dx ≈ (h/3) * [ f(x_0) + 4 f(x_1) + 2 f(x_2) + 4 f(x_3) + ... + 2 f(x_{n-2}) + 4 f(x_{n-1}) + f(x_n) ]
where x_k = a + k*h and h = (b - a)/n.
Here's my code
__device__ void initThreadBounds(int *n_start, int *n_end, int n,
int totalBlocks, int blockWidth)
{
int threadId = blockWidth * blockIdx.x + threadIdx.x;
int nextThreadId = threadId + 1;
int threads = blockWidth * totalBlocks;
*n_start = (threadId * n)/ threads;
*n_end = (nextThreadId * n)/ threads;
}
__device__ float reg_func (float x)
{
return x;
}
typedef float (*p_func) (float);
__device__ p_func integrale_f = reg_func;
__device__ void integralSimpsonMethod(int totalBlocks, int totalThreads,
double a, double b, int n, float p_function(float), float* result)
{
*result = 0;
float h = (b - a)/n;
//*result = p_function(a)+p_function(a + h * n);
//parallel
int idx_start;
int idx_end;
initThreadBounds(&idx_start, &idx_end, n-1, totalBlocks, totalThreads);
//parallel_ends
for (int i = idx_start; i < idx_end; i+=2) {
*result += ( p_function(a + h*(i-1)) +
4 * p_function(a + h*(i)) +
p_function(a + h*(i+1)) ) * h/3;
}
}
__global__ void integralSimpson(int totalBlocks, int totalThreads, float* result)
{
float res = 0;
integralSimpsonMethod(totalBlocks, totalThreads, 0, 10, 1000, integrale_f, &res);
result[(blockIdx.x*totalThreads + threadIdx.x)] = res;
//printf ("Simpson method\n");
}
__host__ void inttest()
{
const int blocksNum = 32;
const int threadNum = 32;
float *device_resultf;
float host_resultf[threadNum*blocksNum]={0};
cudaMalloc((void**) &device_resultf, sizeof(float)*threadNum*blocksNum);
integralSimpson<<<blocksNum, threadNum>>>(blocksNum, threadNum, device_resultf);
cudaThreadSynchronize();
cudaMemcpy(host_resultf, device_resultf, sizeof(float) *threadNum*blocksNum,
cudaMemcpyDeviceToHost);
float sum = 0;
for (int i = 0; i != blocksNum*threadNum; ++i) {
sum += host_resultf[i];
// printf ("result in %i cell = %f \n", i, host_resultf[i]);
}
printf ("sum = %f \n", sum);
cudaFree(device_resultf);
}
int main(int argc, char* argv[])
{
inttest();
int i;
scanf ("%d",&i);
}
The problem is that it gives the wrong result when n is lower than 100000. For an integral from 0 to 10, the result is ~99, but when n is 100000 or larger it works fine and the result is ~50.
What's wrong, guys?
The basic problem here is that you don't understand your own algorithm.
Your integralSimpsonMethod() function is designed such that each thread is sampling at least 3 quadrature points per sub-interval in the integral domain. Therefore, if you choose n so that it is less than four times the number of threads in the kernel call, it is inevitable that each sub interval will overlap and the resulting integral will be incorrect. You need to make sure that the code checks and scales the thread count or n so that they don't produce overlap when the integral is computed.
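A rough sketch of the kind of host-side check described above, using the launch configuration from the question (hypothetical variable names; the factor of four follows the reasoning in this answer and is not otherwise verified):
int totalThreads = blocksNum * threadNum;  // 32 * 32 = 1024 threads in the question's launch
if (n < 4 * totalThreads) {
// not enough quadrature points for each thread to own a non-overlapping sub-interval:
// either launch fewer threads, or raise n before calling the kernel, e.g.
n = 4 * totalThreads;
}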
If you are doing this for anything other than self-edification, then I recommend you look up the composite version of Simpson's rule. This is much better suited to parallel implementation and will be considerably more performant if implemented correctly.
I would propose an approach to Simpson's integration by using CUDA Thrust. You basically need five steps:
1. Generate the Simpson's quadrature weights;
2. Generate the function sampling points;
3. Generate the function values;
4. Calculate the elementwise product between the quadrature weights and the function values;
5. Sum the above products.
Step #1 requires creating an array with elements repeated many times, namely, 1 4 2 4 2 4 ... 1 for the Simpson's case. This can be accomplished by borrowing Robert Crovella's approach in "cuda thrust library repeat vector multiple times".
Step #2 can be accomplished by using counting_iterators and borrowing talonmies' approach in "Purpose and usage of counting_iterators in CUDA Thrust library".
Step #3 is an application of thrust::transform.
Steps #4 and #5 can be accomplished together by thrust::inner_product.
This approach can be exploited also for use when other quadrature integration rules are of interest.
Here is the code
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <thrust/copy.h>
#include <iterator> // for std::ostream_iterator
#include <ostream>
#define STRIDE 2
#define N 100
#define pi_f 3.14159265358979f // Greek pi in single precision
struct sin_functor
{
__host__ __device__
float operator()(float x) const
{
return sin(2.f*pi_f*x);
}
};
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
int main(void)
{
// --- Generate the integration coefficients
thrust::host_vector<float> h_coefficients(STRIDE);
h_coefficients[0] = 4.f;
h_coefficients[1] = 2.f;
thrust::device_vector<float> d_coefficients(N);
typedef thrust::device_vector<float>::iterator Iterator;
strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-2, STRIDE);
strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);
thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);
d_coefficients[0] = 1.f;
d_coefficients[N-1] = 1.f;
// print the generated d_coefficients
std::cout << "d_coefficients: ";
thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " ")); std::cout << std::endl;
// --- Generate sampling points
float a = 0.f;
float b = .5f;
float Dx = (b-a)/(float)(N-1);
thrust::device_vector<float> d_x(N);
thrust::transform(thrust::make_counting_iterator(a/Dx),
thrust::make_counting_iterator((b+1.f)/Dx),
thrust::make_constant_iterator(Dx),
d_x.begin(),
thrust::multiplies<float>());
// --- Calculate function values
thrust::device_vector<float> d_y(N);
thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());
// --- Calculate integral
float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);
printf("The integral is = %f\n", integral);
getchar();
return 0;
}
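For reference, the exact value of ∫ sin(2*pi*x) dx from 0 to 0.5 is 1/pi ≈ 0.3183, which is what the printed result should approach, assuming the sampling points cover [a, b] as intended.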

CUDA texture memory does not inherit the right values

I'm trying to bind a 2D array to a texture and to interpolate between the data. My problem is that when I bind my array to the texture, the values I access are total nonsense. Even when I try to access the first value (tex2D(tex, 0.0f, 0.0f)) it doesn't make sense. So I guess I'm binding it wrong or my memcpy is wrong. Any ideas where my mistake is?
Here is the code:
#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "HelloWorld.h"
#include "linearInterpolation_kernel.cu"
#include "linearInterpolation_kernel2.cu"
#include "linearInterpolation_kernel3.cu"
using namespace std;
using std::cout;
const int blocksize = 16;
__global__
void hello(char *a, int *b) {
a[threadIdx.x] += b[threadIdx.x];
}
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
if( cudaSuccess != err) {
printf("%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
}
}
// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
inline void __getLastCudaError( const char *errorMessage, const char *file, const int line )
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
printf("%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
}
}
int main()
{
int N = 200;
float *A;
A = (float *) malloc(N*sizeof(float));
float *B;
B = (float *) malloc(N*sizeof(float));
float *result;
result = (float *) malloc(N*sizeof(float));
float angle = 0.5f;
for(int i = 0; i < N; i++){
A[i] = (float)rand();
B[i] = (float)rand();
}
cout << A[3] << endl;
cout << B[3] << endl;
ipLinearTexture(A,B,result,angle,N);
float result2;
result2 = (angle)*A[3] + (1-angle)*B[3];
printf(" A %f B %f Result %f\n", A[3], B[3], result[3]);
cout << result2 << endl;
return 1;
}
void ipLinearTexture(float *A, float* B, float* result, float angle, int N)
{
float cuTime;
const int N2 = N;
float *dev_result;
float **AB;
AB = (float **) malloc( N * sizeof(float *));
if(AB)
{
for(int i = 0; i < N; i++)
{
AB[i] = (float *) calloc( 2 , sizeof(float *));
}
}
for (int i = 0; i < N; i++)
{
AB[i][0] = A[i];
AB[i][1] = B[i];
}
cudaMalloc(&dev_result, N * sizeof(float));
unsigned int size = N * 2 * sizeof(float);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray* cu_array;
checkCudaErrors(cudaMallocArray( &cu_array, &channelDesc,N,2 ));
checkCudaErrors(cudaMemcpyToArray( cu_array, 0, 0, AB, size, cudaMemcpyHostToDevice));
tex.addressMode[0] = cudaAddressModeClamp;
tex.addressMode[1] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModeLinear;
tex.normalized = false; // access with normalized texture coordinates
checkCudaErrors(cudaBindTextureToArray( tex, cu_array, channelDesc));
dim3 dimBlock(10, 1, 1);
dim3 dimGrid((int)ceil((double)N*2/dimBlock.x), 1, 1);
transformKernel3<<< dimGrid, dimBlock, 0 >>>( dev_result, N, 2, angle);
checkCudaErrors(cudaUnbindTexture(tex));
cudaMemcpy(result, dev_result, N * sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);
result[0] = (float)cuTime;
cout << "==================================================" << endl;
for (int i = 0 ; i < N ;i++)
{
cout << result[i] << endl;
}
cout << "==================================================" << endl;
cudaFree(dev_result);
cudaFreeArray(cu_array);
}
Here is the code inside the kernel:
#ifndef _SIMPLETEXTURE_KERNEL3_H_
#define _SIMPLETEXTURE_KERNEL3_H_
// declare texture reference for 2D float texture
texture<float, 1> tex;
////////////////////////////////////////////////////////////////////////////////
//! Transform an image using texture lookups
//! #param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
transformKernel3( float* g_odata, int width, int height, float theta)
{
unsigned int id = blockIdx.x*blockDim.x + threadIdx.x;
if (id < width*height)
{
g_odata[id] = tex1D(tex, xid * 2 + 0.5f);
}
}
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
As with the corresponding concept in OpenGL, you can think of a 2D texture as a rectangular field; the center point of each small rectangle holds your array data. So tex2D(tex, 0.5f/width, 0.5f/height) will return exactly the first value of your array data (width and height are the width and height of the 2D array data).
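A small sketch of that addressing convention (an illustrative device-code fragment, not taken from the code above; it assumes a 2D texture reference texture<float, 2> tex bound to a width x height cudaArray of floats):
// with unnormalized coordinates (tex.normalized = false), texel centers sit at integer + 0.5:
float v00 = tex2D(tex, 0.5f, 0.5f);  // element [0][0]
float v01 = tex2D(tex, 1.5f, 0.5f);  // element [0][1]
// with normalized coordinates (tex.normalized = true), they sit at (i + 0.5)/width:
float n00 = tex2D(tex, 0.5f/width, 0.5f/height);  // element [0][0]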