Let us say that I have a single array which stores time stamps for multiple events. For example, T1_e1, T2_e1,....,T1_e2, T2_e2, T3_e2,.....T1_eN, T2,eN,..
I know that Thrust offers a function which computes adjacent differences, but here I need to do it for multiple events. Basically, constructing multiple histograms from a single input array.
So the output would have N different histograms (one for each event) like this:
histogram bins for e1, histogram bins for e2, histogram bins for e3,....histogram bins for eN.
Input1 (timestamps): 100, 101, 104, 105, 101,104, 106, 111, 90, 91, 93, 94,95
Input2 (events): 4123,4123,4123,4123,2129,2129,2129,2129,300,300,300,300,300
output: 4123:(1,2),(2,0),(3,1),(4,0),(5,0)
2129:(1,0),(2,1),(3,1),(4,0),(5,1)
300: (1,2),(2,1),(3,0),(4,),(5,0)
The number of bins will be fixed, i.e. 5 bins per histogram.
Regarding the tuples: (x,y) -> x is the difference between two consecutive time stamps belonging to the same event. y is the count.
If we consider event 4123, the first tuple is (1,2), because the difference between 101 and 100 is 1, and 105 and 104 is 1. So there are two time stamp differences which belong to this bin, hence (1,2).
Can someone please suggest the most efficient way to do this. So far, it seems that I will have to write my own code. But if there are existing solutions, I would like to try them first.
Here's one possible approach that computes the sparse histogram (i.e. non-zero bins only). It should not be difficult to convert a sparse histogram to a dense histogram (zero and non-zero bins included) using thrust scattering.
compute the timestamp differences using thrust::adjacent_difference and a special functor (my_adj_diff) that computes adjacent differences only for like events.
use thrust::remove_if to remove the zero values (one per event) at the start of each event sequence (created by the functor in step 1).
combine events and timestamp differences into a single integer for histogramming using thrust::transform. This just multiplies the event by 100 in my example, and adds the timestamp difference (assumed to be a set of bins of less than 100 max bin).
use the sparse histogram method from the thrust histogram example code.
Here's a fully worked example. The data does not quite match your expected output (non-zero values only) because you have an error in your expected output.
$ cat t678.cu
#include <thrust/adjacent_difference.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/inner_product.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <iostream>
#define ZIP(X,Y) thrust::make_zip_iterator(thrust::make_tuple(X,Y))
#define SCALE 100
struct my_adj_diff
{
template <typename T>
__host__ __device__
T operator()(T &d2, T &d1) const
{
if (thrust::get<1>(d1) == thrust::get<1>(d2)) {
thrust::get<0>(d2) -= thrust::get<0>(d1);}
else {
thrust::get<0>(d2) = 0;}
return d2;
}
};
struct my_is_zero
{
template <typename T>
__host__ __device__
bool operator()(const T &d1) const
{
return (thrust::get<0>(d1) == 0);
}
};
struct my_combine
{
template <typename T>
__host__ __device__
T operator()(const T &d1, const T &d2) const
{
return (d1*SCALE)+d2;
}
};
// sparse histogram using reduce_by_key
// modified from: https://github.com/thrust/thrust/blob/master/examples/histogram.cu
template <typename Vector1,
typename Vector2,
typename Vector3>
void sparse_histogram(const Vector1& input,
Vector2& histogram_values,
Vector3& histogram_counts)
{
typedef typename Vector1::value_type ValueType; // input value type
typedef typename Vector3::value_type IndexType; // histogram index type
thrust::device_vector<ValueType> data(input);
// sort data to bring equal elements together
thrust::sort(data.begin(), data.end());
// number of histogram bins is equal to number of unique values (assumes data.size() > 0)
IndexType num_bins = thrust::inner_product(data.begin(), data.end() - 1,
data.begin() + 1,
IndexType(1),
thrust::plus<IndexType>(),
thrust::not_equal_to<ValueType>());
// resize histogram storage
histogram_values.resize(num_bins);
histogram_counts.resize(num_bins);
// compact find the end of each bin of values
thrust::reduce_by_key(data.begin(), data.end(),
thrust::constant_iterator<IndexType>(1),
histogram_values.begin(),
histogram_counts.begin());
}
int main(){
int tstamps[] = { 100, 101, 104, 105, 101,104, 106, 111, 90, 91, 93, 94,95 };
int mevents[] = {4123,4123,4123,4123,2129,2129,2129,2129,300,300,300,300,300};
int dsize = sizeof(tstamps)/sizeof(int);
thrust::host_vector<int> h_stamps(tstamps, tstamps+dsize);
thrust::host_vector<int> h_events(mevents, mevents+dsize);
thrust::device_vector<int> d_stamps = h_stamps;
thrust::device_vector<int> d_events = h_events;
thrust::device_vector<int> diffs(dsize);
// compute timestamp differences by event
thrust::adjacent_difference(ZIP(d_stamps.begin(), d_events.begin()), ZIP(d_stamps.end(), d_events.end()), ZIP(d_stamps.begin(), d_events.begin()), my_adj_diff());
d_stamps[0] = 0; // fix up first event for adjacent_difference
int sz1 = thrust::remove_if(ZIP(d_stamps.begin(), d_events.begin()), ZIP(d_stamps.end(), d_events.end()), my_is_zero()) - ZIP(d_stamps.begin(), d_events.begin());
d_stamps.resize(sz1);
d_events.resize(sz1);
// pack events and timestamps into a single vector - assumes max bin (time difference) is less than SCALE
thrust::device_vector<int> d_data(sz1);
thrust::transform(d_events.begin(), d_events.end(), d_stamps.begin(), d_data.begin(), my_combine());
// compute histogram
thrust::device_vector<int> histogram_values;
thrust::device_vector<int> histogram_counts;
sparse_histogram(d_data, histogram_values, histogram_counts);
thrust::copy(histogram_values.begin(), histogram_values.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
thrust::copy(histogram_counts.begin(), histogram_counts.end(), std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
}
$ nvcc t678.cu -o t678
$ ./t678
30001,30002,212902,212903,212905,412301,412303,
3,1,1,1,1,2,1,
$
Note that you will need to use CUDA 7.0 (not CUDA 7.0 RC or any earlier version of CUDA) or else download the latest thrust master branch from github, because older versions of thrust have an issue when attempting to use zip iterators with thrust::adjacent_difference.
Related
I run into this issue over and over in CUDA. I have, for a set of elements, done some GPU calculation. This results in some value that has linear meaning (for instance, in terms of memory):
element_sizes = [ 10, 100, 23, 45 ]
And now, for the next stage of GPU calculation, I need the following values:
memory_size = sum(element_sizes)
memory_offsets = [ 0, 10, 110, 133 ]
I can calculate memory_size at 80 gbps on my GPU using the reduction code available from NVIDIA. However, I can't use this code, as it uses a branching technique that does not compose the memory offsets array. I have tried many things, but what I have found is that simply copying over elements_sizes to the host and calculating the offsets with a simd for loop is the simplest, fastest, way to go:
// in pseudo code
host_element_sizes = copy_to_host(element_sizes);
host_offsets = (... *) malloc(...);
int total_size = 0;
for(int i = 0; i < ...; ...){
host_offsets[i] = total_size;
total_size += host_element_sizes[i];
}
device_offsets = (... *) device_malloc(...);
device_offsets = copy_to_device(host_offsets,...);
However, I have done this many times now, and it is starting to become a bottleneck. This seems like a typical problem, but I have found no work-around.
What is the expected way for a CUDA programmer to solve this problem?
I think the algorithm you are looking for is a prefix sum. A prefix sum on a vector produces another vector which contains the cumulative sum values of the input vector. A prefix sum exists in at least two variants - an exclusive scan or an inclusive scan. Conceptually these are similar.
If your element_sizes vector has been deposited in GPU global memory (it appears to be the case based on your pseudocode), then there exist library functions that run on the GPU that you could call at that point, to produce the memory_offsets data (vector), and the memory_size value could be trivially obtained from the last value in the vector, with a slight variation based on whether you are doing an inclusive scan or exclusive scan.
Here's a trivial worked example using thrust:
$ cat t319.cu
#include <thrust/scan.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <iostream>
int main(){
const int element_sizes[] = { 10, 100, 23, 45 };
const int ds = sizeof(element_sizes)/sizeof(element_sizes[0]);
thrust::device_vector<int> dv_es(element_sizes, element_sizes+ds);
thrust::device_vector<int> dv_mo(ds);
thrust::exclusive_scan(dv_es.begin(), dv_es.end(), dv_mo.begin());
std::cout << "element_sizes:" << std::endl;
thrust::copy_n(dv_es.begin(), ds, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "memory_offsets:" << std::endl;
thrust::copy_n(dv_mo.begin(), ds, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "memory_size:" << std::endl << dv_es[ds-1] + dv_mo[ds-1] << std::endl;
}
$ nvcc -o t319 t319.cu
$ ./t319
element_sizes:
10,100,23,45,
memory_offsets:
0,10,110,133,
memory_size:
178
$
I'm trying to figure out how to work with index array in CUDA thrust. My problem is the following:
vector<int> index(20);
vector<float> data1(100), data2(100), result(20);
for(int i=0;i<index.size();++i)
result.push_back(do_something(data1[index[i]],data2[index[i]]));
The function do_something() takes elements from several large arrays selected by index array. The size of index is usually much smaller than the size of data and the elements of index are sorted.
I can't figure out what is the best strategy of doing this efficiently in thrust.
I recommend reading the thrust quick start guide for basic understanding of thrust, and some of the concepts I will use below.
I can't figure out what is the best strategy of doing this efficiently in thrust.
Most of what you have in your example should map directly to straightforward operations in thrust. Your for-loop and do_something operation would be replaced by a thrust algorithm, probably with an appropriate functor definition that would mimic the functionality you have in do_something. An easy-to-use thrust algorithm is thrust::transform() that may be applicable to this case.
The use of the index array for indirection would often be handled using a thrust permutation_iterator, which is designed exactly for this purpose.
Combining these concepts, and supposing for a simple example that your do_something operation will sum the squares of each indexed element in the two input vectors, and then store the square root of that result in the result vector, we could have an example like this:
$ cat t74.cu
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/copy.h>
#include <math.h>
#include <iostream>
struct do_something
{
template <typename T>
__host__ __device__ T operator()(const T &i1, const T &i2){
return sqrtf(i1*i1 + i2*i2);
}
};
int main(){
//pythagorean triples
float d1[] = { 3, 5, 8, 7, 20, 12, 9, 28};
float d2[] = { 4, 12, 15, 24, 21, 35, 40, 45};
int i1[] = {1, 3, 5, 7};
const size_t isize = sizeof(i1)/sizeof(i1[0]);
const size_t dsize = sizeof(d1)/sizeof(d1[0]);
thrust::device_vector<int> index(i1, i1+isize);
thrust::device_vector<float> data1(d1, d1+dsize);
thrust::device_vector<float> data2(d2, d2+dsize);
thrust::device_vector<float> result(isize);
thrust::transform(thrust::make_permutation_iterator(data1.begin(), index.begin()), thrust::make_permutation_iterator(data1.begin(), index.end()), thrust::make_permutation_iterator(data2.begin(), index.begin()), result.begin(), do_something());
thrust::copy_n(result.begin(), result.size(), std::ostream_iterator<float>(std::cout, ","));
std::cout << std::endl;
}
$ nvcc -arch=sm_30 -o t74 t74.cu
$ ./t74
13,25,37,53,
$
The index set need not be sorted for this usage (although it may be somewhat beneficial from a performance perspective).
I have two int vectors for keys and values, their size is about 500K.
The key vector is already sorted. And there are 10K groups approximately.
The value is non-negative(stands for useful) or -2(stands for no use), in each group there should be one or zero non-negative values, and the rest is -2.
key: 0 0 0 0 1 2 2 3 3 3 3
value:-2 -2 1 -2 3 -2 -2 -2 -2 -2 0
The third pair of group 0 [0 1] is useful. For group 1 we get the pair[1 3]. The values of group 2 are all -2, so we get nothing. And for group 3, the result is [3 0].
So, the question is how can I do this by thrust or cuda ?
Here are two ideas.
First one:
Get the number of each group by a histogram algorithm. So the barrier of each group can be computed.
Operate thrust::find_if on each group to get the useful element.
Second one:
Use thrust::transform to add 2 for every value and now all the value are non-negative, and zero stands for useless.
Use thrust::reduce_by_key to get the reduction for every group, and then subtract 2 for every output value.
I think there must be some other methods which will achieve much more performance than the above two.
Performance of the methods:
I have test the Second method above and the method given by #Robert Crovella, ie. reduce_by_key and remove_if method.
The size of the vectors is 2691028, the vectors consist of 100001 groups. Here is their average time:
reduce_by_key: 1204ms
remove_if: 192ms
From above result, we can see that remove_if method is much faster. And also the "remove_if" method is easy to implement and consume much less gpu memory.
Briefly, #Robert Crovella 's method is very good.
I would use a thrust::zip_iterator to zip the key and values pairs together, and then I would do a thrust::remove_if operation on the zipped values which would require a functor definition that would indicate to remove every pair for which the value is negative (or whatever test you wish.)
Here's a worked example:
$ cat t1009.cu
#include <thrust/remove.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/copy.h>
struct remove_func
{
template <typename T>
__host__ __device__
bool operator()(T &t){
return (thrust::get<1>(t) < 0); // could change to other kinds of tests
}
};
int main(){
int keys[] = {0,0,0,0,1,2,2,3,3,3,3};
int vals[] = {-2,-2,1,-2,3,-2,-2,-2,-2,-2,0};
size_t dsize = sizeof(keys)/sizeof(int);
thrust::device_vector<int>dkeys(keys, keys+dsize);
thrust::device_vector<int>dvals(vals, vals+dsize);
auto zr = thrust::make_zip_iterator(thrust::make_tuple(dkeys.begin(), dvals.begin()));
size_t rsize = thrust::remove_if(zr, zr+dsize, remove_func()) - zr;
thrust::copy_n(dkeys.begin(), rsize, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
thrust::copy_n(dvals.begin(), rsize, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
return 0;
}
$ nvcc -std=c++11 -o t1009 t1009.cu
$ ./t1009
0,1,3,
1,3,0,
$
I want to compute the density of particles over a grid. Therefore, I have a vector that contains the cellID of each particle, as well as a vector with the given mass which does not have to be uniform.
I have taken the non-sparse example from Thrust to compute a histogram of my particles.
However, to compute the density, I need to include the weight of each particle, instead of simply summing the number of particles per cell, i.e. I'm interested in rho[i] = sum W[j] for all j that satify cellID[j]=i (probably unnecessary to explain, since everybody knows that).
Implementing this with Thrust has not worked for me. I also tried to use a CUDA kernel and thrust_raw_pointer_cast, but I did not succeed with that either.
EDIT:
Here is a minimal working example which should compile via nvcc file.cu under CUDA 6.5 and with Thrust installed.
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/binary_search.h>
#include <thrust/adjacent_difference.h>
// Predicate
struct is_out_of_bounds {
__host__ __device__ bool operator()(int i) {
return (i < 0); // out of bounds elements have negative id;
}
};
// cf.: https://code.google.com/p/thrust/source/browse/examples/histogram.cu, but modified
template<typename T1, typename T2>
void computeHistogram(const T1& input, T2& histogram) {
typedef typename T1::value_type ValueType; // input value type
typedef typename T2::value_type IndexType; // histogram index type
// copy input data (could be skipped if input is allowed to be modified)
thrust::device_vector<ValueType> data(input);
// sort data to bring equal elements together
thrust::sort(data.begin(), data.end());
// there are elements that we don't want to count, those have ID -1;
data.erase(thrust::remove_if(data.begin(), data.end(), is_out_of_bounds()),data.end());
// number of histogram bins is equal to the maximum value plus one
IndexType num_bins = histogram.size();
// find the end of each bin of values
thrust::counting_iterator<IndexType> search_begin(0);
thrust::upper_bound(data.begin(), data.end(), search_begin,
search_begin + num_bins, histogram.begin());
// compute the histogram by taking differences of the cumulative histogram
thrust::adjacent_difference(histogram.begin(), histogram.end(),
histogram.begin());
}
int main(void) {
thrust::device_vector<int> cellID(5);
cellID[0] = -1; cellID[1] = 1; cellID[2] = 0; cellID[3] = 2; cellID[4]=1;
thrust::device_vector<float> mass(5);
mass[0] = .5; mass[1] = 1.0; mass[2] = 2.0; mass[3] = 3.0; mass[4] = 4.0;
thrust::device_vector<int> histogram(3);
thrust::device_vector<float> density(3);
computeHistogram(cellID,histogram);
std::cout<<"\nHistogram:\n";
thrust::copy(histogram.begin(), histogram.end(),
std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
// this will print: " Histogram 1 2 1 "
// meaning one element with ID 0, two elements with ID 1
// and one element with ID 2
/* here is what I am unable to implement:
*
*
* computeDensity(cellID,mass,density);
*
* print(density): 2.0 5.0 3.0
*
*
*/
}
I hope the comment at the end of the file also makes clear what I mean by computing the density. If there is any question open, please feel free to ask. Thanks!
There still seems to be a problem in understanding my problem, which I am sorry for! Therefore I added some pictures.
Consider the first picture. For my understanding, a histogram would simply be the count of particles per grid cell. In this case a histogram would be an array of size 36, since there are 36 cells. Also, there would be a lot of zero entries in the vector, since for example in the upper left corner almost no cell contains a particle. This is what I already have in my code.
Now consider the slightly more complicated case. Here each particle has a different mass, indicated by the different size in the plot. To compute the density I can't just add the number of particles per cell, but I have to add the mass of all particles per cell. This is what I'm unable to implement.
What you described in your example does not look like a histogram but rather like a segmented reduction.
The following example code uses thrust::reduce_by_key to sum up the masses of particles within the same cell:
density.cu
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/scatter.h>
#include <iostream>
#define PRINTER(name) print(#name, (name))
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
std::cout << name << ":\t\t";
thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, "\t"));
std::cout << std::endl << std::endl;
}
int main()
{
const int particle_count = 5;
const int cell_count = 10;
thrust::device_vector<int> cellID(particle_count);
cellID[0] = -1; cellID[1] = 1; cellID[2] = 0; cellID[3] = 2; cellID[4]=1;
thrust::device_vector<float> mass(particle_count);
mass[0] = .5; mass[1] = 1.0; mass[2] = 2.0; mass[3] = 3.0; mass[4] = 4.0;
std::cout << "input data" << std::endl;
PRINTER(cellID);
PRINTER(mass);
thrust::sort_by_key(cellID. begin(), cellID.end(), mass.begin());
std::cout << "after sort_by_key" << std::endl;
PRINTER(cellID);
PRINTER(mass);
thrust::device_vector<int> reduced_cellID(particle_count);
thrust::device_vector<float> density(particle_count);
int new_size = thrust::reduce_by_key(cellID. begin(), cellID.end(),
mass.begin(),
reduced_cellID.begin(),
density.begin()
).second - density.begin();
if (reduced_cellID[0] == -1)
{
density.erase(density.begin());
reduced_cellID.erase(reduced_cellID.begin());
new_size--;
}
density.resize(new_size);
reduced_cellID.resize(new_size);
std::cout << "after reduce_by_key" << std::endl;
PRINTER(density);
PRINTER(reduced_cellID);
thrust::device_vector<float> final_density(cell_count);
thrust::scatter(density.begin(), density.end(), reduced_cellID.begin(), final_density.begin());
PRINTER(final_density);
}
compile using
nvcc -std=c++11 density.cu -o density
output
input data
cellID: -1 1 0 2 1
mass: 0.5 1 2 3 4
after sort_by_key
cellID: -1 0 1 1 2
mass: 0.5 2 1 4 3
after reduce_by_key
density: 2 5 3
reduced_cellID: 0 1 2
final_density: 2 5 3 0 0 0 0 0 0 0
I have been trying to use thrust function reduce_by_key on device vectors. In the documentation they have given example on host_vectors instead of any device vector. The main problem I am getting is in storing the return value of the function. To be more specific here is my code:
thrust::device_vector<int> hashValueVector(10)
thrust::device_vector<int> hashKeysVector(10)
thrust::device_vector<int> d_pathId(10);
thrust::device_vector<int> d_freqError(10); //EDITED
thrust::pair<thrust::detail::normal_iterator<thrust::device_ptr<int> >,thrust::detail::normal_iterator<thrust::device_ptr<int> > > new_end; //THE PROBLEM
new_end = thrust::reduce_by_key(hashKeysVector.begin(),hashKeysVector.end(),hashValueVector.begin(),d_pathId.begin(),d_freqError.begin());
I tried declaring them as device_ptr first in the definition since for host_vectors too they have used pointers in the definition in the documentation. But I am getting compilation error when I try that then I read the error statement and converted the declaration to the above, this is compiling fine but I am not sure whether this is the right way to define or not.
I there any other standard/clean way of declaring that (the "new_end" variable)? Please comment if my question is not clear somewhere.
EDIT: I have edited the declaration of d_freqError. It was supposed to be int I wrote it as hashElem by mistake, sorry for that.
There is a problem in the setup of your reduce_by_key operation:
new_end = thrust::reduce_by_key(hashKeysVector.begin(),hashKeysVector.end(),hashValueVector.begin(),d_pathId.begin(),d_freqError.begin());
Notice that in the documentation it states:
OutputIterator2 is a model of Output Iterator and and InputIterator2's value_type is convertible to OutputIterator2's value_type.
The value type of your InputIterator2 (i.e. hashValueVector.begin()) is int . The value type of your OutputIterator2 is struct hashElem. Thrust is not going to know how to convert an int to a struct hashElem.
Regarding your question, it should not be difficult to capture the return entity from reduce_by_key. According to the documentation it is a thrust pair of two iterators, and these iterators should be consistent with (i.e. of the same vector type and value type) as your keys iterator type and your values iterator type, respectively.
Here's an updated sample based on what you posted, which compiles cleanly:
$ cat t353.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/pair.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>
#include <thrust/copy.h>
typedef thrust::device_vector<int>::iterator dIter;
int main(){
thrust::device_vector<int> hashValueVector(10);
thrust::device_vector<int> hashKeysVector(10);
thrust::device_vector<int> d_pathId(10);
thrust::device_vector<int> d_freqError(10);
thrust::sequence(hashValueVector.begin(), hashValueVector.end());
thrust::fill(hashKeysVector.begin(), hashKeysVector.begin()+5, 1);
thrust::fill(hashKeysVector.begin()+6, hashKeysVector.begin()+10, 2);
thrust::pair<dIter, dIter> new_end;
new_end = thrust::reduce_by_key(hashKeysVector.begin(),hashKeysVector.end(),hashValueVector.begin(),d_pathId.begin(),d_freqError.begin());
std::cout << "Number of results are: " << new_end.first - d_pathId.begin() << std::endl;
thrust::copy(d_pathId.begin(), new_end.first, std::ostream_iterator<int>(std::cout, "\n"));
thrust::copy(d_freqError.begin(), new_end.second, std::ostream_iterator<int>(std::cout, "\n"));
}
$ nvcc -arch=sm_20 -o t353 t353.cu
$ ./t353
Number of results are: 3
1
0
2
10
5
30
$