error : identifier "atomicOr" is undefined in Thrust program - thrust

I have found that the Cuda atomicOr function is not recognized in my Thrust program compiled in Visual Studio 2012.
I have read that all header files should already be included when the NVidia nvcc compiler is invoked. Most postings on this issue state that this must mean the architectural settings are incorrect.
I have tried it with these settings based on other postings:
How to set CUDA compiler flags in Visual Studio 2010?
...as well as using:
http://s1240.photobucket.com/user/fireshot8888/media/cuda_settings.png.html
main.cpp:
#include <thrust/device_vector.h>
#include <cstdlib>
#include <iostream>
#include "cuda.h"
using namespace std;
//Visual C++ compiled main function to launch the GPU calling code
int main(int argc, char *argv[])
{
//Just some random data hand keyed to make it a complete example for stack overflow while not being too complicated
float data[] = {1.2, 3.4, 3.4, 3.3, 4.4, 4.4, 4.4, 3.4, 4.4, 4.4,
1.2, 3.4, 3.4, 3.3, 4.4, 4.4, 4.4, 3.4, 4.4, 4.4};
thrust::host_vector<float> h_data(data, data+20); //Holds the contents of the file as they are read; it will be cleared once we are done with it.
const int numVars = 10;
int numBins = 4;
int rowCount = 2;
doHistogramGPU(numVars, h_data, numBins, rowCount);
return 0;
}
cuda.cu:
#include "cuda.h"
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
//I GAVE THIS A TRY BUT IT DID NOT FIX MY ISSUE::::
#include <cuda_runtime.h>
#include <cuda.h>
using namespace std;
//Function to call the kernel
void doHistogramGPU(int numVars, thrust::host_vector<float> h_buffer, int numBins, int numRecords)
{
int dataSize = sizeof(BYTE_UNIT);
int shiftSize = dataSize - 1;
thrust::device_vector<float> d_buffer(h_buffer.begin(), h_buffer.end());
int bitVectorSize = ceil(numRecords * numVars / (float)dataSize);
thrust::device_vector<BYTE_UNIT> d_bitData(bitVectorSize * numBins);
thrust::counting_iterator<int> counter(0);
auto zipInFirst = thrust::make_zip_iterator(thrust::make_tuple(d_buffer.begin(), counter));
auto zipInLast = thrust::make_zip_iterator(thrust::make_tuple(d_buffer.end(), counter + d_buffer.size()));
float minValues[] = {579.8, 72.16, 0.000385, 7.576e-005, 6.954e-005, 0, 0, 2.602e-012, 1.946e-013, 7.393e-015};
float maxValues[] = {1053, 22150, 0.7599, 0.7596, 0.24, 0.2398, 0.1623, 1.167e-007, 4.518e-006, 5.322e-008};
//Get things loaded onto the device then call the kernel
thrust::device_vector<float> d_minValues(minValues, minValues+10);
thrust::device_vector<float> d_maxValues(maxValues, maxValues+10);
thrust::device_ptr<float> minDevPtr = &d_minValues[0];
thrust::device_ptr<float> maxDevPtr = &d_maxValues[0];
thrust::device_ptr<BYTE_UNIT> dataDevPtr = &d_bitData[0];
//Invoke the Thrust Kernel
thrust::for_each(zipInFirst, zipInLast, BinFinder(thrust::raw_pointer_cast(dataDevPtr), thrust::raw_pointer_cast(minDevPtr), thrust::raw_pointer_cast(maxDevPtr), numVars, numBins, numRecords));
cout << endl;
return;
}
cuda.h:
#ifndef CUDA_H
#define CUDA_H
#include <thrust/device_vector.h>
#include <iostream>
//I tried these here, too...
#include <cuda_runtime.h>
#include <cuda.h>
using namespace std;
typedef long BYTE_UNIT; //32 bit storage
void doHistogramGPU(int numvars, thrust::host_vector<float> h_buffer, int numBins, int numRecords);
struct BinFinder
{
BYTE_UNIT * data;
float * rawMinVector;
float * rawMaxVector;
int numVars;
int numBins;
int numRecords;
BinFinder(BYTE_UNIT * data, float * rawMinVector, float * rawMaxVector, int numVars, int numBins, int numRecords)
{
this -> data = data;
this -> rawMinVector = rawMinVector;
this -> rawMaxVector = rawMaxVector;
this -> numVars = numVars;
this -> numBins = numBins;
this -> numRecords = numRecords;
}
//This kernel converts the multidimensional bin representation to a single dimensional representation
template <typename Tuple>
__device__ void operator()( Tuple param )
{
int dataSize = sizeof(BYTE_UNIT);
int shiftSize = dataSize - 1;
int bitVectorSize = ceil(numRecords * numVars / float(dataSize));
float value = thrust::get<0>(param);
int id = thrust::get<1>(param);
//Look up the min and max values for this data column using the index
float min = rawMinVector[id % numVars];
float max = rawMaxVector[id % numVars];
//Calculate the bin id
float percentage = (value - min) / float(max - min);
char bin = percentage * numBins;
if (bin == numBins)
{
bin--;
}
//////////////////////////////////////////////////////////////
//Set a 1 in the appropriate bitvector for the calculated bin
//////////////////////////////////////////////////////////////
//What I originally tried to do that appeared to have generated race conditions (using data from a file):
//data[bin * bitVectorSize + id / dataSize] |= (1 << (shiftSize - id % dataSize));
//What I've been trying to do now that generates a compilation error:
atomicOr(data + (bin * bitVectorSize + id / dataSize), 1 << (shiftSize - id % dataSize)); //<----THIS DOESN'T COMPILE!!!!!!!!!
}
};
#endif
nvcc command for cuda.cu (which includes my cuda.h file):
"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v6.0/bin/nvcc.exe" "C:/Users/datahead8888/Documents/Visual Studio 2012/Projects/thrust-space-data/src/cuda.cu" -c -o "C:/Users/datahead8888/Documents/Visual Studio 2012/Projects/thrust-space-data/build/CMakeFiles/CudaLib.dir//Debug/CudaLib_generated_cuda.cu.obj" -ccbin "C:/Program Files (x86)/Microsoft Visual Studio 11.0/VC/bin" -m64 -Xcompiler ,\"/DWIN32\",\"/D_WINDOWS\",\"/W3\",\"/GR\",\"/EHsc\",\"/D_DEBUG\",\"/MDd\",\"/Zi\",\"/Ob0\",\"/Od\",\"/RTC1\" -DNVCC "-IC:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v6.0/include" "-IC:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v6.0/include"
Full error output by nvcc:
1>nvcc : warning : The 'compute_10' and 'sm_10' architectures are deprecated, and may be removed in a future release.
1>C:/Users/datahead8888/Documents/Visual Studio 2012/Projects/thrust-space-data/src/cuda.cu(107): warning : variable "minValues" was declared but never referenced
1>
1>C:/Users/datahead8888/Documents/Visual Studio 2012/Projects/thrust-space-data/src/cuda.cu(108): warning : variable "maxValues" was declared but never referenced
1>
1>C:/Users/datahead8888/Documents/Visual Studio 2012/Projects/thrust-space-data/src/cuda.cu(462): warning : variable "shiftSize" was declared but never referenced
1>
1>C:/Users/datahead8888/Documents/Visual Studio 2012/Projects/thrust-space-data/src/cuda.cu(602): warning : initial value of reference to non-const must be an lvalue
1>
1>C:/Users/datahead8888/Documents/Visual Studio 2012/Projects/thrust-space-data/src/cuda.cu(618): warning : dynamic initialization in unreachable code
1>
1>C:/Users/datahead8888/Documents/Visual Studio 2012/Projects/thrust-space-data/src/cuda.cu(522): warning : variable "shiftSize" was declared but never referenced
1>
1>C:/Users/datahead8888/Documents/Visual Studio 2012/Projects/thrust-space-data/src/cuda.cu(975): warning : initial value of reference to non-const must be an lvalue
1>
1>C:/Users/datahead8888/Documents/Visual Studio 2012/Projects/thrust-space-data/src/cuda.cu(993): warning : initial value of reference to non-const must be an lvalue
1>
1>C:/Users/datahead8888/Documents/Visual Studio 2012/Projects/thrust-space-data/src/cuda.cu(1022): warning : variable "shiftSize" was declared but never referenced
1>
1>c:\users\datahead8888\documents\visual studio 2012\projects\thrust-space-data\src\cuda.h(188): error : identifier "atomicOr" is undefined
1> detected during:
1> instantiation of "void BinFinder::operator()(Tuple) [with Tuple=thrust::detail::tuple_of_iterator_references]"
1> C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include\thrust/detail/function.h(119): here
1> instantiation of "Result thrust::detail::device_function::operator()(const Argument &) const [with Function=BinFinder, Result=void, Argument=thrust::detail::tuple_of_iterator_references, int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>]"
1> C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include\thrust/system/cuda/detail/for_each.inl(82): here
1> instantiation of "thrust::system::cuda::detail::for_each_n_detail::for_each_n_closure::result_type thrust::system::cuda::detail::for_each_n_detail::for_each_n_closure::operator()() [with RandomAccessIterator=thrust::zip_iterator>, thrust::counting_iterator, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, Size=unsigned int, UnaryFunction=BinFinder, Context=thrust::system::cuda::detail::detail::blocked_thread_array]"
1> C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include\thrust/system/cuda/detail/detail/launch_closure.inl(49): here
1> instantiation of "void thrust::system::cuda::detail::detail::launch_closure_by_value(Closure) [with Closure=thrust::system::cuda::detail::for_each_n_detail::for_each_n_closure>, thrust::counting_iterator, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, unsigned int, BinFinder, thrust::system::cuda::detail::detail::blocked_thread_array>]"
1> C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include\thrust/system/cuda/detail/detail/launch_closure.inl(77): here
1> instantiation of "thrust::system::cuda::detail::detail::closure_launcher_base::launch_function_t thrust::system::cuda::detail::detail::closure_launcher_base::get_launch_function() [with Closure=thrust::system::cuda::detail::for_each_n_detail::for_each_n_closure>, thrust::counting_iterator, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, unsigned int, BinFinder, thrust::system::cuda::detail::detail::blocked_thread_array>, launch_by_value=true]"
1> C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include\thrust/system/cuda/detail/detail/launch_closure.inl(185): here
1> [ 2 instantiation contexts not shown ]
1> instantiation of "thrust::tuple thrust::system::cuda::detail::for_each_n_detail::configure_launch(Size) [with Closure=thrust::system::cuda::detail::for_each_n_detail::for_each_n_closure>, thrust::counting_iterator, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, unsigned int, BinFinder, thrust::system::cuda::detail::detail::blocked_thread_array>, Size=long long]"
1> C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include\thrust/system/cuda/detail/for_each.inl(163): here
1> instantiation of "RandomAccessIterator thrust::system::cuda::detail::for_each_n(thrust::system::cuda::detail::execution_policy &, RandomAccessIterator, Size, UnaryFunction) [with DerivedPolicy=thrust::system::cuda::detail::tag, RandomAccessIterator=thrust::zip_iterator>, thrust::counting_iterator, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, Size=long long, UnaryFunction=BinFinder]"
1> C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include\thrust/system/cuda/detail/for_each.inl(191): here
1> instantiation of "RandomAccessIterator thrust::system::cuda::detail::for_each(thrust::system::cuda::detail::execution_policy &, RandomAccessIterator, RandomAccessIterator, UnaryFunction) [with DerivedPolicy=thrust::system::cuda::detail::tag, RandomAccessIterator=thrust::zip_iterator>, thrust::counting_iterator, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, UnaryFunction=BinFinder]"
1> C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include\thrust/detail/for_each.inl(43): here
1> instantiation of "InputIterator thrust::for_each(const thrust::detail::execution_policy_base &, InputIterator, InputIterator, UnaryFunction) [with DerivedPolicy=thrust::system::cuda::detail::tag, InputIterator=thrust::zip_iterator>, thrust::counting_iterator, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, UnaryFunction=BinFinder]"
1> C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include\thrust/detail/for_each.inl(57): here
1> instantiation of "InputIterator thrust::for_each(InputIterator, InputIterator, UnaryFunction) [with InputIterator=thrust::zip_iterator>, thrust::counting_iterator, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, UnaryFunction=BinFinder]"
1> C:/Users/datahead8888/Documents/Visual Studio 2012/Projects/thrust-space-data/src/cuda.cu(597): here
1>
1> 1 error detected in the compilation of "C:/Users/DATAHE~1/AppData/Local/Temp/tmpxft_00001f78_00000000-8_cuda.cpp1.ii".
1> cuda.cu

The reason it is undefined is because you are not specifying the project settings correctly to compile for an architecture (cc1.1 or higher) that supports atomics.
You will need to modify the settings for the compile operation to compile for an architecture that your GPU supports as well as one that supports atomics.
Your compile command includes no architectural switches at all, so the default architecture (cc1.0) is being targetted. This architecture does not support atomics, and also is deprecated in CUDA 6, so the compiler issues a warning to let you know you are compiling for a deprecated architecture.
You'll need to study the available questions and documentation to learn how to set the target architecture, and you must be sure to not include cc1.0, or the compile will fail. (For example, in this question that you linked, use the methods discussed in the answers, not in the question. The method described in the question does not work. And read all the answers, noting that there are both project properties places and file-specific places where this setting can be made.)
If you're having difficulty getting the settings arranged, you might try opening a CUDA sample project that depends on atomics, e.g. simple atomic intrinsics and remove the existing code from that project, and place your code in it. You should then pick up the proper project settings from that project to use atomics.

Related

How to do tuple reduction in thrust (also return a tuple)?

Say I have a tuple consisting of two thrust::device_vector. I want the output to be a tuple consisting of two scalars, which are the sum of two vectors respectively. For example,
input tuple consisting of two vectors:
a: 3, 5, 2
b: 6, 1, 7
output tuple consisting of two scalars:
10
14
I think this should be extremely easy but somehow I still haven't figure out a way to do so.
My code
#include <iostream>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>
#include <thrust/tuple.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/zip_iterator.h>
template<typename T>
struct TestTuplePlus
{
__host__ __device__
thrust::tuple<T, T> operator()(thrust::tuple<T, T>& t0, thrust::tuple<T, T>& t1)
{
return thrust::make_tuple(thrust::get<0>(t0) + thrust::get<0>(t1), thrust::get<1>(t0) + thrust::get<1>(t1));
}
};
int main()
{
thrust::device_vector<float> a(3, 0);
thrust::device_vector<float> b(3, 0);
a[0] = 3;
a[1] = 5;
a[2] = 2;
b[0] = 6;
b[1] = 1;
b[2] = 7;
auto begin = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin()));
auto end = thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end()));
// reduce to a tuple
thrust::tuple<float, float> result = thrust::reduce(begin, end, thrust::make_tuple<float,float>(0,0), TestTuplePlus<float>()); // produce compilation error
return 0;
}
The compilation error:
/usr/local/cuda/bin/../targets/x86_64-linux/include/thrust/detail/function.h(96): error: function "TestTuplePlus<T>::operator() [with T=float]" cannot be called with the given argument list
argument types are: (thrust::tuple<float, float, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, thrust::detail::tuple_of_iterator_references<float &, float &, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>)
object type is: TestTuplePlus<float>
detected during:
instantiation of "Result thrust::detail::wrapped_function<Function, Result>::operator()(Argument1 &, const Argument2 &) const [with Function=TestTuplePlus<float>, Result=thrust::tuple<float, float, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, Argument1=thrust::tuple<float, float, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, Argument2=thrust::detail::tuple_of_iterator_references<thrust::device_reference<float>, thrust::device_reference<float>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>]"
/usr/local/cuda/bin/../targets/x86_64-linux/include/thrust/system/detail/sequential/reduce.h(61): here
instantiation of "OutputType thrust::system::detail::sequential::reduce(thrust::system::detail::sequential::execution_policy<DerivedPolicy> &, InputIterator, InputIterator, OutputType, BinaryFunction) [with DerivedPolicy=thrust::detail::seq_t, InputIterator=thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, OutputType=thrust::tuple<float, float, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, BinaryFunction=TestTuplePlus<float>]"
/usr/local/cuda/bin/../targets/x86_64-linux/include/thrust/detail/reduce.inl(71): here
instantiation of "T thrust::reduce(const thrust::detail::execution_policy_base<DerivedPolicy> &, InputIterator, InputIterator, T, BinaryFunction) [with DerivedPolicy=thrust::detail::seq_t, InputIterator=thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, T=thrust::tuple<float, float, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, BinaryFunction=TestTuplePlus<float>]"
/usr/local/cuda/bin/../targets/x86_64-linux/include/thrust/system/cuda/detail/reduce.h(1022): here
instantiation of "T thrust::cuda_cub::reduce_n(thrust::cuda_cub::execution_policy<Derived> &, InputIt, Size, T, BinaryOp) [with Derived=thrust::cuda_cub::tag, InputIt=thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, Size=signed long, T=thrust::tuple<float, float, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, BinaryOp=TestTuplePlus<float>]"
/usr/local/cuda/bin/../targets/x86_64-linux/include/thrust/system/cuda/detail/reduce.h(1037): here
instantiation of "T thrust::cuda_cub::reduce(thrust::cuda_cub::execution_policy<Derived> &, InputIt, InputIt, T, BinaryOp) [with Derived=thrust::cuda_cub::tag, InputIt=thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, T=thrust::tuple<float, float, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, BinaryOp=TestTuplePlus<float>]"
/usr/local/cuda/bin/../targets/x86_64-linux/include/thrust/detail/reduce.inl(71): here
instantiation of "T thrust::reduce(const thrust::detail::execution_policy_base<DerivedPolicy> &, InputIterator, InputIterator, T, BinaryFunction) [with DerivedPolicy=thrust::cuda_cub::tag, InputIterator=thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, T=thrust::tuple<float, float, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, BinaryFunction=TestTuplePlus<float>]"
/usr/local/cuda/bin/../targets/x86_64-linux/include/thrust/detail/reduce.inl(186): here
instantiation of "T thrust::reduce(InputIterator, InputIterator, T, BinaryFunction) [with InputIterator=thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, T=thrust::tuple<float, float, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, BinaryFunction=TestTuplePlus<float>]"
To be honest, I have no idea how to fix it.
I actually find this post, but I haven't got it compiled either.
Anyway, is there an easy way to do tuple reduction in cuda?
The compile error arises, strangely enough, due to the fact that thrust is actually passing a different tuple type for the first and second arguments to your functor. This can be deduced from this difference:
error: function "TestTuplePlus<T>::operator() [with T=float]" cannot be called with the given argument list
argument types are: (thrust::tuple<float, float, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, thrust::detail::tuple_of_iterator_references<float &, float &, thrust::null_type, ...
For the first argument we are told:
argument types are: (thrust::tuple<float, float, thrust::null_type, ...
For the second argument we are told:
thrust::detail::tuple_of_iterator_references<float &, float &, thrust::null_type, ...
The first tuple contains two float quantities. The second tuple contains two float references. These are not the same type package. As a result, there is not one single adaptation of:
thrust::tuple<T, T>
which can conform to both types. Therefore there is no possible single instantiation of your templated functor that can conform to both.
We can work around this by allowing the templated functor to have two templated types, one for each argument. The code below demonstrates one possible solution:
$ cat t1727.cu
#include <iostream>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>
#include <thrust/tuple.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/zip_iterator.h>
struct TestTuplePlus
{
template<typename T, typename T1>
__host__ __device__
thrust::tuple<T, T> operator()(thrust::tuple<T, T> t0, thrust::tuple<T1, T1> t1)
{
return thrust::make_tuple(thrust::get<0>(t0) + thrust::get<0>(t1), thrust::get<1>(t0) + thrust::get<1>(t1));
}
};
int main()
{
thrust::device_vector<float> a(3, 0);
thrust::device_vector<float> b(3, 0);
a[0] = 3;
a[1] = 5;
a[2] = 2;
b[0] = 6;
b[1] = 1;
b[2] = 7;
auto begin = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin()));
auto end = thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end()));
// reduce to a tuple
thrust::tuple<float, float> result = thrust::reduce(begin, end, thrust::make_tuple<float,float>(0,0), TestTuplePlus()); // produce compilation error
std::cout << "a sum: " << thrust::get<0>(result) << " b sum: " << thrust::get<1>(result) << std::endl;
return 0;
}
$ nvcc -std=c++11 t1727.cu -o t1727
$ ./t1727
a sum: 10 b sum: 14
$
(CUDA 10.1.243)
I'm sure other approaches are possible. Note that I have elected to template the operator itself, rather than the entire struct. This eliminates the need to specify the template type in the host code. Again, I'm sure other approaches are possible.
I won't be able to respond to questions related to "why does thrust work this way?"
If you find this behavior troublesome, you might wish to file a thrust issue.
I don't claim correctness for this code or any other code that I post. Anyone using any code I post does so at their own risk. I merely claim that I have attempted to address the questions in the original posting, and provide some explanation thereof. I am not claiming my code is defect-free, or that it is suitable for any particular purpose. Use it (or not) at your own risk.
The compilation error is due to missing "const" qualifier in your functor, i.e., it should be:
thrust::tuple<T, T> operator()(const thrust::tuple<T, T>& t0, const thrust::tuple<T, T>& t1)

Compilation error using FindCUDA.cmake and Thrust with THRUST_DEVICE_SYSTEM_OMP

I recently discovered that Thrust was able to handle automatic OMP and TBB parallelisation in addition to its classic cuda capability.
Although I was able to use this extremely verstile feature on a simple example, my cmake configuration generated compilation error, maybe I am using FindCUDA.cmake the wrong way, or maybe this module cannot be used with Thrust this way ?
Here is my Test.cu file:
#include <thrust/device_vector.h>
#include <cstdio>
struct cuda_hello
{
__host__ __device__
void operator()(int x)
{
printf("Hello, world from Cuda!\n");
}
};
int main()
{
thrust::device_vector<int> cuda_vec(1, 0);
thrust::for_each(cuda_vec.begin(),cuda_vec.end(),cuda_hello());
}
And, the working compile line:
nvcc Test.cu -lgomp -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP -Xcompiler -fopenmp -gencode arch=compute_30,code=sm_30 -o Executable.exe
Now, the cmakefile that cannot generate a Makefiles that compiles properly under linux when using THRUST_DEVICE_SYSTEM_OMP (compile for device of CC 3.0):
PROJECT(ExecutableCmake)
set (OUTPUT_NAME ExecutableCmake)
cmake_minimum_required (VERSION 2.8)
#test variable
#set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_CUDA)
set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_OMP)
#set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_TBB)
########################################
#### Cuda Part ####
########################################
find_package(CUDA REQUIRED)
list( APPEND CUDA_NVCC_FLAGS -gencode arch=compute_30,code=sm_30 -DTHRUST_DEVICE_SYSTEM=${THRUST_DEVICE_SYSTEM} )
set (sources_gpu_cuda
Test.cu
)
########################################
#### /Cuda Part ####
########################################
########################################
#### OMP Part ####
########################################
set(omp_deps gomp)
########################################
#### /OMP Part ####
########################################
set (sources
#cuda source files
${sources_gpu_cuda}
)
cuda_add_executable(${OUTPUT_NAME} ${sources} ${headers})
target_link_libraries (${OUTPUT_NAME} ${omp_deps})
The compile errors are of type:
/usr/local/cuda/include/thrust/system/omp/detail/for_each.inl(53): error: incomplete type is not allowed
detected during:
instantiation of "RandomAccessIterator thrust::system::omp::detail::for_each_n(thrust::system::omp::detail::execution_policy<DerivedPolicy> &, RandomAccessIterator, Size, UnaryFunction) [with DerivedPolicy=thrust::system::omp::detail::tag, RandomAccessIterator=thrust::device_ptr<int>, Size=unsigned long, UnaryFunction=thrust::detail::host_generate_functor<thrust::detail::fill_functor<int>>]"
/usr/local/cuda/include/thrust/detail/for_each.inl(69): here
instantiation of "InputIterator thrust::for_each_n(const thrust::detail::execution_policy_base<DerivedPolicy> &, InputIterator, Size, UnaryFunction) [with DerivedPolicy=thrust::system::omp::detail::tag, InputIterator=thrust::device_ptr<int>, Size=unsigned long, UnaryFunction=thrust::detail::host_generate_functor<thrust::detail::fill_functor<int>>]"
/usr/local/cuda/include/thrust/system/detail/generic/generate.inl(52): here
instantiation of "OutputIterator thrust::system::detail::generic::generate_n(thrust::execution_policy<ExecutionPolicy> &, OutputIterator, Size, Generator) [with ExecutionPolicy=thrust::system::omp::detail::tag, OutputIterator=thrust::device_ptr<int>, Size=unsigned long, Generator=thrust::detail::fill_functor<int>]"
/usr/local/cuda/include/thrust/detail/generate.inl(56): here
instantiation of "OutputIterator thrust::generate_n(const thrust::detail::execution_policy_base<DerivedPolicy> &, OutputIterator, Size, Generator) [with DerivedPolicy=thrust::system::omp::detail::tag, OutputIterator=thrust::device_ptr<int>, Size=unsigned long, Generator=thrust::detail::fill_functor<int>]"
/usr/local/cuda/include/thrust/system/detail/generic/fill.h(45): here
instantiation of "OutputIterator thrust::system::detail::generic::fill_n(thrust::execution_policy<DerivedPolicy> &, OutputIterator, Size, const T &) [with DerivedPolicy=thrust::system::omp::detail::tag, OutputIterator=thrust::device_ptr<int>, Size=unsigned long, T=int]"
/usr/local/cuda/include/thrust/detail/fill.inl(50): here
[ 6 instantiation contexts not shown ]
instantiation of "void thrust::detail::contiguous_storage<T, Alloc>::uninitialized_fill_n(thrust::detail::contiguous_storage<T, Alloc>::iterator, thrust::detail::contiguous_storage<T, Alloc>::size_type, const thrust::detail::contiguous_storage<T, Alloc>::value_type &) [with T=int, Alloc=thrust::device_malloc_allocator<int>]"
/usr/local/cuda/include/thrust/detail/vector_base.inl(164): here
instantiation of "void thrust::detail::vector_base<T, Alloc>::fill_init(thrust::detail::vector_base<T, Alloc>::size_type, const T &) [with T=int, Alloc=thrust::device_malloc_allocator<int>]"
/usr/local/cuda/include/thrust/detail/vector_base.inl(139): here
instantiation of "void thrust::detail::vector_base<T, Alloc>::init_dispatch(IteratorOrIntegralType, IteratorOrIntegralType, thrust::detail::true_type) [with T=int, Alloc=thrust::device_malloc_allocator<int>, IteratorOrIntegralType=int]"
/usr/local/cuda/include/thrust/detail/vector_base.inl(224): here
instantiation of "thrust::detail::vector_base<T, Alloc>::vector_base(InputIterator, InputIterator) [with T=int, Alloc=thrust::device_malloc_allocator<int>, InputIterator=int]"
/usr/local/cuda/include/thrust/device_vector.h(148): here
instantiation of "thrust::device_vector<T, Alloc>::device_vector(InputIterator, InputIterator) [with T=int, Alloc=thrust::device_malloc_allocator<int>, InputIterator=int]"
/usr/local/cuda/include/thrust/system/omp/detail/for_each.inl(53): error: incomplete type is not allowed
detected during:
instantiation of "RandomAccessIterator thrust::system::omp::detail::for_each_n(thrust::system::omp::detail::execution_policy<DerivedPolicy> &, RandomAccessIterator, Size, UnaryFunction) [with DerivedPolicy=thrust::system::omp::detail::tag, RandomAccessIterator=thrust::detail::normal_iterator<thrust::device_ptr<int>>, Size=long, UnaryFunction=cuda_hello]"
(89): here
instantiation of "RandomAccessIterator thrust::system::omp::detail::for_each(thrust::system::omp::detail::execution_policy<DerivedPolicy> &, RandomAccessIterator, RandomAccessIterator, UnaryFunction) [with DerivedPolicy=thrust::system::omp::detail::tag, RandomAccessIterator=thrust::detail::normal_iterator<thrust::device_ptr<int>>, UnaryFunction=cuda_hello]"
/usr/local/cuda/include/thrust/detail/for_each.inl(43): here
instantiation of "InputIterator thrust::for_each(const thrust::detail::execution_policy_base<DerivedPolicy> &, InputIterator, InputIterator, UnaryFunction) [with DerivedPolicy=thrust::system::omp::detail::tag, InputIterator=thrust::detail::normal_iterator<thrust::device_ptr<int>>, UnaryFunction=cuda_hello]"
/usr/local/cuda/include/thrust/detail/for_each.inl(57): here
instantiation of "InputIterator thrust::for_each(InputIterator, InputIterator, UnaryFunction) [with InputIterator=thrust::detail::normal_iterator<thrust::device_ptr<int>>, UnaryFunction=cuda_hello]"
2 errors detected in the compilation of "/tmp/tmpxft_00002d3a_00000000-6_Test.cpp1.ii".
CMake Error at ExecutableCmake_generated_Test.cu.o.cmake:264 (message):
Error generating file
make[2]: *** [CMakeFiles/ExecutableCmake.dir/./ExecutableCmake_generated_Test.cu.o] Erreur 1
make[1]: *** [CMakeFiles/ExecutableCmake.dir/all] Erreur 2
make: *** [all] Erreur 2
Those errors look exactly like the ones I get when I put cuda code in non .cu files, but I dont know cmake well enough to understand why this problem arise.
Thank you in advance for your help
It looks like you are missing a few flags for nvcc. Adding this worked for me:
list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fopenmp)

Interpret CUDA profiler log file

Here is the log file of running CUDA profiler (nvprof) on some codes, which have a mix of Thrust, cublas and curand. The first is a kernel I wrote, so no problem there. But I'm not sure how to interpret 2nd to 5th lines, which took up substantial run time.
> Time(%) Time Calls Avg Min Max Name % s ms ms ms
>
> 28.12 6.82 24,543.00 0.28 0.01 0.64 dev_update_dW1(doub....)
> 23.78 5.77 12,272.00 0.47 0.46 0.49 void thrust::system::cud....
> 14.32 3.47 12,272.00 0.28 0.28 0.29 void thrust::system::cud....
> 10.82 2.62 12,272.00 0.21 0.21 0.22 void thrust::system::cud....
> 4.93 1.20 24,544.00 0.05 0.05 0.05 void thrust::system::cud....
> 3.98 0.96 12,272.00 0.08 0.08 0.09 Act_dAct(double*, long, double*, double*)
The 2nd to 5th lines are printed below in full:
2nd line : void thrust::system::cuda::detail::detail::launch_closure_by_value>, thrust::counting_iterator<__int64, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, __int64, thrust::tuple, thrust::detail::normal_iterator, thrust::system::cuda::detail::tag, thrust::use_default, thrust::use_default>>, thrust::system::detail::generic::detail::max_element_reduction>, thrust::system::cuda::detail::detail::blocked_thread_array>>(double)
3rd line : void thrust::system::cuda::detail::detail::launch_closure_by_value>, thrust::detail::normal_iterator>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, unsigned int, thrust::detail::device_unary_transform_functor, thrust::system::cuda::detail::detail::blocked_thread_array>>(double)
4th line : void thrust::system::cuda::detail::detail::launch_closure_by_value>, double, thrust::use_default>, __int64, double, thrust::detail::normal_iterator>, thrust::plus, thrust::system::cuda::detail::detail::blocked_thread_array>>(exp_functor)
5th line : void thrust::system::cuda::detail::detail::launch_closure_by_value, unsigned int, thrust::detail::device_generate_functor>, thrust::system::cuda::detail::detail::blocked_thread_array>>(double)
EDIT :
I have this function (softmax) that uses max_element and transform_reduce
void Softmax_ThrustMatrix(thrust::device_vector<double>& mat, int Nrow, int Ncol, thrust::device_vector<double>& Outmat) {
thrust::device_vector<double> x(Ncol, 0.0);
thrust::device_vector<double> v(Ncol, 0.0);
thrust::device_vector<double>::iterator mx;
double tmp = 0.0, logsm=0.0;
dim3 grid, block;
block.x = 16;
block.y = 1;
grid.x = Ncol / block.x + 1;
grid.y = 1;
for ( int i=0; i < Nrow; i++ ) {
GetRow<<<grid,block>>>(thrust::raw_pointer_cast(&mat[0]), i, Nrow, Ncol, thrust::raw_pointer_cast(&x[0]));
mx = thrust::max_element(x.begin(), x.end());
tmp = thrust::transform_reduce(x.begin(), x.end(), exp_functor(*mx), 0.0, thrust::plus<double>() );
logsm = *mx + log(tmp);
thrust::transform(x.begin(), x.end(), v.begin(), exp_functor(logsm));
SetRow<<<grid,block>>>(thrust::raw_pointer_cast(&v[0]), i, Nrow, Ncol, thrust::raw_pointer_cast(&Outmat[0]));
}
}
Thrust code at a low level is not any different than CUDA code (at least for thrust code targetting a GPU). Thrust, as a template library, abstracts away many aspects of CUDA at the source code level, but the profiler doesn't know any difference between thrust code and ordinary cuda code.
Lines 2-5 represent the profiler data on 4 kernel launches. It's evident from their syntax that they are probably not kernels you wrote - they are emanating from within the depths of thrust template functions.
"Launch closure" is thrust-under-the-hood-speak for a kernel launched by thrust to perform some function. Since you have 3 thrust calls in the code you have shown, and are also showing GetRow and SetRow kernels that you wrote, and those kernels don't show up in your profiler output anywhere, it's not evident to me that the profiler output you have shown is related to the code you have shown. You haven't shown the code that calls the kernels that do appear in your output ( dev_update_dW1 and Act_dAct), so it seems fairly clear to me that the code you have shown is not useful for further interpretation of your profiler output.
In any event, lines 2-5 represent CUDA kernels, launched by thrust, that are emanating from thrust calls in your code (somewhere).
Note that it's also possible for thrust to launch kernels for some other non-obvious purposes, such as instantiation of device vectors.

Why is my inclusive scan code 2x faster on CPU than on a GPU?

I wrote a short CUDA program that uses the highly-optimized CUB library to demonstrate that one core from an old, quad-core Intel Q6600 processor (all four are supposedly capable of ~30 GFLOPS/sec) can do an inclusive scan (or cumulative/prefix sum if you rather) on 100,000 elements faster than an Nvidia 750 Ti (supposedly capable of 1306 GFLOPS/sec of single precision). Why is this the case?
The source code is:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cub/cub.cuh>
#include <stdio.h>
#include <time.h>
#include <algorithm>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
void fillArrayWithRandom(float* inputArray, int inputN)
{
for (int i = 0; i < inputN; i++)
{
inputArray[i] = (float)rand() / float(RAND_MAX);
}
}
void inclusiveSum_CPU(float *inputArray, float *inputSummedArray, int inputN)
{
for (int i = 0; i < inputN; i++)
{
if (i > 0)
{
inputSummedArray[i] = inputSummedArray[i - 1] + inputArray[i];
}
else
{
inputSummedArray[i] = inputArray[i];
}
}
}
int main()
{
int N = 100000; //1 hundred thousand elements
float numSimulations = 10000;
//Make Host Arrays
float* testArray_CPU = (float *)malloc(sizeof(float)*N);
fillArrayWithRandom(testArray_CPU, N);
float* testArrayOutput_CPU = (float *)malloc(sizeof(float)*N);
//Make GPU Arrays
float* testArray_GPU;
gpuErrchk(cudaMalloc(&testArray_GPU, N*sizeof(float)));
gpuErrchk(cudaMemcpy(testArray_GPU, testArray_CPU, N*sizeof(float), cudaMemcpyHostToDevice));
float* testArrayOutput_GPU;
gpuErrchk(cudaMalloc(&testArrayOutput_GPU, N*sizeof(float)));
//Initiate the benchmark variables
clock_t begin_CPU, end_CPU;
float time_spent_GPU, time_spent_CPU;
//GPU prep
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, testArray_GPU, testArrayOutput_GPU, N);
gpuErrchk(cudaMalloc(&d_temp_storage, temp_storage_bytes));
//GPU Timing
cudaEvent_t start, stop;
gpuErrchk(cudaEventCreate(&start));
gpuErrchk(cudaEventCreate(&stop));
gpuErrchk(cudaEventRecord(start, 0));
for (int i = 0; i < numSimulations; i++)
{
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, testArray_GPU, testArrayOutput_GPU, N);
}
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaEventRecord(stop, 0));
gpuErrchk(cudaEventSynchronize(stop));
gpuErrchk(cudaEventElapsedTime(&time_spent_GPU, start, stop));
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess)
{
printf("CUDA error: %s\n", cudaGetErrorString(error));
exit(-1);
}
time_spent_GPU = (float)(time_spent_GPU / 1000);
float avg_GPU = time_spent_GPU / numSimulations;
printf("Avg. GPU Simulation Time: %.17g [sim/sec]\n", avg_GPU);
//CPU Timing
begin_CPU = clock();
for (int i = 0; i < numSimulations; i++)
{
inclusiveSum_CPU(testArray_CPU, testArrayOutput_CPU, N);
}
end_CPU = clock();
time_spent_CPU = (float)(end_CPU - begin_CPU) / CLOCKS_PER_SEC;
float avg_CPU = time_spent_CPU / numSimulations;
printf("Avg. CPU Simulation Time: %.17g [sim/sec]\n", avg_CPU);
printf("GPU/CPU Timing:%.17gx \n", avg_GPU / avg_CPU);
return 0;
}
And the output when I run it on my machine is:
Avg. GPU Simulation Time: 0.0011999999405816197 [sim/sec]
Avg. CPU Simulation Time: 0.00059999997029080987 [sim/sec]
GPU/CPU Timing:2x
Also, here are my compiling flags and output:
1>------ Build started: Project: speedTest, Configuration: Debug Win32 ------
1> Compiling CUDA source file kernel.cu...
1>
1> C:\Users\Owner\Documents\Visual Studio 2013\Projects\speedTest\speedTest>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\nvcc.exe" -gencode=arch=compute_50,code=\"sm_50,compute_50\" --use-local-env --cl-version 2013 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin" -rdc=true -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o Debug\kernel.cu.obj "C:\Users\Owner\Documents\Visual Studio 2013\Projects\speedTest\speedTest\kernel.cu"
1> kernel.cu
1>
1> C:\Users\Owner\Documents\Visual Studio 2013\Projects\speedTest\speedTest>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\nvcc.exe" -dlink -o Debug\speedTest.device-link.obj -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\Win32" cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib -gencode=arch=compute_50,code=sm_50 -G --machine 32 Debug\kernel.cu.obj
1> cudart.lib
1> kernel32.lib
1> user32.lib
1> gdi32.lib
1> winspool.lib
1> comdlg32.lib
1> advapi32.lib
1> shell32.lib
1> ole32.lib
1> oleaut32.lib
1> uuid.lib
1> odbc32.lib
1> odbccp32.lib
1> kernel.cu.obj
1> speedTest.vcxproj -> C:\Users\Owner\Documents\Visual Studio 2013\Projects\speedTest\Debug\speedTest.exe
1> copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\cudart*.dll" "C:\Users\Owner\Documents\Visual Studio 2013\Projects\speedTest\Debug\"
1> C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\cudart32_65.dll
1> C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\cudart64_65.dll
1> 2 file(s) copied.
========== Build: 1 succeeded, 0 failed, 0 up-to-date, 0 skipped ==========
Thanks to Robert Crovella, it turns out I was using the "Debug" mode that is notoriously slow instead of "Release" mode.

Thrust: how to get the number of elements copied by the copy_if function when using device_ptr

I am using the thrust::copy_if function of the Thrust library coupled with counting iterators to get the indices of nonzero elements in an array. I also need to get the number of copied elements.
I am using the code from the 'counting_iterator.cu' example, except that in my application I need to reuse pre-allocated arrays, so I wrap them with thrust::device_ptr and then pass them to the thrust::copy_if function. This is the code:
using namespace thrust;
int output[5];
thrust::device_ptr<int> tp_output = device_pointer_cast(output);
float stencil[5];
stencil[0] = 0;
stencil[1] = 0;
stencil[2] = 1;
stencil[3] = 0;
stencil[4] = 1;
device_ptr<float> tp_stencil = device_pointer_cast(stencil);
device_vector<int>::iterator output_end = copy_if(make_counting_iterator<int>(0),
make_counting_iterator<int>(5),
tp_stencil,
tp_output,
_1 == 1);
int number_of_ones = output_end - tp_output;
If I comment the last line of code, the function fills correctly the output array. However, when I uncomment it, I get the following compilation error:
1>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include\thrust/iterator/iterator_adaptor.h(223): error : no operator "-" matches these operands
1> operand types are: int *const - const thrust::device_ptr
1> detected during:
1> instantiation of "thrust::iterator_adaptor::difference_type thrust::iterator_adaptor::distance_to(const thrust::iterator_adaptor &) const [with Derived=thrust::detail::normal_iterator>, Base=thrust::device_ptr, Value=thrust::use_default, System=thrust::use_default, Traversal=thrust::use_default, Reference=thrust::use_default, Difference=thrust::use_default, OtherDerived=thrust::device_ptr, OtherIterator=int *, V=signed int, S=thrust::device_system_tag, T=thrust::random_access_traversal_tag, R=thrust::device_reference, D=ptrdiff_t]"
1> C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include\thrust/iterator/iterator_facade.h(181): here
1> instantiation of "Facade1::difference_type thrust::iterator_core_access::distance_from(const Facade1 &, const Facade2 &, thrust::detail::true_type) [with Facade1=thrust::detail::normal_iterator>, Facade2=thrust::device_ptr]"
1> C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include\thrust/iterator/iterator_facade.h(202): here
1> instantiation of "thrust::detail::distance_from_result::type thrust::iterator_core_access::distance_from(const Facade1 &, const Facade2 &) [with Facade1=thrust::detail::normal_iterator>, Facade2=thrust::device_ptr]"
1> C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include\thrust/iterator/iterator_facade.h(506): here
1> instantiation of "thrust::detail::distance_from_result, thrust::iterator_facade>::type thrust::operator-(const thrust::iterator_facade &, const thrust::iterator_facade &) [with Derived1=thrust::detail::normal_iterator>, Value1=signed int, System1=thrust::device_system_tag, Traversal1=thrust::random_access_traversal_tag, Reference1=thrust::device_reference, Difference1=signed int, Derived2=thrust::device_ptr, Value2=signed int, System2=thrust::device_system_tag, Traversal2=thrust::random_access_traversal_tag, Reference2=thrust::device_reference, Difference2=signed int]"
1> C:/ProgramData/NVIDIA Corporation/CUDA Samples/v5.5/7_CUDALibraries/nsgaIIparallelo_23ott/rank_cuda.cu(70): here
If I use thrust::device_vector for the output array instead, everything is okay:
using namespace thrust;
thrust::device_vector<int> output(5);
float stencil[5];
stencil[0] = 0;
stencil[1] = 0;
stencil[2] = 1;
stencil[3] = 0;
stencil[4] = 1;
device_ptr<float> tp_stencil = device_pointer_cast(stencil);
device_vector<int>::iterator output_end = copy_if(make_counting_iterator<int>(0),
make_counting_iterator<int>(5),
tp_stencil,
output.begin(),
_1 == 1);
int number_of_ones = output_end - output.begin();
Can you suggest any solution to this problem? Thank you.
Try using device_ptr instead of device_vector::iterator in your copy_if call:
thrust::device_ptr<int> output_end = copy_if(make_counting_iterator<int>(0),
make_counting_iterator<int>(5),
tp_stencil,
tp_output,
_1 == 1);