Maximum value of thrust device_vector - cuda

I am trying to find the maximum value, and its location, of a thrust::device_vector.
The mechanism below can save the position of the maximum value; however, I couldn't get the max_val itself.
I have cout statements to track the execution order and where it crashes. It seems to crash on this line:
int max_val = *iter;
It shows this result (the digits below are my cout trace, so execution got as far as step 7):
terminate called after throwing an instance of 'thrust::system::system_error'
what(): invalid argument
1234567
Here is the code:
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/extrema.h>
#include <iostream>
#include <iomanip>
template <typename Vector>
void print_vector(const std::string& name, const Vector& v)
{
typedef typename Vector::value_type T;
std::cout << " " << std::setw(20) << name << " ";
thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, " "));
std::cout << std::endl;
}
int main()
{
std::cout<<"1";
thrust::host_vector<int>h_vec(5);
h_vec.push_back(10);
h_vec.push_back(11);
h_vec.push_back(12);
h_vec.push_back(13);
h_vec.push_back(14);
std::cout<<"2";
thrust::device_vector<int>d_vec(5);
std::cout<<"3";
thrust::copy_n(h_vec.begin(),5,d_vec.begin());
std::cout<<"4";
// print_vector("D_Vec",d_vec);
std::cout<<"5";
thrust::device_vector<int>::iterator iter=thrust::max(d_vec.begin(),d_vec.end());
std::cout<<"6";
unsigned int position = iter - d_vec.begin();
std::cout<<"7";
int max_val = *iter;
std::cout<<"8";
std::cout<<"Max Val= "<<14<<" #"<<position<< std::endl;
return 0;
}
Help, please. Also, if there is a better way to extract the maximum value and its position from a device_vector using the Thrust library, that would be much appreciated.

You're not using vectors correctly. push_back() adds an element onto the end of an existing vector. It's clear that you want to replace existing elements.
Also, the thrust algorithm you want is thrust::max_element, not thrust::max.
Here's a fully worked example with those issues fixed:
$ cat t1229.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/extrema.h>
#include <iostream>
#include <iomanip>
template <typename Vector>
void print_vector(const std::string& name, const Vector& v)
{
typedef typename Vector::value_type T;
std::cout << " " << std::setw(20) << name << " ";
thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, " "));
std::cout << std::endl;
}
int main()
{
std::cout<<"1" <<std::endl;
thrust::host_vector<int>h_vec(5);
h_vec[0] = 10;
h_vec[1] = 11;
h_vec[2] = 12;
h_vec[3] = 13;
h_vec[4] = 14;
std::cout<<"2" << std::endl;
thrust::device_vector<int>d_vec(5);
std::cout<<"3" << std::endl;
thrust::copy_n(h_vec.begin(),5,d_vec.begin());
std::cout<<"4" << std::endl;
// print_vector("D_Vec",d_vec);
std::cout<<"5" << std::endl;
thrust::device_vector<int>::iterator iter=thrust::max_element(d_vec.begin(),d_vec.end());
std::cout<<"6" << std::endl;
unsigned int position = iter - d_vec.begin();
std::cout<<"7" << std::endl;
int max_val = d_vec[position];
std::cout<<"8" << std::endl;
std::cout<<"Max Val= "<<max_val<<" #"<<position<< std::endl;
return 0;
}
$ nvcc -o t1229 t1229.cu
$ ./t1229
1
2
3
4
5
6
7
8
Max Val= 14 #4
$
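As an aside on the "better way" part of the question: thrust::max_element is the idiomatic answer, but it returns only an iterator, so reading both the value and the position costs an extra dereference (a second device-to-host transfer). If that matters, a zip iterator plus thrust::reduce can return both in a single pass. A minimal sketch of that approach, assuming int elements:

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/tuple.h>
#include <climits>
#include <iostream>

// Reduction operator: keep the (value, index) pair with the larger value.
struct larger_value
{
    __host__ __device__
    thrust::tuple<int,int> operator()(const thrust::tuple<int,int> &a,
                                      const thrust::tuple<int,int> &b) const
    {
        return (thrust::get<0>(a) >= thrust::get<0>(b)) ? a : b;
    }
};

int main()
{
    thrust::device_vector<int> d_vec(5);
    d_vec[0] = 10; d_vec[1] = 11; d_vec[2] = 12; d_vec[3] = 13; d_vec[4] = 14;
    // Pair each element with its index, then reduce by value.
    thrust::tuple<int,int> result = thrust::reduce(
        thrust::make_zip_iterator(thrust::make_tuple(
            d_vec.begin(), thrust::counting_iterator<int>(0))),
        thrust::make_zip_iterator(thrust::make_tuple(
            d_vec.end(), thrust::counting_iterator<int>((int)d_vec.size()))),
        thrust::make_tuple(INT_MIN, -1), larger_value());
    std::cout << "Max Val= " << thrust::get<0>(result)
              << " #" << thrust::get<1>(result) << std::endl;
    return 0;
}

(For equal maxima the index returned depends on reduction order, so it is not guaranteed to be the lowest one.)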

Related

Trouble using thrust complex(double) vector with sinus function

I have some problems using the sin function on a Thrust complex<double> vector, on both the device and the host: it seems the computation is being done in float.
With thrust::device_vector< thrust::complex<double> > and thrust::host_vector< thrust::complex<double> >, I obtain:
sin( 1+0i ) == (0.8414709568023682,0)
with std::complex<double>:
sin( 1+0i ) == (0.8414709848078965,0)
and std::complex<float> :
sin( 1+0i ) == (0.8414709568023682,0)
What mistake did I make in my code? For compilation I used
nvcc test.cu -o test
Here is the full code:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/complex.h>
#include <thrust/transform.h>  // for thrust::transform
#include <iostream>
#include <iomanip>
#include <complex>
#include <cmath>
template <typename Vector>
void Print(Vector &V){
for (int i=0;i<V.size();i++)
std::cout << V[i] << " ";
std::cout << "\n";
}
template <typename T>
struct sin_functor : public thrust::unary_function< T , T >
{
__host__ __device__
T operator()( T x) const
{
return sin( x );
}
};
template <typename Vector>
void ThrustComputation(){
typedef typename Vector::value_type Tvec;
Vector A(2);
A[0]=Tvec(1.,0.);A[1]=Tvec(1.,1.);
std::cout << "A: " << std::endl;
std::cout << " ";Print<Vector>(A);
Vector B(A.size());
thrust::transform(A.begin(),A.end(),B.begin(), sin_functor<Tvec>());
std::cout << "B =sin(A): " << std::endl;
std::cout << " ";Print<Vector>(B);
}
template <typename T>
void stdComputation(){
std::complex<T> sA[2];
sA[0]=std::complex<T>(1.,0.);
sA[1]=std::complex<T>(1.,1.);
std::cout << "sA: " << std::endl;
std::cout << " " << sA[0] << " " << sA[1] << std::endl;
std::cout << "sin(sA): " << std::endl;
std::cout << " " << sin(sA[0]) << " " << sin(sA[1]) << std::endl;
}
int main(int argc, char **argv)
{
std::cout << std::setprecision(16);
std::cout << "Thrust: Computation on GPU device (double)\n";
ThrustComputation<thrust::device_vector< thrust::complex<double> > >();
std::cout << "Thrust: Computation on host (double)\n";
ThrustComputation<thrust::host_vector< thrust::complex<double> > >();
std::cout << "std: Computation (double)\n";
stdComputation<double>();
std::cout << "std: Computation (float)\n";
stdComputation<float>();
return 0;
}
The output on my computer (Ubuntu 14.04 LTS, cuda 7.5) is :
Thrust: Computation on GPU device (double)
A:
(1,0) (1,1)
B =sin(A):
(0.8414709568023682,0) (1.298457622528076,0.6349639296531677)
Thrust: Computation on host (double)
A:
(1,0) (1,1)
B =sin(A):
(0.8414709568023682,0) (1.298457622528076,0.6349639296531677)
std: Computation (double)
sA:
(1,0) (1,1)
sin(sA):
(0.8414709848078965,0) (1.298457581415977,0.6349639147847361)
std: Computation (float)
sA:
(1,0) (1,1)
sin(sA):
(0.8414709568023682,0) (1.298457503318787,0.6349638700485229)
This would appear to be a genuine bug in the Thrust library. A quick scan of the code on GitHub led me to this, which might be the culprit. It seems that Thrust's double-precision csinh function, upon which complex sin relies, has an accidental intermediate cast to float, which is probably causing the loss of precision you have observed. As suggested in the comments, you should report this as a bug.
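Until that is fixed, a possible workaround (a sketch, assuming the problem really is confined to the csinh path) is to evaluate complex sin directly from the identity sin(x + iy) = sin(x)cosh(y) + i cos(x)sinh(y), which stays in double throughout. This is meant as a drop-in replacement for the sin_functor in the question's code:

// Hypothetical workaround functor: complex sin via real-valued double math.
template <typename T>
struct sin_workaround : public thrust::unary_function< thrust::complex<T>, thrust::complex<T> >
{
    __host__ __device__
    thrust::complex<T> operator()(thrust::complex<T> z) const
    {
        // sin(x + iy) = sin(x)*cosh(y) + i*cos(x)*sinh(y)
        return thrust::complex<T>(sin(z.real()) * cosh(z.imag()),
                                  cos(z.real()) * sinh(z.imag()));
    }
};

// usage, in place of sin_functor<Tvec>():
// thrust::transform(A.begin(), A.end(), B.begin(), sin_workaround<double>());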

CUDA Thrust copy transformed result only if it satisfies a predicate

I want to perform a transformation on an input thrust::device_vector and only copy the result to the output vector if the result satisfies a predicate, so the number of results could be less than the size of the input device_vector (similar to the output vector of thrust::copy_if). I have not found a way to do this with thrust::transform_if. Currently I can do this with thrust::transform and thrust::remove_if, as shown in the example below:
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <iostream>
__host__ __device__ unsigned int hash(unsigned int a) {
a = (a+0x7ed55d16) + (a<<12);
a = (a^0xc761c23c) ^ (a>>19);
a = (a+0x165667b1) + (a<<5);
a = (a+0xd3a2646c) ^ (a<<9);
a = (a+0xfd7046c5) + (a<<3);
a = (a^0xb55a4f09) ^ (a>>16);
return a;
};
struct add_random {
__host__ __device__ add_random() {}
__device__ int operator()(const int n, const int x) const {
thrust::default_random_engine rng(hash(n));
thrust::uniform_int_distribution<int> uniform(0, 11);
return uniform(rng)+x;
}
};
struct is_greater {
__host__ __device__ bool operator()(const int x) {
return x > 6 ;
}
};
int main(void) {
int x[5] = {10, 2, 5, 3, 0};
thrust::device_vector<int> d_x(x, x+5);
thrust::transform(
thrust::counting_iterator<int>(0),
thrust::counting_iterator<int>(5),
d_x.begin(),
d_x.begin(),
add_random());
std::cout << "after adding random number:" << std::endl;
std::ostream_iterator<int> o(std::cout, " ");
thrust::copy(d_x.begin(), d_x.end(), o);
std::cout << std::endl;
thrust::device_vector<int>::iterator new_end(thrust::remove_if(d_x.begin(), d_x.end(), is_greater()));
std::cout << "after removing values greater than 6:" << std::endl;
thrust::copy(d_x.begin(), new_end, o);
std::cout << std::endl;
return 0;
}
Which gives the output:
after adding random number:
18 4 8 7 11
after removing values greater than 6:
4
I would like to avoid copying the results to memory twice, first by thrust::transform and then by thrust::remove_if in the above example. Is it possible to get the above output with a single transformation function? How can I do this? My biggest concern is the computational cost, so any optimized solution, even if it doesn't use the Thrust library would be great.
Welcome to the world of thrust fancy iterators. You can get a quick overview of some fancy iterator types by looking at the thrust quick start guide. In particular, a thrust transform iterator can frequently be used to replace a thrust transform operation that is applied to the input of another thrust algorithm, "fusing" the two algorithms into a single operation.
Here's a worked example applied to your case:
$ cat t1254.cu
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <thrust/copy.h>     // for thrust::copy_if
#include <iostream>
#include <iterator>          // for std::ostream_iterator
__host__ __device__ unsigned int hash(unsigned int a) {
a = (a+0x7ed55d16) + (a<<12);
a = (a^0xc761c23c) ^ (a>>19);
a = (a+0x165667b1) + (a<<5);
a = (a+0xd3a2646c) ^ (a<<9);
a = (a+0xfd7046c5) + (a<<3);
a = (a^0xb55a4f09) ^ (a>>16);
return a;
};
struct add_random : public thrust::unary_function<thrust::tuple<int, int>, int> {
__host__ __device__ int operator()(thrust::tuple<int, int> t) const {
int n = thrust::get<0>(t);
int x = thrust::get<1>(t);
thrust::default_random_engine rng(hash(n));
thrust::uniform_int_distribution<int> uniform(0, 11);
return uniform(rng)+x;
}
};
struct is_greater {
// Note: copy_if keeps the elements for which the predicate returns true,
// so the test here is inverted relative to the remove_if version (x > 6) above.
__host__ __device__ bool operator()(const int x) {
return x < 6 ;
}
};
int main(void) {
int x[5] = {10, 2, 5, 3, 0};
thrust::device_vector<int> d_x(x, x+5);
thrust::device_vector<int> d_r(5);
int rsize = thrust::copy_if(
              thrust::make_transform_iterator(
                thrust::make_zip_iterator(thrust::make_tuple(
                  thrust::counting_iterator<int>(0), d_x.begin())),
                add_random()),
              thrust::make_transform_iterator(
                thrust::make_zip_iterator(thrust::make_tuple(
                  thrust::counting_iterator<int>(5), d_x.end())),
                add_random()),
              d_r.begin(), is_greater()) - d_r.begin();
std::cout << "after removing values greater than 6:" << std::endl;
thrust::copy_n(d_r.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
return 0;
}
$ nvcc -o t1254 t1254.cu
$ ./t1254
after removing values greater than 6:
4
$
We've replaced your transform operation with a transform iterator applied to the same two inputs. Since you have two inputs to your transform operation, we're using a zip iterator to combine these, and the transform functor has also been reworked slightly to accept that tuple as its input.
We've also converted your remove_if to a copy_if, to work with the transform iterator as input. This requires a slight inversion in the logic of the copy predicate, since copy_if keeps the elements for which the predicate returns true.
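One further note: the example above allocates d_r at the full input size because the number of surviving elements isn't known in advance. If memory matters, the same fused iterator can be passed through thrust::count_if first to size the output exactly. A sketch of that sizing pass (assuming C++11 auto, since the fused iterator type is unwieldy to spell out):

#include <thrust/count.h>   // for thrust::count_if

// Sizing pass: evaluate the fused transform once just to count survivors.
auto first = thrust::make_transform_iterator(
                 thrust::make_zip_iterator(thrust::make_tuple(
                     thrust::counting_iterator<int>(0), d_x.begin())),
                 add_random());
int rsize = thrust::count_if(first, first + d_x.size(), is_greater());
thrust::device_vector<int> d_r(rsize);   // exactly-sized output

The trade-off is that the transform functor is evaluated twice (once to count, once to copy), so this only pays off when the functor is cheap relative to the memory saved.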

Why does computing L2 norm with cuBLAS result in an error?

Edit 2: included a more complete program.
Edit 1: included the full program.
I'm trying to compute the L2 norm of a vector using cuBLAS. My code is as follows
void GPU_Print_Matrix(real_t *A, int nrows, int ncols) {
real_t *hostA = (real_t*)malloc(nrows*ncols * sizeof(real_t));
CUDA_SAFE_CALL(cudaMemcpy(hostA, A, nrows*ncols * sizeof(real_t), cudaMemcpyDeviceToHost));
cout << "GPU Matrix of Size: " << nrows << "x" << ncols << endl;
for (int i = 0; i < nrows; ++i) {
for (int j = 0; j < ncols; ++j) {
cout << fixed << setprecision(PRINT_PRECISION) << hostA[j*nrows + i] << " ";
}
cout << endl;
}
free(hostA);
cout << endl;
}
void GPU_Random_Vector(thrust::device_vector <real_t> &vec) {
thrust::counting_iterator<unsigned int> index_sequence_begin(rand());
thrust::transform(index_sequence_begin, index_sequence_begin + vec.size(), vec.begin(), RANDOM(-initRange, initRange));
}
int main(int argc, char *argv[]) {
srand(clock());
cout << "# Running NMT" << endl;
//ParseOpts(argc, argv);
cublasHandle_t handle;
CUBLAS_SAFE_CALL(cublasCreate(&handle));
thrust::device_vector <real_t> x(10);
GPU_Random_Vector(x);
GPU_Print_Matrix(thrust::raw_pointer_cast(&x[0]), 10, 1);
real_t nrm = 0;
CUBLAS_SAFE_CALL(cublasXnrm2(handle, 10, thrust::raw_pointer_cast(&x[0]), 1, &nrm));
cout << "nrm2 = " << nrm << endl;
}
Here, CUBLAS_SAFE_CALL is defined as follows
#define CUBLAS_SAFE_CALL(call) \
{ \
const cublasStatus_t stat = call; \
if (stat != CUBLAS_STATUS_SUCCESS) { \
cout << "cuBlas Error: " << __FILE__ << ":" << __LINE__ << endl; \
cout << " Code: " << stat << endl; \
exit(1); \
} \
}
GPU_Random_Vector and GPU_Print_Matrix have been confirmed to work before. Also, cublasHandle[singleGPU] has been initialized before being called. When I ran the program, I got the following output:
// GPU_Print_Matrix
GPU Matrix of Size: 10x1
0.0652332678
0.0747700930
0.0274266358
-0.0885794610
-0.0192640368
-0.0942506194
0.0283640027
-0.0411146656
-0.0460337885
-0.0970785618
cuBlas Error: nmt.cu:2252
Code: 14
What is going on? And is there any reference for how can I interpret the error number of cuBLAS? Thanks a ton.
CUBLAS error 14 is CUBLAS_STATUS_INTERNAL_ERROR and would usually mean that the internal device to host copy at the end of the L2 norm call failed. But why that happened is impossible to say without some context about what else your code was doing.
If the code you posted is assembled and fleshed out into a complete demo case (with the trivial random number seeding mistake corrected) like this:
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <ctime>    // for std::time, used to seed std::srand
#include <cublas_v2.h>
#include <thrust/transform.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
typedef float real_t;
#define CUBLAS_SAFE_CALL(call) \
{ \
const cublasStatus_t stat = call; \
if (stat != CUBLAS_STATUS_SUCCESS) { \
std::cout << "cuBlas Error: " << __FILE__ << ":" << __LINE__ << std::endl; \
std::cout << " Code: " << stat << std::endl; \
exit(1); \
} \
}
#define PRINT_PRECISION (6)
struct RANDOM
{
real_t a, b;
__host__ __device__
RANDOM(real_t _a=0, real_t _b=1) : a(_a), b(_b) {};
__host__ __device__
real_t operator()(const unsigned int n) const
{
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> dist(a, b);
rng.discard(n);
return dist(rng);
}
};
void GPU_Print_Matrix(real_t *A, int nrows, int ncols) {
real_t *hostA = (real_t*)malloc(nrows*ncols * sizeof(real_t));
cudaMemcpy(hostA, A, nrows*ncols * sizeof(real_t), cudaMemcpyDeviceToHost);
std::cout << "GPU Matrix of Size: " << nrows << "x" << ncols << std::endl;
for (int i = 0; i < nrows; ++i) {
for (int j = 0; j < ncols; ++j) {
std::cout << std::fixed << std::setprecision(PRINT_PRECISION) << hostA[j*nrows + i] << " ";
}
std::cout << std::endl;
}
free(hostA);
std::cout << std::endl;
}
void GPU_Random_Vector(thrust::device_vector <real_t> &vec) {
const real_t initRange = 10;
thrust::counting_iterator<unsigned int> index_sequence_begin(std::rand());
thrust::transform(index_sequence_begin, index_sequence_begin + vec.size(), vec.begin(), RANDOM(-initRange, initRange));
}
int main(int argc, char *argv[]) {
std::srand(std::time(0));
std::cout << "# Running NMT" << std::endl;
cublasHandle_t handle;
CUBLAS_SAFE_CALL(cublasCreate(&handle));
thrust::device_vector <real_t> x(10);
GPU_Random_Vector(x);
GPU_Print_Matrix(thrust::raw_pointer_cast(&x[0]), 10, 1);
real_t nrm = 0;
CUBLAS_SAFE_CALL(cublasSnrm2(handle, 10, thrust::raw_pointer_cast(&x[0]), 1, &nrm));
std::cout << "nrm2 = " << nrm << std::endl;
}
and compiled and run like this (CUDA 6.5 if that matters):
>nvcc -arch=sm_21 -run runkkari.cu -lcublas
runkkari.cu
Creating library a.lib and object a.exp
# Running NMT
GPU Matrix of Size: 10x1
-5.712992
8.181723
-0.086308
-6.177320
-5.442665
-2.889552
-1.555665
6.506872
-6.800190
8.024273
nrm2 = 18.196394
It works as expected. You should be able to compile and run this to confirm this yourself. So from this we can only conclude that you have another problem which you have failed to describe. But perhaps this helps to narrow down the list of possibilities.
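On the side question of interpreting cuBLAS error numbers: they are the cublasStatus_t enumerators declared in the cuBLAS headers (14 is CUBLAS_STATUS_INTERNAL_ERROR, as noted above). A small helper along the lines of this sketch lets the SAFE_CALL macro print names instead of raw numbers; recent toolkits also provide cublasGetStatusString() for the same purpose:

// Minimal status-to-name helper, based on the cublasStatus_t enumerators
// declared in cublas_v2.h.
const char* cublasStatusName(cublasStatus_t s) {
    switch (s) {
        case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";
        case CUBLAS_STATUS_NOT_SUPPORTED:    return "CUBLAS_STATUS_NOT_SUPPORTED";
        default:                             return "unknown cublasStatus_t";
    }
}

The macro's error branch could then print cublasStatusName(stat) alongside the numeric code.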

Getting ifstream from istream and calling functions via overloaded >> operator

I've created an object, PDBParser, to extract information from a PDB file. Now I am trying to overload the >> and << operators so that I can use them from main like so:
inFile >> MyPDBParser;
outfile << MyPDBParser;
I've got the << operator all set, but I can't seem to get the >> operator to work properly.
Here is the .h file for the PDBParser class to give you a better idea of what's going on:
#include <iostream>
#include <cstdlib>
#include "FloatArray.h"
#include "IntArray.h"
#include "Atom.h"
#include "AtomArray.h"
using namespace std;
class PDBParser
{
friend ostream& operator<<(ostream& _ostream, PDBParser &rhs);
friend istream& operator>>(istream& _istream, PDBParser &rhs);
public:
PDBParser();
PDBParser(string atom1, string atom2, int separation);
PDBParser(const PDBParser& orig);
virtual ~PDBParser();
void grabAtoms(ifstream &infile);
void findAtoms();
void setAtom1(string rhs);
void setAtom2(string rhs);
void setSeparation(int rhs);
string getAtom1();
string getAtom2();
int getSeparation();
private:
string atom1s;
string atom2s;
int separation;
AtomArray *atoms1;
AtomArray *atoms2;
AtomArray *matches1;
AtomArray *matches2;
FloatArray *x1;
FloatArray *y1;
FloatArray *z1;
FloatArray *x2;
FloatArray *y2;
FloatArray *z2;
IntArray *allsequence;
ifstream backupinfile;
void trim(string &rhs);
void incrementArrays(int newElements);
};
Essentially, what I need the >> operator to do is get the infile from the istream and then call the grabAtoms(infile) and findAtoms() functions for this instance of the PDBParser object.
Here's what I have now, which doesn't work. Please forgive the commented lines; they were things I was attempting to make work. I tried adding the backupinfile member to the PDBParser class just to make things work, so normally the class doesn't have it and doesn't use it.
istream & operator>>(istream & _istream, PDBParser &rhs)
{
// ifstream in;
// _istream >> rhs.grabAtoms(in) >> rhs.findAtoms();
_istream >> rhs.backupinfile;
rhs.grabAtoms(rhs.backupinfile);
rhs.findAtoms();
return _istream;
}
I've established that the issue here is that my function needs to receive an ifstream object, and I can't figure out how to get that from the istream object.
Here's my working << overload just for the heck of it:
ostream & operator<<(ostream & _ostream, PDBParser &rhs)
{
for(int i=0; i < rhs.x1->getSize(); i++)
{
_ostream.precision(3);
_ostream << fixed;
_ostream << setprecision (3) << rhs.x1->get(i) << " ";
_ostream << setprecision (3) << rhs.y1->get(i) << " ";
_ostream << setprecision (3) << rhs.z1->get(i) << " ";
_ostream << setprecision (3) << rhs.x2->get(i) << " ";
_ostream << setprecision (3) << rhs.y2->get(i) << " ";
_ostream << setprecision (3) << rhs.z2->get(i) << endl;
}
return _ostream;
}
Thanks
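One observation, sketched rather than tested: std::ifstream derives from std::istream, so there is no need to recover an ifstream from the istream at all. If grabAtoms is re-declared to take a std::istream& (an ifstream argument binds to such a reference automatically), the extraction operator can simply forward the stream it was given:

// Assumes grabAtoms is re-declared as: void grabAtoms(istream &infile);
istream & operator>>(istream & _istream, PDBParser &rhs)
{
    rhs.grabAtoms(_istream);  // works whether the caller passed an ifstream or any other istream
    rhs.findAtoms();
    return _istream;
}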

Best way to separate parallel MPI part in sequential program

I have a huge sequential program in which I want to parallelize some algorithms with MPI and CUDA. How do I correctly separate the sequential part from the parallel part? The problem lies in the nesting of the parallel algorithm, and also in the use of SLURM or LoadLeveler (e.g. on my MPI cluster I can't write something like: mpirun -n 1 a.out: -n 2 b.out).
Example:
int main()
{
funcA();
}
void funcA()
{
funcB();
}
void funcB()
{
parallel algo starts here....
}
I've found a good solution to this problem: run the whole program under MPI, with rank 0 acting as the sequential driver and the other ranks waiting for commands over messages. This is sample code:
#include <iostream>
#include <mpi.h>
#include <unistd.h>
using namespace std;
int main(int argc, char** argv) {
MPI_Init(&argc, &argv);
int r;
MPI_Comm_rank(MPI_COMM_WORLD, &r);
if (r == 0) {
cout << "[GUI]Start perfoming initialization...." << endl;
sleep(2);
cout << "[GUI]Send command to start execution...." << endl;
int command = 1;
//TODO: now it's hardcoded to send data to 1 proc
MPI_Send(&command, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
cout << "[GUI]Waiting for execution results..." << endl;
int buf[5];
MPI_Recv(&buf, 5, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
for (int i=0; i<5; i++)
{
cout << "buf["<< i << "] = " << buf[i] << endl;
}
} else {
int command;
MPI_Recv(&command, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
cout << "Received command: " << command << endl;
if (command == 1) {
cout << "[ALGO]Receive command to start execution" << endl;
sleep(2);
cout << "[ALGO]Send computed data..." << endl;
int buf[5] = {5,4,3,2,1};
MPI_Send(&buf, 5, MPI_INT, 0, 0, MPI_COMM_WORLD);
}
}
MPI_Finalize();
return 0;
}
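For completeness, a sketch of how this would be built and launched with two ranks (the source file name here is hypothetical, and the exact launcher invocation varies with the MPI installation and the scheduler):

$ mpicxx gui_algo.cpp -o gui_algo
$ mpirun -np 2 ./gui_algo

Rank 0 then plays the sequential "GUI" role while rank 1 runs the parallel algorithm, so the sequential driver and the nested parallel part live in a single binary under one mpirun, which also sidesteps the MPMD launch restriction mentioned above.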