I have a problem using the sin function on a Thrust complex<double> vector, on both the device and the host: it seems as if the computation is done in float.
With thrust::device_vector< thrust::complex<double> > and thrust::host_vector< thrust::complex<double> >, I obtain:
sin( 1+0i ) == (0.8414709568023682,0)
with std::complex<double>:
sin( 1+0i ) == (0.8414709848078965,0)
and with std::complex<float>:
sin( 1+0i ) == (0.8414709568023682,0)
What mistake did I make in my code? To compile I used
nvcc test.cu -o test
Here is the full code:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/complex.h>
#include <iostream>
#include <iomanip>
#include <complex>
#include <cmath>

template <typename Vector>
void Print(Vector &V) {
    for (int i = 0; i < V.size(); i++)
        std::cout << V[i] << " ";
    std::cout << "\n";
}

template <typename T>
struct sin_functor : public thrust::unary_function<T, T>
{
    __host__ __device__
    T operator()(T x) const
    {
        return sin(x);
    }
};

template <typename Vector>
void ThrustComputation() {
    typedef typename Vector::value_type Tvec;
    Vector A(2);
    A[0] = Tvec(1., 0.); A[1] = Tvec(1., 1.);
    std::cout << "A: " << std::endl;
    std::cout << " "; Print<Vector>(A);
    Vector B(A.size());
    thrust::transform(A.begin(), A.end(), B.begin(), sin_functor<Tvec>());
    std::cout << "B =sin(A): " << std::endl;
    std::cout << " "; Print<Vector>(B);
}

template <typename T>
void stdComputation() {
    std::complex<T> sA[2];
    sA[0] = std::complex<T>(1., 0.);
    sA[1] = std::complex<T>(1., 1.);
    std::cout << "sA: " << std::endl;
    std::cout << " " << sA[0] << " " << sA[1] << std::endl;
    std::cout << "sin(sA): " << std::endl;
    std::cout << " " << sin(sA[0]) << " " << sin(sA[1]) << std::endl;
}

int main(int argc, char **argv)
{
    std::cout << std::setprecision(16);
    std::cout << "Thrust: Computation on GPU device (double)\n";
    ThrustComputation<thrust::device_vector< thrust::complex<double> > >();
    std::cout << "Thrust: Computation on host (double)\n";
    ThrustComputation<thrust::host_vector< thrust::complex<double> > >();
    std::cout << "std: Computation (double)\n";
    stdComputation<double>();
    std::cout << "std: Computation (float)\n";
    stdComputation<float>();
    return 0;
}
The output on my computer (Ubuntu 14.04 LTS, cuda 7.5) is :
Thrust: Computation on GPU device (double)
A:
(1,0) (1,1)
B =sin(A):
(0.8414709568023682,0) (1.298457622528076,0.6349639296531677)
Thrust: Computation on host (double)
A:
(1,0) (1,1)
B =sin(A):
(0.8414709568023682,0) (1.298457622528076,0.6349639296531677)
std: Computation (double)
sA:
(1,0) (1,1)
sin(sA):
(0.8414709848078965,0) (1.298457581415977,0.6349639147847361)
std: Computation (float)
sA:
(1,0) (1,1)
sin(sA):
(0.8414709568023682,0) (1.298457503318787,0.6349638700485229)
This would appear to be a genuine bug in the thrust library. A quick scan of the code on github led me to this, which might be the culprit. It seems that thrust's double precision csinh function, upon which complex sin relies, has an accidental intermediate cast to float, which is probably causing the loss of precision you have observed. As suggested in comments, you should report this as a bug.
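Until that is fixed upstream, a possible workaround (my own sketch, not an official fix) is to bypass thrust's csinh path and evaluate the identity sin(x + iy) = sin(x)cosh(y) + i cos(x)sinh(y) directly in the working precision:

template <typename T>
struct sin_workaround_functor
{
    // Evaluates complex sin entirely in T precision via
    // sin(x + iy) = sin(x)cosh(y) + i*cos(x)sinh(y),
    // sidestepping thrust's csinh implementation.
    __host__ __device__
    thrust::complex<T> operator()(const thrust::complex<T>& z) const
    {
        return thrust::complex<T>(sin(z.real()) * cosh(z.imag()),
                                  cos(z.real()) * sinh(z.imag()));
    }
};

Substituting this for sin_functor in the transform call above should reproduce the std::complex<double> results.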
I am trying to find the maximum value, and its location, in a thrust::device_vector.
The mechanism below can save the position of the maximum value; however, I couldn't retrieve max_val itself.
I have cout statements to track the running order and where it crashes. It seems to crash on this line:
int max_val = *iter;
It shows this result:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): invalid argument
1234567
Here is the code:
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/extrema.h>
#include <iostream>
#include <iomanip>

template <typename Vector>
void print_vector(const std::string& name, const Vector& v)
{
    typedef typename Vector::value_type T;
    std::cout << " " << std::setw(20) << name << " ";
    thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, " "));
    std::cout << std::endl;
}

int main()
{
    std::cout << "1";
    thrust::host_vector<int> h_vec(5);
    h_vec.push_back(10);
    h_vec.push_back(11);
    h_vec.push_back(12);
    h_vec.push_back(13);
    h_vec.push_back(14);
    std::cout << "2";
    thrust::device_vector<int> d_vec(5);
    std::cout << "3";
    thrust::copy_n(h_vec.begin(), 5, d_vec.begin());
    std::cout << "4";
    // print_vector("D_Vec", d_vec);
    std::cout << "5";
    thrust::device_vector<int>::iterator iter = thrust::max(d_vec.begin(), d_vec.end());
    std::cout << "6";
    unsigned int position = iter - d_vec.begin();
    std::cout << "7";
    int max_val = *iter;
    std::cout << "8";
    std::cout << "Max Val= " << 14 << " #" << position << std::endl;
    return 0;
}
Help, please. Also, if there is a better way to extract the maximum value and its position from a device_vector using the Thrust library, it would be much appreciated.
You're not using vectors correctly. push_back() appends an element to the end of an existing vector; it's clear that you want to replace the existing elements instead.
Also, the thrust algorithm you want is thrust::max_element, not thrust::max. thrust::max simply returns the larger of its two arguments, so thrust::max(d_vec.begin(), d_vec.end()) compiles but yields the iterator d_vec.end(), and dereferencing that reads past the end of the vector, which is the likely source of your system_error.
Here's a fully worked code with those issues fixed:
$ cat t1229.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/extrema.h>
#include <iostream>
#include <iomanip>

template <typename Vector>
void print_vector(const std::string& name, const Vector& v)
{
    typedef typename Vector::value_type T;
    std::cout << " " << std::setw(20) << name << " ";
    thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, " "));
    std::cout << std::endl;
}

int main()
{
    std::cout << "1" << std::endl;
    thrust::host_vector<int> h_vec(5);
    h_vec[0] = 10;
    h_vec[1] = 11;
    h_vec[2] = 12;
    h_vec[3] = 13;
    h_vec[4] = 14;
    std::cout << "2" << std::endl;
    thrust::device_vector<int> d_vec(5);
    std::cout << "3" << std::endl;
    thrust::copy_n(h_vec.begin(), 5, d_vec.begin());
    std::cout << "4" << std::endl;
    // print_vector("D_Vec", d_vec);
    std::cout << "5" << std::endl;
    thrust::device_vector<int>::iterator iter = thrust::max_element(d_vec.begin(), d_vec.end());
    std::cout << "6" << std::endl;
    unsigned int position = iter - d_vec.begin();
    std::cout << "7" << std::endl;
    int max_val = d_vec[position];
    std::cout << "8" << std::endl;
    std::cout << "Max Val= " << max_val << " #" << position << std::endl;
    return 0;
}
$ nvcc -o t1229 t1229.cu
$ ./t1229
1
2
3
4
5
6
7
8
Max Val= 14 #4
$
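As for a better way: max_element plus an iterator difference is already about as direct as it gets in thrust. A compact variant (my own sketch of the same approach) looks like this:

// Find the largest element and its index in the device vector.
thrust::device_vector<int>::iterator iter = thrust::max_element(d_vec.begin(), d_vec.end());
int position = thrust::distance(d_vec.begin(), iter);  // index of the maximum
int max_val = *iter;  // a single device-to-host copy; safe now that iter is valid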
In my code I use arrays of complex numbers from the thrust library, and I would like to use cublasZgeam() in order to transpose the array.
Using complex numbers from cuComplex.h is not a preferable option, since I do a lot of arithmetic on the array and cuComplex doesn't have operators such as * and += defined.
This is how I defined the array which I want to transpose:
thrust::complex<float> u[xmax][xmax];
I have found this https://github.com/jtravs/cuda_complex, but using it as such:
#include "cuComplex.hpp"
doesn't allow me to use the mentioned operators when compiled with nvcc:
error: no operator "+=" matches these operands
operand types are: cuComplex += cuComplex
Is there some solution to this? The code from GitHub is old, so the issue may lie there, or maybe I am using it wrong.
EDIT: Here is code that works; the only difference from talonmies' code is the addition of a simple kernel and a pointer to the same data reinterpreted as thrust::complex:
#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>

using namespace std;

__global__ void test(thrust::complex<double>* u) {
    u[0] += thrust::complex<double>(3.3, 3.3);
}

int main()
{
    int xmax = 100;
    thrust::complex<double> u[xmax][xmax];
    double arrSize = sizeof(thrust::complex<double>) * xmax * xmax;
    thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0, 1.0));
    u[49][51] += thrust::complex<double>(665.0, 665.0);
    u[51][49] *= 2.0;
    cout << "Before:" << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;
    cout << u[0][0] << endl;
    thrust::complex<double> alpha(1.0, 0.0);
    thrust::complex<double> beta(0.0, 0.0);
    cublasHandle_t handle;
    cublasCreate(&handle);
    cuDoubleComplex* d_u;
    cuDoubleComplex* d_v;
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
    cudaMalloc(&d_u, arrSize);
    cudaMalloc(&d_v, arrSize);
    cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
    thrust::complex<double>* d_vTest = reinterpret_cast<thrust::complex<double>*>(d_v);
    cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                _alpha, d_u, xmax,
                _beta, d_u, xmax,
                d_v, xmax);
    test<<<1,1>>>(d_vTest);
    cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
    cout << "After:" << endl;
    cout << u[0][0] << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;
    return 0;
}
Despite your protestations to the contrary, the C++ standard library complex (or thrust::complex) most certainly does work with CUBLAS. cuComplex and cuDoubleComplex are designed to be binary compatible with the standard host complex types, so that data does not need to be translated when passed to CUBLAS functions that use complex data on the device.
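As a quick compile-time sanity check of that layout compatibility (my own addition, assuming only the standard headers and C++11), the sizes can be verified with static_assert:

#include <complex>
#include <thrust/complex.h>
#include <cuComplex.h>

// If either of these fails to compile, the reinterpret_casts used below would be unsafe.
static_assert(sizeof(std::complex<double>) == sizeof(cuDoubleComplex),
              "std::complex<double> does not match cuDoubleComplex layout");
static_assert(sizeof(thrust::complex<double>) == sizeof(cuDoubleComplex),
              "thrust::complex<double> does not match cuDoubleComplex layout");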
A simple modification to the code you posted in comments works exactly as you might imagine:
#include <algorithm>
#include <iostream>
#include <complex>
#include <cublas_v2.h>

using namespace std;

int main()
{
    int xmax = 100;
    complex<double> u[xmax][xmax];
    size_t arrSize = sizeof(complex<double>) * xmax * xmax;
    fill(&u[0][0], &u[0][0] + (xmax * xmax), complex<double>(1.0, 1.0));
    u[49][51] += complex<double>(665.0, 665.0);
    u[51][49] *= 2.0;
    cout << "Before:" << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;
    complex<double> alpha(1.0, 0.0);
    complex<double> beta(0.0, 0.0);
    cublasHandle_t handle;
    cublasCreate(&handle);
    cuDoubleComplex* d_u;
    cuDoubleComplex* d_v;
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
    cudaMalloc(&d_u, arrSize);
    cudaMalloc(&d_v, arrSize);
    cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
    cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                _alpha, d_u, xmax,
                _beta, d_u, xmax,
                d_v, xmax);
    cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
    cout << "After:" << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;
    return 0;
}
built and run like so:
~/SO$ nvcc -std=c++11 -arch=sm_52 -o complex_transpose complex_transpose.cu -lcublas
~/SO$ ./complex_transpose
Before:
(666,666)
(2,2)
After:
(2,2)
(666,666)
The only modifications required are explicit casts of the std::complex<double> types to cuDoubleComplex. Do that and everything works as expected.
Using thrust instead, the code looks almost identical:
#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>

using namespace std;

int main()
{
    int xmax = 100;
    thrust::complex<double> u[xmax][xmax];
    size_t arrSize = sizeof(thrust::complex<double>) * xmax * xmax;
    thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0, 1.0));
    u[49][51] += thrust::complex<double>(665.0, 665.0);
    u[51][49] *= 2.0;
    cout << "Before:" << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;
    thrust::complex<double> alpha(1.0, 0.0);
    thrust::complex<double> beta(0.0, 0.0);
    cublasHandle_t handle;
    cublasCreate(&handle);
    cuDoubleComplex* d_u;
    cuDoubleComplex* d_v;
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
    cudaMalloc(&d_u, arrSize);
    cudaMalloc(&d_v, arrSize);
    cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
    cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                _alpha, d_u, xmax,
                _beta, d_u, xmax,
                d_v, xmax);
    cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
    cout << "After:" << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;
    return 0;
}
Perhaps something closer to your use case, using thrust device containers with a kernel performing some initialisation prior to a CUBLAS call:
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/complex.h>
#include <thrust/execution_policy.h>
#include <thrust/copy.h>
#include <cublas_v2.h>

__global__ void setup_kernel(thrust::complex<double>* u, int xmax)
{
    u[51 + 49*xmax] += thrust::complex<double>(665.0, 665.0);
    u[49 + 51*xmax] *= 2.0;
}

int main()
{
    int xmax = 100;

    thrust::complex<double> alpha(1.0, 0.0);
    thrust::complex<double> beta(0.0, 0.0);
    cublasHandle_t handle;
    cublasCreate(&handle);

    thrust::device_vector<thrust::complex<double>> d_u(xmax * xmax, thrust::complex<double>(1.0, 1.0));
    thrust::device_vector<thrust::complex<double>> d_v(xmax * xmax, thrust::complex<double>(0., 0.));
    setup_kernel<<<1,1>>>(thrust::raw_pointer_cast(d_u.data()), xmax);

    cuDoubleComplex* _d_u = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_u.data()));
    cuDoubleComplex* _d_v = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_v.data()));
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);

    cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                _alpha, _d_u, xmax,
                _beta, _d_u, xmax,
                _d_v, xmax);

    thrust::complex<double> u[xmax][xmax];
    thrust::copy(d_u.begin(), d_u.end(), &u[0][0]);
    std::cout << "Before:" << std::endl;
    std::cout << u[49][51] << std::endl;
    std::cout << u[51][49] << std::endl;

    thrust::copy(d_v.begin(), d_v.end(), &u[0][0]);
    std::cout << "After:" << std::endl;
    std::cout << u[49][51] << std::endl;
    std::cout << u[51][49] << std::endl;
    return 0;
}
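Since the pointer reinterpretation recurs, it can be tidied into a small helper; this is my own sketch (the name as_cublas is hypothetical), not part of the original answer:

// Hypothetical helper: view a thrust::complex<double> pointer as the
// binary-compatible cuDoubleComplex* that CUBLAS expects.
inline cuDoubleComplex* as_cublas(thrust::complex<double>* p)
{
    return reinterpret_cast<cuDoubleComplex*>(p);
}

With it, the CUBLAS call reads cublasZgeam(handle, ..., as_cublas(thrust::raw_pointer_cast(d_u.data())), xmax, ...), and it works equally for the host-side alpha and beta pointers.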
I'm trying to compute the L2 norm of a vector using cuBLAS. My code is as follows:
void GPU_Print_Matrix(real_t *A, int nrows, int ncols) {
    real_t *hostA = (real_t*)malloc(nrows*ncols * sizeof(real_t));
    CUDA_SAFE_CALL(cudaMemcpy(hostA, A, nrows*ncols * sizeof(real_t), cudaMemcpyDeviceToHost));
    cout << "GPU Matrix of Size: " << nrows << "x" << ncols << endl;
    for (int i = 0; i < nrows; ++i) {
        for (int j = 0; j < ncols; ++j) {
            cout << fixed << setprecision(PRINT_PRECISION) << hostA[j*nrows + i] << " ";
        }
        cout << endl;
    }
    free(hostA);
    cout << endl;
}

void GPU_Random_Vector(thrust::device_vector <real_t> &vec) {
    thrust::counting_iterator<unsigned int> index_sequence_begin(rand());
    thrust::transform(index_sequence_begin, index_sequence_begin + vec.size(), vec.begin(), RANDOM(-initRange, initRange));
}

int main(int argc, char *argv[]) {
    srand(clock());
    cout << "# Running NMT" << endl;
    //ParseOpts(argc, argv);
    cublasHandle_t handle;
    CUBLAS_SAFE_CALL(cublasCreate(&handle));
    thrust::device_vector <real_t> x(10);
    GPU_Random_Vector(x);
    GPU_Print_Matrix(thrust::raw_pointer_cast(&x[0]), 10, 1);
    real_t nrm = 0;
    CUBLAS_SAFE_CALL(cublasXnrm2(handle, 10, thrust::raw_pointer_cast(&x[0]), 1, &nrm));
    cout << "nrm2 = " << nrm << endl;
}
Here, CUBLAS_SAFE_CALL is defined as follows:
#define CUBLAS_SAFE_CALL(call)                                           \
{                                                                        \
    const cublasStatus_t stat = call;                                    \
    if (stat != CUBLAS_STATUS_SUCCESS) {                                 \
        cout << "cuBlas Error: " << __FILE__ << ":" << __LINE__ << endl; \
        cout << " Code: " << stat << endl;                               \
        exit(1);                                                         \
    }                                                                    \
}
GPU_Random_Vector and GPU_Print_Matrix have been confirmed to work previously. Also, cublasHandle[singleGPU] has been initialized before being used. When I ran the program, I got the following output:
// GPU_Print_Matrix
GPU Matrix of Size: 10x1
0.0652332678
0.0747700930
0.0274266358
-0.0885794610
-0.0192640368
-0.0942506194
0.0283640027
-0.0411146656
-0.0460337885
-0.0970785618
cuBlas Error: nmt.cu:2252
Code: 14
What is going on? And is there any reference for how to interpret cuBLAS error numbers? Thanks a ton.
CUBLAS error 14 is CUBLAS_STATUS_INTERNAL_ERROR, which would usually mean that the internal device-to-host copy at the end of the L2 norm call failed. But why that happened is impossible to say without some context about what else your code was doing.
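As for a reference: the status codes are a small enum in cublas_api.h, and (at least in CUDA versions of this era, as far as I know) the library ships no status-to-string helper, so it is easiest to write one yourself. A minimal sketch:

// Map cublasStatus_t values to their enum names (names from cublas_api.h).
const char* cublasStatusString(cublasStatus_t s)
{
    switch (s) {
    case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";
    default:                             return "unknown cuBLAS status";
    }
}

Dropping this into your CUBLAS_SAFE_CALL macro gives readable diagnostics instead of bare numbers.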
If the code you posted is assembled and fleshed out into a complete demo case (with the trivial random number seeding mistake corrected) like this:
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <ctime>   // for std::time
#include <cublas_v2.h>
#include <thrust/transform.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>

typedef float real_t;

#define CUBLAS_SAFE_CALL(call)                                                     \
{                                                                                  \
    const cublasStatus_t stat = call;                                              \
    if (stat != CUBLAS_STATUS_SUCCESS) {                                           \
        std::cout << "cuBlas Error: " << __FILE__ << ":" << __LINE__ << std::endl; \
        std::cout << " Code: " << stat << std::endl;                               \
        exit(1);                                                                   \
    }                                                                              \
}

#define PRINT_PRECISION (6)

struct RANDOM
{
    real_t a, b;

    __host__ __device__
    RANDOM(real_t _a=0, real_t _b=1) : a(_a), b(_b) {};

    __host__ __device__
    real_t operator()(const unsigned int n) const
    {
        thrust::default_random_engine rng;
        thrust::uniform_real_distribution<float> dist(a, b);
        rng.discard(n);
        return dist(rng);
    }
};

void GPU_Print_Matrix(real_t *A, int nrows, int ncols) {
    real_t *hostA = (real_t*)malloc(nrows*ncols * sizeof(real_t));
    cudaMemcpy(hostA, A, nrows*ncols * sizeof(real_t), cudaMemcpyDeviceToHost);
    std::cout << "GPU Matrix of Size: " << nrows << "x" << ncols << std::endl;
    for (int i = 0; i < nrows; ++i) {
        for (int j = 0; j < ncols; ++j) {
            std::cout << std::fixed << std::setprecision(PRINT_PRECISION) << hostA[j*nrows + i] << " ";
        }
        std::cout << std::endl;
    }
    free(hostA);
    std::cout << std::endl;
}

void GPU_Random_Vector(thrust::device_vector <real_t> &vec) {
    const real_t initRange = 10;
    thrust::counting_iterator<unsigned int> index_sequence_begin(std::rand());
    thrust::transform(index_sequence_begin, index_sequence_begin + vec.size(), vec.begin(), RANDOM(-initRange, initRange));
}

int main(int argc, char *argv[]) {
    std::srand(std::time(0));
    std::cout << "# Running NMT" << std::endl;
    cublasHandle_t handle;
    CUBLAS_SAFE_CALL(cublasCreate(&handle));
    thrust::device_vector <real_t> x(10);
    GPU_Random_Vector(x);
    GPU_Print_Matrix(thrust::raw_pointer_cast(&x[0]), 10, 1);
    real_t nrm = 0;
    CUBLAS_SAFE_CALL(cublasSnrm2(handle, 10, thrust::raw_pointer_cast(&x[0]), 1, &nrm));
    std::cout << "nrm2 = " << nrm << std::endl;
}
and compiled and run like this (CUDA 6.5 if that matters):
>nvcc -arch=sm_21 -run runkkari.cu -lcublas
runkkari.cu
Creating library a.lib and object a.exp
# Running NMT
GPU Matrix of Size: 10x1
-5.712992
8.181723
-0.086308
-6.177320
-5.442665
-2.889552
-1.555665
6.506872
-6.800190
8.024273
nrm2 = 18.196394
It works as expected. You should be able to compile and run this to confirm it yourself. From this we can only conclude that you have another problem which you have not described. But perhaps this helps to narrow down the list of possibilities.
First off, I'd like to say I really do like the CUDA documentation; it's great and resourceful, although I find it hard to work out what is supported in which version. I'm using CUDA driver version 5.0 with compute capability 2.0, and I was wondering whether cudaHostAllocWriteCombined is supported?
In my code:
float *d_data, h_data;
h_data = new float[A];
assert(cudaHostAlloc((void **)&d_data, A * sizeof(float), cudaHostAllocWriteCombined) == cudaSuccess);
cudaError_t err = cudaMemcpy(d_data, h_data, A * sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
    std::cout << cudaGetErrorString(err) << std::endl;
    return false;
}
The error returned is invalid argument; however, if I use cudaHostAllocDefault it seems to work fine. I understand how write-combined memory works (fast to write, slow to read), and that's why I would like to use it.
Your usage of h_data is incorrect. new returns a pointer, which should be assigned to a variable of the correct type. Replace h_data with *h_data in your declaration; your code will then be more-or-less correct, and cudaMemcpy should not return an invalid argument error.
The following complete code shows the correction and compiles and runs without error for me on CUDA 6:
#include <iostream>
#include <assert.h>

#define A 1024

int main(){
    float *d_data, *h_data;
    h_data = new float[A];
    cudaError_t err = cudaHostAlloc((void **)&d_data, A * sizeof(float), cudaHostAllocWriteCombined);
    if (err != cudaSuccess)
    {
        std::cout << "cudaHostAlloc fail " << std::endl;
        std::cout << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    err = cudaMemcpy(d_data, h_data, A * sizeof(float), cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        std::cout << "cudaMemcpy fail" << std::endl;
        std::cout << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    return 0;
}
I have a huge sequential program in which I want to parallelize some algorithms with MPI and CUDA. How do I correctly separate the sequential part from the parallel part? The problem lies in the nesting of the parallel algorithm, and in the use of SLURM or LoadLeveler as well (e.g. on my MPI cluster I can't write something like: mpirun -n 1 a.out : -n 2 b.out).
Example:
int main()
{
    funcA();
}

void funcA()
{
    funcB();
}

void funcB()
{
    // parallel algo starts here....
}
I've found a good solution to this problem. Here is sample code:
#include <iostream>
#include <mpi.h>
#include <unistd.h>

using namespace std;

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int r;
    MPI_Comm_rank(MPI_COMM_WORLD, &r);
    if (r == 0) {
        cout << "[GUI]Start performing initialization...." << endl;
        sleep(2);
        cout << "[GUI]Send command to start execution...." << endl;
        int command = 1;
        //TODO: now it's hardcoded to send data to 1 proc
        MPI_Send(&command, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
        cout << "[GUI]Waiting for execution results..." << endl;
        int buf[5];
        MPI_Recv(&buf, 5, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        for (int i = 0; i < 5; i++)
        {
            cout << "buf[" << i << "] = " << buf[i] << endl;
        }
    } else {
        int command;
        MPI_Recv(&command, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        cout << "Received command: " << command << endl;
        if (command == 1) {
            cout << "[ALGO]Receive command to start execution" << endl;
            sleep(2);
            cout << "[ALGO]Send computed data..." << endl;
            int buf[5] = {5,4,3,2,1};
            MPI_Send(&buf, 5, MPI_INT, 0, 0, MPI_COMM_WORLD);
        }
    }
    MPI_Finalize();
    return 0;
}
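To try it out, something like the following should work (a sketch; the compiler wrapper and launcher names depend on your MPI installation, and the file name mpi_demo.cpp is just for illustration):

$ mpic++ -o mpi_demo mpi_demo.cpp
$ mpirun -np 2 ./mpi_demo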