Best way to separate parallel MPI part in sequential program - cuda

I have a huge sequential program in which I want to parallelize some algorithms with MPI and CUDA. How do I correctly separate the sequential part from the parallel part? The problem lies in the nesting of the parallel algorithm, and in the use of Slurm or LoadLeveler as well (e.g. on my MPI cluster I can't write something like: mpirun -n 1 a.out: -n 2 b.out).
Example:
int main()
{
funcA();
}
void funcA()
{
funcB();
}
void funcB()
{
parallel algo starts here....
}

I've found a great solution for this problem. Here is some sample code:
#include <iostream>
#include <mpi.h>
#include <unistd.h>
using namespace std;
int main(int argc, char** argv) {
MPI_Init(&argc, &argv);
int r;
MPI_Comm_rank(MPI_COMM_WORLD, &r);
if (r == 0) {
cout << "[GUI]Start perfoming initialization...." << endl;
sleep(2);
cout << "[GUI]Send command to start execution...." << endl;
int command = 1;
//TODO: now it's hardcoded to send data to 1 proc
MPI_Send(&command, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
cout << "[GUI]Waiting for execution results..." << endl;
int buf[5];
MPI_Recv(&buf, 5, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
for (int i=0; i<5; i++)
{
cout << "buf["<< i << "] = " << buf[i] << endl;
}
} else {
int command;
MPI_Recv(&command, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
cout << "Received command: " << command << endl;
if (command == 1) {
cout << "[ALGO]Receive command to start execution" << endl;
sleep(2);
cout << "[ALGO]Send computed data..." << endl;
int buf[5] = {5,4,3,2,1};
MPI_Send(&buf, 5, MPI_INT, 0, 0, MPI_COMM_WORLD);
}
}
MPI_Finalize();
return 0;
}

Related

Check special character in string array using isalpha

So I am trying to take words from a file and save them into a dynamic array, and then print the array sorted alphabetically along with the concordance of each word in the array. I am facing a problem in identifying special characters/ numbers and deleting them from the array. I want to use the isalpha function as it is in the assignment prompt. When I run my code, it works perfectly but special characters and numbers are not eliminated from the array. Any idea on how to do this using isalpha?
#include <iostream>
#include <stream>
#include <iomanip>
#include <cstdlib>
#include <algorithm>
#include <cctype>
#include <string>
using namespace std;
void loadData();
void checkSpecialCharacters(string wordPtr, int size);
void alphabeticalSort(string *wordPtr, int size);
void frequencyCounter(string *wordptr, int size);
// Program entry point: delegates all work to loadData() and then reports
// success to the operating system.
int main()
{
    loadData();
    return EXIT_SUCCESS;
}
// Prompts for a text file, loads its words into a dynamic array, then
// sorts the array and writes the concordance.
//
// Each token read from the file is reduced to its alphabetic characters
// (tested with isalpha) and lower-cased; tokens that contain no letters at
// all are dropped. The filtering is done here, on the stored string,
// because checkSpecialCharacters() receives its argument by value and
// therefore cannot modify the array element.
//
// Exits with status -1 if the input file cannot be opened. If the file
// yields no words, an error is reported and sorting/counting are skipped.
void loadData()
{
    std::string fileName;
    std::cout << "This program processes any text file to give you the concordance for each word present, and how many times it appears in the file." << std::endl;
    std::cout << "Please enter the name of the text file you want to process followed by '.txt': " << std::endl;
    std::cin >> fileName;

    std::ifstream dataFile(fileName);
    if (dataFile.fail())
    {
        std::cerr << fileName << " could not be opened." << std::endl; //error message if file opening fails
        exit(-1);
    }

    // First pass: count whitespace-separated tokens to size the array.
    std::string word;
    int size = 0;
    while (dataFile >> word)
    {
        size++;
    }

    // Rewind for the second pass.
    dataFile.clear();
    dataFile.seekg(0);

    std::string* wordPtr = new std::string[size];
    int kept = 0; // number of non-empty cleaned words stored so far
    for (int i = 0; i < size && (dataFile >> word); i++)
    {
        std::string cleaned;
        for (std::string::size_type k = 0; k < word.size(); k++)
        {
            // <cctype> functions require a value representable as unsigned char.
            const unsigned char ch = static_cast<unsigned char>(word[k]);
            if (isalpha(ch))
            {
                cleaned += static_cast<char>(tolower(ch));
            }
        }
        if (cleaned.empty())
        {
            continue; // token was only digits/punctuation
        }
        wordPtr[kept] = cleaned;
        std::cout << wordPtr[kept] << std::endl;
        kept++;
    }
    dataFile.close();

    if (kept > 0)
    {
        alphabeticalSort(wordPtr, kept);
        frequencyCounter(wordPtr, kept);
    }
    else
    {
        std::cerr << fileName << " contains no words." << std::endl;
    }
    delete[] wordPtr;
}
// Removes every non-alphabetic character (as judged by isalpha) from
// wordPtr.
//
// Parameters:
//   wordPtr - the word to clean. NOTE(review): it is received BY VALUE, so
//             the caller's string is never modified; only the local copy is
//             cleaned. The signature is kept so that it still matches the
//             forward declaration near the top of the file; to make the
//             result visible to callers, both would have to take a
//             reference.
//   size    - ignored. The previous version used it as the loop bound, but
//             the caller passes the number of words, not the length of this
//             word, which indexed past the end of the string.
//
// Compared with the previous version this also tests isalpha()'s result as
// a truth value (it returns nonzero, not necessarily 1) and removes the
// non-letters instead of shifting the string when a letter is found.
void checkSpecialCharacters(std::string wordPtr, int size)
{
    (void)size;

    std::string::size_type keep = 0; // next write position for a letter
    for (std::string::size_type pos = 0; pos < wordPtr.size(); pos++)
    {
        if (isalpha(static_cast<unsigned char>(wordPtr[pos])))
        {
            wordPtr[keep] = wordPtr[pos];
            keep++;
        }
    }
    wordPtr.resize(keep);
}
// Sorts the first `size` strings of wordPtr into ascending order using
// std::string's operator<.
//
// Parameters:
//   wordPtr - pointer to the first element of the array to sort in place.
//   size    - number of elements; values below 2 leave the array untouched.
//
// Uses std::sort instead of a hand-written bubble sort: same resulting
// order, O(n log n) comparisons instead of O(n^2).
void alphabeticalSort(std::string *wordPtr, int size)
{
    if (size < 2)
    {
        return; // nothing to order (also covers size <= 0)
    }
    std::sort(wordPtr, wordPtr + size);
}
// Writes the concordance (each distinct word and how often it occurs) to a
// file chosen by the user and echoes the same lines to std::cout.
//
// Parameters:
//   wordPtr - array of words; equal words must be adjacent (i.e. the array
//             is expected to be sorted already).
//   size    - number of elements in wordPtr; 0 produces an empty file.
//
// Exits with status -1 if the output file cannot be opened.
//
// Fixes over the previous version: the final run of equal words is now
// written (it used to be dropped when the loop ended), and wordPtr[0] is
// no longer read when size is 0.
void frequencyCounter(std::string *wordPtr, int size)
{
    std::string finalFileName;
    std::cout << "Please enter the name of the file that you want to store the concordance in followed by .txt: " << std::endl;
    std::cin >> finalFileName;

    std::ofstream concordanceFile(finalFileName, std::ios::out);
    if (concordanceFile.fail())
    {
        std::cerr << finalFileName << " could not be opened." << std::endl;
        exit(-1);
    }

    concordanceFile.setf(std::ios::left);
    std::cout.setf(std::ios::left);

    int runStart = 0;
    while (runStart < size)
    {
        // Find the end of the run of words equal to wordPtr[runStart].
        int runEnd = runStart + 1;
        while (runEnd < size && wordPtr[runEnd] == wordPtr[runStart])
        {
            runEnd++;
        }
        const int frequency = runEnd - runStart;

        concordanceFile << std::setw(10) << wordPtr[runStart] << " " << std::setw(10) << frequency << std::endl;
        std::cout << std::setw(10) << wordPtr[runStart] << " " << std::setw(10) << frequency << std::endl;

        runStart = runEnd;
    }
    std::cout << "Concordance data saved in " << finalFileName << " successfully!" << std::endl;
}

Trouble using thrust complex(double) vector with sinus function

I have some problems using the sin function on a Thrust complex double vector, on both the device and the host: it seems like the computation is done in float.
With thrust::device_vector< thrust::complex<double> > and thrust::host_vector< thrust::complex<double> >, I obtain:
sin( 1+0i ) == (0.8414709568023682,0)
with std::complex<double>:
sin( 1+0i ) == (0.8414709848078965,0)
and std::complex<float> :
sin( 1+0i ) == (0.8414709568023682,0)
What mistake did I make in my code? To compile, I used
nvcc test.cu -o test
Here is the full code:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/complex.h>
#include <iostream>
#include <iomanip>
#include <complex>
#include <cmath>
// Streams every element of V to std::cout, each one followed by a single
// space, and finishes the line with '\n'.
template <typename Vector>
void Print(Vector &V){
    const int count = static_cast<int>(V.size());
    int pos = 0;
    while (pos != count) {
        std::cout << V[pos] << " ";
        ++pos;
    }
    std::cout << "\n";
}
// Unary function object, callable from host and device code, that returns
// sin(x) for its argument. The call to sin is left unqualified, so which
// overload is used depends on T.
template <typename T>
struct sin_functor : public thrust::unary_function< T , T >
{
    __host__ __device__
    T operator()( T x) const
    {
        const T result = sin( x );
        return result;
    }
};
template <typename Vector>
void ThrustComputation(){
typedef typename Vector::value_type Tvec;
Vector A(2);
A[0]=Tvec(1.,0.);A[1]=Tvec(1.,1.);
std::cout << "A: " << std::endl;
std::cout << " ";Print<Vector>(A);
Vector B(A.size());
thrust::transform(A.begin(),A.end(),B.begin(), sin_functor<Tvec>());
std::cout << "B =sin(A): " << std::endl;
std::cout << " ";Print<Vector>(B);
}
// Reference computation with std::complex<T>: prints the two sample values
// (1,0) and (1,1) followed by their sines.
template <typename T>
void stdComputation(){
    typedef std::complex<T> Cplx;

    Cplx samples[2];
    samples[0] = Cplx(1.,0.);
    samples[1] = Cplx(1.,1.);

    std::cout << "sA: " << std::endl;
    std::cout << " " << samples[0] << " " << samples[1] << std::endl;

    const Cplx first = sin(samples[0]);
    const Cplx second = sin(samples[1]);
    std::cout << "sin(sA): " << std::endl;
    std::cout << " " << first << " " << second << std::endl;
}
int main(int argc, char **argv)
{
std::cout << std::setprecision(16);
std::cout << "Thrust: Computation on GPU device (double)\n";
ThrustComputation<thrust::device_vector< thrust::complex<double> > >();
std::cout << "Thrust: Computation on host (double)\n";
ThrustComputation<thrust::host_vector< thrust::complex<double> > >();
std::cout << "std: Computation (double)\n";
stdComputation<double>();
std::cout << "std: Computation (float)\n";
stdComputation<float>();
return 0;
}
The output on my computer (Ubuntu 14.04 LTS, cuda 7.5) is :
Thrust: Computation on GPU device (double)
A:
(1,0) (1,1)
B =sin(A):
(0.8414709568023682,0) (1.298457622528076,0.6349639296531677)
Thrust: Computation on host (double)
A:
(1,0) (1,1)
B =sin(A):
(0.8414709568023682,0) (1.298457622528076,0.6349639296531677)
std: Computation (double)
sA:
(1,0) (1,1)
sin(sA):
(0.8414709848078965,0) (1.298457581415977,0.6349639147847361)
std: Computation (float)
sA:
(1,0) (1,1)
sin(sA):
(0.8414709568023682,0) (1.298457503318787,0.6349638700485229)
This would appear to be a genuine bug in the thrust library. A quick scan of the code on github led me to this, which might be the culprit. It seems that thrust's double precision csinh function, upon which complex sin relies, has an accidental intermediate cast to float, which is probably causing the loss of precision you have observed. As suggested in comments, you should report this as a bug.

Maximum value of thrust device_vector

I am trying to find the maximum value of a thrust::device_vector and its location.
The mechanism below can save the position of the maximum value; however, I couldn't retrieve max_val.
I have cout statements to track the running order and where it crashes. It seems to crash on this line:
int max_val = *iter;
it shows this result:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): invalid argument
1234567
here is the code
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/extrema.h>
#include <iostream>
#include <iomanip>
// Prints `name` right-aligned in a 20-character field, then every element
// of `v` separated by single spaces, then a newline.
template <typename Vector>
void print_vector(const std::string& name, const Vector& v)
{
    typedef typename Vector::value_type Element;
    std::cout << " " << std::setw(20) << name << " ";
    std::ostream_iterator<Element> sink(std::cout, " ");
    thrust::copy(v.begin(), v.end(), sink);
    std::cout << std::endl;
}
// Failing program from the question: it tries to find the largest element
// of a device vector and its position. The numbered std::cout calls are
// trace markers used to see how far execution gets before the crash.
// NOTE(review): this code is intentionally left as posted; the defects
// below are the ones the accompanying explanation points out.
int main()
{
std::cout<<"1";
// Constructed with 5 elements already present ...
thrust::host_vector<int>h_vec(5);
// ... so each push_back appends a further element after those 5 instead
// of replacing them; 10..14 end up at indices 5..9.
h_vec.push_back(10);
h_vec.push_back(11);
h_vec.push_back(12);
h_vec.push_back(13);
h_vec.push_back(14);
std::cout<<"2";
thrust::device_vector<int>d_vec(5);
std::cout<<"3";
// Copies only the first 5 host elements, i.e. not the pushed-back values.
thrust::copy_n(h_vec.begin(),5,d_vec.begin());
std::cout<<"4";
// print_vector("D_Vec",d_vec);
std::cout<<"5";
// NOTE(review): thrust::max is not the range algorithm wanted here;
// thrust::max_element is the one that searches [begin, end).
thrust::device_vector<int>::iterator iter=thrust::max(d_vec.begin(),d_vec.end());
std::cout<<"6";
unsigned int position = iter - d_vec.begin();
std::cout<<"7";
// The reported system_error ("invalid argument") is thrown here: the
// trace prints 1234567 and never reaches 8.
int max_val = *iter;
std::cout<<"8";
// NOTE(review): prints the literal 14 rather than max_val.
std::cout<<"Max Val= "<<14<<" #"<<position<< std::endl;
return 0;
}
Help, please. Also, if there is a better way to extract the maximum value and its position in a device_vector using the Thrust library, it would be more than appreciated.
You're not using vectors correctly. push_back() adds an element onto the end of an existing vector. It's clear that you want to replace existing elements.
Also, the thrust algorithm you want is thrust::max_element, not thrust::max
Here's a fully worked code with those issues fixed:
$ cat t1229.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/extrema.h>
#include <iostream>
#include <iomanip>
// Writes `name` right-aligned in a field of width 20, followed by the
// elements of `v` (each followed by one space) and a line break.
template <typename Vector>
void print_vector(const std::string& name, const Vector& v)
{
    typedef typename Vector::value_type Item;
    typedef std::ostream_iterator<Item> Output;
    std::cout << " " << std::setw(20) << name << " ";
    thrust::copy(v.begin(), v.end(), Output(std::cout, " "));
    std::cout << std::endl;
}
int main()
{
std::cout<<"1" <<std::endl;
thrust::host_vector<int>h_vec(5);
h_vec[0] = 10;
h_vec[1] = 11;
h_vec[2] = 12;
h_vec[3] = 13;
h_vec[4] = 14;
std::cout<<"2" << std::endl;
thrust::device_vector<int>d_vec(5);
std::cout<<"3" << std::endl;
thrust::copy_n(h_vec.begin(),5,d_vec.begin());
std::cout<<"4" << std::endl;
// print_vector("D_Vec",d_vec);
std::cout<<"5" << std::endl;
thrust::device_vector<int>::iterator iter=thrust::max_element(d_vec.begin(),d_vec.end());
std::cout<<"6" << std::endl;
unsigned int position = iter - d_vec.begin();
std::cout<<"7" << std::endl;
int max_val = d_vec[position];
std::cout<<"8" << std::endl;
std::cout<<"Max Val= "<<max_val<<" #"<<position<< std::endl;
return 0;
}
$ nvcc -o t1229 t1229.cu
$ ./t1229
1
2
3
4
5
6
7
8
Max Val= 14 #4
$

std::for_each and std::vector destructor call

I have the following problem with std::for_each and a functor proxy object.
See the following code:
// Function object that logs each value it is called with and appends that
// value to `data`. Every default-constructed instance takes the next
// number from the shared counter `id` as its `mID`; construction and
// destruction are both logged. Written for C++03.
struct Functor {
    std::vector<int> data;   // every value passed to operator() so far
    const unsigned mID;      // identifier assigned at construction
    static unsigned id;      // next identifier to hand out

    Functor() : mID(id++) {
        std::cout << "Functor constructed with id: " << mID << std::endl;
    }

    ~Functor() {
        std::cout << "Functor dtor: " << mID << std::endl;
    }

    // Logs `i`, stores it, then dumps everything stored so far on one line.
    void operator() (int i) {
        std::cout << "Functor print: " << i << std::endl;
        data.push_back(i);

        std::cout << "Dump: ";
        typedef std::vector<int>::const_iterator Iter;
        for (Iter it = data.begin(); it != data.end(); ++it) {
            std::cout << *it << " ";
        }
        std::cout << std::endl;
    }
};
unsigned Functor::id = 0;
From above, the proxy object simply does 2 things, it prints the data out to the CMD and it stores a copy for itself to use. Below is the example use case of the object:
// Fills a vector with 0..9, passes a Functor to std::for_each over it, and
// then prints what the original Functor object holds afterwards. The
// functor is handed to std::for_each by value.
int main () {
    std::vector<int> intvec;
    int next = 0;
    while (next < 10) {
        intvec.push_back(next);
        ++next;
    }

    Functor myfunctor;
    std::for_each(intvec.begin(), intvec.end(), myfunctor);

    const std::vector<int>& stored = myfunctor.data;
    std::cout << "Data in my functor: " << stored.size() << std::endl;
    typedef std::vector<int>::const_iterator Iter;
    for (Iter it = stored.begin(); it != stored.end(); ++it) {
        std::cout << "myfunctor data: " << *it << std::endl;
    }
    return 0;
}
This is the part it gets really fishy for me. The output generated is that my functor proxy object is constructed once but deconstructed three times! Something is bypassing the construction call.
Also as a result of the destructor being called at the end of the std::for_each, the Functor.data is empty as well!
Is there a way to make sure data inside Functor is kept persistent? I wish to keep track of the state of my functor when used inside functions such as std::for_each (Basically any given std algorithm function that can take in a unary functor)
Do note that I am using c++03 ONLY. Many thanks.
[...] my functor proxy object is constructed once but deconstructed three times! Something is bypassing the construction call.
Not quite. Your class is default-constructed once, but is also copy-constructed twice. You don't log the copy construction, so it doesn't show up on your output.
If you add a logging copy constructor, you'll see "Functor constructed with id: 0" printed three times:
Functor(const Functor& other) : mID(other.mID) {
std::cout << "Functor constructed with id: " << mID << std::endl;
}

Why does computing L2 norm with cuBLAS result in an error?

Edit 2: include the more full program
Edit 1: include the full program
I'm trying to compute the L2 norm of a vector using cuBLAS. My code is as follows
// Copies an nrows x ncols matrix from device memory to a temporary host
// buffer and prints it row by row with PRINT_PRECISION fixed decimals.
//
// Parameters:
//   A     - pointer passed as the source of a cudaMemcpyDeviceToHost copy.
//   nrows - number of rows to print.
//   ncols - number of columns to print.
//
// Element (row i, column j) is read from index j*nrows + i.
// A failed host allocation is reported and terminates the program with
// exit(1), like the CUDA/cuBLAS check macros used in this file; the byte
// count is computed in size_t so nrows*ncols cannot overflow int.
void GPU_Print_Matrix(real_t *A, int nrows, int ncols) {
    const size_t count = (nrows > 0 && ncols > 0)
        ? static_cast<size_t>(nrows) * static_cast<size_t>(ncols)
        : 0;

    real_t *hostA = 0;
    if (count > 0) {
        hostA = (real_t*)malloc(count * sizeof(real_t));
        if (!hostA) {
            cout << "GPU_Print_Matrix: host allocation failed" << endl;
            exit(1);
        }
        CUDA_SAFE_CALL(cudaMemcpy(hostA, A, count * sizeof(real_t), cudaMemcpyDeviceToHost));
    }

    cout << "GPU Matrix of Size: " << nrows << "x" << ncols << endl;
    for (int i = 0; i < nrows; ++i) {
        for (int j = 0; j < ncols; ++j) {
            cout << fixed << setprecision(PRINT_PRECISION) << hostA[static_cast<size_t>(j) * nrows + i] << " ";
        }
        cout << endl;
    }
    free(hostA); // free(0) is a no-op when nothing was allocated
    cout << endl;
}
// Overwrites every element of vec with RANDOM(-initRange, initRange)
// applied to consecutive unsigned indices, starting at a value drawn from
// rand().
void GPU_Random_Vector(thrust::device_vector <real_t> &vec) {
    typedef thrust::counting_iterator<unsigned int> IndexIter;
    const IndexIter first(rand());
    const IndexIter last = first + vec.size();
    thrust::transform(first, last, vec.begin(), RANDOM(-initRange, initRange));
}
int main(int argc, char *argv[]) {
srand(clock());
cout << "# Running NMT" << endl;
//ParseOpts(argc, argv);
cublasHandle_t handle;
CUBLAS_SAFE_CALL(cublasCreate(&handle));
thrust::device_vector <real_t> x(10);
GPU_Random_Vector(x);
GPU_Print_Matrix(thrust::raw_pointer_cast(&x[0]), 10, 1);
real_t nrm = 0;
CUBLAS_SAFE_CALL(cublasXnrm2(handle, 10, thrust::raw_pointer_cast(&x[0]), 1, &nrm));
cout << "nrm2 = " << nrm << endl;
}
Here, CUBLAS_SAFE_CALL is defined as follows
// Evaluates `call` once, stores the resulting cublasStatus_t, and if it is
// not CUBLAS_STATUS_SUCCESS prints the file, line and numeric status code
// to cout and terminates the program with exit(1).
// NOTE(review): the expansion is a bare { ... } block, not the usual
// do { ... } while (0); writing `if (c) CUBLAS_SAFE_CALL(x); else ...`
// will not compile. `call` is also expanded without parentheses.
#define CUBLAS_SAFE_CALL(call) \
{ \
const cublasStatus_t stat = call; \
if (stat != CUBLAS_STATUS_SUCCESS) { \
cout << "cuBlas Error: " << __FILE__ << ":" << __LINE__ << endl; \
cout << " Code: " << stat << endl; \
exit(1); \
} \
}
GPU_Random_Vector and GPU_Print_Matrix have been confirmed to work before. Also, cublasHandle[singleGPU] has been initialized before being called. When I ran the program, I had the following output
// GPU_Print_Matrix
GPU Matrix of Size: 10x1
0.0652332678
0.0747700930
0.0274266358
-0.0885794610
-0.0192640368
-0.0942506194
0.0283640027
-0.0411146656
-0.0460337885
-0.0970785618
cuBlas Error: nmt.cu:2252
Code: 14
What is going on? And is there any reference for how can I interpret the error number of cuBLAS? Thanks a ton.
CUBLAS error 14 is CUBLAS_STATUS_INTERNAL_ERROR and would usually mean that the internal device to host copy at the end of the L2 norm call failed. But why that happened is impossible to say without some context about what else your code was doing.
If the code you posted is assembled and fleshed out into a complete demo case (with the trivial random number seeding mistake corrected) like this:
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <cublas_v2.h>
#include <thrust/transform.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
typedef float real_t;
// Evaluates `call` exactly once and, if the resulting cublasStatus_t is not
// CUBLAS_STATUS_SUCCESS, prints the file, line and numeric status code to
// std::cout and terminates the program with exit(1).
//
// Wrapped in do { ... } while (0) so that `CUBLAS_SAFE_CALL(x);` behaves as
// a single statement (safe inside an unbraced if/else), and `call` is
// parenthesised so any expression can be passed. Must be followed by a
// semicolon, as every use in this program already is.
#define CUBLAS_SAFE_CALL(call) \
do { \
    const cublasStatus_t stat = (call); \
    if (stat != CUBLAS_STATUS_SUCCESS) { \
        std::cout << "cuBlas Error: " << __FILE__ << ":" << __LINE__ << std::endl; \
        std::cout << " Code: " << stat << std::endl; \
        exit(1); \
    } \
} while (0)
#define PRINT_PRECISION (6)
// Function object mapping an unsigned index n to a value drawn from a
// uniform real distribution over the bounds a and b.
//
// Each call builds a fresh default-constructed engine and discards n
// values from it before drawing, so the result depends only on (a, b, n).
//
// The distribution is instantiated with real_t (it used to be hard-coded
// to float) so that it stays consistent with the bounds and the return
// type if the real_t typedef is changed.
struct RANDOM
{
    real_t a, b; // bounds handed to the distribution

    __host__ __device__
    RANDOM(real_t _a=0, real_t _b=1) : a(_a), b(_b) {}

    __host__ __device__
    real_t operator()(const unsigned int n) const
    {
        thrust::default_random_engine rng;
        thrust::uniform_real_distribution<real_t> dist(a, b);
        rng.discard(n); // skip ahead so that different n give different draws
        return dist(rng);
    }
};
// Copies an nrows x ncols matrix from device memory to a temporary host
// buffer and prints it row by row with PRINT_PRECISION fixed decimals.
//
// Parameters:
//   A     - pointer passed as the source of a cudaMemcpyDeviceToHost copy.
//   nrows - number of rows to print.
//   ncols - number of columns to print.
//
// Element (row i, column j) is read from index j*nrows + i.
// The host allocation and the cudaMemcpy are both checked; on failure the
// error is printed and the program terminates with exit(1), matching
// CUBLAS_SAFE_CALL. Previously a failed copy went unnoticed and
// uninitialised host memory was printed.
void GPU_Print_Matrix(real_t *A, int nrows, int ncols) {
    const size_t count = (nrows > 0 && ncols > 0)
        ? static_cast<size_t>(nrows) * static_cast<size_t>(ncols)
        : 0;

    real_t *hostA = 0;
    if (count > 0) {
        hostA = (real_t*)malloc(count * sizeof(real_t));
        if (!hostA) {
            std::cout << "GPU_Print_Matrix: host allocation failed" << std::endl;
            exit(1);
        }
        const cudaError_t err = cudaMemcpy(hostA, A, count * sizeof(real_t), cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) {
            std::cout << "CUDA Error: " << __FILE__ << ":" << __LINE__ << std::endl;
            std::cout << " " << cudaGetErrorString(err) << std::endl;
            free(hostA);
            exit(1);
        }
    }

    std::cout << "GPU Matrix of Size: " << nrows << "x" << ncols << std::endl;
    for (int i = 0; i < nrows; ++i) {
        for (int j = 0; j < ncols; ++j) {
            std::cout << std::fixed << std::setprecision(PRINT_PRECISION) << hostA[static_cast<size_t>(j) * nrows + i] << " ";
        }
        std::cout << std::endl;
    }
    free(hostA); // free(0) is a no-op when nothing was allocated
    std::cout << std::endl;
}
// Overwrites every element of vec with RANDOM(-10, 10) applied to
// consecutive unsigned indices, starting at a value drawn from std::rand().
void GPU_Random_Vector(thrust::device_vector <real_t> &vec) {
    typedef thrust::counting_iterator<unsigned int> IndexIter;
    const real_t initRange = 10;
    const IndexIter first(std::rand());
    const IndexIter last = first + vec.size();
    const RANDOM generator(-initRange, initRange);
    thrust::transform(first, last, vec.begin(), generator);
}
int main(int argc, char *argv[]) {
std::srand(std::time(0));
std::cout << "# Running NMT" << std::endl;
cublasHandle_t handle;
CUBLAS_SAFE_CALL(cublasCreate(&handle));
thrust::device_vector <real_t> x(10);
GPU_Random_Vector(x);
GPU_Print_Matrix(thrust::raw_pointer_cast(&x[0]), 10, 1);
real_t nrm = 0;
CUBLAS_SAFE_CALL(cublasSnrm2(handle, 10, thrust::raw_pointer_cast(&x[0]), 1, &nrm));
std::cout << "nrm2 = " << nrm << std::endl;
}
and compiled and run like this (CUDA 6.5 if that matters):
>nvcc -arch=sm_21 -run runkkari.cu -lcublas
runkkari.cu
Creating library a.lib and object a.exp
# Running NMT
GPU Matrix of Size: 10x1
-5.712992
8.181723
-0.086308
-6.177320
-5.442665
-2.889552
-1.555665
6.506872
-6.800190
8.024273
nrm2 = 18.196394
It works as expected. You should be able to compile and run this to confirm this yourself. So from this we can only conclude that you have another problem which you have failed to describe. But perhaps this helps to narrow down the list of possibilities.