CUDA: complex matrix multiplication [duplicate] - cuda

In my code I use arrays with complex numbers from thrust library and I would like to use cublasZgeam() in order to transpose the array.
Using complex numbers from cuComplex.h is not a preferable option, since I do a lot of arithmetic on the array and cuComplex does not define operators such as * or +=.
This is how I defined array which I want to transpose
thrust::complex<float> u[xmax][xmax];
I have found this https://github.com/jtravs/cuda_complex, but using it as such:
#include "cuComplex.hpp"
doesnt allow me to use mentioned operators when compiled with nvcc
error: no operator "+=" matches these operands
operand types are: cuComplex += cuComplex
Is there a solution to this? The code from GitHub is old, which may be the issue, or maybe I am using it incorrectly.
EDIT: Here is code which works, only difference from talonmies code is adding simple kernel and pointer to same data but being thrust::complex
#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>
using namespace std;
// Adds the constant (3.3, 3.3) to the first element of `u`.
// Every launched thread performs the same unsynchronised update on u[0].
__global__ void test(thrust::complex<double>* u) {
    const thrust::complex<double> increment(3.3, 3.3);
    u[0] += increment;
}
// Host driver: fills a 100x100 complex matrix, transposes it on the device with
// cublasZgeam (alpha = 1, beta = 0), lets the `test` kernel modify element 0 of the
// result through a thrust::complex pointer, and prints selected entries before/after.
// Returns 0 on success and 1 if any CUDA or cuBLAS call reports a failure.
int main()
{
    // A compile-time constant keeps `u` a standard C++ array; a runtime `int`
    // bound would make it a compiler-extension variable-length array.
    const int xmax = 100;
    thrust::complex<double> u[xmax][xmax];
    // Byte counts are integral, so keep them in size_t rather than double.
    const size_t arrSize = sizeof(thrust::complex<double>) * xmax * xmax;

    thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
    u[49][51] += thrust::complex<double>(665.0,665.0);
    u[51][49] *= 2.0;
    cout << "Before:" << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;
    cout << u[0][0] << endl;

    thrust::complex<double> alpha(1.0, 0.0);
    thrust::complex<double> beta(0.0, 0.0);
    // The scalars are handed to cuBLAS through pointers reinterpreted as cuDoubleComplex.
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);

    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        cerr << "cublasCreate failed" << endl;
        return 1;
    }

    cuDoubleComplex* d_u = 0;
    cuDoubleComplex* d_v = 0;
    cudaError_t cudaStat = cudaMalloc(&d_u, arrSize);
    if (cudaStat == cudaSuccess)
        cudaStat = cudaMalloc(&d_v, arrSize);
    if (cudaStat == cudaSuccess)
        cudaStat = cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);

    cublasStatus_t blasStat = CUBLAS_STATUS_SUCCESS;
    if (cudaStat == cudaSuccess) {
        // C = alpha * A^T + beta * B with beta == 0, i.e. an out-of-place transpose.
        blasStat = cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                               _alpha, d_u, xmax,
                               _beta, d_u, xmax,
                               d_v, xmax);
    }
    if (cudaStat == cudaSuccess && blasStat == CUBLAS_STATUS_SUCCESS) {
        // Same device buffer, viewed as thrust::complex so the kernel can use operators.
        thrust::complex<double>* d_vTest = reinterpret_cast<thrust::complex<double>* >(d_v);
        test<<<1,1>>>(d_vTest);
        cudaStat = cudaGetLastError();  // launch errors are not returned by <<<>>>
    }
    if (cudaStat == cudaSuccess && blasStat == CUBLAS_STATUS_SUCCESS)
        cudaStat = cudaMemcpy(&u[0][0], d_v, arrSize, cudaMemcpyDeviceToHost);

    // Release device resources on every path.
    cudaFree(d_v);
    cudaFree(d_u);
    cublasDestroy(handle);

    if (cudaStat != cudaSuccess || blasStat != CUBLAS_STATUS_SUCCESS) {
        cerr << "CUDA error: " << cudaGetErrorString(cudaStat)
             << ", cuBLAS status: " << blasStat << endl;
        return 1;
    }

    cout << "After:" << endl;
    cout << u[0][0] << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;
    return 0;
}

Despite your protestations to the contrary, the C++ standard library complex (or thrust::complex) most certainly does work with CUBLAS. The cuComplex and cuDoubleComplex types are designed to be binary compatible with the standard host complex types, so that data does not need to be translated when it is passed to CUBLAS functions which use complex data on the device.
A simple modification to the code you posted in comments works exactly as you might imagine:
#include <algorithm>
#include <iostream>
#include <complex>
#include <cublas_v2.h>
using namespace std;
// Host driver: builds a 100x100 std::complex<double> matrix, transposes it on the
// device with cublasZgeam (alpha = 1, beta = 0) and prints two mirrored entries
// before and after. Returns 0 on success, 1 if a CUDA or cuBLAS call fails.
int main()
{
    // Compile-time bound: keeps `u` a standard array instead of an extension VLA.
    const int xmax = 100;
    complex<double> u[xmax][xmax];
    const size_t arrSize = sizeof(complex<double>) * xmax * xmax;

    fill(&u[0][0], &u[0][0] + (xmax * xmax), complex<double>(1.0,1.0));
    u[49][51] += complex<double>(665.0,665.0);
    u[51][49] *= 2.0;
    cout << "Before:" << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;

    complex<double> alpha(1.0, 0.0);
    complex<double> beta(0.0, 0.0);
    // The only adaptation needed for cuBLAS: reinterpret the std::complex scalars.
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);

    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        cerr << "cublasCreate failed" << endl;
        return 1;
    }

    cuDoubleComplex* d_u = 0;
    cuDoubleComplex* d_v = 0;
    cudaError_t cudaStat = cudaMalloc(&d_u, arrSize);
    if (cudaStat == cudaSuccess)
        cudaStat = cudaMalloc(&d_v, arrSize);
    if (cudaStat == cudaSuccess)
        cudaStat = cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);

    cublasStatus_t blasStat = CUBLAS_STATUS_SUCCESS;
    if (cudaStat == cudaSuccess) {
        // Out-of-place transpose: d_v = 1 * d_u^T + 0 * d_u.
        blasStat = cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                               _alpha, d_u, xmax,
                               _beta, d_u, xmax,
                               d_v, xmax);
    }
    if (cudaStat == cudaSuccess && blasStat == CUBLAS_STATUS_SUCCESS)
        cudaStat = cudaMemcpy(&u[0][0], d_v, arrSize, cudaMemcpyDeviceToHost);

    // Release device resources on every path.
    cudaFree(d_v);
    cudaFree(d_u);
    cublasDestroy(handle);

    if (cudaStat != cudaSuccess || blasStat != CUBLAS_STATUS_SUCCESS) {
        cerr << "CUDA error: " << cudaGetErrorString(cudaStat)
             << ", cuBLAS status: " << blasStat << endl;
        return 1;
    }

    cout << "After:" << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;
    return 0;
}
built and run like so:
~/SO$ nvcc -std=c++11 -arch=sm_52 -o complex_transpose complex_transpose.cu -lcublas
~/SO$ ./complex_transpose
Before:
(666,666)
(2,2)
After:
(2,2)
(666,666)
The only modifications required are explicit casts of the std::complex<double> types to cuDoubleComplex. Do that and everything works as expected.
Using thrust, the code looks almost identical:
#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>
using namespace std;
// Host driver, thrust::complex variant: builds a 100x100 matrix on the host,
// transposes it on the device with cublasZgeam (alpha = 1, beta = 0) and prints two
// mirrored entries before and after. Returns 0 on success, 1 on any CUDA/cuBLAS failure.
int main()
{
    // Compile-time bound: keeps `u` a standard array instead of an extension VLA.
    const int xmax = 100;
    thrust::complex<double> u[xmax][xmax];
    const size_t arrSize = sizeof(thrust::complex<double>) * xmax * xmax;

    thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
    u[49][51] += thrust::complex<double>(665.0,665.0);
    u[51][49] *= 2.0;
    cout << "Before:" << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;

    thrust::complex<double> alpha(1.0, 0.0);
    thrust::complex<double> beta(0.0, 0.0);
    // Scalars are passed to cuBLAS through pointers reinterpreted as cuDoubleComplex.
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);

    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        cerr << "cublasCreate failed" << endl;
        return 1;
    }

    cuDoubleComplex* d_u = 0;
    cuDoubleComplex* d_v = 0;
    cudaError_t cudaStat = cudaMalloc(&d_u, arrSize);
    if (cudaStat == cudaSuccess)
        cudaStat = cudaMalloc(&d_v, arrSize);
    if (cudaStat == cudaSuccess)
        cudaStat = cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);

    cublasStatus_t blasStat = CUBLAS_STATUS_SUCCESS;
    if (cudaStat == cudaSuccess) {
        // Out-of-place transpose: d_v = 1 * d_u^T + 0 * d_u.
        blasStat = cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                               _alpha, d_u, xmax,
                               _beta, d_u, xmax,
                               d_v, xmax);
    }
    if (cudaStat == cudaSuccess && blasStat == CUBLAS_STATUS_SUCCESS)
        cudaStat = cudaMemcpy(&u[0][0], d_v, arrSize, cudaMemcpyDeviceToHost);

    // Release device resources on every path.
    cudaFree(d_v);
    cudaFree(d_u);
    cublasDestroy(handle);

    if (cudaStat != cudaSuccess || blasStat != CUBLAS_STATUS_SUCCESS) {
        cerr << "CUDA error: " << cudaGetErrorString(cudaStat)
             << ", cuBLAS status: " << blasStat << endl;
        return 1;
    }

    cout << "After:" << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;
    return 0;
}
Perhaps something closer to your use case, using thrust device containers with a kernel performing some initialisation prior to a CUBLAS call:
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/complex.h>
#include <thrust/execution_policy.h>
#include <thrust/copy.h>
#include <cublas_v2.h>
// Device-side initialisation applied before the cuBLAS call.
//   u    : device buffer holding at least xmax*xmax complex values
//   xmax : stride used to turn the (49,51)/(51,49) index pairs into linear offsets
// Adds (665, 665) to the element at offset 51 + 49*xmax and doubles the element at
// offset 49 + 51*xmax. Launch layout: any; only the very first thread of the grid
// performs the updates, so a launch with more than one thread gives the same result.
__global__ void setup_kernel(thrust::complex<double>* u, int xmax)
{
    // The updates are unsynchronised read-modify-writes: restrict them to one thread.
    const bool firstThread =
        (blockIdx.x | blockIdx.y | blockIdx.z |
         threadIdx.x | threadIdx.y | threadIdx.z) == 0;
    if (!firstThread) return;

    // Both offsets require index 51 to be valid in each dimension.
    if (xmax < 52) return;

    const size_t stride = static_cast<size_t>(xmax);
    u[51 + 49 * stride] += thrust::complex<double>(665.0,665.0);
    u[49 + 51 * stride] *= 2.0;
}
// Host driver, device-container variant: allocates two thrust::device_vector
// matrices, initialises the source with setup_kernel, transposes it into the
// destination with cublasZgeam (alpha = 1, beta = 0) and prints two mirrored
// entries of source ("Before") and destination ("After").
// Returns 0 on success, 1 if the launch, cuBLAS setup or the transpose fails.
int main()
{
    // Compile-time bound: keeps the host array `u` below a standard array.
    const int xmax = 100;
    thrust::complex<double> alpha(1.0, 0.0);
    thrust::complex<double> beta(0.0, 0.0);

    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        std::cerr << "cublasCreate failed" << std::endl;
        return 1;
    }

    thrust::device_vector<thrust::complex<double>> d_u(xmax * xmax, thrust::complex<double>(1.0,1.0));
    thrust::device_vector<thrust::complex<double>> d_v(xmax * xmax, thrust::complex<double>(0.,0.));

    setup_kernel<<<1,1>>>(thrust::raw_pointer_cast(d_u.data()), xmax);
    // Kernel launches do not return a status; query it explicitly.
    const cudaError_t launchStat = cudaGetLastError();
    if (launchStat != cudaSuccess) {
        std::cerr << "setup_kernel launch failed: " << cudaGetErrorString(launchStat) << std::endl;
        cublasDestroy(handle);
        return 1;
    }

    // Raw device pointers / scalars reinterpreted as the cuBLAS complex type.
    cuDoubleComplex* _d_u = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_u.data()));
    cuDoubleComplex* _d_v = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_v.data()));
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);

    const cublasStatus_t blasStat =
        cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                    _alpha, _d_u, xmax,
                    _beta, _d_u, xmax,
                    _d_v, xmax);
    // The handle is not needed after the transpose has been submitted.
    cublasDestroy(handle);
    if (blasStat != CUBLAS_STATUS_SUCCESS) {
        std::cerr << "cublasZgeam failed with status " << blasStat << std::endl;
        return 1;
    }

    thrust::complex<double> u[xmax][xmax];
    thrust::copy(d_u.begin(), d_u.end(), &u[0][0]);
    std::cout << "Before:" << std::endl;
    std::cout << u[49][51] << std::endl;
    std::cout << u[51][49] << std::endl;
    thrust::copy(d_v.begin(), d_v.end(), &u[0][0]);
    std::cout << "After:" << std::endl;
    std::cout << u[49][51] << std::endl;
    std::cout << u[51][49] << std::endl;
    return 0;
}

Related

Trouble using thrust complex(double) vector with sinus function

I have some problems using the sin function on a Thrust complex double vector, both on the device and on the host: it seems that the computation is done in float.
With thrust::device_vector< thrust::complex<double> > and thrust::host_vector< thrust::complex<double> >, I obtain:
sin( 1+0i ) == (0.8414709568023682,0)
with std::complex<double>:
sin( 1+0i ) == (0.8414709848078965,0)
and std::complex<float> :
sin( 1+0i ) == (0.8414709568023682,0)
What mistake did I do in my code? In the compilation process I used
nvcc test.cu -o test
Here is the full code:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/complex.h>
#include <iostream>
#include <iomanip>
#include <complex>
#include <cmath>
// Writes every element of V to std::cout, each followed by a single space,
// then terminates the line with "\n".
// Vector must provide size(), size_type and operator[].
template <typename Vector>
void Print(Vector &V){
    // Index with the container's own size type to avoid a signed/unsigned comparison.
    typedef typename Vector::size_type index_t;
    for (index_t i = 0; i < V.size(); ++i)
        std::cout << V[i] << " ";
    std::cout << "\n";
}
// Unary functor usable on host and device: maps a value to sin(value).
// The unqualified sin() call is resolved for the element type T at instantiation.
template <typename T>
struct sin_functor : public thrust::unary_function< T , T >
{
    __host__ __device__
    T operator()( T value) const
    {
        const T result = sin( value );
        return result;
    }
};
// Builds a two-element vector {(1,0), (1,1)} of the given Thrust vector type,
// applies sin_functor to it with thrust::transform and prints input and output.
template <typename Vector>
void ThrustComputation(){
    typedef typename Vector::value_type value_t;

    Vector input(2);
    input[0] = value_t(1.,0.);
    input[1] = value_t(1.,1.);
    std::cout << "A: " << std::endl;
    std::cout << " ";
    Print<Vector>(input);

    Vector output(input.size());
    thrust::transform(input.begin(), input.end(), output.begin(), sin_functor<value_t>());
    std::cout << "B =sin(A): " << std::endl;
    std::cout << " ";
    Print<Vector>(output);
}
// Reference computation with std::complex<T>: prints the inputs (1,0) and (1,1)
// followed by their sines, using whatever formatting std::cout currently has.
template <typename T>
void stdComputation(){
    typedef std::complex<T> value_t;
    const value_t first(1.,0.);
    const value_t second(1.,1.);

    std::cout << "sA: " << std::endl;
    std::cout << " " << first << " " << second << std::endl;

    const value_t sinFirst = sin(first);
    const value_t sinSecond = sin(second);
    std::cout << "sin(sA): " << std::endl;
    std::cout << " " << sinFirst << " " << sinSecond << std::endl;
}
// Runs the sine comparison four ways (Thrust on device, Thrust on host,
// std::complex<double>, std::complex<float>) with 16 significant digits.
int main(int argc, char **argv)
{
    typedef thrust::device_vector< thrust::complex<double> > DeviceVec;
    typedef thrust::host_vector< thrust::complex<double> > HostVec;

    std::cout << std::setprecision(16);

    std::cout << "Thrust: Computation on GPU device (double)\n";
    ThrustComputation<DeviceVec>();

    std::cout << "Thrust: Computation on host (double)\n";
    ThrustComputation<HostVec>();

    std::cout << "std: Computation (double)\n";
    stdComputation<double>();

    std::cout << "std: Computation (float)\n";
    stdComputation<float>();

    return 0;
}
The output on my computer (Ubuntu 14.04 LTS, cuda 7.5) is :
Thrust: Computation on GPU device (double)
A:
(1,0) (1,1)
B =sin(A):
(0.8414709568023682,0) (1.298457622528076,0.6349639296531677)
Thrust: Computation on host (double)
A:
(1,0) (1,1)
B =sin(A):
(0.8414709568023682,0) (1.298457622528076,0.6349639296531677)
std: Computation (double)
sA:
(1,0) (1,1)
sin(sA):
(0.8414709848078965,0) (1.298457581415977,0.6349639147847361)
std: Computation (float)
sA:
(1,0) (1,1)
sin(sA):
(0.8414709568023682,0) (1.298457503318787,0.6349638700485229)
This would appear to be a genuine bug in the thrust library. A quick scan of the code on github led me to this, which might be the culprit. It seems that thrust's double precision csinh function, upon which complex sin relies, has an accidental intermediate cast to float, which is probably causing the loss of precision you have observed. As suggested in comments, you should report this as a bug.

Maximum value of thrust device_vector

I am trying to find the maximum value of a thrust::device_vector and its location.
The mechanism below can save the position of the maximum value; however, I couldn't retrieve max_val.
I have cout statements to track the running order and where it crashes. It seems to crash on this line:
int max_val = *iter;
it shows this result:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): invalid argument
1234567
here is the code
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/extrema.h>
#include <iostream>
#include <iomanip>
// Prints `name` right-aligned in a 20-character field, then every element of `v`
// separated by spaces, then a newline.
template <typename Vector>
void print_vector(const std::string& name, const Vector& v)
{
    typedef typename Vector::value_type element_type;
    std::ostream_iterator<element_type> sink(std::cout, " ");

    std::cout << " " << std::setw(20) << name << " ";
    thrust::copy(v.begin(), v.end(), sink);
    std::cout << std::endl;
}
// Reproducer from the question: fills a host vector, copies five elements to a
// device vector and tries to obtain the maximum value and its position.
// The single-digit prints ("1" .. "8") trace how far execution gets.
int main()
{
std::cout<<"1";
// h_vec is constructed with 5 elements already present ...
thrust::host_vector<int>h_vec(5);
// ... and each push_back appends one more element after those five.
h_vec.push_back(10);
h_vec.push_back(11);
h_vec.push_back(12);
h_vec.push_back(13);
h_vec.push_back(14);
std::cout<<"2";
thrust::device_vector<int>d_vec(5);
std::cout<<"3";
// Copies only the first 5 elements of h_vec to the device.
thrust::copy_n(h_vec.begin(),5,d_vec.begin());
std::cout<<"4";
// print_vector("D_Vec",d_vec);
std::cout<<"5";
// NOTE(review): thrust::max is called with the two iterators themselves; the
// accompanying answer states that thrust::max_element is the intended algorithm.
thrust::device_vector<int>::iterator iter=thrust::max(d_vec.begin(),d_vec.end());
std::cout<<"6";
unsigned int position = iter - d_vec.begin();
std::cout<<"7";
// Line the question reports as the crash site.
int max_val = *iter;
std::cout<<"8";
// Prints the literal 14 rather than max_val.
std::cout<<"Max Val= "<<14<<" #"<<position<< std::endl;
return 0;
}
Help .. please. also, if there is a better way to extract the maximum value and its position in device_vector using THRUST library it is more than appreciated.
You're not using vectors correctly. push_back() adds an element onto the end of an existing vector. It's clear that you want to replace existing elements.
Also, the thrust algorithm you want is thrust::max_element, not thrust::max
Here's a fully worked code with those issues fixed:
$ cat t1229.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/extrema.h>
#include <iostream>
#include <iomanip>
// Prints `name` right-aligned in a 20-character field, then every element of `v`
// separated by spaces, then a newline.
template <typename Vector>
void print_vector(const std::string& name, const Vector& v)
{
    typedef typename Vector::value_type element_type;
    std::ostream_iterator<element_type> sink(std::cout, " ");

    std::cout << " " << std::setw(20) << name << " ";
    thrust::copy(v.begin(), v.end(), sink);
    std::cout << std::endl;
}
// Corrected example: stores 10..14 in a five-element host vector, copies it to the
// device, locates the maximum with thrust::max_element and prints value and index.
// The numbered lines trace progress through the steps.
int main()
{
    std::cout << "1" << std::endl;
    thrust::host_vector<int> h_vec(5);
    // Overwrite the five existing slots with 10, 11, 12, 13, 14.
    for (int k = 0; k < 5; ++k) {
        h_vec[k] = 10 + k;
    }

    std::cout << "2" << std::endl;
    thrust::device_vector<int> d_vec(5);

    std::cout << "3" << std::endl;
    thrust::copy_n(h_vec.begin(), 5, d_vec.begin());

    std::cout << "4" << std::endl;
    // print_vector("D_Vec",d_vec);

    std::cout << "5" << std::endl;
    typedef thrust::device_vector<int>::iterator DeviceIter;
    DeviceIter iter = thrust::max_element(d_vec.begin(), d_vec.end());

    std::cout << "6" << std::endl;
    unsigned int position = iter - d_vec.begin();

    std::cout << "7" << std::endl;
    // Reading through the iterator fetches the same element as d_vec[position].
    int max_val = *iter;

    std::cout << "8" << std::endl;
    std::cout << "Max Val= " << max_val << " #" << position << std::endl;
    return 0;
}
$ nvcc -o t1229 t1229.cu
$ ./t1229
1
2
3
4
5
6
7
8
Max Val= 14 #4
$

Why does computing L2 norm with cuBLAS result in an error?

Edit 2: include the more full program
Edit 1: include the full program
I'm trying to compute the L2 norm of a vector using cuBLAS. My code is as follows
// Copies an nrows x ncols device matrix to a temporary host buffer and prints it
// row by row; element (i, j) is read from offset j*nrows + i (column-major layout).
// The fixed/precision settings applied to cout stay in effect afterwards.
void GPU_Print_Matrix(real_t *A, int nrows, int ncols) {
    real_t *staging = (real_t*)malloc(nrows*ncols * sizeof(real_t));
    CUDA_SAFE_CALL(cudaMemcpy(staging, A, nrows*ncols * sizeof(real_t), cudaMemcpyDeviceToHost));

    cout << "GPU Matrix of Size: " << nrows << "x" << ncols << endl;
    int row = 0;
    while (row < nrows) {
        int col = 0;
        while (col < ncols) {
            cout << fixed << setprecision(PRINT_PRECISION) << staging[col*nrows + row] << " ";
            ++col;
        }
        cout << endl;
        ++row;
    }

    free(staging);
    cout << endl;
}
// Overwrites every element of `vec` with RANDOM(-initRange, initRange) applied to a
// run of consecutive unsigned indices that starts at a value drawn from rand().
void GPU_Random_Vector(thrust::device_vector <real_t> &vec) {
    thrust::counting_iterator<unsigned int> first(rand());
    thrust::counting_iterator<unsigned int> last = first + vec.size();
    thrust::transform(first, last, vec.begin(), RANDOM(-initRange, initRange));
}
// Question's driver: creates a cuBLAS handle, fills a 10-element device vector with
// random values, prints it, then asks cuBLAS for its L2 norm.
int main(int argc, char *argv[]) {
// Seeds rand() from clock(); the answer refers to this as a seeding mistake.
srand(clock());
cout << "# Running NMT" << endl;
//ParseOpts(argc, argv);
cublasHandle_t handle;
CUBLAS_SAFE_CALL(cublasCreate(&handle));
thrust::device_vector <real_t> x(10);
GPU_Random_Vector(x);
// The vector is printed as a 10x1 matrix through its raw device pointer.
GPU_Print_Matrix(thrust::raw_pointer_cast(&x[0]), 10, 1);
real_t nrm = 0;
// NOTE(review): cublasXnrm2 is not defined in the visible source; presumably it maps
// to the nrm2 routine matching real_t - confirm. The result is written to host `nrm`.
CUBLAS_SAFE_CALL(cublasXnrm2(handle, 10, thrust::raw_pointer_cast(&x[0]), 1, &nrm));
cout << "nrm2 = " << nrm << endl;
}
Here, CUBLAS_SAFE_CALL is defined as follows
// Evaluates a cuBLAS call exactly once. If the status is not CUBLAS_STATUS_SUCCESS,
// prints the file, line and numeric status code to cout and terminates via exit(1).
// The do { } while (0) wrapper makes `CUBLAS_SAFE_CALL(x);` a single statement, so it
// is safe inside un-braced if/else; the argument is parenthesised, and the local has
// a long name so it cannot shadow an identifier used inside `call`.
#define CUBLAS_SAFE_CALL(call) \
do { \
    const cublasStatus_t cublas_safe_call_status_ = (call); \
    if (cublas_safe_call_status_ != CUBLAS_STATUS_SUCCESS) { \
        cout << "cuBlas Error: " << __FILE__ << ":" << __LINE__ << endl; \
        cout << " Code: " << cublas_safe_call_status_ << endl; \
        exit(1); \
    } \
} while (0)
GPU_Random_Vector and GPU_Print_Matrix have been confirmed to work before. Also, cublasHandle[singleGPU] has been initialized before being called. When I ran the program, I had the following output
// GPU_Print_Matrix
GPU Matrix of Size: 10x1
0.0652332678
0.0747700930
0.0274266358
-0.0885794610
-0.0192640368
-0.0942506194
0.0283640027
-0.0411146656
-0.0460337885
-0.0970785618
cuBlas Error: nmt.cu:2252
Code: 14
What is going on? And is there any reference for how can I interpret the error number of cuBLAS? Thanks a ton.
CUBLAS error 14 is CUBLAS_STATUS_INTERNAL_ERROR and would usually mean that the internal device to host copy at the end of the L2 norm call failed. But why that happened is impossible to say without some context about what else your code was doing.
If the code you posted is assembled and fleshed out into a complete demo case (with the trivial random number seeding mistake corrected) like this:
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <ctime>
#include <cublas_v2.h>
#include <thrust/transform.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
// Scalar type used throughout this example.
typedef float real_t;

// Evaluates a cuBLAS call exactly once. If the status is not CUBLAS_STATUS_SUCCESS,
// prints the file, line and numeric status code to std::cout and terminates via
// exit(1). The do { } while (0) wrapper makes `CUBLAS_SAFE_CALL(x);` a single
// statement (safe inside un-braced if/else); the argument is parenthesised and the
// local has a long name so it cannot shadow an identifier used inside `call`.
#define CUBLAS_SAFE_CALL(call) \
do { \
    const cublasStatus_t cublas_safe_call_status_ = (call); \
    if (cublas_safe_call_status_ != CUBLAS_STATUS_SUCCESS) { \
        std::cout << "cuBlas Error: " << __FILE__ << ":" << __LINE__ << std::endl; \
        std::cout << " Code: " << cublas_safe_call_status_ << std::endl; \
        exit(1); \
    } \
} while (0)

// Number of digits after the decimal point used when printing matrix entries.
#define PRINT_PRECISION (6)
// Functor mapping an index n to a value in the range configured by [a, b].
// Each call builds a default-seeded engine, discards n draws and returns the next
// sample, so the result depends only on (a, b, n). Callable on host and device.
struct RANDOM
{
    real_t a, b;  // bounds passed to the uniform distribution

    __host__ __device__
    RANDOM(real_t _a=0, real_t _b=1) : a(_a), b(_b) {}

    __host__ __device__
    real_t operator()(const unsigned int n) const
    {
        thrust::default_random_engine rng;
        // Follow real_t instead of hard-coding float, so changing the typedef
        // does not silently keep sampling in single precision.
        thrust::uniform_real_distribution<real_t> dist(a, b);
        rng.discard(n);
        return dist(rng);
    }
};
// Copies an nrows x ncols device matrix to a temporary host buffer and prints it to
// std::cout row by row; element (i, j) is read from offset j*nrows + i (column-major).
//   A            : device pointer to nrows*ncols values
//   nrows, ncols : matrix dimensions (negative values are rejected)
// On allocation or copy failure a message is written to std::cerr and nothing is
// printed. The fixed/precision settings applied to std::cout remain in effect after
// the call, as before.
void GPU_Print_Matrix(real_t *A, int nrows, int ncols) {
    if (nrows < 0 || ncols < 0) {
        std::cerr << "GPU_Print_Matrix: negative dimension" << std::endl;
        return;
    }
    // Do the size arithmetic in size_t so large matrices cannot overflow int.
    const size_t rows = static_cast<size_t>(nrows);
    const size_t cols = static_cast<size_t>(ncols);
    const size_t bytes = rows * cols * sizeof(real_t);

    real_t *hostA = 0;
    if (bytes != 0) {
        hostA = (real_t*)malloc(bytes);
        if (!hostA) {
            std::cerr << "GPU_Print_Matrix: host allocation failed" << std::endl;
            return;
        }
        const cudaError_t status = cudaMemcpy(hostA, A, bytes, cudaMemcpyDeviceToHost);
        if (status != cudaSuccess) {
            std::cerr << "GPU_Print_Matrix: cudaMemcpy failed: "
                      << cudaGetErrorString(status) << std::endl;
            free(hostA);
            return;
        }
    }

    std::cout << "GPU Matrix of Size: " << nrows << "x" << ncols << std::endl;
    for (size_t i = 0; i < rows; ++i) {
        for (size_t j = 0; j < cols; ++j) {
            std::cout << std::fixed << std::setprecision(PRINT_PRECISION) << hostA[j*rows + i] << " ";
        }
        std::cout << std::endl;
    }
    free(hostA);
    std::cout << std::endl;
}
// Overwrites every element of `vec` with RANDOM(-10, 10) applied to a run of
// consecutive unsigned indices that starts at a value drawn from std::rand().
void GPU_Random_Vector(thrust::device_vector <real_t> &vec) {
    const real_t initRange = 10;
    thrust::counting_iterator<unsigned int> first(std::rand());
    thrust::counting_iterator<unsigned int> last = first + vec.size();
    const RANDOM generator(-initRange, initRange);
    thrust::transform(first, last, vec.begin(), generator);
}
// Demo driver: seeds rand() from the wall clock, fills a 10-element device vector
// with random values, prints it as a 10x1 matrix and prints its L2 norm computed by
// cublasSnrm2. Any cuBLAS failure terminates the program through CUBLAS_SAFE_CALL.
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;
    // std::time is declared in <ctime>; srand expects an unsigned seed.
    std::srand(static_cast<unsigned int>(std::time(0)));
    std::cout << "# Running NMT" << std::endl;

    cublasHandle_t handle;
    CUBLAS_SAFE_CALL(cublasCreate(&handle));

    thrust::device_vector <real_t> x(10);
    const int count = static_cast<int>(x.size());
    GPU_Random_Vector(x);
    GPU_Print_Matrix(thrust::raw_pointer_cast(&x[0]), count, 1);

    real_t nrm = 0;
    // The result pointer is a host variable; the call writes the norm into it.
    CUBLAS_SAFE_CALL(cublasSnrm2(handle, count, thrust::raw_pointer_cast(&x[0]), 1, &nrm));
    std::cout << "nrm2 = " << nrm << std::endl;

    // Release the cuBLAS context created above.
    CUBLAS_SAFE_CALL(cublasDestroy(handle));
    return 0;
}
and compiled and run like this (CUDA 6.5 if that matters):
>nvcc -arch=sm_21 -run runkkari.cu -lcublas
runkkari.cu
Creating library a.lib and object a.exp
# Running NMT
GPU Matrix of Size: 10x1
-5.712992
8.181723
-0.086308
-6.177320
-5.442665
-2.889552
-1.555665
6.506872
-6.800190
8.024273
nrm2 = 18.196394
It works as expected. You should be able to compile and run this to confirm this yourself. So from this we can only conclude that you have another problem which you have failed to describe. But perhaps this helps to narrow down the list of possibilities.

CuFFT Double to Complex

I want to compute an FFT from double to std::complex with the cuFFT library. My code looks like this:
#include <complex>
#include <iostream>
#include <cufft.h>
#include <cuda_runtime_api.h>
typedef std::complex<double> Complex;
using namespace std;
// Question's attempt at a real-to-complex (D2Z) FFT of 100 ones with cuFFT.
// Left as posted; the notes below only describe what the statements do.
int main(){
int n = 100;
double* in;
Complex* out;
in = (double*) malloc(sizeof(double) * n);
// By operator precedence this requests (sizeof(Complex) * n) / 2 + 1 bytes.
out = (Complex*) malloc(sizeof(Complex) * n/2+1);
for(int i=0; i<n; i++){
in[i] = 1;
}
cufftHandle plan;
// The function's return value is assigned to `plan`, replacing whatever the
// call stored through &plan.
plan = cufftPlan1d(&plan, n, CUFFT_D2Z, 1);
// One byte count, n doubles, is used for both device buffers and every copy below.
unsigned int mem_size = sizeof(double)*n;
cufftDoubleReal *d_in;
cufftDoubleComplex *d_out;
cudaMalloc((void **)&d_in, mem_size);
cudaMalloc((void **)&d_out, mem_size);
cudaMemcpy(d_in, in, mem_size, cudaMemcpyHostToDevice);
// `out` has not been written at this point; its contents are copied to the device anyway.
cudaMemcpy(d_out, out, mem_size, cudaMemcpyHostToDevice);
int succes = cufftExecD2Z(plan,(cufftDoubleReal *) d_in,(cufftDoubleComplex *) d_out);
// Prints the numeric result code of the exec call.
cout << succes << endl;
cudaMemcpy(out, d_out, mem_size, cudaMemcpyDeviceToHost);
// Prints the first n/2 output values.
for(int i=0; i<n/2; i++){
cout << "out: " << i << " " << out[i].real() << " " << out[i].imag() << endl;
}
return 0;
}
but it seems to me this must be wrong, because i think the transformed values should be 1 0 0 0 0 .... or without the normalization 100 0 0 0 0 .... but i just get 0 0 0 0 0 ...
Furthermore i would like it more if the cufftExecD2Z would work in place, which should be possible but i haven't figured out how to correctly do so. Can anybody help?
Your code has a variety of errors. You should probably review cufft documentation as well as the sample codes.
You should do proper cuda error checking and proper cufft error checking on all API return values.
The return value of the cufftPlan1d function does not go into the plan:
plan = cufftPlan1d(&plan, n, CUFFT_D2Z, 1);
The function itself sets the plan (that is why you pass &plan to the function), then when you assign the return value into the plan, it ruins the plan set up by the function.
You correctly identified that the output can be of size ((N/2)+1), but then you didn't allocate space for it properly either on the host side:
out = (Complex*) malloc(sizeof(Complex) * n/2+1);
or on the device side:
unsigned int mem_size = sizeof(double)*n;
...
cudaMalloc((void **)&d_out, mem_size);
The following code has some of the above problems fixed, enough to get your desired result (100, 0, 0, ...)
#include <complex>
#include <iostream>
#include <cufft.h>
#include <cuda_runtime_api.h>
// Queries (and clears) the last CUDA runtime error. If one is pending, reports
// `msg`, the CUDA error string and the source location on stderr, then exits
// with status 1. Expands to a single statement.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t cuda_check_status_ = cudaGetLastError(); \
        if (cuda_check_status_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(cuda_check_status_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
typedef std::complex<double> Complex;
using namespace std;
// Real-to-complex (D2Z) FFT of n = 100 ones using cuFFT.
// Built normally, input and output use separate host/device buffers; built with
// -DIN_PLACE, one buffer sized for (n/2 + 1) complex values serves as both.
// Prints the first n/2 output values. Returns 0 on success, 1 on allocation or
// cuFFT failure; CUDA runtime failures abort through cudaCheckErrors.
// All host, device and plan resources are released before returning.
int main(){
    const int n = 100;
    // A transform of n reals produces n/2 + 1 complex values.
    const size_t out_mem_size = (n/2 + 1)*sizeof(cufftDoubleComplex);

    double* in;
    Complex* out;
    // Separately owned output buffers; they stay null when the transform is in place,
    // so the unconditional free/cudaFree calls below are no-ops for them.
    Complex* out_owned = 0;
    cufftDoubleComplex* d_out_owned = 0;
#ifdef IN_PLACE
    in = (double*) malloc(sizeof(Complex) * (n/2+1));
    out = (Complex*)in;
#else
    in = (double*) malloc(sizeof(double) * n);
    out_owned = (Complex*) malloc(sizeof(Complex) * (n/2+1));
    out = out_owned;
#endif
    if (!in || !out) {
        cout << "host allocation failed" << endl;
        free(out_owned);
        free(in);
        return 1;
    }
    for(int i=0; i<n; i++){
        in[i] = 1;
    }

    cufftHandle plan;
    cufftResult res = cufftPlan1d(&plan, n, CUFFT_D2Z, 1);
    if (res != CUFFT_SUCCESS) {
        cout << "cufft plan error: " << res << endl;
        free(out_owned);
        free(in);
        return 1;
    }

    cufftDoubleReal *d_in = 0;
    cufftDoubleComplex *d_out = 0;
#ifdef IN_PLACE
    // In place: the input buffer must already be large enough for the output.
    const size_t in_mem_size = out_mem_size;
    cudaMalloc((void **)&d_in, in_mem_size);
    d_out = (cufftDoubleComplex *)d_in;
#else
    const size_t in_mem_size = sizeof(cufftDoubleReal)*n;
    cudaMalloc((void **)&d_in, in_mem_size);
    cudaMalloc((void **)&d_out_owned, out_mem_size);
    d_out = d_out_owned;
#endif
    cudaCheckErrors("cuda malloc fail");
    cudaMemcpy(d_in, in, in_mem_size, cudaMemcpyHostToDevice);
    cudaCheckErrors("cuda memcpy H2D fail");

    int rc = 0;
    res = cufftExecD2Z(plan,d_in, d_out);
    if (res != CUFFT_SUCCESS) {
        cout << "cufft exec error: " << res << endl;
        rc = 1;
    } else {
        cudaMemcpy(out, d_out, out_mem_size, cudaMemcpyDeviceToHost);
        cudaCheckErrors("cuda memcpy D2H fail");
        // Prints indices 0 .. n/2 - 1 (the final value at index n/2 is not shown).
        for(int i=0; i<n/2; i++){
            cout << "out: " << i << " " << out[i].real() << " " << out[i].imag() << endl;
        }
    }

    // Single cleanup path for success and exec failure.
    cufftDestroy(plan);
    cudaFree(d_out_owned);
    cudaFree(d_in);
    free(out_owned);
    free(in);
    return rc;
}
Review the documentation on what is necessary to do an in-place transform in the real to complex case. The above code can be recompiled with -DIN_PLACE to see the behavior for an in-place transform, and the necessary code changes.

Best way to separate parallel MPI part in sequential program

I have a huge sequential program in which I want to parallelize some algorithms with MPI and CUDA. How do I correctly separate the sequential part from the parallel part? The problem lies in the nesting of the parallel algorithm, and also in the use of slurm or LoadLeveler (e.g. on my MPI cluster I can't write something like: mpirun -n 1 a.out: -n 2 b.out).
Example:
int main()
{
funcA();
}
void funcA()
{
funcB();
}
void funcB()
{
parallel algo starts here....
}
I've found a great solution for this problem. Here is some sample code:
#include <iostream>
#include <mpi.h>
#include <unistd.h>
using namespace std;
int main(int argc, char** argv) {
MPI_Init(&argc, &argv);
int r;
MPI_Comm_rank(MPI_COMM_WORLD, &r);
if (r == 0) {
cout << "[GUI]Start perfoming initialization...." << endl;
sleep(2);
cout << "[GUI]Send command to start execution...." << endl;
int command = 1;
//TODO: now it's hardcoded to send data to 1 proc
MPI_Send(&command, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
cout << "[GUI]Waiting for execution results..." << endl;
int buf[5];
MPI_Recv(&buf, 5, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
for (int i=0; i<5; i++)
{
cout << "buf["<< i << "] = " << buf[i] << endl;
}
} else {
int command;
MPI_Recv(&command, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
cout << "Received command: " << command << endl;
if (command == 1) {
cout << "[ALGO]Receive command to start execution" << endl;
sleep(2);
cout << "[ALGO]Send computed data..." << endl;
int buf[5] = {5,4,3,2,1};
MPI_Send(&buf, 5, MPI_INT, 0, 0, MPI_COMM_WORLD);
}
}
MPI_Finalize();
return 0;
}