MPI. Running mpi in function - function

I want to run some function using mpi from main but I don't know how it should be. It looks like:
#define MAXSIZE 100
int main (int argc, char **argv) {
int i;
float matrixA[MAXSIZE][MAXSIZE], matrixB[MAXSIZE][MAXSIZE], matrixC[MAXSIZE][MAXSIZE];
for(i=0;i<10;i++){
multiply(matrixA, matrixB, matrixC);
}
}
void multiply(float matrixA[MAXSIZE][MAXSIZE], float matrixB[MAXSIZE][MAXSIZE], float matrixC[MAXSIZE][MAXSIZE]) {
int rank; //process rank
int size; //number of processes
MPI_Init(&argc, &argv); //initialize MPI operations
MPI_Comm_rank(MPI_COMM_WORLD, &rank); //get the rank
MPI_Comm_size(MPI_COMM_WORLD, &size); //get number of processes
...someoperation...
MPI_Finalize();
}
I know how to run basic MPI without using other functions but I need this construction.

In an application instance, MPI can be initialized at most once. So the code structure you provided will not work.
the correct structure for your program is as follows:
#define MAXSIZE 100
int main (int argc, char **argv) {
int i;
float matrixA[MAXSIZE][MAXSIZE], matrixB[MAXSIZE][MAXSIZE], matrixC[MAXSIZE][MAXSIZE];
int rank; //process rank
int size; //number of processes
MPI_Init(&argc, &argv); //initialize MPI operations
MPI_Comm_rank(MPI_COMM_WORLD, &rank); //get the rank
MPI_Comm_size(MPI_COMM_WORLD, &size); //get number of processes
for(i=0;i<10;i++){
multiply(matrixA, matrixB, matrixC);
}
MPI_Finalize();
}
void multiply(float matrixA[MAXSIZE][MAXSIZE], float matrixB[MAXSIZE][MAXSIZE], float matrixC[MAXSIZE][MAXSIZE]) {
...someoperation...
}

This might be helpful to you.In your program, there is a for loop
for(i=0;i<10;i++)
{
multiply(matrixA, matrixB, matrixC);
}
I feel that you are trying to execute multiplication 10 times.You can give each multiplication to one process.So you can use the command, mpirun -np 10 executable.

Related

Does bool variable in kernel need to be synchronized

I have a kernel consisting of a for loop that searches through an array for a specific int value. I'm using a grid block of 256 threads to do this. However, when one thread finds the value, I want to let the other threads know to exit. Currently I'm using a boolean flag, but I'm not sure if its working properly. My concern is synchronization.
__device__ bool found;
__global__
void search()
{
for(int i = threadIdx.x; i<1000000; i += stride)
{
if(found == true)
{
break;
}
else if(arr[i] = x)
{
found = true;
break;
}
}
}
int main()
{
bool flag = false;
cudaMemcpyToSymbol(found, &flag, sizeof(bool), 0,cudaMemcpyHostToDevice);
}
As pointed out in comments, you can probably achieve what you want by declaring the global device flag to be volatile, which will inhibit caching, and by using a memory fence function. There really isn't a global synchronization primitive which would do want you want other than the new grid synchronization mechanism introduced in CUDA 9 and new hardware, but that probably isn't necessary in this case. Turning your pseudocode into a toy example:
#include <iostream>
#include <thrust/device_vector.h>
__device__ volatile bool found;
__device__ volatile size_t idx;
template<bool docheck>
__global__
void search(const int* arr, int x, size_t N)
{
size_t i = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for(; (i<N) && (!found); i += stride)
{
if(arr[i] == x)
{
if (docheck) found = true;
idx = i;
__threadfence();
break;
}
}
}
int main()
{
const size_t N = 1 << 24;
const size_t findidx = 280270;
const int findval = 0xdeadbeef;
thrust::device_vector<int> data(N,1);
data[findidx] = findval;
bool flag = false;
size_t zero = 0;
{
cudaMemcpyToSymbol(found, &flag, sizeof(bool));
cudaMemcpyToSymbol(idx, &zero, sizeof(size_t));
int blocks, threads;
cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<false>);
search<false><<<blocks, threads>>>(thrust::raw_pointer_cast(data.data()), findval, N);
cudaDeviceSynchronize();
size_t result = 0;
cudaMemcpyFromSymbol(&result, idx, sizeof(size_t));
std::cout << "result = " << result << std::endl;
}
{
cudaMemcpyToSymbol(found, &flag, sizeof(bool));
cudaMemcpyToSymbol(idx, &zero, sizeof(size_t));
int blocks, threads;
cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<true>);
search<true><<<blocks, threads>>>(thrust::raw_pointer_cast(data.data()), findval, N);
cudaDeviceSynchronize();
size_t result = 0;
cudaMemcpyFromSymbol(&result, idx, sizeof(size_t));
std::cout << "result = " << result << std::endl;
}
return 0;
}
and profiling it gives the following:
$ nvcc -arch=sm_52 -o notify notify.cu
$ nvprof ./notify
==3916== NVPROF is profiling process 3916, command: ./notify
result = 280270
result = 280270
==3916== Profiling application: ./notify
==3916== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 78.00% 1.6773ms 1 1.6773ms 1.6773ms 1.6773ms void search<bool=0>(int const *, int, unsigned long)
19.93% 428.63us 1 428.63us 428.63us 428.63us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
1.82% 39.199us 1 39.199us 39.199us 39.199us void search<bool=1>(int const *, int, unsigned long)
As you can see, the version which sets the found flag completes the search in 40 microseconds, whereas the version which does not set the flag takes 1.7 milliseconds. Given that the kernel is run with the maximum number of resident blocks in both cases, we can conclude that the early exit mechanism worked correctly and running blocks detected that the required value had been found.

Optimizing memory access for complex numbers

I have a kernel that operates on complex numbers, and I am loading the values like this:
thrust::complex<float> x = X[tIdx];
where X is in global memory. When I profile this kernel with nvvp, I find that it is memory bandwidth-limited and the profiler suggests that I improve the memory access pattern:
Global Load L2 Transactions/Access=8, Ideal Transactions/Access=4
The disassembly confirms that this line is indeed split into two 32-bit loads, producing a strided access pattern:
LDG.E R9, [R16];
LDG.E R11, [R16+0x4];
How can I get this to compile into a single 64-bit load?
Potential solutions
I realize this is pretty closely related to this earlier question but the proposed solutions (change the global memory layout or use shared memory) seem less ideal than a 64-bit load.
The NVidia developer blog suggests reinterpret_cast to a vector data type such as float2, but I'm a little hazy about how this fits in with pointer aliasing rules.
I must also confess that this is somewhat of a theoretical question. For this particular kernel, I'm limited by the device memory bandwidth, so halving the # of L2 transactions shouldn't significantly improve the overall performance. But I anticipate working with more complex numbers in my future, and if there's a simple solution then I'd like to start using it now.
The basic problem here is that the compiler seems to need explicit alignment specifications for a type before it will generate vector load and store instructions. Consider the following trivial example:
class __align__(8) cplx0
{
public:
__device__ __host__ cplx0(float _re, float _img) : re(_re), img(_img) {};
float re, img;
};
class cplx1
{
public:
__device__ __host__ cplx1(float _re, float _img) : re(_re), img(_img) {};
float re, img;
};
template<typename T>
__global__ void memsetkernel(T* out, const T val, int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll 8
for(; tid < N; tid += stride) out[tid] = val;
}
template<typename T>
__global__ void memcpykernel(const T* __restrict__ in, T* __restrict__ out, int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll 8
for(; tid < N; tid += stride) out[tid] = in[tid];
}
template<typename T>
void memcpy(const T* in, T* out, int Nitems)
{
int nthreads = 1024;
int nblocks = 13 * 2; // GTX 970 with 13 SM
memcpykernel<T><<<nblocks, nthreads>>>(in, out, Nitems);
cudaDeviceSynchronize();
}
template<typename T>
void memset(T* in, const T value, int Nitems)
{
int nthreads = 1024;
int nblocks = 13 * 2; // GTX 970 with 13 SM
memsetkernel<T><<<nblocks, nthreads>>>(in, value, Nitems);
cudaDeviceSynchronize();
}
int main(void)
{
const int Nitems = 1 << 24;
typedef cplx0 fcomplex0;
typedef cplx1 fcomplex1;
{
fcomplex0* in;
fcomplex0* out;
cudaMalloc((void **)&in, Nitems * sizeof(fcomplex0));
cudaMalloc((void **)&out, Nitems * sizeof(fcomplex1));
for(int i=0; i<10; i++) {
memset<fcomplex0>(in, fcomplex0(1.0f,1.0f), Nitems);
memcpy<fcomplex0>(in, out, Nitems);
}
cudaFree(in);
cudaFree(out);
}
{
fcomplex1* in;
fcomplex1* out;
cudaMalloc((void **)&in, Nitems * sizeof(fcomplex1));
cudaMalloc((void **)&out, Nitems * sizeof(fcomplex1));
for(int i=0; i<10; i++) {
memset<fcomplex1>(in, fcomplex1(1.0f,1.0f), Nitems);
memcpy<fcomplex1>(in, out, Nitems);
cudaDeviceSynchronize();
}
cudaFree(in);
cudaFree(out);
}
cudaDeviceReset();
return 0;
}
Here we has two home-baked complex types, one with explicit alignment specifications, and one without. Otherwise they are identical. Putting them through a naïve mempcy and memset kernels in this test harness allows us to inspect the code generation behaviour of the toolchain for each type and benchmark the performance.
Firstly, the code. For cplx0 class, which has explicit 8-byte alignment, the compiler emits vectorized loads and stores in both kernels:
memcpykernel
ld.global.nc.v2.f32 {%f5, %f6}, [%rd17];
st.global.v2.f32 [%rd18], {%f5, %f6};
memsetkernel
st.global.v2.f32 [%rd11], {%f1, %f2};
whereas for the cplx1 case, it does not:
memcpykernel
ld.global.nc.f32 %f1, [%rd16];
ld.global.nc.f32 %f2, [%rd16+4];
st.global.f32 [%rd15+4], %f2;
st.global.f32 [%rd15], %f1;
memsetkernel
st.global.f32 [%rd11+4], %f2;
st.global.f32 [%rd11], %f1;
Looking at performance, there is a non-trivial difference in performance for the memset case (CUDA 8 release toolkit, GTX 970 with Linux 367.48 driver):
$ nvprof ./complex_types
==29074== NVPROF is profiling process 29074, command: ./complex_types
==29074== Profiling application: ./complex_types
==29074== Profiling result:
Time(%) Time Calls Avg Min Max Name
33.04% 19.264ms 10 1.9264ms 1.9238ms 1.9303ms void memcpykernel<cplx1>(cplx1 const *, cplx1*, int)
32.72% 19.080ms 10 1.9080ms 1.9055ms 1.9106ms void memcpykernel<cplx0>(cplx0 const *, cplx0*, int)
19.15% 11.165ms 10 1.1165ms 1.1120ms 1.1217ms void memsetkernel<cplx1>(cplx1*, cplx1, int)
15.09% 8.7985ms 10 879.85us 877.67us 884.13us void memsetkernel<cplx0>(cplx0*, cplx0, int)
The Thrust templated complex type does not have an explicit alignment definition (although it potentially could via specialization, although that would somewhat defeat the purpose). So your only choice here is to either make your own version of the Thrust type with explicit alignment, or use another complex type which does (like the cuComplex type which CUBLAS and CUFFT use).

CUDA function to increase an array values

when I use this code in cuda it only increase a[0],a[1],a[2] other was 0 (didn't increased)
__global__ void inc2(int * a){
int i= threadIdx.x;
i%=10;
atomicAdd(&(a[i]),1);
}
when I write
__global__ void inc2(int * a){
int i= threadIdx.x;
i%=10;
atomicAdd(&(a[6]),1);
}
it didn't increase a[6]
what's wrong? sorry
all of the code is this
__global__ void inc2(int * a){
int i= threadIdx.x;
i%=10;
atomicAdd(&(a[6]),1);
}
int main()
{
//=============================================
int aaa[10]={0};
int *q;
cudaMalloc((void**)&q,100);
cudaMemcpy(q,aaa,10,cudaMemcpyHostToDevice);
inc2<<<100,100>>>(q);
cudaMemcpy(aaa,q,10,cudaMemcpyDeviceToHost);
printf("\n\n");
for(int i=0;i<10;i++){
printf("%d\t",aaa[i]);
}
cudaFree(q);
return 0;
}
First of all, you should use proper cuda error checking any time you are having trouble with a CUDA code.
You may be confused about the size parameters associated with functions like cudaMalloc or cudaMemcpy. They represent a size in bytes. So this:
cudaMemcpy(aaa,q,10,cudaMemcpyDeviceToHost);
only transfers 10 bytes, which is 2.5 int quantities. If you want to see the modified value of a[6], you're going to have to transfer more than the first 2 int quantities in a.
If you modify these lines:
cudaMemcpy(q,aaa,40,cudaMemcpyHostToDevice);
^^
and:
cudaMemcpy(aaa,q,40,cudaMemcpyDeviceToHost);
^^
I think you'll have better results.

Adding values on GPU

i have a class called Product.
Each product has a value and i want to add these values on GPU. I filled my array on host side
int * h_A, * d_A;
h_A = (int*) malloc(enterNum * sizeof(int));
cudaMalloc((void **) &d_A, enterNum * sizeof(int));
Product p("Product", price);
h_A[i] = p.getValue();
while (i < enterNum) {
i++;
cout << "Enter product name:";
cin >> desc;
cout << "Enter product price:";
cin >> price;
Product p("Product", price);
h_A[i] = p.getValue();
}
cudaMemcpy(d_A, h_A, enterNum, cudaMemcpyHostToDevice);
priceSum<<<enterNum, 1024>>>(d_A,enterNum,result);
int result2 = 0;
cudaMemcpy(result, result2, enterNum, cudaMemcpyDeviceToHost);
here cudaMemcpy function gives error because i dont use pointer. What can i do here? I dont need to use pointer here isn't it?
this is my summation function:
__global__ void priceSum(int *dA, int count, int result) {
int tid = blockIdx.x;
if (tid < count){
result+= dA[tid];
}
}
full code:
using namespace std;
#include "cuda_runtime.h"
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <stdlib.h>
class Product {
private:
char * description;
int productCode;
int value;
static int lastCode;
public:
Product(char* descriptionP, int valueP) {
productCode = ++lastCode;
value = valueP;
description = new char[strlen(descriptionP) + 1];
strcpy(description, descriptionP);
}
Product(Product& other) {
productCode = ++lastCode;
description = new char[strlen(other.description) + 1];
strcpy(description, other.description);
}
~Product() {
delete[] description;
}
char* getDescription() const {
return description;
}
void setDescription(char* description) {
this->description = description;
}
int getValue() const {
return value;
}
void setValue(int value) {
this->value = value;
}
};
int Product::lastCode = 1000;
__global__ void priceSum(int *dA, int count, int * result) {
int tid = blockIdx.x;
if (tid < count)
result+= dA[tid];
}
int main(void) {
int enterNum, price, * result = 0;
string desc;
const char * desc2;
cout << "How many products do you want to enter?";
cin >> enterNum;
int * h_A, * d_A;
h_A = (int*) malloc(enterNum * sizeof(int));
cudaMalloc((void **) &d_A, enterNum * sizeof(int));
int i = 0;
while (i < enterNum) {
cout << "Enter product name:";
cin >> desc;
cout << "Enter product price:";
cin >> price;
Product p("Product", price);
h_A[i] = p.getValue();
i++;
}
cudaMemcpy(d_A, h_A, enterNum * sizeof(int), cudaMemcpyHostToDevice);
priceSum<<<enterNum, 1>>>(d_A,enterNum,result);
int result2 = 0;
cudaMemcpy(&result2, result, enterNum, cudaMemcpyDeviceToHost);
cout << result2;
return 0;
}
You should show the definition of result in your host code, but I assume it is:
int result;
based on how you are passing it to your priceSum kernel.
You have more than 1 problem here.
In your priceSum kernel, you are summing the values in dA[] and storing the answer in result. But you have passed the variable result to the kernel by value instead of by reference so the value you are modifying is local to the function, and will not show up anywhere else. When a function in C needs to modify a variable that is passed to it via the parameter list, and the modified variable is to show up in the function calling context, it's necessary to pass that parameter by reference (i.e. using a pointer) rather than by value. Note this is based on the C programming language and is not specific to CUDA. So you should rewrite your kernel definition as:
__global__ void priceSum(int *dA, int count, int *result) {
Regarding your cudaMemcpy call, there are several issues that need to be cleaned up. First, we need the storage for result to be properly created using cudaMalloc (before the kernel is called, because the kernel will store something there.) Next, we need to fix the parameter list of the cudaMemcpy call itself. So your host code should be rewritten as:
cudaMemcpy(d_A, h_A, enterNum, cudaMemcpyHostToDevice);
int *result;
cudaMalloc((void **)&result, sizeof(int));
priceSum<<<enterNum, 1024>>>(d_A,enterNum,result);
int result2 = 0;
cudaMemcpy(&result2, result, sizeof(int), cudaMemcpyDeviceToHost);
There appear to be other problems with your code, around the grouping of data for threads and blocks. But you haven't shown enough of your program for me to make sense of it. So let me point out that your code shows only a single value for result (and result2), yet the way your kernel is written, each thread will add its value of dA[tid] to result. You can't have a bunch of threads all updating a single value in global memory with no control mechanism, and expect to get a sensible result. Problems like this are usually best handled with a classical parallel reduction algorithm, but for the sake of simplicity, to try and get something working, you can use atomics:
atomicAdd(result, dA[tid]);
Sorry, but your kernel just makes no sense at all. You are using blockIdx.x as your tid variable, but let's note that blockIdx.x is a number that is the same for every thread in a particular block. So then going on to have every thread add dA[tid] to result in this fashion just doesn't make sense. I believe it will make more sense if you change your kernel invocation to:
priceSum<<<enterNum, 1>>>(d_A,enterNum,result);

CUDA random number generating

I would like to generate random numbers in my __device__ function, and keep it in my int Board[500] , I found some examples , but they used some type named curandState. I only need a function like a rand() in C++.
here is my code , I have N3[40000] array in my device memory , I generate some random numbers in my kernel working for one thread (I mean this "kernel <<<1,1>>> ... ") , then I copy it to my N2[40000] from CPU, and print it ,so here is the code
#include <iostream>
#include <Cuda.h>
#include<curand.h>
#include<curand_kernel.h>
int n = 200;
using namespace std;
__device__ float generate( curandState* globalState, int ind )
{
//int ind = threadIdx.x;
curandState localState = globalState[ind];
float RANDOM = curand_uniform( &localState );
globalState[ind] = localState;
return RANDOM;
}
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
int id = threadIdx.x;
curand_init ( seed, id, 0, &state[id] );
}
__global__ void kernel(float* N, curandState* globalState, int n)
{
// generate random numbers
for(int i=0;i<40000;i++)
{
int k = generate(globalState, i) * 100000;
while(k > n*n-1)
{
k-=(n*n-1);
}
N[i] = k;
}
}
int main()
{
int N=40000;
curandState* devStates;
cudaMalloc ( &devStates, N*sizeof( curandState ) );
// setup seeds
setup_kernel <<< 1, N >>> ( devStates,unsigned(time(NULL)) );
float N2[40000];
float* N3;
cudaMalloc((void**) &N3, sizeof(float)*N);
kernel<<<1,1>>> (N3, devStates, n);
cudaMemcpy(N2, N3, sizeof(float)*N, cudaMemcpyDeviceToHost);
for(int i=0;i<N;i++)
{
cout<<N2[i]<<endl;
}
return 0;
}
You may use curand library to generate random numbers in device memory and then run your kernel without even having to copy those values to the host.