I'm trying to launch multiple CUDA FFT kernels asynchronously using streams.
For that, I'm creating my streams and cuFFT forward and inverse plans as follows:
streams = (cudaStream_t*) malloc(sizeof(cudaStream_t)*streamNum);
plansF = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
plansI = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
for(int i=0; i<streamNum; i++)
{
cudaStreamCreate(&streams[i]);
CHECK_ERROR(5)
cufftPlan1d(&plansF[i], ticks, CUFFT_R2C,1);
CHECK_ERROR(5)
cufftPlan1d(&plansI[i], ticks, CUFFT_C2R,1);
CHECK_ERROR(5)
cufftSetStream(plansF[i],streams[i]);
CHECK_ERROR(5)
cufftSetStream(plansI[i],streams[i]);
CHECK_ERROR(5)
}
In the main function, I'm launching forward FFTs as follows:
for(w=1;w<q;w++)
{
cufftExecR2C(plansF[w], gpuMem1+k,gpuMem2+j);
CHECK_ERROR(8)
k += rect_small_real;
j += rect_small_complex;
}
I also have other kernels that I launch asynchronously with the same streams.
When I profile my application using Visual Profiler 5.0, I see that all kernels except the CUDA FFT (both forward and inverse) run in parallel and overlap. FFT kernels do run in different streams, but they do not overlap, as they actually run sequentially.
Can anyone tell me what my problem is?
My environment is VS 2008, 64 bit, Windows 7.
Thanks.
This is a worked example of cuFFT execution and memcopies using streams in CUDA on the Kepler architecture.
Here is the code:
#include <stdio.h>
#include <cufft.h>
#define NUM_STREAMS 3
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/********/
/* MAIN */
/********/
int main()
{
const int N = 5000;
// --- Host input data initialization
float2 *h_in1 = new float2[N];
float2 *h_in2 = new float2[N];
float2 *h_in3 = new float2[N];
for (int i = 0; i < N; i++) {
h_in1[i].x = 1.f;
h_in1[i].y = 0.f;
h_in2[i].x = 1.f;
h_in2[i].y = 0.f;
h_in3[i].x = 1.f;
h_in3[i].y = 0.f;
}
// --- Host output data initialization
float2 *h_out1 = new float2[N];
float2 *h_out2 = new float2[N];
float2 *h_out3 = new float2[N];
for (int i = 0; i < N; i++) {
h_out1[i].x = 0.f;
h_out1[i].y = 0.f;
h_out2[i].x = 0.f;
h_out2[i].y = 0.f;
h_out3[i].x = 0.f;
h_out3[i].y = 0.f;
}
// --- Registers host memory as page-locked (required for async cudaMemcpyAsync)
gpuErrchk(cudaHostRegister(h_in1, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in2, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in3, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_out1, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_out2, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_out3, N*sizeof(float2), cudaHostRegisterPortable));
// --- Device input data allocation
float2 *d_in1; gpuErrchk(cudaMalloc((void**)&d_in1, N*sizeof(float2)));
float2 *d_in2; gpuErrchk(cudaMalloc((void**)&d_in2, N*sizeof(float2)));
float2 *d_in3; gpuErrchk(cudaMalloc((void**)&d_in3, N*sizeof(float2)));
float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
// --- Creates CUDA streams
cudaStream_t streams[NUM_STREAMS];
for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i]));
// --- Creates cuFFT plans and sets them in streams
cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
for (int i = 0; i < NUM_STREAMS; i++) {
cufftPlan1d(&plans[i], N, CUFFT_C2C, 1);
cufftSetStream(plans[i], streams[i]);
}
// --- Async memcopies and computations
gpuErrchk(cudaMemcpyAsync(d_in1, h_in1, N*sizeof(float2), cudaMemcpyHostToDevice, streams[0]));
gpuErrchk(cudaMemcpyAsync(d_in2, h_in2, N*sizeof(float2), cudaMemcpyHostToDevice, streams[1]));
gpuErrchk(cudaMemcpyAsync(d_in3, h_in3, N*sizeof(float2), cudaMemcpyHostToDevice, streams[2]));
cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD);
cufftExecC2C(plans[1], (cufftComplex*)d_in2, (cufftComplex*)d_out2, CUFFT_FORWARD);
cufftExecC2C(plans[2], (cufftComplex*)d_in3, (cufftComplex*)d_out3, CUFFT_FORWARD);
gpuErrchk(cudaMemcpyAsync(h_out1, d_out1, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[0]));
gpuErrchk(cudaMemcpyAsync(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[1]));
gpuErrchk(cudaMemcpyAsync(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[2]));
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamSynchronize(streams[i]));
// --- Releases resources
gpuErrchk(cudaHostUnregister(h_in1));
gpuErrchk(cudaHostUnregister(h_in2));
gpuErrchk(cudaHostUnregister(h_in3));
gpuErrchk(cudaHostUnregister(h_out1));
gpuErrchk(cudaHostUnregister(h_out2));
gpuErrchk(cudaHostUnregister(h_out3));
gpuErrchk(cudaFree(d_in1));
gpuErrchk(cudaFree(d_in2));
gpuErrchk(cudaFree(d_in3));
gpuErrchk(cudaFree(d_out1));
gpuErrchk(cudaFree(d_out2));
gpuErrchk(cudaFree(d_out3));
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i]));
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out1;
delete[] h_out2;
delete[] h_out3;
cudaDeviceReset();
return 0;
}
Please add cuFFT error checking according to CUFFT error handling.
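For completeness, here is a minimal sketch of such a check, modeled on the gpuErrchk macro above (the cufftErrchk/cufftAssert names are illustrative; note that cuFFT calls return cufftResult codes, not cudaError_t, so they need their own assert):

#define cufftErrchk(ans) { cufftAssert((ans), __FILE__, __LINE__); }
inline void cufftAssert(cufftResult code, const char *file, int line, bool abort=true)
{
    // cuFFT has no cudaGetErrorString equivalent, so print the raw error code
    if (code != CUFFT_SUCCESS)
    {
        fprintf(stderr, "cufftAssert: error code %d %s %d\n", code, file, line);
        if (abort) exit(code);
    }
}

With this in place, the plan creation and execution calls above would be wrapped as, e.g., cufftErrchk(cufftPlan1d(&plans[i], N, CUFFT_C2C, 1));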
Below is some profiling information from testing the above algorithm on a Kepler K20c card. As you will see, you achieve true overlap between computation and memory transfers only when N is sufficiently large.
(Profiler timelines for N = 5000, N = 50000, and N = 500000 are omitted here; true overlap appears only for the larger sizes.)
The problem is in the hardware you use.
All recent CUDA-capable GPUs can execute a kernel and copy data in both directions concurrently. However, only devices with Compute Capability 3.5 have the feature named Hyper-Q.
Briefly, these GPUs implement several (32) hardware kernel queues. On earlier GPUs, only one hardware queue is available.
This means that CUDA streams are only virtual, and on old hardware their usage makes sense only for overlapping computation with memory copies. Of course, this holds not only for cuFFT but also for your own kernels!
Look carefully at the output of the Visual Profiler. It is tempting to read the timeline visualization as exact data about GPU execution, but it is not that simple. Some rows show data that refers to the point in time at which the kernel launch call was executed on the host (usually the orange ones), while the execution of the corresponding kernel on the GPU is shown separately (the blue rectangles). The same holds for memory transfers (the exact time is shown as the light brown rectangles).
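If you are unsure whether your card has the feature, a quick check is to query the compute capability; a minimal sketch (Hyper-Q requires Compute Capability 3.5 or higher):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("Compute capability: %d.%d\n", prop.major, prop.minor);
    if (prop.major > 3 || (prop.major == 3 && prop.minor >= 5))
        printf("Hyper-Q is available on this device\n");
    else
        printf("Only one hardware kernel queue on this device\n");
    return 0;
}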
Hope I helped you solve your problem.
Here's a riff on @JackOLantern's code that allows easy variation of the number of FFTs, FFT length, and stream count to experiment with GPU utilization in nvvp.
// Compile with:
// nvcc --std=c++11 stream_parallel.cu -o stream_parallel -lcufft
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>
// Print file name, line number, and error code when a CUDA error occurs.
#define check_cuda_errors(val) __check_cuda_errors__ ( (val), #val, __FILE__, __LINE__ )
template <typename T>
inline void __check_cuda_errors__(T code, const char *func, const char *file, int line) {
if (code) {
std::cout << "CUDA error at "
<< file << ":" << line << std::endl
<< "error code: " << (unsigned int) code
<< " type: \"" << cudaGetErrorString(cudaGetLastError()) << "\"" << std::endl
<< "func: \"" << func << "\""
<< std::endl;
cudaDeviceReset();
exit(EXIT_FAILURE);
}
}
int main(int argc, char *argv[]) {
// Number of FFTs to compute.
const int NUM_DATA = 64;
// Length of each FFT.
const int N = 1048576;
// Number of GPU streams across which to distribute the FFTs.
const int NUM_STREAMS = 4;
// Allocate and initialize host input data.
float2 **h_in = new float2 *[NUM_STREAMS];
for (int ii = 0; ii < NUM_STREAMS; ii++) {
h_in[ii] = new float2[N];
for (int jj = 0; jj < N; ++jj) {
h_in[ii][jj].x = (float) 1.f;
h_in[ii][jj].y = (float) 0.f;
}
}
// Allocate and initialize host output data.
float2 **h_out = new float2 *[NUM_STREAMS];
for (int ii = 0; ii < NUM_STREAMS; ii++) {
h_out[ii] = new float2[N];
for (int jj = 0; jj < N; ++jj) {
h_out[ii][jj].x = 0.f;
h_out[ii][jj].y = 0.f;
}
}
// Pin host input and output memory for cudaMemcpyAsync.
for (int ii = 0; ii < NUM_STREAMS; ii++) {
check_cuda_errors(cudaHostRegister(h_in[ii], N*sizeof(float2), cudaHostRegisterPortable));
check_cuda_errors(cudaHostRegister(h_out[ii], N*sizeof(float2), cudaHostRegisterPortable));
}
// Allocate pointers to device input and output arrays.
float2 **d_in = new float2 *[NUM_STREAMS];
float2 **d_out = new float2 *[NUM_STREAMS];
// Allocate input and output arrays on device.
for (int ii = 0; ii < NUM_STREAMS; ii++) {
check_cuda_errors(cudaMalloc((void**)&d_in[ii], N*sizeof(float2)));
check_cuda_errors(cudaMalloc((void**)&d_out[ii], N*sizeof(float2)));
}
// Create CUDA streams.
cudaStream_t streams[NUM_STREAMS];
for (int ii = 0; ii < NUM_STREAMS; ii++) {
check_cuda_errors(cudaStreamCreate(&streams[ii]));
}
// Creates cuFFT plans and sets them in streams
cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
for (int ii = 0; ii < NUM_STREAMS; ii++) {
cufftPlan1d(&plans[ii], N, CUFFT_C2C, 1);
cufftSetStream(plans[ii], streams[ii]);
}
// Fill streams with async memcopies and FFTs.
for (int ii = 0; ii < NUM_DATA; ii++) {
int jj = ii % NUM_STREAMS;
check_cuda_errors(cudaMemcpyAsync(d_in[jj], h_in[jj], N*sizeof(float2), cudaMemcpyHostToDevice, streams[jj]));
cufftExecC2C(plans[jj], (cufftComplex*)d_in[jj], (cufftComplex*)d_out[jj], CUFFT_FORWARD);
check_cuda_errors(cudaMemcpyAsync(h_out[jj], d_out[jj], N*sizeof(float2), cudaMemcpyDeviceToHost, streams[jj]));
}
// Wait for calculations to complete.
for(int ii = 0; ii < NUM_STREAMS; ii++) {
check_cuda_errors(cudaStreamSynchronize(streams[ii]));
}
// Free memory and streams.
for (int ii = 0; ii < NUM_STREAMS; ii++) {
check_cuda_errors(cudaHostUnregister(h_in[ii]));
check_cuda_errors(cudaHostUnregister(h_out[ii]));
check_cuda_errors(cudaFree(d_in[ii]));
check_cuda_errors(cudaFree(d_out[ii]));
delete[] h_in[ii];
delete[] h_out[ii];
check_cuda_errors(cudaStreamDestroy(streams[ii]));
}
free(plans); // allocated with malloc, so free rather than delete
cudaDeviceReset();
return 0;
}
I am new to CUDA and cuBLAS, and recently I have been trying to use the batched cuBLAS API to solve multiple systems of linear equations. Here's my code:
The size of the matrix is N, and the number of matrices (batch size) is numOfMat.
#include <stdio.h>
#include <stdlib.h>
#include <cstdio>
#include <iostream>
#include <chrono>
#include <random>
#include <cuda.h>
#include <cusolverDn.h>
#include <cuda_runtime.h>
#include <cuComplex.h> // deal with complex numbers
#include <cuda_profiler_api.h>
using namespace std::chrono;
#define N 6
#define numOfMat 500000
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main() {
std::random_device device;
std::mt19937 generator(device());
std::uniform_real_distribution<double> distribution(1., 5.);
high_resolution_clock::time_point t1;
high_resolution_clock::time_point t2;
double duration = 0;
double duration_1 = 0;
// step 1: cuda solver initialization
cublasHandle_t cublas_handle;
cublasCreate_v2(&cublas_handle);
cublasStatus_t stat;
int* PivotArray;
int* infoArray;
cudaError_t cudaStatUnified1 = cudaSuccess;
cudaError_t cudaStatUnified2 = cudaSuccess;
const cuDoubleComplex alpha = make_cuDoubleComplex(1.0f, 0.0f);
cudaStatUnified1 = cudaMallocManaged(&PivotArray, N * numOfMat * sizeof(int));
cudaStatUnified2 = cudaMallocManaged(&infoArray, numOfMat * sizeof(int));
if ((cudaSuccess != cudaStatUnified1) || (cudaSuccess != cudaStatUnified2))
std::cout <<"unified memory allocated unsuccessful!"<<std::endl;
//ALLOCATE MEMORY - using unified memory
cuDoubleComplex** h_A;
cudaMallocManaged(&h_A, sizeof(cuDoubleComplex*) * numOfMat);
cudaMallocManaged(&(h_A[0]), sizeof(cuDoubleComplex)*numOfMat*N*N);
for (int nm = 1; nm < numOfMat; nm++)
h_A[nm] = h_A[nm-1]+ N * N;
cuDoubleComplex** h_b;
cudaMallocManaged(&h_b, sizeof(cuDoubleComplex*) * numOfMat);
cudaMallocManaged(&(h_b[0]), sizeof(cuDoubleComplex) * numOfMat * N);
for (int nm = 1; nm < numOfMat; nm++)
h_b[nm] = h_b[nm-1] + N;
// FILL MATRICES
for (int nm = 0; nm < numOfMat; nm++)
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A[nm][j * N + i] = make_cuDoubleComplex(distribution(generator), distribution(generator));
// FILL COEFFICIENTS
for (int nm = 0; nm < numOfMat; nm++)
for (int i = 0; i < N; i++)
h_b[nm][i] = make_cuDoubleComplex(distribution(generator), distribution(generator));
t1 = high_resolution_clock::now();
// step 2: Perform CUBLAS LU solver
stat = cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("-data download failed");
gpuErrchk( cudaDeviceSynchronize() );
// check if the input matrix is singular
/*for (int i = 0; i < numOfMat; i++)
if (infoArray[i] != 0) {
fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
}*/
// step 3: INVERT UPPER AND LOWER TRIANGULAR MATRICES
// --- Function solves the triangular linear system with multiple RHSs
// --- Function overrides b as a result
stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("--data download failed");
gpuErrchk( cudaDeviceSynchronize() );
stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("---data download failed");
gpuErrchk( cudaDeviceSynchronize() );
t2 = high_resolution_clock::now();
duration = duration_cast<microseconds>(t2 - t1).count();
std::cout<<duration<<std::endl;
}
The code works fine, but when I plot the computational time versus the number of matrices, the curve looks like this:
My question is: why is the computational time linear in the number of matrices? Intuitively, I would expect the curve to be flat once the batch size is large enough. However, even when the batch size reaches 500,000, the time still appears to be linear in the batch size.
How can that be? Is there an explanation for this behaviour?
I think you need to look more closely at your data. If I run a modification of your code on Google Colab (Tesla T4) I get this:
Which looks largely like your figure. But look more closely (log scales help):
You can clearly see that up to a certain point (around 2^6 = 64 matrices) the runtime is largely independent of the number of matrices, but then scaling becomes linear as sizes increase further. That is the transition from being able to run the whole workload in parallel to exhausting the GPU's parallel capacity and having to schedule many parallel groups of operations to execute the workload. You might infer that for this particular GPU, parallel capacity runs out at between 64 and 128 concurrent operations (the T4 has 40 SMs, so it might well be 80 if each SM could accommodate 2 of these operations concurrently), after which runtime scales with multiples of that limiting size.
This is completely normal behaviour for any parallel computing architecture I am familiar with.
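If you want to reproduce this kind of plot with the code from the question, a minimal sketch of the sweep is below (it assumes the handle and arrays from the question's code are in scope; note that cublasZgetrfBatched overwrites h_A, so for rigorous numbers you should re-fill the matrices before each run):

for (int batch = 1; batch <= numOfMat; batch *= 2) {
    cudaEvent_t start, stop;
    float ms;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    // Time only the batched LU factorization for this batch size
    stat = cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, batch);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("batch = %7d, time = %f ms\n", batch, ms);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}

Plotting ms against batch on log-log axes makes the knee visible.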
I have a CUDA program for calculating FFTs of, let's say, size 50000. Currently, I copy the whole array to the GPU and execute the cuFFT. Now I am trying to optimize the program, and the NVIDIA Visual Profiler tells me to hide the memcopy by overlapping it with concurrent computations. My question is:
Is it possible, for example, to copy the first 5000 elements, then start calculating, then copy the next chunk of data in parallel to the calculations, etc.?
Since a DFT is basically a sum over the time values multiplied with a complex exponential function, I think it should be possible to calculate the FFT "blockwise".
Does cuFFT support this? Is it, in general, a good computational idea?
EDIT
To be clearer, I do not want to calculate different FFTs in parallel on different arrays. Let's say I have a big trace of a sinusoidal signal in the time domain and I want to know which frequencies are in the signal. My idea is to copy, for example, one third of the signal length to the GPU, then the next third, while calculating the FFT on the first third of the already copied input values in parallel. Then copy the last third and update the output values until all the time values are processed. So in the end there should be one output array with a peak at the frequency of the sinusoid.
Please take into account the comments above and, in particular, that:
if you calculate the FFT over Npartial elements, you will have an output of Npartial elements;
(following Robert Crovella) all the data required for the cuFFT must be resident on the device before the cuFFT call is launched, so you cannot break the data into pieces for a single cuFFT operation and begin that operation before all pieces are on the GPU; furthermore, a cuFFT call is opaque.
Taking the above two points into account, I think you can only "emulate" what you would like to achieve by properly using zero padding in the way illustrated by the code below. As you will see, letting N be the data size, by dividing the data into NUM_STREAMS chunks the code performs NUM_STREAMS zero-padded and streamed cuFFT calls of size N. After the cuFFT, you have to combine (sum) the partial results.
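The reason the zero-padding trick works is the linearity of the DFT. Writing the length-N input x as the sum of the S = NUM_STREAMS chunks x_j, each zero-padded to length N at its original offset, the transform of the whole signal is the sum of the transforms of the chunks:

X[k] = \sum_{n=0}^{N-1} x[n]\, e^{-2\pi i k n / N} = \sum_{j=0}^{S-1} \sum_{n=0}^{N-1} x_j[n]\, e^{-2\pi i k n / N} = \sum_{j=0}^{S-1} X_j[k]

This is exactly the combination step performed by the summing kernel at the end of the code.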
#include <stdio.h>
#include <cufft.h>
#define BLOCKSIZE 32
#define NUM_STREAMS 3
/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/******************/
/* SUMMING KERNEL */
/******************/
__global__ void kernel(float2 *vec1, float2 *vec2, float2 *vec3, float2 *out, int N) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
out[tid].x = vec1[tid].x + vec2[tid].x + vec3[tid].x;
out[tid].y = vec1[tid].y + vec2[tid].y + vec3[tid].y;
}
}
/********/
/* MAIN */
/********/
int main()
{
const int N = 600000;
const int Npartial = N / NUM_STREAMS;
// --- Host input data initialization
float2 *h_in1 = new float2[Npartial];
float2 *h_in2 = new float2[Npartial];
float2 *h_in3 = new float2[Npartial];
for (int i = 0; i < Npartial; i++) {
h_in1[i].x = 1.f;
h_in1[i].y = 0.f;
h_in2[i].x = 1.f;
h_in2[i].y = 0.f;
h_in3[i].x = 1.f;
h_in3[i].y = 0.f;
}
// --- Host output data initialization
float2 *h_out = new float2[N];
// --- Registers host memory as page-locked (required for async cudaMemcpyAsync)
gpuErrchk(cudaHostRegister(h_in1, Npartial*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in2, Npartial*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in3, Npartial*sizeof(float2), cudaHostRegisterPortable));
// --- Device input data allocation
float2 *d_in1; gpuErrchk(cudaMalloc((void**)&d_in1, N*sizeof(float2)));
float2 *d_in2; gpuErrchk(cudaMalloc((void**)&d_in2, N*sizeof(float2)));
float2 *d_in3; gpuErrchk(cudaMalloc((void**)&d_in3, N*sizeof(float2)));
float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
float2 *d_out; gpuErrchk(cudaMalloc((void**)&d_out, N*sizeof(float2)));
// --- Zero padding
gpuErrchk(cudaMemset(d_in1, 0, N*sizeof(float2)));
gpuErrchk(cudaMemset(d_in2, 0, N*sizeof(float2)));
gpuErrchk(cudaMemset(d_in3, 0, N*sizeof(float2)));
// --- Creates CUDA streams
cudaStream_t streams[NUM_STREAMS];
for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i]));
// --- Creates cuFFT plans and sets them in streams
cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
for (int i = 0; i < NUM_STREAMS; i++) {
cufftPlan1d(&plans[i], N, CUFFT_C2C, 1);
cufftSetStream(plans[i], streams[i]);
}
// --- Async memcopies and computations
gpuErrchk(cudaMemcpyAsync(d_in1, h_in1, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[0]));
gpuErrchk(cudaMemcpyAsync(&d_in2[Npartial], h_in2, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[1]));
gpuErrchk(cudaMemcpyAsync(&d_in3[2*Npartial], h_in3, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[2]));
cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD);
cufftExecC2C(plans[1], (cufftComplex*)d_in2, (cufftComplex*)d_out2, CUFFT_FORWARD);
cufftExecC2C(plans[2], (cufftComplex*)d_in3, (cufftComplex*)d_out3, CUFFT_FORWARD);
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamSynchronize(streams[i]));
kernel<<<iDivUp(BLOCKSIZE,N), BLOCKSIZE>>>(d_out1, d_out2, d_out3, d_out, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_out, d_out, N*sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("i = %i; real(h_out) = %f; imag(h_out) = %f\n", i, h_out[i].x, h_out[i].y);
// --- Releases resources
gpuErrchk(cudaHostUnregister(h_in1));
gpuErrchk(cudaHostUnregister(h_in2));
gpuErrchk(cudaHostUnregister(h_in3));
gpuErrchk(cudaFree(d_in1));
gpuErrchk(cudaFree(d_in2));
gpuErrchk(cudaFree(d_in3));
gpuErrchk(cudaFree(d_out1));
gpuErrchk(cudaFree(d_out2));
gpuErrchk(cudaFree(d_out3));
gpuErrchk(cudaFree(d_out));
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i]));
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
cudaDeviceReset();
return 0;
}
This is the timeline of the above code when run on a Kepler K20c card. As you can see, the computation overlaps the async memory transfers.
Looking at kernel launches within the code of CUDA Thrust, it seems they always use the default stream. Can I make Thrust use a stream of my choice? Am I missing something in the API?
I want to update the answer provided by talonmies following the release of Thrust 1.8, which introduces the possibility of specifying the CUDA execution stream as
thrust::cuda::par.on(stream)
see also
Thrust Release 1.8.0.
In the following, I'm recasting the example in
False dependency issue for the Fermi architecture
in terms of CUDA Thrust APIs.
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <thrust\device_vector.h>
#include <thrust\execution_policy.h>
#include "Utilities.cuh"
using namespace std;
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
struct BinaryOp{ __host__ __device__ int operator()(const int& o1,const int& o2) { return o1 * o2; } };
int main()
{
const int N = 6000000;
// --- Host side input data allocation and initialization. Registering host memory as page-locked (required for async cudaMemcpyAsync).
int *h_in = new int[N]; for(int i = 0; i < N; i++) h_in[i] = 5;
gpuErrchk(cudaHostRegister(h_in, N * sizeof(int), cudaHostRegisterPortable));
// --- Host side output data allocation and initialization. Registering host memory as page-locked (required for async cudaMemcpyAsync).
int *h_out = new int[N]; for(int i = 0; i < N; i++) h_out[i] = 0;
gpuErrchk(cudaHostRegister(h_out, N * sizeof(int), cudaHostRegisterPortable));
// --- Host side check results vector allocation and initialization
int *h_checkResults = new int[N]; for(int i = 0; i < N; i++) h_checkResults[i] = h_in[i] * h_in[i];
// --- Device side input data allocation.
int *d_in = 0; gpuErrchk(cudaMalloc((void **)&d_in, N * sizeof(int)));
// --- Device side output data allocation.
int *d_out = 0; gpuErrchk( cudaMalloc((void **)&d_out, N * sizeof(int)));
int streamSize = N / NUM_STREAMS;
size_t streamMemSize = N * sizeof(int) / NUM_STREAMS;
// --- Set kernel launch configuration
dim3 nThreads = dim3(NUM_THREADS,1,1);
dim3 nBlocks = dim3(NUM_BLOCKS, 1,1);
dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));
// --- Create CUDA streams
cudaStream_t streams[NUM_STREAMS];
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamCreate(&streams[i]));
/**************************/
/* BREADTH-FIRST APPROACH */
/**************************/
for(int i = 0; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
}
for(int i = 0; i < NUM_STREAMS; i++)
{
int offset = i * streamSize;
thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_in[offset]) + streamSize/2,
thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_out[offset]), BinaryOp());
thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset + streamSize/2]), thrust::device_pointer_cast(&d_in[offset + streamSize/2]) + streamSize/2,
thrust::device_pointer_cast(&d_in[offset + streamSize/2]), thrust::device_pointer_cast(&d_out[offset + streamSize/2]), BinaryOp());
}
for(int i = 0; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
}
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamSynchronize(streams[i]));
gpuErrchk(cudaDeviceSynchronize());
// --- Release resources
gpuErrchk(cudaHostUnregister(h_in));
gpuErrchk(cudaHostUnregister(h_out));
gpuErrchk(cudaFree(d_in));
gpuErrchk(cudaFree(d_out));
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamDestroy(streams[i]));
cudaDeviceReset();
// --- GPU output check
int sum = 0;
for(int i = 0; i < N; i++) {
//printf("%i %i\n", h_out[i], h_checkResults[i]);
sum += h_checkResults[i] - h_out[i];
}
cout << "Error between CPU and GPU: " << sum << endl;
delete[] h_in;
delete[] h_out;
delete[] h_checkResults;
return 0;
}
The Utilities.cu and Utilities.cuh files needed to run such an example are maintained at this github page.
The Visual Profiler timeline shows the concurrency of CUDA Thrust operations and memory transfers.
No, you are not missing anything (at least up to the release snapshot which ships with CUDA 6.0).
The original Thrust tag-based dispatch system deliberately abstracts all of the underlying CUDA API calls away, sacrificing some performance for ease of use and consistency (keep in mind that Thrust has backends other than CUDA). If you want that level of flexibility, you will need to try another library (CUB, for example).
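For reference, a minimal sketch of how CUB exposes the stream directly (here cub::DeviceReduce::Sum, whose last argument is the stream; d_in, d_out, n, and stream are assumed to be set up by the caller):

#include <cub/cub.cuh>

void sum_on_stream(int *d_in, int *d_out, int n, cudaStream_t stream)
{
    void   *d_temp_storage = NULL;
    size_t  temp_storage_bytes = 0;
    // First call only queries the required temporary storage size
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, n, stream);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    // Second call runs the reduction on the chosen stream
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, n, stream);
    cudaFree(d_temp_storage);
}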
In versions since the CUDA 7.0 snapshot, it has become possible to set a stream of choice for Thrust operations via the execution policy and dispatch feature.
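A minimal, self-contained sketch of the execution-policy mechanism (Thrust 1.8 or later):

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>

int main()
{
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    thrust::device_vector<int> d_in(1 << 20, 2), d_out(1 << 20);
    // thrust::cuda::par.on(stream) dispatches the algorithm on the chosen stream
    thrust::transform(thrust::cuda::par.on(stream),
                      d_in.begin(), d_in.end(), d_out.begin(),
                      thrust::negate<int>());
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    return 0;
}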
This is the standard way timing in CUDA is performed:
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// Something to be timed
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf ("Time: %f ms\n", time);
In the CUDA simpleP2P (peer-to-peer) example, timing is performed in this way:
cudaEvent_t start, stop;
float time;
int eventflags = cudaEventBlockingSync;
cudaEventCreateWithFlags(&start,eventflags);
cudaEventCreateWithFlags(&stop,eventflags);
cudaEventRecord(start,0);
// Something to be timed
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time,start,stop);
My questions are:
Why, in the P2P example, has timing been performed with cudaEventCreateWithFlags and the cudaEventBlockingSync flag?
Is this something needed, generally speaking, in all multi-GPU applications (including peer-to-peer memcopy timings)?
Thanks.
After almost three years, I'm answering my own question.
To this end, I'll consider my examples in Concurrency in CUDA multi-GPU executions, where it was underlined how using asynchronous copies enables achieving true multi-GPU concurrency. In particular, I will consider Test case #8 of that post.
The full code as well as the profiler timeline for Test case #8 are reported here for the sake of clarity.
#include "Utilities.cuh"
#include "InputOutput.cuh"
#define BLOCKSIZE 128
/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];
}
/******************/
/* PLAN STRUCTURE */
/******************/
// --- Async
template<class T>
struct plan {
T *d_data;
};
/*****************/
/* PLAN CREATION */
/*****************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {
// --- Device allocation
gpuErrchk(cudaSetDevice(gpuID));
gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
}
/********/
/* MAIN */
/********/
int main() {
const int numGPUs = 4;
const int NperGPU = 500000;
const int N = NperGPU * numGPUs;
plan<double> plan[numGPUs];
for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);
// --- "Breadth-first" approach - async
double *inputMatrices; gpuErrchk(cudaMallocHost(&inputMatrices, N * sizeof(double)));
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
}
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
}
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpyAsync(inputMatrices + k * NperGPU, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
}
gpuErrchk(cudaDeviceReset());
}
Timing the asynchronous copies - concurrency is destroyed
Now, let us begin by timing the asynchronous copies. A possible way to do so is to use the following snippet:
float time[numGPUs];
cudaEvent_t start[numGPUs], stop[numGPUs];
// --- "Breadth-first" approach - async
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
cudaEventRecord(start[k], 0);
gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
cudaEventRecord(stop[k], 0);
cudaEventSynchronize(stop[k]);
cudaEventElapsedTime(&time[k], start[k], stop[k]);
}
for (int k = 0; k < numGPUs; k++) printf("Elapsed time: %3.1f ms \n", time[k]);
Unfortunately, this way of timing destroys concurrency, as can be appreciated from the profiler timeline below:
Timing the asynchronous copies - concurrency is preserved
To avoid this problem, a possibility is to launch the GPU tasks from different OpenMP threads as follows:
int maxNumProcessors = omp_get_max_threads();
std::cout << "Maximum number of CPU threads = " << maxNumProcessors << std::endl;
// --- "Breadth-first" approach - async
omp_set_num_threads(numGPUs);
#pragma omp parallel
{
unsigned int k = omp_get_thread_num();
gpuErrchk(cudaSetDevice(k));
cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
cudaEventRecord(start[k], 0);
gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
cudaEventRecord(stop[k], 0);
cudaEventSynchronize(stop[k]);
cudaEventElapsedTime(&time[k], start[k], stop[k]);
printf("Thread nr. %i; Elapsed time: %3.1f ms \n", k, time[k]);
}
As can be seen from the profiler timeline, concurrency is preserved.
Timing the kernel launches - concurrency is destroyed
The same happens when timing the kernel launches. Using the following snippet, concurrency is destroyed.
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
cudaEventRecord(start[k], 0);
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
cudaEventRecord(stop[k], 0);
cudaEventSynchronize(stop[k]);
cudaEventElapsedTime(&time[k], start[k], stop[k]);
}
for (int k = 0; k < numGPUs; k++) printf("Elapsed time: %3.1f ms \n", time[k]);
Timing the kernel launches - concurrency is preserved
In contrast to the above, using OpenMP, concurrency is preserved.
int maxNumProcessors = omp_get_max_threads();
std::cout << "Maximum number of CPU threads = " << maxNumProcessors << std::endl;
omp_set_num_threads(numGPUs);
#pragma omp parallel
{
unsigned int k = omp_get_thread_num();
gpuErrchk(cudaSetDevice(k));
cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
cudaEventRecord(start[k], 0);
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
cudaEventRecord(stop[k], 0);
cudaEventSynchronize(stop[k]);
cudaEventElapsedTime(&time[k], start[k], stop[k]);
printf("Thread nr. %i; Elapsed time: %3.1f ms \n", k, time[k]);
}
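For reference, the OpenMP snippets above additionally need #include <omp.h> and OpenMP enabled in the host compiler; a sketch of the compile line (file name illustrative):

nvcc -Xcompiler -fopenmp timing.cu -o timing        (Linux/GCC)
nvcc -Xcompiler /openmp timing.cu -o timing.exe     (Windows/MSVC)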
I created streams in this way:
cudaStream_t stream0;
cudaStream_t stream1;
cudaStreamCreate( &stream0);
cudaStreamCreate( &stream1);
I run the kernel functions like this:
singlecore<<<1,1>>>(devL2,1000);
singlecore<<<1,1,0,stream0>>>(devL2,1000);
The two kernels are not executed concurrently. But if I execute the first kernel in stream1 as:
singlecore<<<1,1,0,stream1>>>(devL2,1000);
singlecore<<<1,1,0,stream0>>>(devL2,1000);
they will execute concurrently.
I wonder whether a kernel function in the default stream cannot be executed concurrently with kernels in other streams.
Yes, there is a limitation on CUDA commands issued to the default stream. Referring to the C programming guide section on implicit synchronization:
"Two commands from different streams cannot run concurrently if any one of the following operations is issued in-between them by the host thread:
...
•any CUDA command to the default stream,
"
So as a general rule of thumb, for overlapped copy and compute operations, it's easiest to program all such operations in a set of non-default streams. There's a bit of a loophole (which you've discovered) where it's possible to get overlap with commands issued in the default stream (and other streams), but it requires careful understanding of the restrictions between the default stream and other streams, as well as careful attention to the order in which you issue commands. A good example is explained in the C programming guide. Read all the way through the section on "overlapping behavior".
In your first example, the kernel issued to the default stream blocks execution of the kernel issued to the other stream. In your second example, you can have concurrency because the kernel issued to the non-default stream does not block the execution of the kernel issued to the default stream.
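Following that rule of thumb, a minimal sketch of the safest pattern for the code in the question, using only the two non-default streams it already creates:

// Neither launch is issued to the default stream, so neither blocks
// the other and they can run concurrently (hardware permitting)
singlecore<<<1,1,0,stream0>>>(devL2, 1000);
singlecore<<<1,1,0,stream1>>>(devL2, 1000);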
I want to update Robert Crovella's answer in the light of the newly issued CUDA 7.0 which, as of March 2015, is in the Release Candidate version.
With CUDA 7.0, default streams are regular streams in the sense that commands in the default stream may run concurrently with commands in non-default streams. A more detailed explanation of this new feature can be found at
CUDA 7 Streams Simplify Concurrency
This feature can be enabled simply through the additional --default-stream per-thread compilation option.
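For example (file name illustrative):

nvcc --default-stream per-thread stream_test.cu -o stream_test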
At the page linked above, an example worked out by Mark Harris can be found. Here, I want to revisit the example I posted at False dependency issue for the Fermi architecture. In particular, in the new example below, although I'm creating 3 streams, I no longer use the first one, adopting the default stream in its place.
This is the timeline produced without the --default-stream per-thread compilation option:
As you can see, the execution in the default stream does not exploit concurrency.
On the other side, this is the timeline produced with the --default-stream per-thread compilation option:
As you can now see, the default stream execution overlaps with the execution of the other two streams.
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "Utilities.cuh"
using namespace std;
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
__global__ void kernel(const int *in, int *out, int N)
{
int start = blockIdx.x * blockDim.x + threadIdx.x;
int end = N;
for (int i = start; i < end; i += blockDim.x * gridDim.x)
{
out[i] = in[i] * in[i];
}
}
int main()
{
const int N = 6000000;
// --- Host side input data allocation and initialization. Registering host memory as page-locked (required for async cudaMemcpyAsync).
int *h_in = new int[N]; for(int i = 0; i < N; i++) h_in[i] = 5;
gpuErrchk(cudaHostRegister(h_in, N * sizeof(int), cudaHostRegisterPortable));
// --- Host side output data allocation and initialization. Registering host memory as page-locked (required for async cudaMemcpyAsync).
int *h_out = new int[N]; for(int i = 0; i < N; i++) h_out[i] = 0;
gpuErrchk(cudaHostRegister(h_out, N * sizeof(int), cudaHostRegisterPortable));
// --- Host side check results vector allocation and initialization
int *h_checkResults = new int[N]; for(int i = 0; i < N; i++) h_checkResults[i] = h_in[i] * h_in[i];
// --- Device side input data allocation.
int *d_in = 0; gpuErrchk(cudaMalloc((void **)&d_in, N * sizeof(int)));
// --- Device side output data allocation.
int *d_out = 0; gpuErrchk( cudaMalloc((void **)&d_out, N * sizeof(int)));
int streamSize = N / NUM_STREAMS;
size_t streamMemSize = N * sizeof(int) / NUM_STREAMS;
// --- Set kernel launch configuration
dim3 nThreads = dim3(NUM_THREADS,1,1);
dim3 nBlocks = dim3(NUM_BLOCKS, 1,1);
dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));
// --- Create CUDA streams
cudaStream_t streams[NUM_STREAMS];
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamCreate(&streams[i]));
/**************************/
/* BREADTH-FIRST APPROACH */
/**************************/
int offset = 0;
cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, 0);
for(int i = 1; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
}
kernel<<<subKernelBlock, nThreads>>>(&d_in[offset], &d_out[offset], streamSize/2);
kernel<<<subKernelBlock, nThreads>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
for(int i = 1; i < NUM_STREAMS; i++)
{
int offset = i * streamSize;
kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
}
for(int i = 1; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
}
cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, 0);
for(int i = 1; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, 0);
}
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamSynchronize(streams[i]));
gpuErrchk(cudaDeviceSynchronize());
// --- Release resources
gpuErrchk(cudaHostUnregister(h_in));
gpuErrchk(cudaHostUnregister(h_out));
gpuErrchk(cudaFree(d_in));
gpuErrchk(cudaFree(d_out));
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamDestroy(streams[i]));
cudaDeviceReset();
// --- GPU output check
int sum = 0;
for(int i = 0; i < N; i++)
sum += h_checkResults[i] - h_out[i];
cout << "Error between CPU and GPU: " << sum << endl;
delete[] h_in;
delete[] h_out;
delete[] h_checkResults;
return 0;
}