Getting CUDA Thrust to use a CUDA stream of your choice

Looking at kernel launches within the code of CUDA Thrust, it seems they always use the default stream. Can I make Thrust use a stream of my choice? Am I missing something in the API?

I want to update the answer provided by talonmies following the release of Thrust 1.8, which introduces the possibility of indicating the CUDA execution stream as
thrust::cuda::par.on(stream)
see also
Thrust Release 1.8.0.
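As a minimal sketch of the new execution policy (illustrative only; it assumes <thrust/device_vector.h>, <thrust/transform.h>, <thrust/functional.h> and <thrust/execution_policy.h> are included):
cudaStream_t stream;
cudaStreamCreate(&stream);
thrust::device_vector<int> d_vec(1000, 2);
// The transform below is enqueued on stream rather than on the default stream.
thrust::transform(thrust::cuda::par.on(stream), d_vec.begin(), d_vec.end(), d_vec.begin(), thrust::negate<int>());
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);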
In the following, I'm recasting the example in
False dependency issue for the Fermi architecture
in terms of CUDA Thrust APIs.
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include "Utilities.cuh"
using namespace std;
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
struct BinaryOp{ __host__ __device__ int operator()(const int& o1,const int& o2) { return o1 * o2; } };
int main()
{
const int N = 6000000;
// --- Host side input data allocation and initialization. Registering host memory as page-locked (required for asynch cudaMemcpyAsync).
int *h_in = new int[N]; for(int i = 0; i < N; i++) h_in[i] = 5;
gpuErrchk(cudaHostRegister(h_in, N * sizeof(int), cudaHostRegisterPortable));
// --- Host side output data allocation and initialization. Registering host memory as page-locked (required for asynch cudaMemcpyAsync).
int *h_out = new int[N]; for(int i = 0; i < N; i++) h_out[i] = 0;
gpuErrchk(cudaHostRegister(h_out, N * sizeof(int), cudaHostRegisterPortable));
// --- Host side check results vector allocation and initialization
int *h_checkResults = new int[N]; for(int i = 0; i < N; i++) h_checkResults[i] = h_in[i] * h_in[i];
// --- Device side input data allocation.
int *d_in = 0; gpuErrchk(cudaMalloc((void **)&d_in, N * sizeof(int)));
// --- Device side output data allocation.
int *d_out = 0; gpuErrchk( cudaMalloc((void **)&d_out, N * sizeof(int)));
int streamSize = N / NUM_STREAMS;
size_t streamMemSize = N * sizeof(int) / NUM_STREAMS;
// --- Set kernel launch configuration
dim3 nThreads = dim3(NUM_THREADS,1,1);
dim3 nBlocks = dim3(NUM_BLOCKS, 1,1);
dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));
// --- Create CUDA streams
cudaStream_t streams[NUM_STREAMS];
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamCreate(&streams[i]));
/**************************/
/* BREADTH-FIRST APPROACH */
/**************************/
for(int i = 0; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
}
for(int i = 0; i < NUM_STREAMS; i++)
{
int offset = i * streamSize;
thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_in[offset]) + streamSize/2,
thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_out[offset]), BinaryOp());
thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset + streamSize/2]), thrust::device_pointer_cast(&d_in[offset + streamSize/2]) + streamSize/2,
thrust::device_pointer_cast(&d_in[offset + streamSize/2]), thrust::device_pointer_cast(&d_out[offset + streamSize/2]), BinaryOp());
}
for(int i = 0; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
}
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamSynchronize(streams[i]));
gpuErrchk(cudaDeviceSynchronize());
// --- Release resources
gpuErrchk(cudaHostUnregister(h_in));
gpuErrchk(cudaHostUnregister(h_out));
gpuErrchk(cudaFree(d_in));
gpuErrchk(cudaFree(d_out));
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamDestroy(streams[i]));
cudaDeviceReset();
// --- GPU output check
int sum = 0;
for(int i = 0; i < N; i++) {
//printf("%i %i\n", h_out[i], h_checkResults[i]);
sum += h_checkResults[i] - h_out[i];
}
cout << "Error between CPU and GPU: " << sum << endl;
delete[] h_in;
delete[] h_out;
delete[] h_checkResults;
return 0;
}
The Utilities.cu and Utilities.cuh files needed to run such an example are maintained at this github page.
The Visual Profiler timeline shows the concurrency of CUDA Thrust operations and memory transfers.

No, you are not missing anything (at least up to the release snapshot which ships with CUDA 6.0).
The original Thrust tag-based dispatch system deliberately abstracts all of the underlying CUDA API calls away, sacrificing some performance for ease of use and consistency (keep in mind that Thrust has backends other than CUDA). If you want that level of flexibility, you will need to try another library (CUB, for example).
In versions since the CUDA 7.0 snapshot, it has become possible to set a stream of choice for Thrust operations via the execution policy and dispatch feature.

Related

cudaMallocManaged and cudaDeviceSynchronize()

I have the following two mostly identical example codes. code1.cu uses cudaMalloc and cudaMemcpy to handle device/host value exchange.
code2.cu uses cudaMallocManaged instead, so cudaMemcpy is not needed. When cudaMallocManaged is used, I have to include cudaDeviceSynchronize() to get correct results, while for the version with cudaMalloc this is not needed. I would appreciate some hint on why this is happening.
code2.cu
#include <iostream>
#include <math.h>
#include <vector>
//
using namespace std;
// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if(i < max_x && j<max_y) {
atomicAdd(tot, x[i] + y[j]);
}
}
int main(void)
{
int Nx = 1<<15;
int Ny = 1<<15;
float *d_x = NULL, *d_y = NULL;
float *d_tot = NULL;
cudaMalloc((void **)&d_x, sizeof(float)*Nx);
cudaMalloc((void **)&d_y, sizeof(float)*Ny);
cudaMallocManaged((void **)&d_tot, sizeof(float));
// Allocate Unified Memory – accessible from CPU or GPU
vector<float> vx;
vector<float> vy;
// initialize x and y arrays on the host
for (int i = 0; i < Nx; i++)
vx.push_back(i);
for (int i = 0; i < Ny; i++)
vy.push_back(i*10);
//
float tot = 0;
for(int i = 0; i<vx.size(); i++)
for(int j = 0; j<vy.size(); j++)
tot += vx[i] + vy[j];
cout<<"CPU: tot: "<<tot<<endl;
//
cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);
//
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve maximum occupancy for a full device launch
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);
//.. bx*by cannot exceed blockSize or the hardware limit, which is 1024;
//.. bx*by = blockSize && bx/by = Nx/Ny, solve the system
int bx = sqrt(blockSize*Nx/(float)Ny);
int by = bx*Ny/(float)Nx;
dim3 blockSize_3D(bx, by);
dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by); // note: (Ny+by-1)/by is the usual rounding; the +1 launches one extra (harmless) block row
cout<<"blockSize: "<<blockSize<<endl;
cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);
int device;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor / props.warpSize);
printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
blockSize, occupancy);
// Run the kernel over the Nx x Ny grid on the GPU
tot = 0;
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
// Wait for GPU to finish before accessing on host
//cudaDeviceSynchronize();
tot = *d_tot;
//
//
cout<<" GPU: tot: "<<tot<<endl;
// Free memory
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_tot);
return 0;
}
code1.cu
#include <iostream>
#include <math.h>
#include <vector>
//
using namespace std;
// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if(i < max_x && j<max_y) {
atomicAdd(tot, x[i] + y[j]);
}
}
int main(void)
{
int Nx = 1<<15;
int Ny = 1<<15;
float *d_x = NULL, *d_y = NULL;
float *d_tot = NULL;
cudaMalloc((void **)&d_x, sizeof(float)*Nx);
cudaMalloc((void **)&d_y, sizeof(float)*Ny);
cudaMalloc((void **)&d_tot, sizeof(float));
// Allocate Unified Memory – accessible from CPU or GPU
vector<float> vx;
vector<float> vy;
// initialize x and y arrays on the host
for (int i = 0; i < Nx; i++)
vx.push_back(i);
for (int i = 0; i < Ny; i++)
vy.push_back(i*10);
//
float tot = 0;
for(int i = 0; i<vx.size(); i++)
for(int j = 0; j<vy.size(); j++)
tot += vx[i] + vy[j];
cout<<"CPU: tot: "<<tot<<endl;
//
cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);
//
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve maximum occupancy for a full device launch
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);
//.. bx*by cannot exceed blockSize or the hardware limit, which is 1024;
//.. bx*by = blockSize && bx/by = Nx/Ny, solve the system
int bx = sqrt(blockSize*Nx/(float)Ny);
int by = bx*Ny/(float)Nx;
dim3 blockSize_3D(bx, by);
dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by); // note: (Ny+by-1)/by is the usual rounding; the +1 launches one extra (harmless) block row
cout<<"blockSize: "<<blockSize<<endl;
cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);
int device;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor / props.warpSize);
printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
blockSize, occupancy);
// Run the kernel over the Nx x Ny grid on the GPU
tot = 0;
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
// Wait for GPU to finish before accessing on host
//cudaDeviceSynchronize();
//
cudaMemcpy(&tot, d_tot, sizeof(float), cudaMemcpyDeviceToHost);
//
cout<<" GPU: tot: "<<tot<<endl;
// Free memory
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_tot);
return 0;
}
//Code2.cu has the following output:
//
//CPU: tot: 8.79609e+12
//blockSize: 1024
//bx: 32 by: 32 gx: 1024 gy: 1025
//Launched blocks of size 1024. Theoretical occupancy: 1.000000
//GPU: tot: 0
After removing the comment on cudaDeviceSynchronize(), the output becomes:
GPU: tot: 8.79609e+12
CUDA kernel launches are asynchronous. That means that they execute independently of the CPU thread that launched them.
Because of this asynchronous launch, the CUDA kernel is not guaranteed to be finished (or even started) by the time your CPU thread code begins testing the result.
Therefore it is necessary to wait until the GPU kernel is complete, and cudaDeviceSynchronize() does exactly that. cudaMemcpy also has a synchronizing effect, so when you remove the cudaMemcpy operations, you lose that synchronization, but cudaDeviceSynchronize() restores it.
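In terms of the posted code2.cu, the fix is therefore simply to restore the synchronization before the managed value is read on the host:
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
cudaDeviceSynchronize(); // block the host until the kernel has finished writing *d_tot
tot = *d_tot;            // the managed value is now safe to read on the host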

Does accessing mapped pinned host (or a peer device) memory require GPU copy engine?

Assume the GPU has one execution engine and one copy engine.
When inside a CUDA kernel the threads access the host memory, does it make the copy engine busy? Does it consequently block all asynchronous memory copy operations to/from the device in other streams?
If inside the CUDA kernel threads access the peer device memory, does it make copy engines in both devices busy?
I'm trying to provide an answer to the first question only
When inside a CUDA kernel the threads access the host memory, does it make the copy engine busy? Does it consequently block all asynchronous memory copy operations to/from the device in other streams?
I have written the simple code below. It contains two kernels, one explicitly using mapped pinned host memory, namely kernel2, and one not explicitly using mapped pinned host memory, namely kernel1. The code uses three streams to check whether the use of mapped pinned host memory disrupts concurrency.
Here is the code:
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
using namespace std;
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*******************************/
/* KERNEL FUNCTION - VERSION 1 */
/*******************************/
__global__ void kernel1(const int *in, int *out, int dataSize)
{
int start = blockIdx.x * blockDim.x + threadIdx.x;
int end = dataSize;
for (int i = start; i < end; i += blockDim.x * gridDim.x)
{
out[i] = in[i] * in[i];
}
}
/*******************************/
/* KERNEL FUNCTION - VERSION 2 */
/*******************************/
__global__ void kernel2(const int *in, int *out, int* cnt, int dataSize)
{
int start = blockIdx.x * blockDim.x + threadIdx.x;
int end = dataSize;
for (int i = start; i < end; i += blockDim.x * gridDim.x)
{
out[i] = cnt[i] * in[i] * in[i];
}
}
/********/
/* MAIN */
/********/
int main()
{
const int dataSize = 6000000;
// --- Host side memory allocations
int *h_in = new int[dataSize];
int *h_out = new int[dataSize];
// --- Host side memory initialization
for(int i = 0; i < dataSize; i++) h_in[i] = 5;
for(int i = 0; i < dataSize; i++) h_out[i] = 0;
// --- Register host memory as page-locked (required for asynch cudaMemcpyAsync)
gpuErrchk(cudaHostRegister(h_in, dataSize * sizeof(int), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_out, dataSize * sizeof(int), cudaHostRegisterPortable));
// --- Device side memory allocations
int *d_in = 0; gpuErrchk(cudaMalloc((void**)&d_in, dataSize * sizeof(int)));
int *d_out = 0; gpuErrchk(cudaMalloc((void**)&d_out, dataSize * sizeof(int)));
// --- Testing mapped pinned memory
int *cnt; gpuErrchk(cudaMallocHost((void**)&cnt, dataSize * sizeof(int)));
for(int i = 0; i < dataSize; i++) cnt[i] = 2;
int streamSize = dataSize / NUM_STREAMS;
size_t streamMemSize = dataSize * sizeof(int) / NUM_STREAMS;
// --- Setting kernel launch config
dim3 nThreads = dim3(NUM_THREADS,1,1);
dim3 nBlocks = dim3(NUM_BLOCKS,1,1);
// --- Create CUDA streams
cudaStream_t streams[NUM_STREAMS];
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i]));
/**********/
/* CASE 1 */
/**********/
for(int i = 0; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]); }
for(int i = 0; i < NUM_STREAMS; i++)
{
int offset = i * streamSize;
dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));
kernel1<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
kernel1<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
}
for(int i = 0; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]); }
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamSynchronize(streams[i]));
/**********/
/* CASE 2 */
/**********/
for(int i = 0; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]); }
for(int i = 0; i < NUM_STREAMS; i++)
{
int offset = i * streamSize;
dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));
kernel2<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], cnt, streamSize/2);
kernel2<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], cnt, streamSize/2);
}
for(int i = 0; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]); }
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamSynchronize(streams[i]));
// --- Release resources
gpuErrchk(cudaHostUnregister(h_in));
gpuErrchk(cudaHostUnregister(h_out));
gpuErrchk(cudaFree(d_in));
gpuErrchk(cudaFree(d_out));
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i]));
delete[] h_in;
delete[] h_out;
gpuErrchk(cudaDeviceReset());
return 0;
}
From the resulting timeline, it seems that the usage of mapped pinned host memory in kernel2 does not disrupt concurrency. The algorithm has been tested on a GT540M card, which has a single copy engine.
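As a side note, the canonical way to obtain an explicitly mapped pinned allocation is cudaHostAlloc with the cudaHostAllocMapped flag followed by cudaHostGetDevicePointer (a sketch, assuming the device reports canMapHostMemory; h_cnt and d_cnt are illustrative names):
gpuErrchk(cudaSetDeviceFlags(cudaDeviceMapHost)); // must be called before any CUDA context is created
int *h_cnt = NULL, *d_cnt = NULL;
gpuErrchk(cudaHostAlloc((void**)&h_cnt, dataSize * sizeof(int), cudaHostAllocMapped));
gpuErrchk(cudaHostGetDevicePointer((void**)&d_cnt, h_cnt, 0));
// d_cnt can now be dereferenced from within a kernel in place of cnt.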

CUDA timing for multi-gpu applications

This is the standard way timing in CUDA is performed:
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// Something to be timed
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf ("Time: %f ms\n", time);
In the CUDA simpleP2P (peer-to-peer) example, timing is performed in this way:
cudaEvent_t start, stop;
float time;
int eventflags = cudaEventBlockingSync;
cudaEventCreateWithFlags(&start,eventflags);
cudaEventCreateWithFlags(&stop,eventflags);
cudaEventRecord(start,0);
// Something to be timed
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time,start,stop);
My questions are:
Why, in the P2P example, has timing been performed using cudaEventCreateWithFlags with the cudaEventBlockingSync flag?
Is this, generally speaking, needed in all multi-gpu applications (including peer-to-peer memcopy timings)?
Thanks.
After almost three years, I'm answering my own question.
To this end, I'll consider my examples in Concurrency in CUDA multi-GPU executions, where it was underlined how using asynchronous copies enables achieving true multi-GPU concurrency. In particular, I will consider Test case #8 of that post.
The full code as well as the profiler timeline for Test case #8 are reported here for the sake of clarity.
#include "Utilities.cuh"
#include "InputOutput.cuh"
#define BLOCKSIZE 128
/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];
}
/******************/
/* PLAN STRUCTURE */
/******************/
// --- Async
template<class T>
struct plan {
T *d_data;
};
/*********************/
/* PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {
// --- Device allocation
gpuErrchk(cudaSetDevice(gpuID));
gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
}
/********/
/* MAIN */
/********/
int main() {
const int numGPUs = 4;
const int NperGPU = 500000;
const int N = NperGPU * numGPUs;
plan<double> plan[numGPUs];
for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);
// --- "Breadth-first" approach - async
double *inputMatrices; gpuErrchk(cudaMallocHost(&inputMatrices, N * sizeof(double)));
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
}
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
}
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpyAsync(inputMatrices + k * NperGPU, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
}
gpuErrchk(cudaDeviceReset());
}
Timing the asynchronous copies - concurrency is destroyed
Now, let us begin by timing the asynchronous copies. A possible way to do so is to use the following snippet:
float time[numGPUs];
cudaEvent_t start[numGPUs], stop[numGPUs];
// --- "Breadth-first" approach - async
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
cudaEventRecord(start[k], 0);
gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
cudaEventRecord(stop[k], 0);
cudaEventSynchronize(stop[k]);
cudaEventElapsedTime(&time[k], start[k], stop[k]);
}
for (int k = 0; k < numGPUs; k++) printf("Elapsed time: %3.1f ms \n", time[k]);
Unfortunately, this way of timing destroys concurrency: cudaEventSynchronize(stop[k]) blocks the host thread inside the loop, so the work for GPU k+1 is not issued until GPU k has finished, as it is possible to appreciate from the profiler timeline below:
Timing the asynchronous copies - concurrency is preserved
To avoid this problem, a possibility is to issue the GPU tasks from concurrent OpenMP threads, so that each thread blocks independently, as follows:
int maxNumProcessors = omp_get_max_threads();
std::cout << "Maximum number of CPU threads = " << maxNumProcessors << std::endl;
// --- "Breadth-first" approach - async
omp_set_num_threads(numGPUs);
#pragma omp parallel
{
unsigned int k = omp_get_thread_num();
gpuErrchk(cudaSetDevice(k));
cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
cudaEventRecord(start[k], 0);
gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
cudaEventRecord(stop[k], 0);
cudaEventSynchronize(stop[k]);
cudaEventElapsedTime(&time[k], start[k], stop[k]);
printf("Thread nr. %i; Elapsed time: %3.1f ms \n", k, time[k]);
}
As can be seen from the profiler timeline, concurrency is preserved.
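For completeness: a source using OpenMP this way must be compiled with OpenMP enabled in the host compiler. With g++ as the host compiler, a plausible command line (file name hypothetical) is:
nvcc -Xcompiler -fopenmp multiGPUTiming.cu -o multiGPUTiming -lgomp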
Timing the kernel launches - concurrency is destroyed
The same happens when timing the kernel launches: again, the host blocks at cudaEventSynchronize before issuing work to the next GPU. Using the following snippet, concurrency is destroyed.
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
cudaEventRecord(start[k], 0);
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
cudaEventRecord(stop[k], 0);
cudaEventSynchronize(stop[k]);
cudaEventElapsedTime(&time[k], start[k], stop[k]);
}
for (int k = 0; k < numGPUs; k++) printf("Elapsed time: %3.1f ms \n", time[k]);
Timing the kernel launches - concurrency is preserved
In contrast, using OpenMP, concurrency is preserved.
int maxNumProcessors = omp_get_max_threads();
std::cout << "Maximum number of CPU threads = " << maxNumProcessors << std::endl;
omp_set_num_threads(numGPUs);
#pragma omp parallel
{
unsigned int k = omp_get_thread_num();
gpuErrchk(cudaSetDevice(k));
cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
cudaEventRecord(start[k], 0);
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
cudaEventRecord(stop[k], 0);
cudaEventSynchronize(stop[k]);
cudaEventElapsedTime(&time[k], start[k], stop[k]);
printf("Thread nr. %i; Elapsed time: %3.1f ms \n", k, time[k]);
}

cuFFT and streams

I'm trying to launch multiple CUDA FFT kernels asynchronously using streams.
For that, I'm creating my streams, cuFFT forward and inverse plans as follows:
streams = (cudaStream_t*) malloc(sizeof(cudaStream_t)*streamNum);
plansF = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
plansI = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
for(int i=0; i<streamNum; i++)
{
cudaStreamCreate(&streams[i]);
CHECK_ERROR(5)
cufftPlan1d(&plansF[i], ticks, CUFFT_R2C,1);
CHECK_ERROR(5)
cufftPlan1d(&plansI[i], ticks, CUFFT_C2R,1);
CHECK_ERROR(5)
cufftSetStream(plansF[i],streams[i]);
CHECK_ERROR(5)
cufftSetStream(plansI[i],streams[i]);
CHECK_ERROR(5)
}
In the main function, I'm launching forward FFTs as follows:
for(w=1;w<q;w++)
{
cufftExecR2C(plansF[w], gpuMem1+k,gpuMem2+j);
CHECK_ERROR(8)
k += rect_small_real;
j += rect_small_complex;
}
I also have other kernels that I launch asynchronously with the same streams.
When I profile my application using Visual Profiler 5.0, I see that all kernels except the CUDA FFT (both forward and inverse) run in parallel and overlap. FFT kernels do run in different streams, but they do not overlap, as they actually run sequentially.
Can anyone tell me what my problem is?
My environment is VS 2008, 64 bit, Windows 7.
Thanks.
This is a worked example of cuFFT execution and memcopies using streams in CUDA on the Kepler architecture.
Here is the code:
#include <stdio.h>
#include <cufft.h>
#define NUM_STREAMS 3
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/********/
/* MAIN */
/********/
int main()
{
const int N = 5000;
// --- Host input data initialization
float2 *h_in1 = new float2[N];
float2 *h_in2 = new float2[N];
float2 *h_in3 = new float2[N];
for (int i = 0; i < N; i++) {
h_in1[i].x = 1.f;
h_in1[i].y = 0.f;
h_in2[i].x = 1.f;
h_in2[i].y = 0.f;
h_in3[i].x = 1.f;
h_in3[i].y = 0.f;
}
// --- Host output data initialization
float2 *h_out1 = new float2[N];
float2 *h_out2 = new float2[N];
float2 *h_out3 = new float2[N];
for (int i = 0; i < N; i++) {
h_out1[i].x = 0.f;
h_out1[i].y = 0.f;
h_out2[i].x = 0.f;
h_out2[i].y = 0.f;
h_out3[i].x = 0.f;
h_out3[i].y = 0.f;
}
// --- Registers host memory as page-locked (required for asynch cudaMemcpyAsync)
gpuErrchk(cudaHostRegister(h_in1, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in2, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in3, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_out1, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_out2, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_out3, N*sizeof(float2), cudaHostRegisterPortable));
// --- Device input data allocation
float2 *d_in1; gpuErrchk(cudaMalloc((void**)&d_in1, N*sizeof(float2)));
float2 *d_in2; gpuErrchk(cudaMalloc((void**)&d_in2, N*sizeof(float2)));
float2 *d_in3; gpuErrchk(cudaMalloc((void**)&d_in3, N*sizeof(float2)));
float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
// --- Creates CUDA streams
cudaStream_t streams[NUM_STREAMS];
for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i]));
// --- Creates cuFFT plans and sets them in streams
cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
for (int i = 0; i < NUM_STREAMS; i++) {
cufftPlan1d(&plans[i], N, CUFFT_C2C, 1);
cufftSetStream(plans[i], streams[i]);
}
// --- Async memcopies and computations
gpuErrchk(cudaMemcpyAsync(d_in1, h_in1, N*sizeof(float2), cudaMemcpyHostToDevice, streams[0]));
gpuErrchk(cudaMemcpyAsync(d_in2, h_in2, N*sizeof(float2), cudaMemcpyHostToDevice, streams[1]));
gpuErrchk(cudaMemcpyAsync(d_in3, h_in3, N*sizeof(float2), cudaMemcpyHostToDevice, streams[2]));
cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD);
cufftExecC2C(plans[1], (cufftComplex*)d_in2, (cufftComplex*)d_out2, CUFFT_FORWARD);
cufftExecC2C(plans[2], (cufftComplex*)d_in3, (cufftComplex*)d_out3, CUFFT_FORWARD);
gpuErrchk(cudaMemcpyAsync(h_out1, d_out1, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[0]));
gpuErrchk(cudaMemcpyAsync(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[1]));
gpuErrchk(cudaMemcpyAsync(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[2]));
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamSynchronize(streams[i]));
// --- Releases resources
gpuErrchk(cudaHostUnregister(h_in1));
gpuErrchk(cudaHostUnregister(h_in2));
gpuErrchk(cudaHostUnregister(h_in3));
gpuErrchk(cudaHostUnregister(h_out1));
gpuErrchk(cudaHostUnregister(h_out2));
gpuErrchk(cudaHostUnregister(h_out3));
gpuErrchk(cudaFree(d_in1));
gpuErrchk(cudaFree(d_in2));
gpuErrchk(cudaFree(d_in3));
gpuErrchk(cudaFree(d_out1));
gpuErrchk(cudaFree(d_out2));
gpuErrchk(cudaFree(d_out3));
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i]));
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out1;
delete[] h_out2;
delete[] h_out3;
cudaDeviceReset();
return 0;
}
Please add cuFFT error checking according to CUFFT error handling.
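A possible sketch of such a check, mirroring the gpuErrchk pattern above (cufftResult and CUFFT_SUCCESS are the actual cuFFT types; the message formatting is merely illustrative):
#define cufftErrchk(ans) { cufftAssert((ans), __FILE__, __LINE__); }
inline void cufftAssert(cufftResult code, const char *file, int line, bool abort=true)
{
    if (code != CUFFT_SUCCESS)
    {
        fprintf(stderr, "CUFFTassert: error code %d %s %d\n", (int)code, file, line);
        if (abort) exit((int)code);
    }
}
// Usage: cufftErrchk(cufftPlan1d(&plans[i], N, CUFFT_C2C, 1));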
Below, some profiling information for the above algorithm tested on a Kepler K20c card is provided. As you can see, true overlap between computation and memory transfers is achieved only when N is sufficiently large.
(Profiler timelines for N = 5000, N = 50000, and N = 500000.)
The problem is in the hardware you use.
All CUDA-capable GPUs can execute a kernel and copy data in both directions concurrently. However, only devices with Compute Capability 3.5 have the feature named Hyper-Q.
Briefly, these GPUs implement several (16, I believe) hardware kernel queues. On earlier GPUs, only one hardware queue is available.
This means that cudaStreams are only virtual, and their use on old hardware makes sense only for overlapping computation with memory copies. Of course, this holds not only for cuFFT but for your own kernels too!
Look carefully at the output of the Visual Profiler. You may unintentionally read the timeline visualization as exact data about GPU execution, but it is not that simple. Several rows display data that refers to the time point at which the kernel launch call was executed on the host (usually the orange ones), while other rows correspond to the actual execution of the specific kernel on the GPU (the blue rectangles). The same holds for memory transfers (the exact time is shown as light-brown rectangles).
I hope this helped you solve your problem.
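To verify at runtime whether a given device belongs to the Hyper-Q class described above, one can query its compute capability (a sketch for device 0):
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
// Hyper-Q (multiple hardware kernel queues) requires compute capability >= 3.5.
bool hasHyperQ = (prop.major > 3) || (prop.major == 3 && prop.minor >= 5);
printf("Compute capability %d.%d, Hyper-Q: %s\n", prop.major, prop.minor, hasHyperQ ? "yes" : "no");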
Here's a riff on @JackOLantern's code that allows easy variation of the number of FFTs, FFT length, and stream count to experiment with GPU utilization in nvvp.
// Compile with:
// nvcc --std=c++11 stream_parallel.cu -o stream_parallel -lcufft
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>
// Print file name, line number, and error code when a CUDA error occurs.
#define check_cuda_errors(val) __check_cuda_errors__ ( (val), #val, __FILE__, __LINE__ )
template <typename T>
inline void __check_cuda_errors__(T code, const char *func, const char *file, int line) {
if (code) {
std::cout << "CUDA error at "
<< file << ":" << line << std::endl
<< "error code: " << (unsigned int) code
<< " type: \"" << cudaGetErrorString(cudaGetLastError()) << "\"" << std::endl
<< "func: \"" << func << "\""
<< std::endl;
cudaDeviceReset();
exit(EXIT_FAILURE);
}
}
int main(int argc, char *argv[]) {
// Number of FFTs to compute.
const int NUM_DATA = 64;
// Length of each FFT.
const int N = 1048576;
// Number of GPU streams across which to distribute the FFTs.
const int NUM_STREAMS = 4;
// Allocate and initialize host input data.
float2 **h_in = new float2 *[NUM_STREAMS];
for (int ii = 0; ii < NUM_STREAMS; ii++) {
h_in[ii] = new float2[N];
for (int jj = 0; jj < N; ++jj) {
h_in[ii][jj].x = (float) 1.f;
h_in[ii][jj].y = (float) 0.f;
}
}
// Allocate and initialize host output data.
float2 **h_out = new float2 *[NUM_STREAMS];
for (int ii = 0; ii < NUM_STREAMS; ii++) {
h_out[ii] = new float2[N];
for (int jj = 0; jj < N; ++jj) {
h_out[ii][jj].x = 0.f;
h_out[ii][jj].y = 0.f;
}
}
// Pin host input and output memory for cudaMemcpyAsync.
for (int ii = 0; ii < NUM_STREAMS; ii++) {
check_cuda_errors(cudaHostRegister(h_in[ii], N*sizeof(float2), cudaHostRegisterPortable));
check_cuda_errors(cudaHostRegister(h_out[ii], N*sizeof(float2), cudaHostRegisterPortable));
}
// Allocate pointers to device input and output arrays.
float2 **d_in = new float2 *[NUM_STREAMS];
float2 **d_out = new float2 *[NUM_STREAMS];
// Allocate input and output arrays on device.
for (int ii = 0; ii < NUM_STREAMS; ii++) {
check_cuda_errors(cudaMalloc((void**)&d_in[ii], N*sizeof(float2)));
check_cuda_errors(cudaMalloc((void**)&d_out[ii], N*sizeof(float2)));
}
// Create CUDA streams.
cudaStream_t streams[NUM_STREAMS];
for (int ii = 0; ii < NUM_STREAMS; ii++) {
check_cuda_errors(cudaStreamCreate(&streams[ii]));
}
// Creates cuFFT plans and sets them in streams
cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
for (int ii = 0; ii < NUM_STREAMS; ii++) {
cufftPlan1d(&plans[ii], N, CUFFT_C2C, 1);
cufftSetStream(plans[ii], streams[ii]);
}
// Fill streams with async memcopies and FFTs.
for (int ii = 0; ii < NUM_DATA; ii++) {
int jj = ii % NUM_STREAMS;
check_cuda_errors(cudaMemcpyAsync(d_in[jj], h_in[jj], N*sizeof(float2), cudaMemcpyHostToDevice, streams[jj]));
cufftExecC2C(plans[jj], (cufftComplex*)d_in[jj], (cufftComplex*)d_out[jj], CUFFT_FORWARD);
check_cuda_errors(cudaMemcpyAsync(h_out[jj], d_out[jj], N*sizeof(float2), cudaMemcpyDeviceToHost, streams[jj]));
}
// Wait for calculations to complete.
for(int ii = 0; ii < NUM_STREAMS; ii++) {
check_cuda_errors(cudaStreamSynchronize(streams[ii]));
}
// Free memory and streams.
for (int ii = 0; ii < NUM_STREAMS; ii++) {
check_cuda_errors(cudaHostUnregister(h_in[ii]));
check_cuda_errors(cudaHostUnregister(h_out[ii]));
check_cuda_errors(cudaFree(d_in[ii]));
check_cuda_errors(cudaFree(d_out[ii]));
delete[] h_in[ii];
delete[] h_out[ii];
check_cuda_errors(cudaStreamDestroy(streams[ii]));
}
free(plans);
cudaDeviceReset();
return 0;
}

Concurrency of CUDA default stream with created streams

I created streams in this way:
cudaStream_t stream0;
cudaStream_t stream1;
cudaStreamCreate( &stream0);
cudaStreamCreate( &stream1);
I run the kernel functions like
singlecore<<<1,1>>>(devL2,1000);
singlecore<<<1,1,0,stream0>>>(devL2,1000);
The two kernels are not executed concurrently. But if I execute the first kernel in stream1 as:
singlecore<<<1,1,0,stream1>>>(devL2,1000);
singlecore<<<1,1,0,stream0>>>(devL2,1000);
they will execute concurrently.
I wonder if the kernel function in the default stream cannot be executed concurrently with kernels in other streams.
Yes, there is a limitation on CUDA commands issued to the default stream. Referring to the C Programming Guide section on implicit synchronization:
"Two commands from different streams cannot run concurrently if any one of the following operations is issued in-between them by the host thread:
...
•any CUDA command to the default stream,
"
So as a general rule of thumb, for overlapped copy and compute operations, it's easiest to program all such operations in a set of non-default streams. There's a bit of a loophole (which you've discovered) where it's possible to get overlap with commands issued in the default stream (and other streams), but it requires careful understanding of the restrictions between the default stream and other streams, as well as careful attention to the order in which you issue commands. A good example is explained in the C programming guide. Read all the way through the section on "overlapping behavior".
In your first example, the kernel issued to the default stream blocks execution of the kernel issued to the other stream. In your second example, you can have concurrency because both kernels are issued to non-default streams.
I want to update Robert Crovella's answer in light of the newly released CUDA 7.0 which, as of March 2015, is in the Release Candidate version.
With CUDA 7.0, default streams are regular streams in the sense that commands in the default stream may run concurrently with commands in non-default streams. A more detailed explanation of this new feature can be found at
CUDA 7 Streams Simplify Concurrency
This feature can be simply enabled by the additional --default-stream per-thread compilation option.
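Equivalently, per-thread default stream behavior can be requested per translation unit by defining the corresponding macro before including the CUDA headers (note that this route only works for files compiled by the host compiler, since nvcc includes cuda_runtime.h implicitly):
#define CUDA_API_PER_THREAD_DEFAULT_STREAM
#include <cuda_runtime.h>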
At the page linked above, an example worked out by Mark Harris can be found. Here, I want to revisit the example I posted at False dependency issue for the Fermi architecture. In particular, in the new example below, although I create 3 streams, I no longer use the first one, adopting the default stream in its place.
This is the timeline produced without the --default-stream per-thread compilation option:
As you can see, the execution in the default stream does not exploit concurrency.
On the other side, this is the timeline produced with the --default-stream per-thread compilation option:
As you can now see, the default stream execution overlaps with the execution of the other two streams.
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "Utilities.cuh"
using namespace std;
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
__global__ void kernel(const int *in, int *out, int N)
{
int start = blockIdx.x * blockDim.x + threadIdx.x;
int end = N;
for (int i = start; i < end; i += blockDim.x * gridDim.x)
{
out[i] = in[i] * in[i];
}
}
int main()
{
const int N = 6000000;
// --- Host side input data allocation and initialization. Registering host memory as page-locked (required for asynch cudaMemcpyAsync).
int *h_in = new int[N]; for(int i = 0; i < N; i++) h_in[i] = 5;
gpuErrchk(cudaHostRegister(h_in, N * sizeof(int), cudaHostRegisterPortable));
// --- Host side output data allocation and initialization. Registering host memory as page-locked (required for asynch cudaMemcpyAsync).
int *h_out = new int[N]; for(int i = 0; i < N; i++) h_out[i] = 0;
gpuErrchk(cudaHostRegister(h_out, N * sizeof(int), cudaHostRegisterPortable));
// --- Host side check results vector allocation and initialization
int *h_checkResults = new int[N]; for(int i = 0; i < N; i++) h_checkResults[i] = h_in[i] * h_in[i];
// --- Device side input data allocation.
int *d_in = 0; gpuErrchk(cudaMalloc((void **)&d_in, N * sizeof(int)));
// --- Device side output data allocation.
int *d_out = 0; gpuErrchk( cudaMalloc((void **)&d_out, N * sizeof(int)));
int streamSize = N / NUM_STREAMS;
size_t streamMemSize = N * sizeof(int) / NUM_STREAMS;
// --- Set kernel launch configuration
dim3 nThreads = dim3(NUM_THREADS,1,1);
dim3 nBlocks = dim3(NUM_BLOCKS, 1,1);
dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));
// --- Create CUDA streams
cudaStream_t streams[NUM_STREAMS];
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamCreate(&streams[i]));
/**************************/
/* BREADTH-FIRST APPROACH */
/**************************/
// --- Chunk 0 goes through the default stream
int offset = 0;
cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, 0);
for(int i = 1; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
}
// --- Kernels for chunk 0 are launched in the default stream
kernel<<<subKernelBlock, nThreads>>>(&d_in[offset], &d_out[offset], streamSize/2);
kernel<<<subKernelBlock, nThreads>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
for(int i = 1; i < NUM_STREAMS; i++)
{
int offset = i * streamSize;
kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
}
for(int i = 1; i < NUM_STREAMS; i++) {
int offset = i * streamSize;
cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
}
cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, 0);
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamSynchronize(streams[i]));
gpuErrchk(cudaDeviceSynchronize());
// --- Release resources
gpuErrchk(cudaHostUnregister(h_in));
gpuErrchk(cudaHostUnregister(h_out));
gpuErrchk(cudaFree(d_in));
gpuErrchk(cudaFree(d_out));
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamDestroy(streams[i]));
cudaDeviceReset();
// --- GPU output check
int sum = 0;
for(int i = 0; i < N; i++)
sum += h_checkResults[i] - h_out[i];
cout << "Error between CPU and GPU: " << sum << endl;
delete[] h_in;
delete[] h_out;
delete[] h_checkResults;
return 0;
}