I have a CUDA program for calculating FFTs of, let's say, size 50000. Currently, I copy the whole array to the GPU and execute the cuFFT. Now, I am trying to optimize the program, and the NVIDIA Visual Profiler tells me to hide the memcopy by overlapping it with parallel computations. My question is:
Is it possible, for example, to copy the first 5000 elements, then start calculating, and then copy the next chunk of data in parallel with the calculations, etc.?
Since a DFT is basically a sum over the time values multiplied by complex exponentials, I think it should be possible to calculate the FFT "blockwise".
Does cuFFT support this? Is it, in general, a good computational idea?
EDIT
To be more clear, I do not want to calculate different FFTs in parallel on different arrays. Let's say I have a big trace of a sinusoidal signal in the time domain and I want to know which frequencies are in the signal. My idea is to copy, for example, one third of the signal length to the GPU, then the next third, and in parallel calculate the FFT of the first third of the input values already copied. Then copy the last third and update the output values until all the time values are processed. So in the end there should be one output array with a peak at the frequency of the sine.
Please take into account the comments above and, in particular, that:
If you calculate the FFT over Npartial elements, you will have an output of Npartial elements;
(following Robert Crovella) All the data required for the cuFFT must be resident on the device, before the cuFFT call is launched, so that you will not be able to break the data into pieces for a single cuFFT operation, and begin that operation before all pieces are on the GPU; furthermore, a cuFFT call is opaque;
Taking into account the above two points, I think you can only "emulate" what you would like to achieve if you properly use zero padding in the way illustrated by the code below. As you will see, letting N be the data size, by dividing the data into NUM_STREAMS chunks, the code performs NUM_STREAMS zero-padded and streamed cuFFT calls of size N. After the cuFFT, you have to combine (sum) the partial results.
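To see why summing the partial results works, note that the DFT is linear. Sketching the argument in plain terms for three chunks: write the length-N input as

x[n] = x1[n] + x2[n] + x3[n]

where x1 vanishes outside [0, N/3), x2 vanishes outside [N/3, 2N/3), and x3 vanishes outside [2N/3, N), i.e., each chunk keeps its samples at their original positions. Then, by linearity of the sum defining the DFT,

X[k] = sum_{n=0}^{N-1} x[n] * exp(-j*2*pi*n*k/N) = X1[k] + X2[k] + X3[k]

so the length-N FFTs of the zero-padded chunks add up exactly to the FFT of the full signal. This is why, in the code, the second and third chunks are copied to offsets Npartial and 2*Npartial of zeroed device buffers.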
#include <stdio.h>
#include <cufft.h>
#define BLOCKSIZE 32
#define NUM_STREAMS 3
/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/******************/
/* SUMMING KERNEL */
/******************/
__global__ void kernel(float2 *vec1, float2 *vec2, float2 *vec3, float2 *out, int N) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
out[tid].x = vec1[tid].x + vec2[tid].x + vec3[tid].x;
out[tid].y = vec1[tid].y + vec2[tid].y + vec3[tid].y;
}
}
/********/
/* MAIN */
/********/
int main()
{
const int N = 600000;
const int Npartial = N / NUM_STREAMS;
// --- Host input data initialization
float2 *h_in1 = new float2[Npartial];
float2 *h_in2 = new float2[Npartial];
float2 *h_in3 = new float2[Npartial];
for (int i = 0; i < Npartial; i++) {
h_in1[i].x = 1.f;
h_in1[i].y = 0.f;
h_in2[i].x = 1.f;
h_in2[i].y = 0.f;
h_in3[i].x = 1.f;
h_in3[i].y = 0.f;
}
// --- Host output data initialization
float2 *h_out = new float2[N];
// --- Registers host memory as page-locked (required for async cudaMemcpyAsync)
gpuErrchk(cudaHostRegister(h_in1, Npartial*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in2, Npartial*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in3, Npartial*sizeof(float2), cudaHostRegisterPortable));
// --- Device input data allocation
float2 *d_in1; gpuErrchk(cudaMalloc((void**)&d_in1, N*sizeof(float2)));
float2 *d_in2; gpuErrchk(cudaMalloc((void**)&d_in2, N*sizeof(float2)));
float2 *d_in3; gpuErrchk(cudaMalloc((void**)&d_in3, N*sizeof(float2)));
float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
float2 *d_out; gpuErrchk(cudaMalloc((void**)&d_out, N*sizeof(float2)));
// --- Zero padding
gpuErrchk(cudaMemset(d_in1, 0, N*sizeof(float2)));
gpuErrchk(cudaMemset(d_in2, 0, N*sizeof(float2)));
gpuErrchk(cudaMemset(d_in3, 0, N*sizeof(float2)));
// --- Creates CUDA streams
cudaStream_t streams[NUM_STREAMS];
for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i]));
// --- Creates cuFFT plans and sets them in streams
cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
for (int i = 0; i < NUM_STREAMS; i++) {
cufftPlan1d(&plans[i], N, CUFFT_C2C, 1);
cufftSetStream(plans[i], streams[i]);
}
// --- Async memcopies and computations
gpuErrchk(cudaMemcpyAsync(d_in1, h_in1, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[0]));
gpuErrchk(cudaMemcpyAsync(&d_in2[Npartial], h_in2, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[1]));
gpuErrchk(cudaMemcpyAsync(&d_in3[2*Npartial], h_in3, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[2]));
cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD);
cufftExecC2C(plans[1], (cufftComplex*)d_in2, (cufftComplex*)d_out2, CUFFT_FORWARD);
cufftExecC2C(plans[2], (cufftComplex*)d_in3, (cufftComplex*)d_out3, CUFFT_FORWARD);
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamSynchronize(streams[i]));
kernel<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_out1, d_out2, d_out3, d_out, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_out, d_out, N*sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("i = %i; real(h_out) = %f; imag(h_out) = %f\n", i, h_out[i].x, h_out[i].y);
// --- Releases resources
gpuErrchk(cudaHostUnregister(h_in1));
gpuErrchk(cudaHostUnregister(h_in2));
gpuErrchk(cudaHostUnregister(h_in3));
gpuErrchk(cudaFree(d_in1));
gpuErrchk(cudaFree(d_in2));
gpuErrchk(cudaFree(d_in3));
gpuErrchk(cudaFree(d_out1));
gpuErrchk(cudaFree(d_out2));
gpuErrchk(cudaFree(d_out3));
gpuErrchk(cudaFree(d_out));
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i]));
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
cudaDeviceReset();
return 0;
}
This is the timeline of the above code when run on a Kepler K20c card. As you can see, the computation overlaps the async memory transfers.
I am new to CUDA and cuBLAS, and recently I am trying to use the batched cuBLAS API to solve multiple systems of linear equations. Here's my code:
The size of the matrix is N, and the number of matrices (batch size) is numOfMat.
#include <stdio.h>
#include <stdlib.h>
#include <cstdio>
#include <iostream>
#include <chrono>
#include <random>
#include <cuda.h>
#include <cusolverDn.h>
#include <cuda_runtime.h>
#include <cuComplex.h> // deal with complex numbers
#include <cuda_profiler_api.h>
using namespace std::chrono;
#define N 6
#define numOfMat 500000
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main() {
std::random_device device;
std::mt19937 generator(device());
std::uniform_real_distribution<double> distribution(1., 5.);
high_resolution_clock::time_point t1;
high_resolution_clock::time_point t2;
double duration = 0;
double duration_1 = 0;
// step 1: cuda solver initialization
cublasHandle_t cublas_handle;
cublasCreate_v2(&cublas_handle);
cublasStatus_t stat;
int* PivotArray;
int* infoArray;
cudaError_t cudaStatUnified1 = cudaSuccess;
cudaError_t cudaStatUnified2 = cudaSuccess;
const cuDoubleComplex alpha = make_cuDoubleComplex(1.0f, 0.0f);
cudaStatUnified1 = cudaMallocManaged(&PivotArray, N * numOfMat * sizeof(int));
cudaStatUnified2 = cudaMallocManaged(&infoArray, numOfMat * sizeof(int));
if ((cudaSuccess != cudaStatUnified1) || (cudaSuccess != cudaStatUnified2))
std::cout << "unified memory allocation unsuccessful!" << std::endl;
//ALLOCATE MEMORY - using unified memory
cuDoubleComplex** h_A;
cudaMallocManaged(&h_A, sizeof(cuDoubleComplex*) * numOfMat);
cudaMallocManaged(&(h_A[0]), sizeof(cuDoubleComplex)*numOfMat*N*N);
for (int nm = 1; nm < numOfMat; nm++)
h_A[nm] = h_A[nm-1]+ N * N;
cuDoubleComplex** h_b;
cudaMallocManaged(&h_b, sizeof(cuDoubleComplex*) * numOfMat);
cudaMallocManaged(&(h_b[0]), sizeof(cuDoubleComplex) * numOfMat * N);
for (int nm = 1; nm < numOfMat; nm++)
h_b[nm] = h_b[nm-1] + N;
// FILL MATRICES
for (int nm = 0; nm < numOfMat; nm++)
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A[nm][j * N + i] = make_cuDoubleComplex(distribution(generator), distribution(generator));
// FILL COEFFICIENTS
for (int nm = 0; nm < numOfMat; nm++)
for (int i = 0; i < N; i++)
h_b[nm][i] = make_cuDoubleComplex(distribution(generator), distribution(generator));
t1 = high_resolution_clock::now();
// step 2: Perform CUBLAS LU solver
stat = cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("-data download failed");
gpuErrchk( cudaDeviceSynchronize() );
// check if the input matrix is singular
/*for (int i = 0; i < numOfMat; i++)
if (infoArray[i] != 0) {
fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
}*/
// step 3: INVERT UPPER AND LOWER TRIANGULAR MATRICES
// --- Function solves the triangular linear system with multiple RHSs
// --- Function overrides b as a result
stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("--data download failed");
gpuErrchk( cudaDeviceSynchronize() );
stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("---data download failed");
gpuErrchk( cudaDeviceSynchronize() );
t2 = high_resolution_clock::now();
duration = duration_cast<microseconds>(t2 - t1).count();
std::cout<<duration<<std::endl;
}
The code works fine, but when I plot the computational time versus the number of matrices, the curve looks like this:
My question is: why does the computational time scale linearly with the number of matrices? Intuitively, the curve should flatten out once the batch size is large enough. However, even when the batch size reaches 500,000, the time still appears to be linear in the batch size.
How can that be? Is there an explanation for this behavior?
I think you need to look more closely at your data. If I run a modification of your code on Google Colab (Tesla T4) I get this:
Which looks largely like your figure. But look more closely (log scales help):
You can clearly see that up to a certain point (around 2^6 = 64 matrices), the runtime is largely independent of the number of matrices, but then scaling becomes linear as the size increases. That is the transition from being able to parallelize the whole workload to reaching parallel capacity and having to schedule many parallel groups of operations to execute the workload. You might infer that this particular GPU runs out of parallel capacity at between 64 and 128 concurrent operations (the T4 has 40 SMs, so the limit might well be 80 if each SM could accommodate 2 of these operations concurrently), after which runtime scales with multiples of that limiting size.
This is completely normal behaviour for any parallel computation architecture I am familiar with.
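As a sketch of the kind of modification used to produce those plots (assuming the allocation and fill code from your question is kept at its maximum size; note also that getrf factorizes in place, so a rigorous benchmark would refill h_A before each run):

// Sweep the batch size in powers of two and time each factorization.
// Assumes cublas_handle, h_A, PivotArray, infoArray are set up as in
// the question; each run simply uses the first 'batch' matrices.
for (int batch = 1; batch <= numOfMat; batch *= 2) {
    t1 = high_resolution_clock::now();
    stat = cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, batch);
    gpuErrchk(cudaDeviceSynchronize());
    t2 = high_resolution_clock::now();
    std::cout << batch << " " << duration_cast<microseconds>(t2 - t1).count() << std::endl;
}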
I have a simple program that squares the elements of an array. Loop unrolling was done as follows:
loop unrolling
#include <stdio.h>
#include <cuda.h>
__global__ void square(float *a, int N,int idx);
// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
int idx = 2*(blockIdx.x * blockDim.x + threadIdx.x);
int n=N;
//printf("%d\n",n);
for(int q=0;q<2;q++)
{
if(idx+q < N) // bound the access; the original N<2000 test did not prevent out-of-range indices
{
arr[idx+q] = arr[idx+q] * arr[idx+q];
}
}
}
// main routine that executes on the host
int main(void)
{
clock_t start = clock(),diff;
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 1000; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 4;
//int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
first <<< 4, 128 >>> (a_d, N);
//cudaThreadSynchronize();
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
diff = clock() - start;
int msec = diff * 1000 / CLOCKS_PER_SEC;
printf("Time taken %d seconds %d milliseconds\n", msec/1000, msec%1000);
}
I then realized that the loop calculation could be replaced with dynamic parallelism.
Unrolling with dynamic parallelism was implemented as follows:
unrolling with dynamic parallelism
#include <stdio.h>
#include <cuda.h>
__global__ void square(float *a, int N,int idx);
// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
int idx = 2*(blockIdx.x * blockDim.x + threadIdx.x);
int n=N;
square <<< 1,2 >>> (arr, n,idx);
}
__global__ void square(float *a, int N,int idx)
{
int tdx = blockIdx.x * blockDim.x + threadIdx.x;
printf("%d\n",N);
if(tdx+idx < N) // bound the access; the original N<2000 test did not prevent out-of-range indices
{
a[tdx+idx] = a[tdx+idx] * a[tdx+idx];
}
}
// main routine that executes on the host
int main(void)
{
clock_t start = clock(),diff;
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 1000; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 4;
//int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
first <<< 4, 128 >>> (a_d, N);
//cudaThreadSynchronize();
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
diff = clock() - start;
int msec = diff * 1000 / CLOCKS_PER_SEC;
printf("Time taken %d seconds %d milliseconds\n", msec/1000, msec%1000);
}
The implementation of unrolling with dynamic parallelism takes more time to execute than unrolling alone. Aren't we supposed to improve execution time with dynamic parallelism in such a case?
Dynamic parallelism is mainly useful in cases where you have parallelism that is dynamic. That is: cases where you don't know how much parallelism you're going to need until you've done some calculation. Rather than transferring data back to the host so that it can immediately be fed into parameterising another launch, you launch from within the kernel. In this pattern, with the memcpys between kernel launches avoided, you'll see a speedup.
In your example above this is not the case. You could have just launched twice as many threads from the host. There's nothing dynamic required as there's no parallelism available there that you didn't know about at the time of the first kernel launch.
Furthermore, performance requirements for kernels launched using dynamic parallelism are similar to those for kernels launched from the host. You have to launch a reasonable amount of work or the launch latency will dominate your computation time.
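For contrast, here is a minimal sketch of a genuinely dynamic case (hypothetical kernels, for illustration only): the size of each child launch depends on a count computed on the device, so the host could not have sized the launches in advance. This needs compilation with -rdc=true and a device of compute capability 3.5 or higher.

// Child kernel: squares the n elements of one segment.
__global__ void child(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] = data[i] * data[i];
}

// Parent kernel: one block per segment. counts[] was produced by
// earlier device-side work, so only the device knows each child size.
__global__ void parent(float *data, const int *offsets, const int *counts)
{
    if (threadIdx.x == 0) {
        int n = counts[blockIdx.x];
        if (n > 0)
            child<<<(n + 255) / 256, 256>>>(data + offsets[blockIdx.x], n);
    }
}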
I'm trying to launch multiple CUDA FFT kernels asynchronously using streams.
For that, I'm creating my streams, cuFFT forward and inverse plans as follows:
streams = (cudaStream_t*) malloc(sizeof(cudaStream_t)*streamNum);
plansF = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
plansI = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
for(int i=0; i<streamNum; i++)
{
cudaStreamCreate(&streams[i]);
CHECK_ERROR(5)
cufftPlan1d(&plansF[i], ticks, CUFFT_R2C,1);
CHECK_ERROR(5)
cufftPlan1d(&plansI[i], ticks, CUFFT_C2R,1);
CHECK_ERROR(5)
cufftSetStream(plansF[i],streams[i]);
CHECK_ERROR(5)
cufftSetStream(plansI[i],streams[i]);
CHECK_ERROR(5)
}
In the main function, I'm launching forward FFTs as follows:
for(w=1;w<q;w++)
{
cufftExecR2C(plansF[w], gpuMem1+k,gpuMem2+j);
CHECK_ERROR(8)
k += rect_small_real;
j += rect_small_complex;
}
I also have other kernels that I launch asynchronously with the same streams.
When I profile my application using Visual Profiler 5.0, I see that all kernels except the CUDA FFT (both forward and inverse) run in parallel and overlap. FFT kernels do run in different streams, but they do not overlap, as they actually run sequentially.
Can anyone tell me what my problem is?
My environment is VS 2008, 64 bit, Windows 7.
Thanks.
This is a worked example of cuFFT execution and memcopies using streams in CUDA on the Kepler architecture.
Here is the code:
#include <stdio.h>
#include <cufft.h>
#define NUM_STREAMS 3
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/********/
/* MAIN */
/********/
int main()
{
const int N = 5000;
// --- Host input data initialization
float2 *h_in1 = new float2[N];
float2 *h_in2 = new float2[N];
float2 *h_in3 = new float2[N];
for (int i = 0; i < N; i++) {
h_in1[i].x = 1.f;
h_in1[i].y = 0.f;
h_in2[i].x = 1.f;
h_in2[i].y = 0.f;
h_in3[i].x = 1.f;
h_in3[i].y = 0.f;
}
// --- Host output data initialization
float2 *h_out1 = new float2[N];
float2 *h_out2 = new float2[N];
float2 *h_out3 = new float2[N];
for (int i = 0; i < N; i++) {
h_out1[i].x = 0.f;
h_out1[i].y = 0.f;
h_out2[i].x = 0.f;
h_out2[i].y = 0.f;
h_out3[i].x = 0.f;
h_out3[i].y = 0.f;
}
// --- Registers host memory as page-locked (required for async cudaMemcpyAsync)
gpuErrchk(cudaHostRegister(h_in1, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in2, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in3, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_out1, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_out2, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_out3, N*sizeof(float2), cudaHostRegisterPortable));
// --- Device input data allocation
float2 *d_in1; gpuErrchk(cudaMalloc((void**)&d_in1, N*sizeof(float2)));
float2 *d_in2; gpuErrchk(cudaMalloc((void**)&d_in2, N*sizeof(float2)));
float2 *d_in3; gpuErrchk(cudaMalloc((void**)&d_in3, N*sizeof(float2)));
float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
// --- Creates CUDA streams
cudaStream_t streams[NUM_STREAMS];
for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i]));
// --- Creates cuFFT plans and sets them in streams
cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
for (int i = 0; i < NUM_STREAMS; i++) {
cufftPlan1d(&plans[i], N, CUFFT_C2C, 1);
cufftSetStream(plans[i], streams[i]);
}
// --- Async memcopies and computations
gpuErrchk(cudaMemcpyAsync(d_in1, h_in1, N*sizeof(float2), cudaMemcpyHostToDevice, streams[0]));
gpuErrchk(cudaMemcpyAsync(d_in2, h_in2, N*sizeof(float2), cudaMemcpyHostToDevice, streams[1]));
gpuErrchk(cudaMemcpyAsync(d_in3, h_in3, N*sizeof(float2), cudaMemcpyHostToDevice, streams[2]));
cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD);
cufftExecC2C(plans[1], (cufftComplex*)d_in2, (cufftComplex*)d_out2, CUFFT_FORWARD);
cufftExecC2C(plans[2], (cufftComplex*)d_in3, (cufftComplex*)d_out3, CUFFT_FORWARD);
gpuErrchk(cudaMemcpyAsync(h_out1, d_out1, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[0]));
gpuErrchk(cudaMemcpyAsync(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[1]));
gpuErrchk(cudaMemcpyAsync(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[2]));
for(int i = 0; i < NUM_STREAMS; i++)
gpuErrchk(cudaStreamSynchronize(streams[i]));
// --- Releases resources
gpuErrchk(cudaHostUnregister(h_in1));
gpuErrchk(cudaHostUnregister(h_in2));
gpuErrchk(cudaHostUnregister(h_in3));
gpuErrchk(cudaHostUnregister(h_out1));
gpuErrchk(cudaHostUnregister(h_out2));
gpuErrchk(cudaHostUnregister(h_out3));
gpuErrchk(cudaFree(d_in1));
gpuErrchk(cudaFree(d_in2));
gpuErrchk(cudaFree(d_in3));
gpuErrchk(cudaFree(d_out1));
gpuErrchk(cudaFree(d_out2));
gpuErrchk(cudaFree(d_out3));
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i]));
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out1;
delete[] h_out2;
delete[] h_out3;
cudaDeviceReset();
return 0;
}
Please add cuFFT error checking according to CUFFT error handling.
Below, some profiling information from testing the above algorithm on a Kepler K20c card is provided. As you will see, you will achieve a true overlap between computation and memory transfers only if N is sufficiently large.
N = 5000
N = 50000
N = 500000
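If you want to reproduce these timelines (the file name and architecture flag are assumptions), the code can be compiled and traced, for instance, with

nvcc -O3 -arch=sm_35 streamed_cufft.cu -o streamed_cufft -lcufft
nvprof --print-gpu-trace ./streamed_cufft

or the executable can be opened in nvvp to inspect the graphical timeline.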
The problem is in the hardware you use.
All CUDA capable GPUs are capable of executing a kernel and copying data in both ways concurrently. However, only devices with Compute Capability 3.5 have the feature named Hyper-Q.
Briefly, these GPUs implement several (16, I suppose) hardware kernel queues. In previous GPUs, only one hardware queue is available.
This means that CUDA streams are only virtual, and on older hardware their use makes sense only for overlapping computations with memory copies. Of course, this is valid not only for cuFFT but also for your own kernels!
Please look carefully at the output of the Visual Profiler. You may unintentionally read the timeline visualization as exact data about GPU execution, but it is not that simple. Several of the displayed rows refer to the time point at which the kernel launch line was executed (usually the orange ones), while others correspond to the actual execution of the specific kernel on the GPU (the blue rectangles). The same holds for memory transfers (the exact time is shown as light brown rectangles).
I hope this helps you solve your problem.
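As a quick check of what your own device supports, you can query its properties with the runtime API; a minimal standalone sketch:

#include <stdio.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("Device: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    printf("Concurrent kernels supported: %d\n", prop.concurrentKernels);
    printf("Async copy engines: %d\n", prop.asyncEngineCount);
    // Hyper-Q (multiple hardware work queues) is available from compute capability 3.5.
    return 0;
}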
Here's a riff on @JackOLantern's code that allows easy variation of the number of FFTs, FFT length, and stream count to experiment with GPU utilization in nvvp.
// Compile with:
// nvcc --std=c++11 stream_parallel.cu -o stream_parallel -lcufft
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>
// Print file name, line number, and error code when a CUDA error occurs.
#define check_cuda_errors(val) __check_cuda_errors__ ( (val), #val, __FILE__, __LINE__ )
template <typename T>
inline void __check_cuda_errors__(T code, const char *func, const char *file, int line) {
if (code) {
std::cout << "CUDA error at "
<< file << ":" << line << std::endl
<< "error code: " << (unsigned int) code
<< " type: \"" << cudaGetErrorString(cudaGetLastError()) << "\"" << std::endl
<< "func: \"" << func << "\""
<< std::endl;
cudaDeviceReset();
exit(EXIT_FAILURE);
}
}
int main(int argc, char *argv[]) {
// Number of FFTs to compute.
const int NUM_DATA = 64;
// Length of each FFT.
const int N = 1048576;
// Number of GPU streams across which to distribute the FFTs.
const int NUM_STREAMS = 4;
// Allocate and initialize host input data.
float2 **h_in = new float2 *[NUM_STREAMS];
for (int ii = 0; ii < NUM_STREAMS; ii++) {
h_in[ii] = new float2[N];
for (int jj = 0; jj < N; ++jj) {
h_in[ii][jj].x = (float) 1.f;
h_in[ii][jj].y = (float) 0.f;
}
}
// Allocate and initialize host output data.
float2 **h_out = new float2 *[NUM_STREAMS];
for (int ii = 0; ii < NUM_STREAMS; ii++) {
h_out[ii] = new float2[N];
for (int jj = 0; jj < N; ++jj) {
h_out[ii][jj].x = 0.f;
h_out[ii][jj].y = 0.f;
}
}
// Pin host input and output memory for cudaMemcpyAsync.
for (int ii = 0; ii < NUM_STREAMS; ii++) {
check_cuda_errors(cudaHostRegister(h_in[ii], N*sizeof(float2), cudaHostRegisterPortable));
check_cuda_errors(cudaHostRegister(h_out[ii], N*sizeof(float2), cudaHostRegisterPortable));
}
// Allocate pointers to device input and output arrays.
float2 **d_in = new float2 *[NUM_STREAMS];
float2 **d_out = new float2 *[NUM_STREAMS];
// Allocate input and output arrays on device.
for (int ii = 0; ii < NUM_STREAMS; ii++) {
check_cuda_errors(cudaMalloc((void**)&d_in[ii], N*sizeof(float2)));
check_cuda_errors(cudaMalloc((void**)&d_out[ii], N*sizeof(float2)));
}
// Create CUDA streams.
cudaStream_t streams[NUM_STREAMS];
for (int ii = 0; ii < NUM_STREAMS; ii++) {
check_cuda_errors(cudaStreamCreate(&streams[ii]));
}
// Creates cuFFT plans and sets them in streams
cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
for (int ii = 0; ii < NUM_STREAMS; ii++) {
cufftPlan1d(&plans[ii], N, CUFFT_C2C, 1);
cufftSetStream(plans[ii], streams[ii]);
}
// Fill streams with async memcopies and FFTs.
for (int ii = 0; ii < NUM_DATA; ii++) {
int jj = ii % NUM_STREAMS;
check_cuda_errors(cudaMemcpyAsync(d_in[jj], h_in[jj], N*sizeof(float2), cudaMemcpyHostToDevice, streams[jj]));
cufftExecC2C(plans[jj], (cufftComplex*)d_in[jj], (cufftComplex*)d_out[jj], CUFFT_FORWARD);
check_cuda_errors(cudaMemcpyAsync(h_out[jj], d_out[jj], N*sizeof(float2), cudaMemcpyDeviceToHost, streams[jj]));
}
// Wait for calculations to complete.
for(int ii = 0; ii < NUM_STREAMS; ii++) {
check_cuda_errors(cudaStreamSynchronize(streams[ii]));
}
// Free memory and streams.
for (int ii = 0; ii < NUM_STREAMS; ii++) {
check_cuda_errors(cudaHostUnregister(h_in[ii]));
check_cuda_errors(cudaHostUnregister(h_out[ii]));
check_cuda_errors(cudaFree(d_in[ii]));
check_cuda_errors(cudaFree(d_out[ii]));
delete[] h_in[ii];
delete[] h_out[ii];
check_cuda_errors(cudaStreamDestroy(streams[ii]));
}
free(plans); // plans was allocated with malloc, so free it rather than delete
cudaDeviceReset();
return 0;
}
I'm setting up a one dimensional fftshift in CUDA. My code is the following
__global__ void fftshift(double2 *u_d, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
double2 temp;
if(i< N/2)
{
temp.x = u_d[i].x;
temp.y = u_d[i].y;
u_d[i].x =u_d[i+N/2].x;
u_d[i].y =u_d[i+N/2].y;
u_d[i+N/2].x = temp.x;
u_d[i+N/2].y = temp.y;
}
}
Is there any way, smarter than that shown above, to perform the fftshift in CUDA?
Thanks in advance.
A PERHAPS BETTER SOLUTION
I found that perhaps the following solution could be a good alternative
__global__ void fftshift(double2 *u_d, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < N)
{
double a = pow(-1.0,i&1);
u_d[i].x *= a;
u_d[i].y *= a;
}
}
It consists in multiplying the vector to be transformed by a sequence of 1s and -1s, which is equivalent to multiplying by exp(-j*n*pi); by the shift theorem, this translates the spectrum by N/2 samples in the conjugate domain.
You have to call this kernel before and after the application of the CUFFT.
One pro is that memory movements/swapping are avoided and the idea can be immediately extended to the 2D case, see CUDA Device To Device transfer expensive.
CONCERNING SYMMETRIC DATA
This solution seems not to be limited to symmetric data. Try, for example, the following Matlab code, which applies the idea to a fully complex random matrix (Gaussian amplitude and uniform phase).
N1=512;
N2=256;
Phase=(rand(N1,N2)-0.5)*2*pi;
Magnitude=randn(N1,N2);
Im=Magnitude.*exp(j*Phase);
Transform=fftshift(fft2(ifftshift(Im)));
n1=0:(N1-1);
n2=0:(N2-1);
[N2,N1]=meshgrid(n2,n1);
Im2=Im.*(-1).^(N1+N2);
Im3=fft2(Im2);
Im4=Im3.*(-1).^(N1+N2);
100*sqrt(sum(abs(Im4-Transform).^2)/sum(abs(Transform).^2))
The returned normalized root mean square error will be 0, confirming that Transform=Im4.
IMPROVEMENT TO THE SPEED
Following the suggestion received at the NVIDIA Forum, improved speed can be achieved by changing the instruction
double a = pow(-1.0,i&1);
to
double a = 1-2*(i&1);
to avoid the use of the slow routine pow.
After much time and the introduction of the callback functionality of cuFFT, I can provide a meaningful answer to my own question.
Above I was proposing a "perhaps better solution". After some testing, I have realized that, without using the callback cuFFT functionality, that solution is slower because it uses pow. Then, I have explored two alternatives to the use of pow, something like
float a = (float)(1-2*((int)offset%2));
float2 out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;
and
float2 out = ((float2*)d_in)[offset];
if ((int)offset&1) {
out.x = -out.x;
out.y = -out.y;
}
But, with standard cuFFT, all the above solutions require two separate kernel calls, one for the fftshift and one for the cuFFT execution call. However, with the new cuFFT callback functionality, the above alternative solutions can be embedded in the code as __device__ functions.
So, finally I ended up with the below comparison code
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <assert.h>
#include <cufft.h>
#include <cufftXt.h>
//#define DEBUG
#define BLOCKSIZE 256
/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// See http://stackoverflow.com/questions/16267149/cufft-error-handling
#ifdef _CUFFT_H_
// cuFFT API errors
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#endif
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
/****************************************/
/* FFTSHIFT 1D INPLACE MEMORY MOVEMENTS */
/****************************************/
__global__ void fftshift_1D_inplace_memory_movements(float2 *d_inout, unsigned int N)
{
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N/2)
{
float2 temp = d_inout[tid];
d_inout[tid] = d_inout[tid + (N / 2)];
d_inout[tid + (N / 2)] = temp;
}
}
/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 1 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v1(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {
float a = (float)(1-2*((int)offset%2));
float2 out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;
return out;
}
__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v1_Ptr = fftshift_1D_chessboard_callback_v1;
/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 2 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v2(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {
float a = pow(-1.,(double)(offset&1));
float2 out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;
return out;
}
__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v2_Ptr = fftshift_1D_chessboard_callback_v2;
/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 3 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v3(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {
float2 out = ((float2*)d_in)[offset];
if ((int)offset&1) {
out.x = -out.x;
out.y = -out.y;
}
return out;
}
__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v3_Ptr = fftshift_1D_chessboard_callback_v3;
/********/
/* MAIN */
/********/
int main()
{
const int N = 131072;
printf("N = %d\n", N);
// --- Host side input array
float2 *h_vect = (float2 *)malloc(N*sizeof(float2));
for (int i=0; i<N; i++) {
h_vect[i].x = (float)rand() / (float)RAND_MAX;
h_vect[i].y = (float)rand() / (float)RAND_MAX;
}
// --- Host side output arrays
float2 *h_out1 = (float2 *)malloc(N*sizeof(float2));
float2 *h_out2 = (float2 *)malloc(N*sizeof(float2));
float2 *h_out3 = (float2 *)malloc(N*sizeof(float2));
float2 *h_out4 = (float2 *)malloc(N*sizeof(float2));
// --- Device side input arrays
float2 *d_vect1; gpuErrchk(cudaMalloc((void**)&d_vect1, N*sizeof(float2)));
float2 *d_vect2; gpuErrchk(cudaMalloc((void**)&d_vect2, N*sizeof(float2)));
float2 *d_vect3; gpuErrchk(cudaMalloc((void**)&d_vect3, N*sizeof(float2)));
float2 *d_vect4; gpuErrchk(cudaMalloc((void**)&d_vect4, N*sizeof(float2)));
gpuErrchk(cudaMemcpy(d_vect1, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_vect2, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_vect3, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_vect4, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
// --- Device side output arrays
float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
float2 *d_out4; gpuErrchk(cudaMalloc((void**)&d_out4, N*sizeof(float2)));
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
/*******************************************/
/* cuFFT + MEMORY MOVEMENTS BASED FFTSHIFT */
/*******************************************/
cufftHandle planinverse; cufftSafeCall(cufftPlan1d(&planinverse, N, CUFFT_C2C, 1));
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse, d_vect1, d_vect1, CUFFT_INVERSE));
fftshift_1D_inplace_memory_movements<<<iDivUp(N/2, BLOCKSIZE), BLOCKSIZE>>>(d_vect1, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Memory movements elapsed time: %3.3f ms \n", time);
gpuErrchk(cudaMemcpy(h_out1, d_vect1, N*sizeof(float2), cudaMemcpyDeviceToHost));
/****************************************/
/* CHESSBOARD MULTIPLICATION V1 + cuFFT */
/****************************************/
cufftCallbackLoadC hfftshift_1D_chessboard_callback_v1_Ptr;
gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v1_Ptr, fftshift_1D_chessboard_callback_v1_Ptr, sizeof(hfftshift_1D_chessboard_callback_v1_Ptr)));
cufftHandle planinverse_v1; cufftSafeCall(cufftPlan1d(&planinverse_v1, N, CUFFT_C2C, 1));
cufftResult status = cufftXtSetCallback(planinverse_v1, (void **)&hfftshift_1D_chessboard_callback_v1_Ptr, CUFFT_CB_LD_COMPLEX, 0);
if (status == CUFFT_LICENSE_ERROR) {
printf("This sample requires a valid license file.\n");
printf("The file was either not found, out of date, or otherwise invalid.\n");
exit(EXIT_FAILURE);
} else {
cufftSafeCall(status);
}
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse_v1, d_vect2, d_out2, CUFFT_INVERSE));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Chessboard v1 elapsed time: %3.3f ms \n", time);
gpuErrchk(cudaMemcpy(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++) if ((h_out1[i].x != h_out2[i].x)||(h_out1[i].y != h_out2[i].y)) { printf("Chessboard v1 test failed!\n"); return 0; }
printf("Chessboard v1 test passed!\n");
/****************************************/
/* CHESSBOARD MULTIPLICATION V2 + cuFFT */
/****************************************/
cufftCallbackLoadC hfftshift_1D_chessboard_callback_v2_Ptr;
gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v2_Ptr, fftshift_1D_chessboard_callback_v2_Ptr, sizeof(hfftshift_1D_chessboard_callback_v2_Ptr)));
cufftHandle planinverse_v2; cufftSafeCall(cufftPlan1d(&planinverse_v2, N, CUFFT_C2C, 1));
status = cufftXtSetCallback(planinverse_v2, (void **)&hfftshift_1D_chessboard_callback_v2_Ptr, CUFFT_CB_LD_COMPLEX, 0);
if (status == CUFFT_LICENSE_ERROR) {
printf("This sample requires a valid license file.\n");
printf("The file was either not found, out of date, or otherwise invalid.\n");
exit(EXIT_FAILURE);
} else {
cufftSafeCall(status);
}
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse_v2, d_vect3, d_out3, CUFFT_INVERSE));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Chessboard v2 elapsed time: %3.3f ms \n", time);
gpuErrchk(cudaMemcpy(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++) if ((h_out1[i].x != h_out3[i].x)||(h_out1[i].y != h_out3[i].y)) { printf("Chessboard v2 test failed!\n"); return 0; }
printf("Chessboard v2 test passed!\n");
/****************************************/
/* CHESSBOARD MULTIPLICATION V3 + cuFFT */
/****************************************/
cufftCallbackLoadC hfftshift_1D_chessboard_callback_v3_Ptr;
gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v3_Ptr, fftshift_1D_chessboard_callback_v3_Ptr, sizeof(hfftshift_1D_chessboard_callback_v3_Ptr)));
cufftHandle planinverse_v3; cufftSafeCall(cufftPlan1d(&planinverse_v3, N, CUFFT_C2C, 1));
status = cufftXtSetCallback(planinverse_v3, (void **)&hfftshift_1D_chessboard_callback_v3_Ptr, CUFFT_CB_LD_COMPLEX, 0);
if (status == CUFFT_LICENSE_ERROR) {
printf("This sample requires a valid license file.\n");
printf("The file was either not found, out of date, or otherwise invalid.\n");
exit(EXIT_FAILURE);
} else {
cufftSafeCall(status);
}
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse_v3, d_vect4, d_out4, CUFFT_INVERSE));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Chessboard v3 elapsed time: %3.3f ms \n", time);
gpuErrchk(cudaMemcpy(h_out4, d_out4, N*sizeof(float2), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++) if ((h_out1[i].x != h_out4[i].x)||(h_out1[i].y != h_out4[i].y)) { printf("Chessboard v3 test failed!\n"); return 0; }
printf("Chessboard v3 test passed!\n");
return 0;
}
RESULTS ON A GTX 480
N Mem mov v1 v2 v3
131072 0.552 0.136 0.354 0.183
262144 0.536 0.175 0.451 0.237
524288 0.661 0.283 0.822 0.290
1048576 0.784 0.565 1.548 0.548
2097152 1.298 0.952 2.973 0.944
RESULTS ON A TESLA C2050
N Mem mov v1 v2 v3
131072 0.278 0.130 0.236 0.132
262144 0.344 0.202 0.374 0.206
524288 0.544 0.378 0.696 0.387
1048576 0.909 0.695 1.294 0.695
2097152 1.656 1.349 2.531 1.349
RESULTS ON A KEPLER K20c
N Mem mov v1 v2 v3
131072 0.077 0.076 0.136 0.076
262144 0.142 0.128 0.202 0.127
524288 0.268 0.229 0.374 0.230
1048576 0.516 0.433 0.717 0.435
2097152 1.019 0.853 1.400 0.855
Some more details have recently appeared at The 1D fftshift in CUDA by chessboard multiplication and at the GitHub page.
If space is not a concern (and you are using fftshift for only one dimension), create u_d with size 1.5 * N and write the first N/2 elements at the end. You can then move u_d to u_d + N/2.
Here is how you could do it.
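// Note: CUDA_CHK is assumed to be an error-checking macro analogous to the gpuErrchk used elsewhere on this page.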
double2 *u_d, *u_d_begin;
size_t bytes = N * sizeof(double2);
// This is different from bytes / 2 when N is odd
size_t half_bytes = (N / 2) * sizeof(double2);
CUDA_CHK(cudaMalloc( &u_d, bytes + half_bytes ));
u_d_begin = u_d;
...
// Do some processing and populate u_d;
...
// Copy first half to the end
CUDA_CHK(cudaMemcpy(u_d + N, u_d, half_bytes, cudaMemcpyDeviceToDevice));
u_d = u_d + N / 2;
I am a newbie to Thrust. I see that all Thrust presentations and examples only show host code.
I would like to know if I can pass a device_vector to my own kernel? How?
If yes, what are the operations permitted on it inside kernel/device code?
As it was originally written, Thrust is purely a host side abstraction. It cannot be used inside kernels. You can pass the device memory encapsulated inside a thrust::device_vector to your own kernel like this:
thrust::device_vector< Foo > fooVector;
// Do something thrust-y with fooVector
Foo* fooArray = thrust::raw_pointer_cast( fooVector.data() );
// Pass raw array and its size to kernel
someKernelCall<<< x, y >>>( fooArray, fooVector.size() );
You can also use device memory not allocated by Thrust within Thrust algorithms by instantiating a thrust::device_ptr with the bare CUDA device memory pointer, as sketched below.
Edited four and a half years later to add that, as per @JackOLantern's answer, Thrust 1.8 adds a sequential execution policy, which means you can run single-threaded versions of Thrust's algorithms on the device. Note that it still isn't possible to directly pass a Thrust device vector to a kernel, and device vectors can't be directly used in device code.
Note that it is also possible to use the thrust::device execution policy in some cases to have parallel Thrust execution launched by a kernel as a child grid. This requires separate compilation/device linkage and hardware which supports dynamic parallelism. I am not certain whether this is actually supported in all Thrust algorithms, but it certainly works with some.
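A minimal sketch of that direction (the array size and fill value are arbitrary):

#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/reduce.h>

int *raw_ptr;
cudaMalloc((void**)&raw_ptr, 1024 * sizeof(int));
// Wrap the raw pointer so Thrust algorithms treat it as device memory.
thrust::device_ptr<int> dev_ptr = thrust::device_pointer_cast(raw_ptr);
thrust::fill(dev_ptr, dev_ptr + 1024, 7);
int sum = thrust::reduce(dev_ptr, dev_ptr + 1024); // 7 * 1024
cudaFree(raw_ptr);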
This is an update to my previous answer.
Starting from Thrust 1.8.1, CUDA Thrust primitives can be combined with the thrust::device execution policy to run in parallel within a single CUDA thread exploiting CUDA dynamic parallelism. Below, an example is reported.
#include <stdio.h>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
#include "TimingGPU.cuh"
#include "Utilities.cuh"
#define BLOCKSIZE_1D 256
#define BLOCKSIZE_2D_X 32
#define BLOCKSIZE_2D_Y 32
/*************************/
/* TEST KERNEL FUNCTIONS */
/*************************/
__global__ void test1(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) {
const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
if (tid < Nrows) d_results[tid] = thrust::reduce(thrust::seq, d_data + tid * Ncols, d_data + (tid + 1) * Ncols);
}
__global__ void test2(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) {
const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
if (tid < Nrows) d_results[tid] = thrust::reduce(thrust::device, d_data + tid * Ncols, d_data + (tid + 1) * Ncols);
}
/********/
/* MAIN */
/********/
int main() {
const int Nrows = 64;
const int Ncols = 2048;
gpuErrchk(cudaFree(0));
// size_t DevQueue;
// gpuErrchk(cudaDeviceGetLimit(&DevQueue, cudaLimitDevRuntimePendingLaunchCount));
// DevQueue *= 128;
// gpuErrchk(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, DevQueue));
float *h_data = (float *)malloc(Nrows * Ncols * sizeof(float));
float *h_results = (float *)malloc(Nrows * sizeof(float));
float *h_results1 = (float *)malloc(Nrows * sizeof(float));
float *h_results2 = (float *)malloc(Nrows * sizeof(float));
float sum = 0.f;
for (int i=0; i<Nrows; i++) {
h_results[i] = 0.f;
for (int j=0; j<Ncols; j++) {
h_data[i*Ncols+j] = i;
h_results[i] = h_results[i] + h_data[i*Ncols+j];
}
}
TimingGPU timerGPU;
float *d_data; gpuErrchk(cudaMalloc((void**)&d_data, Nrows * Ncols * sizeof(float)));
float *d_results1; gpuErrchk(cudaMalloc((void**)&d_results1, Nrows * sizeof(float)));
float *d_results2; gpuErrchk(cudaMalloc((void**)&d_results2, Nrows * sizeof(float)));
gpuErrchk(cudaMemcpy(d_data, h_data, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice));
timerGPU.StartCounter();
test1<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results1, Nrows, Ncols);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
printf("Timing approach nr. 1 = %f\n", timerGPU.GetCounter());
gpuErrchk(cudaMemcpy(h_results1, d_results1, Nrows * sizeof(float), cudaMemcpyDeviceToHost));
for (int i=0; i<Nrows; i++) {
if (h_results1[i] != h_results[i]) {
printf("Approach nr. 1; Error at i = %i; h_results1 = %f; h_results = %f", i, h_results1[i], h_results[i]);
return 0;
}
}
timerGPU.StartCounter();
test2<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results2, Nrows, Ncols);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
printf("Timing approach nr. 2 = %f\n", timerGPU.GetCounter());
gpuErrchk(cudaMemcpy(h_results2, d_results2, Nrows * sizeof(float), cudaMemcpyDeviceToHost));
for (int i=0; i<Nrows; i++) {
if (h_results2[i] != h_results[i]) {
printf("Approach nr. 2; Error at i = %i; h_results2 = %f; h_results = %f", i, h_results2[i], h_results[i]);
return 0;
}
}
printf("Test passed!\n");
}
The above example performs reductions of the rows of a matrix in the same sense as Reduce matrix rows with CUDA, but it is done differently from the above post, namely, by calling CUDA Thrust primitives directly from user written kernels. Also, the above example serves to compare the performance of the same operations when done with two execution policies, namely, thrust::seq and thrust::device. Below, some graphs showing the difference in performance.
The performance has been evaluated on a Kepler K20c and on a Maxwell GeForce GTX 850M.
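Note that, since test2 launches Thrust's parallel work from device code via dynamic parallelism, the example must be compiled with relocatable device code and linked against the device runtime, for instance (the architecture and file name are assumptions):

nvcc -arch=sm_35 -rdc=true thrust_device_reduce.cu -o thrust_device_reduce -lcudadevrt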
I would like to provide an updated answer to this question.
Starting from Thrust 1.8, CUDA Thrust primitives can be combined with the thrust::seq execution policy to run sequentially within a single CUDA thread (or sequentially within a single CPU thread). Below, an example is reported.
If you want parallel execution within a thread, then you may consider using CUB which provides reduction routines that can be called from within a threadblock, provided that your card enables dynamic parallelism.
Here is the example with Thrust
#include <stdio.h>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void test(float *d_A, int N) {
float sum = thrust::reduce(thrust::seq, d_A, d_A + N);
printf("Device side result = %f\n", sum);
}
int main() {
const int N = 16;
float *h_A = (float*)malloc(N * sizeof(float));
float sum = 0.f;
for (int i=0; i<N; i++) {
h_A[i] = i;
sum = sum + h_A[i];
}
printf("Host side result = %f\n", sum);
float *d_A; gpuErrchk(cudaMalloc((void**)&d_A, N * sizeof(float)));
gpuErrchk(cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice));
test<<<1,1>>>(d_A, N);
gpuErrchk(cudaDeviceSynchronize()); // ensure the device-side printf completes before program exit
}
If you mean to use the data allocated/processed by Thrust, yes you can; just get the raw pointer of the allocated data.
int * raw_ptr = thrust::raw_pointer_cast(dev_ptr);
If you want to allocate Thrust vectors inside a kernel, I have never tried, but I don't think it will work; and even if it works, I don't think it will provide any benefit.