Measuring const memory access latency gives strange results - CUDA

I have 3 kernels that sum two numbers. The first adds two numbers held in registers. The second takes one number from const memory and the other from a register. The third takes both numbers from const memory.
According to the article "Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking", the latency of an access to the L1 constant cache is ~24 cycles and the latency of an IADD instruction is 6 cycles on Pascal. Therefore I expected a duration greater than 6 for the first kernel and greater than 24 for the second and third.
But when I measure the time with clock() I get 13, 12 and 214 respectively.
My video card is a GeForce GTX 1050 Mobile, with CUDA 10.1.
Compilation command: nvcc -arch=sm_61 main.cu -o main
Below are the program code and fragments of the SASS code.
Program code:
#include <iostream>
#include <algorithm> // for std::fill

#define RES_SIZE 10

__global__ void measureReg(int *res){
    int a = res[0], b = res[1], c;
    __shared__ int shdata[1];
    for(int i = 0; i < 150; ++i) __syncthreads(); // Covers latencies of accesses to global memory
    unsigned int t1, t2;
    t1 = clock();
    asm volatile("add.s32 %0, %1, %2;": "=r"(c) : "r"(a), "r"(b) : "memory");
    shdata[0] = c; // Prevents execution of clock() before the add has finished
    t2 = clock();
    res[0] = t2 - t1;
}

__global__ void measureConst1(int *res, int n1){
    int a, b;
    a = res[0];
    __shared__ int shdata[1];
    for(int i = 0; i < 150; ++i) __syncthreads();
    unsigned int t1, t2;
    t1 = clock();
    asm volatile("add.s32 %0, %1, %2;": "=r"(b) : "r"(a), "r"(n1) : "memory");
    shdata[0] = b;
    t2 = clock();
    res[0] = t2 - t1;
}

__global__ void measureConst2(int *res, int n1, int n2){
    int a;
    __shared__ int shdata[1];
    unsigned int t1, t2;
    t1 = clock();
    asm volatile("add.s32 %0, %1, %2;": "=r"(a) : "r"(n1), "r"(n2) : "memory");
    shdata[0] = a;
    t2 = clock();
    res[0] = t2 - t1;
}

int main(int argc, char** argv){
    int hostRes[RES_SIZE], *devRes;
    std::fill(hostRes, hostRes + RES_SIZE, 1);
    cudaMalloc(&devRes, RES_SIZE*sizeof(int));
    cudaMemcpy(devRes, hostRes, RES_SIZE*sizeof(int), cudaMemcpyHostToDevice);

    measureReg<<<1,1>>>(devRes);
    cudaMemcpy(hostRes, devRes, RES_SIZE*sizeof(int), cudaMemcpyDeviceToHost);
    std::cout<<"IADD with registers: "<<hostRes[0]<<std::endl;

    measureConst1<<<1,1>>>(devRes, 10);
    cudaMemcpy(hostRes, devRes, RES_SIZE*sizeof(int), cudaMemcpyDeviceToHost);
    std::cout<<"IADD with register and const mem: "<<hostRes[0]<<std::endl;

    measureConst2<<<1,1>>>(devRes, 10, 20);
    cudaMemcpy(hostRes, devRes, RES_SIZE*sizeof(int), cudaMemcpyDeviceToHost);
    std::cout<<"IADD with const mem: "<<hostRes[0]<<std::endl;

    cudaFree(devRes);
    return 0;
}
Fragments of SASS code:
/* measureReg */
CS2R R4, SR_CLOCKLO ;
IADD R0, R0, R5 ;
STS [RZ], R0 ;
CS2R R5, SR_CLOCKLO ;
/* measureConst1 */
CS2R R4, SR_CLOCKLO ;
IADD R0, R0, c[0x0][0x148] ;
STS [RZ], R0 ;
CS2R R5, SR_CLOCKLO ;
/* measureConst2 */
CS2R R2, SR_CLOCKLO ;
MOV R0, c[0x0][0x148] ;
IADD R0, R0, c[0x0][0x14c] ;
STS [RZ], R0 ;
CS2R R0, SR_CLOCKLO ;

This is broken:
int hostRes[RES_SIZE], *devRes;
std::fill(hostRes, hostRes + RES_SIZE, 1);
cudaMemcpy(devRes, hostRes, RES_SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc(&devRes, RES_SIZE*sizeof(int));
The compiler issues a warning to the effect that devRes is used before its value is set. You should not ignore these warnings. The correct sequence is:
int hostRes[RES_SIZE], *devRes;
std::fill(hostRes, hostRes + RES_SIZE, 1);
cudaMalloc(&devRes, RES_SIZE*sizeof(int));
cudaMemcpy(devRes, hostRes, RES_SIZE*sizeof(int), cudaMemcpyHostToDevice);
With that change, and compiling your code for sm_61 using CUDA 10.2, and focusing on your measureConst2 function (although the loop unrolling behavior is the same for all), I observe SASS that looks like this:
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_61
Function : _Z13measureConst2Piii
.headerflags #"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/* 0x001fbc00fde007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ NOP ; /* 0x50b0000000070f00 */
/*0018*/ NOP ; /* 0x50b0000000070f00 */
/* 0x001fd400ffe007ed */
/*0028*/ NOP ; /* 0x50b0000000070f00 */
/*0030*/ BAR.SYNC 0x0 ; /* 0xf0a81b8000070000 */
/*0038*/ MEMBAR.CTA ; /* 0xef98000000070000 */
/* 0x001fb800fde007ef */
/*0048*/ NOP ; /* 0x50b0000000070f00 */
/*0050*/ NOP ; /* 0x50b0000000070f00 */
/*0058*/ NOP ; /* 0x50b0000000070f00 */
/* 0x001fbc00fea007ff */
/*0068*/ BAR.SYNC 0x0 ; /* 0xf0a81b8000070000 */
/*0070*/ MEMBAR.CTA ; /* 0xef98000000070000 */
The above sequence repeats 149 times due to compiler unrolling and optimization...
/*1f18*/ NOP ; /* 0x50b0000000070f00 */
/* 0x001ffc00fdc007ef */
/*1f28*/ NOP ; /* 0x50b0000000070f00 */
/*1f30*/ NOP ; /* 0x50b0000000070f00 */
/*1f38*/ BAR.SYNC 0x0 ; /* 0xf0a81b8000070000 */
/* 0x001fd800fcc007f5 */
/*1f48*/ MEMBAR.CTA ; /* 0xef98000000070000 */
/*1f50*/ CS2R R2, SR_CLOCKLO ; /* 0x50c8000005070002 */
/*1f58*/ MOV R0, c[0x0][0x148] ; /* 0x4c98078005270000 */
/* 0x003f98001e4007f4 */
/*1f68*/ IADD R0, R0, c[0x0][0x14c] ; /* 0x4c10000005370000 */
/*1f70*/ STS [RZ], R0 ; /* 0xef5c00000007ff00 */
/*1f78*/ CS2R R0, SR_CLOCKLO ; /* 0x50c8000005070000 */
/* 0x001fc800fe2007f1 */
/*1f88*/ IADD R0, -R2, R0 ; /* 0x5c12000000070200 */
/*1f90*/ MOV R2, c[0x0][0x140] ; /* 0x4c98078005070002 */
/*1f98*/ MOV R3, c[0x0][0x144] ; /* 0x4c98078005170003 */
/* 0x001ffc00fde007f1 */
/*1fa8*/ STG.E [R2], R0 ; /* 0xeedc200000070200 */
/*1fb0*/ NOP ; /* 0x50b0000000070f00 */
/*1fb8*/ EXIT ; /* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*1fc8*/ BRA 0x1fc0 ; /* 0xe2400fffff07000f */
/*1fd0*/ NOP; /* 0x50b0000000070f00 */
What we note is that the compiler has unrolled your loop of 150 iterations (I'm not sure what you were hoping to accomplish with that), and that your load of n1 and n2 (the kernel arguments) happens only once.
These loads occur here:
/*1f58*/ MOV R0, c[0x0][0x148] ; /* 0x4c98078005270000 */
/* 0x003f98001e4007f4 */
/*1f68*/ IADD R0, R0, c[0x0][0x14c] ; /* 0x4c10000005370000
These are loading kernel parameters through the __constant__ memory system (this is expected behavior). The __constant__ memory system is not the same as the L1 cache or the "L1 constant cache". Even if we ignore that point, it only makes sense to talk about the latency associated with cache access if the item requested is already in the cache. In the above SASS code, there is no reason to assume this is true. You are accessing the items only once, and therefore you are witnessing the latency associated with populating the __constant__ cache with items from global memory (which is ultimately how all data moves from host to device -- even kernel arguments).
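As an aside, and not part of the original question: a sketch of how the miss and the hit could be separated, by timing the same constant-bank access twice inside one kernel. This assumes the compiler still emits a constant-bank operand in the repeated IADD rather than hoisting the values into registers, so the generated SASS should be checked.
__global__ void measureConst2_warm(int *res, int n1, int n2){
    __shared__ int shdata[1];
    unsigned int t1, t2;
    int a;
    // Time the same add twice: the first pass pays the constant-cache miss,
    // the second pass should find the parameters already cached.
    for(int rep = 0; rep < 2; ++rep){
        t1 = clock();
        asm volatile("add.s32 %0, %1, %2;": "=r"(a) : "r"(n1), "r"(n2) : "memory");
        shdata[0] = a; // Prevents execution of clock() before the add has finished
        t2 = clock();
        res[rep] = t2 - t1; // res[0]: cold access, res[1]: warm access
    }
}
Launched as measureConst2_warm<<<1,1>>>(devRes, 10, 20), res[0] would then hold the cold measurement and res[1] the warm one.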
You might now ask, "Why don't I see that long latency with measureConst1?" In that case, your kernel design is a bit different, and we see that the load behavior is also a bit different. The SASS looks like this:
Function : _Z13measureConst1Pii
.headerflags #"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/* 0x001fc800fe2007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ MOV R2, c[0x0][0x140] ; /* 0x4c98078005070002 */
/*0018*/ MOV R3, c[0x0][0x144] ; /* 0x4c98078005170003 */
/* 0x001fbc00fde000b1 */
/*0028*/ LDG.E R0, [R2] ; /* 0xeed4200000070200 */
/*0030*/ NOP ; /* 0x50b0000000070f00 */
/*0038*/ NOP ; /* 0x50b0000000070f00 */
/* 0x001fd400ffe007e9 */
/*0048*/ NOP ; /* 0x50b0000000070f00 */
/*0050*/ BAR.SYNC 0x0 ; /* 0xf0a81b8000070000 */
/*0058*/ MEMBAR.CTA ; /* 0xef98000000070000 */
(repeating ...)
/*1f68*/ MEMBAR.CTA ; /* 0xef98000000070000 */
/*1f70*/ CS2R R4, SR_CLOCKLO ; /* 0x50c8000005070004 */
/*1f78*/ IADD R0, R0, c[0x0][0x148] ; /* 0x4c10000005270000 */
/* 0x003fc800fcc000f2 */
/*1f88*/ STS [RZ], R0 ; /* 0xef5c00000007ff00 */
/*1f90*/ CS2R R5, SR_CLOCKLO ; /* 0x50c8000005070005 */
/*1f98*/ IADD R0, -R4, R5 ; /* 0x5c12000000570400 */
/* 0x001fa000fde007f1 */
/*1fa8*/ STG.E [R2], R0 ; /* 0xeedc200000070200 */
/*1fb0*/ NOP ; /* 0x50b0000000070f00 */
/*1fb8*/ NOP ; /* 0x50b0000000070f00 */
/* 0x001f8000ffe007ff */
/*1fc8*/ EXIT ; /* 0xe30000000007000f */
/*1fd0*/ BRA 0x1fd0 ; /* 0xe2400fffff87000f */
/*1fd8*/ NOP; /* 0x50b0000000070f00 */
Here we see that, at the very beginning of your kernel (due to your specific kernel design), the SASS is loading items from __constant__ memory that are immediately adjacent to the items you are loading in your timing region. Therefore it's reasonable to assume that, with some cache line load granularity, your timing region is now measuring something like cache latency rather than global load latency.

It seems I have found the answer to my question.
According to this question, there are immediate constant (IMC) and indexed constant (INC) caches. The INC cache handles accesses generated by the LDC instruction, and the IMC cache handles the others.
I believe that the L1 constant-cache latency reported in the article "Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking" is an average over these two caches, and that the latency of the IMC cache is no longer than the latency of a register-file access.
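To make the distinction concrete, here is a small illustrative kernel (cdata and runtime_idx are hypothetical names; the mapping of each access to IMC or INC reflects my understanding of how compile-time versus runtime indices are compiled, and should be verified against the generated SASS):
__constant__ int cdata[256];
__global__ void imc_vs_inc(int *out, int runtime_idx){
    // Compile-time-constant index: the operand can be encoded directly as
    // c[bank][offset] inside the consuming instruction -> immediate constant (IMC) path.
    int a = cdata[5];
    // Runtime-computed index: the compiler has to emit an LDC instruction
    // with a computed address -> indexed constant (INC) path.
    int b = cdata[runtime_idx];
    out[0] = a + b;
}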
To check these hypotheses I wrote a small benchmark containing 6 kernels. Each kernel sequentially reads an array of integers located in __constant__ memory and measures the latency of the reads.
The kernels differ as follows. The first kernel ("imc") reads data through the IMC cache without any prefetching. The second one ("imc_with_imc_prefetch") also reads through the IMC cache, but first prefetches the data into it. The third one ("imc_with_inc_prefetch") prefetches the data into the INC cache and then reads through the IMC cache. The other 3 kernels ("inc", "inc_with_imc_prefetch", "inc_with_inc_prefetch") do the same through the INC cache.
From the obtained results I came to the following conclusions:
The IMC latency is 12 cycles and the INC latency is 40 cycles. Their average, 26, is very close to the L1 constant-cache latency given in the article mentioned above.
The cache line size of both the IMC and the INC cache is 64 bytes.
There is an L1.5 constant cache with a line size of 256 bytes and an average latency of 78 cycles. When accessed through the IMC path its latency is 60, and when accessed through the INC path its latency is 96.
Full code of the benchmark:
#include <iostream>
#define SMPL_COUNT 128
#define CONST_SIZE 10000 // Count of elements in constant array
__constant__ int carr[CONST_SIZE];
__global__ void imc(int* arr, int* t){
unsigned int t1, t2;
__shared__ int temp[SMPL_COUNT];
__shared__ int times[SMPL_COUNT];
// Reading from immediate constant cache
int sum = 0;
for(int i=0;i<SMPL_COUNT;++i) {
t1 = clock();
sum += carr[i];
temp[i] = sum;
t2 = clock();
times[i] = t2-t1;
__syncthreads();
}
for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
arr[0] = t2-t1;
arr[1] = sum;
}
__global__ void imc_with_imc_prefetch(int* arr, int* t){
unsigned int t1, t2;
__shared__ int temp[SMPL_COUNT];
__shared__ int times[SMPL_COUNT];
const int stride = 32/4; // Make assumption that cache line is more or equal to 32 bytes
// Prefetch data to immediate constant cache
for(int i=0; i<(SMPL_COUNT+stride-1)/stride; ++i) arr[i] = carr[i*stride];
// Reading from immediate constant cache
int sum = 0;
for(int i=0;i<SMPL_COUNT;++i) {
t1 = clock();
sum += carr[i];
temp[i] = sum;
t2 = clock();
times[i] = t2-t1;
__syncthreads();
}
for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
arr[0] = t2-t1;
arr[1] = sum;
}
__global__ void imc_with_inc_prefetch(int* arr, int* t){
unsigned int t1, t2;
__shared__ int temp[SMPL_COUNT];
__shared__ int times[SMPL_COUNT];
// Prefetch data to index constant cache
int index = carr[CONST_SIZE-1];
for(int i=0;i<SMPL_COUNT;++i)
index = *((int*)(((char*)carr)+index)); //Subtle way to avoid calling of shift instruction
arr[0] = index;
__syncthreads();
//Reading from immediate constant cache
index = 0;
for(int i=0;i<SMPL_COUNT;++i){
t1 = clock();
index += carr[i];
temp[i]=index;
t2 = clock();
times[i] = t2-t1;
__syncthreads();
}
for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
arr[0] = t2-t1;
arr[1] = index;
}
__global__ void inc(int* arr, int* t){
unsigned int t1, t2;
__shared__ int temp[SMPL_COUNT];
__shared__ int times[SMPL_COUNT];
int index = carr[CONST_SIZE-1];
arr[SMPL_COUNT - 1] = index;
__syncthreads();
//Reading from index constant cache
for(int i=0;i<SMPL_COUNT;++i){
t1 = clock();
index = *((int*)(((char*)carr)+index));
temp[i] = index;
t2 = clock();
times[i] = t2-t1;
__syncthreads();
}
for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
arr[0] = t2-t1;
arr[1] = index;
}
__global__ void inc_with_imc_prefetch(int* arr, int* t){
unsigned int t1, t2;
__shared__ int temp[SMPL_COUNT];
__shared__ int times[SMPL_COUNT];
const int stride = 32/4; // Make assumption that cache line is more or equal to 32 bytes
// Prefetch data to immediate constant cache
for(int i=0; i<(SMPL_COUNT+stride-1)/stride; ++i)
arr[i] = carr[i*stride];
int index = carr[CONST_SIZE-1];
arr[SMPL_COUNT - 1] = index;
__syncthreads();
//Reading from index constant cache
for(int i=0;i<SMPL_COUNT;++i){
t1 = clock();
index = *((int*)(((char*)carr)+index));
temp[i] = index;
t2 = clock();
times[i] = t2-t1;
__syncthreads();
}
for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
arr[0] = t2-t1;
arr[1] = index;
}
__global__ void inc_with_inc_prefetch(int*arr, int* t){
unsigned int t1, t2;
__shared__ int temp[SMPL_COUNT];
__shared__ int times[SMPL_COUNT];
int index = carr[CONST_SIZE-1];
for(int i=0;i<SMPL_COUNT;++i){
index = carr[index/4];
}
arr[0] = index;
index = carr[CONST_SIZE-1];
arr[SMPL_COUNT - 1] = index;
__syncthreads();
for(int i=0;i<SMPL_COUNT;++i){
t1 = clock();
index = *((int*)(((char*)carr)+index));
temp[i] = index;
t2 = clock();
times[i] = t2-t1;
__syncthreads();
}
for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
arr[0] = t2-t1;
arr[1] = index+2;
}
int main(int argc, char** argv){
int hostArr[SMPL_COUNT], *devArr, *devTimes;
int imc_times[SMPL_COUNT], imc_imc_times[SMPL_COUNT], imc_inc_times[SMPL_COUNT];
int inc_times[SMPL_COUNT], inc_imc_times[SMPL_COUNT], inc_inc_times[SMPL_COUNT];
cudaMalloc(&devArr, SMPL_COUNT*sizeof(int));
cudaMalloc(&devTimes, SMPL_COUNT*sizeof(int));
cudaMemset (carr, 0, CONST_SIZE*sizeof(int));
cudaMemset (devArr, 0, SMPL_COUNT*sizeof(int));
cudaMemset (devTimes, 0, SMPL_COUNT*sizeof(int));
for(int i=0;i<SMPL_COUNT;++i) hostArr[i]=4*(i+1);
cudaMemcpyToSymbol(carr, hostArr, SMPL_COUNT*sizeof(int));
imc<<<1,1>>>(devArr, devTimes);
cudaMemcpy(imc_times, devTimes, SMPL_COUNT*sizeof(int), cudaMemcpyDeviceToHost);
imc_with_imc_prefetch<<<1,1>>>(devArr, devTimes);
cudaMemcpy(imc_imc_times, devTimes, SMPL_COUNT*sizeof(int), cudaMemcpyDeviceToHost);
imc_with_inc_prefetch<<<1,1>>>(devArr, devTimes);
cudaMemcpy(imc_inc_times, devTimes, SMPL_COUNT*sizeof(int), cudaMemcpyDeviceToHost);
inc<<<1,1>>>(devArr, devTimes);
cudaMemcpy(inc_times, devTimes, SMPL_COUNT*sizeof(int), cudaMemcpyDeviceToHost);
inc_with_imc_prefetch<<<1,1>>>(devArr, devTimes);
cudaMemcpy(inc_imc_times, devTimes, SMPL_COUNT*sizeof(int), cudaMemcpyDeviceToHost);
inc_with_inc_prefetch<<<1,1>>>(devArr, devTimes);
cudaMemcpy(inc_inc_times, devTimes, SMPL_COUNT*sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(devArr);
cudaFree(devTimes);
std::cout<<"IMC\tIMC(IMC)\tIMC(INC)\tINC\tINC(IMC)\tINC(INC)\n";
for(int i=0;i<SMPL_COUNT;++i){
std::cout<<imc_times[i]<<"\t"<<imc_imc_times[i]<<"\t"<<imc_inc_times[i]<<"\t";
std::cout<<inc_times[i]<<"\t"<<inc_imc_times[i]<<"\t"<<inc_inc_times[i]<<"\n";
}
return 0;
}

Related

Asynchronous executions of CUDA memory copies and cuFFT

I have a CUDA program for calculating FFTs of, let's say, size 50000. Currently, I copy the whole array to the GPU and execute the cuFFT. Now, I am trying to optimize the program, and the NVIDIA Visual Profiler tells me to hide the memcopy by overlapping it with parallel computations. My question is:
Is it possible, for example, to copy the first 5000 elements, then start calculating, and then copy the next bunch of data in parallel to the calculations, etc.?
Since a DFT is basically a sum over the time values multiplied with a complex exponential function, I think that it should be possible to calculate the FFT "blockwise".
Does cufft support this? Is it in general a good computational idea?
EDIT
To be more clear, I do not want to calculate different FFTs in parallel on different arrays. Let's say I have a big trace of a sinusoidal signal in the time domain and I want to know which frequencies are in the signal. My idea is to copy, for example, one third of the signal length to the GPU, then the next third, and calculate the FFT on the first third of the already copied input values in parallel. Then copy the last third and update the output values until all the time values are processed. So in the end there should be one output array with a peak at the frequency of the sine.
Please take into account the comments above and, in particular, that:
If you calculate the FFT over Npartial elements, you will have an output of Npartial elements;
(following Robert Crovella) All the data required for the cuFFT must be resident on the device before the cuFFT call is launched, so you will not be able to break the data into pieces for a single cuFFT operation and begin that operation before all the pieces are on the GPU; furthermore, a cuFFT call is opaque;
Taking into account the above two points, I think you can only "emulate" what you would like to achieve if you properly use zero padding in the way illustrated by the code below. As you will see, letting N be the data size, by dividing the data into NUM_STREAMS chunks, the code performs NUM_STREAMS zero-padded and streamed cuFFT calls of size N. After the cuFFT, you have to combine (sum) the partial results.
#include <stdio.h>
#include <cufft.h>
#define BLOCKSIZE 32
#define NUM_STREAMS 3
/**********/
/* iDivUp */
/*********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/******************/
/* SUMMING KERNEL */
/******************/
__global__ void kernel(float2 *vec1, float2 *vec2, float2 *vec3, float2 *out, int N) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
out[tid].x = vec1[tid].x + vec2[tid].x + vec3[tid].x;
out[tid].y = vec1[tid].y + vec2[tid].y + vec3[tid].y;
}
}
/********/
/* MAIN */
/********/
int main()
{
const int N = 600000;
const int Npartial = N / NUM_STREAMS;
// --- Host input data initialization
float2 *h_in1 = new float2[Npartial];
float2 *h_in2 = new float2[Npartial];
float2 *h_in3 = new float2[Npartial];
for (int i = 0; i < Npartial; i++) {
h_in1[i].x = 1.f;
h_in1[i].y = 0.f;
h_in2[i].x = 1.f;
h_in2[i].y = 0.f;
h_in3[i].x = 1.f;
h_in3[i].y = 0.f;
}
// --- Host output data initialization
float2 *h_out = new float2[N];
// --- Registers host memory as page-locked (required for asynch cudaMemcpyAsync)
gpuErrchk(cudaHostRegister(h_in1, Npartial*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in2, Npartial*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in3, Npartial*sizeof(float2), cudaHostRegisterPortable));
// --- Device input data allocation
float2 *d_in1; gpuErrchk(cudaMalloc((void**)&d_in1, N*sizeof(float2)));
float2 *d_in2; gpuErrchk(cudaMalloc((void**)&d_in2, N*sizeof(float2)));
float2 *d_in3; gpuErrchk(cudaMalloc((void**)&d_in3, N*sizeof(float2)));
float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
float2 *d_out; gpuErrchk(cudaMalloc((void**)&d_out, N*sizeof(float2)));
// --- Zero padding
gpuErrchk(cudaMemset(d_in1, 0, N*sizeof(float2)));
gpuErrchk(cudaMemset(d_in2, 0, N*sizeof(float2)));
gpuErrchk(cudaMemset(d_in3, 0, N*sizeof(float2)));
// --- Creates CUDA streams
cudaStream_t streams[NUM_STREAMS];
for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i]));
// --- Creates cuFFT plans and sets them in streams
cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
for (int i = 0; i < NUM_STREAMS; i++) {
cufftPlan1d(&plans[i], N, CUFFT_C2C, 1);
cufftSetStream(plans[i], streams[i]);
}
// --- Async memcopies and computations
gpuErrchk(cudaMemcpyAsync(d_in1, h_in1, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[0]));
gpuErrchk(cudaMemcpyAsync(&d_in2[Npartial], h_in2, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[1]));
gpuErrchk(cudaMemcpyAsync(&d_in3[2*Npartial], h_in3, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[2]));
cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD);
cufftExecC2C(plans[1], (cufftComplex*)d_in2, (cufftComplex*)d_out2, CUFFT_FORWARD);
cufftExecC2C(plans[2], (cufftComplex*)d_in3, (cufftComplex*)d_out3, CUFFT_FORWARD);
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamSynchronize(streams[i]));
kernel<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_out1, d_out2, d_out3, d_out, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_out, d_out, N*sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("i = %i; real(h_out) = %f; imag(h_out) = %f\n", i, h_out[i].x, h_out[i].y);
// --- Releases resources
gpuErrchk(cudaHostUnregister(h_in1));
gpuErrchk(cudaHostUnregister(h_in2));
gpuErrchk(cudaHostUnregister(h_in3));
gpuErrchk(cudaFree(d_in1));
gpuErrchk(cudaFree(d_in2));
gpuErrchk(cudaFree(d_in3));
gpuErrchk(cudaFree(d_out1));
gpuErrchk(cudaFree(d_out2));
gpuErrchk(cudaFree(d_out3));
gpuErrchk(cudaFree(d_out));
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i]));
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
cudaDeviceReset();
return 0;
}
This is the timeline of the above code when run on a Kepler K20c card. As you can see, the computation overlaps the async memory transfers.

does cuda code skip execution of unnecessary clauses in logical expressions?

In ordinary C++, if I say the following, it is safe because the third clause will not be evaluated. I'm just wondering whether CUDA kernel code also has this property, or whether it doesn't, in the interest of maximizing parallelism?
int x[100] = {...}, i = -1;
if (i < 0 || i >= 100 || x[i] == 0) {
// do something.
}
EDIT:
Adapting Jack's program, the following program runs OK and outputs "10". There is no error when running under cuda-memcheck.
#include <stdio.h>
__global__ void test(float *input, float *output, int i, int N) {
float c = 10;
// NOTE: uncomment this will cause cuda-memcheck to give an error.
// c = input[-1];
if (i < 0 || i >= N || (c = input[-1])) {
output[0] = c;
}
}
int main(void) {
int i = -1;
int N = 10;
float* input;
float* output;
float* dev_input;
float* dev_output;
input = (float*)malloc(sizeof(float) * N);
output = (float*)malloc(sizeof(float));
for (int j = 0; j < N; j++) {
input[j] = 2.0f;
}
output[0] = 3.0f;
cudaMalloc((void**)&dev_input,sizeof(float) * N);
cudaMalloc((void**)&dev_output,sizeof(float));
cudaMemcpy(dev_input,input,sizeof(float) * N,cudaMemcpyHostToDevice);
cudaMemcpy(dev_output,output,sizeof(float),cudaMemcpyHostToDevice);
test<<<1,1>>>(dev_input,dev_output,i,N);
cudaMemcpy(output,dev_output,sizeof(float),cudaMemcpyDeviceToHost);
printf("%f\n", output[0]);
return 0;
}
The CUDA C/C++ compiler should obey the language requirements in this respect.
Specifically, the language requirements as far as order-of-operation and short-circuiting should be maintained for non-overloaded || and && operators.
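As an illustrative aside on the "non-overloaded" qualifier (Flag is a hypothetical type, not from the question): when operator|| is overloaded for a user-defined type, the expression becomes an ordinary function call, so both operands are always evaluated and no short-circuiting occurs.
struct Flag { bool v; };
__host__ __device__ bool operator||(Flag a, Flag b) { return a.v || b.v; }
__device__ bool check(Flag a, Flag b)
{
    // Overloaded operator||: both a and b are evaluated before the call is made.
    return a || b;
}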
Try the simple code below in which the kernel function tries to access input[-1]. You will realize it will get stuck.
#include <stdio.h>
__global__ void test(float *input, float *output, int i, int N) {
if (i < N || input[i] == 0) {
output[i] = input[i];
}
}
void main(void) {
int i = -1;
int N = 10;
float* input;
float* dev_input;
float* dev_output;
input = (float*)malloc(sizeof(float));
input[0] = 2.f;
cudaMalloc((void**)&dev_input,sizeof(float));
cudaMalloc((void**)&dev_output,sizeof(float));
cudaMemcpy(dev_input,input,sizeof(float),cudaMemcpyHostToDevice);
test<<<1,1>>>(dev_input,dev_output,i,N);
}
The reason can be explained by having a look at the disassembled code.
MOV R1, c[0x1][0x100]; R1 = c[0x1][0x100]
NOP;
MOV R3, c[0x0][0x28]; R3 = c[0x0][0x28]
SHL R2, R3, 0x2; R2 = shiftleft(R3)
IADD R0, R2, c[0x0][0x20]; R0 = R2 + c[0x0][0x20]
LDU R0, [R0]; Load the memory addressed by R0 to R0
FSETP.EQ.AND P0, PT, R0, RZ, PT; Predicate register P0 will contain result of test R0 == 0
ISETP.LT.OR P0, PT, R3, c[0x0][0x2c], P0; Predicate register P0 will contain result of test P0 || (R3 < c[0x0][0x2c])
#P0 IADD R2, R2, c[0x0][0x24]; ...
#P0 ST [R2], R0;
EXIT ;
As you can see, the device will attempt to load the data from global memory regardless of the result of the first clause.
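In C-like pseudocode, the sequence above behaves roughly as follows (my paraphrase of the SASS, assuming the branch has been if-converted into predication):
float v = input[i];                // LDU: the load is issued unconditionally
bool p = (i < N) || (v == 0.0f);   // FSETP / ISETP: the predicate is computed afterwards
if (p) output[i] = v;              // #P0 ST: only the store is predicated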

CUDA Matrix Multiplication Locks Up and Display Matrix of Zeros

I'm attempting to write a simple matrix multiplication program that continually adds the product of two matrices to a third result matrix (I'm essentially giving the GPU a workout while I measure power consumption with a separate apparatus).
My problem occurs when I specify a large number of iterations. I've tried this with several combinations of BLOCK_SIZE and matrix dimension values, and I've noted that the number of iterations can be increased with smaller matrix dimensions, but the BLOCK_SIZE must be the square root of the matrix dimensions (square matrices).
The resulting error in this case is a 39-second freeze (regardless of the iteration value, as long as it is 'too much') followed by an all-zero matrix output. Interestingly, I ran this once with an iteration count of 20000 and it worked fine. I ran it again and got the freeze error.
Any ideas? Thanks in advance!
Kernel:
//********************************************************************
// matrixMultiplication_kernel.cu
//
// Kernel for a basic CUDA matrix multiplication program.
//********************************************************************
#ifndef MATRIXMULTIPLICATION_KERNEL
#define MATRIXMULTIPLICATION_KERNEL
#define BLOCK_SIZE 16 // Set thread block size
#define colsA 256 // Set matrix A column dimension
#define rowsA 256 // Set matrix A row dimension
#define colsB 256 // Set matrix B column dimension
#define rowsB colsA // Set matrix B row dimension
#define colsC colsB // Set matrix C column dimension
#define rowsC rowsA // Set matrix C row dimension
//--------------------------------------------------------------------
// matrixMultiplication() - Multiplies matrixA and matrixB, storing
// the result in device memory for matrixC.
//
// PRE: matrixA, matrixB, and matrixC are float pointers; numColsA
// numColsB are integers.
// POST: The result of multiplying matrixA and matrixB is stored in
// matrixC.
//--------------------------------------------------------------------
__global__ void matrixMultiplication(float * matrixA, float * matrixB,
float * matrixC, int numColsA,
int numColsB) {
/* Declare matrix-multiplication holder value outside of for loop */
float val;
/* Set block and thread index positions */
int blockX = blockIdx.x;
int blockY = blockIdx.y;
int threadX = threadIdx.x;
int threadY = threadIdx.y;
/*
Set starting and ending indices of the first sub-matrix of A
and sub-matrix size for matrix A
*/
int startA = numColsA * BLOCK_SIZE * blockY;
int endA = startA + numColsA - 1;
int subSizeA = BLOCK_SIZE;
/*
Set starting index of the first sub-matrix of B and sub-matrix
size for matrix B
*/
int startB = BLOCK_SIZE * blockX;
int subSizeB = BLOCK_SIZE * colsB;
/* Perform matrix multiplication 20000 times */
for (int iteration = 0; iteration < 20000; iteration++) {
/* Loop through matrix A and matrix B's sub-matrices */
for (int i = startA, j = startB; i <= endA; i += subSizeA,
j += subSizeB) {
/*
Declare shared memory arrays for matrix A and B
sub-matrices
*/
__shared__ float subA[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float subB[BLOCK_SIZE][BLOCK_SIZE];
/* Fill sub-matrices */
subA[threadY][threadX] =
matrixA[i + colsA * threadY + threadX];
subB[threadY][threadX] =
matrixB[j + colsB * threadY + threadX];
/* Ensure that the matrices are loaded */
__syncthreads();
/* Loop through the block */
for (int k = 0; k < BLOCK_SIZE; ++k) {
/* Compute product of two matrix indices */
val += subA[threadY][k] * subB[k][threadX];
}
/*
Ensure completion before the next set of sub-matrices
begin computation
*/
__syncthreads();
}
/* Set device memory for this sub-matrix */
int position = colsB * BLOCK_SIZE * blockY + BLOCK_SIZE * blockX;
matrixC[position + colsB * threadY + threadX] = val;
}
}
#endif
Host:
//********************************************************************
// matrixMultiplication.cu
//
// A basic CUDA matrix multiplication program.
//********************************************************************
/* Include necessary libraries and kernel */
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <matrixMultiplication_kernel.cu>
/* Function declarations */
void fillMatrix(float * matrix, int numIndices);
//*************
// Main Program
//*************
int main(int argc, char** argv) {
/* Declare device memory */
float * deviceA;
float * deviceB;
float * deviceC;
srand(2013); // Set random seed
/* Determine total number of indices in each matrix */
unsigned int numIndicesA = colsA * rowsA;
unsigned int numIndicesB = colsB * rowsB;
unsigned int numIndicesC = colsC * rowsC;
/* Determine memory size of each matrix */
unsigned int memoryA = sizeof(float) * numIndicesA;
unsigned int memoryB = sizeof(float) * numIndicesB;
unsigned int memoryC = sizeof(float) * numIndicesC;
/* Allocate memory for each matrix */
float * matrixA = (float *) malloc(memoryA);
float * matrixB = (float *) malloc(memoryB);
float * matrixC = (float *) malloc(memoryC);
/* Set contents of matrices A and B (matrix C is all zeros) */
fillMatrix(matrixA, numIndicesA);
fillMatrix(matrixB, numIndicesB);
/* Allocate device memory for each matrix */
cudaMalloc((void **) &deviceA, memoryA);
cudaMalloc((void **) &deviceB, memoryB);
cudaMalloc((void **) &deviceC, memoryC);
/* Copy host memory to device memory for matrices A and B */
cudaMemcpy(deviceA, matrixA, memoryA, cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, matrixB, memoryB, cudaMemcpyHostToDevice);
/* Set thread count to BLOCK_SIZE x BLOCK_SIZE */
dim3 tCount(BLOCK_SIZE, BLOCK_SIZE);
/* Set thread block count */
dim3 tbCount((colsC / tCount.x), (rowsC / tCount.y));
/* Run kernel */
matrixMultiplication <<< tbCount, tCount >>> (deviceA, deviceB,
deviceC, colsA,
colsB);
/* Copy device memory to host memory for matrix C */
cudaMemcpy(matrixC, deviceC, memoryC, cudaMemcpyDeviceToHost);
for(int i = 0; i < 256; i++) {
printf("%f ", matrixC[i]);
}
printf("\n");
/* Free up host and device memory for each matrix */
free(matrixA);
free(matrixB);
free(matrixC);
cudaFree(deviceA);
cudaFree(deviceB);
cudaFree(deviceC);
}
//--------------------------------------------------------------------
// fillMatrix - Assigns a random float value to each indice of the
// matrix.
//
// PRE: matrix is a pointer to a block of bytes in memory; numIndices
// is the number of indicies in the matrix being instantiated.
// POST: Each index of the matrix has been filled with random float
// values.
//--------------------------------------------------------------------
void fillMatrix(float * matrix, int numIndices) {
/* Loop through each index of the matrix */
for (int i = 0; i < numIndices; ++i) {
/*
Assign a random float between 0 and 1 for this index of
the matrix
*/
matrix[i] = rand() / (float)RAND_MAX;
}
}
Makefile:
GCC = nvcc
CUDA_INSTALL_PATH := /usr/local/cuda
INCLUDES := -I. -I$(CUDA_INSTALL_PATH)/include
CUDA_LIBS := -L$(CUDA_INSTALL_PATH)/lib -lcudart
matrixMultiplication.o: matrixMultiplication.cu
$(GCC) $(INCLUDES) -c matrixMultiplication.cu -o $@
matrixMultiplication: matrixMultiplication.o
$(GCC) -o $@ matrixMultiplication.o $(CUDA_LIBS)
clean:
$(RM) *.o *~
Problem solved! It was a system timeout issue due to the long duration of the kernel. By switching to terminal-only mode, I was able to circumvent the issue.
Thanks for all the help guys!
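As a general diagnostic note (a sketch added for illustration, not from the original thread): checking the runtime error state around the launch would report the watchdog kill explicitly instead of silently printing zeros, assuming the timeout surfaces as a launch/synchronization error.
/* Illustrative only: error checking around the original launch */
matrixMultiplication <<< tbCount, tCount >>> (deviceA, deviceB, deviceC, colsA, colsB);
cudaError_t launchErr = cudaGetLastError();      // configuration/launch errors
cudaError_t syncErr   = cudaDeviceSynchronize(); // asynchronous errors, e.g. a watchdog timeout
if (launchErr != cudaSuccess || syncErr != cudaSuccess) {
    printf("kernel error: %s / %s\n",
           cudaGetErrorString(launchErr), cudaGetErrorString(syncErr));
}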

CUDA Makefile nvcc Error

I'm attempting to compile a basic CUDA matrix multiplication program, but I'm running into this error:
nvcc -I. -I/usr/local/cuda/include -c matrixMult1.cu -o matrixMult1.o
make: nvcc: Command not found
make: *** [matrixMult1.o] Error 127
I was getting another error originally and it was recommended that I use nvcc, the only catch being that I know absolutely nothing about nvcc. Anyone have an idea? Thanks in advance!
Makefile:
GCC = nvcc
CUDA_INSTALL_PATH := /usr/local/cuda
INCLUDES := -I. -I$(CUDA_INSTALL_PATH)/include
CUDA_LIBS := -L$(CUDA_INSTALL_PATH)/lib -lcudart
matrixMult1.o: matrixMult1.cu
$(GCC) $(INCLUDES) -c matrixMult1.cu -o $@
matrixMult1: matrixMult1.o
$(GCC) -o $@ matrixMult1.o $(CUDA_LIBS)
clean:
$(RM) *.o *~
Kernel:
//********************************************************************
// matrixMul_kernel.cu
//
// Kernel for a basic matrix multiplication program.
//********************************************************************
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#include <stdio.h>
/* Thread block size */
#define BLOCK_SIZE 3
#define WA 3
#define HA 3
#define WB 3
#define HB WA
#define WC WB
#define HC HA
/* CUDA Kernel */
__global__ void matrixMul (float * C, float * A, float * B, int wA,
int wB) {
/* Two dimensional thread ID */
int tx = threadIdx.x;
int ty = threadIdx.y;
/* Computation holder variable */
float value = 0;
/* Loop through row of A and column of B to compute cell of C */
for (int i = 0; i < wA; ++i) {
float elementA = A[ty * wA + i];
float elementB = B[i * wB + tx];
value += elementA * elementB;
}
/* Write the result to C */
C[ty * wA + tx] = value;
}
#endif
Main Program:
//********************************************************************
// matrixMult1.c
//
// A basic matrix multiplication program.
//********************************************************************
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <matrixMul_kernel.cu>
#define WA 3
#define HA 3
#define WB 3
#define HB WA
#define WC WB
#define HC HA
void initMatrix(float * matrix, int numIndices);
//*************
// Main Program
//*************
int main(int argc, char** argv) {
/* Set random seed */
srand(2013);
/* Compute memory sizes for matrices A, B, and C */
unsigned int sizeA = WA * HA;
unsigned int sizeB = WB * HB;
unsigned int sizeC = WC * HC;
unsigned int memoryA = sizeof(float) * sizeA;
unsigned int memoryB = sizeof(float) * sizeB;
unsigned int memoryC = sizeof(float) * sizeC;
/* Allocate memory for matrices A, B, and C */
float * matrixA = (float *) malloc(memoryA);
float * matrixB = (float *) malloc(memoryB);
float * matrixC = (float *) malloc(memoryC);
/* Initialize matrices A and B */
initMatrix(matrixA, sizeA);
initMatrix(matrixB, sizeB);
/* Print matrix A */
printf("\nMatrix A:\n");
for (int i = 0; i < sizeA; i++) {
printf("%f ", matrixA[i]);
if (((i + 1) % WA) == 0) {
printf("\n");
} else {
printf(" | ");
}
}
/* Print matrix B */
printf("\nMatrix B:\n");
for (int i = 0; i < sizeB; i++) {
printf("%f ", matrixB[i]);
if (((i + 1) % WA) == 0) {
printf("\n");
} else {
printf(" | ");
}
}
/* Allocate device memory */
float* deviceMemA;
float* deviceMemB;
float* deviceMemC;
cudaMalloc((void**) &deviceMemA, memoryA);
cudaMalloc((void**) &deviceMemB, memoryB);
cudaMalloc((void**) &deviceMemC, memoryC);
/* Copy host memory to device */
cudaMemcpy(deviceMemA, matrixA, memoryA,
cudaMemcpyHostToDevice);
cudaMemcpy(deviceMemB, matrixB, memoryB,
cudaMemcpyHostToDevice);
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(WC / threads.x, HC / threads.y);
/* Execute kernel */
matrixMul<<< grid, threads >>>(deviceMemC, deviceMemA,
deviceMemB, WA, WB);
cudaMemcpy(deviceMemC, matrixC, memoryC,
cudaMemcpyHostToDevice);
/* Print matrix C */
printf("\nMatrix C:\n");
for (int i = 0; i < sizeC; i++) {
printf("%f ", matrixC[i]);
if (((i + 1) % WC) == 0) {
printf("\n");
} else {
printf(" | ");
}
}
printf("\n");
/* Free up memory */
free(matrixA);
free(matrixB);
free(matrixC);
cudaFree(deviceMemA);
cudaFree(deviceMemB);
cudaFree(deviceMemC);
}
//--------------------------------------------------------------------
// initMatrix - Assigns a random float value to each indice of the
// matrix.
//
// PRE: matrix is a pointer to a block of bytes in memory; numIndices
// is the number of indicies in the matrix being instantiated.
// POST: Each index of the matrix has been instantiated with a random
// float value.
//--------------------------------------------------------------------
void initMatrix(float * matrix, int numIndices) {
/*
Loop through the block of bytes, assigning a random float
for each index of the matrix
*/
for (int i = 0; i < numIndices; ++i) {
/* Assign a random float between 0 and 1 at this byte */
matrix[i] = rand() / (float)RAND_MAX;
}
}
This error:
nvcc: Command not found
indicates that nvcc is not in your shell's PATH.
To fix it, assuming it's bash or similar:
PATH=$PATH:/usr/local/cuda/bin
make
...or add it to the system or your user's profile.

One dimensional fftshift in CUDA

I'm setting up a one dimensional fftshift in CUDA. My code is the following
__global__ void fftshift(double2 *u_d, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
double2 temp;
if(i< N/2)
{
temp.x = u_d[i].x;
temp.y = u_d[i].y;
u_d[i].x =u_d[i+N/2].x;
u_d[i].y =u_d[i+N/2].y;
u_d[i+N/2].x = temp.x;
u_d[i+N/2].y = temp.y;
}
}
Is there any way, smarter than that shown above, to perform the fftshift in CUDA?
Thanks in advance.
A PERHAPS BETTER SOLUTION
I found that perhaps the following solution could be a good alternative
__global__ void fftshift(double2 *u_d, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < N)
{
double a = pow(-1.0,i&1);
u_d[i].x *= a;
u_d[i].y *= a;
}
}
It consists of multiplying the vector to be transformed by a sequence of 1s and -1s, which is equivalent to multiplication by exp(-j*n*pi) and thus to a shift in the conjugate domain.
You have to call this kernel before and after the application of the CUFFT.
One pro is that memory movements/swapping are avoided and the idea can be immediately extended to the 2D case, see CUDA Device To Device transfer expensive.
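For reference, the identity being exploited (standard DFT algebra for an even length N, not CUDA-specific) is
X[k + N/2] = sum_{n=0..N-1} x[n] * exp(-j*2*pi*n*(k + N/2)/N)
           = sum_{n=0..N-1} x[n] * exp(-j*pi*n) * exp(-j*2*pi*n*k/N)
           = sum_{n=0..N-1} ((-1)^n * x[n]) * exp(-j*2*pi*n*k/N)
so multiplying the input by (-1)^n before the transform circularly shifts the output by N/2, which is exactly what fftshift does for an even-length vector.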
CONCERNING SYMMETRIC DATA
This solution seems not to be limited to symmetric data. Try for example the following Matlab code, applying the idea to a completely complex random matrix (Gaussian amplitude and uniform phase).
N1=512;
N2=256;
Phase=(rand(N1,N2)-0.5)*2*pi;
Magnitude=randn(N1,N2);
Im=Magnitude.*exp(j*Phase);
Transform=fftshift(fft2(ifftshift(Im)));
n1=0:(N1-1);
n2=0:(N2-1);
[N2,N1]=meshgrid(n2,n1);
Im2=Im.*(-1).^(N1+N2);
Im3=fft2(Im2);
Im4=Im3.*(-1).^(N1+N2);
100*sqrt(sum(abs(Im4-Transform).^2)/sum(abs(Transform).^2))
The returned normalized root mean square error will be 0, confirming that Transform=Im4.
IMPROVEMENT TO THE SPEED
Following the suggestion received on the NVIDIA forum, improved speed can be achieved by changing the instruction
double a = pow(-1.0,i&1);
to
double a = 1-2*(i&1);
to avoid the use of the slow routine pow.
After much time and the introduction of the callback functionality of cuFFT, I can provide a meaningful answer to my own question.
Above I was proposing a "perhaps better solution". After some testing, I have realized that, without using the callback cuFFT functionality, that solution is slower because it uses pow. Then, I have explored two alternatives to the use of pow, something like
float a = (float)(1-2*((int)offset%2));
float2 out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;
and
float2 out = ((float2*)d_in)[offset];
if ((int)offset&1) {
out.x = -out.x;
out.y = -out.y;
}
But, with standard cuFFT, all the above solutions require two separate kernel calls, one for the fftshift and one for the cuFFT execution call. However, with the new cuFFT callback functionality, the above alternative solutions can be embedded in the code as __device__ functions.
So, finally I ended up with the below comparison code
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <assert.h>
#include <cufft.h>
#include <cufftXt.h>
//#define DEBUG
#define BLOCKSIZE 256
/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// See http://stackoverflow.com/questions/16267149/cufft-error-handling
#ifdef _CUFFT_H_
// cuFFT API errors
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#endif
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
/****************************************/
/* FFTSHIFT 1D INPLACE MEMORY MOVEMENTS */
/****************************************/
__global__ void fftshift_1D_inplace_memory_movements(float2 *d_inout, unsigned int N)
{
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N/2)
{
float2 temp = d_inout[tid];
d_inout[tid] = d_inout[tid + (N / 2)];
d_inout[tid + (N / 2)] = temp;
}
}
/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 1 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v1(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {
float a = (float)(1-2*((int)offset%2));
float2 out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;
return out;
}
__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v1_Ptr = fftshift_1D_chessboard_callback_v1;
/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 2 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v2(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {
float a = pow(-1.,(double)(offset&1));
float2 out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;
return out;
}
__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v2_Ptr = fftshift_1D_chessboard_callback_v2;
/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 3 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v3(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {
float2 out = ((float2*)d_in)[offset];
if ((int)offset&1) {
out.x = -out.x;
out.y = -out.y;
}
return out;
}
__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v3_Ptr = fftshift_1D_chessboard_callback_v3;
/********/
/* MAIN */
/********/
int main()
{
const int N = 131072;
printf("N = %d\n", N);
// --- Host side input array
float2 *h_vect = (float2 *)malloc(N*sizeof(float2));
for (int i=0; i<N; i++) {
h_vect[i].x = (float)rand() / (float)RAND_MAX;
h_vect[i].y = (float)rand() / (float)RAND_MAX;
}
// --- Host side output arrays
float2 *h_out1 = (float2 *)malloc(N*sizeof(float2));
float2 *h_out2 = (float2 *)malloc(N*sizeof(float2));
float2 *h_out3 = (float2 *)malloc(N*sizeof(float2));
float2 *h_out4 = (float2 *)malloc(N*sizeof(float2));
// --- Device side input arrays
float2 *d_vect1; gpuErrchk(cudaMalloc((void**)&d_vect1, N*sizeof(float2)));
float2 *d_vect2; gpuErrchk(cudaMalloc((void**)&d_vect2, N*sizeof(float2)));
float2 *d_vect3; gpuErrchk(cudaMalloc((void**)&d_vect3, N*sizeof(float2)));
float2 *d_vect4; gpuErrchk(cudaMalloc((void**)&d_vect4, N*sizeof(float2)));
gpuErrchk(cudaMemcpy(d_vect1, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_vect2, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_vect3, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_vect4, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
// --- Device side output arrays
float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
float2 *d_out4; gpuErrchk(cudaMalloc((void**)&d_out4, N*sizeof(float2)));
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
/*******************************************/
/* cuFFT + MEMORY MOVEMENTS BASED FFTSHIFT */
/*******************************************/
cufftHandle planinverse; cufftSafeCall(cufftPlan1d(&planinverse, N, CUFFT_C2C, 1));
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse, d_vect1, d_vect1, CUFFT_INVERSE));
fftshift_1D_inplace_memory_movements<<<iDivUp(N/2, BLOCKSIZE), BLOCKSIZE>>>(d_vect1, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Memory movements elapsed time: %3.3f ms \n", time);
gpuErrchk(cudaMemcpy(h_out1, d_vect1, N*sizeof(float2), cudaMemcpyDeviceToHost));
/****************************************/
/* CHESSBOARD MULTIPLICATION V1 + cuFFT */
/****************************************/
cufftCallbackLoadC hfftshift_1D_chessboard_callback_v1_Ptr;
gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v1_Ptr, fftshift_1D_chessboard_callback_v1_Ptr, sizeof(hfftshift_1D_chessboard_callback_v1_Ptr)));
cufftHandle planinverse_v1; cufftSafeCall(cufftPlan1d(&planinverse_v1, N, CUFFT_C2C, 1));
cufftResult status = cufftXtSetCallback(planinverse_v1, (void **)&hfftshift_1D_chessboard_callback_v1_Ptr, CUFFT_CB_LD_COMPLEX, 0);
if (status == CUFFT_LICENSE_ERROR) {
printf("This sample requires a valid license file.\n");
printf("The file was either not found, out of date, or otherwise invalid.\n");
exit(EXIT_FAILURE);
} else {
cufftSafeCall(status);
}
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse_v1, d_vect2, d_out2, CUFFT_INVERSE));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Chessboard v1 elapsed time: %3.3f ms \n", time);
gpuErrchk(cudaMemcpy(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++) if ((h_out1[i].x != h_out2[i].x)||(h_out1[i].y != h_out2[i].y)) { printf("Chessboard v1 test failed!\n"); return 0; }
printf("Chessboard v1 test passed!\n");
/****************************************/
/* CHESSBOARD MULTIPLICATION V2 + cuFFT */
/****************************************/
cufftCallbackLoadC hfftshift_1D_chessboard_callback_v2_Ptr;
gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v2_Ptr, fftshift_1D_chessboard_callback_v2_Ptr, sizeof(hfftshift_1D_chessboard_callback_v2_Ptr)));
cufftHandle planinverse_v2; cufftSafeCall(cufftPlan1d(&planinverse_v2, N, CUFFT_C2C, 1));
status = cufftXtSetCallback(planinverse_v2, (void **)&hfftshift_1D_chessboard_callback_v2_Ptr, CUFFT_CB_LD_COMPLEX, 0);
if (status == CUFFT_LICENSE_ERROR) {
printf("This sample requires a valid license file.\n");
printf("The file was either not found, out of date, or otherwise invalid.\n");
exit(EXIT_FAILURE);
} else {
cufftSafeCall(status);
}
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse_v2, d_vect3, d_out3, CUFFT_INVERSE));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Chessboard v2 elapsed time: %3.3f ms \n", time);
gpuErrchk(cudaMemcpy(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++) if ((h_out1[i].x != h_out3[i].x)||(h_out1[i].y != h_out3[i].y)) { printf("Chessboard v2 test failed!\n"); return 0; }
printf("Chessboard v2 test passed!\n");
/****************************************/
/* CHESSBOARD MULTIPLICATION V3 + cuFFT */
/****************************************/
cufftCallbackLoadC hfftshift_1D_chessboard_callback_v3_Ptr;
gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v3_Ptr, fftshift_1D_chessboard_callback_v3_Ptr, sizeof(hfftshift_1D_chessboard_callback_v3_Ptr)));
cufftHandle planinverse_v3; cufftSafeCall(cufftPlan1d(&planinverse_v3, N, CUFFT_C2C, 1));
status = cufftXtSetCallback(planinverse_v3, (void **)&hfftshift_1D_chessboard_callback_v3_Ptr, CUFFT_CB_LD_COMPLEX, 0);
if (status == CUFFT_LICENSE_ERROR) {
printf("This sample requires a valid license file.\n");
printf("The file was either not found, out of date, or otherwise invalid.\n");
exit(EXIT_FAILURE);
} else {
cufftSafeCall(status);
}
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse_v3, d_vect4, d_out4, CUFFT_INVERSE));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Chessboard v3 elapsed time: %3.3f ms \n", time);
gpuErrchk(cudaMemcpy(h_out4, d_out4, N*sizeof(float2), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++) if ((h_out1[i].x != h_out4[i].x)||(h_out1[i].y != h_out4[i].y)) { printf("Chessboard v3 test failed!\n"); return 0; }
printf("Chessboard v3 test passed!\n");
return 0;
}
RESULTS ON A GTX 480 (times in ms)
N Mem mov v1 v2 v3
131072 0.552 0.136 0.354 0.183
262144 0.536 0.175 0.451 0.237
524288 0.661 0.283 0.822 0.290
1048576 0.784 0.565 1.548 0.548
2097152 1.298 0.952 2.973 0.944
RESULTS ON A TESLA C2050 (times in ms)
N Mem mov v1 v2 v3
131072 0.278 0.130 0.236 0.132
262144 0.344 0.202 0.374 0.206
524288 0.544 0.378 0.696 0.387
1048576 0.909 0.695 1.294 0.695
2097152 1.656 1.349 2.531 1.349
RESULTS ON A KEPLER K20c (times in ms)
N Mem mov v1 v2 v3
131072 0.077 0.076 0.136 0.076
262144 0.142 0.128 0.202 0.127
524288 0.268 0.229 0.374 0.230
1048576 0.516 0.433 0.717 0.435
2097152 1.019 0.853 1.400 0.855
Some more details have recently appeared at The 1D fftshift in CUDA by chessboard multiplication and at the GitHub page.
If space is not a concern (and you are using fftshift for only one dimension), create u_d with size 1.5 x N, and write the first N/2 elements at the end. You can then move u_d to u_d + N/2.
Here is how you could do it.
double2 *u_d, *u_d_begin;
size_t bytes = N * sizeof(double2);
// This is different from bytes / 2 when N is odd
size_t half_bytes = (N / 2) * sizeof(double2);
CUDA_CHK(cudaMalloc( &u_d, bytes + half_bytes ));
u_d_begin = u_d;
...
// Do some processing and populate u_d;
...
// Copy first half to the end
CUDA_CHK(cudaMemcpy(u_d + N, u_d, half_bytes, cudaMemcpyDeviceToDevice));
u_d = u_d + N /2;