Does CUDA code skip execution of unnecessary clauses in logical expressions?

In ordinary C++, the following is safe because evaluation of the third clause is skipped. I'm wondering whether CUDA kernel code also has this property, or whether it doesn't, in the interest of maximizing parallelism.
int x[100] = {...}, i = -1;
if (i < 0 || i >= 100 || x[i] == 0) {
// do something.
}
EDIT:
Taking Jack's program as a starting point, the following program runs OK and outputs "10". cuda-memcheck reports no errors.
#include <stdio.h>
__global__ void test(float *input, float *output, int i, int N) {
float c = 10;
// NOTE: uncommenting this will cause cuda-memcheck to report an error.
// c = input[-1];
if (i < 0 || i >= N || (c = input[-1])) {
output[0] = c;
}
}
int main(void) {
int i = -1;
int N = 10;
float* input;
float* output;
float* dev_input;
float* dev_output;
input = (float*)malloc(sizeof(float) * N);
output = (float*)malloc(sizeof(float));
for (int j = 0; j < N; j++) {
input[j] = 2.0f;
}
output[0] = 3.0f;
cudaMalloc((void**)&dev_input,sizeof(float) * N);
cudaMalloc((void**)&dev_output,sizeof(float));
cudaMemcpy(dev_input,input,sizeof(float) * N,cudaMemcpyHostToDevice);
cudaMemcpy(dev_output,output,sizeof(float),cudaMemcpyHostToDevice);
test<<<1,1>>>(dev_input,dev_output,i,N);
cudaMemcpy(output,dev_output,sizeof(float),cudaMemcpyDeviceToHost);
printf("%f\n", output[0]);
return 0;
}

The CUDA C/C++ compiler should obey the language requirements in this respect.
Specifically, the order of evaluation and the short-circuiting behavior of the non-overloaded || and && operators are maintained.
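For instance, the usual bounds-check idiom in device code relies on exactly this guarantee. A minimal sketch (my own example, not taken from the question; the kernel name and arguments are made up):

__global__ void zero_check(const float *in, float *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Because && short-circuits, in[i] is only evaluated when i < n holds.
    if (i < n && in[i] == 0.0f)
        out[i] = 1.0f;
}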

Try the simple code below, in which the kernel function tries to access input[-1]. You will see that it gets stuck.
#include <stdio.h>
__global__ void test(float *input, float *output, int i, int N) {
if (i < N || input[i] == 0) {
output[i] = input[i];
}
}
int main(void) {
int i = -1;
int N = 10;
float* input;
float* dev_input;
float* dev_output;
input = (float*)malloc(sizeof(float));
input[0] = 2.f;
cudaMalloc((void**)&dev_input,sizeof(float));
cudaMalloc((void**)&dev_output,sizeof(float));
cudaMemcpy(dev_input,input,sizeof(float),cudaMemcpyHostToDevice);
test<<<1,1>>>(dev_input,dev_output,i,N);
}
The reason can be explained by having a look at the disassembled code.
MOV R1, c[0x1][0x100]; R1 = c[0x1][0x100]
NOP;
MOV R3, c[0x0][0x28]; R3 = c[0x0][0x28]
SHL R2, R3, 0x2; R2 = R3 << 2
IADD R0, R2, c[0x0][0x20]; R0 = R2 + c[0x0][0x20]
LDU R0, [R0]; Load the memory addressed by R0 to R0
FSETP.EQ.AND P0, PT, R0, RZ, PT; Predicate register P0 will contain result of test R0 == 0
ISETP.LT.OR P0, PT, R3, c[0x0][0x2c], P0; Predicate register P0 will contain result of test P0 || (R3 < c[0x0][0x2c])
#P0 IADD R2, R2, c[0x0][0x24]; ...
#P0 ST [R2], R0;
EXIT ;
As you can see, the device will attempt to load the data from global memory regardless of the result of the first clause.
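If you want to make it harder for the compiler to reach the potentially invalid load, one option (my own sketch, not part of the original answer) is to split the test into a nested if so that the load sits behind an explicit branch; whether the load is still issued speculatively should again be verified in the disassembly, as done above:

__global__ void test_guarded(float *input, float *output, int i, int N)
{
    if (i >= 0 && i < N)           // range check in its own branch
    {
        if (input[i] == 0.0f)      // the load is only reached for a valid i
            output[i] = input[i];
    }
}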

Measuring const memory access latency gives strange results

I have 3 kernels that sum two numbers. The first adds two numbers held in registers. The second takes one number from const memory and the other from a register. The third takes both numbers from const memory.
According to the article "Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking", the latency of an access to the L1 constant cache is ~24 cycles, and the latency of an IADD instruction is 6 cycles on Pascal. Therefore I expected to measure a duration greater than 6 for the first kernel, greater than 24 for the second, and greater than 24 for the third.
But when I measure the time with clock() I get 13, 12 and 214, respectively.
My video card is a GeForce GTX 1050 Mobile, with CUDA 10.1.
Compilation command: nvcc -arch=sm_61 main.cu -o main
Below I present code of program and fragments of SASS code.
Program code:
#include <iostream>
#define RES_SIZE 10
__global__ void measureReg(int *res){
int a = res[0], b = res[1], c;
__shared__ int shdata[1];
for(int i=0;i<150;++i) __syncthreads(); // Covers latencies of accesses to global memory
unsigned int t1,t2;
t1 = clock();
asm volatile("add.s32 %0, %1, %2;": "=r"(c) : "r"(a), "r"(b) : "memory");
shdata[0] = c; //Prevents execution of clock() before add has finished
t2 = clock();
res[0] = t2 - t1;
}
__global__ void measureConst1(int *res, int n1){
int a,b;
a = res[0];
__shared__ int shdata[1];
for(int i=0;i<150;++i) __syncthreads();
unsigned int t1,t2;
t1 = clock();
asm volatile("add.s32 %0, %1, %2;": "=r"(b) : "r"(a), "r"(n1) : "memory");
shdata[0] = b;
t2 = clock();
res[0] = t2 - t1;
}
__global__ void measureConst2(int *res, int n1, int n2){
int a;
__shared__ int shdata[1];
unsigned int t1,t2;
t1 = clock();
asm volatile("add.s32 %0, %1, %2;": "=r"(a) : "r"(n1), "r"(n2) : "memory");
shdata[0] = a;
t2 = clock();
res[0] = t2 - t1;
}
int main(int argc, char** argv){
int hostRes[RES_SIZE], *devRes;
std::fill(hostRes, hostRes + RES_SIZE, 1);
cudaMalloc(&devRes, RES_SIZE*sizeof(int));
cudaMemcpy(devRes, hostRes, RES_SIZE*sizeof(int), cudaMemcpyHostToDevice);
measureReg<<<1,1>>>(devRes);
cudaMemcpy(hostRes, devRes, RES_SIZE*sizeof(int), cudaMemcpyDeviceToHost);
std::cout<<"IADD with registers: "<<hostRes[0]<<std::endl;
measureConst1<<<1,1>>>(devRes, 10);
cudaMemcpy(hostRes, devRes, RES_SIZE*sizeof(int), cudaMemcpyDeviceToHost);
std::cout<<"IADD with register and const mem: "<<hostRes[0]<<std::endl;
measureConst2<<<1,1>>>(devRes, 10, 20);
cudaMemcpy(hostRes, devRes, RES_SIZE*sizeof(int), cudaMemcpyDeviceToHost);
std::cout<<"IADD with const mem: "<<hostRes[0]<<std::endl;
cudaFree(devRes);
return 0;
}
Fragments of SASS code:
/* measureReg */
CS2R R4, SR_CLOCKLO ;
IADD R0, R0, R5 ;
STS [RZ], R0 ;
CS2R R5, SR_CLOCKLO ;
/* measureConst1 */
CS2R R4, SR_CLOCKLO ;
IADD R0, R0, c[0x0][0x148] ;
STS [RZ], R0 ;
CS2R R5, SR_CLOCKLO ;
/* measureConst2 */
CS2R R2, SR_CLOCKLO ;
MOV R0, c[0x0][0x148] ;
IADD R0, R0, c[0x0][0x14c] ;
STS [RZ], R0 ;
CS2R R0, SR_CLOCKLO ;
This is broken:
int hostRes[RES_SIZE], *devRes;
std::fill(hostRes, hostRes + RES_SIZE, 1);
cudaMemcpy(devRes, hostRes, RES_SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc(&devRes, RES_SIZE*sizeof(int));
The compiler issues a warning to the effect that devRes is used before its value is set. You should not ignore these warnings. The correct sequence is:
int hostRes[RES_SIZE], *devRes;
std::fill(hostRes, hostRes + RES_SIZE, 1);
cudaMalloc(&devRes, RES_SIZE*sizeof(int));
cudaMemcpy(devRes, hostRes, RES_SIZE*sizeof(int), cudaMemcpyHostToDevice);
With that change, compiling your code for sm_61 using CUDA 10.2, and focusing on your measureConst2 function (although the loop unrolling behavior is the same for all), I observe SASS that looks like this:
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_61
Function : _Z13measureConst2Piii
.headerflags #"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/* 0x001fbc00fde007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ NOP ; /* 0x50b0000000070f00 */
/*0018*/ NOP ; /* 0x50b0000000070f00 */
/* 0x001fd400ffe007ed */
/*0028*/ NOP ; /* 0x50b0000000070f00 */
/*0030*/ BAR.SYNC 0x0 ; /* 0xf0a81b8000070000 */
/*0038*/ MEMBAR.CTA ; /* 0xef98000000070000 */
/* 0x001fb800fde007ef */
/*0048*/ NOP ; /* 0x50b0000000070f00 */
/*0050*/ NOP ; /* 0x50b0000000070f00 */
/*0058*/ NOP ; /* 0x50b0000000070f00 */
/* 0x001fbc00fea007ff */
/*0068*/ BAR.SYNC 0x0 ; /* 0xf0a81b8000070000 */
/*0070*/ MEMBAR.CTA ; /* 0xef98000000070000 */
The above sequence repeats 149 times due to compiler unrolling and optimization...
/*1f18*/ NOP ; /* 0x50b0000000070f00 */
/* 0x001ffc00fdc007ef */
/*1f28*/ NOP ; /* 0x50b0000000070f00 */
/*1f30*/ NOP ; /* 0x50b0000000070f00 */
/*1f38*/ BAR.SYNC 0x0 ; /* 0xf0a81b8000070000 */
/* 0x001fd800fcc007f5 */
/*1f48*/ MEMBAR.CTA ; /* 0xef98000000070000 */
/*1f50*/ CS2R R2, SR_CLOCKLO ; /* 0x50c8000005070002 */
/*1f58*/ MOV R0, c[0x0][0x148] ; /* 0x4c98078005270000 */
/* 0x003f98001e4007f4 */
/*1f68*/ IADD R0, R0, c[0x0][0x14c] ; /* 0x4c10000005370000 */
/*1f70*/ STS [RZ], R0 ; /* 0xef5c00000007ff00 */
/*1f78*/ CS2R R0, SR_CLOCKLO ; /* 0x50c8000005070000 */
/* 0x001fc800fe2007f1 */
/*1f88*/ IADD R0, -R2, R0 ; /* 0x5c12000000070200 */
/*1f90*/ MOV R2, c[0x0][0x140] ; /* 0x4c98078005070002 */
/*1f98*/ MOV R3, c[0x0][0x144] ; /* 0x4c98078005170003 */
/* 0x001ffc00fde007f1 */
/*1fa8*/ STG.E [R2], R0 ; /* 0xeedc200000070200 */
/*1fb0*/ NOP ; /* 0x50b0000000070f00 */
/*1fb8*/ EXIT ; /* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*1fc8*/ BRA 0x1fc0 ; /* 0xe2400fffff07000f */
/*1fd0*/ NOP; /* 0x50b0000000070f00 */
What we note is that the compiler has:
1. unrolled your loop of 150 iterations (not sure what you were hoping to accomplish with that), and
2. loaded n1 and n2 (your kernel arguments) only once.
These loads occur here:
/*1f58*/ MOV R0, c[0x0][0x148] ; /* 0x4c98078005270000 */
/* 0x003f98001e4007f4 */
/*1f68*/ IADD R0, R0, c[0x0][0x14c] ; /* 0x4c10000005370000
These are loading kernel parameters through the __constant__ memory system (this is expected behavior). The __constant__ memory system is not the same as the L1 cache or the "L1 constant cache". Even if we ignore that point, it only makes sense to talk about the latency associated with cache access if the item requested is already in the cache. In the above SASS code, there is no reason to assume this is true. You are accessing the items only once, and therefore you are witnessing the latency associated with populating the __constant__ cache with items from global memory (which is ultimately how all data moves from host to device -- even kernel arguments).
You might now ask "why don't I see that long latency with measureConst1?" In that case, your kernel design is a bit different, and we see that the load behavior is also a bit different. The SASS looks like this:
Function : _Z13measureConst1Pii
.headerflags #"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/* 0x001fc800fe2007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ MOV R2, c[0x0][0x140] ; /* 0x4c98078005070002 */
/*0018*/ MOV R3, c[0x0][0x144] ; /* 0x4c98078005170003 */
/* 0x001fbc00fde000b1 */
/*0028*/ LDG.E R0, [R2] ; /* 0xeed4200000070200 */
/*0030*/ NOP ; /* 0x50b0000000070f00 */
/*0038*/ NOP ; /* 0x50b0000000070f00 */
/* 0x001fd400ffe007e9 */
/*0048*/ NOP ; /* 0x50b0000000070f00 */
/*0050*/ BAR.SYNC 0x0 ; /* 0xf0a81b8000070000 */
/*0058*/ MEMBAR.CTA ; /* 0xef98000000070000 */
(repeating ...)
/*1f68*/ MEMBAR.CTA ; /* 0xef98000000070000 */
/*1f70*/ CS2R R4, SR_CLOCKLO ; /* 0x50c8000005070004 */
/*1f78*/ IADD R0, R0, c[0x0][0x148] ; /* 0x4c10000005270000 */
/* 0x003fc800fcc000f2 */
/*1f88*/ STS [RZ], R0 ; /* 0xef5c00000007ff00 */
/*1f90*/ CS2R R5, SR_CLOCKLO ; /* 0x50c8000005070005 */
/*1f98*/ IADD R0, -R4, R5 ; /* 0x5c12000000570400 */
/* 0x001fa000fde007f1 */
/*1fa8*/ STG.E [R2], R0 ; /* 0xeedc200000070200 */
/*1fb0*/ NOP ; /* 0x50b0000000070f00 */
/*1fb8*/ NOP ; /* 0x50b0000000070f00 */
/* 0x001f8000ffe007ff */
/*1fc8*/ EXIT ; /* 0xe30000000007000f */
/*1fd0*/ BRA 0x1fd0 ; /* 0xe2400fffff87000f */
/*1fd8*/ NOP; /* 0x50b0000000070f00 */
Here we see that at the very beginning of your kernel (due to your specific kernel design), the SASS is loading items from __constant__ that are immediately adjacent to the items you are loading in your timing region. Therefore it's reasonable to assume that, with some cache line load granularity, your timing region is now measuring something like cache latency rather than global load latency.
It seems I have found the answer to my question.
According to this question, there are an immediate constant (IMC) cache and an index constant (INC) cache. The INC cache handles accesses created by the LDC instruction, and the IMC cache handles the others.
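As an illustration of the difference (my own sketch, assuming a __constant__ array like the carr used in the benchmark below): an access with a compile-time-known index can be encoded as an immediate constant-bank operand and is served by the IMC path, while a data-dependent index forces an LDC instruction and goes through the INC path.

__constant__ int carr[10000];

__global__ void imc_vs_inc(int *out)
{
    int a = carr[3];            // index known at compile time -> immediate constant (IMC) access
    int idx = out[0] & 1023;    // value only known at run time (masked to stay in range)
    int b = carr[idx];          // data-dependent index -> LDC instruction, index constant (INC) access
    out[1] = a + b;
}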
I believe that the latency of the L1 constant cache reported in the article "Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking" is the average latency of these two caches, and that the latency of the IMC cache is no longer than the latency of a register file access.
To check this hypothesis I wrote a small benchmark containing 6 kernels. Each kernel sequentially reads an array of integers located in "const" memory and measures the latency of the reads.
There are differences between these kernels. The first kernel ("imc") reads data through the IMC cache without any prefetching. The second one ("imc_with_imc_prefetch") also reads data through the IMC, but first prefetches the data into the IMC. The third one ("imc_with_inc_prefetch") prefetches the data into the INC and then reads it through the IMC. There are another 3 kernels: "inc", "inc_with_imc_prefetch", "inc_with_inc_prefetch".
From the obtained results I came to the following conclusions:
The IMC latency is 12 and the INC latency is 40. The average latency is 26, which is very close to the L1 constant cache latency in the mentioned article.
The cache line size of both the IMC and the INC is 64 bytes.
There is an L1.5 constant cache with a 256-byte line size and an average latency of 78. When it is accessed through the IMC its latency is 60, and when it is accessed through the INC its latency is 96.
Full code of the benchmark:
#include <iostream>
#define SMPL_COUNT 128
#define CONST_SIZE 10000 // Count of elements in constant array
__constant__ int carr[CONST_SIZE];
__global__ void imc(int* arr, int* t){
unsigned int t1, t2;
__shared__ int temp[SMPL_COUNT];
__shared__ int times[SMPL_COUNT];
// Reading from immediate constant cache
int sum = 0;
for(int i=0;i<SMPL_COUNT;++i) {
t1 = clock();
sum += carr[i];
temp[i] = sum;
t2 = clock();
times[i] = t2-t1;
__syncthreads();
}
for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
arr[0] = t2-t1;
arr[1] = sum;
}
__global__ void imc_with_imc_prefetch(int* arr, int* t){
unsigned int t1, t2;
__shared__ int temp[SMPL_COUNT];
__shared__ int times[SMPL_COUNT];
const int stride = 32/4; // Assume the cache line is at least 32 bytes
// Prefetch data to immediate constant cache
for(int i=0; i<(SMPL_COUNT+stride-1)/stride; ++i) arr[i] = carr[i*stride];
// Reading from immediate constant cache
int sum = 0;
for(int i=0;i<SMPL_COUNT;++i) {
t1 = clock();
sum += carr[i];
temp[i] = sum;
t2 = clock();
times[i] = t2-t1;
__syncthreads();
}
for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
arr[0] = t2-t1;
arr[1] = sum;
}
__global__ void imc_with_inc_prefetch(int* arr, int* t){
unsigned int t1, t2;
__shared__ int temp[SMPL_COUNT];
__shared__ int times[SMPL_COUNT];
// Prefetch data to index constant cache
int index = carr[CONST_SIZE-1];
for(int i=0;i<SMPL_COUNT;++i)
index = *((int*)(((char*)carr)+index)); //Subtle way to avoid calling of shift instruction
arr[0] = index;
__syncthreads();
//Reading from immediate constant cache
index = 0;
for(int i=0;i<SMPL_COUNT;++i){
t1 = clock();
index += carr[i];
temp[i]=index;
t2 = clock();
times[i] = t2-t1;
__syncthreads();
}
for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
arr[0] = t2-t1;
arr[1] = index;
}
__global__ void inc(int* arr, int* t){
unsigned int t1, t2;
__shared__ int temp[SMPL_COUNT];
__shared__ int times[SMPL_COUNT];
int index = carr[CONST_SIZE-1];
arr[SMPL_COUNT - 1] = index;
__syncthreads();
//Reading from index constant cache
for(int i=0;i<SMPL_COUNT;++i){
t1 = clock();
index = *((int*)(((char*)carr)+index));
temp[i] = index;
t2 = clock();
times[i] = t2-t1;
__syncthreads();
}
for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
arr[0] = t2-t1;
arr[1] = index;
}
__global__ void inc_with_imc_prefetch(int* arr, int* t){
unsigned int t1, t2;
__shared__ int temp[SMPL_COUNT];
__shared__ int times[SMPL_COUNT];
const int stride = 32/4; // Assume the cache line is at least 32 bytes
// Prefetch data to immediate constant cache
for(int i=0; i<(SMPL_COUNT+stride-1)/stride; ++i)
arr[i] = carr[i*stride];
int index = carr[CONST_SIZE-1];
arr[SMPL_COUNT - 1] = index;
__syncthreads();
//Reading from index constant cache
for(int i=0;i<SMPL_COUNT;++i){
t1 = clock();
index = *((int*)(((char*)carr)+index));
temp[i] = index;
t2 = clock();
times[i] = t2-t1;
__syncthreads();
}
for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
arr[0] = t2-t1;
arr[1] = index;
}
__global__ void inc_with_inc_prefetch(int*arr, int* t){
unsigned int t1, t2;
__shared__ int temp[SMPL_COUNT];
__shared__ int times[SMPL_COUNT];
int index = carr[CONST_SIZE-1];
for(int i=0;i<SMPL_COUNT;++i){
index = carr[index/4];
}
arr[0] = index;
index = carr[CONST_SIZE-1];
arr[SMPL_COUNT - 1] = index;
__syncthreads();
for(int i=0;i<SMPL_COUNT;++i){
t1 = clock();
index = *((int*)(((char*)carr)+index));
temp[i] = index;
t2 = clock();
times[i] = t2-t1;
__syncthreads();
}
for(int i=0;i<SMPL_COUNT;++i) t[i] = times[i];
arr[0] = t2-t1;
arr[1] = index+2;
}
int main(int argc, char** argv){
int hostArr[SMPL_COUNT], *devArr, *devTimes;
int imc_times[SMPL_COUNT], imc_imc_times[SMPL_COUNT], imc_inc_times[SMPL_COUNT];
int inc_times[SMPL_COUNT], inc_imc_times[SMPL_COUNT], inc_inc_times[SMPL_COUNT];
cudaMalloc(&devArr, SMPL_COUNT*sizeof(int));
cudaMalloc(&devTimes, SMPL_COUNT*sizeof(int));
int *carrAddr = NULL; cudaGetSymbolAddress((void**)&carrAddr, carr); // carr is a __constant__ symbol, so take its device address first
cudaMemset (carrAddr, 0, CONST_SIZE*sizeof(int));
cudaMemset (devArr, 0, SMPL_COUNT*sizeof(int));
cudaMemset (devTimes, 0, SMPL_COUNT*sizeof(int));
for(int i=0;i<SMPL_COUNT;++i) hostArr[i]=4*(i+1);
cudaMemcpyToSymbol(carr, hostArr, SMPL_COUNT*sizeof(int));
imc<<<1,1>>>(devArr, devTimes);
cudaMemcpy(imc_times, devTimes, SMPL_COUNT*sizeof(int), cudaMemcpyDeviceToHost);
imc_with_imc_prefetch<<<1,1>>>(devArr, devTimes);
cudaMemcpy(imc_imc_times, devTimes, SMPL_COUNT*sizeof(int), cudaMemcpyDeviceToHost);
imc_with_inc_prefetch<<<1,1>>>(devArr, devTimes);
cudaMemcpy(imc_inc_times, devTimes, SMPL_COUNT*sizeof(int), cudaMemcpyDeviceToHost);
inc<<<1,1>>>(devArr, devTimes);
cudaMemcpy(inc_times, devTimes, SMPL_COUNT*sizeof(int), cudaMemcpyDeviceToHost);
inc_with_imc_prefetch<<<1,1>>>(devArr, devTimes);
cudaMemcpy(inc_imc_times, devTimes, SMPL_COUNT*sizeof(int), cudaMemcpyDeviceToHost);
inc_with_inc_prefetch<<<1,1>>>(devArr, devTimes);
cudaMemcpy(inc_inc_times, devTimes, SMPL_COUNT*sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(devArr);
cudaFree(devTimes);
std::cout<<"IMC\tIMC(IMC)\tIMC(INC)\tINC\tINC(IMC)\tINC(INC)\n";
for(int i=0;i<SMPL_COUNT;++i){
std::cout<<imc_times[i]<<"\t"<<imc_imc_times[i]<<"\t"<<imc_inc_times[i]<<"\t";
std::cout<<inc_times[i]<<"\t"<<inc_imc_times[i]<<"\t"<<inc_inc_times[i]<<"\n";
}
return 0;
}

CUBLAS batch and matrix sizes [duplicate]

Some background info on the problem I am trying to speed up using CUDA:
I have a large number of small/moderate same-sized linear systems I need to solve independently. Each linear system is square, real, dense, invertible, and non-symmetric. These are actually matrix systems, so each system looks like AX = B, where A, X, and B are (n x n) matrices.
In a previous question, CUBLAS batch and matrix sizes, I learned that cuBLAS batched operations give the best performance for matrices of size 100x100 or smaller.
I still have an issue because the matrices I am working with have 100 < n < 700. So the matrices are of moderate size, where cuBLAS batched operations do not give the best performance, and regular BLAS (cusolverDnDgetrf, cusolverDnDgetrs) also does not give better performance than MATLAB (see the timings below).
I did some timing against MATLAB for solving a single system and found that regular BLAS is better for matrices of size (4096x4096) or larger. I generate a random matrix of size (n x n), for n = 64, 256, 512, 1024, 4096, 16384, and only time the factorization and back/forward solve, with no transfers across PCIe.
DOUBLE PRECISION CUDA (GTX 1080ti) vs MATLAB (backslash)
(GPU) 64: 0.001157 sec
(MATLAB) 64: 0.000205 sec
(GPU) 256: 0.01161 sec
(MATLAB) 256: 0.007762 sec
(GPU) 512: 0.026348 sec
(MATLAB) 512: 0.008550 sec
(GPU) 1024: 0.064357 sec
(MATLAB) 1024: 0.036280 sec
(GPU) 4096: 0.734908 sec
(MATLAB) 4096: 1.174442 sec
(GPU) 16384: 32.962229 sec
(MATLAB) 16384: 68.691236 sec
These timings make me conclude that iterating over my matrices one by one and calling the non-batched inversion method will be slower than MATLAB. Also, for my moderately sized matrices, the batched cuBLAS inversion method will not perform well, according to CUBLAS batch and matrix sizes.
Is there another approach I should consider to speed up my code with CUDA? Or am I misunderstanding something?
/* How to use
* ./cuSolverDn_LinearSolver // Default: cholesky
* ./cuSolverDn_LinearSolver -R=chol -file<file> // cholesky factorization
* ./cuSolverDn_LinearSolver -R=lu -file<file> // LU with partial pivoting
* ./cuSolverDn_LinearSolver -R=qr -file<file> // QR factorization
*
* Remark: the absolute error on solution x is meaningless without knowing condition number of A.
* The relative error on residual should be close to machine zero, i.e. 1.e-15.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cusolverDn.h"
#include "helper_cuda.h"
#include "helper_cusolver.h"
int linearSolverLU(
cusolverDnHandle_t handle,
int n,
const double *Acopy,
int lda,
const double *b,
double *x)
{
int bufferSize = 0;
int *info = NULL;
double *buffer = NULL;
double *A = NULL;
int *ipiv = NULL; // pivoting sequence
int h_info = 0;
double start, stop;
double time_solve;
checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n, (double*)Acopy, lda, &bufferSize));
checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double)*bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double)*lda*n));
checkCudaErrors(cudaMalloc(&ipiv, sizeof(int)*n));
// prepare a copy of A because getrf will overwrite A with L
checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
start = second();
start = second();
checkCudaErrors(cusolverDnDgetrf(handle, n, n, A, lda, buffer, ipiv, info));
checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
if ( 0 != h_info ){
fprintf(stderr, "Error: LU factorization failed\n");
}
//checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
//checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, A, lda, ipiv, x, n, info));
checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, n, A, lda, ipiv, x, n, info));
checkCudaErrors(cudaDeviceSynchronize());
stop = second();
time_solve = stop - start;
fprintf (stdout, "timing: LU = %10.6f sec\n", time_solve);
if (info ) { checkCudaErrors(cudaFree(info )); }
if (buffer) { checkCudaErrors(cudaFree(buffer)); }
if (A ) { checkCudaErrors(cudaFree(A)); }
if (ipiv ) { checkCudaErrors(cudaFree(ipiv));}
return 0;
}
void generate_random_dense_matrix(int M, int N, double **outA)
{
int i, j;
double rMax = (double)RAND_MAX;
double *A = (double *)malloc(sizeof(double) * M * N);
// For each column
for (j = 0; j < N; j++)
{
// For each row
for (i = 0; i < M; i++)
{
double dr = (double)rand();
A[j * M + i] = (dr / rMax) * 100.0;
//printf("A[j * M + i] = %f \n",A[j * M + i]);
}
}
*outA = A;
}
int main (int argc, char *argv[])
{
struct testOpts opts;
cusolverDnHandle_t handle = NULL;
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
cudaStream_t stream = NULL;
int rowsA = 0; // number of rows of A
int colsA = 0; // number of columns of A
int nnzA = 0; // number of nonzeros of A
int baseA = 0; // base index in CSR format
int lda = 0; // leading dimension in dense matrix
// CSR(A) from I/O
int *h_csrRowPtrA = NULL;
int *h_csrColIndA = NULL;
double *h_csrValA = NULL;
double *h_A = NULL; // dense matrix from CSR(A)
double *h_x = NULL; // a copy of d_x
double *h_b = NULL; // b = ones(m,1)
double *h_r = NULL; // r = b - A*x, a copy of d_r
double *d_A = NULL; // a copy of h_A
double *d_x = NULL; // x = A \ b
double *d_b = NULL; // a copy of h_b
double *d_r = NULL; // r = b - A*x
// the constants are used in residual evaluation, r = b - A*x
const double minus_one = -1.0;
const double one = 1.0;
double x_inf = 0.0;
double r_inf = 0.0;
double A_inf = 0.0;
int errors = 0;
colsA = 660;
rowsA = colsA;
int NN = colsA;
int MM = rowsA;
lda = rowsA;
// Generate inputs
srand(9384);
generate_random_dense_matrix(MM, NN, &h_A);
generate_random_dense_matrix(MM, NN, &h_b);
parseCommandLineArguments(argc, argv, opts);
if (NULL == opts.testFunc)
{
//opts.testFunc = "chol"; // By default running Cholesky as NO solver selected with -R option.
opts.testFunc = "lu";
//opts.testFunc = "qr";
}
findCudaDevice(argc, (const char **)argv);
/*
printf("step 1: read matrix market format\n");
if (opts.sparse_mat_filename == NULL)
{
opts.sparse_mat_filename = sdkFindFilePath("gr_900_900_crg.mtx", argv[0]);
if (opts.sparse_mat_filename != NULL)
printf("Using default input file [%s]\n", opts.sparse_mat_filename);
else
printf("Could not find gr_900_900_crg.mtx\n");
}
else
{
printf("Using input file [%s]\n", opts.sparse_mat_filename);
}
if (opts.sparse_mat_filename == NULL)
{
fprintf(stderr, "Error: input matrix is not provided\n");
return EXIT_FAILURE;
}
if (loadMMSparseMatrix<double>(opts.sparse_mat_filename, 'd', true , &rowsA, &colsA,
&nnzA, &h_csrValA, &h_csrRowPtrA, &h_csrColIndA, true))
{
exit(EXIT_FAILURE);
}
baseA = h_csrRowPtrA[0]; // baseA = {0,1}
printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, nnzA, baseA);
if ( rowsA != colsA )
{
fprintf(stderr, "Error: only support square matrix\n");
exit(EXIT_FAILURE);
}
printf("step 2: convert CSR(A) to dense matrix\n");
lda = opts.lda ? opts.lda : rowsA;
if (lda < rowsA)
{
fprintf(stderr, "Error: lda must be greater or equal to dimension of A\n");
exit(EXIT_FAILURE);
}
*/
//h_A = (double*)malloc(sizeof(double)*lda*colsA);
h_x = (double*)malloc(sizeof(double)*lda*colsA);
//h_b = (double*)malloc(sizeof(double)*rowsA);
h_r = (double*)malloc(sizeof(double)*lda*rowsA);
assert(NULL != h_A);
assert(NULL != h_x);
assert(NULL != h_b);
assert(NULL != h_r);
/*
memset(h_A, 0, sizeof(double)*lda*colsA);
for(int row = 0 ; row < rowsA ; row++)
{
const int start = h_csrRowPtrA[row ] - baseA;
const int end = h_csrRowPtrA[row+1] - baseA;
for(int colidx = start ; colidx < end ; colidx++)
{
const int col = h_csrColIndA[colidx] - baseA;
const double Areg = h_csrValA[colidx];
h_A[row + col*lda] = Areg;
}
}
printf("step 3: set right hand side vector (b) to 1\n");
for(int row = 0 ; row < rowsA ; row++)
{
h_b[row] = 1.0;
}
*/
// verify if A is symmetric or not.
if ( 0 == strcmp(opts.testFunc, "chol") )
{
int issym = 1;
for(int j = 0 ; j < colsA ; j++)
{
for(int i = j ; i < rowsA ; i++)
{
double Aij = h_A[i + j*lda];
double Aji = h_A[j + i*lda];
if ( Aij != Aji )
{
issym = 0;
break;
}
}
}
if (!issym)
{
printf("Error: A has no symmetric pattern, please use LU or QR \n");
exit(EXIT_FAILURE);
}
}
checkCudaErrors(cusolverDnCreate(&handle));
checkCudaErrors(cublasCreate(&cublasHandle));
checkCudaErrors(cudaStreamCreate(&stream));
checkCudaErrors(cusolverDnSetStream(handle, stream));
checkCudaErrors(cublasSetStream(cublasHandle, stream));
checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double)*lda*colsA));
checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double)*lda*colsA));
checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double)*lda*rowsA));
checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*lda*rowsA));
printf("step 4: prepare data on device\n");
checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double)*lda*colsA, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_b, h_b, sizeof(double)*lda*rowsA, cudaMemcpyHostToDevice));
printf("step 5: solve A*x = b \n");
// d_A and d_b are read-only
if ( 0 == strcmp(opts.testFunc, "chol") )
{
linearSolverCHOL(handle, rowsA, d_A, lda, d_b, d_x);
}
else if ( 0 == strcmp(opts.testFunc, "lu") )
{
//printf("hi \n");
linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x);
}
else if ( 0 == strcmp(opts.testFunc, "qr") )
{
linearSolverQR(handle, rowsA, d_A, lda, d_b, d_x);
}
else
{
fprintf(stderr, "Error: %s is unknown function\n", opts.testFunc);
exit(EXIT_FAILURE);
}
printf("step 6: evaluate residual\n");
checkCudaErrors(cudaMemcpy(d_r, d_b, sizeof(double)*lda*rowsA, cudaMemcpyDeviceToDevice));
// r = b - A*x
checkCudaErrors(cublasDgemm_v2(
cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
rowsA,
colsA,
colsA,
&minus_one,
d_A,
lda,
d_x,
rowsA,
&one,
d_r,
rowsA));
checkCudaErrors(cudaMemcpy(h_x, d_x, sizeof(double)*lda*colsA, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*lda*rowsA, cudaMemcpyDeviceToHost));
x_inf = vec_norminf(colsA, h_x);
r_inf = vec_norminf(rowsA, h_r);
A_inf = mat_norminf(rowsA, colsA, h_A, lda);
printf("x[0] = %f\n", h_x[0]);
printf("r[0] = %f\n", h_r[0]);
printf("|b - A*x| = %E \n", r_inf);
printf("|A| = %E \n", A_inf);
printf("|x| = %E \n", x_inf);
printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf));
if (handle) { checkCudaErrors(cusolverDnDestroy(handle)); }
if (cublasHandle) { checkCudaErrors(cublasDestroy(cublasHandle)); }
if (stream) { checkCudaErrors(cudaStreamDestroy(stream)); }
if (h_csrValA ) { free(h_csrValA); }
if (h_csrRowPtrA) { free(h_csrRowPtrA); }
if (h_csrColIndA) { free(h_csrColIndA); }
if (h_A) { free(h_A); }
if (h_x) { free(h_x); }
if (h_b) { free(h_b); }
if (h_r) { free(h_r); }
if (d_A) { checkCudaErrors(cudaFree(d_A)); }
if (d_x) { checkCudaErrors(cudaFree(d_x)); }
if (d_b) { checkCudaErrors(cudaFree(d_b)); }
if (d_r) { checkCudaErrors(cudaFree(d_r)); }
return 0;
}
Try using two or more parallel streams (with one linear system each) on the GPU; possibly this helps to utilize a bigger part of the GPU.
For timing measurements and hardware utilization, use the visual profiler instead of CPU time measurements.
Another point is that GTX (consumer) GPUs perform pretty badly on double precision. If you have the chance, try to use a Tesla GPU instead.
MATLAB provides a way to call the cublas batch interface for GPU arrays using pagefun.
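Going back to the multi-stream suggestion, here is a rough illustration (a sketch under my own assumptions, not tested against your matrices; the function name and arguments are made up): each linear system gets its own cuSOLVER handle bound to its own stream, so independent factorizations and solves can overlap on the device.

#include <cuda_runtime.h>
#include <cusolverDn.h>

// Solve two independent systems A0*X0=B0 and A1*X1=B1 (all already on the device),
// each on its own stream. Error checking omitted for brevity.
void solve_two_systems(double *d_A0, double *d_B0, double *d_A1, double *d_B1, int n, int lda)
{
    cudaStream_t stream[2];
    cusolverDnHandle_t handle[2];
    double *d_A[2] = { d_A0, d_A1 };
    double *d_B[2] = { d_B0, d_B1 };
    double *d_work[2];
    int *d_ipiv[2], *d_info[2];
    int lwork[2];

    for (int i = 0; i < 2; ++i) {
        cudaStreamCreate(&stream[i]);
        cusolverDnCreate(&handle[i]);
        cusolverDnSetStream(handle[i], stream[i]);   // bind each handle to its own stream
        cusolverDnDgetrf_bufferSize(handle[i], n, n, d_A[i], lda, &lwork[i]);
        cudaMalloc(&d_work[i], sizeof(double) * lwork[i]);
        cudaMalloc(&d_ipiv[i], sizeof(int) * n);
        cudaMalloc(&d_info[i], sizeof(int));
    }
    // Launch both factorizations and solves; the calls are asynchronous with
    // respect to the host, so work on the two streams can overlap on the device.
    for (int i = 0; i < 2; ++i) {
        cusolverDnDgetrf(handle[i], n, n, d_A[i], lda, d_work[i], d_ipiv[i], d_info[i]);
        cusolverDnDgetrs(handle[i], CUBLAS_OP_N, n, n, d_A[i], lda, d_ipiv[i], d_B[i], lda, d_info[i]);
    }
    cudaDeviceSynchronize();   // wait for both streams to finish
    for (int i = 0; i < 2; ++i) {
        cudaFree(d_work[i]); cudaFree(d_ipiv[i]); cudaFree(d_info[i]);
        cusolverDnDestroy(handle[i]);
        cudaStreamDestroy(stream[i]);
    }
}

Whether the two solves actually overlap depends on how much of the GPU a single factorization already occupies, which the visual profiler will show.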

CUDA shared memory writes incur unexplainable long latency

This has been driving me crazy. I have a 3D grid of 1D blocks, and each block contains 272 threads. Each thread computes a dot product of two vectors and stores its result in the corresponding location of a shared memory array of doubles sized [272], which is the same as the number of threads. The main thread calls multiple kernels and I am adding up the time taken to execute them all. When I comment out the line that writes to shared memory, I get execution times of around 2,401 ms. When I uncomment the shared memory write line, I get tremendously long times, like 450,309 ms. I tried using int values instead of doubles. I also tried an if(threadIdx.x == 0) statement to let just one thread do its write, to avoid possible bank conflicts. Nothing seems to work.
Here is the calling thread code:
double theta=0;
int count=0;
cudaEventRecord(start,0);
while(theta <180)
{
theta+=0.18;
calc_LF<<<gridDim, blockDim>>>(ori_dev, X_dev, Y_dev, Z_dev, F_dev, F_grad_dev, g_oriD, r_vD, LF);
calc_S<<<gridDim, 272>>>(g_traD, LF, Ci, C);
count++;
}
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &elapsedTime, start, stop );
err = cudaGetLastError();
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
else
{
fprintf( stderr, "\n \n Cuda NO error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
printf("\n %d orientation updates: Total Time = %3.10f ms\n", count, elapsedTime);
}
The Kernel in question is the calc_S kernel, whose code is:
__global__ void calc_S(double* g_traD, double* LF, double* Ci, double* C)
{
__shared__ double G[H];
int myTRA[W];
int tx= threadIdx.x;
for(int j=0; j<W; j++)
{
myTRA[j]= getElement(g_traD, tx, j, W);
}
double sum;
for(int j=0; j<W; j++)
{
sum += myTRA[j] * LF[j];
}
// Write your sum to shared memory
G[threadIdx.x]=sum;
//__syncthreads();
}
I am using MS Visual Studio 2008 with CUDA 4.2 and a GPU of compute capability 2.0 (namely a GeForce GTX 580).
Notes:
272 threads per block.
H/W threading limit: 1,536 / 272 = up to 5 blocks
Shared memory limit: G[272] of doubles = 2,176 bytes needed. 48K / 2176= up to 22 blocks (which will never happen but we know shared memory is no limitation)
registers are not an issue at all.
So, it should be that 5 blocks can be executing at the same time.
Thanks for any help.
Mai
Edit:
Here is a shortened version of the entire code. The whole code can be run in the MatrixMul Nvidia SDK example.
In File "MatrixMul.cu"
int main(int argc, char** argv)
{
// reading data from Matlab into double arrays
//CUDA begins here:
if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
{
cutilDeviceInit(argc, argv);
}
else
{
cutilSafeCall( cudaSetDevice(cutGetMaxGflopsDeviceId()) );
}
int devID;
cudaDeviceProp props;
// get GPU props
cutilSafeCall(cudaGetDevice(&devID));
cutilSafeCall(cudaGetDeviceProperties(&props, devID));
printf("Device %d: \"%s\" with Compute %d.%d capability\n", devID, props.name, props.major, props.minor);
//Declare Device memory for matrices read from Matlab
double *X_dev; // size 19 x 1
double *Y_dev; // size 19 x 1
double *Z_dev; // size 17 x 1
double *r_vD; // size 544 x 3
double *g_oriD; // size 544 x 3
double *g_traD; // size 272 x 544
double *cov_D; // size 272 x 272
double *cov_i_D; // size 272 x 272
err= cudaMalloc((void**)&X_dev, sizeX*sizeof(double));
errorCheck(err);
err= cudaMalloc((void**)&Y_dev, sizeY*sizeof(double));
errorCheck(err);
err= cudaMalloc((void**)&Z_dev, sizeZ*sizeof(double));
errorCheck(err);
err= cudaMalloc((void**)&r_vD, sizeR_V*sizeof(double));
errorCheck(err);
err= cudaMalloc((void**)&g_oriD, sizeG_ori*sizeof(double));
errorCheck(err);
err= cudaMalloc((void**)&g_traD, sizeG_tra*sizeof(double));
errorCheck(err);
err= cudaMalloc((void**)&cov_D, sizeCov*sizeof(double));
errorCheck(err);
err= cudaMalloc((void**)&cov_i_D, sizeCov_i*sizeof(double));
errorCheck(err);
//Transfer Xs, Ys, and Zs to GPU Global memory
cudaMemcpy(X_dev,dipole_x_coords, sizeX*sizeof(double), cudaMemcpyHostToDevice);
errorCheck(err);
cudaMemcpy(Y_dev,dipole_y_coords, sizeY*sizeof(double), cudaMemcpyHostToDevice);
errorCheck(err);
cudaMemcpy(Z_dev,dipole_z_coords, sizeZ*sizeof(double), cudaMemcpyHostToDevice);
errorCheck(err);
// Transfer r_v, g_ori, and g_tra to GPU memory
cudaMemcpy(r_vD, r_v, sizeR_V*sizeof(double), cudaMemcpyHostToDevice);
errorCheck(err);
cudaMemcpy(g_oriD,g_ori, sizeG_ori*sizeof(double), cudaMemcpyHostToDevice);
errorCheck(err);
cudaMemcpy(g_traD,g_tra, sizeG_tra*sizeof(double), cudaMemcpyHostToDevice);
errorCheck(err);
// Transfer cov, and cov_i to GPU memory
cudaMemcpy(cov_D, cov_post, sizeCov*sizeof(double), cudaMemcpyHostToDevice);
errorCheck(err);
cudaMemcpy(cov_i_D,cov_post_i, sizeCov_i*sizeof(double), cudaMemcpyHostToDevice);
//Specify dimensions of block and grid
dim3 gridDim(sizeX, sizeY, sizeZ); // 19 x 19 x 17
int numThreads=(int) sizeR_V/3; // numThreads = 544
dim3 blockDim(numThreads,1,1); // 544 x 1 x 1
//call Cuda wrapper
float cf = runB(X_dev, Y_dev, Z_dev, r_vD, g_oriD, g_traD, cov_i_D, cov_D, blockDim, gridDim, sizeG_tra, tra_W, tra_H);
int c=0;
scanf("%d", c);
return 0;
}
float runB(double* X_dev, double* Y_dev, double* Z_dev,
double* r_vD, double* g_oriD, double* g_traD, double* Ci, double* C,
dim3 blockDim, dim3 gridDim, int sizeG_tra, int tra_W, int tra_H)
{
cudaError err;
// Calculate the size of thread output in global memory
size_t size_F = gridDim.x * gridDim.y * gridDim.z * blockDim.x;
size_t size_F_grad = gridDim.x * gridDim.y * gridDim.z * blockDim.x * 3;
// Make global memory space for F and F_grad
double* F_dev;
double* F_grad_dev;
err= cudaMalloc((void**)&F_dev, size_F*sizeof(double));
errorCheck(err);
err= cudaMalloc((void**)&F_grad_dev, size_F_grad*sizeof(double));
errorCheck(err);
//Allocate Device memory for LF
double *LF;
err= cudaMalloc((void**)&LF, 544*sizeof(double));
errorCheck(err);
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
double theta=0;
cudaEventRecord(start,0);
while(theta <180)
{
theta+=0.18;
calc_LF<<<gridDim, blockDim>>>(ori_dev, X_dev, Y_dev, Z_dev, F_dev, F_grad_dev, g_oriD, r_vD, LF);
calc_S<<<gridDim, 272>>>(g_traD, LF, Ci, C);
count++;
}
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &elapsedTime, start, stop );
err = cudaGetLastError();
if ( cudaSuccess != err )
{
fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
}
else
{
fprintf( stderr, "\n \n Cuda NO error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString( err) );
printf("\n 180 orientation updates: Total Time = %3.10f ms\n",elapsedTime);
}
return 0;
}
In file "MatrixMul_kernel.cu"
#define HDM_DIM 3
__global__ void calc_LF(double* ori_dev, double* X_dev, double* Y_dev, double* Z_dev, double* F_dev, double* F_grad_dev,
double* g_oriD, double* r_vD, double* LF)
{
// Get this block's global index
int blockId= blockIdx.x + gridDim.x*blockIdx.y + gridDim.x*gridDim.y*blockIdx.z;
int tx= threadIdx.x;
// This thread's global index
int gtx= blockId*blockDim.x + threadIdx.x;
double r_v[3];
double g_ori[3];
// Each thread reads 1 row (3 values) of r_vD
r_v[0] = getElement(r_vD, tx, 0, HDM_DIM);
r_v[1] = getElement(r_vD, tx, 1, HDM_DIM);
r_v[2] = getElement(r_vD, tx, 2, HDM_DIM);
// Each thread reads 1 row (3 values) of g_oriD (which contains grad.ori data)
g_ori[0] = getElement(g_oriD, tx, 0, HDM_DIM);
g_ori[1] = getElement(g_oriD, tx, 1, HDM_DIM);
g_ori[2] = getElement(g_oriD, tx, 2, HDM_DIM);
//fetch d_ori from global memory
double d_ori[3];
for(int i=0; i< 3; i++){
d_ori[i]= ori_dev[3*gtx+i];
}
//read this block's X, Y, Z location
double x= X_dev[blockIdx.x];
double y= Y_dev[blockIdx.y];
double z= Z_dev[blockIdx.z];
double c2[HDM_DIM];
c2[0]= d_ori[1]*z - d_ori[2]*y;
c2[1]= d_ori[2]*x - d_ori[0]*z;
c2[2]= d_ori[0]*y - d_ori[1]*x;
// Fetch F and F_grad from global memory
double F = F_dev[gtx];
double F_grad[3];
for(int j=0; j<3; j++)
{
F_grad[j] = F_grad_dev[gtx*3+j];
}
double c1[HDM_DIM];
c1[0]= F* c2[0];
c1[1]= F* c2[1];
c1[2]= F* c2[2];
double d3= c2[0]*r_v[0] + c2[1]*r_v[1] + c2[2]*r_v[2];
double s2[HDM_DIM];
for(int j=0; j<HDM_DIM; j++)
{
s2[j] = d3*F_grad[j];
}
double s1[HDM_DIM];
for(int j=0; j<HDM_DIM; j++)
{
s1[j] = c1[j] - s2[j];
}
double b_v[HDM_DIM];
for(int j=0; j<HDM_DIM; j++)
{
b_v[j] = (1e-7)/(F*F) * s1[j]; // note: 1e-7, since 10^-7 in C/C++ is integer XOR, not a power
}
double sum=0;
for(int j=0; j<HDM_DIM; j++)
{
sum += b_v[j]*g_ori[j];
}
// Write this thread's value to global memory
LF[tx]= sum;
}
Worth mentioning is that this calc_LF kernel used to write its final result to shared memory, which increased execution time from about 500+ ms to about 2,500 ms (i.e. just the shared memory write line roughly multiplied the time by 5).
__global__ void calc_S(double* g_traD, double* LF, double* Ci, double* C)
{
__shared__ double T[H];
__shared__ double G[H];
// Get this block's global index
int blockId= blockIdx.x + gridDim.x*blockIdx.y + gridDim.x*gridDim.y*blockIdx.z;
int tx= threadIdx.x;
// This thread's global index
int gtx= blockId*blockDim.x + threadIdx.x;
int myTRA[W];
double my_LF[W];
for (int i=0; i<W; i++){
my_LF[i]= LF[gtx];
}
for(int j=0; j<W; j++){
myTRA[j]= getElement(g_traD, tx, j, W);
}
double sum;
for(int j=0; j<W; j++)
{
sum += myTRA[j] * my_LF[j];
}
// Write your sum to shared memory
G[tx]=sum;
__syncthreads();
}
The effect you are seeing is the result of compiler optimisation. Taking a compilable version of your basic kernel code:
#define H (128)
#define W (128)
__device__
double getElement(const double *g, int t, int j, int w)
{
return g[t + j*w];
}
__global__
void calc_S(double* g_traD, double* LF, double* Ci, double* C)
{
__shared__ double G[H];
// Get this block's global index
int blockId= blockIdx.x + gridDim.x*blockIdx.y +
gridDim.x*gridDim.y*blockIdx.z;
int tx= threadIdx.x;
// This thread's global index
int gtx= blockId*blockDim.x + threadIdx.x;
int myTRA[W];
double my_LF[W];
for (int i=0; i<W; i++){
my_LF[i]= LF[gtx];
}
for(int j=0; j<W; j++){
myTRA[j]= getElement(g_traD, tx, j, W);
}
double sum;
for(int j=0; j<W; j++)
{
sum += myTRA[j] * my_LF[j];
}
// Write your sum to shared memory
G[tx]=sum;
__syncthreads();
}
and compiling it with CUDA 5 gives this:
$ nvcc -m64 -arch=sm_20 -cubin -Xptxas="-v" dead_code.cu
dead_code.cu(13): warning: variable "G" was set but never used
dead_code.cu(13): warning: variable "G" was set but never used
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z6calc_SPdS_S_S_' for 'sm_20'
ptxas info : Function properties for _Z6calc_SPdS_S_S_
1536 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 23 registers, 1024 bytes smem, 64 bytes cmem[0]
There is a warning about the shared memory variable G not being used, but the compiler honors it and emits code consuming 23 registers. So now, if I comment out the G[tx]=sum at the end of the kernel, it compiles like this:
$ nvcc -m64 -arch=sm_20 -cubin -Xptxas="-v" dead_code.cu
dead_code.cu(13): warning: variable "G" was declared but never referenced
dead_code.cu(13): warning: variable "G" was declared but never referenced
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z6calc_SPdS_S_S_' for 'sm_20'
ptxas info : Function properties for _Z6calc_SPdS_S_S_
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 2 registers, 64 bytes cmem[0]
Now there are only two registers used, and the toolchain emitted this:
$ cuobjdump -sass dead_code.cubin
code for sm_20
Function : _Z6calc_SPdS_S_S_
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0xfc1fdc03207e0000*/ IMAD.U32.U32 RZ, R1, RZ, RZ;
/*0010*/ /*0xffffdc0450ee0000*/ BAR.RED.POPC RZ, RZ;
/*0018*/ /*0x00001de780000000*/ EXIT;
i.e. four assembly instructions. All your code is gone.
The underlying source of this effect is compiler dead code removal. The compiler is smart enough to determine that code which has no effect on a global or shared memory output is unneeded and can be removed. In this case, once the write to G is removed, the whole kernel is effectively pointless, and the compiler just optimises the whole thing away. You can see some other examples of dead code removal and its effects here and here. The latter is in OpenCL, but the same mechanism applies.
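Conversely, if the intent is to benchmark the kernel as written, the computation needs an observable side effect. A minimal sketch (my own, reusing the H, W and getElement definitions from the snippet above; Ci is kept only to match the original signature): writing the result to one of the global pointers is enough to keep the whole chain of work alive.

__global__
void calc_S_live(double* g_traD, double* LF, double* Ci, double* C)
{
    __shared__ double G[H];
    int blockId = blockIdx.x + gridDim.x*blockIdx.y + gridDim.x*gridDim.y*blockIdx.z;
    int tx  = threadIdx.x;
    int gtx = blockId*blockDim.x + threadIdx.x;

    double sum = 0.0;
    for (int j = 0; j < W; j++)
        sum += getElement(g_traD, tx, j, W) * LF[gtx];
    G[tx] = sum;
    __syncthreads();

    // A store to global memory gives the kernel a visible result, so the
    // compiler can no longer discard the dot product or the shared-memory write.
    C[gtx] = G[tx];
}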

One dimensional fftshift in CUDA

I'm setting up a one dimensional fftshift in CUDA. My code is the following
__global__ void fftshift(double2 *u_d, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
double2 temp;
if(i< N/2)
{
temp.x = u_d[i].x;
temp.y = u_d[i].y;
u_d[i].x =u_d[i+N/2].x;
u_d[i].y =u_d[i+N/2].y;
u_d[i+N/2].x = temp.x;
u_d[i+N/2].y = temp.y;
}
}
Is there any way, smarter than that shown above, to perform the fftshift in CUDA?
Thanks in advance.
A PERHAPS BETTER SOLUTION
I found that perhaps the following solution could be a good alternative
__global__ void fftshift(double2 *u_d, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < N)
{
double a = pow(-1.0,i&1);
u_d[i].x *= a;
u_d[i].y *= a;
}
}
It consists of multiplying the vector to be transformed by a sequence of 1s and -1s, which is equivalent to multiplication by exp(-j*n*pi) and thus to a shift in the conjugate domain.
You have to call this kernel before and after the application of the CUFFT.
One pro is that memory movements/swapping are avoided and the idea can be immediately extended to the 2D case, see CUDA Device To Device transfer expensive.
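A usage sketch (my own, assuming a cufftHandle plan and a device array d_u of N double2 elements already exist): the kernel is launched once before and once after the transform, so no element swapping is needed.

int threads = 256;
int blocks  = (N + threads - 1) / threads;

fftshift<<<blocks, threads>>>(d_u, N);                        // pre-multiply by (-1)^n
cufftExecZ2Z(plan, (cufftDoubleComplex *)d_u,
                   (cufftDoubleComplex *)d_u, CUFFT_FORWARD); // in-place transform
fftshift<<<blocks, threads>>>(d_u, N);                        // post-multiply by (-1)^n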
CONCERNING SYMMETRIC DATA
This solution does not seem to be limited to symmetric data. Try, for example, the following Matlab code, applying the idea to a fully complex random matrix (Gaussian amplitude and uniform phase).
N1=512;
N2=256;
Phase=(rand(N1,N2)-0.5)*2*pi;
Magnitude=randn(N1,N2);
Im=Magnitude.*exp(j*Phase);
Transform=fftshift(fft2(ifftshift(Im)));
n1=0:(N1-1);
n2=0:(N2-1);
[N2,N1]=meshgrid(n2,n1);
Im2=Im.*(-1).^(N1+N2);
Im3=fft2(Im2);
Im4=Im3.*(-1).^(N1+N2);
100*sqrt(sum(abs(Im4-Transform).^2)/sum(abs(Transform).^2))
The returned normalized root mean square error will be 0, confirming that Transform=Im4.
IMPROVEMENT TO THE SPEED
Following a suggestion received on the NVIDIA Forum, improved speed can be achieved by changing the instruction
double a = pow(-1.0,i&1);
to
double a = 1-2*(i&1);
to avoid the use of the slow routine pow.
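For reference, this is the chessboard kernel with that one-line change applied (assembled from the two fragments above):

__global__ void fftshift(double2 *u_d, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N)
    {
        double a = 1 - 2 * (i & 1);  // +1 for even i, -1 for odd i, without pow()
        u_d[i].x *= a;
        u_d[i].y *= a;
    }
}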
After much time and the introduction of the callback functionality of cuFFT, I can provide a meaningful answer to my own question.
Above I was proposing a "perhaps better solution". After some testing, I have realized that, without using the callback cuFFT functionality, that solution is slower because it uses pow. Then, I have explored two alternatives to the use of pow, something like
float a = (float)(1-2*((int)offset%2));
float2 out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;
and
float2 out = ((float2*)d_in)[offset];
if ((int)offset&1) {
out.x = -out.x;
out.y = -out.y;
}
But, with standard cuFFT, all the above solutions require two separate kernel calls, one for the fftshift and one for the cuFFT execution call. However, with the new cuFFT callback functionality, the above alternative solutions can be embedded in the code as __device__ functions.
So, finally I ended up with the below comparison code
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <assert.h>
#include <cufft.h>
#include <cufftXt.h>
//#define DEBUG
#define BLOCKSIZE 256
/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// See http://stackoverflow.com/questions/16267149/cufft-error-handling
#ifdef _CUFFT_H_
// cuFFT API errors
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#endif
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
/****************************************/
/* FFTSHIFT 1D INPLACE MEMORY MOVEMENTS */
/****************************************/
__global__ void fftshift_1D_inplace_memory_movements(float2 *d_inout, unsigned int N)
{
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N/2)
{
float2 temp = d_inout[tid];
d_inout[tid] = d_inout[tid + (N / 2)];
d_inout[tid + (N / 2)] = temp;
}
}
/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 1 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v1(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {
float a = (float)(1-2*((int)offset%2));
float2 out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;
return out;
}
__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v1_Ptr = fftshift_1D_chessboard_callback_v1;
/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 2 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v2(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {
float a = pow(-1.,(double)(offset&1));
float2 out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;
return out;
}
__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v2_Ptr = fftshift_1D_chessboard_callback_v2;
/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 3 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v3(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {
float2 out = ((float2*)d_in)[offset];
if ((int)offset&1) {
out.x = -out.x;
out.y = -out.y;
}
return out;
}
__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v3_Ptr = fftshift_1D_chessboard_callback_v3;
/********/
/* MAIN */
/********/
int main()
{
const int N = 131072;
printf("N = %d\n", N);
// --- Host side input array
float2 *h_vect = (float2 *)malloc(N*sizeof(float2));
for (int i=0; i<N; i++) {
h_vect[i].x = (float)rand() / (float)RAND_MAX;
h_vect[i].y = (float)rand() / (float)RAND_MAX;
}
// --- Host side output arrays
float2 *h_out1 = (float2 *)malloc(N*sizeof(float2));
float2 *h_out2 = (float2 *)malloc(N*sizeof(float2));
float2 *h_out3 = (float2 *)malloc(N*sizeof(float2));
float2 *h_out4 = (float2 *)malloc(N*sizeof(float2));
// --- Device side input arrays
float2 *d_vect1; gpuErrchk(cudaMalloc((void**)&d_vect1, N*sizeof(float2)));
float2 *d_vect2; gpuErrchk(cudaMalloc((void**)&d_vect2, N*sizeof(float2)));
float2 *d_vect3; gpuErrchk(cudaMalloc((void**)&d_vect3, N*sizeof(float2)));
float2 *d_vect4; gpuErrchk(cudaMalloc((void**)&d_vect4, N*sizeof(float2)));
gpuErrchk(cudaMemcpy(d_vect1, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_vect2, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_vect3, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_vect4, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
// --- Device side output arrays
float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
float2 *d_out4; gpuErrchk(cudaMalloc((void**)&d_out4, N*sizeof(float2)));
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
/*******************************************/
/* cuFFT + MEMORY MOVEMENTS BASED FFTSHIFT */
/*******************************************/
cufftHandle planinverse; cufftSafeCall(cufftPlan1d(&planinverse, N, CUFFT_C2C, 1));
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse, d_vect1, d_vect1, CUFFT_INVERSE));
fftshift_1D_inplace_memory_movements<<<iDivUp(N/2, BLOCKSIZE), BLOCKSIZE>>>(d_vect1, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Memory movements elapsed time: %3.3f ms \n", time);
gpuErrchk(cudaMemcpy(h_out1, d_vect1, N*sizeof(float2), cudaMemcpyDeviceToHost));
/****************************************/
/* CHESSBOARD MULTIPLICATION V1 + cuFFT */
/****************************************/
cufftCallbackLoadC hfftshift_1D_chessboard_callback_v1_Ptr;
gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v1_Ptr, fftshift_1D_chessboard_callback_v1_Ptr, sizeof(hfftshift_1D_chessboard_callback_v1_Ptr)));
cufftHandle planinverse_v1; cufftSafeCall(cufftPlan1d(&planinverse_v1, N, CUFFT_C2C, 1));
cufftResult status = cufftXtSetCallback(planinverse_v1, (void **)&hfftshift_1D_chessboard_callback_v1_Ptr, CUFFT_CB_LD_COMPLEX, 0);
if (status == CUFFT_LICENSE_ERROR) {
printf("This sample requires a valid license file.\n");
printf("The file was either not found, out of date, or otherwise invalid.\n");
exit(EXIT_FAILURE);
} else {
cufftSafeCall(status);
}
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse_v1, d_vect2, d_out2, CUFFT_INVERSE));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Chessboard v1 elapsed time: %3.3f ms \n", time);
gpuErrchk(cudaMemcpy(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++) if ((h_out1[i].x != h_out2[i].x)||(h_out1[i].y != h_out2[i].y)) { printf("Chessboard v1 test failed!\n"); return 0; }
printf("Chessboard v1 test passed!\n");
/****************************************/
/* CHESSBOARD MULTIPLICATION V2 + cuFFT */
/****************************************/
cufftCallbackLoadC hfftshift_1D_chessboard_callback_v2_Ptr;
gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v2_Ptr, fftshift_1D_chessboard_callback_v2_Ptr, sizeof(hfftshift_1D_chessboard_callback_v2_Ptr)));
cufftHandle planinverse_v2; cufftSafeCall(cufftPlan1d(&planinverse_v2, N, CUFFT_C2C, 1));
status = cufftXtSetCallback(planinverse_v2, (void **)&hfftshift_1D_chessboard_callback_v2_Ptr, CUFFT_CB_LD_COMPLEX, 0);
if (status == CUFFT_LICENSE_ERROR) {
printf("This sample requires a valid license file.\n");
printf("The file was either not found, out of date, or otherwise invalid.\n");
exit(EXIT_FAILURE);
} else {
cufftSafeCall(status);
}
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse_v2, d_vect3, d_out3, CUFFT_INVERSE));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Chessboard v2 elapsed time: %3.3f ms \n", time);
gpuErrchk(cudaMemcpy(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++) if ((h_out1[i].x != h_out3[i].x)||(h_out1[i].y != h_out3[i].y)) { printf("Chessboard v2 test failed!\n"); return 0; }
printf("Chessboard v2 test passed!\n");
/****************************************/
/* CHESSBOARD MULTIPLICATION V3 + cuFFT */
/****************************************/
cufftCallbackLoadC hfftshift_1D_chessboard_callback_v3_Ptr;
gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v3_Ptr, fftshift_1D_chessboard_callback_v3_Ptr, sizeof(hfftshift_1D_chessboard_callback_v3_Ptr)));
cufftHandle planinverse_v3; cufftSafeCall(cufftPlan1d(&planinverse_v3, N, CUFFT_C2C, 1));
status = cufftXtSetCallback(planinverse_v3, (void **)&hfftshift_1D_chessboard_callback_v3_Ptr, CUFFT_CB_LD_COMPLEX, 0);
if (status == CUFFT_LICENSE_ERROR) {
printf("This sample requires a valid license file.\n");
printf("The file was either not found, out of date, or otherwise invalid.\n");
exit(EXIT_FAILURE);
} else {
cufftSafeCall(status);
}
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse_v3, d_vect4, d_out4, CUFFT_INVERSE));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Chessboard v3 elapsed time: %3.3f ms \n", time);
gpuErrchk(cudaMemcpy(h_out4, d_out4, N*sizeof(float2), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++) if ((h_out1[i].x != h_out4[i].x)||(h_out1[i].y != h_out4[i].y)) { printf("Chessboard v3 test failed!\n"); return 0; }
printf("Chessboard v3 test passed!\n");
return 0;
}
RESULTS ON A GTX 480
N Mem mov v1 v2 v3
131072 0.552 0.136 0.354 0.183
262144 0.536 0.175 0.451 0.237
524288 0.661 0.283 0.822 0.290
1048576 0.784 0.565 1.548 0.548
2097152 1.298 0.952 2.973 0.944
RESULTS ON A TESLA C2050
N Mem mov v1 v2 v3
131072 0.278 0.130 0.236 0.132
262144 0.344 0.202 0.374 0.206
524288 0.544 0.378 0.696 0.387
1048576 0.909 0.695 1.294 0.695
2097152 1.656 1.349 2.531 1.349
RESULTS ON A KEPLER K20c
N Mem mov v1 v2 v3
131072 0.077 0.076 0.136 0.076
262144 0.142 0.128 0.202 0.127
524288 0.268 0.229 0.374 0.230
1048576 0.516 0.433 0.717 0.435
2097152 1.019 0.853 1.400 0.855
Some more details have recently appeared at The 1D fftshift in CUDA by chessboard multiplication and at the GitHub page.
If space is not a concern (and you are using fftshift for only one dimension), create u_d with size 1.5 x N and write the first N/2 elements at the end. You can then move u_d to u_d + N/2.
Here is how you could do it.
double2 *u_d, *u_d_begin;
size_t bytes = N * sizeof(double2);
// This is different from bytes / 2 when N is odd
size_t half_bytes = (N / 2) * sizeof(double2);
CUDA_CHK(cudaMalloc( &u_d, bytes + half_bytes ));
u_d_begin = u_d;
...
// Do some processing and populate u_d;
...
// Copy first half to the end
CUDA_CHK(cudaMemcpy(u_d + N, u_d, half_bytes, cudaMemcpyDeviceToDevice));
u_d = u_d + N /2;

Cuda Exceptions

I am doing something in CUDA (FFT), but I have no idea why it is generating exceptions when calling the kernel function.
All includes and definitions:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#define CPU_ARRAY_SIZE 1024 // 1024, 2048, 4096 8192
#define GPU_ARRAY_SIZE 512 //
#define THREAD_SIZE 16 // fixed
#define BLOCK_SIZE (GPU_ARRAY_SIZE/THREAD_SIZE) // 32
#define PI 3.14
As I am running it on an NVIDIA GTX 480, I thought the problem could be the shared memory space, although it doesn't seem to be (even though there are "so many" shared variables). So I was changing GPU_ARRAY_SIZE to see how it behaves, and it gives different results when I define it as 32, 64, 256 or 512 (in the 512 case it returns ALL zeros, so I guess CUDA couldn't compute anything; in the other cases it returns weird results, and I don't know why it skips 16 cells without any calculation). In most cases, the Output window of Microsoft Visual Studio shows billions of exceptions of the style "First-chance exception at 0x75b9b9bc in .exe: Microsoft C++ exception: cudaError_enum at memory location ". Before you ask me to debug: I cannot debug it, as VS does not do that for files it does not recognize (like .cpp - at least that theory holds in my case).
Do you guys have any idea about these questions:
1. Why is it generating exceptions?
2. Why does the calculation that should happen for every cell in every block only happen in a few cells?
How could I solve this problem... any idea?
Kernel function:
__global__ void twiddle_factor(double *d_isub_matrix, double *d_osub_matrix)
{
__shared__ double block[THREAD_SIZE][THREAD_SIZE];
__shared__ double spectrum[THREAD_SIZE][THREAD_SIZE];
__shared__ double sum_cos[THREAD_SIZE][THREAD_SIZE]; // declaring the shared sum_cos.. similarly for sum_sin
__shared__ double sum_sin[THREAD_SIZE][THREAD_SIZE];
__shared__ double local_cos[THREAD_SIZE][THREAD_SIZE]; // declaring the shared sum_cos.. similarly for sum_sin
__shared__ double local_sin[THREAD_SIZE][THREAD_SIZE];
unsigned int xIndex = threadIdx.x + blockIdx.x* blockDim.x;
unsigned int yIndex = threadIdx.y + blockIdx.y* blockDim.y;
int u;
int x=0,y=0;
int tx = threadIdx.x;
int ty = threadIdx.y;
double sum_sines=0.0,sum_cosines=0.0;
double angle=(2*PI)/GPU_ARRAY_SIZE;
block[tx][ty] = d_isub_matrix[yIndex*GPU_ARRAY_SIZE+xIndex];
__syncthreads();
//for every column!
for(u=0; u<THREAD_SIZE; u++)
{
/* All threads calculate its own sin and cos value. */
local_sin[tx][ty] = block[tx][ty] * sin((angle*ty)*u);
local_cos[tx][ty] = block[tx][ty] * cos((angle*ty)*u);
/* Only one row is activate. The thread in row adds all element of its column. */
if (ty == u)
{
sum_sines = 0.0;
sum_cosines = 0.0;
/* Access each column to add all elements of the column.*/
for (y=0; y<THREAD_SIZE; y++)
{
sum_sines += local_sin[tx][y];
sum_cosines += local_cos[tx][y];
}
//if (sum_sines < 0)
//sum_sin[u][tx] = ((-1)*sum_sines)/GPU_ARRAY_SIZE;
//else
sum_sin[u][tx] = sum_sines/GPU_ARRAY_SIZE;
//if (sum_cosines < 0)
//sum_cos[u][tx] = ((-1)*sum_cosines)/GPU_ARRAY_SIZE;
//else
sum_cos[u][tx] = sum_cosines/GPU_ARRAY_SIZE;
}
__syncthreads();
}
spectrum[tx][ty] = sqrt((double)pow(sum_sin[tx][ty],2)
+(double)pow(sum_cos[tx][ty],2));
__syncthreads();
block[tx][ty] = spectrum[tx][ty];
__syncthreads();
//for every row!
for(u=0; u<THREAD_SIZE; u++)
{
/* All threads calculate its own sin and cos value. */
local_sin[tx][ty] = block[tx][ty] * sin((angle*ty)*u);
local_cos[tx][ty] = block[tx][ty] * cos((angle*ty)*u);
/* Only one column is activate. The thread in colum adds all element of its row. */
if (tx == u)
{
sum_sines = 0.0;
sum_cosines = 0.0;
for (x=0; x<THREAD_SIZE; x++)
{
sum_sines += local_sin[x][ty];
sum_cosines += local_cos[x][ty];
}
//if (sum_sines < 0)
//sum_sin[ty][u] = ((-1)*sum_sines)/GPU_ARRAY_SIZE;
//else
sum_sin[ty][u] = sum_sines/GPU_ARRAY_SIZE;
//if (sum_cosines < 0)
//sum_cos[ty][u] = ((-1)*sum_cosines)/GPU_ARRAY_SIZE;
//else
sum_cos[ty][u] = sum_cosines/GPU_ARRAY_SIZE;
}
__syncthreads();
}
spectrum[tx][ty] = sqrt((double)pow(sum_sin[tx][ty],2)+(double)pow(sum_cos[tx][ty],2));
__syncthreads();
/* Transpose! I think this is not necessary part. */
d_osub_matrix[xIndex*GPU_ARRAY_SIZE + yIndex] = spectrum[threadIdx.y][threadIdx.x];
__syncthreads();
}
The main function:
int main(int argc, char** argv)
{
int i,j, w, h, sw, sh;
int numSubblock = CPU_ARRAY_SIZE / GPU_ARRAY_SIZE;
double *d_isub_matrix,*d_osub_matrix;
double *big_matrix = new double[CPU_ARRAY_SIZE*CPU_ARRAY_SIZE];
double *big_matrix2 = new double[CPU_ARRAY_SIZE*CPU_ARRAY_SIZE];
double *isub_matrix = new double[GPU_ARRAY_SIZE*GPU_ARRAY_SIZE];
double *osub_matrix = new double[GPU_ARRAY_SIZE*GPU_ARRAY_SIZE];
cudaEvent_t start,stop;
float elapsedtime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
for (i=0; i<CPU_ARRAY_SIZE; i++)
{
for (j=0; j<CPU_ARRAY_SIZE; j++)
big_matrix[i*CPU_ARRAY_SIZE + j] = rand();//i*CPU_ARRAY_SIZE + j;
}
cudaEventRecord(start,0);
//cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)*2);
//cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)*2);
for(i = 0; i < numSubblock; i++)
{
for (j=0; j < numSubblock; j++)
{
// start position of subarea of big array
cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
h = i*GPU_ARRAY_SIZE;
w = j*GPU_ARRAY_SIZE;
//printf("h = %d, w=%d",h,w);
//system("PAUSE");
// move subarea of big array into isub array.
for (sh = 0; sh < GPU_ARRAY_SIZE; sh++)
{
for (sw = 0; sw <GPU_ARRAY_SIZE; sw++)
{
isub_matrix[sh*GPU_ARRAY_SIZE+sw] = big_matrix[(h+sh)*CPU_ARRAY_SIZE + (w+sw)];
}
}
cudaMemcpy(d_isub_matrix,isub_matrix,((GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)),cudaMemcpyHostToDevice);
//call the cuda kernel
dim3 blocks(BLOCK_SIZE, BLOCK_SIZE);
dim3 threads(THREAD_SIZE, THREAD_SIZE);
twiddle_factor<<<blocks, threads>>>(d_isub_matrix,d_osub_matrix);
cudaMemcpy(osub_matrix,d_osub_matrix,((GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);
for (sh = 0; sh < GPU_ARRAY_SIZE; sh++)
{
for (sw = 0; sw <GPU_ARRAY_SIZE; sw++)
{
big_matrix2[(h+sh)*CPU_ARRAY_SIZE + (w+sw)] = osub_matrix[sh*GPU_ARRAY_SIZE+sw];
printf(" sh %d sw %d %lf \n", sh, sw, osub_matrix[sh*GPU_ARRAY_SIZE+sw]);
}
}
printf("passei por aqui algumas vezes\n");
cudaFree(d_osub_matrix);
cudaFree(d_isub_matrix);
}
}
// cudaFree(d_osub_matrix);
// cudaFree(d_isub_matrix);
//Stop the time
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedtime,start,stop);
//showing the processing time
printf("The processing time took... %fms to execute everything",elapsedtime);
system("PAUSE");
for (sh = 0; sh < CPU_ARRAY_SIZE; sh++)
{
for (sw = 0; sw <CPU_ARRAY_SIZE; sw++)
{
printf(" sh %d sw %d %lf \n", sh, sw, big_matrix2[sh*CPU_ARRAY_SIZE+sw]);
}
}
system("PAUSE");
// I guess the result is "[1][0] = [1], [1][512] = [513], [513][0] = [524289], [513][512] = [524801]".
}
At a quick glance, the problem could and should be the following lines:
// start position of subarea of big array
cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
You are allocating too little memory for your double values on the GPU. Your sub matrix is allocated with 4 bytes per point where 8 bytes are needed.
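A minimal sketch of the corrected lines, assuming the rest of the code stays as posted (note that the two cudaMemcpy calls in the posted main() also use sizeof(float) for the same double buffers, so they need the same fix):
cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(double)); // 8 bytes per point
cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(double));
...
cudaMemcpy(d_isub_matrix,isub_matrix,((GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(double)),cudaMemcpyHostToDevice);
...
cudaMemcpy(osub_matrix,d_osub_matrix,((GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(double)),cudaMemcpyDeviceToHost);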