Wrong results cufft 3D in-place - cuda

I write because I'm facing problems with the cufft 3D transform in-place, while I have no problems for the out-of-place version. I tried to follow Robert Crovella's answer here but I'm not obtaining the correct results when I make a FFT+IFT.
This is my code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <complex.h>
#include <cuComplex.h>
#include <cufft.h>
// Main function
// Fills an N x N x N real volume, runs an in-place D2Z transform followed by
// an in-place Z2D transform on the GPU, and prints the volume before and after.
int main(int argc, char **argv){
int N = 4;
// in/out alias one host buffer and d_in/d_out alias one device buffer, which
// is what makes both transforms in-place.
double *in = NULL, *d_in = NULL;
cuDoubleComplex *out = NULL, *d_out = NULL;
cufftHandle plan_r2c, plan_c2r;
// Sized for the complex half-spectrum: N*N*(N/2+1) complex values.
unsigned int out_mem_size = sizeof(cuDoubleComplex) * N*N*(N/2 + 1);
unsigned int in_mem_size = out_mem_size;
in = (double *) malloc (in_mem_size);
out = (cuDoubleComplex *)in;
cudaMalloc((void **)&d_in, in_mem_size);
d_out = (cuDoubleComplex *)d_in;
cufftPlan3d(&plan_r2c, N, N, N, CUFFT_D2Z);
cufftPlan3d(&plan_c2r, N, N, N, CUFFT_Z2D);
memset(in, 0, in_mem_size);
unsigned int idx;
// Fill the real volume with z as the fastest-varying index.
// NOTE(review): rows are packed with stride N here, while the buffer was sized
// for rows of 2*(N/2+1) doubles. The in-place real-to-complex layout expects
// the padded stride, so this indexing looks like the cause of the wrong
// round-trip values -- confirm against the cuFFT data-layout documentation.
for (int z = 0; z < N; z++){
for (int y = 0; y < N; y++){
for (int x = 0; x < N; x++){
idx = z + N * ( y + x * N);
in[idx] = idx;
}
}
}
// Print the input, one z-plane at a time (x selects the row, y the column).
printf("\nStart: \n");
for (int z = 0; z < N; z++){
printf("plane = %d ----------------------------\n", z);
for (int x = 0; x < N; x++){
for (int y = 0; y < N; y++){
idx = z + N * ( y + x * N);
printf("%.3f \t", in[idx]);
}
printf("\n");
}
}
cudaMemcpy(d_in, in, in_mem_size, cudaMemcpyHostToDevice);
// Forward then inverse transform, both reading and writing the same buffer.
cufftExecD2Z(plan_r2c, (cufftDoubleReal *)d_in, (cufftDoubleComplex *)d_out);
cufftExecZ2D(plan_c2r, (cufftDoubleComplex *)d_out, (cufftDoubleReal *)d_in);
memset(in, 0, in_mem_size);
// NOTE(review): CU_ERR_CHECK is not defined anywhere in this listing, and it
// is the only CUDA/cuFFT call whose return value is checked.
CU_ERR_CHECK( cudaMemcpy(in, d_in, in_mem_size, cudaMemcpyDeviceToHost) );
printf("\nAfter FFT+IFT: \n");
for (int z = 0; z < N; z++){
printf("plane = %d ----------------------------\n", z);
for (int x = 0; x < N; x++){
for (int y = 0; y < N; y++){
idx = z + N * ( y + x * N);
// Normalisation: the round trip is divided by the number of elements, N^3.
in[idx] /= (N*N*N);
printf("%.3f \t", in[idx]);
}
printf("\n");
}
}
// NOTE(review): the plans, the device buffer and the host buffer are never
// released before returning.
return 0;
}
The program outputs the following data:
Starting file
plane = 0 ----------------------------
0.000 4.000 8.000 12.000
16.000 20.000 24.000 28.000
32.000 36.000 40.000 44.000
48.000 52.000 56.000 60.000
plane = 1 ----------------------------
1.000 5.000 9.000 13.000
17.000 21.000 25.000 29.000
33.000 37.000 41.000 45.000
49.000 53.000 57.000 61.000
plane = 2 ----------------------------
2.000 6.000 10.000 14.000
18.000 22.000 26.000 30.000
34.000 38.000 42.000 46.000
50.000 54.000 58.000 62.000
plane = 3 ----------------------------
3.000 7.000 11.000 15.000
19.000 23.000 27.000 31.000
35.000 39.000 43.000 47.000
51.000 55.000 59.000 63.000
After FFT+IFT
plane = 0 ----------------------------
-0.000 -0.344 8.000 12.000
-0.031 20.000 24.000 -0.031
32.000 36.000 0.031 44.000
48.000 -0.094 56.000 60.000
plane = 1 ----------------------------
1.000 -0.000 9.000 13.000
-0.000 21.000 25.000 0.125
33.000 37.000 0.000 45.000
49.000 0.000 57.000 61.000
plane = 2 ----------------------------
2.000 6.000 -0.000 14.000
18.000 0.000 26.000 30.000
0.000 38.000 42.000 -0.000
50.000 54.000 -0.000 62.000
plane = 3 ----------------------------
3.000 7.000 0.031 15.000
19.000 -0.031 27.000 31.000
-0.031 39.000 43.000 0.031
51.000 55.000 0.031 63.000
I even tried to pad the data this way:
// With padding
// NOTE(review): z now runs over the padded extent 2*(N/2+1), but the row
// stride in the index is still N, so consecutive rows overlap in memory.
unsigned int idx;
for (int x = 0; x < N; x++){
for (int y = 0; y < N; y++){
for (int z = 0; z < 2*(N/2+1); z++){
idx = z + N * ( y + x * N);
// NOTE(review): the literal 4 only matches the unpadded length when N == 4.
if (z < 4) in[idx] = idx;
else in[idx] = 0;
}
}
}
What am I doing wrong?

As you already found out, you need padding if you use the CUFFT_COMPATIBILITY_FFTW_PADDING compatibility mode, which is the default. For your code to work as written you could call cufftSetCompatibilityMode() to select CUFFT_COMPATIBILITY_NATIVE. However, this mode is marked as deprecated in the current version of CUDA.
Therefore, I recommend using the default compatibility mode together with padding. Your attempt to implement padding is wrong. The formula for the linear index of a 3-dimensional element (x, y, z), where z is the fastest-running index, is idx = z + Nz*(y + Ny*x). The size Nz of the z dimension including padding is Nz = (N/2+1)*2. The correct initialization of the array is then:
unsigned int idx;
// Length of one z-row in doubles once the fastest dimension is padded.
const int nz_padded = 2 * (N/2 + 1);
for (int z = 0; z < N; ++z) {
    for (int y = 0; y < N; ++y) {
        for (int x = 0; x < N; ++x) {
            // (x, y) selects the row, z the position inside the padded row.
            idx = z + nz_padded * (x * N + y);
            in[idx] = idx;
        }
    }
}
Accordingly for the print loops.

Related

Using cublasGemmBatchedEx

I am trying to use cublasGemmBatchedEx to perform matrix multiplication.
Here is my code.
#include <iostream>
#include <cublas_v2.h>
#define M 4
#define N 4
#define K 4
//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
// Prints every matrix of a batch to std::cout, one matrix row per line, with
// a blank line after each matrix. A[b] is indexed as a column-major
// rows x cols matrix: element (r, c) lives at A[b][c * rows + r].
void print_matrix(float **A, int rows, int cols, int batch_size) {
    for (int b = 0; b < batch_size; ++b) {
        const float *mat = A[b];
        for (int r = 0; r < rows; ++r) {
            for (int c = 0; c < cols; ++c)
                std::cout << mat[c * rows + r] << " ";
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
// Builds a batch of M x K and K x N host matrices, attempts a batched GEMM
// with cublasGemmBatchedEx, and prints the inputs and the result.
int main(int argc, char* argv[])
{
// Number of matrices in the batch
int batch_size = 2;
// Host-side arrays holding one pointer per matrix in the batch.
float *h_A[batch_size], *h_B[batch_size], *h_C[batch_size];
for (int i = 0; i < batch_size; i++){
h_A[i] = (float*)malloc(M * K * sizeof(float));
h_B[i] = (float*)malloc(K * N * sizeof(float));
h_C[i] = (float*)malloc(M * N * sizeof(float));
}
// Fill A with j%4, B with j%4 + 4, and clear C.
for (int i = 0; i < batch_size; i++){
for (int j = 0; j < M * K; j++)
h_A[i][j] = j%4;
for (int j = 0; j < K * N; j++)
h_B[i][j] = j%4 + 4;
for (int j = 0; j < M * N; j++)
h_C[i][j] = 0;
}
std::cout << "A =" << std::endl;
print_matrix(h_A, M, K, batch_size);
std::cout << "B =" << std::endl;
print_matrix(h_B, K, N, batch_size);
// Host-side arrays that receive one device pointer per matrix.
float *d_A[batch_size], *d_B[batch_size], *d_C[batch_size];
for (int i = 0; i < batch_size; i++){
cudaMalloc(&d_A[i], sizeof(float)* M * K);
cudaMalloc(&d_B[i], sizeof(float)* K * N);
cudaMalloc(&d_C[i], sizeof(float)* M * N);
}
// NOTE(review): d_A and h_A are both host arrays of pointers, so these copies
// move pointer values (with a byte count computed for matrix contents) rather
// than copying each matrix into its device allocation.
cudaMemcpy(d_A, h_A, sizeof(float)* M * K * batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, sizeof(float)* K * N * batch_size, cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
// Leading dimensions of A, B and C
int lda = M;
int ldb = K;
int ldc = M;
// Set the alpha and beta parameters for the gemm operation
float alpha = 1.0f;
float beta = 0.0f;
// NOTE(review): d_A, d_B and d_C are arrays that live in host memory; the
// batched call is handed their host addresses as the pointer arrays.
cublasStatus_t status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
&alpha,
(const void**)d_A, CUDA_R_32F, lda,
(const void**)d_B, CUDA_R_32F, ldb,
&beta,
(void**)d_C, CUDA_R_32F, ldc,
batch_size,
CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
// NOTE(review): same pattern as the host-to-device copies above: this copies
// between the two pointer arrays instead of per-matrix contents.
cudaMemcpy(h_C,d_C,sizeof(float) * M * N * batch_size, cudaMemcpyDeviceToHost);
if (status == CUBLAS_STATUS_SUCCESS) {
std::cout << "C =" << std::endl;
print_matrix(h_C, M, N, batch_size);
} else {
std::cout << status << std::endl;
}
// Destroy the handle
cublasDestroy(handle);
// NOTE(review): these calls receive the stack arrays themselves, not the
// per-matrix allocations made with cudaMalloc / malloc above.
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaFreeHost(h_A);
cudaFreeHost(h_B);
cudaFreeHost(h_C);
}
This is the result when I ran this code.
A =
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3
B =
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7
C =
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
The problem is that I don't get the expected result.
It comes full of zeros.
Is there any problem with my code?
The matrix arguments that you pass to this cublas function need to be an array of device pointers, where each device pointer is properly allocated, properly copied, and has a proper population of its allocation.
There are at least several problems with your attempt to do this. The central problem is around these lines:
cudaMemcpy(d_A, h_A, sizeof(float)* M * K * batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, sizeof(float)* K * N * batch_size, cudaMemcpyHostToDevice);
You cannot copy an array of independent allocations that way, and furthermore you are copying pointer data using sizes and types as if it were matrix contents (float * vs. float).
The following example has various issues fixed:
$ cat t2167.cu
#include <iostream>
#include <cublas_v2.h>
#define M 4
#define N 4
#define K 4
//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
// Writes each matrix in the batch to std::cout row by row, separating
// matrices with an empty line. Matrices are read in column-major order,
// i.e. element (row, col) of matrix b is A[b][col * rows + row].
void print_matrix(float **A, int rows, int cols, int batch_size) {
    for (int b = 0; b < batch_size; ++b) {
        for (int row = 0; row < rows; ++row) {
            // Walk along the row by stepping one full column at a time.
            const float *elem = A[b] + row;
            for (int col = 0; col < cols; ++col, elem += rows) {
                std::cout << *elem << " ";
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
// Multiplies batch_size pairs of column-major matrices (M x K times K x N)
// with cublasGemmBatchedEx, using one device allocation per matrix, and
// prints the inputs and the products. Returns 0 on success, 1 if cuBLAS or
// the result copy reported an error.
int main(int argc, char* argv[])
{
  // Compile-time constant so the pointer arrays below are ordinary
  // fixed-size arrays rather than compiler-extension variable-length arrays.
  const int batch_size = 2;

  // Host matrices, one allocation per batch entry.
  float *h_A[batch_size], *h_B[batch_size], *h_C[batch_size];
  for (int i = 0; i < batch_size; i++){
    h_A[i] = (float*)malloc(M * K * sizeof(float));
    h_B[i] = (float*)malloc(K * N * sizeof(float));
    h_C[i] = (float*)malloc(M * N * sizeof(float));
  }
  for (int i = 0; i < batch_size; i++){
    for (int j = 0; j < M * K; j++)
      h_A[i][j] = j%4;
    for (int j = 0; j < K * N; j++)
      h_B[i][j] = j%4 + 4;
    for (int j = 0; j < M * N; j++)
      h_C[i][j] = 0;
  }
  std::cout << "A =" << std::endl;
  print_matrix(h_A, M, K, batch_size);
  std::cout << "B =" << std::endl;
  print_matrix(h_B, K, N, batch_size);

  // d_A/d_B/d_C are host arrays holding one device pointer per matrix; each
  // matrix is copied into its own device allocation.
  float *d_A[batch_size], *d_B[batch_size], *d_C[batch_size];
  for (int i = 0; i < batch_size; i++){
    cudaMalloc(&d_A[i], sizeof(float)* M * K);
    cudaMemcpy(d_A[i], h_A[i], sizeof(float)*M*K, cudaMemcpyHostToDevice);
    cudaMalloc(&d_B[i], sizeof(float)* K * N);
    cudaMemcpy(d_B[i], h_B[i], sizeof(float)*N*K, cudaMemcpyHostToDevice);
    cudaMalloc(&d_C[i], sizeof(float)* M * N);
    cudaMemcpy(d_C[i], h_C[i], sizeof(float)*N*M, cudaMemcpyHostToDevice);
  }

  // The batched GEMM takes the pointer arrays themselves from device memory,
  // so mirror the three host pointer arrays onto the device.
  float **d_dA, **d_dB, **d_dC;
  cudaMalloc(&d_dA, sizeof(float *)*batch_size);
  cudaMalloc(&d_dB, sizeof(float *)*batch_size);
  cudaMalloc(&d_dC, sizeof(float *)*batch_size);
  cudaMemcpy(d_dA, d_A, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_dB, d_B, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_dC, d_C, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);

  cublasHandle_t handle = NULL;
  cublasStatus_t status = cublasCreate(&handle);
  // Remember whether the handle exists: a failed create must neither be
  // masked by the GEMM status nor followed by a destroy of a bad handle.
  const bool have_handle = (status == CUBLAS_STATUS_SUCCESS);

  // Leading dimensions for column-major storage.
  int lda = M;
  int ldb = K;
  int ldc = M;
  // Set the alpha and beta parameters for the gemm operation
  float alpha = 1.0f;
  float beta = 0.0f;
  if (have_handle) {
    status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
                                 &alpha,
                                 (const void**)d_dA, CUDA_R_32F, lda,
                                 (const void**)d_dB, CUDA_R_32F, ldb,
                                 &beta,
                                 (void**)d_dC, CUDA_R_32F, ldc,
                                 batch_size,
                                 CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
  }

  // Copy each product back; keep the last failing result so a failed copy is
  // reported instead of printing stale data.
  cudaError_t copy_err = cudaSuccess;
  for (int i = 0; i < batch_size; i++) {
    cudaError_t e = cudaMemcpy(h_C[i], d_C[i], sizeof(float)*M*N, cudaMemcpyDeviceToHost);
    if (e != cudaSuccess) copy_err = e;
  }

  const bool ok = (status == CUBLAS_STATUS_SUCCESS) && (copy_err == cudaSuccess);
  if (ok) {
    std::cout << "C =" << std::endl;
    print_matrix(h_C, M, N, batch_size);
  } else if (status != CUBLAS_STATUS_SUCCESS) {
    std::cout << status << std::endl;
  } else {
    std::cout << cudaGetErrorString(copy_err) << std::endl;
  }

  // Release the handle, the device pointer arrays, then every matrix.
  if (have_handle) cublasDestroy(handle);
  cudaFree(d_dA);
  cudaFree(d_dB);
  cudaFree(d_dC);
  for (int i = 0; i < batch_size; i++){
    free(h_A[i]);
    free(h_B[i]);
    free(h_C[i]);
    cudaFree(d_A[i]);
    cudaFree(d_B[i]);
    cudaFree(d_C[i]);
  }
  return ok ? 0 : 1;
}
$ nvcc -o t2167 t2167.cu -lcublas
$ compute-sanitizer ./t2167
========= COMPUTE-SANITIZER
A =
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3
B =
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7
C =
0 0 0 0
22 22 22 22
44 44 44 44
66 66 66 66
0 0 0 0
22 22 22 22
44 44 44 44
66 66 66 66
========= ERROR SUMMARY: 0 errors
$
Given that cublasGemmBatchedEx() requires all the matrices across the batch to be of identical size, and given that we are starting with a clean slate for this exercise, we can use a somewhat simpler realization: concatenate the matrices so that a single allocation serves the whole batch. This won't work for every use case, but may be of interest:
#include <iostream>
#include <cublas_v2.h>
#define M 4
#define N 4
#define K 4
//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
// Prints a batch of matrices stored back to back in one buffer. Matrix b
// starts at A + b*rows*cols and is read column-major, so element (r, c) is at
// offset c*rows + r. Each matrix row goes on its own line and every matrix is
// followed by a blank line.
void print_matrix(float *A, int rows, int cols, int batch_size) {
    for (int b = 0; b < batch_size; ++b) {
        const float *mat = A + b * rows * cols;
        for (int r = 0; r < rows; ++r) {
            for (int c = 0; c < cols; ++c)
                std::cout << mat[c * rows + r] << " ";
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
// Same batched GEMM as the per-matrix version, but all matrices of a kind are
// concatenated into a single host buffer and a single device buffer. The
// per-matrix device pointers handed to cublasGemmBatchedEx are offsets into
// those buffers. Returns 0 on success, 1 if cuBLAS or the result copy failed.
int main(int argc, char* argv[])
{
  // Compile-time constant so the pointer arrays below are ordinary
  // fixed-size arrays rather than compiler-extension variable-length arrays.
  const int batch_size = 2;

  // One contiguous host buffer per operand, holding batch_size matrices.
  float *h_A, *h_B, *h_C;
  h_A = (float*)malloc(M * K * sizeof(float) * batch_size);
  h_B = (float*)malloc(K * N * sizeof(float) * batch_size);
  h_C = (float*)malloc(M * N * sizeof(float) * batch_size);
  for (int i = 0; i < batch_size; i++){
    for (int j = 0; j < M * K; j++)
      h_A[i*M*K+j] = j%4;
    for (int j = 0; j < K * N; j++)
      h_B[i*K*N+j] = j%4 + 4;
    for (int j = 0; j < M * N; j++)
      h_C[i*M*N+j] = 0;
  }
  std::cout << "A =" << std::endl;
  print_matrix(h_A, M, K, batch_size);
  std::cout << "B =" << std::endl;
  print_matrix(h_B, K, N, batch_size);

  // One device allocation and one copy per operand.
  float *d_A, *d_B, *d_C;
  cudaMalloc(&d_A, sizeof(float)* M * K*batch_size);
  cudaMemcpy(d_A, h_A, sizeof(float)*M*K*batch_size, cudaMemcpyHostToDevice);
  cudaMalloc(&d_B, sizeof(float)* K * N*batch_size);
  cudaMemcpy(d_B, h_B, sizeof(float)*N*K*batch_size, cudaMemcpyHostToDevice);
  cudaMalloc(&d_C, sizeof(float)* M * N*batch_size);
  cudaMemcpy(d_C, h_C, sizeof(float)*N*M*batch_size, cudaMemcpyHostToDevice);

  // Build the per-matrix device pointers on the host as offsets into the
  // concatenated buffers, then mirror the pointer arrays onto the device.
  float *h_dA[batch_size], *h_dB[batch_size], *h_dC[batch_size];
  for (int i = 0; i < batch_size; i++){
    h_dA[i] = d_A+i*M*K;
    h_dB[i] = d_B+i*K*N;
    h_dC[i] = d_C+i*M*N;
  }
  float **d_dA, **d_dB, **d_dC;
  cudaMalloc(&d_dA, sizeof(float *)*batch_size);
  cudaMalloc(&d_dB, sizeof(float *)*batch_size);
  cudaMalloc(&d_dC, sizeof(float *)*batch_size);
  cudaMemcpy(d_dA, h_dA, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_dB, h_dB, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_dC, h_dC, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);

  cublasHandle_t handle = NULL;
  cublasStatus_t status = cublasCreate(&handle);
  // Remember whether the handle exists: a failed create must neither be
  // masked by the GEMM status nor followed by a destroy of a bad handle.
  const bool have_handle = (status == CUBLAS_STATUS_SUCCESS);

  // Leading dimensions for column-major storage.
  int lda = M;
  int ldb = K;
  int ldc = M;
  // Set the alpha and beta parameters for the gemm operation
  float alpha = 1.0f;
  float beta = 0.0f;
  if (have_handle) {
    status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
                                 &alpha,
                                 (const void**)d_dA, CUDA_R_32F, lda,
                                 (const void**)d_dB, CUDA_R_32F, ldb,
                                 &beta,
                                 (void**)d_dC, CUDA_R_32F, ldc,
                                 batch_size,
                                 CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
  }

  // Single copy brings back every product; check it so a failed copy is
  // reported instead of printing stale data.
  cudaError_t copy_err = cudaMemcpy(h_C, d_C, sizeof(float)*M*N*batch_size, cudaMemcpyDeviceToHost);

  const bool ok = (status == CUBLAS_STATUS_SUCCESS) && (copy_err == cudaSuccess);
  if (ok) {
    std::cout << "C =" << std::endl;
    print_matrix(h_C, M, N, batch_size);
  } else if (status != CUBLAS_STATUS_SUCCESS) {
    std::cout << status << std::endl;
  } else {
    std::cout << cudaGetErrorString(copy_err) << std::endl;
  }

  // Release the handle, the device pointer arrays, then the data buffers.
  if (have_handle) cublasDestroy(handle);
  cudaFree(d_dA);
  cudaFree(d_dB);
  cudaFree(d_dC);
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
  free(h_A);
  free(h_B);
  free(h_C);
  return ok ? 0 : 1;
}

cuda batched cholesky factorization

I kind of understand how to deal with 2D indexing in CUDA, but batched Cholesky ends up needing four loop indices towards the end of the algorithm. I have attached the serial Cholesky code and my CUDA attempt below in case anyone can give me a hint.
int i, k, m, n;
// Batched Cholesky factorization: matrix i occupies N*N floats starting at
// dA[i*N*N] and is factorized in place.
for (i = 0; i < batch; ++i) {
    float *pA = dA + i*N*N;
    for (k = 0; k < N; ++k) {
        float *row_k = pA + k*N;
        // Panel factorization: square-root the pivot, then scale the rest of
        // row k by it.
        row_k[k] = sqrtf(row_k[k]);
        for (m = k+1; m < N; ++m)
            row_k[m] /= row_k[k];
        // Update of the trailing submatrix using the freshly scaled row k.
        for (n = k+1; n < N; ++n) {
            float *row_n = pA + n*N;
            for (m = n; m < N; ++m)
                row_n[m] -= (row_k[n]*row_k[m]);
        }
    }
}
Cuda:
// Kernel body: each thread derives one (i, k, m, n) combination from its
// 3D thread/block coordinates.
int i = blockIdx.x * blockDim.x + threadIdx.x;
int k = blockIdx.y * blockDim.y + threadIdx.y;
// NOTE(review): m and n are computed from the same z coordinate, so they are
// always equal within a thread.
int m = blockIdx.z * blockDim.z + threadIdx.z;
int n = blockIdx.z * blockDim.z + threadIdx.z;
if( k >= N || m >= N || n >= N || i >= batch ) return;
float *pA = &dA[i*N*N];
// NOTE(review): in the serial version these three statements run inside a
// sequential loop over k, restricted to m > k and m >= n > k. Here every
// thread executes all three once with no ordering between threads and none
// of those range restrictions.
pA[k*N+k] = sqrtf(pA[k*N+k]);
pA[k*N+m] /= pA[k*N+k];
pA[n*N+m] -= (pA[k*N+n]*pA[k*N+m]);
starter:
// NOTE(review): the names are swapped relative to their use: the launch below
// passes dimBlock as the grid dimensions (first <<<>>> argument) and dimGrid
// as the block dimensions (second argument).
dim3 dimBlock( (batch+31)/32, (n+31)/32, (n+31)/32 );
// NOTE(review): used as the block shape this is 32*32*32 threads per block,
// which is more than the 1024-thread block limit -- confirm against the
// device limits before relying on this launch.
dim3 dimGrid( 32, 32, 32);
spotrf_batched_kernel<<< dimBlock, dimGrid, 0, stream>>>(n, batch, dA);
I am going to leave this here without much comment. The code is relatively self-explanatory. This implementation is completely faithful to your serial version, with the following features:
Each block performs exactly one factorization in the batch. Run as many blocks as there are batched matrices to factorize.
Because the factorization is all done at block scope, synchronization between parallel operations is possible, so the order of operations of the factorization is respected
The only parallelism the algorithm exposes is within the row operations of the factorization and update operations
Blocks should be sized according to the number of rows in the batch matrix size in round multiples of the warp size (32 on all CUDA capable devices to date)
The code below has been extremely lightly tested and is not guaranteed to work or be correct. Use at your own peril:
#include <algorithm>
#include <cmath>
#include <iostream>
// Batched in-place Cholesky factorization, one thread block per matrix.
//
// Launch layout: 1D grid with at least nbatches blocks; 1D blocks of any
// size (threads stride over the columns, so blockDim.x need not cover N).
// Shared memory: none.
//
// batches  - array of nbatches device-accessible pointers, one per matrix
// nbatches - number of matrices to factorize
// N        - order of each matrix
// LDA      - stride in floats between consecutive rows of a matrix
//            (presumably LDA >= N -- confirm with the caller)
//
// Each matrix is overwritten following the same update order as the serial
// reference: for each k the pivot is square-rooted, row k is scaled by it,
// and rows n > k are updated for columns m >= n.
__global__
void batchkernel(float** batches, int nbatches, int N, int LDA)
{
    // The guard depends only on blockIdx, so whole blocks take the same path
    // and the barriers inside are reached by every thread of the block.
    if (blockIdx.x < nbatches) {
        float* pA = batches[blockIdx.x];

        for (int k = 0; k < N; k++) {

            // Panel factorization: one thread updates the pivot.
            if (threadIdx.x == 0) {
                pA[k*LDA+k] = sqrtf(pA[k*LDA+k]);
            }
            __syncthreads();

            // Scale columns m > k of row k. The bound is on the column index
            // m, not on threadIdx.x, so columns beyond blockDim.x are also
            // processed when N exceeds the block size.
            for (int m = k + 1 + threadIdx.x; m < N; m += blockDim.x) {
                pA[k*LDA+m] /= pA[k*LDA+k];
            }
            __syncthreads();

            // Update of the trailing submatrix. Row k is only read here and
            // each (n, m) element is written by exactly one thread, so no
            // barrier is needed between rows.
            for (int n = k+1; n < N; n++) {
                for (int m = n + threadIdx.x; m < N; m += blockDim.x) {
                    pA[n*LDA+m] -= pA[k*LDA+n] * pA[k*LDA+m];
                }
            }
            __syncthreads();
        }
    }
}
// Host reference: in-place Cholesky-style factorization of one N x N matrix
// stored with row stride N. Only elements pA[r*N + c] with c >= r are
// modified; the remaining entries are left as they were.
void refCholeskey(float* pA, int N)
{
    for (int k = 0; k < N; ++k) {
        float* row_k = pA + k*N;

        // Panel factorization: take the square root of the pivot and divide
        // the rest of row k by it.
        row_k[k] = sqrtf(row_k[k]);
        for (int m = k+1; m < N; ++m) {
            row_k[m] /= row_k[k];
        }

        // Update of the trailing submatrix with the scaled row k.
        for (int n = k+1; n < N; ++n) {
            float* row_n = pA + n*N;
            for (int m = n; m < N; ++m) {
                row_n[m] -= (row_k[n]*row_k[m]);
            }
        }
    }
}
// Test driver: factorizes nbatches copies of one 10 x 10 SPD matrix on the
// GPU with batchkernel, factorizes the same matrix on the host with
// refCholeskey, and prints the largest relative difference found across all
// batches. Returns 1 if the kernel launch or execution reported an error.
int main()
{
    // B = np.random.random((10,10))
    // SPDmatrix = (0.5*(B+B.T)) + B.shape[0]*np.eye(B.shape[0])
    const int N = 10;
    const int LDA = 10;
    float SPDmatrix[LDA*N] = {
        10.22856331, 0.17380577, 0.61779525, 0.66592082, 0.46915566,
        0.09946502, 0.69386511, 0.35224291, 0.53155506, 0.51441469,
        0.17380577, 10.67971161, 0.34481401, 0.64766522, 0.22372943,
        0.55896022, 0.59083588, 0.48872497, 0.54049871, 0.74764959,
        0.61779525, 0.34481401, 10.229388, 0.40904432, 0.5015491,
        0.52152334, 0.19684814, 0.28262256, 0.04384535, 0.61919751,
        0.66592082, 0.64766522, 0.40904432, 10.78410647, 0.12708693,
        0.3241063, 0.6984497, 0.65074097, 0.08027563, 0.56332844,
        0.46915566, 0.22372943, 0.5015491, 0.12708693, 10.52234091,
        0.76346103, 0.80932473, 0.8234331, 0.52737611, 0.65777357,
        0.09946502, 0.55896022, 0.52152334, 0.3241063, 0.76346103,
        10.54906761, 0.32865411, 0.32467483, 0.80720007, 0.36287463,
        0.69386511, 0.59083588, 0.19684814, 0.6984497, 0.80932473,
        0.32865411, 10.29729551, 0.34707933, 0.69379356, 0.87612982,
        0.35224291, 0.48872497, 0.28262256, 0.65074097, 0.8234331,
        0.32467483, 0.34707933, 10.42929929, 0.78849458, 0.159371,
        0.53155506, 0.54049871, 0.04384535, 0.08027563, 0.52737611,
        0.80720007, 0.69379356, 0.78849458, 10.49604818, 0.43871288,
        0.51441469, 0.74764959, 0.61919751, 0.56332844, 0.65777357,
        0.36287463, 0.87612982, 0.159371, 0.43871288, 10.94535485 };

    // Managed allocations: the pointer table and every matrix are accessible
    // from both the host loops below and the kernel.
    const int nbatches = 8;
    float** batches;
    cudaMallocManaged((void **)&batches, nbatches * sizeof(float*));
    for(int i=0; i<nbatches; i++) {
        cudaMallocManaged((void **)&batches[i], N * LDA * sizeof(float));
        cudaMemcpy(batches[i], SPDmatrix, N * LDA * sizeof(float), cudaMemcpyDefault);
    }

    // One block per matrix.
    int blocksz = 32;
    int nblocks = nbatches;
    batchkernel<<<nblocks, blocksz>>>(batches, nbatches, N, LDA);
    // Launch-configuration errors are only visible through cudaGetLastError.
    cudaError_t launch_err = cudaGetLastError();

    // Host reference runs while the kernel executes asynchronously. The GPU
    // copies were already taken, so overwriting SPDmatrix here is safe.
    refCholeskey(SPDmatrix, N);

    // Execution errors surface at the synchronizing call; the managed data
    // must not be read on the host before this returns.
    cudaError_t sync_err = cudaDeviceSynchronize();
    if (launch_err != cudaSuccess || sync_err != cudaSuccess) {
        cudaError_t err = (launch_err != cudaSuccess) ? launch_err : sync_err;
        std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl;
        cudaDeviceReset();
        return 1;
    }

    // Compare every batch entry against the reference, not just the first.
    float maxabsrelerror = 0.0f;
    for(int b = 0; b < nbatches; b++) {
        for(int i = 0; i < N*N; i++) {
            float absrelerror = std::fabs(SPDmatrix[i] - batches[b][i]) / std::fabs(SPDmatrix[i]);
            maxabsrelerror = std::max(absrelerror, maxabsrelerror);
        }
    }
    std::cout << "Maximum absolute relative error = " << maxabsrelerror << std::endl;

    // Release the matrices and the pointer table explicitly.
    for(int i=0; i<nbatches; i++) {
        cudaFree(batches[i]);
    }
    cudaFree(batches);
    cudaDeviceReset();
    return 0;
}

cuFFT wrong results only when starting from complex

I was helped before in this answer to realise an in-place transform and it works well but ONLY if I start with real data. If I start with complex data, the results after IFT+FFT are wrong, and this happens only in the in-place version, I have perfect results with an out-of-place version of this transform.
This is the code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <complex.h>
#include <cuComplex.h>
#include <cufft.h>
#include <cufftXt.h>
#define N 4
#define N_PAD ( 2*(N/2+1) )
// Prints the N x N x N real volume plane by plane (z selects the plane, x the
// printed row, y the printed column). Elements are read with a z-row stride
// of N_PAD, so the padding entries at the end of each z-row are skipped.
void print_3D_Real(double *array){
    printf("\nPrinting 3D real matrix \n");
    for (int plane = 0; plane < N; ++plane) {
        printf("---------------------------------------------------------------------------- plane %d below\n", plane);
        for (int row = 0; row < N; ++row) {
            for (int col = 0; col < N; ++col) {
                const unsigned long int offset = plane + N_PAD * (col + row * N);
                printf("%.3f \t", array[offset]);
            }
            printf("\n");
        }
    }
}
// Prints the N x N x (N/2+1) complex volume plane by plane (z selects the
// plane, x the printed row, y the printed column), showing each value as
// signed real and imaginary parts.
void print_3D_Comp(cuDoubleComplex *array){
    const int half = N/2 + 1;   // number of complex entries along z
    printf("\nPrinting 3D complex matrix \n");
    for (int plane = 0; plane < half; ++plane) {
        printf("---------------------------------------------------------------------------- plane %d below\n", plane);
        for (int row = 0; row < N; ++row) {
            for (int col = 0; col < N; ++col) {
                const unsigned long int offset = plane + half * (col + row * N);
                const cuDoubleComplex value = array[offset];
                printf("%+.3f%+.3fi \t", value.x, value.y);
            }
            printf("\n");
        }
    }
}
// Main function
// Starts from complex data, runs an in-place Z2D transform, normalises the
// real result, runs an in-place D2Z transform back, and prints each stage.
int main(int argc, char **argv){
// NOTE(review): CU_ERR_CHECK is not defined anywhere in this listing.
CU_ERR_CHECK( cudaSetDevice(0) );
unsigned long int idx, in_mem_size, out_mem_size;
// in/out alias one host buffer and d_in/d_out alias one device buffer, so
// both transforms work in place.
cuDoubleComplex *in = NULL, *d_in = NULL;
double *out = NULL, *d_out = NULL;
cufftHandle plan_r2c, plan_c2r;
// Sized for the complex half-spectrum: N*N*(N/2+1) complex values.
in_mem_size = sizeof(cuDoubleComplex) * N*N*(N/2+1);
out_mem_size = in_mem_size;
in = (cuDoubleComplex *) malloc (in_mem_size);
out = (double *) in;
cudaMalloc((void **)&d_in, in_mem_size);
d_out = (double *) d_in;
cufftPlan3d(&plan_c2r, N, N, N, CUFFT_Z2D);
cufftPlan3d(&plan_r2c, N, N, N, CUFFT_D2Z);
// Both memsets clear the same buffer, since out aliases in.
memset(in, 0, in_mem_size);
memset(out, 0, out_mem_size);
// Initial complex data: real part set to the linear index, imaginary part
// left at zero.
for (int x = 0; x < N; x++){
for (int y = 0; y < N; y++){
for (int z = 0; z < (N/2+1); z++){
idx = z + (N/2+1) * (y + x * N);
in[idx].x = idx;
}
}
}
print_3D_Comp(in);
cudaMemcpy(d_in, in, in_mem_size, cudaMemcpyHostToDevice);
cufftExecZ2D(plan_c2r, (cufftDoubleComplex *)d_in, (cufftDoubleReal *)d_out);
cudaMemcpy(out, d_out, out_mem_size, cudaMemcpyDeviceToHost);
// Normalisation: divide every stored double (padding included) by N^3.
for (int i = 0; i < N*N*N_PAD; i++)
out[i] /= (N*N*N);
print_3D_Real(out);
cudaMemcpy(d_out, out, out_mem_size, cudaMemcpyHostToDevice);
cufftExecD2Z(plan_r2c, (cufftDoubleReal *)d_out, (cufftDoubleComplex *)d_in);
// NOTE(review): this line has an unmatched closing parenthesis, apparently
// left over from a removed CU_ERR_CHECK( wrapper; it does not compile as is.
cudaMemcpy(in, d_in, in_mem_size, cudaMemcpyDeviceToHost) );
print_3D_Comp(in);
cudaDeviceReset();
return 0;
}
The output of my program is on this pastebin.
Can someone direct me on the right path? Thank you very much in advance.
First of all, your code doesn't compile.
In its most general definition, the fourier transform performs a mapping from one complex domain to another complex domain, and this operation should be reversible.
However, the C2R and R2C are special cases, with an assumption that the signal is completely representable in one of the 2 domains (the "time" domain) as a purely real signal (all imaginary components are zero).
However, it should be evident that there will be some complex "frequency" domain representations that cannot be represented by a purely real time domain signal. If the counter case were true (any complex frequency domain signal can be represented as a purely real time domain signal) then the FFT could not be reversible for a complex time domain signal (since all frequency domain data sets map to purely real time domain data sets.)
Therefore you cannot choose arbitrary data in the frequency domain, and expect it to map correctly into a purely real time domain signal. (*)
As a demonstration, change your input data set to the following:
in[idx].x = (idx)?0:1;
and I believe you will get a "passing" test case.
Furthermore, your allegation that "I have perfect results with an out-of-place version of this transform" I believe cannot be supported, if you are in fact using this particular data set as posted in your question. If you disagree, please post a complete code demonstrating your passing test case with the out-of-place transform, that is otherwise identical to your posted code.
Finally, we can test this with fftw. A conversion of your program to use fftw instead of cufft produces exactly the same output:
$ cat t355.cpp
#include <stdio.h>
#include <stdlib.h>
#include <fftw3.h>
#include <string.h>
#define N 4
#define N_PAD ( 2*(N/2+1) )
// Dumps the real N x N x N volume one z-plane at a time. Within a plane, x
// indexes the printed row and y the printed column. Each z-row in memory is
// N_PAD doubles long, so the trailing padding entries are never printed.
void print_3D_Real(double *array){
    printf("\nPrinting 3D real matrix \n");
    for (int z = 0; z < N; z++) {
        printf("---------------------------------------------------------------------------- plane %d below\n", z);
        for (int x = 0; x < N; x++) {
            // Start of the padded z-rows belonging to this x.
            const unsigned long int x_base = N_PAD * (x * N);
            for (int y = 0; y < N; y++) {
                printf("%.3f \t", array[x_base + N_PAD * y + z]);
            }
            printf("\n");
        }
    }
}
// Dumps the complex N x N x (N/2+1) volume one z-plane at a time. Within a
// plane, x indexes the printed row and y the printed column. Each value is
// shown as signed real and imaginary parts.
void print_3D_Comp(fftw_complex *array){
    const int nz = N/2 + 1;   // complex entries along the fastest dimension
    printf("\nPrinting 3D complex matrix \n");
    for (int z = 0; z < nz; z++) {
        printf("---------------------------------------------------------------------------- plane %d below\n", z);
        for (int x = 0; x < N; x++) {
            for (int y = 0; y < N; y++) {
                const unsigned long int pos = z + nz * (y + x * N);
                const double re = array[pos][0];
                const double im = array[pos][1];
                printf("%+.3f%+.3fi \t", re, im);
            }
            printf("\n");
        }
    }
}
// Main function
int main(int argc, char **argv){
unsigned long int idx, in_mem_size, out_mem_size;
fftw_complex *in = NULL;
double *out = NULL;
in_mem_size = sizeof(fftw_complex) * N*N*(N/2+1);
out_mem_size = in_mem_size;
in = (fftw_complex *) malloc (in_mem_size);
out = (double *) in;
memset(in, 0, in_mem_size);
memset(out, 0, out_mem_size);
// Initial complex data
for (int x = 0; x < N; x++){
for (int y = 0; y < N; y++){
for (int z = 0; z < (N/2+1); z++){
idx = z + (N/2+1) * (y + x * N);
in[idx][0] = idx;
}
}
}
print_3D_Comp(in);
fftw_plan plan_c2r = fftw_plan_dft_c2r_3d(N, N, N, in, out, FFTW_ESTIMATE);
fftw_plan plan_r2c = fftw_plan_dft_r2c_3d(N, N, N, out, in, FFTW_ESTIMATE);
fftw_execute(plan_c2r);
// Normalisation
for (int i = 0; i < N*N*N_PAD; i++)
out[i] /= (N*N*N);
print_3D_Real(out);
fftw_execute(plan_r2c);
print_3D_Comp(in);
return 0;
}
$ g++ t355.cpp -o t355 -lfftw3
$ ./t355
Printing 3D complex matrix
---------------------------------------------------------------------------- plane 0 below
+0.000+0.000i +3.000+0.000i +6.000+0.000i +9.000+0.000i
+12.000+0.000i +15.000+0.000i +18.000+0.000i +21.000+0.000i
+24.000+0.000i +27.000+0.000i +30.000+0.000i +33.000+0.000i
+36.000+0.000i +39.000+0.000i +42.000+0.000i +45.000+0.000i
---------------------------------------------------------------------------- plane 1 below
+1.000+0.000i +4.000+0.000i +7.000+0.000i +10.000+0.000i
+13.000+0.000i +16.000+0.000i +19.000+0.000i +22.000+0.000i
+25.000+0.000i +28.000+0.000i +31.000+0.000i +34.000+0.000i
+37.000+0.000i +40.000+0.000i +43.000+0.000i +46.000+0.000i
---------------------------------------------------------------------------- plane 2 below
+2.000+0.000i +5.000+0.000i +8.000+0.000i +11.000+0.000i
+14.000+0.000i +17.000+0.000i +20.000+0.000i +23.000+0.000i
+26.000+0.000i +29.000+0.000i +32.000+0.000i +35.000+0.000i
+38.000+0.000i +41.000+0.000i +44.000+0.000i +47.000+0.000i
Printing 3D real matrix
---------------------------------------------------------------------------- plane 0 below
23.500 -1.500 -1.500 -1.500
-6.000 0.000 0.000 0.000
-6.000 0.000 0.000 0.000
-6.000 0.000 0.000 0.000
---------------------------------------------------------------------------- plane 1 below
-0.500 0.750 0.000 -0.750
3.000 0.000 0.000 0.000
0.000 0.000 0.000 0.000
-3.000 0.000 0.000 0.000
---------------------------------------------------------------------------- plane 2 below
0.000 0.000 0.000 0.000
0.000 0.000 0.000 0.000
0.000 0.000 0.000 0.000
0.000 0.000 0.000 0.000
---------------------------------------------------------------------------- plane 3 below
-0.500 -0.750 0.000 0.750
-3.000 0.000 0.000 0.000
0.000 0.000 0.000 0.000
3.000 0.000 0.000 0.000
Printing 3D complex matrix
---------------------------------------------------------------------------- plane 0 below
+0.000+0.000i +6.000+0.000i +6.000+0.000i +6.000+0.000i
+24.000+0.000i +30.000+0.000i +30.000+0.000i +30.000+0.000i
+24.000+0.000i +30.000+0.000i +30.000+0.000i +30.000+0.000i
+24.000+0.000i +30.000+0.000i +30.000+0.000i +30.000+0.000i
---------------------------------------------------------------------------- plane 1 below
+1.000+0.000i +4.000+0.000i +7.000+0.000i +10.000+0.000i
+13.000+0.000i +16.000+0.000i +19.000+0.000i +22.000+0.000i
+25.000+0.000i +28.000+0.000i +31.000+0.000i +34.000+0.000i
+37.000+0.000i +40.000+0.000i +43.000+0.000i +46.000+0.000i
---------------------------------------------------------------------------- plane 2 below
+2.000+0.000i +8.000+0.000i +8.000+0.000i +8.000+0.000i
+26.000+0.000i +32.000+0.000i +32.000+0.000i +32.000+0.000i
+26.000+0.000i +32.000+0.000i +32.000+0.000i +32.000+0.000i
+26.000+0.000i +32.000+0.000i +32.000+0.000i +32.000+0.000i
$
(*) You can argue, if you wish, that the complex-conjugate symmetry feature of the C2R and R2C transforms should account for a correct mapping of all possible complex "frequency" domain signals into unique, purely real "time" domain signals. I claim, without proof, that it does not, based on 2 data points:
The example code in this question.
Since the complex space in a C2R or R2C transform is numerically larger than the real space (by a factor of (2*(N/2+1))/N), it stands to reason that there cannot be a unique 1:1 mapping of all possible complex signals into unique real signals. And the unique 1:1 mapping would be necessary for full reversibility.
For additional background on the possibility of lack of symmetry in random data, note the discussion around CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC in the cufft documentation.

CUDA in-place transpose doesn't complete transpose total matrix [closed]

Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 7 years ago.
Improve this question
I've written the CUDA code below. It's supposed to transpose a matrix using tiling blocks, and the code works when using small values, but when using, for example:
TILE = 32, matrix 128 x 128, it doesn't complete the transpose, it stops after 96. In host this is my dimension thread/block
// Grid: ceil(nEven / TILE_DIM) blocks along each axis, so a matrix side that
// is not a multiple of TILE_DIM still gets a block for the partial tile.
dim3 dimGrid((nEven + TILE_DIM - 1) / TILE_DIM, (nEven + TILE_DIM - 1) / TILE_DIM);
// Block: one thread per element of a TILE_DIM x TILE_DIM tile.
dim3 dimBlock(TILE_DIM, TILE_DIM);
where I set the block dimensions equal to the tile dimensions,
the global code is simple and it should theoretically work:
// In-place tiled transpose attempt: each block stages one TILE_DIM x TILE_DIM
// tile of idata in shared memory and writes it back to the mirrored position.
// nEven (the matrix side) is not a parameter; presumably a global or macro
// defined elsewhere -- confirm.
__global__ void transposeMain( int *idata)
{
__shared__ int tile2[TILE_DIM][TILE_DIM];
int yyy = blockIdx.y * TILE_DIM ; // col values (0,32,64,96)
int xxx = blockIdx.x * TILE_DIM ; // row values (0,32,64,96)
// The guard depends only on blockIdx, so all threads of a block take the
// same path and the barrier inside is reached uniformly.
// NOTE(review): it checks the tile origin only, not the per-thread indices.
if (xxx < nEven && yyy < nEven)
{
// Read element (row xxx+tx, col yyy+ty) of this block's tile.
tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];
__syncthreads();
// Write the same value to the mirrored position (row yyy+ty, col xxx+tx).
// NOTE(review): that position belongs to another block's tile of the same
// buffer. Nothing orders this write against that block's read, so the
// original value can be overwritten before it has been read.
idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];
}
}
Any idea what might be the problem?
The problem is you are trying to do an in-place transpose.
CUDA device code execution is broken up into threadblocks. Threadblocks (groups of threads) can execute in any order, and do not all (typically) execute at the same time. So when you read a tile in here:
tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];
That is OK. But when you write the tile:
idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];
You are frequently over-writing data (in some other tile in the original matrix) which you haven't read yet (because the threadblock responsible for reading that tile hasn't even begun to execute yet). Once you overwrite it like this, it's lost.
The solution (for square matrix transpose) has several aspects to it:
Each threadblock must first read 2 tiles. These 2 tiles from the input data will be swapped.
Then each threadblock can write those two tiles.
The tiles along the main diagonal need special casing.
Since most threadblocks handle 2 tiles, only threadblocks on, or on one side of, the main diagonal need to do any work.
You haven't shown a complete MCVE (which is expected when you have questions like this), and your code has other issues such as the potential for uncoalesced access (lower performance) so I'm not going to try to "fix" your code.
Instead, here's a fully worked example, lifted from here:
$ cat t469.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <cublas_v2.h>
#define uS_PER_SEC 1000000
#define uS_PER_mS 1000
#define N 4096
#define M 4096
#define TILE_DIM 32
#define BLOCK_ROWS 8
// Out-of-place tiled transpose: odata = transpose(idata).
//
// Each block moves one TILE_DIM x TILE_DIM tile through shared memory so that both the
// global loads and the global stores walk consecutive addresses along threadIdx.x.
//
// Launch layout: blockDim = (TILE_DIM, BLOCK_ROWS); gridDim.x tiles across the columns of
// idata and gridDim.y tiles down its rows. Both matrix dimensions must be multiples of
// TILE_DIM because no bounds checks are performed.
//   idata: row-major, (gridDim.y*TILE_DIM) rows x (gridDim.x*TILE_DIM) columns
//   odata: row-major, (gridDim.x*TILE_DIM) rows x (gridDim.y*TILE_DIM) columns
__global__ void transposeCoalesced(float *odata, const float *idata)
{
  // One extra column of padding so that column-wise reads of the tile do not all hit
  // the same shared-memory bank.
  __shared__ float tile[TILE_DIM][TILE_DIM+1];

  const int in_width  = gridDim.x * TILE_DIM;  // row stride of idata
  // Row stride of odata. The previous code reused in_width here, which is only valid
  // for square matrices; for square launches the two values are identical.
  const int out_width = gridDim.y * TILE_DIM;

  int x = blockIdx.x * TILE_DIM + threadIdx.x;
  int y = blockIdx.y * TILE_DIM + threadIdx.y;
  // Every thread copies TILE_DIM / BLOCK_ROWS elements of its column into the tile.
  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*in_width + x];

  // The whole tile must be populated before any thread reads it transposed.
  __syncthreads();

  x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
  y = blockIdx.x * TILE_DIM + threadIdx.y;
  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    odata[(y+j)*out_width + x] = tile[threadIdx.x][threadIdx.y + j];
}
// In-place transpose of a square row-major matrix, one pair of mirrored tiles per block.
//
// Launch layout: blockDim = (TILE_DIM, BLOCK_ROWS) on a square grid
// (gridDim.x == gridDim.y); the matrix side is gridDim.x * TILE_DIM.
// Blocks with blockIdx.y < blockIdx.x have nothing to do: their tile is handled by the
// mirrored block. A diagonal block transposes its single tile onto itself.
__global__ void iptransposeCoalesced(float *data)
{
  __shared__ float tile_s[TILE_DIM][TILE_DIM+1];
  __shared__ float tile_d[TILE_DIM][TILE_DIM+1];

  // Block-uniform exit, so no thread of this block is left waiting at the barrier below.
  if (blockIdx.y < blockIdx.x) return;

  const bool offDiagonal = (blockIdx.y != blockIdx.x);
  const int width = gridDim.x * TILE_DIM;
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;

  // "Source" tile of this block and its mirror image across the main diagonal.
  // On the diagonal both coordinate pairs are identical.
  const int srcCol = blockIdx.x * TILE_DIM + tx;
  const int srcRow = blockIdx.y * TILE_DIM + ty;
  const int dstCol = blockIdx.y * TILE_DIM + tx;
  const int dstRow = blockIdx.x * TILE_DIM + ty;

  // Stage both tiles in shared memory before anything is overwritten.
  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    tile_s[ty + j][tx] = data[(srcRow + j) * width + srcCol];
  if (offDiagonal) {
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
      tile_d[ty + j][tx] = data[(dstRow + j) * width + dstCol];
  }

  __syncthreads();

  // Write each staged tile, transposed, into the other tile's location.
  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    data[(dstRow + j) * width + dstCol] = tile_s[tx][ty + j];
  if (offDiagonal) {
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
      data[(srcRow + j) * width + srcCol] = tile_d[tx][ty + j];
  }
}
// Host-side check that mat_t is the transpose of mat.
//   mat   : row-major, n rows x m columns
//   mat_t : row-major, m rows x n columns
// Returns 1 when mat[i][j] == mat_t[j][i] for every element (exact comparison), else 0.
int validate(const float *mat, const float *mat_t, int n, int m){
  for (int row = 0; row < n; ++row) {
    const float *src = mat + row*m;
    for (int col = 0; col < m; ++col) {
      // A single mismatch already decides the result.
      if (src[col] != mat_t[col*n + row]) return 0;
    }
  }
  return 1;
}
// Report a failed CUDA runtime call and terminate the benchmark.
static void checkCuda(cudaError_t status, const char *what){
  if (status != cudaSuccess){
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(1);
  }
}

// Report a failed cuBLAS call and terminate the benchmark.
static void checkCublas(cublasStatus_t status, const char *what){
  if (status != CUBLAS_STATUS_SUCCESS){
    fprintf(stderr, "cuBLAS error in %s (status %d)\n", what, (int)status);
    exit(1);
  }
}

// Wall-clock milliseconds between two gettimeofday() samples.
static float elapsedMs(const timeval &start, const timeval &stop){
  return (((stop.tv_sec*uS_PER_SEC)+stop.tv_usec) - ((start.tv_sec*uS_PER_SEC)+start.tv_usec))/(float)uS_PER_mS;
}

// Release every buffer owned by main(); null pointers are accepted.
static void releaseAll(float *h_a, float *h_b, float *h_c, float *d_a, float *d_b){
  free(h_a);
  free(h_b);
  free(h_c);
  cudaFree(d_a);
  cudaFree(d_b);
}

// Benchmark driver: times a CPU transpose, cublasSgeam, the out-of-place kernel and the
// in-place kernel on an N x M matrix, validating each result against the original matrix.
// Returns 0 on success and 1 when an allocation or a validation fails.
int main(){
  const size_t bytes = (size_t)N * M * sizeof(float);
  timeval t1, t2;
  float *matrix    = (float *) malloc(bytes);
  float *matrixT   = (float *) malloc(bytes);
  float *h_matrixT = (float *) malloc(bytes);
  float *d_matrix = NULL, *d_matrixT = NULL;
  if (!matrix || !matrixT || !h_matrixT){
    fprintf(stderr, "host allocation failed\n");
    releaseAll(matrix, matrixT, h_matrixT, d_matrix, d_matrixT);
    return 1;
  }
  for (int i = 0; i < N; i++)
    for (int j = 0; j < M; j++)
      matrix[(i*M) + j] = i;

  // --- CPU reference transpose ---
  gettimeofday(&t1, NULL);
  for (int i = 0; i < N; i++)
    for (int j = 0; j < M; j++)
      matrixT[(j*N)+i] = matrix[(i*M)+j];
  gettimeofday(&t2, NULL);
  if (!validate(matrix, matrixT, N, M)){
    printf("fail!\n");
    releaseAll(matrix, matrixT, h_matrixT, d_matrix, d_matrixT);
    return 1;
  }
  printf("CPU time = %fms\n", elapsedMs(t1, t2));

  checkCuda(cudaMalloc((void **)&d_matrixT, bytes), "cudaMalloc(d_matrixT)");
  checkCuda(cudaMalloc((void **)&d_matrix, bytes), "cudaMalloc(d_matrix)");
  checkCuda(cudaMemcpy(d_matrix, matrix, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

  // --- cuBLAS geam transpose (handle creation is kept outside the timed region) ---
  const float alpha = 1.0f;
  const float beta = 0.0f;
  cublasHandle_t handle;
  checkCublas(cublasCreate(&handle), "cublasCreate");
  gettimeofday(&t1, NULL);
  checkCublas(cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, &alpha, d_matrix, M, &beta, d_matrix, N, d_matrixT, N), "cublasSgeam");
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize(cublasSgeam)");
  gettimeofday(&t2, NULL);
  checkCublas(cublasDestroy(handle), "cublasDestroy");
  printf("GPU Sgeam time = %fms\n", elapsedMs(t1, t2));
  checkCuda(cudaMemcpy(h_matrixT, d_matrixT, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(Sgeam result)");
  if (!validate(matrix, h_matrixT, N, M)){
    printf("fail!\n");
    releaseAll(matrix, matrixT, h_matrixT, d_matrix, d_matrixT);
    return 1;
  }

  // --- out-of-place kernel ---
  // Clear both result buffers so a stale Sgeam result cannot pass validation.
  checkCuda(cudaMemset(d_matrixT, 0, bytes), "cudaMemset(d_matrixT)");
  memset(h_matrixT, 0, bytes);
  dim3 threads(TILE_DIM, BLOCK_ROWS);
  // gridDim.x spans the M columns and gridDim.y the N rows (identical while N == M).
  dim3 blocks(M/TILE_DIM, N/TILE_DIM);
  gettimeofday(&t1, NULL);
  transposeCoalesced<<<blocks, threads >>>(d_matrixT, d_matrix);
  checkCuda(cudaGetLastError(), "transposeCoalesced launch");
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize(transposeCoalesced)");
  gettimeofday(&t2, NULL);
  checkCuda(cudaMemcpy(h_matrixT, d_matrixT, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(kernel result)");
  if (!validate(matrix, h_matrixT, N, M)){
    printf("fail!\n");
    releaseAll(matrix, matrixT, h_matrixT, d_matrix, d_matrixT);
    return 1;
  }
  printf("GPU kernel time = %fms\n", elapsedMs(t1, t2));

  // --- in-place kernel (overwrites d_matrix; only meaningful for N == M) ---
  memset(h_matrixT, 0, bytes);
  gettimeofday(&t1, NULL);
  iptransposeCoalesced<<<blocks, threads >>>(d_matrix);
  checkCuda(cudaGetLastError(), "iptransposeCoalesced launch");
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize(iptransposeCoalesced)");
  gettimeofday(&t2, NULL);
  checkCuda(cudaMemcpy(h_matrixT, d_matrix, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(in-place result)");
  if (!validate(matrix, h_matrixT, N, M)){
    printf("fail!\n");
    releaseAll(matrix, matrixT, h_matrixT, d_matrix, d_matrixT);
    return 1;
  }
  printf("GPU in-place kernel time = %fms\n", elapsedMs(t1, t2));

  releaseAll(matrix, matrixT, h_matrixT, d_matrix, d_matrixT);
  return 0;
}
$ nvcc -arch=sm_20 -o t469 t469.cu -lcublas
$ ./t469
CPU time = 450.095001ms
GPU Sgeam time = 1.937000ms
GPU kernel time = 1.694000ms
GPU in-place kernel time = 1.839000ms
$
Note that this compares several different approaches to matrix transpose.
If you study the iptransposeCoalesced you will see that it is adhering to the 4 specific aspects I outlined above.
It is risky to call __syncthreads() inside an if statement in CUDA, because every thread of the block must reach the barrier. Try moving it outside the conditional block, simply like this:
if (xxx < nEven && yyy < nEven)
{
tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];
}
__syncthreads();
if (xxx < nEven && yyy < nEven)
{
idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];
}

scan-array CUDA

I'm trying to scan a simple array using CUDA, but there seems to be something wrong with the code below. I have been trying to find what I am doing wrong, but I can't. Can anyone please help me?
#include <stdio.h>
#include <stdlib.h>
// Single-block exclusive prefix sum using an up-sweep followed by a down-sweep over a
// tree kept in dynamic shared memory.
//   g_odata : output, n ints
//   g_idata : input, n ints
//   n       : element count; every thread handles two elements, so the kernel expects
//             blockDim.x == n/2 (n presumably a power of two) and n*sizeof(int) bytes of
//             dynamic shared memory supplied at launch.
__global__ void prescan(int *g_odata, int *g_idata, int n){
  extern __shared__ int temp[]; // sized by the launch configuration
  const int t = threadIdx.x;
  const int lo = 2*t;
  const int hi = 2*t + 1;

  // Stage this thread's two inputs in shared memory.
  temp[lo] = g_idata[lo];
  temp[hi] = g_idata[hi];

  // Up-sweep: accumulate partial sums towards the root; stride doubles per level.
  int stride = 1;
  for (int active = n >> 1; active > 0; active >>= 1, stride *= 2){
    __syncthreads();
    if (t < active){
      const int left  = stride*hi - 1;
      const int right = stride*(hi + 1) - 1;
      temp[right] += temp[left];
    }
  }

  // Replace the total in the last slot by the identity before walking back down.
  if (t == 0) { temp[n - 1] = 0; }

  // Down-sweep: hand the running prefix to the left child and prefix + left sum to the right.
  for (int active = 1; active < n; active *= 2){
    stride >>= 1;
    __syncthreads();
    if (t < active){
      const int left  = stride*hi - 1;
      const int right = stride*(hi + 1) - 1;
      const int carried = temp[left];
      temp[left] = temp[right];
      temp[right] += carried;
    }
  }

  __syncthreads();
  // Publish the scanned values.
  g_odata[lo] = temp[lo];
  g_odata[hi] = temp[hi];
}
// Test driver for prescan: fills a 16-element host array with the odd numbers 1, 3, 5, ...,
// prints it, runs one prescan block on the device and prints what comes back.
// None of the CUDA return codes are checked, so a failed call goes unnoticed.
int main(int argc, char *argv[]){
int i;
int *input = 0;
int *output = 0;
int *g_idata = 0;
int *g_odata = 0;
int numblocks = 1;
int radix = 16;
// Host and device buffers of numblocks*radix ints each.
input = (int*)malloc(numblocks*radix*sizeof(int));
output = (int*)malloc(numblocks*radix*sizeof(int));
cudaMalloc((void**)&g_idata, numblocks*radix*sizeof(int));
cudaMalloc((void**)&g_odata, numblocks*radix*sizeof(int));
for(i=0; i<numblocks*radix; i++){
input[i] = 1 + 2*i;
}
for(i=0; i<numblocks*radix; i++){
printf("%d ", input[i]);
}
cudaMemcpy(g_idata, input, numblocks*radix*sizeof(int), cudaMemcpyHostToDevice);
// One block of 8 threads; each prescan thread handles two of the 16 elements.
// NOTE(review): prescan declares `extern __shared__ int temp[]`, but this launch passes no
// dynamic shared-memory size (third launch parameter), so temp has no storage behind it even
// though the kernel indexes temp[0..15]. A launch such as
// prescan<<<1,8,numblocks*radix*sizeof(int)>>>(...) would provide it.
prescan<<<1,8>>>(g_odata, g_idata, numblocks*radix);
// NOTE(review): cudaThreadSynchronize is the legacy spelling of cudaDeviceSynchronize — confirm
// against the toolkit version in use.
cudaThreadSynchronize();
cudaMemcpy(output, g_odata, numblocks*radix*sizeof(int), cudaMemcpyDeviceToHost);
for(i=0; i<numblocks*radix; i++){
printf("%d ", output[i]);
}
free(input);
free(output);
cudaFree(g_idata);
cudaFree(g_odata);
return 0;
}
The output is this: 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0. I want to have this output: 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 0 1 4 9 16 25 36 49 64 81 100 121 144 169 196 225
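Go through the code below, which implements a scan in a parallel environment.
The algorithm implemented here is the Hillis-Steele scan. As written it produces an inclusive scan (each output includes its own input element); shift the result right by one element and insert a leading 0 to obtain the exclusive scan asked for in the question. The algorithm works in shared memory, which should improve the execution time for larger data sets.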
#include<stdio.h>
#include<math.h>
// Single-block Hillis-Steele inclusive prefix sum: d_out[k] = d_in[0] + ... + d_in[k].
//   d_in  : input, n ints
//   d_out : output, n ints
//   n     : element count; launch one block with at least n threads and
//           blockDim.x * sizeof(int) bytes of dynamic shared memory.
__global__ void scan(int *d_in,int *d_out,int n)
{
  extern __shared__ int sdata[];
  const int tid = threadIdx.x;
  const bool active = (tid < n);   // surplus threads only take part in the barriers

  if (active)
    sdata[tid] = d_in[tid];
  // Every input must be staged before any thread reads a neighbour's slot.
  __syncthreads();

  // n and i are uniform across the block, so all threads execute the same barriers.
  for (int i = 1; i < n; i <<= 1)
  {
    // Read the partner value first...
    int addend = 0;
    if (active && tid >= i)
      addend = sdata[tid - i];
    // ...and only update once every thread has finished reading this step's inputs;
    // otherwise a thread may pick up a neighbour's already-updated value.
    __syncthreads();
    if (active && tid >= i)
      sdata[tid] += addend;
    // Make this step's sums visible before the next step reads them.
    __syncthreads();
  }

  if (active)
    d_out[tid] = sdata[tid];
}
// Driver for the scan kernel: scans the first 16 odd numbers on the device and prints
// the input followed by the scanned output. Returns 1 if any CUDA call fails.
int main()
{
  const int n = 16;
  const size_t bytes = sizeof(int) * n;
  int h_in[n], h_out[n];
  int i;
  for (i = 0; i < n; i++)
    h_in[i] = 2*i+1;
  for (i = 0; i < n; i++)
    printf("%d ", h_in[i]);

  int *d_in = 0;
  int *d_out = 0;
  // Chain the calls on one status so the first failure short-circuits the rest.
  cudaError_t err = cudaMalloc((void**)&d_in, bytes);
  if (err == cudaSuccess)
    err = cudaMalloc((void**)&d_out, bytes);
  if (err == cudaSuccess)
    err = cudaMemcpy(d_in, h_in, bytes, cudaMemcpyHostToDevice);
  if (err == cudaSuccess)
  {
    // One block of n threads with n ints of dynamic shared memory.
    scan <<<1, n, bytes >>>(d_in,d_out, n);
    err = cudaGetLastError();   // a kernel launch reports no status of its own
  }
  if (err == cudaSuccess)
    err = cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost);

  // cudaFree accepts null pointers, so this is safe after a partial failure.
  cudaFree(d_in);
  cudaFree(d_out);

  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }
  for (i = 0; i < n; i++)
    printf("%d ", h_out[i]);
  return 0;
}