CUDA: idx doesn't index matrices correctly

I have the following kernel in CUDA:
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
int j;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ((idx > 0) && (idx < N)){
//for(j=0;j<N;j++){
// outgoing[j].p_t1=ingoing[j].p_t1;
//}
outgoing[idx].p_t1=ingoing[idx].p_t1;
}
}
This doesn't work. The following works:
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
int j;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ((idx > 0) && (idx < N)){
for(j=0;j<N;j++){
outgoing[j].p_t1=ingoing[j].p_t1;
}
//outgoing[idx].p_t1=ingoing[idx].p_t1;
}
}
What is wrong? Why doesn't idx index the matrices correctly?
The whole code is written below. It may not be easy to follow all of it. The point is that when I print the outgoing[idx].p_t1 fields at the end of the main function, they print 0s when I do
outgoing[idx].p_t1=ingoing[idx].p_t1;
but they are correct when I do
for(j=0;j<N;j++){
outgoing[j].p_t1=ingoing[j].p_t1;
}
What's wrong?
/******************** Includes - Defines ****************/
#include "pagerank_serial.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include <string.h>
#include <sys/time.h>
#include <fcntl.h>
#include <cuda.h>
#include "string.h"
/******************** Defines ****************/
// Number of nodes
int N;
// Convergence threshold and algorithm's parameter d
double threshold, d;
// Table of node's data
Node *Nodes;
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
int j;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ((idx > 0) && (idx < N)){
for(j=0;j<N;j++){
outgoing[j].p_t1=ingoing[j].p_t1;
}
//outgoing[idx].p_t1=ingoing[idx].p_t1;
}
}
/***** Read graph connections from txt file *****/
void Read_from_txt_file(char* filename)
{
FILE *fid;
int from_idx, to_idx;
int temp_size;
fid = fopen(filename, "r");
if (fid == NULL){
printf("Error opening data file\n");
}
while (!feof(fid))
{
if (fscanf(fid,"%d\t%d\n", &from_idx,&to_idx))
{
Nodes[from_idx].con_size++;
temp_size = Nodes[from_idx].con_size;
//Nodes[from_idx].To_id =(int*) realloc(Nodes[from_idx].To_id, temp_size * sizeof(int));
Nodes[from_idx].To_id[temp_size - 1] = to_idx;
}
}
//printf("End of connections insertion!\n");
fclose(fid);
}
/***** Read P vector from txt file*****/
void Read_P_from_txt_file()
{
FILE *fid;
double temp_P;
int index = 0;
fid = fopen("P.txt", "r");
if (fid == NULL){printf("Error opening the Probabilities file\n");}
while (!feof(fid))
{
// P's values are double!
if (fscanf(fid," double sum = 0;%lf\n", &temp_P))
{
Nodes[index].p_t1 = temp_P;
index++;
}
}
//printf("End of P insertion!");
fclose(fid);
}
/***** Read E vector from txt file*****/
void Read_E_from_txt_file()
{
FILE *fid;
double temp_E;
int index = 0;
fid = fopen("E.txt", "r");
if (fid == NULL)
printf("Error opening the E file\n");
while (!feof(fid))
{
// E's values are double!
if (fscanf(fid,"%lf\n", &temp_E))
{
Nodes[index].e = temp_E;
index++;
}
}
//printf("End of E insertion!");
fclose(fid);
}
/***** Create P and E with equal probability *****/
void Random_P_E()
{
int i;
// Sum of P (it must be =1)
double sum_P_1 = 0;
// Sum of E (it must be =1)
double sum_E_1 = 0;
// Arrays initialization
for (i = 0; i < N; i++)
{
Nodes[i].p_t0 = 0;
Nodes[i].p_t1 = 1;
Nodes[i].p_t1 = (double) Nodes[i].p_t1 / N;
sum_P_1 = sum_P_1 + Nodes[i].p_t1;
Nodes[i].e = 1;
Nodes[i].e = (double) Nodes[i].e / N;
sum_E_1 = sum_E_1 + Nodes[i].e;
}
// Assert sum of probabilities is =1
// Print sum of P (it must be =1)
//printf("Sum of P = %f\n",sum_P_1);
// Exit if sum of P is !=1
assert(sum_P_1 = 1);
//printf("\n");
// Print sum of E (it must be =1)
//printf("Sum of E = %f\n",sum_E_1);
// Exit if sum of Pt0 is !=1
assert(sum_E_1 = 1);
}
/***** Main function *****/
int main(int argc, char** argv)
{
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the maximum occupancy for a full device launch
int gridSize; // The actual grid size needed, based on input size
// Check input arguments
if (argc < 5)
{
printf("Error in arguments! Three arguments required: graph filename, N, threshold and d\n");
return 0;
}
// get arguments
char filename[256];
strcpy(filename, argv[1]);
N = atoi(argv[2]);
threshold = atof(argv[3]);
d = atof(argv[4]);
int i;
// a constant value contributed by all nodes with connectivity = 0
// it's going to be added to each node's new probability
// Allocate memory for N nodes
Nodes = (Node*) malloc(N * sizeof(Node));
for (i = 0; i < N; i++)
{
Nodes[i].con_size = 0;
//Nodes[i].To_id = (int*) malloc(sizeof(int));
}
Read_from_txt_file(filename);
// set random probabilities
Random_P_E();
Node *h_ingoing;
Node *h_outgoing;
h_ingoing = Nodes;
h_outgoing = (Node *)calloc(N, sizeof *h_outgoing);
Node *d_ingoing;
Node *d_outgoing;
cudaMalloc(&d_ingoing, N * sizeof *d_ingoing);
cudaMalloc(&d_outgoing, N * sizeof *d_outgoing);
cudaMemcpy(d_ingoing, h_ingoing, N * sizeof *h_ingoing, cudaMemcpyHostToDevice);
cudaMemcpy(d_outgoing, h_outgoing, N * sizeof *h_outgoing, cudaMemcpyHostToDevice);
float time;
cudaEvent_t begin, end;
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, pagerank, 0, N);
// Round up according to array size
gridSize = (N + blockSize - 1) / blockSize;
printf("Gridsize, blockzise : %d , %d \n", gridSize, blockSize);
cudaEventCreate(&begin);
cudaEventCreate(&end);
cudaEventRecord(begin, 0);
pagerank<<<gridSize, blockSize>>>(d_ingoing, d_outgoing, N, threshold, d);
cudaEventRecord(end, 0);
cudaEventSynchronize(end);
cudaEventElapsedTime(&time, begin, end);
cudaMemcpy(h_outgoing, d_outgoing, N * sizeof *h_outgoing, cudaMemcpyDeviceToHost);
printf("%f\n", time) ;
printf("\n");
// Print final probabilitities
for (i = 0; i <100; i++)
{
printf("P_t1[%d] = %f\n",i,h_outgoing[i].p_t1);
}
printf("\n");
printf("End of program!\n");
return (EXIT_SUCCESS);
}

When you say that at the end of the main function they print 0s, I assume you are referring to all entries and not just index 0. Indeed, index 0 is not processed by your code in the first version, since ((idx > 0) && (idx < N)) is false for idx = 0.
Going further, your code is missing the definition of the Node type, which is needed to get a better understanding of what could go wrong.
Depending on the size of Node, its contents, and the structure packing used at compilation, the size of Node on the host side might differ from its size on the device. Verifying that with printf, or with a debugger, would be useful.
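For instance, a minimal sketch of such a check (the helper kernel is hypothetical, not part of the posted code) could be:
__global__ void print_node_size()
{
// device-side printf requires compute capability 2.0 or higher
printf("device sizeof(Node) = %u\n", (unsigned)sizeof(Node));
}
// on the host, before the cudaMemcpy calls:
printf("host sizeof(Node) = %zu\n", sizeof(Node));
print_node_size<<<1, 1>>>();
cudaDeviceSynchronize();
If the two values differ, copying N * sizeof(Node) bytes cannot line up on both sides.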
Also, you do not seem to be checking for launch errors. You definitely want to add a cudaPeekAtLastError and a cudaDeviceSynchronize after your kernel call to make sure no error occurred (any other CUDA Runtime API call may also return errors that your code does not check).
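A minimal sketch of that pattern, applied to the launch in the posted main() (keeping the argument list that matches the kernel's actual signature):
pagerank<<<gridSize, blockSize>>>(d_ingoing, d_outgoing, N);
cudaError_t launch_err = cudaPeekAtLastError(); // reports launch/configuration errors
cudaError_t sync_err = cudaDeviceSynchronize(); // reports errors raised while the kernel runs
if (launch_err != cudaSuccess || sync_err != cudaSuccess) {
printf("kernel error: %s / %s\n", cudaGetErrorString(launch_err), cudaGetErrorString(sync_err));
}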
EDIT
Trying to reproduce, I wrote the following, as close as possible to your code. I don't have a card with sufficient memory, hence the smaller node count.
typedef struct
{
double p_t0;
double p_t1;
double e;
int To_id[460];
int con_size;
} Node ;
__global__ void pagerank(Node* ingoing, Node* outgoing, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x ;
if ((idx > 0) && (idx < N))
outgoing[idx].p_t1 = ingoing[idx].p_t1;
}
#include <cstdlib>
#define cudaCheck(a) { cudaError_t cuerr = a ; if (cuerr != cudaSuccess) { printf("[ERROR # %s : %d ] : (%d) - %s\n", __FILE__, __LINE__, cuerr, cudaGetErrorString(cuerr)) ; ::exit(1) ; } }
int main()
{
// int N = 916428 ; // does not fit on my GPU
int N = 400000 ;
int blockSize;
int minGridSize;
int gridSize;
Node* Nodes = (Node*)malloc(N * sizeof (Node)) ;
for (int i = 0 ; i < N ; ++i)
Nodes[i].p_t1 = (double)i+1;
Node* h_ingoing = Nodes;
Node* h_outgoing = (Node*)calloc(N, sizeof *h_outgoing) ;
Node* d_ingoing ;
Node* d_outgoing ;
cudaCheck (cudaMalloc(&d_ingoing, N * sizeof *d_ingoing));
cudaCheck (cudaMalloc(&d_outgoing, N * sizeof *d_outgoing));
cudaCheck (cudaMemcpy (d_ingoing, h_ingoing, N * sizeof *h_ingoing, cudaMemcpyHostToDevice));
cudaCheck (cudaMemcpy (d_outgoing, h_outgoing, N * sizeof *h_outgoing, cudaMemcpyHostToDevice));
float time;
cudaEvent_t begin, end ;
//blockSize = 256 ;
cudaOccupancyMaxPotentialBlockSize<> (&minGridSize, &blockSize, pagerank, 0, N) ;
gridSize = (N + blockSize -1) / blockSize ;
printf ("Configuration = <<< %d , %d >>>\n", gridSize, blockSize) ;
cudaCheck (cudaEventCreate (&begin)) ;
cudaCheck (cudaEventCreate (&end)) ;
cudaCheck (cudaEventRecord (begin, 0)) ;
pagerank <<< gridSize, blockSize >>> (d_ingoing, d_outgoing, N) ;
cudaCheck (cudaEventRecord (end, 0)) ;
cudaCheck (cudaEventSynchronize (end)) ;
cudaCheck (cudaMemcpy (h_outgoing, d_outgoing, N * sizeof *h_outgoing, cudaMemcpyDeviceToHost)) ;
for (int i = 0 ; i < 100 ; ++i)
{
printf ("P_t1[%d] = %f\n", i, h_outgoing[i].p_t1) ;
}
for (int i = 0 ; i < N ; ++i)
{
if (h_outgoing[i].p_t1 != (double)(i+1))
printf ("Error # %d : %lf <> %lf\n", i, h_outgoing[i].p_t1, (double)(i+1));
}
return 0 ;
}
Except at index 0, for which the first part of this answer already pointed out an issue, every output is correct.

Related

CUBLAS batch and matrix sizes [duplicate]

Some background info on the problem I am trying to speed up using CUDA:
I have a large number of small/moderate same-sized linear systems that I need to solve independently. Each linear system is square, real, dense, invertible, and non-symmetric. These are actually matrix systems, so each system looks like AX = B, where A, X, and B are (n x n) matrices.
In a previous question, CUBLAS batch and matrix sizes, I learned that cuBLAS batched operations give the best performance for matrices of size 100x100 or smaller.
I still have an issue because the matrices I am working with have 100 < n < 700. So the matrices are of moderate size, where cuBLAS batched operations do not give the best performance, and the regular (non-batched) routines (cusolverDnDgetrf, cusolverDnDgetrs) also do not give better performance than MATLAB (see the timings below).
I did some timing against MATLAB, for solving a single system, and found the regular routines are better for matrices of size (4096x4096) or larger. I generate a random matrix of size (n x n), for n = 64, 256, 512, 1024, 4096, 16384, and only time the factorization and back/forward solve, with no transfers across PCIe.
DOUBLE PRECISION CUDA (GTX 1080ti) vs MATLAB (backslash)
(GPU) 64: 0.001157 sec
(MATLAB) 64: 0.000205 sec
(GPU) 256: 0.01161 sec
(MATLAB) 256: 0.007762 sec
(GPU) 512: 0.026348 sec
(MATLAB) 512: 0.008550 sec
(GPU) 1024: 0.064357 sec
(MATLAB) 1024: 0.036280 sec
(GPU) 4096: 0.734908 sec
(MATLAB) 4096: 1.174442 sec
(GPU) 16384: 32.962229 sec
(MATLAB) 16384: 68.691236 sec
These timings make me conclude that iterating over my matrices one by one, calling the non-batched method, will be slower than MATLAB. Also, for my moderate-sized matrices, the batched cuBLAS inversion method will not perform well, according to CUBLAS batch and matrix sizes.
Is there another approach I should consider to speed up my code with CUDA? Or am I misunderstanding something?
/* How to use
* ./cuSolverDn_LinearSolver // Default: cholesky
* ./cuSolverDn_LinearSolver -R=chol -file=<file> // cholesky factorization
* ./cuSolverDn_LinearSolver -R=lu -file=<file> // LU with partial pivoting
* ./cuSolverDn_LinearSolver -R=qr -file=<file> // QR factorization
*
* Remark: the absolute error on solution x is meaningless without knowing condition number of A.
* The relative error on residual should be close to machine zero, i.e. 1.e-15.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cusolverDn.h"
#include "helper_cuda.h"
#include "helper_cusolver.h"
int linearSolverLU(
cusolverDnHandle_t handle,
int n,
const double *Acopy,
int lda,
const double *b,
double *x)
{
int bufferSize = 0;
int *info = NULL;
double *buffer = NULL;
double *A = NULL;
int *ipiv = NULL; // pivoting sequence
int h_info = 0;
double start, stop;
double time_solve;
checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n, (double*)Acopy, lda, &bufferSize));
checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double)*bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double)*lda*n));
checkCudaErrors(cudaMalloc(&ipiv, sizeof(int)*n));
// prepare a copy of A because getrf will overwrite A with L
checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
start = second();
start = second();
checkCudaErrors(cusolverDnDgetrf(handle, n, n, A, lda, buffer, ipiv, info));
checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
if ( 0 != h_info ){
fprintf(stderr, "Error: LU factorization failed\n");
}
//checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
//checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, A, lda, ipiv, x, n, info));
checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, n, A, lda, ipiv, x, n, info));
checkCudaErrors(cudaDeviceSynchronize());
stop = second();
time_solve = stop - start;
fprintf (stdout, "timing: LU = %10.6f sec\n", time_solve);
if (info ) { checkCudaErrors(cudaFree(info )); }
if (buffer) { checkCudaErrors(cudaFree(buffer)); }
if (A ) { checkCudaErrors(cudaFree(A)); }
if (ipiv ) { checkCudaErrors(cudaFree(ipiv));}
return 0;
}
void generate_random_dense_matrix(int M, int N, double **outA)
{
int i, j;
double rMax = (double)RAND_MAX;
double *A = (double *)malloc(sizeof(double) * M * N);
// For each column
for (j = 0; j < N; j++)
{
// For each row
for (i = 0; i < M; i++)
{
double dr = (double)rand();
A[j * M + i] = (dr / rMax) * 100.0;
//printf("A[j * M + i] = %f \n",A[j * M + i]);
}
}
*outA = A;
}
int main (int argc, char *argv[])
{
struct testOpts opts;
cusolverDnHandle_t handle = NULL;
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
cudaStream_t stream = NULL;
int rowsA = 0; // number of rows of A
int colsA = 0; // number of columns of A
int nnzA = 0; // number of nonzeros of A
int baseA = 0; // base index in CSR format
int lda = 0; // leading dimension in dense matrix
// CSR(A) from I/O
int *h_csrRowPtrA = NULL;
int *h_csrColIndA = NULL;
double *h_csrValA = NULL;
double *h_A = NULL; // dense matrix from CSR(A)
double *h_x = NULL; // a copy of d_x
double *h_b = NULL; // b = ones(m,1)
double *h_r = NULL; // r = b - A*x, a copy of d_r
double *d_A = NULL; // a copy of h_A
double *d_x = NULL; // x = A \ b
double *d_b = NULL; // a copy of h_b
double *d_r = NULL; // r = b - A*x
// the constants are used in residual evaluation, r = b - A*x
const double minus_one = -1.0;
const double one = 1.0;
double x_inf = 0.0;
double r_inf = 0.0;
double A_inf = 0.0;
int errors = 0;
colsA = 660;
rowsA = colsA;
int NN = colsA;
int MM = rowsA;
lda = rowsA;
// Generate inputs
srand(9384);
generate_random_dense_matrix(MM, NN, &h_A);
generate_random_dense_matrix(MM, NN, &h_b);
parseCommandLineArguments(argc, argv, opts);
if (NULL == opts.testFunc)
{
//opts.testFunc = "chol"; // By default running Cholesky as NO solver selected with -R option.
opts.testFunc = "lu";
//opts.testFunc = "qr";
}
findCudaDevice(argc, (const char **)argv);
/*
printf("step 1: read matrix market format\n");
if (opts.sparse_mat_filename == NULL)
{
opts.sparse_mat_filename = sdkFindFilePath("gr_900_900_crg.mtx", argv[0]);
if (opts.sparse_mat_filename != NULL)
printf("Using default input file [%s]\n", opts.sparse_mat_filename);
else
printf("Could not find gr_900_900_crg.mtx\n");
}
else
{
printf("Using input file [%s]\n", opts.sparse_mat_filename);
}
if (opts.sparse_mat_filename == NULL)
{
fprintf(stderr, "Error: input matrix is not provided\n");
return EXIT_FAILURE;
}
if (loadMMSparseMatrix<double>(opts.sparse_mat_filename, 'd', true , &rowsA, &colsA,
&nnzA, &h_csrValA, &h_csrRowPtrA, &h_csrColIndA, true))
{
exit(EXIT_FAILURE);
}
baseA = h_csrRowPtrA[0]; // baseA = {0,1}
printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, nnzA, baseA);
if ( rowsA != colsA )
{
fprintf(stderr, "Error: only support square matrix\n");
exit(EXIT_FAILURE);
}
printf("step 2: convert CSR(A) to dense matrix\n");
lda = opts.lda ? opts.lda : rowsA;
if (lda < rowsA)
{
fprintf(stderr, "Error: lda must be greater or equal to dimension of A\n");
exit(EXIT_FAILURE);
}
*/
//h_A = (double*)malloc(sizeof(double)*lda*colsA);
h_x = (double*)malloc(sizeof(double)*lda*colsA);
//h_b = (double*)malloc(sizeof(double)*rowsA);
h_r = (double*)malloc(sizeof(double)*lda*rowsA);
assert(NULL != h_A);
assert(NULL != h_x);
assert(NULL != h_b);
assert(NULL != h_r);
/*
memset(h_A, 0, sizeof(double)*lda*colsA);
for(int row = 0 ; row < rowsA ; row++)
{
const int start = h_csrRowPtrA[row ] - baseA;
const int end = h_csrRowPtrA[row+1] - baseA;
for(int colidx = start ; colidx < end ; colidx++)
{
const int col = h_csrColIndA[colidx] - baseA;
const double Areg = h_csrValA[colidx];
h_A[row + col*lda] = Areg;
}
}
printf("step 3: set right hand side vector (b) to 1\n");
for(int row = 0 ; row < rowsA ; row++)
{
h_b[row] = 1.0;
}
*/
// verify if A is symmetric or not.
if ( 0 == strcmp(opts.testFunc, "chol") )
{
int issym = 1;
for(int j = 0 ; j < colsA ; j++)
{
for(int i = j ; i < rowsA ; i++)
{
double Aij = h_A[i + j*lda];
double Aji = h_A[j + i*lda];
if ( Aij != Aji )
{
issym = 0;
break;
}
}
}
if (!issym)
{
printf("Error: A has no symmetric pattern, please use LU or QR \n");
exit(EXIT_FAILURE);
}
}
checkCudaErrors(cusolverDnCreate(&handle));
checkCudaErrors(cublasCreate(&cublasHandle));
checkCudaErrors(cudaStreamCreate(&stream));
checkCudaErrors(cusolverDnSetStream(handle, stream));
checkCudaErrors(cublasSetStream(cublasHandle, stream));
checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double)*lda*colsA));
checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double)*lda*colsA));
checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double)*lda*rowsA));
checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*lda*rowsA));
printf("step 4: prepare data on device\n");
checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double)*lda*colsA, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_b, h_b, sizeof(double)*lda*rowsA, cudaMemcpyHostToDevice));
printf("step 5: solve A*x = b \n");
// d_A and d_b are read-only
if ( 0 == strcmp(opts.testFunc, "chol") )
{
linearSolverCHOL(handle, rowsA, d_A, lda, d_b, d_x);
}
else if ( 0 == strcmp(opts.testFunc, "lu") )
{
//printf("hi \n");
linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x);
}
else if ( 0 == strcmp(opts.testFunc, "qr") )
{
linearSolverQR(handle, rowsA, d_A, lda, d_b, d_x);
}
else
{
fprintf(stderr, "Error: %s is unknown function\n", opts.testFunc);
exit(EXIT_FAILURE);
}
printf("step 6: evaluate residual\n");
checkCudaErrors(cudaMemcpy(d_r, d_b, sizeof(double)*lda*rowsA, cudaMemcpyDeviceToDevice));
// r = b - A*x
checkCudaErrors(cublasDgemm_v2(
cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
rowsA,
colsA,
colsA,
&minus_one,
d_A,
lda,
d_x,
rowsA,
&one,
d_r,
rowsA));
checkCudaErrors(cudaMemcpy(h_x, d_x, sizeof(double)*lda*colsA, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*lda*rowsA, cudaMemcpyDeviceToHost));
x_inf = vec_norminf(colsA, h_x);
r_inf = vec_norminf(rowsA, h_r);
A_inf = mat_norminf(rowsA, colsA, h_A, lda);
printf("x[0] = %f\n", h_x[0]);
printf("r[0] = %f\n", h_r[0]);
printf("|b - A*x| = %E \n", r_inf);
printf("|A| = %E \n", A_inf);
printf("|x| = %E \n", x_inf);
printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf));
if (handle) { checkCudaErrors(cusolverDnDestroy(handle)); }
if (cublasHandle) { checkCudaErrors(cublasDestroy(cublasHandle)); }
if (stream) { checkCudaErrors(cudaStreamDestroy(stream)); }
if (h_csrValA ) { free(h_csrValA); }
if (h_csrRowPtrA) { free(h_csrRowPtrA); }
if (h_csrColIndA) { free(h_csrColIndA); }
if (h_A) { free(h_A); }
if (h_x) { free(h_x); }
if (h_b) { free(h_b); }
if (h_r) { free(h_r); }
if (d_A) { checkCudaErrors(cudaFree(d_A)); }
if (d_x) { checkCudaErrors(cudaFree(d_x)); }
if (d_b) { checkCudaErrors(cudaFree(d_b)); }
if (d_r) { checkCudaErrors(cudaFree(d_r)); }
return 0;
}
Try using two or more parallel streams (with one linear system each) on the GPU; this may help utilize a bigger part of the GPU.
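A rough sketch of that multi-stream idea, reusing the cusolverDnDgetrf/cusolverDnDgetrs calls from the code above (the per-system device buffers d_A[k], d_x[k], buffer[k], ipiv[k] and info[k] are assumed to be prepared exactly as in linearSolverLU, with d_x[k] initially holding a copy of the right-hand sides; error checking omitted):
// assumed to exist, prepared as in linearSolverLU above:
// int n, lda; double *d_A[NUM_STREAMS], *d_x[NUM_STREAMS], *buffer[NUM_STREAMS];
// int *ipiv[NUM_STREAMS], *info[NUM_STREAMS];
const int NUM_STREAMS = 2;
cudaStream_t streams[NUM_STREAMS];
cusolverDnHandle_t handles[NUM_STREAMS];
for (int k = 0; k < NUM_STREAMS; k++) {
cudaStreamCreate(&streams[k]);
cusolverDnCreate(&handles[k]);
cusolverDnSetStream(handles[k], streams[k]); // work issued through handles[k] goes to streams[k]
}
for (int k = 0; k < NUM_STREAMS; k++) {
// factor and solve system k; these calls are asynchronous with respect to the host,
// so the two systems may overlap on the GPU (assuming lda == n, as in the code above)
cusolverDnDgetrf(handles[k], n, n, d_A[k], lda, buffer[k], ipiv[k], info[k]);
cusolverDnDgetrs(handles[k], CUBLAS_OP_N, n, n, d_A[k], lda, ipiv[k], d_x[k], lda, info[k]);
}
cudaDeviceSynchronize(); // wait for all streams before using the results
Whether this actually overlaps depends on how much of the GPU a single solve already occupies, which is exactly what the profiler will show.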
For timing measurements and hardware utilization, use the Visual Profiler instead of CPU time measurements.
Another point is that GTX (consumer) GPUs perform quite poorly in double precision. If you have the chance, try to use a Tesla GPU instead.
MATLAB provides a way to call the cuBLAS batched interface for GPU arrays using pagefun.

How to keep track of executed CUDA blocks?

Just for the sake of testing my understanding of things, I decided to modify the vector addition found in the CUDA samples so that the kernel quits after a specific time and is then re-launched to complete. The way I achieve the "timeout" is by having a pinned variable that the host sets to 1 after some time. Within the kernel, this variable is checked to determine whether execution should continue. If a thread continues its execution, it is marked as complete. In order to test that each thread executes just once, I've modified the addition to C[i] = C[i] + B[i]. This all works as expected; the device code looks as follows:
/* Function
* Internal device function used for getting the current thread's global ID
* regardless of the block/grid configuration. It assumes that the
* grid and block are 3 dimensional.
*
* #return: The thread's global ID
*/
static __device__ int get_global_idx()
{
int blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
return threadId;
}
/* Function
* Device function that determines if the current thread should continue execution.
* A check should be used on the return value. If the timeout has not been set
* and the thread has not previously executed, the entry at the thread's ID in the
* thread_ids array is set to 1 to indicate it was allowed to proceed.
*
* #param thread_ids: A pointer to the array with a size that matches the max number
* of threads that will be spawned
*
* #param time_out: Memory mapped variable used by the host to signal the kernel when
* execution should suspend
*
* #return: A boolean value indicating whether the current thread should continue or not
*/
__device__ bool continue_execution(unsigned int *thread_ids, volatile unsigned int *time_out)
{
if(*time_out == 1){
return false;
}
int tid = get_global_idx();
if(thread_ids[tid] == 1)
{
return false;
}
thread_ids[tid] = 1;
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, unsigned int *thread_ids, volatile unsigned int *timeout)
{
if(!continue_execution(thread_ids, timeout))
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
/* C[i] = A[i] + B[i]; */
C[i] = C[i] + B[i]; //Modifed from above
}
}
I considered how this may fail if __syncthreads() were used, so I decided to do block-level suspension. Based on my understanding, I thought this would be simple: keep track of whether a block has started, count how many threads have executed for that block, suspend only when all threads of an already-started block have completed, and deny any threads whose block has not started. So I used a struct and modified the continue_execution function as follows:
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue_execution(unsigned int *thread_ids, volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
This does not work: when I run the verification on the host (h_B[i] - h_C[i]), I don't get a consistently zero result, which means that some threads somehow managed to execute multiple times. Any ideas how/why this is happening with the latter attempt? Thanks.
I don't care about performance at this point; just trying to understand what is really happening.
EDIT
Here is the complete code, compile with nvcc file_name.cu and execute program_name <vector-length>.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info)
{
if(!continue_execution(time_out, b_info))
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
//C[i] = A[i] + B[i];
C[i] = C[i] + B[i]; //Modified from above
}
}
void computation_complete(int *complete, int block_amt, block_info *h_block_info)
{
size_t i;
for(i = 0; i < block_amt; i++)
{
if(h_block_info[i].started == 1)
{
continue;
}
break;
}
*complete = (i == block_amt) ? 1 : 0;
}
int main(int argc, char *argv[])
{
if(argc != 2)
{
fprintf(stderr, "usage: <program-name> <vector-length>\n");
exit(EXIT_FAILURE);
}
// Print the vector length to be used, and compute its size
long numElements = strtol(argv[1], NULL, 10);
size_t size = numElements * sizeof(float);
printf("[Vector addition of %d elements]\n", numElements);
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i)
{
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
h_C[i] = 0.0;
}
float *d_A = NULL;
cudaMalloc((void **)&d_A, size);
float *d_B = NULL;
cudaMalloc((void **)&d_B, size);
float *d_C = NULL;
cudaMalloc((void **)&d_C, size);
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t);
block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes);
for(int i = 0; i < blocksPerGrid; i++)
{
h_block_info[i].started = 0;
h_block_info[i].thread_count = 0;
}
block_info *d_block_info = NULL;
cudaMalloc(&d_block_info, block_info_bytes);
cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice);
volatile unsigned int *timeout = NULL;
cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped);
*timeout = 0;
double quantum = 0.0001 * 1000000.0;
double initial_quantum = quantum;
int complete = 0;
/* Here the kernel launch is looped until all blocks are complete */
while(complete == 0)
{
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info);
usleep(quantum);
*timeout = 1;
cudaDeviceSynchronize();
cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost);
computation_complete(&complete, blocksPerGrid, h_block_info);
if(complete == 0)
{
quantum = quantum + initial_quantum;
*timeout = 0;
}
}
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i)
{
if (fabs(h_B[i] - h_C[i]) > 1e-5)
{
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
cudaDeviceReset();
return 0;
}
You have a race condition in your continue_execution routine. Consider the following scenario:
warp0 of a threadblock enters the continue_execution routine. At the moment that it checks the variables *time_out and b_info[bid].started it witnesses those to be 0 and 0 respectively. So it proceeds to the next if test.
warp1 of the same threadblock enters the continue_execution routine (let's say slightly later), and it witnesses the variables to be 1 and 0 respectively. So it returns false and causes the warp1 threads to exit.
warp0 continues on and eventually sets b_info[bid].started to 1, and then updates the thread_count. It then returns true and proceeds with the vector add.
I could continue with this, but I think if you consider the above 3 items carefully you will realize it is a case you did not account for. Your implicit expectation is that every thread would read a coherent (i.e. the same across a given threadblock) value for *time_out. But this is not guaranteed by your code, and if it fails to do so, then we end up with some threadblocks where some threads have completed their work and some have not.
So how could we fix this? The above description should point the way. One possible approach is to guarantee that, for any given threadblock, every thread gets the same value for *time_out, whether it be 1 or 0. One possible solution would be to make the following changes to the beginning of your vectorAdd kernel:
__shared__ volatile unsigned int my_time_out;
if (!threadIdx.x) my_time_out = *time_out;
__syncthreads();
if(!continue_execution(&my_time_out, b_info))
With those changes, we ensure that every thread in a block gets a coherent view of the time-out variable, and according to my testing, the problem is resolved:
$ cat t100.cu
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info)
{
#ifdef USE_FIX
__shared__ volatile unsigned int my_time_out;
if (!threadIdx.x) my_time_out = *time_out;
__syncthreads();
if(!continue_execution(&my_time_out, b_info))
#else
if(!continue_execution(time_out, b_info))
#endif
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
//C[i] = A[i] + B[i];
C[i] = C[i] + B[i]; //Modified from above
}
}
void computation_complete(int *complete, int block_amt, block_info *h_block_info)
{
size_t i;
for(i = 0; i < block_amt; i++)
{
if(h_block_info[i].started == 1)
{
continue;
}
break;
}
*complete = (i == block_amt) ? 1 : 0;
}
int main(int argc, char *argv[])
{
if(argc != 2)
{
fprintf(stderr, "usage: <program-name> <vector-length>\n");
exit(EXIT_FAILURE);
}
// Print the vector length to be used, and compute its size
long numElements = strtol(argv[1], NULL, 10);
size_t size = numElements * sizeof(float);
printf("[Vector addition of %ld elements]\n", numElements);
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i)
{
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
h_C[i] = 0.0;
}
float *d_A = NULL;
cudaMalloc((void **)&d_A, size);
float *d_B = NULL;
cudaMalloc((void **)&d_B, size);
float *d_C = NULL;
cudaMalloc((void **)&d_C, size);
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t);
block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes);
for(int i = 0; i < blocksPerGrid; i++)
{
h_block_info[i].started = 0;
h_block_info[i].thread_count = 0;
}
block_info *d_block_info = NULL;
cudaMalloc(&d_block_info, block_info_bytes);
cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice);
volatile unsigned int *timeout = NULL;
cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped);
*timeout = 0;
double quantum = 0.0001 * 1000000.0;
double initial_quantum = quantum;
int complete = 0;
/* Here the kernel launch is looped until all blocks are complete */
while(complete == 0)
{
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info);
usleep(quantum);
*timeout = 1;
cudaDeviceSynchronize();
cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost);
computation_complete(&complete, blocksPerGrid, h_block_info);
if(complete == 0)
{
quantum = quantum + initial_quantum;
*timeout = 0;
}
}
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i)
{
if (fabs(h_B[i] - h_C[i]) > 1e-5)
{
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
cudaDeviceReset();
return 0;
}
$ nvcc -arch=sm_61 -o t100 t100.cu
$ ./t100 327678
[Vector addition of 327678 elements]
Result verification failed at element 0!
$ nvcc -arch=sm_61 -o t100 t100.cu -DUSE_FIX
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$
One other change I made to your code was in this line:
printf("[Vector addition of %d elements]\n", numElements);
This has no bearing on the problem, but the format specifier does not match the variable type; fix it by changing %d to %ld.

2D Convolution Incorrect Results Cuda Constant Memory

I'm struggling with the kernel code. I have updated this post to include the support files, but those were provided and should be correct.
This is one of my first GPU programs, and I've spent several hours trying new things without getting this right. It compiles and runs, but the results are incorrect.
I am basically having trouble understanding what exactly I need to do differently, because this kernel is giving incorrect results. I'm trying to load a tile of the input image into shared memory (Ns[][], which I think I've done correctly) and to apply the filter to that input tile (which I am struggling with).
I would greatly appreciate it if someone more experienced could help me figure out exactly where I've gone wrong and give me an idea how to resolve the issue. I appreciate your time, and apologies if I've asked this question incorrectly.
main.cu:
#include <stdio.h>
#include "support.h"
#include "kernel.cu"
#include <time.h>
int main(int argc, char* argv[]){
Timer timer;
time_t t;
// Initialize host variables ----------------------------------------------
printf("\nSetting up the problem..."); fflush(stdout);
startTime(&timer);
Matrix M_h, N_h, P_h; // M: filter, N: input image, P: output image
Matrix N_d, P_d;
unsigned imageHeight, imageWidth;
cudaError_t cuda_ret;
dim3 dim_grid, dim_block;
/* Read image dimensions */
if (argc == 1) {
imageHeight = 600;
imageWidth = 1000;
} else if (argc == 2) {
imageHeight = atoi(argv[1]);
imageWidth = atoi(argv[1]);
} else if (argc == 3) {
imageHeight = atoi(argv[1]);
imageWidth = atoi(argv[2]);
} else {
printf("\n Invalid input parameters!"
"\n Usage: ./convolution # Image is 600 x 1000"
"\n Usage: ./convolution <m> # Image is m x m"
"\n Usage: ./convolution <m> <n> # Image is m x n"
"\n");
exit(0);
}
/* Allocate host memory */
M_h = allocateMatrix(FILTER_SIZE, FILTER_SIZE);
N_h = allocateMatrix(imageHeight, imageWidth);
P_h = allocateMatrix(imageHeight, imageWidth);
/* Initialize filter and images */
initMatrix(M_h);
initMatrix(N_h);
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
printf(" Image: %u x %u\n", imageHeight, imageWidth);
printf(" Mask: %u x %u\n", FILTER_SIZE, FILTER_SIZE);
// Allocate device variables ----------------------------------------------
printf("Allocating device variables..."); fflush(stdout);
startTime(&timer);
N_d = allocateDeviceMatrix(imageHeight, imageWidth);
P_d = allocateDeviceMatrix(imageHeight, imageWidth);
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy host variables to device ------------------------------------------
printf("Copying data from host to device..."); fflush(stdout);
startTime(&timer);
/* Copy image to device global memory */
copyToDeviceMatrix(N_d, N_h);
cudaMemcpyToSymbol(M_h, M_c,FILTER_SIZE*sizeof(float));
dim_grid = dim3(((N_h.width / BLOCK_SIZE) + 1), ((N_h.height / BLOCK_SIZE) + 1));
dim_block = dim3(BLOCK_SIZE, BLOCK_SIZE);
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Launch kernel ----------------------------------------------------------
printf("Launching kernel..."); fflush(stdout);
startTime(&timer);
convolution<<<dim_grid, dim_block>>>(N_d, P_d);
cuda_ret = cudaDeviceSynchronize();
if(cuda_ret != cudaSuccess) FATAL("Unable to launch/execute kernel");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy device variables from host ----------------------------------------
printf("Copying data from device to host..."); fflush(stdout);
startTime(&timer);
copyFromDeviceMatrix(P_h, P_d);
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Verify correctness -----------------------------------------------------
printf("Verifying results..."); fflush(stdout);
verify(M_h, N_h, P_h);
// Free memory ------------------------------------------------------------
freeMatrix(M_h);
freeMatrix(N_h);
freeMatrix(P_h);
freeDeviceMatrix(N_d);
freeDeviceMatrix(P_d);
return 0;
}
kernel.cu:
__constant__ float M_c[FILTER_SIZE][FILTER_SIZE];
__global__ void convolution(Matrix N, Matrix P){
__shared__ float Ns[TILE_SIZE + 5 - 1][TILE_SIZE + 5 -1];
int i, j;
float output = 0.0f;
int tx = threadIdx.x;
int ty = threadIdx.y;
int row_o = blockIdx.y * TILE_SIZE + ty;
int col_o = blockIdx.x * TILE_SIZE + tx;
int row_i = row_o - 2;
int col_i = col_o - 2;
if((row_i >= 0) && (row_i < N.height) && (col_i >= 0) && (col_i < N.width)){
Ns[ty][tx] = N.elements[row_i * N.width + col_i];
}
else{
Ns[ty][tx] = 0.0f;
}
__syncthreads();
if(ty < TILE_SIZE && tx < TILE_SIZE){
for(i = 0; i < 5; i++){
for(j = 0; j < 5; j++){
output += M_c[i][j] * Ns[i + ty][j + tx];
}
}
}
if(row_o < P.height && col_o < P.width){
P.elements[row_o * P.width + col_o] = output;
}
}
support.h:
#ifndef __FILEH__
#define __FILEH__
#include <sys/time.h>
typedef struct {
struct timeval startTime;
struct timeval endTime;
} Timer;
// Matrix Structure declaration
typedef struct {
unsigned int width;
unsigned int height;
unsigned int pitch;
float* elements;
} Matrix;
#define FILTER_SIZE 5
#define TILE_SIZE 12
#define BLOCK_SIZE (TILE_SIZE + FILTER_SIZE - 1)
Matrix allocateMatrix(unsigned height, unsigned width);
void initMatrix(Matrix mat);
Matrix allocateDeviceMatrix(unsigned height, unsigned width);
void copyToDeviceMatrix(Matrix dst, Matrix src);
void copyFromDeviceMatrix(Matrix dst, Matrix src);
void verify(Matrix M, Matrix N, Matrix P);
void freeMatrix(Matrix mat);
void freeDeviceMatrix(Matrix mat);
void startTime(Timer* timer);
void stopTime(Timer* timer);
float elapsedTime(Timer timer);
#define FATAL(msg, ...) \
do {\
fprintf(stderr, "[%s:%d] "msg"\n", __FILE__, __LINE__, ##__VA_ARGS__);\
exit(-1);\
} while(0)
#if __BYTE_ORDER != __LITTLE_ENDIAN
# error "File I/O is not implemented for this system: wrong endianness."
#endif
#endif
support.cu:
#include <stdlib.h>
#include <stdio.h>
#include "support.h"
Matrix allocateMatrix(unsigned height, unsigned width)
{
Matrix mat;
mat.height = height;
mat.width = mat.pitch = width;
mat.elements = (float*)malloc(height*width*sizeof(float));
if(mat.elements == NULL) FATAL("Unable to allocate host");
return mat;
}
void initMatrix(Matrix mat)
{
for (unsigned int i=0; i < mat.height*mat.width; i++) {
mat.elements[i] = (rand()%100)/100.00;
}
}
Matrix allocateDeviceMatrix(unsigned height, unsigned width)
{
Matrix mat;
cudaError_t cuda_ret;
mat.height = height;
mat.width = mat.pitch = width;
cuda_ret = cudaMalloc((void**)&(mat.elements), height*width*sizeof(float));
if(cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
return mat;
}
void copyToDeviceMatrix(Matrix dst, Matrix src)
{
cudaError_t cuda_ret;
cuda_ret = cudaMemcpy(dst.elements, src.elements, src.height*src.width*sizeof(float), cudaMemcpyHostToDevice);
if(cuda_ret != cudaSuccess) FATAL("Unable to copy to device");
}
void copyFromDeviceMatrix(Matrix dst, Matrix src)
{
cudaError_t cuda_ret;
cuda_ret = cudaMemcpy(dst.elements, src.elements, src.height*src.width*sizeof(float), cudaMemcpyDeviceToHost);
if(cuda_ret != cudaSuccess) FATAL("Unable to copy from device");
}
void verify(Matrix M, Matrix N, Matrix P) {
const float relativeTolerance = 1e-6;
for(int row = 0; row < N.height; ++row) {
for(int col = 0; col < N.width; ++col) {
float sum = 0.0f;
for(int i = 0; i < M.height; ++i) {
for(int j = 0; j < M.width; ++j) {
int iN = row - M.height/2 + i;
int jN = col - M.width/2 + j;
if(iN >= 0 && iN < N.height && jN >= 0 && jN < N.width) {
sum += M.elements[i*M.width + j]*N.elements[iN*N.width + jN];
}
}
}
float relativeError = (sum - P.elements[row*P.width + col])/sum;
if (relativeError > relativeTolerance
|| relativeError < -relativeTolerance) {
printf("TEST FAILED\n\n");
exit(0);
}
}
}
printf("TEST PASSED\n\n");
}
void freeMatrix(Matrix mat)
{
free(mat.elements);
mat.elements = NULL;
}
void freeDeviceMatrix(Matrix mat)
{
cudaFree(mat.elements);
mat.elements = NULL;
}
void startTime(Timer* timer) {
gettimeofday(&(timer->startTime), NULL);
}
void stopTime(Timer* timer) {
gettimeofday(&(timer->endTime), NULL);
}
float elapsedTime(Timer timer) {
return ((float) ((timer.endTime.tv_sec - timer.startTime.tv_sec) \
+ (timer.endTime.tv_usec - timer.startTime.tv_usec)/1.0e6));
}
One set of problems is here:
cudaMemcpyToSymbol(M_h, M_c,FILTER_SIZE*sizeof(float));
If you ran your code with cuda-memcheck it would point you right at this line as being a problem.
The first parameter should be the destination symbol, i.e. M_c, and the second parameter should be the host source pointer, i.e. M_h.
Furthermore, shouldn't it be FILTER_SIZE*FILTER_SIZE ? Isn't the size of data you want to transfer equal to the dimension squared?
Finally, M_h is not a valid source pointer. You should use M_h.elements.
So something like this:
cudaMemcpyToSymbol(M_c, M_h.elements,FILTER_SIZE*FILTER_SIZE*sizeof(float));
I don't believe this fixes all the issues in your code. To continue debugging, I would print out one element of the GPU result that does not match your verify routine, and work through the arithmetic for that one element. Using printf in device code may help.
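For example, a minimal sketch of such a device-side print, added just before the final store in your posted kernel (a debug aid only, not a fix; element (0,0) is just an arbitrary choice):
if (row_o == 0 && col_o == 0) {
// hypothetical debug aid: dump the value this thread computed for output element (0,0),
// so it can be compared against the verify() arithmetic for the same element
printf("device output(0,0) = %f\n", output);
}
if (row_o < P.height && col_o < P.width) {
P.elements[row_o * P.width + col_o] = output;
}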
In the future, please run your code with cuda-memcheck before asking for help here. Even if you don't understand the output, it will be useful for those trying to help you.

maximum supported size for cub library

Does anyone know what the maximum supported size for cub::scan is? I got a core dump for input sizes over 500 million. I wanted to make sure I'm not doing anything wrong...
Here is my code:
#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;
bool g_verbose = false; // Whether to display input/output to console
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
typedef int mytype;
/**
* Solve inclusive-scan problem
*/
static void solve(mytype *h_in, mytype *h_cpu, int n)
{
mytype inclusive = 0;
for (int i = 0; i < n; ++i) {
inclusive += h_in[i];
h_cpu[i] = inclusive;
}
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
for (int i = 0; i < n; i++) {
if (h_cpu[i] != h_o[i]) {
return i + 1;
}
}
return 0;
}
/**
* Main
*/
int main(int argc, char** argv)
{
cudaSetDevice(0);
struct timeval start, end;
int num_items = 1073741824;
const int repetitions = 5;
mytype *h_in, *h_out, *h_cpu;
const int size = num_items * sizeof(mytype);
// Allocate host arrays
h_in = (mytype *)malloc(size);
h_out = (mytype *)malloc(size);
h_cpu = (mytype *)malloc(size);
// Initialize problem and solution
for (int i = 0; i < num_items; i++) {
h_in[i] = i;
h_out[i] = 0;
h_cpu[i] = 0;
}
solve(h_in, h_cpu, num_items);
// Allocate problem device arrays
mytype *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array
mytype *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
gettimeofday(&start, NULL);
for (long i = 0; i < repetitions; i++)
DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
cudaThreadSynchronize();
gettimeofday(&end, NULL);
double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;
cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
int cmp = compare(h_cpu, h_out, num_items);
printf("%d\t", num_items);
if (!cmp)
printf("\t%7.4fs \n", ctime);
printf("\n");
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (h_cpu) delete[] h_cpu;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}
The problem is here:
const int size = num_items * sizeof(mytype);
And it can be fixed by changing it to:
const size_t size = num_items * sizeof(mytype);
The value of num_items in the code is over 1 billion. When we multiply that by sizeof(mytype) we are multiplying it by 4, so the result is over 4 billion. This value cannot be stored in an int variable. If you try to use it that way anyway, your subsequent host code will do bad things. This problem (the core dump) actually has nothing to do with CUDA; the code would core dump even if you removed all the CUB elements.
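To see the narrowing in isolation (a small sketch, assuming a 32-bit int and a 64-bit size_t):
int num_items = 1073741824; // 2^30
const int bad = num_items * sizeof(int); // product is 2^32, computed in size_t but truncated when stored in an int (commonly 0)
const size_t good = (size_t)num_items * sizeof(int); // 4294967296 bytes, as intended
printf("bad = %d, good = %zu\n", bad, good); // e.g. "bad = 0, good = 4294967296"
With the truncated value, the subsequent malloc calls allocate far less than intended, and the initialization loop then writes past the allocation, which is the observed core dump.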
When I modify the line of code above, and compile for the correct GPU (e.g. -arch=sm_35 in my case, or -arch=sm_52 for a Titan X GPU), then I get the correct answer (and no seg fault/core dump).
In general, the correct starting point when chasing a seg fault/core dump type of error is to recognize that the error arises from host code, and to attempt to localize the exact line of source code that is generating it. This can be done trivially/tediously by putting many printf statements in your code, until you identify the line after which you don't see any printf output, or by using a host code debugger, such as gdb on Linux.
Also note that this code as written will require slightly more than 12GB of memory on the host, and slightly more than 8GB of memory on the GPU, so it will only run properly in such settings.
For reference, here is the fixed code (based on what OP posted here):
#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;
bool g_verbose = false; // Whether to display input/output to console
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
typedef int mytype;
/**
* Solve inclusive-scan problem
*/
static void solve(mytype *h_in, mytype *h_cpu, int n)
{
mytype inclusive = 0;
for (int i = 0; i < n; ++i) {
inclusive += h_in[i];
h_cpu[i] = inclusive;
}
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
for (int i = 0; i < n; i++) {
if (h_cpu[i] != h_o[i]) {
return i + 1;
}
}
return 0;
}
/**
* Main
*/
int main(int argc, char** argv)
{
cudaSetDevice(0);
struct timeval start, end;
int num_items = 1073741824;
const int repetitions = 5;
mytype *h_in, *h_out, *h_cpu;
const size_t size = num_items * sizeof(mytype);
// Allocate host arrays
h_in = (mytype *)malloc(size);
h_out = (mytype *)malloc(size);
h_cpu = (mytype *)malloc(size);
// Initialize problem and solution
for (int i = 0; i < num_items; i++) {
h_in[i] = i;
h_out[i] = 0;
h_cpu[i] = 0;
}
solve(h_in, h_cpu, num_items);
// Allocate problem device arrays
mytype *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array
mytype *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
gettimeofday(&start, NULL);
for (long i = 0; i < repetitions; i++)
DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
cudaThreadSynchronize();
gettimeofday(&end, NULL);
double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;
cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
int cmp = compare(h_cpu, h_out, num_items);
printf("%d\t", num_items);
if (!cmp)
printf("\t%7.4fs \n", ctime);
printf("\n");
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (h_cpu) delete[] h_cpu;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}

Is prefix scan CUDA sample code in gpugems3 correct?

I've written a piece of code to call the kernel in the book GPU Gems 3, Chapter 39: Parallel Prefix Sum (Scan) with CUDA.
However, the results that I get are a bunch of negative numbers instead of the prefix scan.
Is my kernel call wrong, or is there something wrong with the code from the GPU Gems 3 book?
Here is my code:
#include <stdio.h>
#include <sys/time.h>
#include <cuda.h>
__global__ void kernel(int *g_odata, int *g_idata, int n, int dim)
{
extern __shared__ int temp[];// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += g_idata[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
void Initialize(int *h_in,int num_items)
{
int j;
for(j=0;j<num_items;j++)
h_in[j]=j;
printf(" input: ");
printf("\n\n");
}
int main(int argc, char** argv)
{
int num_items = 512;
int* h_in = new int[num_items];
// Initialize problem
Initialize(h_in, num_items);
int *d_in = NULL;
cudaMalloc((void**)&d_in, sizeof(int) * num_items);
if(cudaSuccess != cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)) fprintf(stderr,"could not copy to gpu");
// Allocate device output array
int *d_out = NULL;
cudaMalloc((void**)&d_out, sizeof(int) * (num_items+1));
kernel<<<1,256,num_items*sizeof(int)>>>(d_out, d_in,num_items, 2);
int* h_out= new int[num_items+1];
if(cudaSuccess != cudaMemcpy(h_out,d_out,sizeof(int)*(num_items+1),cudaMemcpyDeviceToHost))fprintf(stderr,"could not copy back");
int i;
printf(" \n");
for(i=0;i<num_items;i++)
printf(" ,%d ",h_out[i]);
// Cleanup
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (d_in) cudaFree(d_in);
if (d_out) cudaFree(d_out);
printf("\n\n");
return 0;
}
It seems that you've made at least one error in transcribing the code from the GPU Gems 3 chapter into your kernel. This line is incorrect:
temp[bi] += g_idata[ai];
it should be:
temp[bi] += temp[ai];
When I make that one change to the code you have now posted, it seems to print out the correct (exclusive-scan) prefix sum for me. There are a few other things I would mention:
Even without that change, I get some results that are close to correct. So if you're getting widely different results (e.g. negative numbers), you may have a problem with your machine setup or CUDA install. I would suggest using more rigorous CUDA error checking than what you have now (although a machine setup problem should have been indicated by one of your checks).
The routine as crafted has some limitations. It can only be used in a single threadblock, it will have bank conflicts on shared memory access, and it is limited in data set size to what can be handled by a single threadblock (this routine produces two output elements per thread, so the data set size is expected to be equal to twice the number of threads). As has already been covered, the dynamic shared memory allocation needs to be as large as the data set size (i.e. twice the number of threads, in elements).
This may be useful for learning, but if you want a robust, fast prefix scan, you are advised to use a routine from Thrust or CUB instead of your own code, even if derived from this (old) article.
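For reference, a minimal Thrust version of the same (exclusive) scan could look like this (a sketch only; the input is just 0..511, as in your Initialize routine):
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <thrust/sequence.h>
#include <cstdio>
int main()
{
const int n = 512;
thrust::device_vector<int> d_in(n), d_out(n);
thrust::sequence(d_in.begin(), d_in.end()); // fill with 0, 1, 2, ..., n-1
// exclusive scan, matching the output convention of the GPU Gems kernel
thrust::exclusive_scan(d_in.begin(), d_in.end(), d_out.begin());
int last = d_out[n - 1]; // copies a single element back to the host
printf("d_out[%d] = %d\n", n - 1, last); // expect 0 + 1 + ... + 510 = 130305
return 0;
}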
The following code is similar to yours, but it has the above issues fixed, and I have templated the kernel for use with various datatypes:
#include <stdio.h>
#define DSIZE 512
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef int mytype;
template <typename T>
__global__ void prescan(T *g_odata, T *g_idata, int n)
{
extern __shared__ T temp[]; // allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
T t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
int main(){
mytype *h_i, *d_i, *h_o, *d_o;
int dszp = (DSIZE)*sizeof(mytype);
h_i = (mytype *)malloc(dszp);
h_o = (mytype *)malloc(dszp);
if ((h_i == NULL) || (h_o == NULL)) {printf("malloc fail\n"); return 1;}
cudaMalloc(&d_i, dszp);
cudaMalloc(&d_o, dszp);
cudaCheckErrors("cudaMalloc fail");
for (int i = 0 ; i < DSIZE; i++){
h_i[i] = i;
h_o[i] = 0;}
cudaMemset(d_o, 0, dszp);
cudaCheckErrors("cudaMemset fail");
cudaMemcpy(d_i, h_i, dszp, cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy 1 fail");
prescan<<<1,DSIZE/2, dszp>>>(d_o, d_i, DSIZE);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
cudaMemcpy(h_o, d_o, dszp, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2 fail");
mytype psum = 0;
for (int i =1; i < DSIZE; i++){
psum += h_i[i-1];
if (psum != h_o[i]) {printf("mismatch at %d, was: %d, should be: %d\n", i, h_o[i], psum); return 1;}
}
return 0;
}