Is the prefix scan CUDA sample code in GPU Gems 3 correct?

I've written a piece of code to call the kernel from the book GPU Gems 3, Chapter 39: Parallel Prefix Sum (Scan) with CUDA.
However, the results I get are a bunch of negative numbers instead of a prefix scan.
Is my kernel call wrong, or is there something wrong with the code from the GPU Gems 3 book?
Here is my code:
#include <stdio.h>
#include <sys/time.h>
#include <cuda.h>
__global__ void kernel(int *g_odata, int *g_idata, int n, int dim)
{
extern __shared__ int temp[];// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += g_idata[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
void Initialize(int *h_in,int num_items)
{
int j;
for(j=0;j<num_items;j++)
h_in[j]=j;
printf(" input: ");
printf("\n\n");
}
int main(int argc, char** argv)
{
int num_items = 512;
int* h_in = new int[num_items];
// Initialize problem
Initialize(h_in, num_items);
int *d_in = NULL;
cudaMalloc((void**)&d_in, sizeof(int) * num_items);
if(cudaSuccess != cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)) fprintf(stderr,"could not copy to gpu");
// Allocate device output array
int *d_out = NULL;
cudaMalloc((void**)&d_out, sizeof(int) * (num_items+1));
kernel<<<1,256,num_items*sizeof(int)>>>(d_out, d_in,num_items, 2);
int* h_out= new int[num_items+1];
if(cudaSuccess != cudaMemcpy(h_out,d_out,sizeof(int)*(num_items+1),cudaMemcpyDeviceToHost))fprintf(stderr,"could not copy back");
int i;
printf(" \n");
for(i=0;i<num_items;i++)
printf(" ,%d ",h_out[i]);
// Cleanup
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (d_in) cudaFree(d_in);
if (d_out) cudaFree(d_out);
printf("\n\n");
return 0;
}

It seems that you've made at least one error in transcribing the code from the GPU Gems 3 chapter into your kernel. This line is incorrect:
temp[bi] += g_idata[ai];
it should be:
temp[bi] += temp[ai];
When I make that one change to the code you have posted, it prints the correct (exclusive-scan) prefix sum for me. There are a few other things I would mention:
Even without that change, I get some results that are close to correct. So if you're getting wildly different output (e.g. negative numbers), you may have a problem with your machine setup or CUDA install. I would suggest using more rigorous CUDA error checking than what you have now (although a machine setup problem should have shown up in one of your existing checks).
The routine as crafted has some limitations. It can only be used in a single threadblock, it will have bank conflicts on shared memory access, and the data set size is limited to what a single threadblock can handle (the routine produces two output elements per thread, so the data set size is expected to be twice the number of threads). As has already been covered, the dynamic shared memory allocation needs to be as large as the data set size (i.e. twice the number of threads, in elements).
This may be useful for learning, but if you want a robust, fast prefix scan, you are advised to use a routine from thrust or cub instead of your own code, even if it is derived from this (old) article.
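For example, a minimal thrust sketch of the same exclusive scan (the size and input values here simply mirror your code) could look like this:
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/scan.h>
#include <cstdio>
int main(){
    const int num_items = 512;
    thrust::device_vector<int> d_in(num_items);
    thrust::sequence(d_in.begin(), d_in.end());             // 0, 1, 2, ... like your Initialize()
    thrust::device_vector<int> d_out(num_items);
    // device-wide exclusive prefix sum; no single-threadblock size limit
    thrust::exclusive_scan(d_in.begin(), d_in.end(), d_out.begin());
    printf("last element: %d\n", (int)d_out[num_items-1]);  // expect 0+1+...+510 = 130305
    return 0;
}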
The following code is similar to yours, but it has the above issues fixed, and I have templated the kernel for use with various datatypes:
#include <stdio.h>
#define DSIZE 512
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef int mytype;
template <typename T>
__global__ void prescan(T *g_odata, T *g_idata, int n)
{
extern __shared__ T temp[]; // allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
T t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
int main(){
mytype *h_i, *d_i, *h_o, *d_o;
int dszp = (DSIZE)*sizeof(mytype);
h_i = (mytype *)malloc(dszp);
h_o = (mytype *)malloc(dszp);
if ((h_i == NULL) || (h_o == NULL)) {printf("malloc fail\n"); return 1;}
cudaMalloc(&d_i, dszp);
cudaMalloc(&d_o, dszp);
cudaCheckErrors("cudaMalloc fail");
for (int i = 0 ; i < DSIZE; i++){
h_i[i] = i;
h_o[i] = 0;}
cudaMemset(d_o, 0, dszp);
cudaCheckErrors("cudaMemset fail");
cudaMemcpy(d_i, h_i, dszp, cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy 1 fail");
prescan<<<1,DSIZE/2, dszp>>>(d_o, d_i, DSIZE);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
cudaMemcpy(h_o, d_o, dszp, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2 fail");
mytype psum = 0;
for (int i =1; i < DSIZE; i++){
psum += h_i[i-1];
if (psum != h_o[i]) {printf("mismatch at %d, was: %d, should be: %d\n", i, h_o[i], psum); return 1;}
}
return 0;
}

Related

CUBLAS batch and matrix sizes [duplicate]

Some background info on the problem I am trying to speed up using CUDA:
I have a large number of small-to-moderate, same-sized linear systems I need to solve independently. Each linear system is square, real, dense, invertible, and non-symmetric. These are actually matrix systems, so each system looks like AX = B, where A, X, and B are (n x n) matrices.
In the previous question CUBLAS batch and matrix sizes, I learned that cuBLAS batched operations give the best performance for matrices of size 100x100 or smaller.
I still have an issue because my matrices have 100 < n < 700. So the matrices are of moderate size, where cuBLAS batched operations do not give the best performance, and the regular dense routines (cusolverDnDgetrf, cusolverDnDgetrs) also do not perform better than MATLAB (see the timings below).
I did some timing against MATLAB for solving a single system and found that the regular routines are better for matrices of size 4096x4096 or larger. I generate a random matrix of size (n x n), for n = 64, 256, 512, 1024, 4096, 16384, and only time the factorization and back/forward solve, with no transfers across PCIe.
DOUBLE PRECISION CUDA (GTX 1080ti) vs MATLAB (backslash)
(GPU) 64: 0.001157 sec
(MATLAB) 64: 0.000205 sec
(GPU) 256: 0.01161 sec
(MATLAB) 256: 0.007762 sec
(GPU) 512: 0.026348 sec
(MATLAB) 512: 0.008550 sec
(GPU) 1024: 0.064357 sec
(MATLAB) 1024: 0.036280 sec
(GPU) 4096: 0.734908 sec
(MATLAB) 4096: 1.174442 sec
(GPU) 16384: 32.962229 sec
(MATLAB) 16384: 68.691236 sec
These timings make me conclude that iterating over my matrices one by one, calling the non-batched inversion method, will be slower than MATLAB. Also, for my moderately sized matrices, the batched cuBLAS inversion method will not perform well, according to CUBLAS batch and matrix sizes.
Is there another approach I should consider to speed up my code with CUDA? Or am I misunderstanding something?
/* How to use
* ./cuSolverDn_LinearSolver // Default: cholesky
* ./cuSolverDn_LinearSolver -R=chol -file<file> // cholesky factorization
* ./cuSolverDn_LinearSolver -R=lu -file<file> // LU with partial pivoting
* ./cuSolverDn_LinearSolver -R=qr -file<file> // QR factorization
*
* Remark: the absolute error on solution x is meaningless without knowing condition number of A.
* The relative error on residual should be close to machine zero, i.e. 1.e-15.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cusolverDn.h"
#include "helper_cuda.h"
#include "helper_cusolver.h"
int linearSolverLU(
cusolverDnHandle_t handle,
int n,
const double *Acopy,
int lda,
const double *b,
double *x)
{
int bufferSize = 0;
int *info = NULL;
double *buffer = NULL;
double *A = NULL;
int *ipiv = NULL; // pivoting sequence
int h_info = 0;
double start, stop;
double time_solve;
checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n, (double*)Acopy, lda, &bufferSize));
checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double)*bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double)*lda*n));
checkCudaErrors(cudaMalloc(&ipiv, sizeof(int)*n));
// prepare a copy of A because getrf will overwrite A with L
checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
start = second();
start = second();
checkCudaErrors(cusolverDnDgetrf(handle, n, n, A, lda, buffer, ipiv, info));
checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
if ( 0 != h_info ){
fprintf(stderr, "Error: LU factorization failed\n");
}
//checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
//checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, A, lda, ipiv, x, n, info));
checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, n, A, lda, ipiv, x, n, info));
checkCudaErrors(cudaDeviceSynchronize());
stop = second();
time_solve = stop - start;
fprintf (stdout, "timing: LU = %10.6f sec\n", time_solve);
if (info ) { checkCudaErrors(cudaFree(info )); }
if (buffer) { checkCudaErrors(cudaFree(buffer)); }
if (A ) { checkCudaErrors(cudaFree(A)); }
if (ipiv ) { checkCudaErrors(cudaFree(ipiv));}
return 0;
}
void generate_random_dense_matrix(int M, int N, double **outA)
{
int i, j;
double rMax = (double)RAND_MAX;
double *A = (double *)malloc(sizeof(double) * M * N);
// For each column
for (j = 0; j < N; j++)
{
// For each row
for (i = 0; i < M; i++)
{
double dr = (double)rand();
A[j * M + i] = (dr / rMax) * 100.0;
//printf("A[j * M + i] = %f \n",A[j * M + i]);
}
}
*outA = A;
}
int main (int argc, char *argv[])
{
struct testOpts opts;
cusolverDnHandle_t handle = NULL;
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
cudaStream_t stream = NULL;
int rowsA = 0; // number of rows of A
int colsA = 0; // number of columns of A
int nnzA = 0; // number of nonzeros of A
int baseA = 0; // base index in CSR format
int lda = 0; // leading dimension in dense matrix
// CSR(A) from I/O
int *h_csrRowPtrA = NULL;
int *h_csrColIndA = NULL;
double *h_csrValA = NULL;
double *h_A = NULL; // dense matrix from CSR(A)
double *h_x = NULL; // a copy of d_x
double *h_b = NULL; // b = ones(m,1)
double *h_r = NULL; // r = b - A*x, a copy of d_r
double *d_A = NULL; // a copy of h_A
double *d_x = NULL; // x = A \ b
double *d_b = NULL; // a copy of h_b
double *d_r = NULL; // r = b - A*x
// the constants are used in residual evaluation, r = b - A*x
const double minus_one = -1.0;
const double one = 1.0;
double x_inf = 0.0;
double r_inf = 0.0;
double A_inf = 0.0;
int errors = 0;
colsA = 660;
rowsA = colsA;
int NN = colsA;
int MM = rowsA;
lda = rowsA;
// Generate inputs
srand(9384);
generate_random_dense_matrix(MM, NN, &h_A);
generate_random_dense_matrix(MM, NN, &h_b);
parseCommandLineArguments(argc, argv, opts);
if (NULL == opts.testFunc)
{
//opts.testFunc = "chol"; // By default running Cholesky as NO solver selected with -R option.
opts.testFunc = "lu";
//opts.testFunc = "qr";
}
findCudaDevice(argc, (const char **)argv);
/*
printf("step 1: read matrix market format\n");
if (opts.sparse_mat_filename == NULL)
{
opts.sparse_mat_filename = sdkFindFilePath("gr_900_900_crg.mtx", argv[0]);
if (opts.sparse_mat_filename != NULL)
printf("Using default input file [%s]\n", opts.sparse_mat_filename);
else
printf("Could not find gr_900_900_crg.mtx\n");
}
else
{
printf("Using input file [%s]\n", opts.sparse_mat_filename);
}
if (opts.sparse_mat_filename == NULL)
{
fprintf(stderr, "Error: input matrix is not provided\n");
return EXIT_FAILURE;
}
if (loadMMSparseMatrix<double>(opts.sparse_mat_filename, 'd', true , &rowsA, &colsA,
&nnzA, &h_csrValA, &h_csrRowPtrA, &h_csrColIndA, true))
{
exit(EXIT_FAILURE);
}
baseA = h_csrRowPtrA[0]; // baseA = {0,1}
printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, nnzA, baseA);
if ( rowsA != colsA )
{
fprintf(stderr, "Error: only support square matrix\n");
exit(EXIT_FAILURE);
}
printf("step 2: convert CSR(A) to dense matrix\n");
lda = opts.lda ? opts.lda : rowsA;
if (lda < rowsA)
{
fprintf(stderr, "Error: lda must be greater or equal to dimension of A\n");
exit(EXIT_FAILURE);
}
*/
//h_A = (double*)malloc(sizeof(double)*lda*colsA);
h_x = (double*)malloc(sizeof(double)*lda*colsA);
//h_b = (double*)malloc(sizeof(double)*rowsA);
h_r = (double*)malloc(sizeof(double)*lda*rowsA);
assert(NULL != h_A);
assert(NULL != h_x);
assert(NULL != h_b);
assert(NULL != h_r);
/*
memset(h_A, 0, sizeof(double)*lda*colsA);
for(int row = 0 ; row < rowsA ; row++)
{
const int start = h_csrRowPtrA[row ] - baseA;
const int end = h_csrRowPtrA[row+1] - baseA;
for(int colidx = start ; colidx < end ; colidx++)
{
const int col = h_csrColIndA[colidx] - baseA;
const double Areg = h_csrValA[colidx];
h_A[row + col*lda] = Areg;
}
}
printf("step 3: set right hand side vector (b) to 1\n");
for(int row = 0 ; row < rowsA ; row++)
{
h_b[row] = 1.0;
}
*/
// verify if A is symmetric or not.
if ( 0 == strcmp(opts.testFunc, "chol") )
{
int issym = 1;
for(int j = 0 ; j < colsA ; j++)
{
for(int i = j ; i < rowsA ; i++)
{
double Aij = h_A[i + j*lda];
double Aji = h_A[j + i*lda];
if ( Aij != Aji )
{
issym = 0;
break;
}
}
}
if (!issym)
{
printf("Error: A has no symmetric pattern, please use LU or QR \n");
exit(EXIT_FAILURE);
}
}
checkCudaErrors(cusolverDnCreate(&handle));
checkCudaErrors(cublasCreate(&cublasHandle));
checkCudaErrors(cudaStreamCreate(&stream));
checkCudaErrors(cusolverDnSetStream(handle, stream));
checkCudaErrors(cublasSetStream(cublasHandle, stream));
checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double)*lda*colsA));
checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double)*lda*colsA));
checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double)*lda*rowsA));
checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*lda*rowsA));
printf("step 4: prepare data on device\n");
checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double)*lda*colsA, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_b, h_b, sizeof(double)*lda*rowsA, cudaMemcpyHostToDevice));
printf("step 5: solve A*x = b \n");
// d_A and d_b are read-only
if ( 0 == strcmp(opts.testFunc, "chol") )
{
linearSolverCHOL(handle, rowsA, d_A, lda, d_b, d_x);
}
else if ( 0 == strcmp(opts.testFunc, "lu") )
{
//printf("hi \n");
linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x);
}
else if ( 0 == strcmp(opts.testFunc, "qr") )
{
linearSolverQR(handle, rowsA, d_A, lda, d_b, d_x);
}
else
{
fprintf(stderr, "Error: %s is unknown function\n", opts.testFunc);
exit(EXIT_FAILURE);
}
printf("step 6: evaluate residual\n");
checkCudaErrors(cudaMemcpy(d_r, d_b, sizeof(double)*lda*rowsA, cudaMemcpyDeviceToDevice));
// r = b - A*x
checkCudaErrors(cublasDgemm_v2(
cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
rowsA,
colsA,
colsA,
&minus_one,
d_A,
lda,
d_x,
rowsA,
&one,
d_r,
rowsA));
checkCudaErrors(cudaMemcpy(h_x, d_x, sizeof(double)*lda*colsA, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*lda*rowsA, cudaMemcpyDeviceToHost));
x_inf = vec_norminf(colsA, h_x);
r_inf = vec_norminf(rowsA, h_r);
A_inf = mat_norminf(rowsA, colsA, h_A, lda);
printf("x[0] = %f\n", h_x[0]);
printf("r[0] = %f\n", h_r[0]);
printf("|b - A*x| = %E \n", r_inf);
printf("|A| = %E \n", A_inf);
printf("|x| = %E \n", x_inf);
printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf));
if (handle) { checkCudaErrors(cusolverDnDestroy(handle)); }
if (cublasHandle) { checkCudaErrors(cublasDestroy(cublasHandle)); }
if (stream) { checkCudaErrors(cudaStreamDestroy(stream)); }
if (h_csrValA ) { free(h_csrValA); }
if (h_csrRowPtrA) { free(h_csrRowPtrA); }
if (h_csrColIndA) { free(h_csrColIndA); }
if (h_A) { free(h_A); }
if (h_x) { free(h_x); }
if (h_b) { free(h_b); }
if (h_r) { free(h_r); }
if (d_A) { checkCudaErrors(cudaFree(d_A)); }
if (d_x) { checkCudaErrors(cudaFree(d_x)); }
if (d_b) { checkCudaErrors(cudaFree(d_b)); }
if (d_r) { checkCudaErrors(cudaFree(d_r)); }
return 0;
}
Try using two or more parallel streams (with one linear system each) on the GPU; this may help utilize a bigger part of the GPU (see the sketch after these suggestions).
For timing measurements and hardware utilization, use the visual profiler instead of CPU time measurements.
Another point is that the GTX (consumer) GPUs perform pretty badly in double precision. If you have the chance, try to use a Tesla GPU instead.
MATLAB provides a way to call the cublas batch interface for GPU arrays using pagefun.
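To make the multi-stream suggestion concrete, here is a rough sketch that factors and solves two systems concurrently, one per stream, with one cuSOLVER handle each. The sizes, buffer names, and the chk macro are illustrative, not taken from the code above:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cusolverDn.h"
#define chk(x) do { if ((x) != 0) { printf("error at %s:%d\n", __FILE__, __LINE__); exit(1); } } while (0)
int main()
{
    const int n = 660, lda = n, nrhs = n, nsys = 2;  // two systems solved concurrently
    cudaStream_t stream[nsys];
    cusolverDnHandle_t handle[nsys];
    double *d_A[nsys], *d_B[nsys], *d_work[nsys];
    int *d_ipiv[nsys], *d_info[nsys];
    double *h_A = (double*)malloc(sizeof(double)*lda*n);
    double *h_B = (double*)malloc(sizeof(double)*lda*nrhs);
    for (int i = 0; i < lda*n; i++) h_A[i] = rand()/(double)RAND_MAX;
    for (int i = 0; i < lda*nrhs; i++) h_B[i] = rand()/(double)RAND_MAX;
    for (int s = 0; s < nsys; s++) {
        chk(cudaStreamCreate(&stream[s]));
        chk(cusolverDnCreate(&handle[s]));
        chk(cusolverDnSetStream(handle[s], stream[s]));
        chk(cudaMalloc((void**)&d_A[s], sizeof(double)*lda*n));
        chk(cudaMalloc((void**)&d_B[s], sizeof(double)*lda*nrhs));
        chk(cudaMalloc((void**)&d_ipiv[s], sizeof(int)*n));
        chk(cudaMalloc((void**)&d_info[s], sizeof(int)));
        int lwork = 0;
        chk(cusolverDnDgetrf_bufferSize(handle[s], n, n, d_A[s], lda, &lwork));
        chk(cudaMalloc((void**)&d_work[s], sizeof(double)*lwork));
        chk(cudaMemcpyAsync(d_A[s], h_A, sizeof(double)*lda*n, cudaMemcpyHostToDevice, stream[s]));
        chk(cudaMemcpyAsync(d_B[s], h_B, sizeof(double)*lda*nrhs, cudaMemcpyHostToDevice, stream[s]));
    }
    // enqueue one LU factorization + solve per stream; with no host synchronization
    // in between, the two solves are free to overlap on the device
    for (int s = 0; s < nsys; s++) {
        chk(cusolverDnDgetrf(handle[s], n, n, d_A[s], lda, d_work[s], d_ipiv[s], d_info[s]));
        chk(cusolverDnDgetrs(handle[s], CUBLAS_OP_N, n, nrhs, d_A[s], lda, d_ipiv[s], d_B[s], lda, d_info[s]));
    }
    chk(cudaDeviceSynchronize());
    printf("done\n");
    return 0;
}
Whether this actually improves throughput depends on how much of the GPU a single n = 660 factorization already occupies; the profiler will show whether the streams overlap.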

Cuda idx doesn't index matrices correctly

I have the following kernel in cuda:
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
int j;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ((idx > 0) && (idx < N)){
//for(j=0;j<N;j++){
// outgoing[j].p_t1=ingoing[j].p_t1;
//}
outgoing[idx].p_t1=ingoing[idx].p_t1;
}
}
This doesn't work. The following works:
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
int j;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ((idx > 0) && (idx < N)){
for(j=0;j<N;j++){
outgoing[j].p_t1=ingoing[j].p_t1;
}
//outgoing[idx].p_t1=ingoing[idx].p_t1;
}
}
What is wrong? Why doesn't idx index the matrices correctly?
The whole code is given below; it may not be easy to follow all of it. The point is that when I print the outgoing[idx].p_t1 fields at the end of the main function, they print 0s when I do
outgoing[idx].p_t1=ingoing[idx].p_t1;
but they are correct when I do
for(j=0;j<N;j++){
outgoing[j].p_t1=ingoing[j].p_t1;
}
What's wrong?
/******************** Includes - Defines ****************/
#include "pagerank_serial.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include <string.h>
#include <sys/time.h>
#include <fcntl.h>
#include <cuda.h>
#include "string.h"
/******************** Defines ****************/
// Number of nodes
int N;
// Convergence threashold and algorithm's parameter d
double threshold, d;
// Table of node's data
Node *Nodes;
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) {
int j;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ((idx > 0) && (idx < N)){
for(j=0;j<N;j++){
outgoing[j].p_t1=ingoing[j].p_t1;
}
//outgoing[idx].p_t1=ingoing[idx].p_t1;
}
}
/***** Read graph connections from txt file *****/
void Read_from_txt_file(char* filename)
{
FILE *fid;
int from_idx, to_idx;
int temp_size;
fid = fopen(filename, "r");
if (fid == NULL){
printf("Error opening data file\n");
}
while (!feof(fid))
{
if (fscanf(fid,"%d\t%d\n", &from_idx,&to_idx))
{
Nodes[from_idx].con_size++;
temp_size = Nodes[from_idx].con_size;
//Nodes[from_idx].To_id =(int*) realloc(Nodes[from_idx].To_id, temp_size * sizeof(int));
Nodes[from_idx].To_id[temp_size - 1] = to_idx;
}
}
//printf("End of connections insertion!\n");
fclose(fid);
}
/***** Read P vector from txt file*****/
void Read_P_from_txt_file()
{
FILE *fid;
double temp_P;
int index = 0;
fid = fopen("P.txt", "r");
if (fid == NULL){printf("Error opening the Probabilities file\n");}
while (!feof(fid))
{
// P's values are double!
if (fscanf(fid," double sum = 0;%lf\n", &temp_P))
{
Nodes[index].p_t1 = temp_P;
index++;
}
}
//printf("End of P insertion!");
fclose(fid);
}
/***** Read E vector from txt file*****/
void Read_E_from_txt_file()
{
FILE *fid;
double temp_E;
int index = 0;
fid = fopen("E.txt", "r");
if (fid == NULL)
printf("Error opening the E file\n");
while (!feof(fid))
{
// E's values are double!
if (fscanf(fid,"%lf\n", &temp_E))
{
Nodes[index].e = temp_E;
index++;
}
}
//printf("End of E insertion!");
fclose(fid);
}
/***** Create P and E with equal probability *****/
void Random_P_E()
{
int i;
// Sum of P (it must be =1)
double sum_P_1 = 0;
// Sum of E (it must be =1)
double sum_E_1 = 0;
// Arrays initialization
for (i = 0; i < N; i++)
{
Nodes[i].p_t0 = 0;
Nodes[i].p_t1 = 1;
Nodes[i].p_t1 = (double) Nodes[i].p_t1 / N;
sum_P_1 = sum_P_1 + Nodes[i].p_t1;
Nodes[i].e = 1;
Nodes[i].e = (double) Nodes[i].e / N;
sum_E_1 = sum_E_1 + Nodes[i].e;
}
// Assert sum of probabilities is =1
// Print sum of P (it must be =1)
//printf("Sum of P = %f\n",sum_P_1);
// Exit if sum of P is !=1
assert(sum_P_1 = 1);
//printf("\n");
// Print sum of E (it must be =1)
//printf("Sum of E = %f\n",sum_E_1);
// Exit if sum of Pt0 is !=1
assert(sum_E_1 = 1);
}
/***** Main function *****/
int main(int argc, char** argv)
{
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the maximum occupancy for a full device launch
int gridSize; // The actual grid size needed, based on input size
// Check input arguments
if (argc < 5)
{
printf("Error in arguments! Three arguments required: graph filename, N, threshold and d\n");
return 0;
}
// get arguments
char filename[256];
strcpy(filename, argv[1]);
N = atoi(argv[2]);
threshold = atof(argv[3]);
d = atof(argv[4]);
int i;
// a constant value contributed of all nodes with connectivity = 0
// it's going to be addes to all node's new probability
// Allocate memory for N nodes
Nodes = (Node*) malloc(N * sizeof(Node));
for (i = 0; i < N; i++)
{
Nodes[i].con_size = 0;
//Nodes[i].To_id = (int*) malloc(sizeof(int));
}
Read_from_txt_file(filename);
// set random probabilities
Random_P_E();
Node *h_ingoing;
Node *h_outgoing;
h_ingoing = Nodes;
h_outgoing = (Node *)calloc(N, sizeof *h_outgoing);
Node *d_ingoing;
Node *d_outgoing;
cudaMalloc(&d_ingoing, N * sizeof *d_ingoing);
cudaMalloc(&d_outgoing, N * sizeof *d_outgoing);
cudaMemcpy(d_ingoing, h_ingoing, N * sizeof *h_ingoing, cudaMemcpyHostToDevice);
cudaMemcpy(d_outgoing, h_outgoing, N * sizeof *h_outgoing, cudaMemcpyHostToDevice);
float time;
cudaEvent_t begin, end;
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, pagerank, 0, N);
// Round up according to array size
gridSize = (N + blockSize - 1) / blockSize;
printf("Gridsize, blockzise : %d , %d \n", gridSize, blockSize);
cudaEventCreate(&begin);
cudaEventCreate(&end);
cudaEventRecord(begin, 0);
pagerank<<<gridSize, blockSize>>>(d_ingoing, d_outgoing, N, threshold, d);
cudaEventRecord(end, 0);
cudaEventSynchronize(end);
cudaEventElapsedTime(&time, begin, end);
cudaMemcpy(h_outgoing, d_outgoing, N * sizeof *h_outgoing, cudaMemcpyDeviceToHost);
printf("%f\n", time) ;
printf("\n");
// Print final probabilitities
for (i = 0; i <100; i++)
{
printf("P_t1[%d] = %f\n",i,h_outgoing[i].p_t1);
}
printf("\n");
printf("End of program!\n");
return (EXIT_SUCCESS);
}
When you say they print 0s, I assume you are referring to all entries and not just index 0. Indeed, index 0 is not processed by your code in the first version, since ((idx > 0) && (idx < N)) is false for idx=0.
Going further, your code is missing the definition of the Node type, which is needed to get a better understanding of what could go wrong.
Depending on the size of Node, its contents, and the structure packing you are using in compilation, it might be that the Node size on the host side differs from the Node size on the device. Using printf to verify that would be useful, or using a debugger.
Also, you do not seem to be checking for errors at launch. You definitely want to add a cudaPeekAtLastError and a cudaDeviceSynchronize after your kernel call to make sure no error occurred (any other call into the CUDA runtime API may also return errors that your code does not check).
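A sketch of both checks, using the names from your code (the helper kernel print_node_size is only there to compare struct sizes, and device-side printf needs a cc 2.0+ GPU):
// device-side check of the struct size, to compare with the host-side value
__global__ void print_node_size()
{
    printf("device sizeof(Node) = %llu\n", (unsigned long long)sizeof(Node));
}
// in main(), around the existing launch:
printf("host   sizeof(Node) = %llu\n", (unsigned long long)sizeof(Node));
print_node_size<<<1,1>>>();
cudaDeviceSynchronize();
pagerank<<<gridSize, blockSize>>>(d_ingoing, d_outgoing, N);
cudaError_t launchErr = cudaPeekAtLastError();    // catches launch/configuration errors
cudaError_t syncErr   = cudaDeviceSynchronize();  // catches errors raised while the kernel ran
if (launchErr != cudaSuccess || syncErr != cudaSuccess)
    printf("kernel error: %s / %s\n", cudaGetErrorString(launchErr), cudaGetErrorString(syncErr));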
EDIT
Trying to reproduce, I wrote the following, as close as possible to your code. I don't have a card with sufficient memory, hence the smaller node count.
typedef struct
{
double p_t0;
double p_t1;
double e;
int To_id[460];
int con_size;
} Node ;
__global__ void pagerank(Node* ingoing, Node* outgoing, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x ;
if ((idx > 0) && (idx < N))
outgoing[idx].p_t1 = ingoing[idx].p_t1;
}
#include <cstdlib>
#define cudaCheck(a) { cudaError_t cuerr = a ; if (cuerr != cudaSuccess) { printf("[ERROR # %s : %d ] : (%d) - %s\n", __FILE__, __LINE__, cuerr, cudaGetErrorString(cuerr)) ; ::exit(1) ; } }
int main()
{
// int N = 916428 ; // does not fit on my GPU
int N = 400000 ;
int blockSize;
int minGridSize;
int gridSize;
Node* Nodes = (Node*)malloc(N * sizeof (Node)) ;
for (int i = 0 ; i < N ; ++i)
Nodes[i].p_t1 = (double)i+1;
Node* h_ingoing = Nodes;
Node* h_outgoing = (Node*)calloc(N, sizeof *h_outgoing) ;
Node* d_ingoing ;
Node* d_outgoing ;
cudaCheck (cudaMalloc(&d_ingoing, N * sizeof *d_ingoing));
cudaCheck (cudaMalloc(&d_outgoing, N * sizeof *d_outgoing));
cudaCheck (cudaMemcpy (d_ingoing, h_ingoing, N * sizeof *h_ingoing, cudaMemcpyHostToDevice));
cudaCheck (cudaMemcpy (d_outgoing, h_outgoing, N * sizeof *h_outgoing, cudaMemcpyHostToDevice));
float time;
cudaEvent_t begin, end ;
//blockSize = 256 ;
cudaOccupancyMaxPotentialBlockSize<> (&minGridSize, &blockSize, pagerank, 0, N) ;
gridSize = (N + blockSize -1) / blockSize ;
printf ("Configuration = <<< %d , %d >>>\n", gridSize, blockSize) ;
cudaCheck (cudaEventCreate (&begin)) ;
cudaCheck (cudaEventCreate (&end)) ;
cudaCheck (cudaEventRecord (begin, 0)) ;
pagerank <<< gridSize, blockSize >>> (d_ingoing, d_outgoing, N) ;
cudaCheck (cudaEventRecord (end, 0)) ;
cudaCheck (cudaEventSynchronize (end)) ;
cudaCheck (cudaMemcpy (h_outgoing, d_outgoing, N * sizeof *h_outgoing, cudaMemcpyDeviceToHost)) ;
for (int i = 0 ; i < 100 ; ++i)
{
printf ("P_t1[%d] = %f\n", i, h_outgoing[i].p_t1) ;
}
for (int i = 0 ; i < N ; ++i)
{
if (h_outgoing[i].p_t1 != (double)(i+1))
printf ("Error # %d : %lf <> %lf\n", i, h_outgoing[i].p_t1, (double)(i+1));
}
return 0 ;
}
Except at index 0, for which the first part of this answer noted there is an issue, every output is correct.
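If index 0 should be copied as well, the guard just needs its lower bound removed:
if (idx < N)
    outgoing[idx].p_t1 = ingoing[idx].p_t1;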

Error in simple reduce on CUDA

I'm a newbie in CUDA C... I want to sum the elements of an array (with a reduction) using 1 block, 267 threads, and shared memory. I read the book "CUDA by Example: An Introduction to General-Purpose GPU Programming". Following some recommendations from it, I wrote my version of the program:
__global__ void
conva(int* a, int* out)
{
__shared__ int cache[534];
int cacheIndex = threadIdx.x;
for(int n=0; n<2;++n) {
cache[cacheIndex+n] = a[cacheIndex+n];
int i = blockDim.x/2;
while (i != 0) {
if (cacheIndex < i)
cache[cacheIndex + n] += cache[cacheIndex + n + i];
__syncthreads();
i /= 2;
}
}
//need or not this __syncthreads(), I don't know
__syncthreads();
if (cacheIndex == 0)
out = &cache[0];
}
int main(int argc, char** argv)
{
//enter array for sum
int convolution[534];
for(int i=0; i<534; ++i)
convolution[i] = 1;
//variable in which we take a sum from device
int summa = 0;
//it we copy on device from host
int* tash;
int* convolution_gpu;
cudaMalloc((void**)(&convolution_gpu), 534*sizeof(int));
cudaMalloc((void**)(&tash), sizeof(int));
cudaMemcpy(convolution_gpu, convolution, 534*sizeof(int), cudaMemcpyHostToDevice );
//call core with 1 block and 267 threads
conva<<<1, 267>>>(convolution_gpu, tash);
cudaMemcpy(&summa, tash, sizeof(int), cudaMemcpyDeviceToHost);
//and here I want 534 but I have garbage(may be)
std::cout<<summa<<std::endl;
cudaFree(convolution_gpu);
cudaFree(tash);
getchar();
}
Please tell me where the error is and help me to resolve it.
(Sorry for my English.)
In your kernel, this:
if (cacheIndex == 0)
out = &cache[0];
is almost certainly wrong. Surely you want something like:
if (cacheIndex == 0)
*out = cache[0];
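With that change, the end of the kernel would look like this (only the final write changes; the rest of the kernel is as you posted it):
__syncthreads();
if (cacheIndex == 0)
    *out = cache[0]; // store the block's result through the pointer, so the
                     // host-side cudaMemcpy(&summa, tash, ...) receives the sum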

maximum supported size for cub library

Does anyone know what the maximum supported size for cub::scan is? I get a core dump for input sizes over 500 million. I wanted to make sure I'm not doing anything wrong...
Here is my code:
#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;
bool g_verbose = false; // Whether to display input/output to console
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
typedef int mytype;
/**
* Solve inclusive-scan problem
*/
static void solve(mytype *h_in, mytype *h_cpu, int n)
{
mytype inclusive = 0;
for (int i = 0; i < n; ++i) {
inclusive += h_in[i];
h_cpu[i] = inclusive;
}
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
for (int i = 0; i < n; i++) {
if (h_cpu[i] != h_o[i]) {
return i + 1;
}
}
return 0;
}
/**
* Main
*/
int main(int argc, char** argv)
{
cudaSetDevice(0);
struct timeval start, end;
int num_items = 1073741824;
const int repetitions = 5;
mytype *h_in, *h_out, *h_cpu;
const int size = num_items * sizeof(mytype);
// Allocate host arrays
h_in = (mytype *)malloc(size);
h_out = (mytype *)malloc(size);
h_cpu = (mytype *)malloc(size);
// Initialize problem and solution
for (int i = 0; i < num_items; i++) {
h_in[i] = i;
h_out[i] = 0;
h_cpu[i] = 0;
}
solve(h_in, h_cpu, num_items);
// Allocate problem device arrays
mytype *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array
mytype *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
gettimeofday(&start, NULL);
for (long i = 0; i < repetitions; i++)
DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
cudaThreadSynchronize();
gettimeofday(&end, NULL);
double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;
cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
int cmp = compare(h_cpu, h_out, num_items);
printf("%d\t", num_items);
if (!cmp)
printf("\t%7.4fs \n", ctime);
printf("\n");
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (h_cpu) delete[] h_cpu;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}
The problem is here:
const int size = num_items * sizeof(mytype);
And it can be fixed by changing it to:
const size_t size = num_items * sizeof(mytype);
The value of num_items in the code is over 1 billion. When we multiply that by sizeof(mytype) we are multiplying it by 4, so the result is over 4 billion. That value cannot be stored in an int variable. If you try to use it that way anyway, your subsequent host code will do bad things. This problem (the core dump) actually has nothing to do with CUDA; the code would core dump even if you removed all the CUB elements.
When I modify the line of code above, and compile for the correct GPU (e.g. -arch=sm_35 in my case, or -arch=sm_52 for a Titan X GPU), then I get the correct answer (and no seg fault/core dump).
In general, the correct starting point when chasing a seg fault/core dump type error is to recognize that the error arises from host code, and to localize the exact line of source code that generates it. This can be done trivially (if tediously) by putting many printf statements in your code until you identify the line after which you no longer see any printf output, or by using a host code debugger such as gdb on Linux.
Also note that this code as written requires slightly more than 12 GB of memory on the host and slightly more than 8 GB of memory on the GPU, so it will only run properly on machines with at least that much.
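To see the narrowing problem in isolation, here is a tiny sketch with the same numbers (on a typical 64-bit machine the truncated value comes out as 0, so the subsequent malloc calls don't allocate what the code expects):
#include <stdio.h>
int main(){
    int num_items = 1073741824;                      // 2^30, as in the question
    size_t full   = (size_t)num_items * sizeof(int); // 4294967296 bytes, fits in size_t
    int narrow    = (int)full;                       // too large for a 32-bit int
    printf("size_t: %zu   int: %d\n", full, narrow);
    return 0;
}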
For reference, here is the fixed code (based on what OP posted here):
#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;
bool g_verbose = false; // Whether to display input/output to console
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
typedef int mytype;
/**
* Solve inclusive-scan problem
*/
static void solve(mytype *h_in, mytype *h_cpu, int n)
{
mytype inclusive = 0;
for (int i = 0; i < n; ++i) {
inclusive += h_in[i];
h_cpu[i] = inclusive;
}
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
for (int i = 0; i < n; i++) {
if (h_cpu[i] != h_o[i]) {
return i + 1;
}
}
return 0;
}
/**
* Main
*/
int main(int argc, char** argv)
{
cudaSetDevice(0);
struct timeval start, end;
int num_items = 1073741824;
const int repetitions = 5;
mytype *h_in, *h_out, *h_cpu;
const size_t size = num_items * sizeof(mytype);
// Allocate host arrays
h_in = (mytype *)malloc(size);
h_out = (mytype *)malloc(size);
h_cpu = (mytype *)malloc(size);
// Initialize problem and solution
for (int i = 0; i < num_items; i++) {
h_in[i] = i;
h_out[i] = 0;
h_cpu[i] = 0;
}
solve(h_in, h_cpu, num_items);
// Allocate problem device arrays
mytype *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));
// Initialize device input
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));
// Allocate device output array
mytype *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));
// Allocate temporary storage
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Run
gettimeofday(&start, NULL);
for (long i = 0; i < repetitions; i++)
DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
cudaThreadSynchronize();
gettimeofday(&end, NULL);
double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;
cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
int cmp = compare(h_cpu, h_out, num_items);
printf("%d\t", num_items);
if (!cmp)
printf("\t%7.4fs \n", ctime);
printf("\n");
if (h_in) free(h_in); // allocated with malloc, so release with free rather than delete[]
if (h_out) free(h_out);
if (h_cpu) free(h_cpu);
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
printf("\n\n");
return 0;
}

Cuda call won't allocate more than 8 threads per block, regardless of specification

I am creating a parallel version of the Sieve of Eratosthenes in C++. The problem is that my kernel call (reduce0) seems to only ever assign 8 threads per block instead of the 256 I specify. Since even the first CUDA architectures allow 512 threads per block, there must be some error in my code. Any help would be appreciated.
#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil.h>
//#include <sieve_kernel.cu>
using namespace std;
////////////////////////////////////////////////////
int psum(int arg[], double n);
int call_kernel(int primes[], int n);
int findsmallest(int arg[], int f, double n);
int sieve(int n);
__global__ void reduce0(int *g_idata, int *g_odata);
////////////////////////////////////////////////////
int main(){
int n = pow((double) 2, 8);
int total = sieve(n);
cout << "# primes" << endl << total << endl;
return 0;
}
///////////////////////////////////////////////////
__global__ void reduce0(int *g_idata, int *g_odata) {
extern __shared__ int sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for (int s = 1; s < blockDim.x; s *= 2) { // step = s x 2
if (tid % (s*2) == 0) { // only threadIDs divisible by the step participate
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////////////////////////////////
int call_kernel(int *primes, int n){
// Allocate and copy device arrays
int *g_idevice;
int *g_odevice;
int size = n * sizeof(int);
cudaMalloc(&g_idevice, size);
cudaMemcpy(g_idevice, primes, size, cudaMemcpyHostToDevice);
cudaMalloc(&g_odevice, size);
// Specify grid/block dimenstions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
// Copy device data back to primes
cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
//for (int i = 0; i < n; i++) {
// cout << i << " " << primes[i] << endl;
//}
int total = primes[0];
cudaFree(g_idevice);
cudaFree(g_odevice);
return total;
}
/////////////////////////////////////////////////////////////////////
int findsmallest(int arg[], int f, double n){
int i = f;
while(arg[i]!= 1 && i < n) {
i++;
}
return i;
}
//////////////////////////////////////////////////////////////////////
int psum(int arg[], double n){
int total = 0;
int i = 2;
while(i < n){
if(arg[i] == 1){
total = total + 1;
}
i++;
}
return total;
}
/////////////////////////////////////////////////////////////////////////
int sieve(int n){
int* primes = NULL;
int mult = 0;
int k = 2;
int i; int total;
//primes = new int[n];
primes = new int[256];
for(i = 0; i < n; i++){
primes[i] = 1;
}
primes[0] = primes[1] = 0;
while (k * k < n){
mult = k * k;
while (mult < n) {
primes[mult] = 0;
mult = mult + k;
}
k = findsmallest(primes,k+1, n);
}
total = call_kernel(primes, n);
//delete [] primes;
//primes = NULL;
return total;
}
Your kernel is using dynamically allocated shared memory, but the kernel launch does not include any allocation for it, so the result is that the kernel aborts because of illegal memory operations on that shared memory buffer. You should find it works if you modify this part of call_kernel as follows:
// Specify grid/block dimenstions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
size_t shmsize = size_t(dimBlock.x * dimBlock.y * dimBlock.z) * sizeof(int);
reduce0<<<dimGrid, dimBlock, shmsize>>>(g_idevice, g_odevice);
If you had included some basic error checking around the function call, perhaps like this:
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
if (cudaPeekAtLastError() != cudaSuccess) {
cout << "kernel launch error: " << cudaGetErrorString(cudaGetLastError()) << endl;
}
// Copy device data back to primes
cudaError_t err = cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
cout << "CUDA error: " << cudaGetErrorString(err) << endl;
}
it would have been immediately obvious that the kernel launch or execution was failing with an error.
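As a side note, if you want to confirm what your device actually supports per block (thread count and shared memory), a small standalone query like this sketch will print it:
#include <cstdio>
#include <cuda_runtime.h>
int main(){
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    // every CUDA-capable device supports at least 512 threads per block;
    // sharedMemPerBlock bounds the dynamic allocation passed at kernel launch
    printf("max threads per block:   %d\n", prop.maxThreadsPerBlock);
    printf("shared memory per block: %zu bytes\n", prop.sharedMemPerBlock);
    return 0;
}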