Maximum value of batchsize allowed for cublasDgetrfBatched() from CUBLAS Library

Is there any maximum batch-size limitation for cublasDgetrfBatched() from the cuBLAS library? I am running a benchmark problem to compare timings between CPU and GPU. For a batch size of 1000 the GPU time is greater than the CPU time, but for a batch size of 100 I get some speedup over the CPU.
The code I used for benchmarking is posted below.
1. main.cpp
/*main.cpp goes below*/
#include <stdio.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "mathlib_blas.h"
int main(){
double**mat;
double**mat_scratch1;
int *ipvt;
double *fVec;
double *fVecSave;
double *fVec_scratch;
double *A;
double *B;
double **devPtrA;
double **devPtrB;
double **devPtrA_dev;
double **devPtrB_dev;
double *d_x;
double *x;
int *d_pivot_array ;
int *d_info_array;
int *h_info_array;
int batchsize;
int neqn;
cublasHandle_t handle;
cublasStatus_t status;
cudaError_t error;
clock_t start, end, start1, end1;
double rcond;
batchsize = 32;
neqn = 172;
mat = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double));
mat_scratch1 = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double));
ipvt = (int*) calloc((size_t) neqn, sizeof(int));
fVec = (double*) calloc((size_t) neqn, sizeof(double));
fVecSave = (double*) calloc((size_t) neqn, sizeof(double));
fVec_scratch = (double*) calloc((size_t) neqn, sizeof(double));
A = (double*)malloc( neqn*neqn*sizeof(A[0]));
B = (double*)malloc( neqn*neqn*sizeof(B[0]));
devPtrA = (double**)malloc(batchsize*sizeof(*devPtrA));
devPtrB = (double**)malloc(batchsize*sizeof(*devPtrB));
for(int b_count =0; b_count<batchsize; b_count++){
cudaMalloc((void **)&devPtrA[b_count], neqn*neqn * sizeof(devPtrA[0][0]));
cudaMalloc((void **)&devPtrB[b_count], batchsize*neqn * sizeof(devPtrB[0][0]));
}
cudaMalloc((void **)&devPtrA_dev, batchsize*sizeof(*devPtrA));
cudaMalloc((void **)&devPtrB_dev, batchsize*sizeof(*devPtrB));
cudaMemcpy(devPtrA_dev, devPtrA, batchsize*sizeof(*devPtrA), cudaMemcpyHostToDevice);
cudaMemcpy(devPtrB_dev, devPtrB, batchsize*sizeof(*devPtrB), cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_x, neqn*sizeof(double));
x =(double *)malloc(neqn*sizeof(double));
cudaMalloc((void **)&d_pivot_array, batchsize*neqn*sizeof(int));
cudaMalloc((void **)&d_info_array, batchsize*sizeof(int));
h_info_array =(int*)malloc(batchsize*sizeof(int));
cublasCreate(&handle);
srand(time(NULL));
/* Fill in the CPU and GPU Matrix */
for (int iRow = 0; iRow < neqn; iRow++) {
double sumCol = 0;
for (int iColumn = 0; iColumn < neqn; iColumn++) {
for(int b_count =0; b_count<batchsize; b_count++){
A[neqn*iColumn + iRow] = rand()%10 ;
mat[iRow][iColumn] = A[neqn*iColumn + iRow];
}
sumCol +=A[neqn*iColumn + iRow];
}
fVec[iRow] = sumCol;
fVecSave[iRow] = sumCol;
}
/*CPU_CODE GOES HERE */
start = clock();
for(int b_count =0; b_count<batchsize; b_count++){
for (int iRow = 0; iRow < neqn; iRow++) {
for (int iColumn = 0; iColumn < neqn; iColumn++) {
mat_scratch1[iColumn][iRow]= mat[iColumn][iRow];
}
}
dgeco_blas(mat_scratch1, neqn, ipvt, &rcond, fVecSave);
}
for (int iRow = 0; iRow < neqn; iRow++) {
for (int iColumn = 0; iColumn < neqn; iColumn++) {
mat[iColumn][iRow]= mat_scratch1[iColumn][iRow];
}
}
for(int b_count =0; b_count<batchsize; b_count++){
for(int i = 0; i < neqn; i++) fVec_scratch[i] = fVec[i];
dgesl_blas(mat, neqn, ipvt , fVec_scratch, 0);
}
end = clock();
float seconds = (float)(end - start) / CLOCKS_PER_SEC;
printf("Time in seconds(CPU) : %lf \n", seconds);
/*CPU_CODE ENDS HERE */
start1 = clock();
for(int b_count =0; b_count<batchsize; b_count++){
status = cublasSetMatrix(neqn, neqn, sizeof(A[0]), A, neqn, devPtrA[b_count], neqn);
}
status = cublasDgetrfBatched(handle, neqn, (double**)devPtrA_dev, neqn, d_pivot_array, d_info_array, batchsize);
if (status != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error in dgetrf %i\n",status);
cudaMemcpy(h_info_array, d_info_array, batchsize*sizeof(int), cudaMemcpyDeviceToHost);
for(int b_count =0; b_count<batchsize; b_count++){
cudaMemcpy(devPtrB[b_count], fVec, neqn*sizeof(double),cudaMemcpyHostToDevice); /* for testing purpose only */
}
status = cublasDgetrsBatched(handle, CUBLAS_OP_N, neqn, batchsize, (const double**)devPtrA_dev,
neqn, d_pivot_array,devPtrB_dev, neqn, h_info_array, batchsize);
for(int b_count =0; b_count<batchsize; b_count++){
cudaMemcpy( fVec,devPtrB[b_count], neqn*sizeof(double),cudaMemcpyDeviceToHost); /* for testing purpose only */
}
end1 = clock();
float seconds1 = (float)(end1 - start1) / CLOCKS_PER_SEC;
printf("Time in seconds(GPU) : %lf \n", seconds1);
printf("Speedup(CPU/GPU) : %lf \n", seconds/seconds1);
system("pause");
/* End of the main portion of the code */
free(mat);
free(mat_scratch1);
free(ipvt);
free(fVec);
free(fVecSave);
free(fVec_scratch);
free(A);
free(B);
for(int b_count = 0; b_count < batchsize; b_count++){ /* free every batch entry, not just the first */
cudaFree(devPtrA[b_count]);
cudaFree(devPtrB[b_count]);
}
cudaFree(devPtrA_dev);
cudaFree(devPtrB_dev);
free(devPtrA);
free(devPtrB);
cudaFree(d_x);
free(x);
cudaFree(d_pivot_array);
cudaFree(d_info_array);
free(h_info_array);
cublasDestroy_v2(handle);
}
2. mathlib_blas.h
#include <stdio.h>
#include <math.h>
#define maxm(a,b) (((a) > (b)) ? (a) : (b))
#define minm(a,b) (((a) < (b)) ? (a) : (b))
#define signum(a,b) (((b) < (0)) ? (-(a)) : (a))
void **ArrayAlloc2d( const int size1, const int size2, const size_t sizeType);
void dgefa_blas(double **a,int n, int ipvt[],int *info);
void dgesl_blas(double **a,int n,int ipvt[],double b[],int job);
void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z);
void **ArrayAlloc2d( const int size1, const int size2, const size_t sizeType )
{
void** array = nullptr;
array = (void**)calloc(size1, sizeof(void*));
if (array != nullptr) {
if (size2 > 0) {
void* data = calloc(size1*size2, sizeType);
if (data != nullptr) {
char* addr = (char*)data;
for (int index1 = 0; index1 < size1; index1++) {
array[index1] = (void*)addr;
addr += sizeType*size2; /* char is always 1 byte */
}
} else {
free(array);
free(data);
array = nullptr;
}
}
}
return array;
}
void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z)
{
double anorm,ek,s,sm,t,vecdot,vecsum,wk,wkm,ynorm;
int i,info,j,k,kb,kp1,l;
/* Compute 1-norm of a */
anorm = 0.0;
for (j = 0; j < n; j++) {
vecsum = 0.0;
for (i = 0;i < n; i++)
vecsum += fabs(a[i][j]);
anorm = maxm(anorm,vecsum);
}
/* Factor. */
dgefa_blas(a,n,ipvt,&info);
/* rcond = 1/(norm(a) * (estimate of norm(inverse(a)))).
* estimate = norm(z)/norm(y), where a*z=y and trans(a)*y=e.
* trans(a) is the transpose of a. The components of e are
* chosen to cause maximum local growth in the elements of
* w, where trans(u)*w=e. The vectors are frequently rescaled
* to avoid overflow.
*/
ek = 1.0;
for (j = 0; j < n; j++)
z[j] = 0.0;
for (k = 0; k < n; k++) {
if (z[k] != 0.0)
ek = signum(ek,-z[k]);
if (fabs(ek-z[k]) > fabs(a[k][k])) {
s = fabs(a[k][k])/fabs(ek-z[k]);
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ek *= s;
}
wk = ek - z[k];
wkm = -ek - z[k];
s = fabs(wk);
sm = fabs(wkm);
if (a[k][k] != 0.0) {
wk /= a[k][k];
wkm /= a[k][k];
}
else {
wk = 1.0;
wkm = 1.0;
}
kp1 = k + 1;
if (kp1 < n) {
for (j = kp1; j < n; j++) {
sm += (fabs(z[j] + wkm * a[k][j]));
z[j] += (wk * a[k][j]);
s += fabs(z[j]);
}
if (s < sm) {
t = wkm -wk;
wk = wkm;
for (j = kp1; j < n; j++)
z[j] += (t * a[k][j]);
}
}
z[k] = wk;
}
/* dasum(n,s,z,1) */
vecsum = 0.0;
for (i = 0;i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z) */
for (i = 0; i < n; i++)
z[i] *= s;
/* Solve trans(l)*y= w
*/
for (kb = 0; kb < n; kb++) {
k = n - kb - 1;
if (k < (n-1)) {
/* sdot(n-k,a(k+1,k),1,z(k+1),1) */
vecdot = 0.0;
for (i = k+1;i < n; i++)
vecdot += (a[i][k] * z[i]);
z[k] += vecdot;
}
if (fabs(z[k]) > 1.0) {
s = 1.0/fabs(z[k]);
/* dscal(n,s,z) */
for (i = 0; i < n; i++)
z[i] *= s;
}
l = ipvt[k];
t = z[l];
z[l] = z[k];
z[k] = t;
} /* endfor kb */
/* dasum(n,z,1) */
vecsum = 0.0;
for (i = 0; i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm = 1.0;
/*
* Solve l * v = y
*/
for (k = 0; k < n; k++) {
l = ipvt[k];
t = z[l];
z[l] = z[k];
z[k] = t;
if (k < (n-1)) {
/* daxpy(n-k,t,a[k+1][k],1,z[k+1],1) */
for (i = k+1;i < n; i++)
z[i] += (t * a[i][k]);
}
if (fabs(z[k]) > 1.0) {
s = 1.0/fabs(z[k]);
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
}
}
/* dasum(n,z,1) */
vecsum = 0.0;
for (i = 0; i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
/* Solve u * z = v */
for (kb = 0; kb < n; kb++) {
k = n - kb - 1;
if (fabs(z[k]) > fabs(a[k][k])) {
s = fabs(a[k][k])/fabs(z[k]);
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
}
if (a[k][k] != 0.0)
z[k] /= a[k][k];
if (a[k][k] == 0.0)
z[k] = 1.0;
t = -z[k];
/* daxpy(k-1,t,a[1][k],1,z[1],1) */
for (i = 0; i < k; i++)
z[i] += (t * a[i][k]);
}
/* Make znorm = 1.0 */
/* dasum(n,z,1) */
vecsum = 0.0;
for (i = 0; i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
if (anorm != 0.0) *rcond = ynorm/anorm;
if (anorm == 0.0) *rcond = 0.0;
}
void dgefa_blas(double **a,int n, int ipvt[],int *info)
{
double dmax,t;
int i,j,k,kp1,l,nm1;
*info = 0;
nm1 = n - 1;
if (n > 0) {
for (k = 0; k < nm1; k++) {
kp1 = k + 1;
/* Find l = pivot index. */
dmax = fabs(a[k][k]);
l = k;
for (i = k+1; i < n; i++) {
if (fabs(a[i][k]) <= dmax) continue;
dmax = fabs(a[i][k]); /* track the largest magnitude seen so far */
l = i;
}
ipvt[k] = l;
/* Zero pivot implies this column already triangularized. */
if (a[l][k] == 0.0) {
*info = k;
continue;
}
/* Interchange if necessary. */
if (l != k) {
t = a[l][k];
a[l][k] = a[k][k];
a[k][k] = t;
}
/* Compute multipliers. */
if (a[k][k] == 0.0) printf("\n!ERROR. Singular matrix.\n");
t = -1.0/a[k][k];
for (i = k+1; i < n; i++)
a[i][k] *= t;
/* Row elimination with column indexing. */
for (j = kp1; j < n; j++) {
t = a[l][j];
if (l != k) {
a[l][j] = a[k][j];
a[k][j] = t;
}
for (i = k+1; i < n; i++ )
a[i][j] += (t * a[i][k]);
}
}
}
ipvt[n-1] = n-1;
if (a[n-1][n-1] == 0.0) *info = n-1;
}
void dgesl_blas(double **a,int n,int ipvt[],double b[],int job)
{
double t;
int i,k,kb,l,nm1;
nm1 = n - 1;
if (job == 0) {
/* job = 0, solve a * x = b.
* First solve l * y = b.
*/
if (n > 0) {
for (k = 0; k < nm1; k++) {
l = ipvt[k];
t = b[l];
if (l != k) {
b[l] = b[k];
b[k] = t;
}
/* saxpy(n-k,t,a(k+1,k),1,b(k+1),1); */
for (i=k+1;i < n;i++)
b[i] += (t * a[i][k]);
}
}
/* Now solve u * x = y. */
for (kb = 0; kb < n; kb++) {
k = n - kb-1;
b[k] /= a[k][k];
t = -b[k];
/* saxpy(k-1,t,a(1,k),1,b(1),1); */
for (i = 0; i < k ; i++)
b[i] += (t * a[i][k]);
}
return;
}
/* job != 0, solve trans(a) * x = b.
* First solve trans(u) * x = y.
*/
for (k = 0; k < n; k++) {
/* t = ddot(k-1,a(1,k),1,b(1),1); */
t = 0;
for (i = 0; i < k; i++)
t += (a[i][k] * b[i]);
b[k] = (b[k] - t)/a[k][k];
}
/* Now solve trans(l) * x = y. */
if (n > 0) {
for (kb = 0; kb < nm1; kb++) {
k = n - 2 - kb;
/* b[k] = b[k] + ddot(n-k,a(k+1,k),1,b(k+1),1); */
t = 0;
for (i = k+1;i < n; i++)
t += (a[i][k] * b[i]);
b[k] += t;
l = ipvt[k];
if (l != k) {
t = b[l];
b[l] = b[k];
b[k] = t;
}
}
}
}

There should not be any behavioral difference between a batch size of 100 and a batch size of 1000. (There would certainly be a performance difference: the batch size of 1000 should take longer.)
There are no published limits on the batch size other than implicit memory limits. In fact, unless the GPU is returning incorrect results, there is no reason to think you have run into any hard limit.
(If you wanted to explore some behavioral or performance issue, this question is not written in a way that addresses that.)
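As a rough illustration of the implicit memory limit, one can probe how many batch entries of the size used in the question actually fit on the device simply by watching for cudaMalloc failures. This is a minimal sketch, assuming the 172x172 matrix order from the code above; the upper bound of 100000 is an arbitrary choice for the probe:
#include <stdio.h>
#include <cuda_runtime.h>
int main(void)
{
    const int neqn = 172; /* matrix order, as in the question */
    const size_t bytesPerMatrix = (size_t)neqn * neqn * sizeof(double);
    static double *ptrs[100000]; /* arbitrary upper bound for the probe */
    int count = 0;
    /* allocate one batch entry at a time until the device runs out of memory */
    while (count < 100000 && cudaMalloc((void **)&ptrs[count], bytesPerMatrix) == cudaSuccess)
        count++;
    printf("Allocated %d matrices before running out of device memory\n", count);
    for (int i = 0; i < count; i++)
        cudaFree(ptrs[i]);
    return 0;
}
A batch size somewhat below that count (leaving room for the pivot and info arrays) is consistent with the point above that the only practical limit is device memory.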

Related

Why is my output printing in wrong order in selection sort?

Here's my code:
#include <stdio.h>
int main(void)
{
int a[6] = {6,1,3,4,5,2};
int size = 6;
for(int i = 0; i < size - 1; i++)
{
int smallest = i;
for(int j = i + 1; j < size; j++)
{
if(a[j] < a[smallest])
{
smallest = j;
}
if(smallest != i)
{
int z = a[smallest];
a[smallest] = a[j];
a[j] = z;
}
else
{
a[i] = a[smallest];
}
}
}
for(int i = 0; i < size; i++)
{
printf("%d, ", a[i]);
}
printf("\n");
return 0;
}
So I have 3 problems.
1) The output prints in descending order. I want it to print 1, 2, 3, 4, 5, 6 but the actual output is 6, 5, 4, 3, 2, 1. Why?
2) When I changed the printf statement to printf("%d, ", a[size - i]); it gave the output 32767, 1, 2, 3, 4. Why?
3) When I changed the condition in the last for statement above the printf statement to for(int i = 0; i < size; i++) it gave the output 0, 1, 2, 3, 4, 5. Why?
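For reference, the core bug is that the swap is performed inside the inner scanning loop; in a selection sort the swap belongs after the scan for the minimum has finished. A minimal corrected version for comparison:
#include <stdio.h>
int main(void)
{
    int a[6] = {6, 1, 3, 4, 5, 2};
    int size = 6;
    for (int i = 0; i < size - 1; i++) {
        int smallest = i;
        /* scan the unsorted tail for the index of the minimum */
        for (int j = i + 1; j < size; j++)
            if (a[j] < a[smallest])
                smallest = j;
        /* swap once, after the scan is complete */
        if (smallest != i) {
            int z = a[smallest];
            a[smallest] = a[i];
            a[i] = z;
        }
    }
    for (int i = 0; i < size; i++)
        printf("%d, ", a[i]);
    printf("\n");
    return 0;
}
This prints 1, 2, 3, 4, 5, 6.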

cuda batched cholesky factorization

I more or less understand how to deal with 2D CUDA, but batched Cholesky has a 4D structure towards the end of the algorithm. I have attached the Cholesky code and my CUDA code; I would appreciate a hint.
int i, k, m, n;
// Batched Cholesky factorization.
for (i = 0; i < batch; i++) {
float *pA = &dA[i*N*N];
// Single Cholesky factorization.
for (k = 0; k < N; k++) {
// Panel factorization.
pA[k*N+k] = sqrtf(pA[k*N+k]);
for (m = k+1; m < N; m++)
pA[k*N+m] /= pA[k*N+k];
// Update of the trailing submatrix.
for (n = k+1; n < N; n++)
for (m = n; m < N; m++)
pA[n*N+m] -= (pA[k*N+n]*pA[k*N+m]);
}
}
CUDA:
int i = blockIdx.x * blockDim.x + threadIdx.x;
int k = blockIdx.y * blockDim.y + threadIdx.y;
int m = blockIdx.z * blockDim.z + threadIdx.z;
int n = blockIdx.z * blockDim.z + threadIdx.z;
if( k >= N || m >= N || n >= N || i >= batch ) return;
float *pA = &dA[i*N*N];
pA[k*N+k] = sqrtf(pA[k*N+k]);
pA[k*N+m] /= pA[k*N+k];
pA[n*N+m] -= (pA[k*N+n]*pA[k*N+m]);
starter:
dim3 dimBlock( (batch+31)/32, (n+31)/32, (n+31)/32 );
dim3 dimGrid( 32, 32, 32);
spotrf_batched_kernel<<< dimBlock, dimGrid, 0, stream>>>(n, batch, dA);
I am going to leave this here without much comment; the code is relatively self-explanatory. This implementation is completely faithful to your serial version, with the following features:
- Each block performs exactly one factorization in the batch. Run as many blocks as there are batched matrices to factorize.
- Because the factorization is done entirely at block scope, synchronization between parallel operations is possible, so the order of operations of the factorization is respected.
- The only parallelism the algorithm exposes is within the row operations of the factorization and update operations.
- Blocks should be sized according to the number of rows in the batched matrix size, in round multiples of the warp size (32 on all CUDA-capable devices to date).
The code below has been extremely lightly tested and is not guaranteed to work or be correct. Use at your own peril:
#include <iostream>
#include <algorithm>
#include <cmath> // for std::fabs used in the error check below
__global__
void batchkernel(float** batches, int nbatches, int N, int LDA)
{
if (blockIdx.x < nbatches) {
float* pA = batches[blockIdx.x];
for (int k = 0; k < N; k++) {
// Panel factorization.
if (threadIdx.x == 0) {
pA[k*LDA+k] = sqrtf(pA[k*LDA+k]);
}
__syncthreads();
for (int m = threadIdx.x; ((m < N) && (threadIdx.x > k)); m+=blockDim.x) {
pA[k*LDA+m] /= pA[k*LDA+k];
}
__syncthreads();
// Update of the trailing submatrix.
for (int n = k+1; (n < N); n++) {
for (int m = threadIdx.x; ((m < N) && (threadIdx.x >= n)); m+=blockDim.x) {
pA[n*LDA+m] -= pA[k*LDA+n] * pA[k*LDA+m];
}
}
__syncthreads();
}
}
}
void refCholeskey(float* pA, int N)
{
int k, m, n;
// Single Cholesky factorization.
for (k = 0; k < N; k++) {
// Panel factorization.
pA[k*N+k] = sqrtf(pA[k*N+k]);
for (m = k+1; m < N; m++)
pA[k*N+m] /= pA[k*N+k];
// Update of the trailing submatrix.
for (n = k+1; n < N; n++)
for (m = n; m < N; m++)
pA[n*N+m] -= (pA[k*N+n]*pA[k*N+m]);
}
}
int main()
{
// B = np.random.random((10,10))
// SPDmatrix = (0.5*(B+B.T)) + B.shape[0]*np.eye(B.shape[0])
const int N = 10;
const int LDA = 10;
float SPDmatrix[LDA*N] = {
10.22856331, 0.17380577, 0.61779525, 0.66592082, 0.46915566,
0.09946502, 0.69386511, 0.35224291, 0.53155506, 0.51441469,
0.17380577, 10.67971161, 0.34481401, 0.64766522, 0.22372943,
0.55896022, 0.59083588, 0.48872497, 0.54049871, 0.74764959,
0.61779525, 0.34481401, 10.229388, 0.40904432, 0.5015491,
0.52152334, 0.19684814, 0.28262256, 0.04384535, 0.61919751,
0.66592082, 0.64766522, 0.40904432, 10.78410647, 0.12708693,
0.3241063, 0.6984497, 0.65074097, 0.08027563, 0.56332844,
0.46915566, 0.22372943, 0.5015491, 0.12708693, 10.52234091,
0.76346103, 0.80932473, 0.8234331, 0.52737611, 0.65777357,
0.09946502, 0.55896022, 0.52152334, 0.3241063, 0.76346103,
10.54906761, 0.32865411, 0.32467483, 0.80720007, 0.36287463,
0.69386511, 0.59083588, 0.19684814, 0.6984497, 0.80932473,
0.32865411, 10.29729551, 0.34707933, 0.69379356, 0.87612982,
0.35224291, 0.48872497, 0.28262256, 0.65074097, 0.8234331,
0.32467483, 0.34707933, 10.42929929, 0.78849458, 0.159371,
0.53155506, 0.54049871, 0.04384535, 0.08027563, 0.52737611,
0.80720007, 0.69379356, 0.78849458, 10.49604818, 0.43871288,
0.51441469, 0.74764959, 0.61919751, 0.56332844, 0.65777357,
0.36287463, 0.87612982, 0.159371, 0.43871288, 10.94535485 };
const int nbatches = 8;
float** batches;
cudaMallocManaged((void **)&batches, nbatches * sizeof(float*));
for(int i=0; i<nbatches; i++) {
cudaMallocManaged((void **)&batches[i], N * LDA * sizeof(float));
cudaMemcpy(batches[i], SPDmatrix, N * LDA * sizeof(float), cudaMemcpyDefault);
}
int blocksz = 32;
int nblocks = nbatches;
batchkernel<<<nblocks, blocksz>>>(batches, nbatches, N, LDA);
refCholeskey(SPDmatrix, N);
cudaDeviceSynchronize();
float maxabsrelerror = 0.0f;
for(int i = 0; i < N*N; i++) {
float absrelerror = std::fabs(SPDmatrix[i] - batches[0][i]) / std::fabs(SPDmatrix[i]);
maxabsrelerror = std::max(absrelerror, maxabsrelerror);
}
std::cout << "Maximum absolute relative error = " << maxabsrelerror << std::endl;
cudaDeviceReset();
return 0;
}

My CUDA kernel code is not working

I am trying to write a small program that generates numbers and returns the result in an array, but when I run this code it does not work. I tried to use the Nsight debugger to understand where my problem is, but it freezes and closes immediately.
Could you please help me understand where the problem in this code is?
__global__ void mykernel( int* PF_tmp, int* PL_tmp, int* QF_tmp, int* QL_tmp,
int m[2], int p[5], int q[5], int i, int* n,
int out[10][5], int N)
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
int idx = blockIdx.x;
int idy = blockIdx.y;
int w = idx/100;
int x = idx%100;
int y = idy;
int z = threadIdx.x;
int len = ((i * 2) + 5);
// Fill PF_tmp & QF_tmp
if( i > 0){
for(int k = 0; k < (i * 2); k++)
{
p[k] = PF_tmp[k];
q[k] = QF_tmp[k];
}
}
// Fill X
if( x > 10)
{
p[(i*2)] = (x - (x % 10)) / 10;
p[(i*2)+1] = x % 10;
}else{
p[(i*2)] = 0;
p[(i*2)+1] = x;
}
// Fill Y
if( y > 10)
{
q[(i*2)] = (y - (y % 10)) / 10;
q[(i*2)+1] = y % 10;
}else{
q[(i*2)] = 0;
q[(i*2)+1] = y;
}
// Fill m
p[(i * 2)+2] = m[0];
q[(i * 2)+2] = m[1];
// Fill W
if( w > 10)
{
p[(i*2)+3] = (w - (w % 10)) / 10;
p[(i*2)+4] = w % 10;
}else{
p[(i*2)+3] = 0;
p[(i*2)+4] = w;
}
// Fill Z
if( z > 10)
{
q[(i*2)+3] = (z - (z % 10)) / 10;
q[(i*2)+4] = z % 10;
}else{
q[(i*2)+3] = 0;
q[(i*2)+4] = z;
}
// Fill PL_tmp & QL_tmp
if( i > 0)
{
for(int k = 0; k < (i * 2); k++)
{
p[(len-(i * 2))+k] = PL_tmp[k];
q[(len-(i * 2))+k] = QL_tmp[k];
}
}
if(id<10)
{
for(int k =0; k<5; k++)
out[id][k] = p[k];
}
}
int main()
{
cudaError err;
dim3 blocks(10000, 100);
dim3 threads(100);
int m[2] = {4,5};
int hst_out[10][5];
int p[5];
int q[5];
err = cudaMalloc((void **)&p, 5);
err = cudaMalloc((void **)&q, 5);
err = cudaMalloc((void **)&hst_out, 50);
mykernel<<<blocks, threads>>>(NULL, NULL, NULL, NULL, m, p, q, 0, NULL, hst_out, 100000000);
return 0;
}
The error is obvious; it is basic C programming. When you declare:
int m[2] = {4,5};
int hst_out[10][5];
int p[5];
int q[5];
then hst_out, p and q are not pointers, but later they are used as pointers:
err = cudaMalloc((void **)&p, 5);
err = cudaMalloc((void **)&q, 5);
err = cudaMalloc((void **)&hst_out, 50);
so you should have declared them as pointers in the first place, e.g.
int *p;
and used them this way:
err = cudaMalloc((void **)&p, 5*sizeof(int));
Notice too that the size you requested is just 5 bytes, whereas it should be 5*sizeof(int).
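Putting the fix together, a minimal self-contained sketch of the corrected pattern might look like the following; the kernel body here is just a stand-in that fills the output, not the original kernel:
#include <stdio.h>
#include <cuda_runtime.h>
/* stand-in kernel: fills a 10x5 output array */
__global__ void fillkernel(int out[][5])
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id < 10)
        for (int k = 0; k < 5; k++)
            out[id][k] = id * 5 + k;
}
int main(void)
{
    int hst_out[10][5]; /* host copy of the result */
    int (*d_out)[5];    /* device pointer, allocated with cudaMalloc */
    cudaMalloc((void **)&d_out, 10 * 5 * sizeof(int));
    fillkernel<<<1, 32>>>(d_out);
    /* copy the result back to the host before using it */
    cudaMemcpy(hst_out, d_out, 10 * 5 * sizeof(int), cudaMemcpyDeviceToHost);
    printf("hst_out[9][4] = %d\n", hst_out[9][4]);
    cudaFree(d_out);
    return 0;
}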
For more examples, see:
http://cuda-programming.blogspot.sg/2013/03/how-to-avoid-uses-of-cudamalloc-in.html

Using cudaMemcpy3D to transfer *** pointer

I am trying to use cudaMemcpy3D to transfer a dynamically allocated 3D matrix (tensor). The tensor is allocated as a contiguous block of memory (see the code below). I tried various combinations of cudaExtent and cudaMemcpy3DParms, but the order of the elements gets mixed up. I created the following example to demonstrate the issue:
#include <stdio.h>
int ***alloc_tensor(int Nx, int Ny, int Nz) {
int i, j;
int ***tensor;
tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));
tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));
tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int)));
for(j = 1; j < Ny; j++)
tensor[0][j] = tensor[0][j-1] + Nz;
for(i = 1; i < Nx; i++) {
tensor[i] = tensor[i - 1] + Ny;
tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
for(j = 1; j < Ny; j++)
tensor[i][j] = tensor[i][j - 1] + Nz;
}
return tensor;
}
__global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
int i, j, k;
char *tensorslice;
int *tensorrow;
for (i = 0; i < Nx; i++) {
for (j = 0; j < Ny; j++) {
for (k = 0; k < Nz; k++) {
tensorslice = ((char *)tensor.ptr) + k * tensor.pitch * Nx;
tensorrow = (int *)(tensorslice + i * tensor.pitch);
printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, tensorrow[j]);
}
}
}
}
int main() {
int i, j, k, value = 0;
int Nx = 2, Ny = 6, Nz = 4;
int ***h_tensor;
struct cudaPitchedPtr d_tensor;
h_tensor = alloc_tensor(Nx, Ny, Nz);
cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));
for(i = 0; i < Nx; i++) {
for(j = 0; j < Ny; j++) {
for(k = 0; k < Nz; k++) {
h_tensor[i][j][k] = value++;
printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[i][j][k]);
}
}
}
cudaMemcpy3DParms cpy = { 0 };
cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);
cpy.dstPtr = d_tensor;
cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
cpy.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(&cpy);
kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);
// ... clean-up
}
The output for the host variable (h_tensor) and the device variable (d_tensor) differs, looking like:
h_tensor[0][0][0] = 0
h_tensor[0][0][1] = 1
h_tensor[0][0][2] = 2
h_tensor[0][0][3] = 3
h_tensor[0][1][0] = 4
h_tensor[0][1][1] = 5
h_tensor[0][1][2] = 6
...
d_tensor[0][0][0] = 0
d_tensor[0][0][1] = 12
d_tensor[0][0][2] = 24
d_tensor[0][0][3] = 36
d_tensor[0][1][0] = 1
d_tensor[0][1][1] = 13
d_tensor[0][1][2] = 25
...
What am I doing wrong? What would be the correct way to use cudaMemcpy3D?
Any time you are having trouble with a CUDA code, it is a good idea to do proper CUDA error checking. The code you have posted here, at least, does not run correctly for me: the cudaMemcpy3D line throws an error. This is due to item 2 below. (I suspect the code you used to generate the output was not identical to the code you have shown here, but that is just a guess.)
Your usage of make_cudaPitchedPtr is not correct:
cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);
Review the API documentation: making a CUDA pitched pointer this way is no different between 2D and 3D, so it makes no sense to pass 3 different dimensions as you are doing. Instead, do this:
cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
The remaining issues I found I attribute to incorrect understanding of 3 dimensions in C. The last subscript on a multiply-subscripted array is the rapidly varying dimension, i.e. it is the one where adjacent values in memory occupy adjacent index values. Your usage of Z in the 3rd dimension is confusing to me due to this. Your host allocation was using Nx in the first subscript place, but your device indexing didn't match. There are obviously multiple ways to handle this. If you don't like my arrangement, you can change it, but the host and device indexing must match.
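As a quick illustration of what "rapidly varying" means, consider a small host-side example: for int a[2][3], stepping the last subscript moves to the adjacent int in memory, while stepping the first subscript jumps a whole row.
#include <stdio.h>
int main(void)
{
    int a[2][3];
    /* adjacent last-subscript elements are one int apart */
    printf("%td\n", &a[0][1] - &a[0][0]); /* prints 1 */
    /* stepping the first subscript skips an entire row of 3 ints */
    printf("%td\n", &a[1][0] - &a[0][0]); /* prints 3 */
    return 0;
}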
Anyway, the following code modifications worked for me:
#include <stdio.h>
int ***alloc_tensor(int Nx, int Ny, int Nz) {
int i, j;
int ***tensor;
tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));
tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));
tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int)));
for(j = 1; j < Ny; j++)
tensor[0][j] = tensor[0][j-1] + Nz;
for(i = 1; i < Nx; i++) {
tensor[i] = tensor[i - 1] + Ny;
tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
for(j = 1; j < Ny; j++)
tensor[i][j] = tensor[i][j - 1] + Nz;
}
return tensor;
}
__global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
int i, j, k;
char *tensorslice;
int *tensorrow;
for (i = 0; i < Nx; i++) {
for (j = 0; j < Ny; j++) {
for (k = 0; k < Nz; k++) {
tensorslice = ((char *)tensor.ptr) + k * tensor.pitch * Ny;
tensorrow = (int *)(tensorslice + j * tensor.pitch);
printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, tensorrow[i]);
}
}
}
}
int main() {
int i, j, k, value = 0;
int Nx = 2, Ny = 6, Nz = 4;
int ***h_tensor;
struct cudaPitchedPtr d_tensor;
h_tensor = alloc_tensor(Nz, Ny, Nx);
cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));
for(i = 0; i < Nx; i++) {
for(j = 0; j < Ny; j++) {
for(k = 0; k < Nz; k++) {
h_tensor[k][j][i] = value++;
//printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[i][j][k]);
}
}
}
for(i = 0; i < Nx; i++) {
for(j = 0; j < Ny; j++) {
for(k = 0; k < Nz; k++) {
//h_tensor[i][j][k] = value++;
printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[k][j][i]);
}
}
}
cudaMemcpy3DParms cpy = { 0 };
cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
cpy.dstPtr = d_tensor;
cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
cpy.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(&cpy);
kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);
cudaDeviceSynchronize();
// ... clean-up
}

CUDA Broken Float Math on Kepler/Fermi Arch

I have a program that does a lot of single-precision math. It produces correct results if I specify the 1.0 architecture but is broken for the 2.x and 3.x architectures. What would cause this?
Included below:
1. A very long code sample.
2. The compile command and the good output.
3. The compile command and the bad output.
If I run the same routine on the CPU using gcc, I get results that match the 1.0 architecture.
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
/*
* svdcomp - SVD decomposition routine.
* Takes an mxn matrix a and decomposes it into udv, where u,v are
* left and right orthogonal transformation matrices, and d is a
* diagonal matrix of singular values.
*
* This routine is adapted from svdecomp.c in XLISP-STAT 2.1 which is
* code from Numerical Recipes adapted by Luke Tierney and David Betz.
* Originally from: "Numerical Recipes in C: The Art of Scientific Computing",
* Press, Flannery, Teukolosky, Vetterling. 1992.
*
* Input to dsvd is as follows:
* a = mxn matrix to be decomposed, gets overwritten with u
* m = row dimension of a
* n = column dimension of a
* w = returns the vector of singular values of a
* v = returns the right orthogonal transformation matrix
*/
#define SIGN(a, b) ((b) >= 0.0f ? fabsf(a) : -fabsf(a))
#define MIN(x,y) ( (x) < (y) ? (x) : (y) )
#define MAX(x,y) ((x)>(y)?(x):(y))
#define PERR(call) \
if (call) {\
fprintf(stderr, "%s:%d Error [%s] on "#call"\n", __FILE__, __LINE__,\
cudaGetErrorString(cudaGetLastError()));\
exit(1);\
}
#define ERRCHECK \
if (cudaPeekAtLastError()) { \
fprintf(stderr, "%s:%d Error [%s]\n", __FILE__, __LINE__,\
cudaGetErrorString(cudaGetLastError()));\
exit(1);\
}
__device__ int
svd(float *a, int m, int n, float *w, float *v, int skip_u)
{
int flag, i, its, j, jj, k, l, nm;
float c, f, h, s, x, y, z;
float anorm = 0.0f, g = 0.0f, scale = 0.0f;
float rv1[3];
/* Householder reduction to bidiagonal form */
for (i = 0; i < n; i++)
{
/* left-hand reduction */
l = i + 1;
rv1[i] = scale * g;
g = s = scale = 0.0f;
if (i < m)
{
for (k = i; k < m; k++)
scale += fabsf(a[k*n+i]);
if (scale)
{
for (k = i; k < m; k++)
{
a[k*n+i] /= scale;
s += powf(a[k*n+i], 2);
}
f = a[i*n+i];
g = -SIGN(sqrtf(s), f);
h = f * g - s;
a[i*n+i] = f - g;
if (i != n - 1)
{
for (j = l; j < n; j++)
{
for (s = 0.0f, k = i; k < m; k++)
s += a[k*n+i] * a[k*n+j];
f = s / h;
for (k = i; k < m; k++)
a[k*n+j] += f * a[k*n+i];
}
}
for (k = i; k < m; k++)
a[k*n+i] *= scale;
}
}
w[i] = scale * g;
/* right-hand reduction */
g = s = scale = 0.0f;
if (i < m && i != n - 1)
{
for (k = l; k < n; k++)
scale += fabsf(a[i*n+k]);
if (scale)
{
for (k = l; k < n; k++)
{
a[i*n+k] /= scale;
s += powf(a[i*n+k], 2);
}
f = a[i*n+l];
g = -SIGN(sqrtf(s), f);
h = f * g - s;
a[i*n+l] = f - g;
for (k = l; k < n; k++)
rv1[k] = a[i*n+k] / h;
if (i != m - 1)
{
for (j = l; j < m; j++)
{
for (s = 0.0f, k = l; k < n; k++)
s += a[j*n+k] * a[i*n+k];
for (k = l; k < n; k++)
a[j*n+k] += s * rv1[k];
}
}
for (k = l; k < n; k++)
a[i*n+k] *= scale;
}
}
anorm = MAX(anorm, fabsf(w[i]) + fabsf(rv1[i]));
}
/* accumulate the right-hand transformation */
for (i = n - 1; i >= 0; i--)
{
if (i < n - 1)
{
if (g)
{
for (j = l; j < n; j++)
v[j*n+i] = (a[i*n+j] / a[i*n+l]) / g;
/* float division to avoid underflow */
for (j = l; j < n; j++)
{
for (s = 0.0f, k = l; k < n; k++)
s += a[i*n+k] * v[k*n+j];
for (k = l; k < n; k++)
v[k*n+j] += s * v[k*n+i];
}
}
for (j = l; j < n; j++)
v[i*n+j] = v[j*n+i] = 0.0f;
}
v[i*n+i] = 1.0f;
g = rv1[i];
l = i;
}
/* accumulate the left-hand transformation */
if (!skip_u) {
for (i = n - 1; i >= 0; i--)
{
l = i + 1;
g = w[i];
if (i < n - 1)
for (j = l; j < n; j++)
a[i*n+j] = 0.0f;
if (g)
{
g = 1.0f / g;
if (i != n - 1)
{
for (j = l; j < n; j++)
{
for (s = 0.0f, k = l; k < m; k++)
s += a[k*n+i] * a[k*n+j];
f = (s / a[i*n+i]) * g;
for (k = i; k < m; k++)
a[k*n+j] += f * a[k*n+i];
}
}
for (j = i; j < m; j++)
a[j*n+i] = a[j*n+i]*g;
}
else
{
for (j = i; j < m; j++)
a[j*n+i] = 0.0f;
}
++a[i*n+i];
}
}
/* diagonalize the bidiagonal form */
for (k = n - 1; k >= 0; k--)
{ /* loop over singular values */
for (its = 0; its < 30; its++)
{ /* loop over allowed iterations */
flag = 1;
for (l = k; l >= 0; l--)
{ /* test for splitting */
nm = l - 1;
if (fabsf(rv1[l]) + anorm == anorm)
{
flag = 0;
break;
}
if (fabsf(w[nm]) + anorm == anorm)
break;
}
if (flag)
{
c = 0.0f;
s = 1.0f;
for (i = l; i <= k; i++)
{
f = s * rv1[i];
if (fabsf(f) + anorm != anorm)
{
g = w[i];
h = hypotf(f, g);
w[i] = h;
h = 1.0f / h;
c = g * h;
s = (- f * h);
if (!skip_u) {
for (j = 0; j < m; j++)
{
y = a[j*n+nm];
z = a[j*n+i];
a[j*n+nm] = y * c + z * s;
a[j*n+i] = z * c - y * s;
}
}
}
}
}
z = w[k];
if (l == k)
{ /* convergence */
if (z < 0.0f)
{ /* make singular value nonnegative */
w[k] = -z;
for (j = 0; j < n; j++)
v[j*n+k] = -v[j*n+k];
}
break;
}
if (its >= 30) {
}
/* shift from bottom 2 x 2 minor */
x = w[l];
nm = k - 1;
y = w[nm];
g = rv1[nm];
h = rv1[k];
f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0f * h * y);
g = hypotf(f, 1.0f);
f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x;
/* next QR transformation */
c = s = 1.0f;
for (j = l; j <= nm; j++)
{
i = j + 1;
g = rv1[i];
y = w[i];
h = s * g;
g = c * g;
z = hypotf(f, h);
rv1[j] = z;
c = f / z;
s = h / z;
f = x * c + g * s;
g = g * c - x * s;
h = y * s;
y = y * c;
for (jj = 0; jj < n; jj++)
{
x = v[jj*n+j];
z = v[jj*n+i];
v[jj*n+j] = x * c + z * s;
v[jj*n+i] = z * c - x * s;
}
z = hypotf(f, h);
w[j] = z;
if (z)
{
z = 1.0f / z;
c = f * z;
s = h * z;
}
f = (c * g) + (s * y);
x = (c * y) - (s * g);
if (!skip_u) {
for (jj = 0; jj < m; jj++)
{
y = a[jj*n+j];
z = a[jj*n+i];
a[jj*n+j] = y * c + z * s;
a[jj*n+i] = z * c - y * s;
}
}
}
rv1[l] = 0.0f;
rv1[k] = f;
w[k] = x;
}
}
return(0);
}
__global__ void
svd_kernel(float *v)
{
float a[9], w[3];
a[0] = 8.0f;
a[1] = 3.0f;
a[2] = 7.0f;
a[3] = 7.0f;
a[4] = 9.0f;
a[5] = 1.0f;
a[6] = 3.0f;
a[7] = 7.0f;
a[8] = 2.0f;
svd(a, 3, 3, w, v, 1);
}
int main()
{
int i, j;
float *v_d, v[9];
PERR(cudaMalloc(&v_d, 9*sizeof(float)));
svd_kernel<<<1,1>>>(v_d);
cudaDeviceSynchronize();
ERRCHECK;
PERR(cudaMemcpy(v, v_d, 9*sizeof(float), cudaMemcpyDeviceToHost));
for (i = 0; i < 3; i++) {
for (j = 0; j < 3; j++) {
printf("%6.3f\t", v[i*3+j]);
}
printf("\n");
}
return 0;
}
Correct Results:
$ nvcc -arch=sm_10 -o svd svd.cu
$ ./svd
-0.657 -0.685 0.314
-0.668 0.337 -0.664
-0.349 0.646 0.679
Broken Results:
$ nvcc -arch=sm_20 -o svd svd.cu
$ ./svd
-0.661 -0.660 0.356
-0.642 0.253 -0.724
0.019 0.460 0.888
It seems that CUDA 6 fixes the issue.
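A plausible, though unverified here, contributor to results that differ between sm_1x and sm_2x/sm_3x is the handling of combined multiply-adds: nvcc contracts them by default, and the contracted operation rounds differently on the different architectures. One way to probe that hypothesis is to disable the contraction when compiling for the newer architecture:
$ nvcc -arch=sm_20 -fmad=false -o svd svd.cu
If the sm_20 results then match the CPU and sm_10 results, the discrepancy is floating-point rounding, not broken math.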