Why is my output printing in wrong order in selection sort? - selection-sort

Here's my code:
#include <stdio.h>
int main(void)
{
int a[6] = {6,1,3,4,5,2};
int size = 6;
for(int i = 0; i < size - 1; i++)
{
int smallest = i;
for(int j = i + 1; j < size; j++)
{
if(a[j] < a[smallest])
{
smallest = j;
}
if(smallest != i)
{
int z = a[smallest];
a[smallest] = a[j];
a[j] = z;
}
else
{
a[i] = a[smallest];
}
}
}
for(int i = 0; i < size; i++)
{
printf("%d, ", a[i]);
}
printf("\n");
return 0;
}
So I have 3 problems.
Output printing in descending order. I want to print it as 1,2,3,4,5,6 but the actual output is 6,5,4,3,2,1. Why?
2)When I changed the printf statement as printf("%d, ", a[size - i]); it gave output as 32767, 1,2,3,4. Why?
When I changed the "for" condition in the last "for statement" above "printf" satement as for(int i = 0; i < size; i++) it gave output as 0,1,2,3,4,5, . Why?

Related

Recursion of a function fails

I am working on a sudoku solver using backtracking. For some unknown by me reasons my code blocks can't use recursion. I mean that a function, even if the program reach the code line where I wrote the recursion, won't call itself. The program just continue as if nothing was there.
#include <bits/stdc++.h>
using namespace std;
ifstream in("data.in");
ofstream out("data.out");
int sudoku[10][10];
int f[10];
vector< pair<int, int> > v;
bool continuare(int pas){
int x = v[pas].first;
int y = v[pas].second;
for(int i = x; i <= 9; i++)
f[ sudoku[i][y] ]++;
for(int i = x - 1; i >= 1; i--)
f[ sudoku[i][y] ]++;
for(int j = x + 1; j <= 9; j++)
f[ sudoku[x][j] ]++;
for(int j = x - 1; j >= 1; j--)
f[ sudoku[x][j] ]++;
for( int i = x - 3 + x%3, c1 = 0; c1 < 3; c1++, i++ )
for( int j = y - 3 + y%3, c2 = 0; c2 < 3; c2++, j++ )
f[ sudoku[i][j] ]++;
for(int i = 1; i <= 9; i++){
if( f[i] > 3 )
return false;
f[i] = 0;
}
return true;
}
void afisare(){
for(int i = 1; i <= 9; i++){
for(int j = 1; j <= 9; j++)
out<<sudoku[i][j]<<" ";
out<<"\n";
}
}
void backtracking( int pas ){
if( pas > v.size() )
afisare();
else
for(int i = 1; i <= 9; i++){
sudoku[ v[pas].first ][ v[pas].second ] = i;
if( continuare(pas) )
backtracking( pas + 1 );
}
}
int main()
{
for(int i = 1; i <= 9; i++)
for(int j = 1; j <= 9; j++){
in>>sudoku[i][j];
if(sudoku[i][j] == 0)
v.push_back( make_pair(i, j) );
}
backtracking(1);
return 0;
}
As you may have noticed, the problem is when backtracking() calls itself and as I said nothing happens there.
Copied from comment which seemed to have solved your question:
compile with the -g flag and run your executable against gdb, I just did that and saw that it seg faults at f[ sudoku[i][j] ]++; in continuare function.

Maximum value of batchsize allowed for cublasDgetrfBatched() from CUBLAS Library

Is there any maximum batchsize limitation for cublasDgetrfBatched() from CUBLAS library? I am doing a benchmark problem for comparing timings between CPU and GPU. For a batchsize of 1000 i am getting GPU timing greater than CPU timing. But, for a batchsize of 100, i am getting some speedup over CPU.
I have posted below the code that i used for benchmarking.
1. main.cpp
/*main.cpp goes below*/
#include<stdio.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "mathlib_blas.h"
int main(){
double**mat;
double**mat_scratch1;
int *ipvt;
double *fVec;
double *fVecSave;
double *fVec_scratch;
double *A;
double *B;
double **devPtrA;
double **devPtrB;
double **devPtrA_dev;
double **devPtrB_dev;
double *d_x;
double *x;
int *d_pivot_array ;
int *d_info_array;
int *h_info_array;
int batchsize;
int neqn;
cublasHandle_t handle;
cublasStatus_t status;
cudaError_t error;
clock_t start, end, start1, end1;
double rcond;
batchsize = 32;
neqn = 172;
mat = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double));
mat_scratch1 = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double));
ipvt = (int*) calloc((size_t) neqn, sizeof(int));
fVec = (double*) calloc((size_t) neqn, sizeof(double));
fVecSave = (double*) calloc((size_t) neqn, sizeof(double));
fVec_scratch = (double*) calloc((size_t) neqn, sizeof(double));
A = (double*)malloc( neqn*neqn*sizeof(A[0]));
B = (double*)malloc( neqn*neqn*sizeof(B[0]));
devPtrA = (double**)malloc(batchsize*sizeof(*devPtrA));
devPtrB = (double**)malloc(batchsize*sizeof(*devPtrB));
for(int b_count =0; b_count<batchsize; b_count++){
cudaMalloc((void **)&devPtrA[b_count], neqn*neqn * sizeof(devPtrA[0][0]));
cudaMalloc((void **)&devPtrB[b_count], batchsize*neqn * sizeof(devPtrB[0][0]));
}
cudaMalloc((void **)&devPtrA_dev, batchsize*sizeof(*devPtrA));
cudaMalloc((void **)&devPtrB_dev, batchsize*sizeof(*devPtrB));
cudaMemcpy(devPtrA_dev, devPtrA, batchsize*sizeof(*devPtrA), cudaMemcpyHostToDevice);
cudaMemcpy(devPtrB_dev, devPtrB, batchsize*sizeof(*devPtrB), cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_x, neqn*sizeof(double));
x =(double *)malloc(neqn*sizeof(double));
cudaMalloc((void **)&d_pivot_array, batchsize*neqn*sizeof(int));
cudaMalloc((void **)&d_info_array, batchsize*sizeof(int));
h_info_array =(int*)malloc(batchsize*sizeof(int));
cublasCreate(&handle);
srand(time(NULL));
/* Fill in the CPU and GPU Matrix */
for (int iRow = 0; iRow < neqn; iRow++) {
double sumCol = 0;
for (int iColumn = 0; iColumn < neqn; iColumn++) {
for(int b_count =0; b_count<batchsize; b_count++){
A[neqn*iColumn + iRow] = rand()%10 ;
mat[iRow][iColumn] = A[neqn*iColumn + iRow];
}
sumCol +=A[neqn*iColumn + iRow];
}
fVec[iRow] = sumCol;
fVecSave[iRow] = sumCol;
}
/*CPU_CODE GOES HERE */
start = clock();
for(int b_count =0; b_count<batchsize; b_count++){
for (int iRow = 0; iRow < neqn; iRow++) {
for (int iColumn = 0; iColumn < neqn; iColumn++) {
mat_scratch1[iColumn][iRow]= mat[iColumn][iRow];
}
}
dgeco_blas(mat_scratch1, neqn, ipvt, &rcond, fVecSave);
}
for (int iRow = 0; iRow < neqn; iRow++) {
for (int iColumn = 0; iColumn < neqn; iColumn++) {
mat[iColumn][iRow]= mat_scratch1[iColumn][iRow];
}
}
for(int b_count =0; b_count<batchsize; b_count++){
for(int i = 0; i < neqn; i++) fVec_scratch[i] = fVec[i];
dgesl_blas(mat, neqn, ipvt , fVec_scratch, 0);
}
end = clock();
float seconds = (float)(end - start) / CLOCKS_PER_SEC;
printf("Time in seconds(CPU) : %lf \n", seconds);
/*CPU_CODE ENDS HERE */
start1 = clock();
for(int b_count =0; b_count<batchsize; b_count++){
status = cublasSetMatrix(neqn, neqn, sizeof(A[0]), A, neqn, devPtrA[b_count], neqn);
}
status = cublasDgetrfBatched(handle, neqn, ( double**)devPtrA_dev,neqn,d_pivot_array,d_info_array,batchsize);
if (status != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error in dgetrf %i\n",status);
cudaMemcpy(h_info_array, d_info_array, batchsize*sizeof(int), cudaMemcpyDeviceToHost);
for(int b_count =0; b_count<batchsize; b_count++){
cudaMemcpy(devPtrB[b_count], fVec, neqn*sizeof(double),cudaMemcpyHostToDevice); /* for testing purpose only */
}
status = cublasDgetrsBatched(handle, CUBLAS_OP_N, neqn, batchsize, (const double**)devPtrA_dev,
neqn, d_pivot_array,devPtrB_dev, neqn, h_info_array, batchsize);
for(int b_count =0; b_count<batchsize; b_count++){
cudaMemcpy( fVec,devPtrB[b_count], neqn*sizeof(double),cudaMemcpyDeviceToHost); /* for testing purpose only */
}
end1 = clock();
float seconds1 = (float)(end1 - start1) / CLOCKS_PER_SEC;
printf("Time in seconds(GPU) : %lf \n", seconds1);
printf("Speedup(CPU/GPU) : %lf \n", seconds/seconds1);
system("pause");
/* End of the main portion of the code */
free(mat);
free(mat_scratch1);
free(ipvt);
free(fVec);
free(fVecSave);
free(fVec_scratch);
free(A);
free(B);
cudaFree(devPtrA[0]);
cudaFree(devPtrB[0]);
cudaFree(devPtrA_dev);
cudaFree(devPtrB_dev);
free(devPtrA);
free(devPtrB);
cudaFree(d_x);
free(x);
cudaFree(d_pivot_array);
cudaFree(d_info_array);
free(h_info_array);
cublasDestroy_v2(handle);
}
2. mathlib_blas.h
#include <stdio.h>
#include <math.h>
#define maxm(a,b) (((a) > (b)) ? (a) : (b))
#define minm(a,b) (((a) < (b)) ? (a) : (b))
#define signum(a,b) (((b) < (0)) ? (-a) : (a))
void **ArrayAlloc2d( const int size1, const int size2, const size_t sizeType);
void dgefa_blas(double **a,int n, int ipvt[],int *info);
void dgesl_blas(double **a,int n,int ipvt[],double b[],int job);
void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z);
void **ArrayAlloc2d( const int size1, const int size2, const size_t sizeType )
{
void** array = nullptr;
array = (void**)calloc(size1, sizeof(void*));
if (array != nullptr) {
if (size2 > 0) {
void* data = calloc(size1*size2, sizeType);
if (data != nullptr) {
char* addr = (char*)data;
for (int index1 = 0; index1 < size1; index1++) {
array[index1] = (void*)addr;
addr += sizeType*size2; /* char is always 1 byte */
}
} else {
free(array);
free(data);
array = nullptr;
}
}
} else {
}
return array;
}
void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z)
{
double anorm,ek,s,sm,t,vecdot,vecsum,wk,wkm,ynorm;
int i,info,j,k,kb,kp1,l;
/* Compute 1-norm of a */
anorm = 0.0;
for (j = 0; j < n; j++) {
vecsum = 0.0;
for (i = 0;i < n; i++)
vecsum += fabs(a[i][j]);
anorm = maxm(anorm,vecsum);
}
/* Factor. */
dgefa_blas(a,n,ipvt,&info);
/* rcond = 1/(norm(a) * (estimate of norm(inverse(a)))).
* estimate = norm(z)/norm(y), where a*z=y and trans(a)*y=e.
* trans(a) is the transpose of a. The components of e are
* chosen to cause maximum local growth in the elements of
* w, where trans(u)*w=e. The vectors are frequently rescaled
* to avoid overflow.
*/
ek = 1.0;
for (j = 0; j < n; j++)
z[j] = 0.0;
for (k = 0; k < n; k++) {
if (z[k] != 0.0)
ek = signum(ek,-z[k]);
if (fabs(ek-z[k]) > fabs(a[k][k])) {
s = fabs(a[k][k])/fabs(ek-z[k]);
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ek *= s;
}
wk = ek - z[k];
wkm = -ek - z[k];
s = fabs(wk);
sm = fabs(wkm);
if (a[k][k] != 0.0) {
wk /= a[k][k];
wkm /= a[k][k];
}
else {
wk = 1.0;
wkm = 1.0;
}
kp1 = k + 1;
if (kp1 < n) {
for (j = kp1; j < n; j++) {
sm += (fabs(z[j] + wkm * a[k][j]));
z[j] += (wk * a[k][j]);
s += fabs(z[j]);
}
if (s < sm) {
t = wkm -wk;
wk = wkm;
for (j = kp1; j < n; j++)
z[j] += (t * a[k][j]);
}
}
z[k] = wk;
}
/* dasum(n,s,z,1) */
vecsum = 0.0;
for (i = 0;i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z) */
for (i = 0; i < n; i++)
z[i] *= s;
/* Solve trans(l)*y= w
*/
for (kb = 0; kb < n; kb++) {
k = n - kb - 1;
if (k < (n-1)) {
/* sdot(n-k,a(k+1,k),1,z(k+1),1) */
vecdot = 0.0;
for (i = k+1;i < n; i++)
vecdot += (a[i][k] * z[i]);
z[k] += vecdot;
}
if (fabs(z[k]) > 1.0) {
s = 1.0/fabs(z[k]);
/* dscal(n,s,z) */
for (i = 0; i < n; i++)
z[i] *= s;
}
l = ipvt[k];
t = z[l];
z[l] = z[k];
z[k] = t;
} /* endfor kb */
/* dasum(n,z,1) */
vecsum = 0.0;
for (i = 0; i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm = 1.0;
/*
* Solve l * v = y
*/
for (k = 0; k < n; k++) {
l = ipvt[k];
t = z[l];
z[l] = z[k];
z[k] = t;
if (k < (n-1)) {
/* daxpy(n-k,t,a[k+1][k],1,z[k+1],1) */
for (i = k+1;i < n; i++)
z[i] += (t * a[i][k]);
}
if (fabs(z[k]) > 1.0) {
s = 1.0/fabs(z[k]);
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
}
}
/* dasum(n,z,1) */
vecsum = 0.0;
for (i = 0; i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
/* Solve u * z = v */
for (kb = 0; kb < n; kb++) {
k = n - kb - 1;
if (fabs(z[k]) > fabs(a[k][k])) {
s = fabs(a[k][k])/fabs(z[k]);
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
}
if (a[k][k] != 0.0)
z[k] /= a[k][k];
if (a[k][k] == 0.0)
z[k] = 1.0;
t = -z[k];
/* daxpy(k-1,t,a[1][k],1,z[1],1) */
for (i = 0; i < k; i++)
z[i] += (t * a[i][k]);
}
/* Make znorm = 1.0 */
/* dasum(n,z,1) */
vecsum = 0.0;
for (i = 0; i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
if (anorm != 0.0) *rcond = ynorm/anorm;
if (anorm == 0.0) *rcond = 0.0;
}
void dgefa_blas(double **a,int n, int ipvt[],int *info)
{
double dmax,t;
int i,j,k,kp1,l,nm1;
*info = 0;
nm1 = n - 1;
if (n > 0) {
for (k = 0; k < nm1; k++) {
kp1 = k + 1;
/* Find l = pivot index. */
dmax = fabs(a[k][k]);
l = k;
for (i = k+1; i < n; i++) {
if (fabs(a[i][k]) <= dmax) continue;
l = i;
}
ipvt[k] = l;
/* Zero pivot implies this column already triangularized. */
if (a[l][k] == 0.0) {
*info = k;
continue;
}
/* Interchange if necessary. */
if (l != k) {
t = a[l][k];
a[l][k] = a[k][k];
a[k][k] = t;
}
/* Compute multipliers. */
if (a[k][k] == 0.0) printf("\n!ERROR. Singular matrix.\n");
t = -1.0/a[k][k];
for (i = k+1; i < n; i++)
a[i][k] *= t;
/* Row elimination with column indexing. */
for (j = kp1; j < n; j++) {
t = a[l][j];
if (l != k) {
a[l][j] = a[k][j];
a[k][j] = t;
}
for (i = k+1; i < n; i++ )
a[i][j] += (t * a[i][k]);
}
}
}
ipvt[n-1] = n-1;
if (a[n-1][n-1] == 0.0) *info = n-1;
}
void dgesl_blas(double **a,int n,int ipvt[],double b[],int job)
{
double t;
int i,k,kb,l,nm1;
nm1 = n - 1;
if (job == 0) {
/* job = 0, solve a * x = b.
* First solve l * y = b.
*/
if (n > 0) {
for (k = 0; k < nm1; k++) {
l = ipvt[k];
t = b[l];
if (l != k) {
b[l] = b[k];
b[k] = t;
}
/* saxpy(n-k,t,a(k+1,k),1,b(k+1),1); */
for (i=k+1;i < n;i++)
b[i] += (t * a[i][k]);
}
}
/* Now solve u * x = y. */
for (kb = 0; kb < n; kb++) {
k = n - kb-1;
b[k] /= a[k][k];
t = -b[k];
/* saxpy(k-1,t,a(1,k),1,b(1),1); */
for (i = 0; i < k ; i++)
b[i] += (t * a[i][k]);
}
return;
}
/* job != 0, solve trans(a) * x = b.
* First solve trans(u) * x = y.
*/
for (k = 0; k < n; k++) {
/* t = ddot(k-1,a(1,k),1,b(1),1); */
t = 0;
for (i = 0; i < k; i++)
t += (a[i][k] * b[i]);
b[k] = (b[k] - t)/a[k][k];
}
/* Now solve trans(l) * x = y. */
if (n > 0) {
for (kb = 0; kb < nm1; kb++) {
k = n - 2 - kb;
/* b[k] = b[k] + ddot(n-k,a(k+1,k),1,b(k+1),1); */
t = 0;
for (i = k+1;i < n; i++)
t += (a[i][k] * b[i]);
b[k] += t;
l = ipvt[k];
if (l != k) {
t = b[l];
b[l] = b[k];
b[k] = t;
}
}
}
}
There should not be any behavioral differences between a batch size of 100 and a batch size of 1000. (Certainly there would be a performance difference - the batch size of 1000 should probably take longer.)
There are no published limits to the batch size, other than implicit memory limits. In fact, unless the GPU is returning incorrect results, there is no reason to think that you've run into any hard limit anyway.
( If you wanted to explore some behavioral or performance issue, this question is not properly written to address that. )

how to print values in reverse order for this logic?

Here in my code i'm printing values 1 to 64.But i need to print those values in reverse order that is start from 64 to 1.And all values in single table.For my current logic values are printing in 2 tables.number 1 to 56 in first table and 57 to 64 in second table.How to change this logic.
<%
int apps = 64;
int N = 1, k;
label:
for (int i = 1; i <= apps; i++) {
out.println("<table>");
for (int j = 1; j <= 7; j++) {
out.println("<tr>");
for (k = N; k <= N + 7; k++) {
out.println("<td>");
out.println("" + k + "");
out.println("</td>");
if (k == apps) {
break label;
}
}
out.println("</tr>");
N = N + 8;
}
out.println("</table>");
}
%>
your help will be appreciated.
out.println("<table>");
int row=8;
int col =8;
int count = 64;
for(int i = row;i>0 ;i--){
out.println("<tr>");
for(int j=col;j>0;j--){
out.println("<td>"+count+"</td>");
count--;
}
out.println("</tr>");
}
out.println("</table>");
Try this.
Rather than specifying rows and columns variable we can write it as follows:
out.println("<table>");
int apps = 64;
double rowColumn = Math.sqrt(apps);
for (double i = rowColumn; i > 0; i--) {
out.println("<tr>");
for (double j = rowColumn; j > 0; j--) {
out.println("<td>" + apps + "</td>");
apps--;
}
out.println("</tr>");
}
out.println("</table>");
It will work fine.

using shared memory in cuda gives memory write error

I had a kernel which works fine as
__global__ static void CalcSTLDistance_Kernel(Integer ComputeParticleNumber)
{
const Integer TID = CudaGetTargetID();
const Integer ID = TID;
if(ID >= ComputeParticleNumber)
{
return ;
}
CDistance NearestDistance;
Integer NearestID = -1;
NearestDistance.Magnitude = 1e8;
NearestDistance.Direction = make_Scalar3(0,0,0);
if(c_daOutputParticleID[ID] < -1)
{
c_daSTLDistance[ID] = NearestDistance;
c_daSTLID[ID] = NearestID;
return;
}
Scalar3 TargetPosition = c_daParticlePosition[ID];
Integer TriangleID;
Integer CIDX, CIDY, CIDZ;
Integer CID = GetCellID(&CONSTANT_BOUNDINGBOX,&TargetPosition,CIDX, CIDY, CIDZ);
Integer Range = 1;
if(CID >=0 && CID < c_CellNum)
{
for(Integer k = -Range; k <= Range; ++k)
{
for(Integer j = -Range; j <= Range; ++j)
{
for(Integer i = -Range; i <= Range; ++i)
{
Integer MCID = GetCellID(&CONSTANT_BOUNDINGBOX,CIDX +i, CIDY + j,CIDZ + k);
if(MCID < 0 || MCID >= c_CellNum)
{
continue;
}
unsigned int TriangleNum = c_daCell[MCID].m_TriangleNum;
for(unsigned int l = 0; l < TriangleNum; ++l)
{
TriangleID = c_daCell[MCID].m_TriangleID[l];
if( TriangleID >= 0 && TriangleID < c_TriangleNum && TriangleID != NearestID)// No need to calculate again for the same triangle
{
CDistance Distance ;
Distance.Magnitude = CalcDistance(&c_daTriangles[TriangleID], &TargetPosition, &Distance.Direction);
if(Distance.Magnitude < NearestDistance.Magnitude)
{
NearestDistance = Distance;
NearestID = TriangleID;
}
}
}
}
}
}
}
c_daSTLDistance[ID] = NearestDistance;
c_daSTLID[ID] = NearestID;
}
here c_daParticlePosition is constant memory float3 data type . so here I want to use shared memory so I tried to create float3 type shared memory and tried to copy constant date to shared memory however it shows unknown error and with cuda-memcheck it says
here thread number is 255 with 2 block size
shared_memory code
__global__ static void CalcSTLDistance_Kernel(Integer ComputeParticleNumber)
{
//const Integer TID = CudaGetTargetID();
const Integer ID =CudaGetTargetID();
extern __shared__ float3 s[];
/*if(ID >= ComputeParticleNumber)
{
return ;
}*/
s[ID] = c_daParticlePosition[ID];
__syncthreads();
CDistance NearestDistance;
Integer NearestID = -1;
NearestDistance.Magnitude = 1e8;
NearestDistance.Direction.x = 0;
NearestDistance.Direction.y = 0;
NearestDistance.Direction.z = 0;//make_Scalar3(0,0,0);
//if(c_daOutputParticleID[ID] < -1)
//{
// c_daSTLDistance[ID] = NearestDistance;
// c_daSTLID[ID] = NearestID;
// return;
//}
//Scalar3 TargetPosition = c_daParticlePosition[ID];
Integer TriangleID;
Integer CIDX, CIDY, CIDZ;
Integer CID = GetCellID(&CONSTANT_BOUNDINGBOX,&s[ID],CIDX, CIDY, CIDZ);
if(CID >=0 && CID < c_CellNum)
{
//Integer Range = 1;
for(Integer k = -1; k <= 1; ++k)
{
for(Integer j = -1; j <= 1; ++j)
{
for(Integer i = -1; i <= 1; ++i)
{
Integer MCID = GetCellID(&CONSTANT_BOUNDINGBOX,CIDX +i, CIDY + j,CIDZ + k);
if(MCID < 0 || MCID >= c_CellNum)
{
continue;
}
unsigned int TriangleNum = c_daCell[MCID].m_TriangleNum;
for(unsigned int l = 0; l < TriangleNum; ++l)
{
TriangleID = c_daCell[MCID].m_TriangleID[l];
/*if(c_daTrianglesParameters[c_daTriangles[TriangleID].ModelIDNumber].isDrag)
{
continue;
}*/
if( TriangleID >= 0 && TriangleID < c_TriangleNum && TriangleID != NearestID)// No need to calculate again for the same triangle
{
CDistance Distance ;
Distance.Magnitude = CalcDistance(&c_daTriangles[TriangleID], &s[ID], &Distance.Direction);
if(Distance.Magnitude < NearestDistance.Magnitude)
{
NearestDistance = Distance;
NearestID = TriangleID;
}
}
}
}
}
}
}
c_daSTLDistance[ID] = NearestDistance;
c_daSTLID[ID] = NearestID;
}
error
Invalid __shared__ write of size 4
========= at 0x00000128 in CalcSTLDistance_Kernel(int)
========= by thread (159,0,0) in block (0,0,0)
========= Address 0x0000077c is out of bounds
You may find useful info on how to work with shared memory in this article. Focus especially on static shared memory and dynamic shared memory sections.
Based on above article you should find out that you are simply writing out of bounds of your array s, exactly as the error message says. To fix the issue you can:
either specify the size of shared memory array s at compile time,
if you know it in advance, such as __shared__ float3 s[123456];
or use dynamically sized s array, thats basically what you are doing at the moment, but ALSO specify the third kernel launch parameter as CalcSTLDistance_Kernel<<<gridSize, blockSize, sharedMemorySizeInBytes>>>. In case you will be using an array of 123456 float3s then use int sharedMemorySizeInBytes = 123456 * sizeof(float3)

Using cudaMemcpy3D to transfer *** pointer

I am trying to use cudaMemcpy3D to transfer dynamically allocated 3d matrix (tensor). Tensor is allocated as contiguous block of memory (see code below). I tried various combinations of cudaExtent and cudaMemcpy3DParms, however the order of elements gets mixed up. I created the following example to demonstrate the issue:
#include <stdio.h>
int ***alloc_tensor(int Nx, int Ny, int Nz) {
int i, j;
int ***tensor;
tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));
tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));
tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int)));
for(j = 1; j < Ny; j++)
tensor[0][j] = tensor[0][j-1] + Nz;
for(i = 1; i < Nx; i++) {
tensor[i] = tensor[i - 1] + Ny;
tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
for(j = 1; j < Ny; j++)
tensor[i][j] = tensor[i][j - 1] + Nz;
}
return tensor;
}
__global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
int i, j, k;
char *tensorslice;
int *tensorrow;
for (i = 0; i < Nx; i++) {
for (j = 0; j < Ny; j++) {
for (k = 0; k < Nz; k++) {
tensorslice = ((char *)tensor.ptr) + k * tensor.pitch * Nx;
tensorrow = (int *)(tensorslice + i * tensor.pitch);
printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, tensorrow[j]);
}
}
}
}
int main() {
int i, j, k, value = 0;
int Nx = 2, Ny = 6, Nz = 4;
int ***h_tensor;
struct cudaPitchedPtr d_tensor;
h_tensor = alloc_tensor(Nx, Ny, Nz);
cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));
for(i = 0; i < Nx; i++) {
for(j = 0; j < Ny; j++) {
for(k = 0; k < Nz; k++) {
h_tensor[i][j][k] = value++;
printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[i][j][k]);
}
}
}
cudaMemcpy3DParms cpy = { 0 };
cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);
cpy.dstPtr = d_tensor;
cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
cpy.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(&cpy);
kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);
// ... clean-up
}
Output for host variable (h_tensor) and device (d_tensor) differ, looking like
h_tensor[0][0][0] = 0
h_tensor[0][0][1] = 1
h_tensor[0][0][2] = 2
h_tensor[0][0][3] = 3
h_tensor[0][1][0] = 4
h_tensor[0][1][1] = 5
h_tensor[0][1][2] = 6
...
d_tensor[0][0][0] = 0
d_tensor[0][0][1] = 12
d_tensor[0][0][2] = 24
d_tensor[0][0][3] = 36
d_tensor[0][1][0] = 1
d_tensor[0][1][1] = 13
d_tensor[0][1][2] = 25
...
What am I doing wrong? What would be the correct way to use cudaMemcpy3D?
Any time you are having trouble with a cuda code, it's a good idea to do proper cuda error checking. The code you have posted here, at least, does not run correctly for me - the cudaMemcpy3D line throws an error. This is due to item 2 below. (I suspect the code you used to generate the output was not identical to the code you have shown here, but that's just a guess.)
Your usage of make_cudaPitchedPtr is not correct:
cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);
review the API documentation. Making a CUDA pitched pointer this way is no different between 2D and 3D. So it makes no sense to pass 3 different dimensions as you are doing. Instead do this:
cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
The remaining issues I found I attribute to incorrect understanding of 3 dimensions in C. The last subscript on a multiply-subscripted array is the rapidly varying dimension, i.e. it is the one where adjacent values in memory occupy adjacent index values. Your usage of Z in the 3rd dimension is confusing to me due to this. Your host allocation was using Nx in the first subscript place, but your device indexing didn't match. There are obviously multiple ways to handle this. If you don't like my arrangement, you can change it, but the host and device indexing must match.
Anyway, the following code modifications worked for me:
#include <stdio.h>
int ***alloc_tensor(int Nx, int Ny, int Nz) {
int i, j;
int ***tensor;
tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));
tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));
tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int)));
for(j = 1; j < Ny; j++)
tensor[0][j] = tensor[0][j-1] + Nz;
for(i = 1; i < Nx; i++) {
tensor[i] = tensor[i - 1] + Ny;
tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
for(j = 1; j < Ny; j++)
tensor[i][j] = tensor[i][j - 1] + Nz;
}
return tensor;
}
__global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
int i, j, k;
char *tensorslice;
int *tensorrow;
for (i = 0; i < Nx; i++) {
for (j = 0; j < Ny; j++) {
for (k = 0; k < Nz; k++) {
tensorslice = ((char *)tensor.ptr) + k * tensor.pitch * Ny;
tensorrow = (int *)(tensorslice + j * tensor.pitch);
printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, tensorrow[i]);
}
}
}
}
int main() {
int i, j, k, value = 0;
int Nx = 2, Ny = 6, Nz = 4;
int ***h_tensor;
struct cudaPitchedPtr d_tensor;
h_tensor = alloc_tensor(Nz, Ny, Nx);
cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));
for(i = 0; i < Nx; i++) {
for(j = 0; j < Ny; j++) {
for(k = 0; k < Nz; k++) {
h_tensor[k][j][i] = value++;
//printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[i][j][k]);
}
}
}
for(i = 0; i < Nx; i++) {
for(j = 0; j < Ny; j++) {
for(k = 0; k < Nz; k++) {
//h_tensor[i][j][k] = value++;
printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[k][j][i]);
}
}
}
cudaMemcpy3DParms cpy = { 0 };
cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
cpy.dstPtr = d_tensor;
cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
cpy.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(&cpy);
kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);
cudaDeviceSynchronize();
// ... clean-up
}