I was learning using shared memory to optimize cuda code.
I followed most of the implementations from Nvidia materials.
But I found that my device code is never executed. Anyone could help me figure out why?
Did I miss something? Thanks.
#include <stdio.h>
#include <cuda_runtime.h>
#include <chrono>
#define BLOCKSIZE 16
typedef struct {
int height;
int width;
int stride;
float *element;
} Matrix;
void initData(float *p, int size){
for (int t=0; t<size; t++){
p[t] = (float)(rand()&0xffff)/1000.0f;
}
}
__device__ float getElement(Matrix a, int row, int col)
{
return a.element[row*a.stride+col];
}
__device__ Matrix getSubM(Matrix a, int row, int col)
{
Matrix res;
res.height = BLOCKSIZE;
res.width = BLOCKSIZE;
res.stride = a.width;
res.element = &a.element[row*BLOCKSIZE*a.stride+col*BLOCKSIZE];
return res;
}
__device__ void setElement(Matrix a, int row, int col, float val)
{
a.element[row*a.stride+col] = val;
}
__global__ void shmMM(Matrix a, Matrix b, Matrix c)
{
int blockRow = blockDim.y;
int blockCol = blockDim.x;
Matrix Csub = getSubM(c, blockRow, blockCol);
int row = threadIdx.y;
int col = threadIdx.x;
float tmp = 0;
for (int i=0; i < a.width/BLOCKSIZE; i++)
{
Matrix a_sub = getSubM(a, blockRow, i);
Matrix b_sub = getSubM(b, i, blockCol);
__shared__ float A[BLOCKSIZE][BLOCKSIZE];
__shared__ float B[BLOCKSIZE][BLOCKSIZE];
A[row][col] = getElement(a, row, col);
B[row][col] = getElement(b, row, col);
__syncthreads();
for (int e = 0; e < BLOCKSIZE; e++)
{
tmp += A[row][e]*B[e][col];
}
__syncthreads();
}
//printf("debug: %f.\n", tmp);
setElement(Csub, row, col, tmp);
}
int main()
{
Matrix a, b, c;
int size = 1<<12;
a.height = a.width = size;
b.height = b.width = size;
c.height = c.width = size;
a.stride = a.width;
b.stride = b.width;
c.stride = c.width;
float *a_h, *b_h, *c_h;
cudaMallocHost((float**)&a_h, a.height*a.width*sizeof(float));
cudaMallocHost((float**)&b_h, b.height*b.width*sizeof(float));
initData(a_h, a.height*a.width);
initData(b_h, b.height*b.width);
c_h = (float*)malloc(c.height*c.width*sizeof(float));
float *a_d, *b_d, *c_d;
cudaMalloc((float**)&a.element, a.height*a.width*sizeof(float));
cudaMalloc((float**)&b.element, b.height*b.width*sizeof(float));
cudaMalloc((float**)&c.element, c.height*c.width*sizeof(float));
cudaMemcpy(a.element, a_h, a.height*a.width*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(b.element, b_h, b.height*b.width*sizeof(float), cudaMemcpyHostToDevice);
dim3 block(BLOCKSIZE, BLOCKSIZE);
dim3 grid((b.width-1)/block.x+1, (a.height-1)/block.y+1);
//naiveMM<<<block, grid>>>(a, b, c);
shmMM<<<block, grid>>>(a, b, c);
cudaMemcpy(c_h, c.element, c.height*c.width*sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
cudaFree(a_h);
cudaFree(b_h);
free(c_h);
cudaFree(a.element);
cudaFree(b.element);
cudaFree(c.element);
return 0;
}
I couldn't figure it out since there is no reported compiling error and runtime error.
since there is no reported compiling error and runtime error.
You won't get any reported runtime errors if you fail to use proper CUDA error checking. I recommend that any time you are having trouble with a CUDA code. It's also good practice to run your code with a sanitizer such as cuda-memcheck or compute-sanitizer, depending on your GPU.
If you had done any of that, you would have gotten an invalid configuration argument error on your kernel launch. That would have or should have focused your attention on this code:
dim3 block(BLOCKSIZE, BLOCKSIZE);
dim3 grid((b.width-1)/block.x+1, (a.height-1)/block.y+1);
//naiveMM<<<block, grid>>>(a, b, c);
shmMM<<<block, grid>>>(a, b, c);
The problem there is that you have your block and grid arguments reversed, it should be:
shmMM<<<grid, block>>>(a, b, c);
I'm not suggesting I have fully debugged your application. But that is the source of the reason for this:
CUDA kernel code does not execute
These lines of code are also incorrect:
cudaFree(a_h);
cudaFree(b_h);
but that isn't the source of the problem you are asking about. The corresponding free operation for cudaMallocHost is cudaFreeHost, as mentioned here
Related
I'm working on a CUDA matrix multiplication, but I did some modifications to observe how they affect performances.
I want to observe the behavior and performances of a matrix multiplication kernel, making some changes. I'm measuring the changes in GPU events time, I'm testing it in two speicific different conditions:
I have an amount of matrices (say matN) for A, B and C, then I transfer (H2D) one matrix for A, one for B and multply them, to transfer back (D2H) one C;
I have matN for A, B and C, but I transfer >1(say chunk) matrices for A and for B, I compute exactly chunk multiplications, and transfer back chunk result matrices C.
In the first case (chunk = 1) all works as expected, but in the second case (chunk > 1) I get some of Cs are correct, while others are not.
But if I put a cudaDeviceSynchronize() after the cudaMemcpyAsync, I get correct results.
Here's the code doing what I've just described:
/**** main.cpp ****/
int chunk = matN/iters;
#ifdef LOWPAR
GRIDx= 1;
GRIDy= 1;
label="LOW";
#else
int sizeX = M;
int sizeY = N;
GRIDx = ceil((sizeX)/BLOCK);
GRIDy = ceil((sizeY)/BLOCK);
label="";
#endif
const int bytesA = M*K*sizeof(float);
const int bytesB = K*N*sizeof(float);
const int bytesC = M*N*sizeof(float);
//device mem allocation
float *Ad, *Bd, *Cd;
gpuErrchk( cudaMalloc((void **)&Ad, bytesA*chunk) );
gpuErrchk( cudaMalloc((void **)&Bd, bytesB*chunk) );
gpuErrchk( cudaMalloc((void **)&Cd, bytesC*chunk) );
//host pinned mem allocation
float *A, *B, *C;
gpuErrchk( cudaMallocHost((void **)&A, bytesA*matN) );
gpuErrchk( cudaMallocHost((void **)&B, bytesB*matN) );
gpuErrchk( cudaMallocHost((void **)&C, bytesC*matN) );
//host data init
for(int i=0; i<matN; ++i){
randomMatrix(M, K, A+(i*M*K));
randomMatrix(K, N, B+(i*K*N));
}
//event start
createAndStartEvent(&startEvent, &stopEvent);
if (square)
{
label += "SQUARE";
int size = N*N;
for (int i = 0; i < iters; ++i) {
int j = i%nStream;
int idx = i*size*chunk;
newSquareMatMulKer(A+idx, B+idx, C+idx, Ad, Bd, Cd, N, chunk, stream[j]);
}
}
else {
...
}
msTot = endEvent(&startEvent, &stopEvent);
#ifdef MEASURES
printMeasures(square, label, msTot, millis.count(), matN, iters, devId);
#else
float *_A, *_B, *_C, *tmpC;
tmpC = (float *)calloc(1,bytesC*chunk);
for (int s=0; s<matN; ++s)
{
_A = A+(s*M*K);
_B = B+(s*K*N);
_C = C+(s*M*N);
memset(tmpC, 0, bytesC*chunk);
hostMatMul(_A, _B, tmpC, M, K, N);
checkMatEquality(_C, tmpC, M, N);
}
#endif
/**** matmul.cu ****/
__global__ void squareMatMulKernel(float* A, float* B, float* C, int N, int chunk) {
int ROW = blockIdx.x*blockDim.x+threadIdx.x;
int COL = blockIdx.y*blockDim.y+threadIdx.y;
if (ROW<N && COL<N) {
int size=N*N;
int offs = 0;
float tmpSum=0.0f;
for (int s=0; s<chunk; ++s)
{
offs = s*size;
tmpSum = 0.0f;
for (int i = 0; i < N; ++i) {
tmpSum += A[offs+(ROW*N)+i] * B[offs+(i*N)+COL];
}
C[offs+(ROW*N)+COL] = tmpSum;
}
}
return ;
}
void newSquareMatMulKer(float *A, float *B, float *C, float *Ad, float *Bd, float *Cd,
int n, int chunk, cudaStream_t strm)
{
int size = n*n;
int bytesMat = size*sizeof(float);
dim3 dimBlock(BLOCK,BLOCK,1);
dim3 dimGrid(GRIDx, GRIDy,1);
gpuErrchk( cudaMemcpyAsync(Ad, A, bytesMat*chunk, cudaMemcpyHostToDevice, strm) );
gpuErrchk( cudaMemcpyAsync(Bd, B, bytesMat*chunk, cudaMemcpyHostToDevice, strm) );
#ifdef LOWPAR
squareMatMulGridStrideKer<<<dimGrid, dimBlock, 0, strm>>>(Ad, Bd, Cd, n, chunk);
#else
squareMatMulKernel<<<dimGrid, dimBlock, 0, strm>>>(Ad, Bd, Cd, n, chunk);
#endif
squareMatMulKernel<<<dimGrid, dimBlock, 0, strm>>>(Ad, Bd, Cd, n, chunk);
gpuErrchk( cudaMemcpyAsync( C, Cd, bytesMat*chunk, cudaMemcpyDeviceToHost, strm) );
cudaDeviceSynchronize();
^ ^ ^ ^ ^ ^
}
I tried to debug using cuda-gdb but nothing strange showed up, gpuErrchk doesn't throw errors in CUDA API calls.
I run the code using memcheck too, both with and without cudaDeviceSynchronize and I got no error.
I think it can be a synchronization issue, but I can't understand the reason behind that.
Can someone spot where I'm wrong?
Other code advices are really appreciated too.
If you are using multiples streams, you may override Ad and Bd before using them.
Example with iters = 2 and nStream = 2 :
for (int i = 0; i < iters; ++i) {
int j = i%nStream;
int idx = i*size*chunk;
newSquareMatMulKer(A+idx, B+idx, C+idx, Ad, Bd, Cd, N, chunk, stream[j]);
}
From this loop, you will call
newSquareMatMulKer(A, B, C, Ad, Bd, Cd, N, chunk, stream[0]); // call 0
newSquareMatMulKer(A+idx, B+idx, C+idx, Ad, Bd, Cd, N, chunk, stream[1]); // call 1
As you are using the same memory area on device for both call, you may have several synchronizations issues:
call 1 start to copy A and B on device before call 0:squareMatMulKernel end, so you may use incorrect values of A and/or B to compute your first iteration.
call 1:squareMatMulKernel start before you retrieve the values of C from call 0, so you may override C with values from call 1.
To fix this problem, I see two approaches:
Using synchronization as in your example with cudaDeviceSynchronize();.
You can allocate more memory two device side (one workspace per stream), for example.
''
//device mem allocation
float *Ad, *Bd, *Cd;
gpuErrchk( cudaMalloc((void **)&Ad, bytesA*chunk*nStream) );
gpuErrchk( cudaMalloc((void **)&Bd, bytesB*chunk*nStream) );
gpuErrchk( cudaMalloc((void **)&Cd, bytesC*chunk*nStream) );
/* code here */
for (int i = 0; i < iters; ++i) {
int j = i%nStream;
int idx = i*size*chunk;
int offset_stream = j*size*chunk;
newSquareMatMulKer(A+idx, B+idx, C+idx,
Ad + offset_stream ,
Bd + offset_stream ,
Cd + offset_stream , N, chunk, stream[j]);
}
In this case you don't need synchronization before the end of the loop.
If one wants to copy the arrays to device from host one does cudamalloc and cudaMemcpy. But to lessen the hassle one just does cudaMallocManaged without the former two things and life was never simpler before.
The code looks like this(more or less)
__global__ void convert(float kelvin[], float celsius[]) //can pass
arrays in kernel
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<N)
kelvin[i]=celsius[i]+273.15;
}
int main()
{
float *celsius =(float *)malloc(N*sizeof(float));
float *kelvin =(float *)malloc(N*sizeof(float));
cudaMallocManaged(&celsius, N*sizeof(float));
cudaMallocManaged(&kelvin, N*sizeof(float));
// init celsius here
dim3 blocksPerGrid(1,1,1); //use only one block
dim3 threadsPerBlock(N,1,1); //use N threads in the block
convert<<<blocksPerGrid, threadsPerBlock>>>(kelvin,celsius);
cudaDeviceSynchronize();
//Doing stuff with the output here
return 0;
}
The previous example seems clear to me. But, how to do cudaMallocManaged for 2D and 3D array? I've been trying
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < N && j < N)
C[i][j] = A[i][j] + B[i][j];
}
int main()
{ // I thonk, 2D arrays can be passed as pointer to pointers
float **A = (float **)malloc(N*N*sizeof(float));
float **B = (float **)malloc(N*N*sizeof(float));
float **C = (float **)malloc(N*N*sizeof(float));
cudaMallocManaged(&A, N*N*sizeof(float));
cudaMallocManaged(&B, N*N*sizeof(float));
cudaMallocManaged(&C, N*N*sizeof(float));
A[N][N]={{1,0,0},{0,1,0},{0,0,1}};
B[N][N]={{1,0,0},{0,1,0},{0,0,1}};
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
//outputs and all
}
But, It shows the following error
matrix_add.cu(22): error: too many initializer values
matrix_add.cu(25): error: argument of type "float **" is incompatible with parameter of type "float (*)[3]"
Your help is highly appreciated.
You got a lot wrong in your attempt, so much that it was faster to write a working version than list out all the individual problems in the code in your question. So here is a working version of what it appears you were trying to do:
#include <algorithm>
#include <iostream>
const int N = 3;
__global__ void MatAdd(float A[][N], float B[][N], float C[][N])
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < N && j < N)
C[i][j] = A[i][j] + B[i][j];
}
int main()
{
float* A; cudaMallocManaged(&A, N*N*sizeof(float));
float* B; cudaMallocManaged(&B, N*N*sizeof(float));
float* C; cudaMallocManaged(&C, N*N*sizeof(float));
const float A_vals[N][N]={{1,0,0},{0,1,0},{0,0,1}};
const float B_vals[N][N]={{1,0,0},{0,1,0},{0,0,1}};
float (*C_vals)[N] = reinterpret_cast<float (*)[N]>(C);
std::copy(&A_vals[0][0], &A_vals[0][0] + N*N, A);
std::copy(&B_vals[0][0], &B_vals[0][0] + N*N, B);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(1, 1);
MatAdd<<<numBlocks, threadsPerBlock>>>( reinterpret_cast<float (*)[N]>(A),
reinterpret_cast<float (*)[N]>(B),
C_vals );
cudaDeviceSynchronize();
for(int i=0; i<N; i++) {
for(int j=0; j<N; j++) {
std::cout << C_vals[i][j] << " ";
}
std::cout << std::endl;
}
return 0;
}
Some important points:
Managed memory allocation replaces standard host memory allocation and produces memory which is directly accessible on both the host and the device.
All arrays decay to a pointer when passed as arguments to a function by value. That decay is not recursive. See here for more details.
You can (and will need to) cast in order to use the [][] access syntax on linear memory allocated dynamically at runtime (this applies to malloc, new, or any of the CUDA host memory allocation APIs. See here for more details).
Initialization syntax and assignment syntax for arrays are not interchangeable.
All I can suggest is that you study it thoroughly until you understand how it works.
I have 2000 2D-arrays (each array is 1000x1000). I need to compute the mean of each one and put the result in one 2000 vector.
I tried to do that by calling the kernel for each 2D-array, but it is naive, and I want to do the computation once.
What I have been doing is a kernel for one 2D-array. I want to make my kernel do this for 2000 2D-arrays, but in one kernel.
#include <stdio.h>
#include <cuda.h>
#include <time.h>
void init_mat(float *a, const int N, const int M);
void print_mat(float *a, const int N, const int M, char *d);
void print_array(float *a, const int N, char *d);
const int threadsPerBlock=256;
__global__
void kernel(float *mat, float *out, const int N, const int M){
__shared__ float cache[threadsPerBlock];
int tid=threadIdx.x+blockIdx.x*blockDim.x;
int cacheIndex = threadIdx.x;
float sum=0;
if (tid<M) {
for(int i=0; i<N; i++)
sum += mat[(i*M)+tid];
cache[cacheIndex] = sum;
out[tid] =cache[cacheIndex];
}
__syncthreads();
int i = blockDim.x/2;
while(i != 0) {
if(cacheIndex<i)
cache[cacheIndex]+= cache[cacheIndex +i];
__syncthreads();
I /= 2;
}
if (cacheIndex == 0)
out[blockIdx.x]=cache[0];
}
int main (void) {
srand( time(NULL) );
float *a, *b, *c;
float *dev_a, *dev_b, *dev_c;
int N=1000;
int M=1000;
b=(float*)malloc(sizeof(float)*N*M);
c=(float*)malloc(sizeof(float)*M);
init_mat(b, N, M);
printf("<<<<<<<<<< initial data:\n");
print_mat(b, N, M, "matrix");
cudaMalloc((void**)&dev_b, sizeof(float)*N*M);
cudaMalloc((void**)&dev_c, sizeof(float)*M);
cudaMemcpy(dev_b, b, sizeof(float)*N*M, cudaMemcpyHostToDevice);
printf("\n\nRunning Kernel...\n\n");
kernel<<<M/256+1, 256>>>(dev_b, dev_c, N, M);
cudaMemcpy(c, dev_c, sizeof(float)*M, cudaMemcpyDeviceToHost);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
printf(">>>>>>>>>> final data:\n");
print_array(c, M, "out-vector");
};
void init_mat(float *a, const int N, const int M) {
int i, j;
for(i=0; i<N; i++)
for(j=0; j<M; j++)
a[i*M+j] = rand() % 100 + 1;
}
void print_mat(float *a, const int N, const int M, char *d) {
int i, j;
for(i=0; i<N; i++){
printf("\n%s[%d]:", d, i);
for (j=0; j<M; j++)
printf("\t%6.4f", a[i*M+j]);
}
printf("\n");
}
void print_array(float *a, const int N, char *d) {
int i;
for(i=0; i<N; i++)
printf("\n%s[%d]: %f",d, i, a[i]);
printf("\n");
}
For a reasonably large number of arrays (e.g. 2000) and reasonably large sized arrays (e.g. 2000), the GPU can be fairly efficient if we assign a block to perform the sum reduction (and mean calculation) for each array. This means if you have 2000 arrays we will launch 2000 blocks.
In order to handle arbitrary sized arrays with a fixed number of threads per block, we will use an idea like the grid-striding loop but instead we will cause each block to use a block-striding loop to load all the data associated with a particular array. This means the threads of each block will "stride" through the assigned array, to load all the elements of that array.
Apart from this, the main reduction operation is similar to what you have written, and calculation of the mean is trivial this way - we can calculate the mean before writing the result to global memory, once we have the sum calculated via reduction.
Here is a worked example. If you compile with -DMEAN the code will output the mean of each array. If you omit that compile switch, the code will output the sum of each array. Let N be the number of arrays, and let K be the size of each array.
$ cat t1285.cu
#include <stdio.h>
const size_t N = 1000; // number of arrays
const size_t K = 1000; // size of each array
const int nTPB = 256; // number of threads per block, must be a power-of-2
typedef float mytype; // type of data to be summed
// produce the sum or mean of each array
template <typename T>
__global__ void breduce(const T * __restrict__ idata, T * __restrict__ odata, const int bsize){
__shared__ T sdata[nTPB];
T sum = 0;
//block-striding loop
size_t offset = blockIdx.x*bsize + threadIdx.x;
while (offset < (blockIdx.x+1)*bsize){
sum += idata[offset];
offset += blockDim.x;}
sdata[threadIdx.x] = sum;
__syncthreads();
//shared memory reduction sweep
for (int i = nTPB>>1; i > 0; i>>=1){
if (threadIdx.x < i) sdata[threadIdx.x] += sdata[threadIdx.x+i];
__syncthreads();}
// write output sum for this block/array
#ifndef MEAN
if (!threadIdx.x) odata[blockIdx.x] = sdata[0];
#else
if (!threadIdx.x) odata[blockIdx.x] = sdata[0]/bsize;
#endif
}
int main(){
mytype *h_idata, *h_odata, *d_idata, *d_odata;
h_idata=(mytype *)malloc(N*K*sizeof(mytype));
h_odata=(mytype *)malloc(N*sizeof(mytype));
cudaMalloc(&d_idata, N*K*sizeof(mytype));
cudaMalloc(&d_odata, N*sizeof(mytype));
for (size_t i = 0; i < N; i++)
for (size_t j = 0; j < K; j++)
h_idata[i*K+j] = 1 + (i&1); // fill alternating arrays with 1 and 2
memset(h_odata, 0, N*sizeof(mytype)); // zero out
cudaMemset(d_odata, 0, N*sizeof(mytype)); // zero out
cudaMemcpy(d_idata, h_idata, N*K*sizeof(mytype), cudaMemcpyHostToDevice);
breduce<<<N, nTPB>>>(d_idata, d_odata, K);
cudaMemcpy(h_odata, d_odata, N*sizeof(mytype), cudaMemcpyDeviceToHost);
// validate
for (size_t i = 0; i < N; i++)
#ifndef MEAN
if (h_odata[i] != (K*(1 + (i&1)))) {printf("mismatch at %d, was: %f, should be: %f\n", i, (float)h_odata[i], (float)(K*(1 + (i&1)))); return 1;}
#else
if (h_odata[i] != ((1 + (i&1)))) {printf("mismatch at %d, was: %f, should be: %f\n", i, (float)h_odata[i], (float)((1 + (i&1)))); return 1;}
#endif
return 0;
}
$ nvcc -arch=sm_35 -o t1285 t1285.cu -DMEAN
$ cuda-memcheck ./t1285
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvcc -arch=sm_35 -o t1285 t1285.cu
$ cuda-memcheck ./t1285
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
i'm a beginner in cuda programming. I'm trying an own easy code but it's not working and I don't know what else to do.
My code:
#include <mpi.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/wait.h>
// Prototypes
__global__ void helloWorld(char*);
__device__ int getGlobalIdx_2D_2D();
// Host function
int main(int argc, char** argv)
{
unsigned int i, N, gridX, gridY, blockX, blockY;
N = 4096000;
char *str = (char *) malloc(N*sizeof(char));
for(i=0; i < N; i++) str[i]='c';
MPI_Init (&argc, &argv);
char *d_str;
size_t size = (size_t) N*sizeof(char);
cudaMalloc((void**)&d_str, size);
cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
gridX = 100;
gridY = 10;
blockX = blockY = 64;
dim3 dimGrid(gridX, gridY); // 4096 chars per block
dim3 dimBlock(blockX, blockY); // one thread per character, 2D
printf("dimGrid(%d, %d)\t", gridX, gridY);
printf("dimBlock(%d, %d)\t", blockX, blockY);
helloWorld<<< dimGrid, dimBlock >>>(d_str);
cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
MPI_Barrier (MPI_COMM_WORLD);
cudaFree(d_str);
printf("\nRes:\n");
for(i = 0; i < N; i++) printf("\t[%u] %c\n", i, str[i]);
MPI_Finalize ();
free(str);
return 0.0;
}
// Device kernel
__global__ void helloWorld(char* str)
{
// determine where in the thread grid we are
int pos = getGlobalIdx_2D_2D();
if (pos % 2 == 0) str[pos] -= 2;
else str[pos] += 8;
}
__device__ int getGlobalIdx_2D_2D()
{
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y) +
(threadIdx.y * blockDim.x) + threadIdx.x;
return threadId;
}
My desired output is: jajajajajajaja... x4096000
I've read that '%' operation is not efficient, but I don't think is the problem there.
Thanks!
You are performing absolutely no CUDA error checking, it is really beneficial to do so. Once you enable it you can find that block dimensions 64 x 64 are invalid as it results into 4096 threads within one block, which is not a valid configuration.
I have a global function as follows:
__global__ void sort(float* D, float* new_D)
{
int i = threadIdx.x + blockIdx.x * blockDim.x ; // i>=0 && i<N
new_D[ 4*(i/4)+i%2] = D[ 4*(i/4)+2*(i%2) ];
}
And it's called like this:
sort<<<(N/threadperblock),threadperblock>>>(D,new_D);
The function operates incorrectly when I define "N" more than 2048 in single precision,
and 4096 in double precision as I get wrong answers. What's going wrong?
It is absolutely impossible to say anything about why you might not be getting the expected results from your code. An obvious source of error would be uninitialised memory. Your indexing scheme is only assign values to half of new_D, so if you have not taken deliberate steps to assign values to the other values, then the results will contain uninitialised values and miscomparisons or unexpected values between the GPU version and a host implementation could occur.
To illustrate my point, here is a complete repro case which works correctly at any input size which is a power of two:
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>
const int N = (2<<20);
__global__ void sort(float* D, float* new_D)
{
int i = threadIdx.x + blockIdx.x * blockDim.x ; // i>=0 && i<N
new_D[ 4*(i/4)+i%2] = D[ 4*(i/4)+2*(i%2) ];
}
__host__ void host_sort(const float* D, float* new_D)
{
for(int i=0; i<N; i++)
new_D[ 4*(i/4)+i%2] = D[ 4*(i/4)+2*(i%2) ];
}
int main(void)
{
const size_t dsize =sizeof(float) * size_t(N);
float *D = (float *)malloc(dsize);
float *new_D = (float *)malloc(dsize);
for(int i=0; i<N; i++) {
D[i] = (float)i;
new_D[i] = -999.0f;
}
float *D_gpu, *new_D_gpu;
assert( cudaMalloc((void**)&D_gpu, dsize) == cudaSuccess );
assert( cudaMemcpy(D_gpu, D, dsize, cudaMemcpyHostToDevice) == cudaSuccess);
assert( cudaMalloc((void**)&new_D_gpu, dsize) == cudaSuccess );
assert( cudaMemcpy(new_D_gpu, new_D, dsize, cudaMemcpyHostToDevice) == cudaSuccess);
dim3 blocksize = dim3(128,1,1);
dim3 gridsize = dim3(N/blocksize.x,1,1);
host_sort(D, new_D);
sort<<< gridsize, blocksize >>>(D_gpu,new_D_gpu);
assert( cudaPeekAtLastError() == cudaSuccess );
assert( cudaThreadSynchronize() == cudaSuccess );
float *new_D_host = (float *)malloc(dsize);
assert( cudaMemcpy(new_D_host, new_D_gpu, dsize, cudaMemcpyDeviceToHost) == cudaSuccess);
for(int i=0; i<N; i++)
assert( new_D_host[i] == new_D[i] );
return 0;
}
You should be aware that half of the threads in your kernel are effectively doing redundant assignments and unnecessarily burning memory bandwidth as a result.
What is the threadperblock value? Does it change when you are working in single precission and double precission?
The reason I am asking --- threadIdx.x, blockIdx.x and blockDim.x work as unsigned short. The maximum value that they can hold is 65535, until you cast it to int. If you exceed that value, also when doing mathematical operations, you can get really weird results.
Try this:
int i=blockDim.x;
i=i*blockIdx.x+threadIdx.x