Performance difference due to indexing during matrix multiplication - cuda

I'm trying out the difference between using a tiled and naive implementation in CUDA C++. I expect to see a performance gap in these variations because of the repeated usage of shared memory. However, the speedup was only about twice as fast (naive ~12ms and tiled ~6ms). Here are the code snippets:
#include <iostream>
#include <assert.h>
using namespace std;
# define N 1024
# define THREADS 16
# define IDX(x, y, s) (x*s + y)
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
void init_values(int *a, int *b, int sz) {
for(int i=0; i<sz; i++) {
a[i] = rand()%513 - 256;
b[i] = rand()%513 - 256;
}
}
__global__
void matmul(int *a, int *b, int *c, int n) {
// perform parallel matmul
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int t = 0;
for(int i=0; i<n; i++) {
t += (a[IDX(x, i, n)] * b[IDX(i, y, n)]);
}
c[IDX(x, y, n)] = t;
}
void matmul_verify(int *a, int *b, int *c, int n) {
for(int i=0; i<n; i++) {
for(int j=0; j<n; j++) {
int t = 0;
for(int k=0; k<n; k++)
t += a[IDX(i, k, n)] * b[IDX(k, j, n)];
// cout << i << " " << j << " " << c[IDX(i, j, n)] << " " << t << endl;
assert(c[IDX(i, j, n)] == t);
}
}
}
int main()
{
int *a, *b, *c;
int *da, *db, *dc;
size_t sz = N * N * sizeof(int);
a = (int*)malloc(sz);
b = (int*)malloc(sz);
c = (int*)malloc(sz);
init_values(a, b, N*N);
gpuErrchk(cudaMalloc((void**)&da, sz));
gpuErrchk(cudaMalloc((void**)&db, sz));
gpuErrchk(cudaMalloc((void**)&dc, sz));
gpuErrchk(cudaMemcpy(da, a, sz, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(db, b, sz, cudaMemcpyHostToDevice));
// init grid size
dim3 grids(N/THREADS, N/THREADS);
dim3 blocks(THREADS, THREADS);
// time it
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
matmul<<<grids, blocks>>>(da, db, dc, N);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
cout << "Took " << milliseconds << " milliseconds.\n";
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(c, dc, sz, cudaMemcpyDeviceToHost));
matmul_verify(a, b, c, N);
cudaFree(da);
cudaFree(db);
cudaFree(dc);
free(a);
free(b);
free(c);
cudaEventDestroy(start);
cudaEventDestroy(stop);
return 0;
}
and for the tiled implementation, I change the kernel as
__global__
void matmul(int *a, int *b, int *c, int n) {
// perform parallel matmul
int ty = threadIdx.y, by = blockIdx.y;
int tx = threadIdx.x, bx = blockIdx.x;
int x = bx * blockDim.x + tx;
int y = by * blockDim.y + ty;
// block IDs tell us which block to solve for
// (bx, by) --> (bx: bx + tx, by:by + ty)
__shared__ int A[SHMEM_SIZE];
__shared__ int B[SHMEM_SIZE];
const int tile_size = THREADS;
// to get value of tile [tx, ty] in block [bx, by], we need blocks A[bx, *] and blocks B[*, by]
int res = 0;
for(int blk=0; blk < n; blk+=tile_size) {
// block index
A[IDX(tx, ty, tile_size)] = a[IDX(x, blk + ty, n)];
B[IDX(tx, ty, tile_size)] = b[IDX(blk + tx, y, n)];
__syncthreads();
for(int k=0; k<tile_size; k++) {
res += (A[IDX(tx, k, tile_size)] * B[IDX(k, ty, tile_size)]);
}
__syncthreads();
}
// for(int k=0; k<n; k++)
// res += a[IDX(x, k, n)] * b[IDX(k, y, n)];
c[IDX(x, y, n)] = res;
}
nothing else really changes. However, in the tiled implementation, if I simply change
int ty = threadIdx.x, by = blockIdx.x;
int tx = threadIdx.y, bx = blockIdx.y;
for the initialization of thread and block indices, I get about a ~1ms runtime (12x speedup). How is this happening? I read from the book "CUDA By Example" that the thread and block indices in 2 dimensions are just for programmer convenience and do not reflect any difference in performance. This seems to be false. Any clarification is really appreciated.

CUDA thread blocks are partitioned into warps of 32 threads. Ideally the neighboring lanes of a warp should always load neighboring elements from global memory. This is called coalescing and allows for maximum memory bandwidth. In hardware all the coalesced loads from a warp will be bundled into a minimal number of memory transactions.
Other factors that can deteriorate memory bandwidth are the size of the load (one can try to use the builtin vector types to get bigger loads for optimization, e.g. int2, int4, float2, etc.) and alignment.
The mapping from 3D threadIdx to warp lanes always takes the first dimension .x as the continuous dimension, i.e. a block of dimensions (32, 2, 1) will have one warp with threadIdx.y == 0 and one warp with threadIdx.y == 1 where the lanes of each warp correspond to threadIdx.x.
Therefore to allow for coalescing, you have to access memory as
A[ty * s + tx] // coalesced access
instead of
A[tx * s + ty] // strided access
to achieve optimal performance.
What is probably meant in the book you mentioned is that there shouldn't be a performance difference between launching a grid of (32, 2, 1) blocks and a grid of (64, 1, 1) blocks while manually getting ty = threadIdx.x / 32 and tx = threadIdx.x % 32. These divisions probably happen internally when having a block that is not flat in the first place.

Related

cudaMallocManaged for 2D and 3D array

If one wants to copy the arrays to device from host one does cudamalloc and cudaMemcpy. But to lessen the hassle one just does cudaMallocManaged without the former two things and life was never simpler before.
The code looks like this(more or less)
__global__ void convert(float kelvin[], float celsius[]) //can pass
arrays in kernel
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<N)
kelvin[i]=celsius[i]+273.15;
}
int main()
{
float *celsius =(float *)malloc(N*sizeof(float));
float *kelvin =(float *)malloc(N*sizeof(float));
cudaMallocManaged(&celsius, N*sizeof(float));
cudaMallocManaged(&kelvin, N*sizeof(float));
// init celsius here
dim3 blocksPerGrid(1,1,1); //use only one block
dim3 threadsPerBlock(N,1,1); //use N threads in the block
convert<<<blocksPerGrid, threadsPerBlock>>>(kelvin,celsius);
cudaDeviceSynchronize();
//Doing stuff with the output here
return 0;
}
The previous example seems clear to me. But, how to do cudaMallocManaged for 2D and 3D array? I've been trying
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < N && j < N)
C[i][j] = A[i][j] + B[i][j];
}
int main()
{ // I thonk, 2D arrays can be passed as pointer to pointers
float **A = (float **)malloc(N*N*sizeof(float));
float **B = (float **)malloc(N*N*sizeof(float));
float **C = (float **)malloc(N*N*sizeof(float));
cudaMallocManaged(&A, N*N*sizeof(float));
cudaMallocManaged(&B, N*N*sizeof(float));
cudaMallocManaged(&C, N*N*sizeof(float));
A[N][N]={{1,0,0},{0,1,0},{0,0,1}};
B[N][N]={{1,0,0},{0,1,0},{0,0,1}};
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
//outputs and all
}
But, It shows the following error
matrix_add.cu(22): error: too many initializer values
matrix_add.cu(25): error: argument of type "float **" is incompatible with parameter of type "float (*)[3]"
Your help is highly appreciated.
You got a lot wrong in your attempt, so much that it was faster to write a working version than list out all the individual problems in the code in your question. So here is a working version of what it appears you were trying to do:
#include <algorithm>
#include <iostream>
const int N = 3;
__global__ void MatAdd(float A[][N], float B[][N], float C[][N])
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < N && j < N)
C[i][j] = A[i][j] + B[i][j];
}
int main()
{
float* A; cudaMallocManaged(&A, N*N*sizeof(float));
float* B; cudaMallocManaged(&B, N*N*sizeof(float));
float* C; cudaMallocManaged(&C, N*N*sizeof(float));
const float A_vals[N][N]={{1,0,0},{0,1,0},{0,0,1}};
const float B_vals[N][N]={{1,0,0},{0,1,0},{0,0,1}};
float (*C_vals)[N] = reinterpret_cast<float (*)[N]>(C);
std::copy(&A_vals[0][0], &A_vals[0][0] + N*N, A);
std::copy(&B_vals[0][0], &B_vals[0][0] + N*N, B);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(1, 1);
MatAdd<<<numBlocks, threadsPerBlock>>>( reinterpret_cast<float (*)[N]>(A),
reinterpret_cast<float (*)[N]>(B),
C_vals );
cudaDeviceSynchronize();
for(int i=0; i<N; i++) {
for(int j=0; j<N; j++) {
std::cout << C_vals[i][j] << " ";
}
std::cout << std::endl;
}
return 0;
}
Some important points:
Managed memory allocation replaces standard host memory allocation and produces memory which is directly accessible on both the host and the device.
All arrays decay to a pointer when passed as arguments to a function by value. That decay is not recursive. See here for more details.
You can (and will need to) cast in order to use the [][] access syntax on linear memory allocated dynamically at runtime (this applies to malloc, new, or any of the CUDA host memory allocation APIs. See here for more details).
Initialization syntax and assignment syntax for arrays are not interchangeable.
All I can suggest is that you study it thoroughly until you understand how it works.

Can't find out why cudaMemcpy fails using curand library functions

I'm trying to initialize in a random manner the weights ( stored as floats ) of a neural network using CURAND functions.
I first initialize the neural netwotk with some values and after that I attempt to copy the two matrices in the nn struct ( nn stands for neural network ), that should store the weight values ( nn.wih, and nn.who ) into the Device memory.
Then I call a function that should randomize the matrices' values (assignRandomWeight), which launches two kernels that holds curand functions.
Finally I try to copy the resulting matrices back to the host memory through a cudaMemcpy call, but at this point I get the error "an illegal memory access was encountered".
I tried to print the values of the Device copy matrices of wih and who, which are d_wih and d_who. They seems to be correct; I left in the code two functions usefull for debugging :
checkCudaError can be called to check the last cudaError_t string message
showValues is useful to print the values of a Device allcated arraay
I extracted a sample of my code that compile and presents the same error, plese help me out
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include<cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
struct TNeuralNetwork {
int input_neurons;
int hidden_neurons;
int output_neurons;
float *wih; //first layer of weights (from input layer to hidden layer)
float *who; //second layer of weights (from hidden layer to output layer)
float *wih_old; //for the momentum
float *who_old; //for the momentum
float *erro;
float *errh;
float l; //learning rate
float m; //momentum
float *i; //values into input neurons
float *h; //values into hidden neurons
float *o; //values into output neurons
};
__host__ void checkCudaError(char *str);
__global__ void showValues(float *d_v, int dim);
__global__ void init_rand(unsigned int seed, curandState_t state_wih);
__global__ void generateRandomValues(curandState_t state_wih, float *wih, float *who, int inp, int hid, int out);
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who);
void initNeuralNetwork(struct TNeuralNetwork *nn, int bands, int nlabel);
int main(int argc, char **argv) {
struct TNeuralNetwork nn;
//Declare Device variables
float *d_wih;
float *d_who;
unsigned int v;
cudaError_t cudaStatus;
initNeuralNetwork(&nn, 102, 10);
//Allocate Device Memory
v = (nn.input_neurons + 1)*(nn.hidden_neurons);
cudaMalloc((void**)&d_wih, (nn.input_neurons + 1)*(nn.hidden_neurons)*sizeof(float));
checkCudaError("malloc1");
//cudaMalloc((void**)&d_who, (nn.hidden_neurons + 1)*nn.output_neurons * sizeof(float));
//checkCudaError("malloc2");
for (int i = 0; i < (nn.input_neurons + 1); i++){
for (int j = 0; j < nn.hidden_neurons; j++){
nn.wih[i*nn.hidden_neurons + j] = 0;
}
}
for (int i = 0; i < (nn.hidden_neurons + 1); i++){
for (int j = 0; j < nn.output_neurons; j++){
nn.who[i*nn.output_neurons + j] = 0;
}
}
cudaMemcpy(d_wih, nn.wih, (nn.input_neurons + 1)*(nn.hidden_neurons)*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError("memcpy0");
//showValues << <v, 1 >> >(d_wih, v); TEST
//cudaMemcpy(d_who, nn.who, (nn.hidden_neurons + 1)*nn.output_neurons*sizeof(float), cudaMemcpyHostToDevice);
//checkCudaError("memcpy0.1");
assignRandomWeight(&nn, d_wih, d_who);
cudaMemcpy(nn.wih, d_wih, (nn.input_neurons + 1)*(nn.hidden_neurons)*sizeof(float), cudaMemcpyDeviceToHost);
//showValues << <v, 1 >> >(d_wih, v); TEST
checkCudaError("memcpy1");
//cudaMemcpy(nn.who, d_who, (nn.hidden_neurons + 1)*nn.output_neurons*sizeof(float), cudaMemcpyDeviceToHost);
//checkCudaError("memcpy2");
//printf("WIH:\n");
//for (int i = 0; i < (nn.input_neurons + 1); i++){
// for (int j = 0; j < (nn.hidden_neurons); j++){
// printf("%.12f\t", nn.wih[i*(nn.hidden_neurons) + j]);
// }
// printf("\n\n");
//}
//printf("WHO:\n");
//for (int i = 0; i < (nn.hidden_neurons + 1); i++){
// for (int j = 0; j < nn.output_neurons; j++){
// printf("%.12f\t", nn.wih[i*nn.output_neurons + j]);
// }
// printf("\n\n");
//}
cudaFree(d_wih);
cudaFree(d_who);
return 0;
}
__host__ void checkCudaError(char *str){
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess){
printf("Cuda Error at %s: %s \n", str, cudaGetErrorString(err));
exit(-1);
}
}
__global__ void showValues(float *d_v, int dim){
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < dim){
printf("elemento[%d] = %.4f\n", tid, d_v[tid]);
}
}
__global__ void init_rand(unsigned int seed, curandState_t state_wih){
int tid = blockIdx.x*blockDim.x + threadIdx.x;
curand_init(seed, 0, tid, &state_wih);
}
__global__ void generateRandomValues(curandState_t state_wih, float *wih, float *who, int inp, int hid, int out){
int tid = (blockIdx.x)*(blockDim.x) + threadIdx.x;
printf("%.7f", (float)curand(&state_wih + tid));
if (tid <= (inp + 1)*hid){
wih[tid] = (float)curand_uniform(&state_wih + tid);
printf("%.7f", wih[tid]);
}
if (tid <= (hid + 1)*out){
who[tid] = (float)curand_uniform(&state_wih + tid);
printf("%.7f", who[tid]);
}
}
void initNeuralNetwork(struct TNeuralNetwork *nn, int bands, int nlabel) {
nn->input_neurons = bands;
nn->output_neurons = nlabel;
//nn->hidden_neurons = (int)((bands + nlabel)/2.0f);
nn->hidden_neurons = (int)((bands + nlabel)*2.0f / 3.0f);
nn->l = 0.001;
nn->m = 0.2;
nn->wih = (float*)malloc((bands + 1)*(nn->hidden_neurons) * sizeof(float)); //+1 for the bias
nn->who = (float*)malloc((nn->hidden_neurons + 1)*nlabel * sizeof(float));//+1 for the bias
nn->wih_old = (float*)malloc((bands + 1)*(nn->hidden_neurons) * sizeof(float)); //+1 for the bias
nn->who_old = (float*)malloc((nn->hidden_neurons + 1)*nlabel * sizeof(float));//+1 for the bias
nn->i = (float*)malloc(bands * sizeof(float));
nn->h = (float*)malloc(nn->hidden_neurons * sizeof(float));
nn->o = (float*)malloc(nlabel * sizeof(float));
nn->errh = (float*)malloc(nn->hidden_neurons * sizeof(float));
nn->erro = (float*)malloc(nlabel * sizeof(float));
memset(nn->wih_old, 0, (bands + 1)*(nn->hidden_neurons) * sizeof(float));
memset(nn->who_old, 0, (nn->hidden_neurons + 1)*nlabel * sizeof(float));
}
//curand
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who) {
cudaError_t cudaStatus;
curandState_t state_wih;
srand(time(NULL));
unsigned int seed = rand();
//Alloco la matrice di curandState_t per la randomizzaione, in uscita dalla funzione non mi servirà più
cudaMalloc((void**)&state_wih, (nn->input_neurons + 1)*(nn->hidden_neurons)* sizeof(curandState_t));
dim3 gridSize(ceil((double)((nn->input_neurons + 1)*(nn->hidden_neurons)) / 32));
dim3 blockSize(32);
init_rand << < gridSize, blockSize >> >(seed, state_wih);
generateRandomValues << < gridSize, blockSize >> >(state_wih, d_wih, d_who, nn->input_neurons, nn->hidden_neurons, nn->output_neurons);
}
"Incorrect Indexing" will produce out-of-bounds memory access within the kernel. The CUDA runtime will destroy your context at the point where the error occurred within the kernel, after which no CUDA operations which rely the the context can be performed. The cudaMemcpycall fails because your context has been destroyed. There is no way to avoid this.
NVIDIA supply a utility called cuda-memcheck with the CUDA toolkit. Use that instead to diagnose what is going wrong with your kernel.
I'v found my error:
I was making a bad use of the "curandState_t" type variable in the assignRandomWight function, I had to use a pointer.
this is the correct version:
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who) {
cudaError_t cudaStatus;
curandState_t *state_wih;
srand(time(NULL));
unsigned int seed = rand();
//Alloco la matrice di curandState_t per la randomizzaione, in uscita dalla funzione non mi servirà più
cudaMalloc((void**)&state_wih, (nn->input_neurons + 1)*(nn->hidden_neurons)* sizeof(curandState_t));
dim3 gridSize(ceil((double)((nn->input_neurons + 1)*(nn->hidden_neurons)) / 32));
dim3 blockSize(32);
init_rand << < gridSize, blockSize >> >(seed, state_wih);
generateRandomValues << < gridSize, blockSize >> >(state_wih, d_wih, d_who, nn->input_neurons, nn->hidden_neurons, nn->output_neurons);
}
and the correct version for the two kernels:
__global__ void generateRandomValues( curandState_t *state_wih, float *wih, float *who, int inp, int hid, int out){
int tid = (blockIdx.x)*(blockDim.x) + threadIdx.x;
if (tid<=(inp+1)*hid ){
printf("\ncasual : %f", (float)curand_uniform(&state_wih[tid]));
wih[tid] = (float)curand_uniform(&state_wih[tid]);
}
if (tid<=(hid+1)*out){
who[tid] = (float)curand_uniform(&state_wih[tid]);
}
}
__global__ void init_rand(unsigned int seed, curandState_t *state_wih){
int tid = blockIdx.x*blockDim.x + threadIdx.x;
curand_init(seed, tid, 0, &state_wih[tid]);
}

Cuda program not working

i'm a beginner in cuda programming. I'm trying an own easy code but it's not working and I don't know what else to do.
My code:
#include <mpi.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/wait.h>
// Prototypes
__global__ void helloWorld(char*);
__device__ int getGlobalIdx_2D_2D();
// Host function
int main(int argc, char** argv)
{
unsigned int i, N, gridX, gridY, blockX, blockY;
N = 4096000;
char *str = (char *) malloc(N*sizeof(char));
for(i=0; i < N; i++) str[i]='c';
MPI_Init (&argc, &argv);
char *d_str;
size_t size = (size_t) N*sizeof(char);
cudaMalloc((void**)&d_str, size);
cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
gridX = 100;
gridY = 10;
blockX = blockY = 64;
dim3 dimGrid(gridX, gridY); // 4096 chars per block
dim3 dimBlock(blockX, blockY); // one thread per character, 2D
printf("dimGrid(%d, %d)\t", gridX, gridY);
printf("dimBlock(%d, %d)\t", blockX, blockY);
helloWorld<<< dimGrid, dimBlock >>>(d_str);
cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
MPI_Barrier (MPI_COMM_WORLD);
cudaFree(d_str);
printf("\nRes:\n");
for(i = 0; i < N; i++) printf("\t[%u] %c\n", i, str[i]);
MPI_Finalize ();
free(str);
return 0.0;
}
// Device kernel
__global__ void helloWorld(char* str)
{
// determine where in the thread grid we are
int pos = getGlobalIdx_2D_2D();
if (pos % 2 == 0) str[pos] -= 2;
else str[pos] += 8;
}
__device__ int getGlobalIdx_2D_2D()
{
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y) +
(threadIdx.y * blockDim.x) + threadIdx.x;
return threadId;
}
My desired output is: jajajajajajaja... x4096000
I've read that '%' operation is not efficient, but I don't think is the problem there.
Thanks!
You are performing absolutely no CUDA error checking, it is really beneficial to do so. Once you enable it you can find that block dimensions 64 x 64 are invalid as it results into 4096 threads within one block, which is not a valid configuration.

CUDA Kernel for copying array location from neighbour location

I have a cuda kernel that copies from i+1 th location to the ith location in a device array. The copying is not done from the locations whose index values are multiples of 32. [32]->[31] not copied, [64]->[63] not copied. This happens irrespective of the block size. How this could be resolved?
Here is the full program. No calls for syncthreads(). Still the problem exists.
#include <cstdio>
struct SodA { float *df0; size_t pitch; };
__global__ void stream_kernel (SodA dA1, SodA dA2, int M, int N);
int main(int argc, char **argv){
int i, M=32, N=32;float *f0;
SodA dA1, dA2;
dim3 blockSize = dim3(32,32);
dim3 gridSize = dim3(1,1);
f0 = (float *)malloc(M*N*sizeof(float));
cudaMallocPitch((void **)&dA1.df0, &dA1.pitch, sizeof(float)*M, N);
cudaMallocPitch((void **)&dA2.df0, &dA2.pitch, sizeof(float)*M, N);
for (i=0; i<M*N; i++) f0[i] = (float)rand()/RAND_MAX;
cudaMemcpy2D((void *)dA1.df0, dA1.pitch, (void *)f0, sizeof(float)*M, sizeof(float)*M, N, cudaMemcpyHostToDevice);
printf("\n");
for(int i=28;i<70; i++)
printf("%5d ", i);
printf("\n\n");
printf("\n");
for(int i=28;i<70; i++)
printf("%.3f ", f0[i]);
printf("\n\n");
stream_kernel<<<gridSize, blockSize>>>(dA1, dA2, M, N);
cudaMemcpy2D( (void *)f0, sizeof(float)*M, (void *)dA2.df0, dA2.pitch,sizeof(float)*M, N, cudaMemcpyDeviceToHost);
printf("\n");
for(int i=28;i<70; i++)
printf("%.3f ", f0[i]);
printf("\n\n");
free(f0);cudaFree(dA2.df0);
cudaFree(dA1.df0);
printf("\n\n");
return 0;
}
__global__ void stream_kernel (SodA dA1, SodA dA2, int M, int N)
{
int i, j, i2d;
i = blockIdx.x * blockDim.x + threadIdx.x;
j = blockIdx.y * blockDim.y + threadIdx.y;
i2d = i + j * M;
if (i2d>0) { dA2.df0[i2d-1] = dA1.df0[i2d];}
}
The output
28 29 30 31 32 33 ....
0.999 0.218 0.513 0.839 0.613 0.296 0.638....
0.218 0.513 0.839 0.198 0.296 0.638 ....
Thanks for the comments. In a 2D array stored in row major order, this kernel moves the (i,j)th position to its previous position. Since the array is pitched, as mentioned in the comments, the previous element of the first element in each row could not be found using -1 offset. This special case is handled by computing the last element in the previous array. I got the answer. Thanks.

Cuda call won't allocate more than 8 threads per block, regardless of specification

I am creating a parallel version of the Sieve of Eratosthenes in c++. The problem is my kernel call (reduce0) seems to only ever assign 8 threads per block instead of the 256 I specify. Since even the first CUDA version allows 512 threads per block, there must be some error in my code for it. Any help would be appreciated.
#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil.h>
//#include <sieve_kernel.cu>
using namespace std;
////////////////////////////////////////////////////
int psum(int arg[], double n);
int call_kernel(int primes[], int n);
int findsmallest(int arg[], int f, double n);
int sieve(int n);
__global__ void reduce0(int *g_idata, int *g_odata);
////////////////////////////////////////////////////
int main(){
int n = pow((double) 2, 8);
int total = sieve(n);
cout << "# primes" << endl << total << endl;
return 0;
}
///////////////////////////////////////////////////
__global__ void reduce0(int *g_idata, int *g_odata) {
extern __shared__ int sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for (int s = 1; s < blockDim.x; s *= 2) { // step = s x 2
if (tid % (s*2) == 0) { // only threadIDs divisible by the step participate
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////////////////////////////////
int call_kernel(int *primes, int n){
// Allocate and copy device arrays
int *g_idevice;
int *g_odevice;
int size = n * sizeof(int);
cudaMalloc(&g_idevice, size);
cudaMemcpy(g_idevice, primes, size, cudaMemcpyHostToDevice);
cudaMalloc(&g_odevice, size);
// Specify grid/block dimenstions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
// Copy device data back to primes
cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
//for (int i = 0; i < n; i++) {
// cout << i << " " << primes[i] << endl;
//}
int total = primes[0];
cudaFree(g_idevice);
cudaFree(g_odevice);
return total;
}
/////////////////////////////////////////////////////////////////////
int findsmallest(int arg[], int f, double n){
int i = f;
while(arg[i]!= 1 && i < n) {
i++;
}
return i;
}
//////////////////////////////////////////////////////////////////////
int psum(int arg[], double n){
int total = 0;
int i = 2;
while(i < n){
if(arg[i] == 1){
total = total + 1;
}
i++;
}
return total;
}
/////////////////////////////////////////////////////////////////////////
int sieve(int n){
int* primes = NULL;
int mult = 0;
int k = 2;
int i; int total;
//primes = new int[n];
primes = new int[256];
for(i = 0; i < n; i++){
primes[i] = 1;
}
primes[0] = primes[1] = 0;
while (k * k < n){
mult = k * k;
while (mult < n) {
primes[mult] = 0;
mult = mult + k;
}
k = findsmallest(primes,k+1, n);
}
total = call_kernel(primes, n);
//delete [] primes;
//primes = NULL;
return total;
}
Your kernel is using dynamically allocated shared memory, but the kernel launch does not include any allocation, so the result is the kernel will be aborting because of illegal memory operations on that shared memory buffer. You should find it works if you modify this part of call_kernel as follows:
// Specify grid/block dimenstions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
size_t shmsize = size_t(dimBlock.x * dimBlock.y * dimBlock.z) * sizeof(int);
reduce0<<<dimGrid, dimBlock, shmsize>>>(g_idevice, g_odevice);
If you had of included some basic error checking around the function call, perhaps like this:
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
if (cudaPeekAtLastError() != cudaSuccess) {
cout << "kernel launch error: " << cudaGetErrorString(cudaGetLastError()) << endl;
}
// Copy device data back to primes
cudaError_t err = cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
cout << "CUDA error: " << cudaGetErrorString(err) << endl;
}
it would have been immediately obvious that the kernel launch or execution was failing with an error.