CUDA basic operations error

I have a global function as follows:
__global__ void sort(float* D, float* new_D)
{
int i = threadIdx.x + blockIdx.x * blockDim.x ; // i>=0 && i<N
new_D[ 4*(i/4)+i%2] = D[ 4*(i/4)+2*(i%2) ];
}
And it's called like this:
sort<<<(N/threadperblock),threadperblock>>>(D,new_D);
The function operates incorrectly when I define "N" as more than 2048 in single precision,
or more than 4096 in double precision: I get wrong answers. What's going wrong?

It is impossible to say for certain why you are not getting the expected results from your code, but an obvious source of error would be uninitialised memory. Your indexing scheme only assigns values to half of new_D, so if you have not taken deliberate steps to assign values to the other half, the results will contain uninitialised values, and miscomparisons or unexpected differences between the GPU version and a host implementation could occur.
To illustrate my point, here is a complete repro case which works correctly at any input size which is a power of two:
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>
const int N = (2<<20);
__global__ void sort(float* D, float* new_D)
{
int i = threadIdx.x + blockIdx.x * blockDim.x ; // i>=0 && i<N
new_D[ 4*(i/4)+i%2] = D[ 4*(i/4)+2*(i%2) ];
}
__host__ void host_sort(const float* D, float* new_D)
{
for(int i=0; i<N; i++)
new_D[ 4*(i/4)+i%2] = D[ 4*(i/4)+2*(i%2) ];
}
int main(void)
{
const size_t dsize = sizeof(float) * size_t(N);
float *D = (float *)malloc(dsize);
float *new_D = (float *)malloc(dsize);
for(int i=0; i<N; i++) {
D[i] = (float)i;
new_D[i] = -999.0f;
}
float *D_gpu, *new_D_gpu;
assert( cudaMalloc((void**)&D_gpu, dsize) == cudaSuccess );
assert( cudaMemcpy(D_gpu, D, dsize, cudaMemcpyHostToDevice) == cudaSuccess);
assert( cudaMalloc((void**)&new_D_gpu, dsize) == cudaSuccess );
assert( cudaMemcpy(new_D_gpu, new_D, dsize, cudaMemcpyHostToDevice) == cudaSuccess);
dim3 blocksize = dim3(128,1,1);
dim3 gridsize = dim3(N/blocksize.x,1,1);
host_sort(D, new_D);
sort<<< gridsize, blocksize >>>(D_gpu,new_D_gpu);
assert( cudaPeekAtLastError() == cudaSuccess );
assert( cudaDeviceSynchronize() == cudaSuccess );
float *new_D_host = (float *)malloc(dsize);
assert( cudaMemcpy(new_D_host, new_D_gpu, dsize, cudaMemcpyDeviceToHost) == cudaSuccess);
for(int i=0; i<N; i++)
assert( new_D_host[i] == new_D[i] );
return 0;
}
You should be aware that half of the threads in your kernel are effectively doing redundant assignments and unnecessarily burning memory bandwidth as a result.
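If that layout is what you intend, a minimal sketch of the alternative (the sort_half name and the n parameter are mine, not from the original code) launches N/2 threads and performs each unique assignment exactly once:
__global__ void sort_half(const float* D, float* new_D, int n)
{
// j enumerates only the unique writes: two per group of four elements
int j = threadIdx.x + blockIdx.x * blockDim.x; // j >= 0 && j < n/2
if (j < n/2)
new_D[ 4*(j/2) + j%2 ] = D[ 4*(j/2) + 2*(j%2) ];
}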

What is the threadperblock value? Does it change between single precision and double precision?
The reason I am asking is that threadIdx.x, blockIdx.x and blockDim.x behave like unsigned short values: the maximum value they can hold is 65535 until you cast to int. If you exceed that value, including in the intermediate results of arithmetic, you can get really weird results.
Try this:
int i = blockDim.x;
i = i*blockIdx.x + threadIdx.x;
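Combining that with a bounds check gives a minimal sketch (the n parameter is an addition, not part of the original kernel):
__global__ void sort(float* D, float* new_D, int n)
{
int i = blockDim.x;               // start in a 32-bit int register
i = i * blockIdx.x + threadIdx.x; // all arithmetic now done as int
if (i < n)                        // guard against overrun when gridDim*blockDim > n
new_D[ 4*(i/4) + i%2 ] = D[ 4*(i/4) + 2*(i%2) ];
}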

Related

Performance difference due to indexing during matrix multiplication

I'm trying out the difference between a tiled and a naive matrix multiplication implementation in CUDA C++. I expect to see a performance gap between these variations because of the repeated use of shared memory. However, the speedup was only about 2x (naive ~12ms and tiled ~6ms). Here are the code snippets:
#include <iostream>
#include <assert.h>
using namespace std;
#define N 1024
#define THREADS 16
#define IDX(x, y, s) ((x)*(s) + (y))
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
void init_values(int *a, int *b, int sz) {
for(int i=0; i<sz; i++) {
a[i] = rand()%513 - 256;
b[i] = rand()%513 - 256;
}
}
__global__
void matmul(int *a, int *b, int *c, int n) {
// perform parallel matmul
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int t = 0;
for(int i=0; i<n; i++) {
t += (a[IDX(x, i, n)] * b[IDX(i, y, n)]);
}
c[IDX(x, y, n)] = t;
}
void matmul_verify(int *a, int *b, int *c, int n) {
for(int i=0; i<n; i++) {
for(int j=0; j<n; j++) {
int t = 0;
for(int k=0; k<n; k++)
t += a[IDX(i, k, n)] * b[IDX(k, j, n)];
// cout << i << " " << j << " " << c[IDX(i, j, n)] << " " << t << endl;
assert(c[IDX(i, j, n)] == t);
}
}
}
int main()
{
int *a, *b, *c;
int *da, *db, *dc;
size_t sz = N * N * sizeof(int);
a = (int*)malloc(sz);
b = (int*)malloc(sz);
c = (int*)malloc(sz);
init_values(a, b, N*N);
gpuErrchk(cudaMalloc((void**)&da, sz));
gpuErrchk(cudaMalloc((void**)&db, sz));
gpuErrchk(cudaMalloc((void**)&dc, sz));
gpuErrchk(cudaMemcpy(da, a, sz, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(db, b, sz, cudaMemcpyHostToDevice));
// init grid size
dim3 grids(N/THREADS, N/THREADS);
dim3 blocks(THREADS, THREADS);
// time it
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
matmul<<<grids, blocks>>>(da, db, dc, N);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
cout << "Took " << milliseconds << " milliseconds.\n";
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(c, dc, sz, cudaMemcpyDeviceToHost));
matmul_verify(a, b, c, N);
cudaFree(da);
cudaFree(db);
cudaFree(dc);
free(a);
free(b);
free(c);
cudaEventDestroy(start);
cudaEventDestroy(stop);
return 0;
}
and for the tiled implementation, I change the kernel as
__global__
void matmul(int *a, int *b, int *c, int n) {
// perform parallel matmul
int ty = threadIdx.y, by = blockIdx.y;
int tx = threadIdx.x, bx = blockIdx.x;
int x = bx * blockDim.x + tx;
int y = by * blockDim.y + ty;
// block IDs tell us which block to solve for
// (bx, by) --> (bx: bx + tx, by:by + ty)
__shared__ int A[SHMEM_SIZE]; // assumes #define SHMEM_SIZE (THREADS * THREADS) alongside the other defines
__shared__ int B[SHMEM_SIZE];
const int tile_size = THREADS;
// to get value of tile [tx, ty] in block [bx, by], we need blocks A[bx, *] and blocks B[*, by]
int res = 0;
for(int blk=0; blk < n; blk+=tile_size) {
// block index
A[IDX(tx, ty, tile_size)] = a[IDX(x, blk + ty, n)];
B[IDX(tx, ty, tile_size)] = b[IDX(blk + tx, y, n)];
__syncthreads();
for(int k=0; k<tile_size; k++) {
res += (A[IDX(tx, k, tile_size)] * B[IDX(k, ty, tile_size)]);
}
__syncthreads();
}
// for(int k=0; k<n; k++)
// res += a[IDX(x, k, n)] * b[IDX(k, y, n)];
c[IDX(x, y, n)] = res;
}
nothing else really changes. However, in the tiled implementation, if I simply change
int ty = threadIdx.x, by = blockIdx.x;
int tx = threadIdx.y, bx = blockIdx.y;
for the initialization of thread and block indices, I get about a ~1ms runtime (a 12x speedup). How is this happening? I read in the book "CUDA By Example" that the thread and block indices in 2 dimensions are just for programmer convenience and do not reflect any difference in performance. This seems to be false. Any clarification is really appreciated.
CUDA thread blocks are partitioned into warps of 32 threads. Ideally the neighboring lanes of a warp should always load neighboring elements from global memory. This is called coalescing and allows for maximum memory bandwidth. In hardware all the coalesced loads from a warp will be bundled into a minimal number of memory transactions.
Other factors that can deteriorate memory bandwidth are the size of the load (one can try to use the builtin vector types to get bigger loads for optimization, e.g. int2, int4, float2, etc.) and alignment.
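As a hedged illustration of the vector-type idea (assuming the pointer is 16-byte aligned and the element count is a multiple of 4):
const int4* a4 = reinterpret_cast<const int4*>(a); // reinterpret the row as int4
int4 v = a4[i];                                    // one 16-byte load instead of four 4-byte loads
int partial = v.x + v.y + v.z + v.w;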
The mapping from 3D threadIdx to warp lanes always takes the first dimension .x as the continuous dimension, i.e. a block of dimensions (32, 2, 1) will have one warp with threadIdx.y == 0 and one warp with threadIdx.y == 1 where the lanes of each warp correspond to threadIdx.x.
Therefore to allow for coalescing, you have to access memory as
A[ty * s + tx] // coalesced access
instead of
A[tx * s + ty] // strided access
to achieve optimal performance.
What is probably meant in the book you mention is that there shouldn't be a performance difference between launching a grid of (32, 2, 1) blocks and a grid of (64, 1, 1) blocks while manually computing ty = threadIdx.x / 32 and tx = threadIdx.x % 32. This division effectively happens internally anyway when the block is not flat in the first place.
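To make that concrete, here is a hedged sketch (flat_block_demo is a hypothetical name, reusing the IDX macro from above) of a flat (64, 1, 1) block recovering 2D coordinates manually; threadIdx.x % 32 remains the fast-moving lane dimension, so the accesses stay coalesced:
__global__ void flat_block_demo(const int* a, int* out, int s) {
int tx = threadIdx.x % 32; // continuous within a warp: the coalescing dimension
int ty = threadIdx.x / 32; // selects the warp within the block
out[IDX(ty, tx, s)] = a[IDX(ty, tx, s)]; // neighboring lanes touch neighboring addresses
}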

CUDA kernel code does not execute when using shared memory

I was learning to use shared memory to optimize CUDA code.
I followed most of the implementation from the Nvidia materials,
but I found that my device code never executes. Could anyone help me figure out why?
Did I miss something? Thanks.
#include <stdio.h>
#include <cuda_runtime.h>
#include <chrono>
#define BLOCKSIZE 16
typedef struct {
int height;
int width;
int stride;
float *element;
} Matrix;
void initData(float *p, int size){
for (int t=0; t<size; t++){
p[t] = (float)(rand()&0xffff)/1000.0f;
}
}
__device__ float getElement(Matrix a, int row, int col)
{
return a.element[row*a.stride+col];
}
__device__ Matrix getSubM(Matrix a, int row, int col)
{
Matrix res;
res.height = BLOCKSIZE;
res.width = BLOCKSIZE;
res.stride = a.width;
res.element = &a.element[row*BLOCKSIZE*a.stride+col*BLOCKSIZE];
return res;
}
__device__ void setElement(Matrix a, int row, int col, float val)
{
a.element[row*a.stride+col] = val;
}
__global__ void shmMM(Matrix a, Matrix b, Matrix c)
{
int blockRow = blockDim.y;
int blockCol = blockDim.x;
Matrix Csub = getSubM(c, blockRow, blockCol);
int row = threadIdx.y;
int col = threadIdx.x;
float tmp = 0;
for (int i=0; i < a.width/BLOCKSIZE; i++)
{
Matrix a_sub = getSubM(a, blockRow, i);
Matrix b_sub = getSubM(b, i, blockCol);
__shared__ float A[BLOCKSIZE][BLOCKSIZE];
__shared__ float B[BLOCKSIZE][BLOCKSIZE];
A[row][col] = getElement(a, row, col);
B[row][col] = getElement(b, row, col);
__syncthreads();
for (int e = 0; e < BLOCKSIZE; e++)
{
tmp += A[row][e]*B[e][col];
}
__syncthreads();
}
//printf("debug: %f.\n", tmp);
setElement(Csub, row, col, tmp);
}
int main()
{
Matrix a, b, c;
int size = 1<<12;
a.height = a.width = size;
b.height = b.width = size;
c.height = c.width = size;
a.stride = a.width;
b.stride = b.width;
c.stride = c.width;
float *a_h, *b_h, *c_h;
cudaMallocHost((float**)&a_h, a.height*a.width*sizeof(float));
cudaMallocHost((float**)&b_h, b.height*b.width*sizeof(float));
initData(a_h, a.height*a.width);
initData(b_h, b.height*b.width);
c_h = (float*)malloc(c.height*c.width*sizeof(float));
float *a_d, *b_d, *c_d;
cudaMalloc((float**)&a.element, a.height*a.width*sizeof(float));
cudaMalloc((float**)&b.element, b.height*b.width*sizeof(float));
cudaMalloc((float**)&c.element, c.height*c.width*sizeof(float));
cudaMemcpy(a.element, a_h, a.height*a.width*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(b.element, b_h, b.height*b.width*sizeof(float), cudaMemcpyHostToDevice);
dim3 block(BLOCKSIZE, BLOCKSIZE);
dim3 grid((b.width-1)/block.x+1, (a.height-1)/block.y+1);
//naiveMM<<<block, grid>>>(a, b, c);
shmMM<<<block, grid>>>(a, b, c);
cudaMemcpy(c_h, c.element, c.height*c.width*sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
cudaFree(a_h);
cudaFree(b_h);
free(c_h);
cudaFree(a.element);
cudaFree(b.element);
cudaFree(c.element);
return 0;
}
I couldn't figure it out since there is no reported compiling error and runtime error.
"since there is no reported compiling error and runtime error."
You won't get any reported runtime errors if you fail to use proper CUDA error checking, which I recommend any time you are having trouble with a CUDA code. It's also good practice to run your code with a sanitizer such as cuda-memcheck or compute-sanitizer, depending on your GPU.
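With the gpuErrchk macro from the previous question, or a couple of inline checks like the following sketch, the failure becomes visible immediately:
shmMM<<<block, grid>>>(a, b, c);      // the launch exactly as written in the question
cudaError_t err = cudaGetLastError(); // reports launch-configuration errors
if (err != cudaSuccess)
printf("launch failed: %s\n", cudaGetErrorString(err));
err = cudaDeviceSynchronize();        // reports errors that occur during kernel execution
if (err != cudaSuccess)
printf("kernel failed: %s\n", cudaGetErrorString(err));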
If you had done any of that, you would have gotten an invalid configuration argument error on your kernel launch, which should have focused your attention on this code:
dim3 block(BLOCKSIZE, BLOCKSIZE);
dim3 grid((b.width-1)/block.x+1, (a.height-1)/block.y+1);
//naiveMM<<<block, grid>>>(a, b, c);
shmMM<<<block, grid>>>(a, b, c);
The problem there is that you have your block and grid arguments reversed; it should be:
shmMM<<<grid, block>>>(a, b, c);
I'm not suggesting I have fully debugged your application, but that reversal is the reason why your "CUDA kernel code does not execute".
These lines of code are also incorrect:
cudaFree(a_h);
cudaFree(b_h);
but that isn't the source of the problem you are asking about. The corresponding free operation for cudaMallocHost is cudaFreeHost, as noted in the CUDA runtime API documentation.
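In sketch form, the corrected calls are:
cudaFreeHost(a_h); // memory from cudaMallocHost must be released with cudaFreeHost
cudaFreeHost(b_h);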

Computing the mean of 2000 2D-arrays with CUDA C

I have 2000 2D arrays (each 1000x1000). I need to compute the mean of each one and put the results in a vector of length 2000.
I tried doing that by calling the kernel once per 2D array, but that is naive; I want to do the computation in a single kernel launch.
What I have so far is a kernel that handles one 2D array. I want my kernel to process all 2000 2D arrays at once.
#include <stdio.h>
#include <cuda.h>
#include <time.h>
void init_mat(float *a, const int N, const int M);
void print_mat(float *a, const int N, const int M, char *d);
void print_array(float *a, const int N, char *d);
const int threadsPerBlock=256;
__global__
void kernel(float *mat, float *out, const int N, const int M){
__shared__ float cache[threadsPerBlock];
int tid=threadIdx.x+blockIdx.x*blockDim.x;
int cacheIndex = threadIdx.x;
float sum=0;
if (tid<M) {
for(int i=0; i<N; i++)
sum += mat[(i*M)+tid];
cache[cacheIndex] = sum;
out[tid] =cache[cacheIndex];
}
__syncthreads();
int i = blockDim.x/2;
while(i != 0) {
if(cacheIndex<i)
cache[cacheIndex]+= cache[cacheIndex +i];
__syncthreads();
i /= 2;
}
if (cacheIndex == 0)
out[blockIdx.x]=cache[0];
}
int main (void) {
srand( time(NULL) );
float *b, *c;
float *dev_b, *dev_c;
int N=1000;
int M=1000;
b=(float*)malloc(sizeof(float)*N*M);
c=(float*)malloc(sizeof(float)*M);
init_mat(b, N, M);
printf("<<<<<<<<<< initial data:\n");
print_mat(b, N, M, "matrix");
cudaMalloc((void**)&dev_b, sizeof(float)*N*M);
cudaMalloc((void**)&dev_c, sizeof(float)*M);
cudaMemcpy(dev_b, b, sizeof(float)*N*M, cudaMemcpyHostToDevice);
printf("\n\nRunning Kernel...\n\n");
kernel<<<M/256+1, 256>>>(dev_b, dev_c, N, M);
cudaMemcpy(c, dev_c, sizeof(float)*M, cudaMemcpyDeviceToHost);
cudaFree(dev_b);
cudaFree(dev_c);
printf(">>>>>>>>>> final data:\n");
print_array(c, M, "out-vector");
};
void init_mat(float *a, const int N, const int M) {
int i, j;
for(i=0; i<N; i++)
for(j=0; j<M; j++)
a[i*M+j] = rand() % 100 + 1;
}
void print_mat(float *a, const int N, const int M, char *d) {
int i, j;
for(i=0; i<N; i++){
printf("\n%s[%d]:", d, i);
for (j=0; j<M; j++)
printf("\t%6.4f", a[i*M+j]);
}
printf("\n");
}
void print_array(float *a, const int N, char *d) {
int i;
for(i=0; i<N; i++)
printf("\n%s[%d]: %f",d, i, a[i]);
printf("\n");
}
For a reasonably large number of arrays (e.g. 2000) and reasonably large arrays (e.g. 1000x1000), the GPU can be fairly efficient if we assign one block to perform the sum reduction (and mean calculation) for each array. With 2000 arrays, we therefore launch 2000 blocks.
In order to handle arbitrarily sized arrays with a fixed number of threads per block, we use an idea like the grid-striding loop, except that each block uses a block-striding loop to load all the data associated with its particular array: the threads of the block "stride" through the assigned array to load all of its elements.
Apart from this, the main reduction operation is similar to what you have written, and calculating the mean is trivial this way: we compute the mean just before writing the result to global memory, once the sum has been produced by the reduction.
Here is a worked example. If you compile with -DMEAN the code will output the mean of each array. If you omit that compile switch, the code will output the sum of each array. Let N be the number of arrays, and let K be the size of each array.
$ cat t1285.cu
#include <stdio.h>
const size_t N = 1000; // number of arrays
const size_t K = 1000; // size of each array
const int nTPB = 256; // number of threads per block, must be a power-of-2
typedef float mytype; // type of data to be summed
// produce the sum or mean of each array
template <typename T>
__global__ void breduce(const T * __restrict__ idata, T * __restrict__ odata, const int bsize){
__shared__ T sdata[nTPB];
T sum = 0;
//block-striding loop
size_t offset = blockIdx.x*bsize + threadIdx.x;
while (offset < (blockIdx.x+1)*bsize){
sum += idata[offset];
offset += blockDim.x;}
sdata[threadIdx.x] = sum;
__syncthreads();
//shared memory reduction sweep
for (int i = nTPB>>1; i > 0; i>>=1){
if (threadIdx.x < i) sdata[threadIdx.x] += sdata[threadIdx.x+i];
__syncthreads();}
// write output sum for this block/array
#ifndef MEAN
if (!threadIdx.x) odata[blockIdx.x] = sdata[0];
#else
if (!threadIdx.x) odata[blockIdx.x] = sdata[0]/bsize;
#endif
}
int main(){
mytype *h_idata, *h_odata, *d_idata, *d_odata;
h_idata=(mytype *)malloc(N*K*sizeof(mytype));
h_odata=(mytype *)malloc(N*sizeof(mytype));
cudaMalloc(&d_idata, N*K*sizeof(mytype));
cudaMalloc(&d_odata, N*sizeof(mytype));
for (size_t i = 0; i < N; i++)
for (size_t j = 0; j < K; j++)
h_idata[i*K+j] = 1 + (i&1); // fill alternating arrays with 1 and 2
memset(h_odata, 0, N*sizeof(mytype)); // zero out
cudaMemset(d_odata, 0, N*sizeof(mytype)); // zero out
cudaMemcpy(d_idata, h_idata, N*K*sizeof(mytype), cudaMemcpyHostToDevice);
breduce<<<N, nTPB>>>(d_idata, d_odata, K);
cudaMemcpy(h_odata, d_odata, N*sizeof(mytype), cudaMemcpyDeviceToHost);
// validate
for (size_t i = 0; i < N; i++)
#ifndef MEAN
if (h_odata[i] != (K*(1 + (i&1)))) {printf("mismatch at %d, was: %f, should be: %f\n", (int)i, (float)h_odata[i], (float)(K*(1 + (i&1)))); return 1;}
#else
if (h_odata[i] != ((1 + (i&1)))) {printf("mismatch at %d, was: %f, should be: %f\n", (int)i, (float)h_odata[i], (float)((1 + (i&1)))); return 1;}
#endif
return 0;
}
$ nvcc -arch=sm_35 -o t1285 t1285.cu -DMEAN
$ cuda-memcheck ./t1285
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvcc -arch=sm_35 -o t1285 t1285.cu
$ cuda-memcheck ./t1285
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

Cuda program not working

I'm a beginner in CUDA programming. I'm trying a simple piece of my own code, but it's not working and I don't know what else to do.
My code:
#include <mpi.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/wait.h>
// Prototypes
__global__ void helloWorld(char*);
__device__ int getGlobalIdx_2D_2D();
// Host function
int main(int argc, char** argv)
{
unsigned int i, N, gridX, gridY, blockX, blockY;
N = 4096000;
char *str = (char *) malloc(N*sizeof(char));
for(i=0; i < N; i++) str[i]='c';
MPI_Init (&argc, &argv);
char *d_str;
size_t size = (size_t) N*sizeof(char);
cudaMalloc((void**)&d_str, size);
cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
gridX = 100;
gridY = 10;
blockX = blockY = 64;
dim3 dimGrid(gridX, gridY); // 4096 chars per block
dim3 dimBlock(blockX, blockY); // one thread per character, 2D
printf("dimGrid(%d, %d)\t", gridX, gridY);
printf("dimBlock(%d, %d)\t", blockX, blockY);
helloWorld<<< dimGrid, dimBlock >>>(d_str);
cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
MPI_Barrier (MPI_COMM_WORLD);
cudaFree(d_str);
printf("\nRes:\n");
for(i = 0; i < N; i++) printf("\t[%u] %c\n", i, str[i]);
MPI_Finalize ();
free(str);
return 0;
}
// Device kernel
__global__ void helloWorld(char* str)
{
// determine where in the thread grid we are
int pos = getGlobalIdx_2D_2D();
if (pos % 2 == 0) str[pos] -= 2;
else str[pos] += 8;
}
__device__ int getGlobalIdx_2D_2D()
{
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y) +
(threadIdx.y * blockDim.x) + threadIdx.x;
return threadId;
}
My desired output is: jajajajajajaja... x4096000
I've read that the '%' operation is not efficient, but I don't think that is the problem here.
Thanks!
You are performing absolutely no CUDA error checking, and it is really beneficial to do so. Once you enable it, you will find that block dimensions of 64 x 64 are invalid: they result in 4096 threads per block, which is not a valid configuration (the limit is 1024 threads per block on all current GPUs).
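As a hedged sketch of one possible fix, keeping one thread per character: a 32 x 32 block has 1024 threads, so the grid must grow by a factor of four to cover the same N = 4096000 characters:
blockX = blockY = 32; // 32 * 32 = 1024 threads per block, a valid configuration
gridX = 400;          // 400 * 10 = 4000 blocks
gridY = 10;           // 4000 blocks * 1024 threads = 4,096,000 = N
dim3 dimGrid(gridX, gridY);
dim3 dimBlock(blockX, blockY);
helloWorld<<< dimGrid, dimBlock >>>(d_str);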

Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D

I'm new to CUDA; I appreciate your help and hope you can help me.
I need to store multiple elements of a 2D array into a vector and then work with the vector, but my code does not work well. When I debug, I find a mistake in allocating the 2D array in the device with cudaMallocPitch and in copying to that array with cudaMemcpy2D. This is my code:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cmath>
#define maxThreads 96
__global__ void extract(int mSize, float* dev_vector, float* dev_matrix, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
while(idx<N)
{
dev_vector[idx] = *(dev_matrix+(mSize*idx+N));
idx += blockDim.x * gridDim.x;
}
}
int main()
{
//CPU variables
int mSize = 5;
float* matrix;
int N = 4; // Vector size
int i,j;
float* vector;
int blocks, threads;
float* dev_matrix;
float* dev_vector;
blocks = 1+((N-1)/maxThreads);
threads = 1+((N-1)/blocks);
unsigned long int pitch;
unsigned long int memsize_vector = N*sizeof(float);
unsigned long int memsize_matrix = mSize*sizeof(float);
matrix = new float[memsize_matrix*memsize_matrix];
vector = new float[memsize_vector];
//Create 2D array
for(i=0; i<mSize; i++)
for(j=0; j<mSize; j++)
{
matrix[i+mSize*j] = ((i+1)+(j+1));
}
printf("\n");
for (i=0; i<mSize; i++){
for(j=0; j<mSize; j++){
printf("% 1.5f ", matrix[i+mSize*j]);
}
printf("\n");
}
printf("\n");
cudaMallocPitch((void **)&dev_matrix, &pitch, memsize_matrix, mSize);
cudaMalloc((void **)&dev_vector, memsize_vector);
cudaMemcpy2D(dev_matrix, pitch, matrix, memsize_matrix, memsize_matrix, mSize,
cudaMemcpyHostToDevice);
extract<<<blocks,threads>>>(mSize, dev_vector, dev_matrix, N);
cudaDeviceSynchronize();
cudaMemcpy(vector, dev_vector, memsize_vector, cudaMemcpyDeviceToHost);
printf("Vector values are:\n");
for(i=0; i<N; i++)
printf(" % 1.5f ", vector[i]);
printf("\n");
cudaFree(dev_matrix);
cudaFree(dev_vector);
}
There are lots of problems in this code, including but not limited to: using array sizes in bytes and word sizes interchangeably in several places, using incorrect types (note that size_t exists for a very good reason), potential truncation and type-casting problems, and more.
But the core problem is the addressing of pitched memory inside the kernel, to which you are never even passing the pitch value. Reading the documentation for cudaMallocPitch will give you the correct method for addressing pitched memory inside a kernel. Your kernel might then look like this:
__global__ void extract(size_t mpitch, float* dev_vector, float* dev_matrix, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
while(idx<N)
{
dev_vector[idx] = *((float *)((char *)dev_matrix + idx * mpitch) + N); // row idx, column N
idx += stride;
}
}
[disclaimer: never compiled or tested, use at own risk].
You will then have to fix all the problems in the host code to reflect whatever kernel changes you make.
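The matching host-side changes might look like this (an untested sketch reusing the variable names from the question):
size_t pitch; // use size_t rather than unsigned long int
cudaMallocPitch((void **)&dev_matrix, &pitch, mSize * sizeof(float), mSize);
cudaMemcpy2D(dev_matrix, pitch, matrix, mSize * sizeof(float),
mSize * sizeof(float), mSize, cudaMemcpyHostToDevice);
extract<<<blocks, threads>>>(pitch, dev_vector, dev_matrix, N);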
Thanks to all. Alex, I had not seen that and have fixed it, thanks.
talonmies, thank you, my code works with your suggestions. Thanks a lot; this is my final kernel:
__global__ void sumreduct(size_t pitch, float* dev_vector, float* dev_matrix, int columns, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
while(idx<N)
{
dev_vector[idx] = *((float *)((char *)dev_matrix + idx * pitch) + columns); // row idx, column 'columns'
idx += stride;
}
}
About "size_t", I was using "Unsigned int" because Nsight show me the next warning:
Type 'size_t' could not be resolved
Thanks
Did you really mean to declare a source matrix of length [memsize_matrix*memsize_matrix]?
This will allocate 400 floats, or 1600 bytes. This means your source pitch is off, and the cudaMemcpy2D call is failing.
I'm assuming you meant to say
matrix = new float[mSize*mSize];