I am trying to do simple matrix multiplication on CUDA. I know arrays can be flattened for passing it to the device. However I am using cudaMallocPitch and cudaMemcpy2d to do the multiplication. While executing the code below I get an error " illegal memory was encountered" when I try to copy the result onto the host I highly appreciate any advice on where I am going wrong. Thanks!
weights-first matrix,dim:30x784
input- second matrix,dim:784x100
results_d - result on the device(GPU)
result - result copied on the host
#include <stdio.h>
#include <math.h>
#include <cstdio>
#include <cstdlib>
#define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void MatrixMulKernel(double *input,double *weights,double *results_d,size_t in_pitch,size_t w1_pitch,size_t result_pitch)
{
int row = threadIdx.x;
int col= threadIdx.y;
double value;
double *result_matrix;
result_matrix = ((double*)((char*)results_d + row*result_pitch + col));
printf("%d",threadIdx);
for(int i =0 ; i < in_pitch ; i++)
{
double *element1 = ((double*)((char*)input + row*in_pitch) + i) ;
double *element2 = ((double*)((char*)weights + i*w1_pitch) + col);
value =+ (*element1) * (*element2);
}
*result_matrix = value;
}
int main()
{
static double arr1[30][784];
static double arr2[784][100];
static double result[30][100];
for (int i = 0 ; i < 30; i++)
{
for(int j =0;j <784 ; j ++)
arr1[i][j] = 5;
}
for (int i =0 ; i < 784; i ++)
{
for(int j=0;j < 100 ; j++)
arr2[i][j] = 3;
}
double *input;
double *weights;
double *results_d;
size_t in_pitch,w1_pitch,result_pitch;
//allocating memory in GPU for 2 inputs and result
CUDA_SAFE_CALL(cudaMallocPitch((void**)&input,&in_pitch,100*sizeof(double),784));
CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights,&w1_pitch,784*sizeof(double),30));
CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d,&result_pitch,100*sizeof(double),30));
//Copy matrix from host to device
CUDA_SAFE_CALL(cudaMemcpy2D(input,in_pitch,arr2,100*sizeof(double),100*sizeof(double),784,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(weights,w1_pitch,arr1,784*sizeof(double),784*sizeof(double),30,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(results_d,result_pitch,result,100*sizeof(double),100*sizeof(double),30,cudaMemcpyHostToDevice));
//using GPU
dim3 dimGrid(1,1,1);
dim3 dimBlock(32,32,1);
printf("before kernel fucntion");
MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights,results_d,in_pitch,w1_pitch,result_pitch);
printf("after kernel fucntion");
cudaThreadSynchronize();
//copying back to host
CUDA_SAFE_CALL(cudaMemcpy2D(result,result_pitch,results_d,100*sizeof(double),100*sizeof(double),30,cudaMemcpyDeviceToHost));
//printing and seeing whether the result matrix has been updated
for (int i =0 ; i < 100; i ++)
{
for(int j=0;j < 30 ; j++)
{
printf("%f",result);
}
printf("\n");
}
CUDA_SAFE_CALL(cudaFree(input));
CUDA_SAFE_CALL(cudaFree(weights));
CUDA_SAFE_CALL(cudaFree(results_d));
return 0;
}
There are a number of errors in this code. First of all, it's not clear that doing pitched allocations is going to give any benefit here. Second, if you're serious about wanting fast matrix multiply performance, you should use CUBLAS.
Issues:
You don't seem to understand pitched allocations. The pitch value returned is a value in bytes. You cannot sensibly use that for a loop index for matrix multiply. Also, the pitch value is the overall width of the pitch allocation. It does not correspond to the valid data area. For that, you should use the appropriate matrix dimension.
Your code will not do a matrix multiplication over the entire matrix area. You are only creating a single block of 32x32 threads, but you need enough blocks/threads to cover the entire matrix area. This requires changes to your grid dimensions, passing matrix dimensions to your kernel, as well as a "thread check" in your kernel to prevent out-of-bounds access.
This construct for pitched access is not correct:
result_matrix = ((double*)((char*)results_d + row*result_pitch + col));
it does not match the other constructions you have for the 2 input matrices, it has a misplaced close parenthesis.
You have the sense of your two input matrices reversed. You are indexing into the input matrix as if it were the weight matrix, and vice-versa. We need to swap the sense of row, column and i to make these match the actual matrix dimensions.
Your final cudaMemcpy2D operation has the pitch values reversed:
cudaMemcpy2D(result,result_pitch,results_d,100*sizeof(double),100*sizeof(double),30,cudaMemcpyDeviceToHost)
^^^^^ ^^^^^
You forgot to initialize to zero your loop sum variable:
double value;
I don't know what you intended here, it should be += not =+:
value =+ ...
The following code has these issues addressed, and seems to run without error for me:
$ cat t104.cu
#include <stdio.h>
#include <math.h>
#include <cstdio>
#include <cstdlib>
const int d1 = 30;
const int d2 = 784;
const int d3 = 100;
double arr1[d1][d2];
double arr2[d2][d3];
double result[d1][d3];
#define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void MatrixMulKernel(double *input,double *weights,double *results_d,size_t in_pitch,size_t w1_pitch,size_t result_pitch, int dim, int rrow, int rcol)
{
int col = threadIdx.x + blockDim.x*blockIdx.x;
int row= threadIdx.y + blockDim.y*blockIdx.y;
if ((row >= rrow) || (col >= rcol)) return;
double value = 0;
double *result_matrix;
result_matrix = ((double*)((char*)results_d + row*result_pitch) + col);
for(int i =0 ; i < dim ; i++)
{
double *element1 = ((double*)((char*)input + i*in_pitch) + col) ;
double *element2 = ((double*)((char*)weights + row*w1_pitch) + i);
value += (*element1) * (*element2);
}
*result_matrix = value;
}
int main()
{
for (int i = 0 ; i < d1; i++)
{
for(int j =0;j <d2 ; j ++)
arr1[i][j] = 5;
}
for (int i =0 ; i < d2; i ++)
{
for(int j=0;j < d3 ; j++)
arr2[i][j] = 3;
}
double *input;
double *weights;
double *results_d;
size_t in_pitch,w1_pitch,result_pitch;
//allocating memory in GPU for 2 inputs and result
CUDA_SAFE_CALL(cudaMallocPitch((void**)&input,&in_pitch,d3*sizeof(double),d2));
CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights,&w1_pitch,d2*sizeof(double),d1));
CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d,&result_pitch,d3*sizeof(double),d1));
//Copy matrix from host to device
CUDA_SAFE_CALL(cudaMemcpy2D(input,in_pitch,arr2,d3*sizeof(double),d3*sizeof(double),d2,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(weights,w1_pitch,arr1,d2*sizeof(double),d2*sizeof(double),d1,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(results_d,result_pitch,result,d3*sizeof(double),d3*sizeof(double),d1,cudaMemcpyHostToDevice));
//using GPU
dim3 dimBlock(32,32,1);
dim3 dimGrid(((d3+dimBlock.x-1)/dimBlock.x),((d1+dimBlock.y-1)/dimBlock.y),1);
MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights,results_d,in_pitch,w1_pitch,result_pitch, d2, d1, d3);
//copying back to host
CUDA_SAFE_CALL(cudaMemcpy2D(result,d3*sizeof(double),results_d,result_pitch,d3*sizeof(double),d1,cudaMemcpyDeviceToHost));
//printing and seeing whether the result matrix has been updated
for (int i =0 ; i < d3; i ++)
{
for(int j=0;j < d1 ; j++)
{
printf("%f", result[j][i]);
}
printf("\n");
}
CUDA_SAFE_CALL(cudaFree(input));
CUDA_SAFE_CALL(cudaFree(weights));
CUDA_SAFE_CALL(cudaFree(results_d));
return 0;
}
$ nvcc -arch=sm_61 -o t104 t104.cu
$
Related
I'm trying to make a sum using the CUB reduction method.
The big problem is:
I'm not sure how to return the values of each block to the Host when using 2-dimensional grids.
#include <iostream>
#include <math.h>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <iomanip>
#define nat 1024
#define BLOCK_SIZE 32
#define GRID_SIZE 32
struct frame
{
int natm;
char title[100];
float conf[nat][3];
};
using namespace std;
using namespace cub;
__global__
void add(frame* s, float L, float rc, float* blocksum)
{
int i = blockDim.x*blockIdx.x + threadIdx.x;
int j = blockDim.y*blockIdx.y + threadIdx.y;
float E=0.0, rij, dx, dy, dz;
// Your calculations first so that each thread holds its result
dx = fabs(s->conf[j][0] - s->conf[i][0]);
dy = fabs(s->conf[j][1] - s->conf[i][1]);
dz = fabs(s->conf[j][2] - s->conf[i][2]);
dx = dx - round(dx/L)*L;
dy = dy - round(dy/L)*L;
dz = dz - round(dz/L)*L;
rij = sqrt(dx*dx + dy*dy + dz*dz);
if ((rij <= rc) && (rij > 0.0))
{E = (4*((1/pow(rij,12))-(1/pow(rij,6))));}
// E = 1.0;
__syncthreads();
// Block wise reduction so that one thread in each block holds sum of thread results
typedef cub::BlockReduce<float, BLOCK_SIZE, BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
float aggregate = BlockReduce(temp_storage).Sum(E);
if (threadIdx.x == 0 && threadIdx.y == 0)
blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
}
int main(void)
{
frame * state = (frame*)malloc(sizeof(frame));
float *blocksum = (float*)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
state->natm = nat; //inicializando o numero de atomos;
char name[] = "estado1";
strcpy(state->title,name);
for (int i = 0; i < nat; i++) {
state->conf[i][0] = i;
state->conf[i][1] = i;
state->conf[i][2] = i;
}
frame * d_state;
float *d_blocksum;
cudaMalloc((void**)&d_state, sizeof(frame));
cudaMalloc((void**)&d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)));
cudaMemcpy(d_state, state, sizeof(frame),cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 gridBlock(GRID_SIZE,GRID_SIZE);
add<<<gridBlock,dimBlock>>>(d_state, 3000, 15, d_blocksum);
cudaError_t status = cudaMemcpy(blocksum, d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);
float Etotal = 0.0;
for (int k = 0; k < GRID_SIZE*GRID_SIZE; k++){
Etotal += blocksum[k];
}
cout << endl << "energy: " << Etotal << endl;
if (cudaSuccess != status)
{
cout << cudaGetErrorString(status) << endl;
}
// Free memory
cudaFree(d_state);
cudaFree(d_blocksum);
return cudaThreadExit();
}
What is happening is that if the value of GRID_SIZE is the same asBLOCK_SIZE, as written above. The calculation is correct. But if I change the value of GRID_SIZE, the result goes wrong. Which leads me to think that the error is in this code:
blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
The idea here is to return a 1D array, which contains the sum of each block.
I do not intend to change the BLOCK_SIZE value, but the value of GRID_SIZE depends on the system I'm looking at, I intend to use values greater than 32 (always multiples of that).
I looked for some example that use 2D grid with CUB, but did not find.
I really new in CUDA program, maybe I'm making a mistake.
edit: I put the complete code.
For comparison, when I calculate these exact values for a serial program, it gives me energy: -297,121
Probably the main issue is that your output indexing is not correct. Here's a reduced version of your code demonstrating correct results for arbitrary GRID_SIZE:
$ cat t1360.cu
#include <stdio.h>
#include <cub/cub.cuh>
#define BLOCK_SIZE 32
#define GRID_SIZE 25
__global__
void add(float* blocksum)
{
float E = 1.0;
// Block wise reduction so that one thread in each block holds sum of thread results
typedef cub::BlockReduce<float, BLOCK_SIZE, cub::BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
float aggregate = BlockReduce(temp_storage).Sum(E);
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0)
blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
}
int main(){
float *d_result, *h_result;
h_result = (float *)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
cudaMalloc(&d_result, GRID_SIZE*GRID_SIZE*sizeof(float));
dim3 grid = dim3(GRID_SIZE,GRID_SIZE);
dim3 block = dim3(BLOCK_SIZE, BLOCK_SIZE);
add<<<grid, block>>>(d_result);
cudaMemcpy(h_result, d_result, GRID_SIZE*GRID_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {printf("cuda error: %s\n", cudaGetErrorString(err)); return -1;}
float result = 0;
for (int i = 0; i < GRID_SIZE*GRID_SIZE; i++) result += h_result[i];
if (result != (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE)) printf("mismatch, should be: %f, was: %f\n", (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE), result);
else printf("Success\n");
return 0;
}
$ nvcc -o t1360 t1360.cu
$ ./t1360
Success
$
The important change I made to your kernel code was in the output indexing:
blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
We want a simulated 2D index into an array that has width and height of GRID_SIZE consisting of one float quantity per point. Therefore the width of this array is given by gridDim.x (not blockDim). The gridDim variable gives the dimensions of the grid in terms of blocks - and this lines up exactly with how our results array is set up.
Your posted code will fail if GRID_SIZE and BLOCK_SIZE are different (for example, if GRID_SIZE were smaller than BLOCK_SIZE, cuda-memcheck will show illegal accesses, and if GRID_SIZE is larger than BLOCK_SIZE then this indexing error will result in blocks overwriting each other's values in the output array) because of this mixup between blockDim and gridDim.
Also note that float operations typically only have around 5 decimal digits of precision. So small differences in the 5th or 6th decimal place may be attributable to order of operations differences when doing floating-point arithmetic. You can prove this to yourself by switching to double arithmetic.
I have 2000 2D-arrays (each array is 1000x1000). I need to compute the mean of each one and put the result in one 2000 vector.
I tried to do that by calling the kernel for each 2D-array, but it is naive, and I want to do the computation once.
What I have been doing is a kernel for one 2D-array. I want to make my kernel do this for 2000 2D-arrays, but in one kernel.
#include <stdio.h>
#include <cuda.h>
#include <time.h>
void init_mat(float *a, const int N, const int M);
void print_mat(float *a, const int N, const int M, char *d);
void print_array(float *a, const int N, char *d);
const int threadsPerBlock=256;
__global__
void kernel(float *mat, float *out, const int N, const int M){
__shared__ float cache[threadsPerBlock];
int tid=threadIdx.x+blockIdx.x*blockDim.x;
int cacheIndex = threadIdx.x;
float sum=0;
if (tid<M) {
for(int i=0; i<N; i++)
sum += mat[(i*M)+tid];
cache[cacheIndex] = sum;
out[tid] =cache[cacheIndex];
}
__syncthreads();
int i = blockDim.x/2;
while(i != 0) {
if(cacheIndex<i)
cache[cacheIndex]+= cache[cacheIndex +i];
__syncthreads();
I /= 2;
}
if (cacheIndex == 0)
out[blockIdx.x]=cache[0];
}
int main (void) {
srand( time(NULL) );
float *a, *b, *c;
float *dev_a, *dev_b, *dev_c;
int N=1000;
int M=1000;
b=(float*)malloc(sizeof(float)*N*M);
c=(float*)malloc(sizeof(float)*M);
init_mat(b, N, M);
printf("<<<<<<<<<< initial data:\n");
print_mat(b, N, M, "matrix");
cudaMalloc((void**)&dev_b, sizeof(float)*N*M);
cudaMalloc((void**)&dev_c, sizeof(float)*M);
cudaMemcpy(dev_b, b, sizeof(float)*N*M, cudaMemcpyHostToDevice);
printf("\n\nRunning Kernel...\n\n");
kernel<<<M/256+1, 256>>>(dev_b, dev_c, N, M);
cudaMemcpy(c, dev_c, sizeof(float)*M, cudaMemcpyDeviceToHost);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
printf(">>>>>>>>>> final data:\n");
print_array(c, M, "out-vector");
};
void init_mat(float *a, const int N, const int M) {
int i, j;
for(i=0; i<N; i++)
for(j=0; j<M; j++)
a[i*M+j] = rand() % 100 + 1;
}
void print_mat(float *a, const int N, const int M, char *d) {
int i, j;
for(i=0; i<N; i++){
printf("\n%s[%d]:", d, i);
for (j=0; j<M; j++)
printf("\t%6.4f", a[i*M+j]);
}
printf("\n");
}
void print_array(float *a, const int N, char *d) {
int i;
for(i=0; i<N; i++)
printf("\n%s[%d]: %f",d, i, a[i]);
printf("\n");
}
For a reasonably large number of arrays (e.g. 2000) and reasonably large sized arrays (e.g. 2000), the GPU can be fairly efficient if we assign a block to perform the sum reduction (and mean calculation) for each array. This means if you have 2000 arrays we will launch 2000 blocks.
In order to handle arbitrary sized arrays with a fixed number of threads per block, we will use an idea like the grid-striding loop but instead we will cause each block to use a block-striding loop to load all the data associated with a particular array. This means the threads of each block will "stride" through the assigned array, to load all the elements of that array.
Apart from this, the main reduction operation is similar to what you have written, and calculation of the mean is trivial this way - we can calculate the mean before writing the result to global memory, once we have the sum calculated via reduction.
Here is a worked example. If you compile with -DMEAN the code will output the mean of each array. If you omit that compile switch, the code will output the sum of each array. Let N be the number of arrays, and let K be the size of each array.
$ cat t1285.cu
#include <stdio.h>
const size_t N = 1000; // number of arrays
const size_t K = 1000; // size of each array
const int nTPB = 256; // number of threads per block, must be a power-of-2
typedef float mytype; // type of data to be summed
// produce the sum or mean of each array
template <typename T>
__global__ void breduce(const T * __restrict__ idata, T * __restrict__ odata, const int bsize){
__shared__ T sdata[nTPB];
T sum = 0;
//block-striding loop
size_t offset = blockIdx.x*bsize + threadIdx.x;
while (offset < (blockIdx.x+1)*bsize){
sum += idata[offset];
offset += blockDim.x;}
sdata[threadIdx.x] = sum;
__syncthreads();
//shared memory reduction sweep
for (int i = nTPB>>1; i > 0; i>>=1){
if (threadIdx.x < i) sdata[threadIdx.x] += sdata[threadIdx.x+i];
__syncthreads();}
// write output sum for this block/array
#ifndef MEAN
if (!threadIdx.x) odata[blockIdx.x] = sdata[0];
#else
if (!threadIdx.x) odata[blockIdx.x] = sdata[0]/bsize;
#endif
}
int main(){
mytype *h_idata, *h_odata, *d_idata, *d_odata;
h_idata=(mytype *)malloc(N*K*sizeof(mytype));
h_odata=(mytype *)malloc(N*sizeof(mytype));
cudaMalloc(&d_idata, N*K*sizeof(mytype));
cudaMalloc(&d_odata, N*sizeof(mytype));
for (size_t i = 0; i < N; i++)
for (size_t j = 0; j < K; j++)
h_idata[i*K+j] = 1 + (i&1); // fill alternating arrays with 1 and 2
memset(h_odata, 0, N*sizeof(mytype)); // zero out
cudaMemset(d_odata, 0, N*sizeof(mytype)); // zero out
cudaMemcpy(d_idata, h_idata, N*K*sizeof(mytype), cudaMemcpyHostToDevice);
breduce<<<N, nTPB>>>(d_idata, d_odata, K);
cudaMemcpy(h_odata, d_odata, N*sizeof(mytype), cudaMemcpyDeviceToHost);
// validate
for (size_t i = 0; i < N; i++)
#ifndef MEAN
if (h_odata[i] != (K*(1 + (i&1)))) {printf("mismatch at %d, was: %f, should be: %f\n", i, (float)h_odata[i], (float)(K*(1 + (i&1)))); return 1;}
#else
if (h_odata[i] != ((1 + (i&1)))) {printf("mismatch at %d, was: %f, should be: %f\n", i, (float)h_odata[i], (float)((1 + (i&1)))); return 1;}
#endif
return 0;
}
$ nvcc -arch=sm_35 -o t1285 t1285.cu -DMEAN
$ cuda-memcheck ./t1285
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvcc -arch=sm_35 -o t1285 t1285.cu
$ cuda-memcheck ./t1285
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
I'm struggling in the kernel code. I have updated this to include support files, but those were provided and should be correct.
This is one of my first GPU programs and I've spent several hours trying new things and I can't seem to get this right. It is compiling and running, but the results are incorrect.
I am basically having trouble understanding what exactly I need to be doing differently because this kernel is giving incorrect results. I'm trying to load a tile of the input image to shared memory (Ns[][], which I think I've done correctly) and apply the filter on the input image tile (which I am struggling with).
I would greatly appreciate it if someone who is more experienced could assist me in figuring out exactly where I've gone wrong and give me an idea how to resolve the issue. I appreciate your time and apologies if I've asked this question incorrectly.
main.cu:
#include <stdio.h>
#include "support.h"
#include "kernel.cu"
#include <time.h>
int main(int argc, char* argv[]){
Timer timer;
time_t t;
// Initialize host variables ----------------------------------------------
printf("\nSetting up the problem..."); fflush(stdout);
startTime(&timer);
Matrix M_h, N_h, P_h; // M: filter, N: input image, P: output image
Matrix N_d, P_d;
unsigned imageHeight, imageWidth;
cudaError_t cuda_ret;
dim3 dim_grid, dim_block;
/* Read image dimensions */
if (argc == 1) {
imageHeight = 600;
imageWidth = 1000;
} else if (argc == 2) {
imageHeight = atoi(argv[1]);
imageWidth = atoi(argv[1]);
} else if (argc == 3) {
imageHeight = atoi(argv[1]);
imageWidth = atoi(argv[2]);
} else {
printf("\n Invalid input parameters!"
"\n Usage: ./convolution # Image is 600 x 1000"
"\n Usage: ./convolution <m> # Image is m x m"
"\n Usage: ./convolution <m> <n> # Image is m x n"
"\n");
exit(0);
}
/* Allocate host memory */
M_h = allocateMatrix(FILTER_SIZE, FILTER_SIZE);
N_h = allocateMatrix(imageHeight, imageWidth);
P_h = allocateMatrix(imageHeight, imageWidth);
/* Initialize filter and images */
initMatrix(M_h);
initMatrix(N_h);
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
printf(" Image: %u x %u\n", imageHeight, imageWidth);
printf(" Mask: %u x %u\n", FILTER_SIZE, FILTER_SIZE);
// Allocate device variables ----------------------------------------------
printf("Allocating device variables..."); fflush(stdout);
startTime(&timer);
N_d = allocateDeviceMatrix(imageHeight, imageWidth);
P_d = allocateDeviceMatrix(imageHeight, imageWidth);
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy host variables to device ------------------------------------------
printf("Copying data from host to device..."); fflush(stdout);
startTime(&timer);
/* Copy image to device global memory */
copyToDeviceMatrix(N_d, N_h);
cudaMemcpyToSymbol(M_h, M_c,FILTER_SIZE*sizeof(float));
dim_grid = dim3(((N_h.width / BLOCK_SIZE) + 1), ((N_h.height / BLOCK_SIZE) + 1));
dim_block = dim3(BLOCK_SIZE, BLOCK_SIZE);
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Launch kernel ----------------------------------------------------------
printf("Launching kernel..."); fflush(stdout);
startTime(&timer);
convolution<<<dim_grid, dim_block>>>(N_d, P_d);
cuda_ret = cudaDeviceSynchronize();
if(cuda_ret != cudaSuccess) FATAL("Unable to launch/execute kernel");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy device variables from host ----------------------------------------
printf("Copying data from device to host..."); fflush(stdout);
startTime(&timer);
copyFromDeviceMatrix(P_h, P_d);
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Verify correctness -----------------------------------------------------
printf("Verifying results..."); fflush(stdout);
verify(M_h, N_h, P_h);
// Free memory ------------------------------------------------------------
freeMatrix(M_h);
freeMatrix(N_h);
freeMatrix(P_h);
freeDeviceMatrix(N_d);
freeDeviceMatrix(P_d);
return 0;
}
kernel.cu:
__constant__ float M_c[FILTER_SIZE][FILTER_SIZE];
__global__ void convolution(Matrix N, Matrix P){
__shared__ float Ns[TILE_SIZE + 5 - 1][TILE_SIZE + 5 -1];
int i, j;
float output = 0.0f;
int tx = threadIdx.x;
int ty = threadIdx.y;
int row_o = blockIdx.y * TILE_SIZE + ty;
int col_o = blockIdx.x * TILE_SIZE + tx;
int row_i = row_o - 2;
int col_i = col_o - 2;
if((row_i >= 0) && (row_i < N.height) && (col_i >= 0) && (col_i < N.width)){
Ns[ty][tx] = N.elements[row_i * N.width + col_i];
}
else{
Ns[ty][tx] = 0.0f;
}
__syncthreads();
if(ty < TILE_SIZE && tx < TILE_SIZE){
for(i = 0; i < 5; i++){
for(j = 0; j < 5; j++){
output += M_c[i][j] * Ns[i + ty][j + tx];
}
}
}
if(row_o < P.height && col_o < P.width){
P.elements[row_o * P.width + col_o] = output;
}
}
support.h:
#ifndef __FILEH__
#define __FILEH__
#include <sys/time.h>
typedef struct {
struct timeval startTime;
struct timeval endTime;
} Timer;
// Matrix Structure declaration
typedef struct {
unsigned int width;
unsigned int height;
unsigned int pitch;
float* elements;
} Matrix;
#define FILTER_SIZE 5
#define TILE_SIZE 12
#define BLOCK_SIZE (TILE_SIZE + FILTER_SIZE - 1)
Matrix allocateMatrix(unsigned height, unsigned width);
void initMatrix(Matrix mat);
Matrix allocateDeviceMatrix(unsigned height, unsigned width);
void copyToDeviceMatrix(Matrix dst, Matrix src);
void copyFromDeviceMatrix(Matrix dst, Matrix src);
void verify(Matrix M, Matrix N, Matrix P);
void freeMatrix(Matrix mat);
void freeDeviceMatrix(Matrix mat);
void startTime(Timer* timer);
void stopTime(Timer* timer);
float elapsedTime(Timer timer);
#define FATAL(msg, ...) \
do {\
fprintf(stderr, "[%s:%d] "msg"\n", __FILE__, __LINE__, ##__VA_ARGS__);\
exit(-1);\
} while(0)
#if __BYTE_ORDER != __LITTLE_ENDIAN
# error "File I/O is not implemented for this system: wrong endianness."
#endif
#endif
support.cu:
#include <stdlib.h>
#include <stdio.h>
#include "support.h"
Matrix allocateMatrix(unsigned height, unsigned width)
{
Matrix mat;
mat.height = height;
mat.width = mat.pitch = width;
mat.elements = (float*)malloc(height*width*sizeof(float));
if(mat.elements == NULL) FATAL("Unable to allocate host");
return mat;
}
void initMatrix(Matrix mat)
{
for (unsigned int i=0; i < mat.height*mat.width; i++) {
mat.elements[i] = (rand()%100)/100.00;
}
}
Matrix allocateDeviceMatrix(unsigned height, unsigned width)
{
Matrix mat;
cudaError_t cuda_ret;
mat.height = height;
mat.width = mat.pitch = width;
cuda_ret = cudaMalloc((void**)&(mat.elements), height*width*sizeof(float));
if(cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
return mat;
}
void copyToDeviceMatrix(Matrix dst, Matrix src)
{
cudaError_t cuda_ret;
cuda_ret = cudaMemcpy(dst.elements, src.elements, src.height*src.width*sizeof(float), cudaMemcpyHostToDevice);
if(cuda_ret != cudaSuccess) FATAL("Unable to copy to device");
}
void copyFromDeviceMatrix(Matrix dst, Matrix src)
{
cudaError_t cuda_ret;
cuda_ret = cudaMemcpy(dst.elements, src.elements, src.height*src.width*sizeof(float), cudaMemcpyDeviceToHost);
if(cuda_ret != cudaSuccess) FATAL("Unable to copy from device");
}
void verify(Matrix M, Matrix N, Matrix P) {
const float relativeTolerance = 1e-6;
for(int row = 0; row < N.height; ++row) {
for(int col = 0; col < N.width; ++col) {
float sum = 0.0f;
for(int i = 0; i < M.height; ++i) {
for(int j = 0; j < M.width; ++j) {
int iN = row - M.height/2 + i;
int jN = col - M.width/2 + j;
if(iN >= 0 && iN < N.height && jN >= 0 && jN < N.width) {
sum += M.elements[i*M.width + j]*N.elements[iN*N.width + jN];
}
}
}
float relativeError = (sum - P.elements[row*P.width + col])/sum;
if (relativeError > relativeTolerance
|| relativeError < -relativeTolerance) {
printf("TEST FAILED\n\n");
exit(0);
}
}
}
printf("TEST PASSED\n\n");
}
void freeMatrix(Matrix mat)
{
free(mat.elements);
mat.elements = NULL;
}
void freeDeviceMatrix(Matrix mat)
{
cudaFree(mat.elements);
mat.elements = NULL;
}
void startTime(Timer* timer) {
gettimeofday(&(timer->startTime), NULL);
}
void stopTime(Timer* timer) {
gettimeofday(&(timer->endTime), NULL);
}
float elapsedTime(Timer timer) {
return ((float) ((timer.endTime.tv_sec - timer.startTime.tv_sec) \
+ (timer.endTime.tv_usec - timer.startTime.tv_usec)/1.0e6));
}
One set of problems is here:
cudaMemcpyToSymbol(M_h, M_c,FILTER_SIZE*sizeof(float));
If you ran your code with cuda-memcheck it would point you right at this line as being a problem.
The first parameter should be the destination symbol, i.e. M_c, and the second parameter should be the host source pointer, i.e. M_h.
Furthermore, shouldn't it be FILTER_SIZE*FILTER_SIZE ? Isn't the size of data you want to transfer equal to the dimension squared?
Finally, M_h is not a valid source pointer. You should use M_h.elements.
So something like this:
cudaMemcpyToSymbol(M_c, M_h.elements,FILTER_SIZE*FILTER_SIZE*sizeof(float));
I don't believe this fixes all the issues in your code. To continue the debug, I would print out one element in the GPU result that does not match your verify routine, and work through the arithmetic for that one element. Use printf in device code if that helps.
In the future, please run your code with cuda-memcheck before asking for help here. Even if you don't understand the output, it will be useful for those trying to help you.
I've written a piece of code to call the kernel in the book GPU Gems 3, Chapter 39: Parallel Prefix Sum (Scan) with CUDA.
However the results that I get are a bunch of negative numbers instead of prefix scan.
Is my kernel call wrong or is there something wrong with the code from the GPU Gems 3 book?
Here is my code:
#include <stdio.h>
#include <sys/time.h>
#include <cuda.h>
__global__ void kernel(int *g_odata, int *g_idata, int n, int dim)
{
extern __shared__ int temp[];// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += g_idata[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
void Initialize(int *h_in,int num_items)
{
int j;
for(j=0;j<num_items;j++)
h_in[j]=j;
printf(" input: ");
printf("\n\n");
}
int main(int argc, char** argv)
{
int num_items = 512;
int* h_in = new int[num_items];
// Initialize problem
Initialize(h_in, num_items);
int *d_in = NULL;
cudaMalloc((void**)&d_in, sizeof(int) * num_items);
if(cudaSuccess != cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)) fprintf(stderr,"could not copy to gpu");
// Allocate device output array
int *d_out = NULL;
cudaMalloc((void**)&d_out, sizeof(int) * (num_items+1));
kernel<<<1,256,num_items*sizeof(int)>>>(d_out, d_in,num_items, 2);
int* h_out= new int[num_items+1];
if(cudaSuccess != cudaMemcpy(h_out,d_out,sizeof(int)*(num_items+1),cudaMemcpyDeviceToHost))fprintf(stderr,"could not copy back");
int i;
printf(" \n");
for(i=0;i<num_items;i++)
printf(" ,%d ",h_out[i]);
// Cleanup
if (h_in) delete[] h_in;
if (h_out) delete[] h_out;
if (d_in) cudaFree(d_in);
if (d_out) cudaFree(d_out);
printf("\n\n");
return 0;
}
It seems that you've made at least 1 error in transcribing the code from the GPU Gems 3 chapter into your kernel. This line is incorrect:
temp[bi] += g_idata[ai];
it should be:
temp[bi] += temp[ai];
When I make that one change to the code you have now posted, it seems to print out the correct (exclusive-scan) prefix sum for me. There's a few other things I would mention:
Even without that change, I get some results that are close to correct. So if you're getting widely different stuff (e.g. negative numbers) you may have a problem with your machine setup or CUDA install. I would suggest using more rigorous cuda error checking than what you have now (although a machine setup problem should have been indicated in one of your checks.)
The routine as crafted will have some limitations. It can only be used in a single threadblock, it will have bank conflicts on shared memory access, and it will be limited in data set size to what can be handled by a single threadblock (this routine produces two output elements per thread, so the data set size is expected to be equal to twice the number of threads). As has been already covered, the dynamic shared memory allocation needs to be as large as the data set size (ie. twice the thread size, in number of elements).
This may be useful for learning, but if you want a robust, fast prefix scan, you are advised to use a routine from thrust or cub instead of your own code, even if derived from this (old) article.
The following code is similar to yours, but it has the above issues fixed, and I have templated the kernel for use with various datatypes:
#include <stdio.h>
#define DSIZE 512
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef int mytype;
template <typename T>
__global__ void prescan(T *g_odata, T *g_idata, int n)
{
extern __shared__ T temp[]; // allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
T t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
int main(){
mytype *h_i, *d_i, *h_o, *d_o;
int dszp = (DSIZE)*sizeof(mytype);
h_i = (mytype *)malloc(dszp);
h_o = (mytype *)malloc(dszp);
if ((h_i == NULL) || (h_o == NULL)) {printf("malloc fail\n"); return 1;}
cudaMalloc(&d_i, dszp);
cudaMalloc(&d_o, dszp);
cudaCheckErrors("cudaMalloc fail");
for (int i = 0 ; i < DSIZE; i++){
h_i[i] = i;
h_o[i] = 0;}
cudaMemset(d_o, 0, dszp);
cudaCheckErrors("cudaMemset fail");
cudaMemcpy(d_i, h_i, dszp, cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy 1 fail");
prescan<<<1,DSIZE/2, dszp>>>(d_o, d_i, DSIZE);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
cudaMemcpy(h_o, d_o, dszp, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2 fail");
mytype psum = 0;
for (int i =1; i < DSIZE; i++){
psum += h_i[i-1];
if (psum != h_o[i]) {printf("mismatch at %d, was: %d, should be: %d\n", i, h_o[i], psum); return 1;}
}
return 0;
}
I am doing something in CUDA (FFT), but I have no idea why it is generating exceptions when calling the kernel function.
All includes and definitions:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#define CPU_ARRAY_SIZE 1024 // 1024, 2048, 4096 8192
#define GPU_ARRAY_SIZE 512 //
#define THREAD_SIZE 16 // fixed
#define BLOCK_SIZE (GPU_ARRAY_SIZE/THREAD_SIZE) // 32
#define PI 3.14
As I am running it in a NVIDIA GTX480, I thought it could be the shared memory space, although it doesn't seem to be (as there are "some many" shared variables). So, I aws changing the GPU_ARRAY_SIZE to see how it works, and it was giving me different results when I define it as 32, 64, 256, 512 (in the 512 case, it returns ALL zeros, which I guess CUDA couldn't make anything - in other cases, it returns weird, as I don't know the reason why it jumps 16 cells without any calculation). In most cases, in the Output window of my Microsoft Visual Studio, it returns billions of exceptions of the style "First-chance exception at 0x75b9b9bc in .exe: Microsoft C++ exception: cudaError_enum at memory location ". Before you ask me to debug, I cannot debug it, as the VS doesn't do that for files that are not recognized by VS (like .cpp - at least this theory works in my case).
Do you guys have any idea for the questions:
1. why is it generating exceptions?
2. why is it calculating, what it should do for every cell in every block, just within few cells
How could I solve this problem... any idea?
Kernel function:
__global__ void twiddle_factor(double *d_isub_matrix, double *d_osub_matrix)
{
__shared__ double block[THREAD_SIZE][THREAD_SIZE];
__shared__ double spectrum[THREAD_SIZE][THREAD_SIZE];
__shared__ double sum_cos[THREAD_SIZE][THREAD_SIZE]; // declaring the shared sum_cos.. similarly for sum_sin
__shared__ double sum_sin[THREAD_SIZE][THREAD_SIZE];
__shared__ double local_cos[THREAD_SIZE][THREAD_SIZE]; // declaring the shared sum_cos.. similarly for sum_sin
__shared__ double local_sin[THREAD_SIZE][THREAD_SIZE];
unsigned int xIndex = threadIdx.x + blockIdx.x* blockDim.x;
unsigned int yIndex = threadIdx.y + blockIdx.y* blockDim.y;
int u;
int x=0,y=0;
int tx = threadIdx.x;
int ty = threadIdx.y;
double sum_sines=0.0,sum_cosines=0.0;
double angle=(2*PI)/GPU_ARRAY_SIZE;
block[tx][ty] = d_isub_matrix[yIndex*GPU_ARRAY_SIZE+xIndex];
__syncthreads();
//for every column!
for(u=0; u<THREAD_SIZE; u++)
{
/* All threads calculate its own sin and cos value. */
local_sin[tx][ty] = block[tx][ty] * sin((angle*ty)*u);
local_cos[tx][ty] = block[tx][ty] * cos((angle*ty)*u);
/* Only one row is activate. The thread in row adds all element of its column. */
if (ty == u)
{
sum_sines = 0.0;
sum_cosines = 0.0;
/* Access each column to add all elements of the column.*/
for (y=0; y<THREAD_SIZE; y++)
{
sum_sines += local_sin[tx][y];
sum_cosines += local_cos[tx][y];
}
//if (sum_sines < 0)
//sum_sin[u][tx] = ((-1)*sum_sines)/GPU_ARRAY_SIZE;
//else
sum_sin[u][tx] = sum_sines/GPU_ARRAY_SIZE;
//if (sum_cosines < 0)
//sum_cos[u][tx] = ((-1)*sum_cosines)/GPU_ARRAY_SIZE;
//else
sum_cos[u][tx] = sum_cosines/GPU_ARRAY_SIZE;
}
__syncthreads();
}
spectrum[tx][ty] = sqrt((double)pow(sum_sin[tx][ty],2)
+(double)pow(sum_cos[tx][ty],2));
__syncthreads();
block[tx][ty] = spectrum[tx][ty];
__syncthreads();
//for every row!
for(u=0; u<THREAD_SIZE; u++)
{
/* All threads calculate its own sin and cos value. */
local_sin[tx][ty] = block[tx][ty] * sin((angle*ty)*u);
local_cos[tx][ty] = block[tx][ty] * cos((angle*ty)*u);
/* Only one column is activate. The thread in colum adds all element of its row. */
if (tx == u)
{
sum_sines = 0.0;
sum_cosines = 0.0;
for (x=0; x<THREAD_SIZE; x++)
{
sum_sines += local_sin[x][ty];
sum_cosines += local_cos[x][ty];
}
//if (sum_sines < 0)
//sum_sin[ty][u] = ((-1)*sum_sines)/GPU_ARRAY_SIZE;
//else
sum_sin[ty][u] = sum_sines/GPU_ARRAY_SIZE;
//if (sum_cosines < 0)
//sum_cos[ty][u] = ((-1)*sum_cosines)/GPU_ARRAY_SIZE;
//else
sum_cos[ty][u] = sum_cosines/GPU_ARRAY_SIZE;
}
__syncthreads();
}
spectrum[tx][ty] = sqrt((double)pow(sum_sin[tx][ty],2)+(double)pow(sum_cos[tx][ty],2));
__syncthreads();
/* Transpose! I think this is not necessary part. */
d_osub_matrix[xIndex*GPU_ARRAY_SIZE + yIndex] = spectrum[threadIdx.y][threadIdx.x];
__syncthreads();
}
The main function:
int main(int argc, char** argv)
{
int i,j, w, h, sw, sh;
int numSubblock = CPU_ARRAY_SIZE / GPU_ARRAY_SIZE;
double *d_isub_matrix,*d_osub_matrix;
double *big_matrix = new double[CPU_ARRAY_SIZE*CPU_ARRAY_SIZE];
double *big_matrix2 = new double[CPU_ARRAY_SIZE*CPU_ARRAY_SIZE];
double *isub_matrix = new double[GPU_ARRAY_SIZE*GPU_ARRAY_SIZE];
double *osub_matrix = new double[GPU_ARRAY_SIZE*GPU_ARRAY_SIZE];
cudaEvent_t start,stop;
float elapsedtime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
for (i=0; i<CPU_ARRAY_SIZE; i++)
{
for (j=0; j<CPU_ARRAY_SIZE; j++)
big_matrix[i*CPU_ARRAY_SIZE + j] = rand();//i*CPU_ARRAY_SIZE + j;
}
cudaEventRecord(start,0);
//cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)*2);
//cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)*2);
for(i = 0; i < numSubblock; i++)
{
for (j=0; j < numSubblock; j++)
{
// start position of subarea of big array
cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
h = i*GPU_ARRAY_SIZE;
w = j*GPU_ARRAY_SIZE;
//printf("h = %d, w=%d",h,w);
//system("PAUSE");
// move subarea of big array into isub array.
for (sh = 0; sh < GPU_ARRAY_SIZE; sh++)
{
for (sw = 0; sw <GPU_ARRAY_SIZE; sw++)
{
isub_matrix[sh*GPU_ARRAY_SIZE+sw] = big_matrix[(h+sh)*CPU_ARRAY_SIZE + (w+sw)];
}
}
cudaMemcpy(d_isub_matrix,isub_matrix,((GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)),cudaMemcpyHostToDevice);
//call the cuda kernel
dim3 blocks(BLOCK_SIZE, BLOCK_SIZE);
dim3 threads(THREAD_SIZE, THREAD_SIZE);
twiddle_factor<<<blocks, threads>>>(d_isub_matrix,d_osub_matrix);
cudaMemcpy(osub_matrix,d_osub_matrix,((GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);
for (sh = 0; sh < GPU_ARRAY_SIZE; sh++)
{
for (sw = 0; sw <GPU_ARRAY_SIZE; sw++)
{
big_matrix2[(h+sh)*CPU_ARRAY_SIZE + (w+sw)] = osub_matrix[sh*GPU_ARRAY_SIZE+sw];
printf(" sh %d sw %d %lf \n", sh, sw, osub_matrix[sh*GPU_ARRAY_SIZE+sw]);
}
}
printf("passei por aqui algumas vezes\n");
cudaFree(d_osub_matrix);
cudaFree(d_isub_matrix);
}
}
// cudaFree(d_osub_matrix);
// cudaFree(d_isub_matrix);
//Stop the time
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedtime,start,stop);
//showing the processing time
printf("The processing time took... %fms to execute everything",elapsedtime);
system("PAUSE");
for (sh = 0; sh < CPU_ARRAY_SIZE; sh++)
{
for (sw = 0; sw <CPU_ARRAY_SIZE; sw++)
{
printf(" sh %d sw %d %lf \n", sh, sw, big_matrix2[sh*CPU_ARRAY_SIZE+sw]);
}
}
system("PAUSE");
// I guess the result is "[1][0] = [1], [1][512] = [513], [513][0] = [524289], [513][512] = [524801]".
}
By a short look the problem could and should be the folling lines:
// start position of subarea of big array
cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
You are allocating just to few memory for your double values on the GPU. Your sub matrix is allocated with 4 byte per point where 8 byte are needed.