Gaussian Image Blurring using CUDA C/C++ [closed] - cuda

Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 3 years ago.
Improve this question
I am trying to apply gaussian image blurring using CUDA C/C++ programming. The CPU part is working good with producing a good result. But, in the case of GPU, it's producing only a black image. I am not sure where is the problem. Here is my full code.
How can I resolve this issue?
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <iostream>
#include <iomanip>
#include <fstream>
#define IMW 1600
#define IMH 1600
#define CHANNEL_NUM 3
#define IMAGE_BUFFER_SIZE (IMW*IMH*CHANNEL_NUM)
#define BLOCKX 16
#define BLOCKY BLOCKX
#define BLUR_DEGREE 3
#define BLUR_SIZE 1
unsigned int width, height;
int hmask[3][3] = { 1, 2, 1,
2, 4, 2,
1, 2, 1
};
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
__global__ void blurKernel(unsigned char * in, unsigned char * out, int w, int h)
{
int Col = blockIdx.x * blockDim.x + threadIdx.x;
int Row = blockIdx.y * blockDim.y + threadIdx.y;
if (Col < w && Row < h)
{
int pixVal = 0;
int pixels = 0;
// Get the average of the surrounding BLUR_SIZE x BLUR_SIZE box
for(int blurRow = -BLUR_SIZE; blurRow < BLUR_SIZE+1; ++blurRow)
{
for(int blurCol = -BLUR_SIZE; blurCol < BLUR_SIZE+1; ++blurCol)
{
int curRow = Row + blurRow;
int curCol = Col + blurCol;
// Verify we have a valid image pixel
if(curRow > -1 && curRow < h && curCol > -1 && curCol < w)
{
pixVal += in[curRow * w + curCol];
pixels++; // Keep track of number of pixels in the avg
}
}
}
// Write our new pixel value out
out[Row * w + Col] = (unsigned char)(pixVal / pixels);
}
}
int main(int argc, char **argv)
{
/************ Setup work ***********************/
unsigned char *d_resultPixels;
unsigned char *h_resultPixels;
unsigned char *h_devicePixels;
unsigned char *h_pixels = NULL;
unsigned char *d_pixels = NULL;
int nBlurDegree;
int imageSize = sizeof(unsigned char) * IMAGE_BUFFER_SIZE;
h_pixels = (unsigned char *)malloc(imageSize);
h_resultPixels = (unsigned char *)malloc(imageSize);
h_devicePixels = (unsigned char *)malloc(imageSize);
int width1, height1, bpp;
h_pixels = stbi_load("rana_1600_1600.png", &width1, &height1, &bpp, CHANNEL_NUM);
width = width1;
height = height1;
printf("Image size: %u\n", imageSize);
printf("Image width: %u\n", width);
printf("Image height: %u\n", height);
//memcpy(h_devicePixels, h_pixels, imageSize);
/************************** Start host processing ************************/
unsigned long long cputime = dtime_usec(0);
// cpu code here.....
cputime = dtime_usec(cputime);
stbi_write_png("host_output.png", width, height, CHANNEL_NUM, h_resultPixels, width*CHANNEL_NUM);
/************************** End host processing **************************/
/************************** Start device processing **********************/
// allocate memory on the GPU for the output image
cudaMalloc((void**)&d_pixels, imageSize);
cudaMalloc((void**)&d_resultPixels, imageSize);
cudaMemcpy(d_pixels, h_pixels, imageSize, cudaMemcpyHostToDevice);
checkCUDAError("CUDA memcpy to device");
dim3 blocksPerGrid(IMW / 16, 1);
dim3 threadsPerBlock(16, 1);
unsigned long long gputime = dtime_usec(0);
for (nBlurDegree = 0; nBlurDegree < BLUR_DEGREE; nBlurDegree++)
{
cudaMemset(d_resultPixels, 0, imageSize);
blurKernel << <blocksPerGrid, threadsPerBlock >> >(d_pixels, d_resultPixels, width, height);
cudaMemcpy(d_pixels, d_resultPixels, imageSize, cudaMemcpyDeviceToDevice);
cudaThreadSynchronize();
}
cudaDeviceSynchronize();
gputime = dtime_usec(gputime);
cudaMemcpy(h_devicePixels, d_resultPixels, imageSize, cudaMemcpyDeviceToHost);
printf("GPU time: %f seconds, CPU time: %f seconds\n", gputime/(float)USECPSEC, cputime/(float)USECPSEC);
printf("Speedup: %f\n", (cputime/(float)USECPSEC)/(gputime/(float)USECPSEC));
validate(h_pixels, h_devicePixels, imageSize);
stbi_write_png("device_output.png", width, height, CHANNEL_NUM, h_devicePixels, width*CHANNEL_NUM);
/************************** End device processing ************************/
// Release resources
cudaFree(d_pixels);
cudaFree(d_resultPixels);
//stbi_image_free(h2_pixels);
free(h_devicePixels);
free(h_pixels);
free(h_resultPixels);
return 0;
} // End main
I need help on how to get the GPU output image "device_output.png".

Related

CUB reduction using 2D grid of blocks

I'm trying to make a sum using the CUB reduction method.
The big problem is:
I'm not sure how to return the values of each block to the Host when using 2-dimensional grids.
#include <iostream>
#include <math.h>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <iomanip>
#define nat 1024
#define BLOCK_SIZE 32
#define GRID_SIZE 32
struct frame
{
int natm;
char title[100];
float conf[nat][3];
};
using namespace std;
using namespace cub;
__global__
void add(frame* s, float L, float rc, float* blocksum)
{
int i = blockDim.x*blockIdx.x + threadIdx.x;
int j = blockDim.y*blockIdx.y + threadIdx.y;
float E=0.0, rij, dx, dy, dz;
// Your calculations first so that each thread holds its result
dx = fabs(s->conf[j][0] - s->conf[i][0]);
dy = fabs(s->conf[j][1] - s->conf[i][1]);
dz = fabs(s->conf[j][2] - s->conf[i][2]);
dx = dx - round(dx/L)*L;
dy = dy - round(dy/L)*L;
dz = dz - round(dz/L)*L;
rij = sqrt(dx*dx + dy*dy + dz*dz);
if ((rij <= rc) && (rij > 0.0))
{E = (4*((1/pow(rij,12))-(1/pow(rij,6))));}
// E = 1.0;
__syncthreads();
// Block wise reduction so that one thread in each block holds sum of thread results
typedef cub::BlockReduce<float, BLOCK_SIZE, BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
float aggregate = BlockReduce(temp_storage).Sum(E);
if (threadIdx.x == 0 && threadIdx.y == 0)
blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
}
int main(void)
{
frame * state = (frame*)malloc(sizeof(frame));
float *blocksum = (float*)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
state->natm = nat; //inicializando o numero de atomos;
char name[] = "estado1";
strcpy(state->title,name);
for (int i = 0; i < nat; i++) {
state->conf[i][0] = i;
state->conf[i][1] = i;
state->conf[i][2] = i;
}
frame * d_state;
float *d_blocksum;
cudaMalloc((void**)&d_state, sizeof(frame));
cudaMalloc((void**)&d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)));
cudaMemcpy(d_state, state, sizeof(frame),cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 gridBlock(GRID_SIZE,GRID_SIZE);
add<<<gridBlock,dimBlock>>>(d_state, 3000, 15, d_blocksum);
cudaError_t status = cudaMemcpy(blocksum, d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);
float Etotal = 0.0;
for (int k = 0; k < GRID_SIZE*GRID_SIZE; k++){
Etotal += blocksum[k];
}
cout << endl << "energy: " << Etotal << endl;
if (cudaSuccess != status)
{
cout << cudaGetErrorString(status) << endl;
}
// Free memory
cudaFree(d_state);
cudaFree(d_blocksum);
return cudaThreadExit();
}
What is happening is that if the value of GRID_SIZE is the same asBLOCK_SIZE, as written above. The calculation is correct. But if I change the value of GRID_SIZE, the result goes wrong. Which leads me to think that the error is in this code:
blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
The idea here is to return a 1D array, which contains the sum of each block.
I do not intend to change the BLOCK_SIZE value, but the value of GRID_SIZE depends on the system I'm looking at, I intend to use values greater than 32 (always multiples of that).
I looked for some example that use 2D grid with CUB, but did not find.
I really new in CUDA program, maybe I'm making a mistake.
edit: I put the complete code.
For comparison, when I calculate these exact values for a serial program, it gives me energy: -297,121
Probably the main issue is that your output indexing is not correct. Here's a reduced version of your code demonstrating correct results for arbitrary GRID_SIZE:
$ cat t1360.cu
#include <stdio.h>
#include <cub/cub.cuh>
#define BLOCK_SIZE 32
#define GRID_SIZE 25
__global__
void add(float* blocksum)
{
float E = 1.0;
// Block wise reduction so that one thread in each block holds sum of thread results
typedef cub::BlockReduce<float, BLOCK_SIZE, cub::BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
float aggregate = BlockReduce(temp_storage).Sum(E);
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0)
blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
}
int main(){
float *d_result, *h_result;
h_result = (float *)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
cudaMalloc(&d_result, GRID_SIZE*GRID_SIZE*sizeof(float));
dim3 grid = dim3(GRID_SIZE,GRID_SIZE);
dim3 block = dim3(BLOCK_SIZE, BLOCK_SIZE);
add<<<grid, block>>>(d_result);
cudaMemcpy(h_result, d_result, GRID_SIZE*GRID_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {printf("cuda error: %s\n", cudaGetErrorString(err)); return -1;}
float result = 0;
for (int i = 0; i < GRID_SIZE*GRID_SIZE; i++) result += h_result[i];
if (result != (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE)) printf("mismatch, should be: %f, was: %f\n", (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE), result);
else printf("Success\n");
return 0;
}
$ nvcc -o t1360 t1360.cu
$ ./t1360
Success
$
The important change I made to your kernel code was in the output indexing:
blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
We want a simulated 2D index into an array that has width and height of GRID_SIZE consisting of one float quantity per point. Therefore the width of this array is given by gridDim.x (not blockDim). The gridDim variable gives the dimensions of the grid in terms of blocks - and this lines up exactly with how our results array is set up.
Your posted code will fail if GRID_SIZE and BLOCK_SIZE are different (for example, if GRID_SIZE were smaller than BLOCK_SIZE, cuda-memcheck will show illegal accesses, and if GRID_SIZE is larger than BLOCK_SIZE then this indexing error will result in blocks overwriting each other's values in the output array) because of this mixup between blockDim and gridDim.
Also note that float operations typically only have around 5 decimal digits of precision. So small differences in the 5th or 6th decimal place may be attributable to order of operations differences when doing floating-point arithmetic. You can prove this to yourself by switching to double arithmetic.

Can't find out why cudaMemcpy fails using curand library functions

I'm trying to initialize in a random manner the weights ( stored as floats ) of a neural network using CURAND functions.
I first initialize the neural netwotk with some values and after that I attempt to copy the two matrices in the nn struct ( nn stands for neural network ), that should store the weight values ( nn.wih, and nn.who ) into the Device memory.
Then I call a function that should randomize the matrices' values (assignRandomWeight), which launches two kernels that holds curand functions.
Finally I try to copy the resulting matrices back to the host memory through a cudaMemcpy call, but at this point I get the error "an illegal memory access was encountered".
I tried to print the values of the Device copy matrices of wih and who, which are d_wih and d_who. They seems to be correct; I left in the code two functions usefull for debugging :
checkCudaError can be called to check the last cudaError_t string message
showValues is useful to print the values of a Device allcated arraay
I extracted a sample of my code that compile and presents the same error, plese help me out
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include<cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
struct TNeuralNetwork {
int input_neurons;
int hidden_neurons;
int output_neurons;
float *wih; //first layer of weights (from input layer to hidden layer)
float *who; //second layer of weights (from hidden layer to output layer)
float *wih_old; //for the momentum
float *who_old; //for the momentum
float *erro;
float *errh;
float l; //learning rate
float m; //momentum
float *i; //values into input neurons
float *h; //values into hidden neurons
float *o; //values into output neurons
};
__host__ void checkCudaError(char *str);
__global__ void showValues(float *d_v, int dim);
__global__ void init_rand(unsigned int seed, curandState_t state_wih);
__global__ void generateRandomValues(curandState_t state_wih, float *wih, float *who, int inp, int hid, int out);
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who);
void initNeuralNetwork(struct TNeuralNetwork *nn, int bands, int nlabel);
int main(int argc, char **argv) {
struct TNeuralNetwork nn;
//Declare Device variables
float *d_wih;
float *d_who;
unsigned int v;
cudaError_t cudaStatus;
initNeuralNetwork(&nn, 102, 10);
//Allocate Device Memory
v = (nn.input_neurons + 1)*(nn.hidden_neurons);
cudaMalloc((void**)&d_wih, (nn.input_neurons + 1)*(nn.hidden_neurons)*sizeof(float));
checkCudaError("malloc1");
//cudaMalloc((void**)&d_who, (nn.hidden_neurons + 1)*nn.output_neurons * sizeof(float));
//checkCudaError("malloc2");
for (int i = 0; i < (nn.input_neurons + 1); i++){
for (int j = 0; j < nn.hidden_neurons; j++){
nn.wih[i*nn.hidden_neurons + j] = 0;
}
}
for (int i = 0; i < (nn.hidden_neurons + 1); i++){
for (int j = 0; j < nn.output_neurons; j++){
nn.who[i*nn.output_neurons + j] = 0;
}
}
cudaMemcpy(d_wih, nn.wih, (nn.input_neurons + 1)*(nn.hidden_neurons)*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError("memcpy0");
//showValues << <v, 1 >> >(d_wih, v); TEST
//cudaMemcpy(d_who, nn.who, (nn.hidden_neurons + 1)*nn.output_neurons*sizeof(float), cudaMemcpyHostToDevice);
//checkCudaError("memcpy0.1");
assignRandomWeight(&nn, d_wih, d_who);
cudaMemcpy(nn.wih, d_wih, (nn.input_neurons + 1)*(nn.hidden_neurons)*sizeof(float), cudaMemcpyDeviceToHost);
//showValues << <v, 1 >> >(d_wih, v); TEST
checkCudaError("memcpy1");
//cudaMemcpy(nn.who, d_who, (nn.hidden_neurons + 1)*nn.output_neurons*sizeof(float), cudaMemcpyDeviceToHost);
//checkCudaError("memcpy2");
//printf("WIH:\n");
//for (int i = 0; i < (nn.input_neurons + 1); i++){
// for (int j = 0; j < (nn.hidden_neurons); j++){
// printf("%.12f\t", nn.wih[i*(nn.hidden_neurons) + j]);
// }
// printf("\n\n");
//}
//printf("WHO:\n");
//for (int i = 0; i < (nn.hidden_neurons + 1); i++){
// for (int j = 0; j < nn.output_neurons; j++){
// printf("%.12f\t", nn.wih[i*nn.output_neurons + j]);
// }
// printf("\n\n");
//}
cudaFree(d_wih);
cudaFree(d_who);
return 0;
}
__host__ void checkCudaError(char *str){
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess){
printf("Cuda Error at %s: %s \n", str, cudaGetErrorString(err));
exit(-1);
}
}
__global__ void showValues(float *d_v, int dim){
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < dim){
printf("elemento[%d] = %.4f\n", tid, d_v[tid]);
}
}
__global__ void init_rand(unsigned int seed, curandState_t state_wih){
int tid = blockIdx.x*blockDim.x + threadIdx.x;
curand_init(seed, 0, tid, &state_wih);
}
__global__ void generateRandomValues(curandState_t state_wih, float *wih, float *who, int inp, int hid, int out){
int tid = (blockIdx.x)*(blockDim.x) + threadIdx.x;
printf("%.7f", (float)curand(&state_wih + tid));
if (tid <= (inp + 1)*hid){
wih[tid] = (float)curand_uniform(&state_wih + tid);
printf("%.7f", wih[tid]);
}
if (tid <= (hid + 1)*out){
who[tid] = (float)curand_uniform(&state_wih + tid);
printf("%.7f", who[tid]);
}
}
void initNeuralNetwork(struct TNeuralNetwork *nn, int bands, int nlabel) {
nn->input_neurons = bands;
nn->output_neurons = nlabel;
//nn->hidden_neurons = (int)((bands + nlabel)/2.0f);
nn->hidden_neurons = (int)((bands + nlabel)*2.0f / 3.0f);
nn->l = 0.001;
nn->m = 0.2;
nn->wih = (float*)malloc((bands + 1)*(nn->hidden_neurons) * sizeof(float)); //+1 for the bias
nn->who = (float*)malloc((nn->hidden_neurons + 1)*nlabel * sizeof(float));//+1 for the bias
nn->wih_old = (float*)malloc((bands + 1)*(nn->hidden_neurons) * sizeof(float)); //+1 for the bias
nn->who_old = (float*)malloc((nn->hidden_neurons + 1)*nlabel * sizeof(float));//+1 for the bias
nn->i = (float*)malloc(bands * sizeof(float));
nn->h = (float*)malloc(nn->hidden_neurons * sizeof(float));
nn->o = (float*)malloc(nlabel * sizeof(float));
nn->errh = (float*)malloc(nn->hidden_neurons * sizeof(float));
nn->erro = (float*)malloc(nlabel * sizeof(float));
memset(nn->wih_old, 0, (bands + 1)*(nn->hidden_neurons) * sizeof(float));
memset(nn->who_old, 0, (nn->hidden_neurons + 1)*nlabel * sizeof(float));
}
//curand
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who) {
cudaError_t cudaStatus;
curandState_t state_wih;
srand(time(NULL));
unsigned int seed = rand();
//Alloco la matrice di curandState_t per la randomizzaione, in uscita dalla funzione non mi servirà più
cudaMalloc((void**)&state_wih, (nn->input_neurons + 1)*(nn->hidden_neurons)* sizeof(curandState_t));
dim3 gridSize(ceil((double)((nn->input_neurons + 1)*(nn->hidden_neurons)) / 32));
dim3 blockSize(32);
init_rand << < gridSize, blockSize >> >(seed, state_wih);
generateRandomValues << < gridSize, blockSize >> >(state_wih, d_wih, d_who, nn->input_neurons, nn->hidden_neurons, nn->output_neurons);
}
"Incorrect Indexing" will produce out-of-bounds memory access within the kernel. The CUDA runtime will destroy your context at the point where the error occurred within the kernel, after which no CUDA operations which rely the the context can be performed. The cudaMemcpycall fails because your context has been destroyed. There is no way to avoid this.
NVIDIA supply a utility called cuda-memcheck with the CUDA toolkit. Use that instead to diagnose what is going wrong with your kernel.
I'v found my error:
I was making a bad use of the "curandState_t" type variable in the assignRandomWight function, I had to use a pointer.
this is the correct version:
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who) {
cudaError_t cudaStatus;
curandState_t *state_wih;
srand(time(NULL));
unsigned int seed = rand();
//Alloco la matrice di curandState_t per la randomizzaione, in uscita dalla funzione non mi servirà più
cudaMalloc((void**)&state_wih, (nn->input_neurons + 1)*(nn->hidden_neurons)* sizeof(curandState_t));
dim3 gridSize(ceil((double)((nn->input_neurons + 1)*(nn->hidden_neurons)) / 32));
dim3 blockSize(32);
init_rand << < gridSize, blockSize >> >(seed, state_wih);
generateRandomValues << < gridSize, blockSize >> >(state_wih, d_wih, d_who, nn->input_neurons, nn->hidden_neurons, nn->output_neurons);
}
and the correct version for the two kernels:
__global__ void generateRandomValues( curandState_t *state_wih, float *wih, float *who, int inp, int hid, int out){
int tid = (blockIdx.x)*(blockDim.x) + threadIdx.x;
if (tid<=(inp+1)*hid ){
printf("\ncasual : %f", (float)curand_uniform(&state_wih[tid]));
wih[tid] = (float)curand_uniform(&state_wih[tid]);
}
if (tid<=(hid+1)*out){
who[tid] = (float)curand_uniform(&state_wih[tid]);
}
}
__global__ void init_rand(unsigned int seed, curandState_t *state_wih){
int tid = blockIdx.x*blockDim.x + threadIdx.x;
curand_init(seed, tid, 0, &state_wih[tid]);
}

2D Convolution Incorrect Results Cuda Constant Memory

I'm struggling in the kernel code. I have updated this to include support files, but those were provided and should be correct.
This is one of my first GPU programs and I've spent several hours trying new things and I can't seem to get this right. It is compiling and running, but the results are incorrect.
I am basically having trouble understanding what exactly I need to be doing differently because this kernel is giving incorrect results. I'm trying to load a tile of the input image to shared memory (Ns[][], which I think I've done correctly) and apply the filter on the input image tile (which I am struggling with).
I would greatly appreciate it if someone who is more experienced could assist me in figuring out exactly where I've gone wrong and give me an idea how to resolve the issue. I appreciate your time and apologies if I've asked this question incorrectly.
main.cu:
#include <stdio.h>
#include "support.h"
#include "kernel.cu"
#include <time.h>
int main(int argc, char* argv[]){
Timer timer;
time_t t;
// Initialize host variables ----------------------------------------------
printf("\nSetting up the problem..."); fflush(stdout);
startTime(&timer);
Matrix M_h, N_h, P_h; // M: filter, N: input image, P: output image
Matrix N_d, P_d;
unsigned imageHeight, imageWidth;
cudaError_t cuda_ret;
dim3 dim_grid, dim_block;
/* Read image dimensions */
if (argc == 1) {
imageHeight = 600;
imageWidth = 1000;
} else if (argc == 2) {
imageHeight = atoi(argv[1]);
imageWidth = atoi(argv[1]);
} else if (argc == 3) {
imageHeight = atoi(argv[1]);
imageWidth = atoi(argv[2]);
} else {
printf("\n Invalid input parameters!"
"\n Usage: ./convolution # Image is 600 x 1000"
"\n Usage: ./convolution <m> # Image is m x m"
"\n Usage: ./convolution <m> <n> # Image is m x n"
"\n");
exit(0);
}
/* Allocate host memory */
M_h = allocateMatrix(FILTER_SIZE, FILTER_SIZE);
N_h = allocateMatrix(imageHeight, imageWidth);
P_h = allocateMatrix(imageHeight, imageWidth);
/* Initialize filter and images */
initMatrix(M_h);
initMatrix(N_h);
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
printf(" Image: %u x %u\n", imageHeight, imageWidth);
printf(" Mask: %u x %u\n", FILTER_SIZE, FILTER_SIZE);
// Allocate device variables ----------------------------------------------
printf("Allocating device variables..."); fflush(stdout);
startTime(&timer);
N_d = allocateDeviceMatrix(imageHeight, imageWidth);
P_d = allocateDeviceMatrix(imageHeight, imageWidth);
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy host variables to device ------------------------------------------
printf("Copying data from host to device..."); fflush(stdout);
startTime(&timer);
/* Copy image to device global memory */
copyToDeviceMatrix(N_d, N_h);
cudaMemcpyToSymbol(M_h, M_c,FILTER_SIZE*sizeof(float));
dim_grid = dim3(((N_h.width / BLOCK_SIZE) + 1), ((N_h.height / BLOCK_SIZE) + 1));
dim_block = dim3(BLOCK_SIZE, BLOCK_SIZE);
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Launch kernel ----------------------------------------------------------
printf("Launching kernel..."); fflush(stdout);
startTime(&timer);
convolution<<<dim_grid, dim_block>>>(N_d, P_d);
cuda_ret = cudaDeviceSynchronize();
if(cuda_ret != cudaSuccess) FATAL("Unable to launch/execute kernel");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy device variables from host ----------------------------------------
printf("Copying data from device to host..."); fflush(stdout);
startTime(&timer);
copyFromDeviceMatrix(P_h, P_d);
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Verify correctness -----------------------------------------------------
printf("Verifying results..."); fflush(stdout);
verify(M_h, N_h, P_h);
// Free memory ------------------------------------------------------------
freeMatrix(M_h);
freeMatrix(N_h);
freeMatrix(P_h);
freeDeviceMatrix(N_d);
freeDeviceMatrix(P_d);
return 0;
}
kernel.cu:
__constant__ float M_c[FILTER_SIZE][FILTER_SIZE];
__global__ void convolution(Matrix N, Matrix P){
__shared__ float Ns[TILE_SIZE + 5 - 1][TILE_SIZE + 5 -1];
int i, j;
float output = 0.0f;
int tx = threadIdx.x;
int ty = threadIdx.y;
int row_o = blockIdx.y * TILE_SIZE + ty;
int col_o = blockIdx.x * TILE_SIZE + tx;
int row_i = row_o - 2;
int col_i = col_o - 2;
if((row_i >= 0) && (row_i < N.height) && (col_i >= 0) && (col_i < N.width)){
Ns[ty][tx] = N.elements[row_i * N.width + col_i];
}
else{
Ns[ty][tx] = 0.0f;
}
__syncthreads();
if(ty < TILE_SIZE && tx < TILE_SIZE){
for(i = 0; i < 5; i++){
for(j = 0; j < 5; j++){
output += M_c[i][j] * Ns[i + ty][j + tx];
}
}
}
if(row_o < P.height && col_o < P.width){
P.elements[row_o * P.width + col_o] = output;
}
}
support.h:
#ifndef __FILEH__
#define __FILEH__
#include <sys/time.h>
typedef struct {
struct timeval startTime;
struct timeval endTime;
} Timer;
// Matrix Structure declaration
typedef struct {
unsigned int width;
unsigned int height;
unsigned int pitch;
float* elements;
} Matrix;
#define FILTER_SIZE 5
#define TILE_SIZE 12
#define BLOCK_SIZE (TILE_SIZE + FILTER_SIZE - 1)
Matrix allocateMatrix(unsigned height, unsigned width);
void initMatrix(Matrix mat);
Matrix allocateDeviceMatrix(unsigned height, unsigned width);
void copyToDeviceMatrix(Matrix dst, Matrix src);
void copyFromDeviceMatrix(Matrix dst, Matrix src);
void verify(Matrix M, Matrix N, Matrix P);
void freeMatrix(Matrix mat);
void freeDeviceMatrix(Matrix mat);
void startTime(Timer* timer);
void stopTime(Timer* timer);
float elapsedTime(Timer timer);
#define FATAL(msg, ...) \
do {\
fprintf(stderr, "[%s:%d] "msg"\n", __FILE__, __LINE__, ##__VA_ARGS__);\
exit(-1);\
} while(0)
#if __BYTE_ORDER != __LITTLE_ENDIAN
# error "File I/O is not implemented for this system: wrong endianness."
#endif
#endif
support.cu:
#include <stdlib.h>
#include <stdio.h>
#include "support.h"
Matrix allocateMatrix(unsigned height, unsigned width)
{
Matrix mat;
mat.height = height;
mat.width = mat.pitch = width;
mat.elements = (float*)malloc(height*width*sizeof(float));
if(mat.elements == NULL) FATAL("Unable to allocate host");
return mat;
}
void initMatrix(Matrix mat)
{
for (unsigned int i=0; i < mat.height*mat.width; i++) {
mat.elements[i] = (rand()%100)/100.00;
}
}
Matrix allocateDeviceMatrix(unsigned height, unsigned width)
{
Matrix mat;
cudaError_t cuda_ret;
mat.height = height;
mat.width = mat.pitch = width;
cuda_ret = cudaMalloc((void**)&(mat.elements), height*width*sizeof(float));
if(cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
return mat;
}
void copyToDeviceMatrix(Matrix dst, Matrix src)
{
cudaError_t cuda_ret;
cuda_ret = cudaMemcpy(dst.elements, src.elements, src.height*src.width*sizeof(float), cudaMemcpyHostToDevice);
if(cuda_ret != cudaSuccess) FATAL("Unable to copy to device");
}
void copyFromDeviceMatrix(Matrix dst, Matrix src)
{
cudaError_t cuda_ret;
cuda_ret = cudaMemcpy(dst.elements, src.elements, src.height*src.width*sizeof(float), cudaMemcpyDeviceToHost);
if(cuda_ret != cudaSuccess) FATAL("Unable to copy from device");
}
void verify(Matrix M, Matrix N, Matrix P) {
const float relativeTolerance = 1e-6;
for(int row = 0; row < N.height; ++row) {
for(int col = 0; col < N.width; ++col) {
float sum = 0.0f;
for(int i = 0; i < M.height; ++i) {
for(int j = 0; j < M.width; ++j) {
int iN = row - M.height/2 + i;
int jN = col - M.width/2 + j;
if(iN >= 0 && iN < N.height && jN >= 0 && jN < N.width) {
sum += M.elements[i*M.width + j]*N.elements[iN*N.width + jN];
}
}
}
float relativeError = (sum - P.elements[row*P.width + col])/sum;
if (relativeError > relativeTolerance
|| relativeError < -relativeTolerance) {
printf("TEST FAILED\n\n");
exit(0);
}
}
}
printf("TEST PASSED\n\n");
}
void freeMatrix(Matrix mat)
{
free(mat.elements);
mat.elements = NULL;
}
void freeDeviceMatrix(Matrix mat)
{
cudaFree(mat.elements);
mat.elements = NULL;
}
void startTime(Timer* timer) {
gettimeofday(&(timer->startTime), NULL);
}
void stopTime(Timer* timer) {
gettimeofday(&(timer->endTime), NULL);
}
float elapsedTime(Timer timer) {
return ((float) ((timer.endTime.tv_sec - timer.startTime.tv_sec) \
+ (timer.endTime.tv_usec - timer.startTime.tv_usec)/1.0e6));
}
One set of problems is here:
cudaMemcpyToSymbol(M_h, M_c,FILTER_SIZE*sizeof(float));
If you ran your code with cuda-memcheck it would point you right at this line as being a problem.
The first parameter should be the destination symbol, i.e. M_c, and the second parameter should be the host source pointer, i.e. M_h.
Furthermore, shouldn't it be FILTER_SIZE*FILTER_SIZE ? Isn't the size of data you want to transfer equal to the dimension squared?
Finally, M_h is not a valid source pointer. You should use M_h.elements.
So something like this:
cudaMemcpyToSymbol(M_c, M_h.elements,FILTER_SIZE*FILTER_SIZE*sizeof(float));
I don't believe this fixes all the issues in your code. To continue the debug, I would print out one element in the GPU result that does not match your verify routine, and work through the arithmetic for that one element. Use printf in device code if that helps.
In the future, please run your code with cuda-memcheck before asking for help here. Even if you don't understand the output, it will be useful for those trying to help you.

Cuda program not working

i'm a beginner in cuda programming. I'm trying an own easy code but it's not working and I don't know what else to do.
My code:
#include <mpi.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/wait.h>
// Prototypes
__global__ void helloWorld(char*);
__device__ int getGlobalIdx_2D_2D();
// Host function
int main(int argc, char** argv)
{
unsigned int i, N, gridX, gridY, blockX, blockY;
N = 4096000;
char *str = (char *) malloc(N*sizeof(char));
for(i=0; i < N; i++) str[i]='c';
MPI_Init (&argc, &argv);
char *d_str;
size_t size = (size_t) N*sizeof(char);
cudaMalloc((void**)&d_str, size);
cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
gridX = 100;
gridY = 10;
blockX = blockY = 64;
dim3 dimGrid(gridX, gridY); // 4096 chars per block
dim3 dimBlock(blockX, blockY); // one thread per character, 2D
printf("dimGrid(%d, %d)\t", gridX, gridY);
printf("dimBlock(%d, %d)\t", blockX, blockY);
helloWorld<<< dimGrid, dimBlock >>>(d_str);
cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
MPI_Barrier (MPI_COMM_WORLD);
cudaFree(d_str);
printf("\nRes:\n");
for(i = 0; i < N; i++) printf("\t[%u] %c\n", i, str[i]);
MPI_Finalize ();
free(str);
return 0.0;
}
// Device kernel
__global__ void helloWorld(char* str)
{
// determine where in the thread grid we are
int pos = getGlobalIdx_2D_2D();
if (pos % 2 == 0) str[pos] -= 2;
else str[pos] += 8;
}
__device__ int getGlobalIdx_2D_2D()
{
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = blockId * (blockDim.x * blockDim.y) +
(threadIdx.y * blockDim.x) + threadIdx.x;
return threadId;
}
My desired output is: jajajajajajaja... x4096000
I've read that '%' operation is not efficient, but I don't think is the problem there.
Thanks!
You are performing absolutely no CUDA error checking, it is really beneficial to do so. Once you enable it you can find that block dimensions 64 x 64 are invalid as it results into 4096 threads within one block, which is not a valid configuration.

Unspecified launch failure after cudaDeviceSynchronize() call when program starts. But no errors using step-through debugging. CUDA

I've spent several hours struggling with unspecified launch failure.
I've come up with a tiny task for myself in order to understand how shared memory works.
Task is to divide array [1, 2, 3, ... , N] into K group of (N / K) elements and find the sum of each group. (Difference between current and previous element of the array equals 1)
I was planning to use N threads in grid divided between K blocks. So every threadblock contains (N / K) threads. Thus one threadblock could be used to compute sum of one group. Also I wanted to dynamically allocate shared memory.
When I start program I got unspecified launch failure after cudaDeviceSynchronize() call. But when I try step-through debugging everthing is ok and works fine.
What am I doing wrong? (Visual Studio 2012 Professional, Compute Capability 2.1) I would very appreciate any help.
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf("%s\n",cudaGetErrorString(x)); \
system("pause"); \
return EXIT_FAILURE;}} while(0)
extern __shared__ double shrd[];
__global__ void kernel(double * a){
size_t threadID_block = blockDim.x * threadIdx.y + threadIdx.x;
size_t blockID_global = (gridDim.x * blockIdx.y + blockIdx.x );
size_t threadID_global = blockID_global * blockDim.x * blockDim.y + threadID_block;
double * temp = &shrd[blockID_global * blockDim.x * blockDim.y];
temp[threadID_block] = static_cast<double>(threadID_global);
__syncthreads();
if (threadID_block == 0){
a[blockID_global] = 0.0;
for (size_t index = 0; index < blockDim.x * blockDim.y; index++){
a[blockID_global] += temp[index];
}
}
}
int main(){
int devNum = 0;
CUDA_CALL(cudaGetDevice(&devNum));
CUDA_CALL(cudaSetDevice(devNum));
dim3 gridSize(2,2,1);
dim3 blockSize(4,4,1);
double * dev_a = NULL;
size_t length = gridSize.x * gridSize.y ;
size_t byteSize = length * sizeof(double);
CUDA_CALL(cudaMalloc(&dev_a,byteSize));
size_t shmem_perBlock = blockSize.x * blockSize.y * sizeof(double);
kernel <<< gridSize, blockSize, shmem_perBlock >>> (dev_a);
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaDeviceSynchronize());
double * a = new double [length];
CUDA_CALL(cudaMemcpy(a,dev_a,byteSize,cudaMemcpyDeviceToHost));
for (size_t index = 0; index < length; index++){
printf("%.3f\n",a[index]);
}
printf("\n");
CUDA_CALL(cudaFree(dev_a));
CUDA_CALL(cudaDeviceReset());
delete[]a;
system("pause");
return 0;
}
If you are on kepler or later first read this:
http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
Otherwise if you are pre-kepler read this:
http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf
There are some fundamentals you are missing in terms of CUDA programming. I have given you a template of your code below. It is for clarification of some of these fundamentals. Do not expect this to be optimized as I am expecting you to program the parallel reduction. This will get you started with an understanding on how to use shared memory.
Good Luck!
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N 10000
#define K 100
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf("%s\n",cudaGetErrorString(x)); \
system("pause"); \
return EXIT_FAILURE;}} while(0)
__global__ void kernel(double* a, double* results){
extern __shared__ double shared[];
size_t tid, tid_local, stride;
tid = blockDim.x*blockIdx.x+threadIdx.x; //thread id within all blocks
tid_local = threadIdx.x; //thread id within a block
stride = blockDim.x*gridDim.x; //total number of threads
double *start = &a[K*blockIdx.x]; //each block will get K of a block.
shared[tid_local]=start[tid_local]; //copy K elements into shared memory
__syncthreads();
//Perform Parallel reduction, you will have to implement this
//After parallel reduction, result should be in shared[0]
//for demonstration I made the code serial for each block on thread 0.
//This is for demonstration only.
double sum=0;
if(tid_local==0){
for(int i=0; i<K; i++){
sum+=shared[i];
}
a[blockIdx.x]=sum;
}
}
int main(){
int devNum = 0;
CUDA_CALL(cudaGetDevice(&devNum));
CUDA_CALL(cudaSetDevice(devNum));
double * dev_a = NULL;
double * dev_results=NULL;
CUDA_CALL(cudaMalloc(&dev_a, N*sizeof(double) ));
CUDA_CALL(cudaMalloc(&dev_results, (N/K)*sizeof(double)));
//copy dev_a onto GPU (this is the array you are summing).
dim3 block_size(K, 1, 1);
dim3 grid_size (N/K, 1, 1);
size_t shmem_perBlock = K * sizeof(double);
kernel <<< grid_size, block_size, shmem_perBlock >>> (dev_a, dev_results);
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaDeviceSynchronize());
//copy dev_results back to CPU, this is your result.
CUDA_CALL(cudaFree(dev_a));
CUDA_CALL(cudaFree(dev_results));
system("pause");
return 0;
}