Complicated for loop to be ported to a CUDA kernel

I have the following nested for loop and I would like to port it to CUDA so it runs on the GPU:
int current=0;
int ptr=0;
for (int i=0; i < Nbeams; i++){
    for (int j=0; j < NbeamletsPerbeam[i]; j++){
        current = j + ptr;
        for (int k=0; k < Nmax; k++){
            ......
        }
        ptr += NbeamletsPerbeam[i];
    }
}
I would be very happy if anybody has an idea of how this can be done.
We are talking about Nbeams=5 and around 200 beamlets per beam.
This is what I currently have, but I am not sure it is right...
for (int i = blockIdx.x; i < d_params->Nbeams; i += gridDim.x){
    for (int j = threadIdx.y; j < d_beamletsPerBeam[i]; j += blockDim.y){
        currentBeamlet = j + k;
        for (int ivoxel = threadIdx.x; ivoxel < totalVoxels; ivoxel += blockDim.x){

I would suggest the following idea, though you may need to make some minor modifications based on your code.
dim3 blocks(NoOfThreads, 1);
dim3 grid(Nbeams, 1);
kernel<<<grid, blocks>>>();

__global__ void kernel()
{
    // one block per beam; the block's threads stride over that beam's beamlets
    int noOfBlocks = (NbeamletsPerbeam[blockIdx.x] + blockDim.x - 1) / blockDim.x;
    for (int j = 0; j < noOfBlocks; j++){
        // each pass covers blockDim.x consecutive beamlets
        int beamlet = j * blockDim.x + threadIdx.x;
        if (beamlet < NbeamletsPerbeam[blockIdx.x]) {
            current = beamlet + ptr;  // ptr must hold the offset of this beam's first beamlet
            for (int k = 0; k < Nmax; k++){
                ......
            }
        }
    }
}
This should do the trick and give you better parallelization.
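One way to make ptr well-defined on the GPU (assuming the CPU code intended ptr as the offset of beam i's first beamlet, i.e., the increment belongs after the inner j loop) is to precompute each beam's starting offset on the host with an exclusive prefix sum and pass it to the kernel. A minimal sketch; h_NbeamletsPerbeam, d_offsets, and the kernel signature are illustrative names, not code from the question:
// Host: h_offsets[i] = index of beam i's first beamlet in the flat beamlet array.
int *h_offsets = (int*)malloc((Nbeams + 1) * sizeof(int));
h_offsets[0] = 0;
for (int i = 0; i < Nbeams; i++)
    h_offsets[i + 1] = h_offsets[i] + h_NbeamletsPerbeam[i];

int *d_offsets;
cudaMalloc((void**)&d_offsets, (Nbeams + 1) * sizeof(int));
cudaMemcpy(d_offsets, h_offsets, (Nbeams + 1) * sizeof(int), cudaMemcpyHostToDevice);

// Device: one block per beam; threads stride over that beam's beamlets.
__global__ void kernel(const int *d_offsets, const int *d_NbeamletsPerbeam)
{
    int beam = blockIdx.x;
    for (int j = threadIdx.x; j < d_NbeamletsPerbeam[beam]; j += blockDim.x) {
        int current = d_offsets[beam] + j;  // matches "current" in the CPU loop
        // ... the k < Nmax work for this beamlet goes here ...
    }
}
With Nbeams=5 and ~200 beamlets per beam, a launch like kernel<<<Nbeams, 256>>>(d_offsets, d_NbeamletsPerbeam) covers every beamlet in one pass.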

Related

How to create a matrix on the GPU and print it on the CPU?

This is code to create a matrix on the GPU and print it out on the CPU. Can anyone tell me where I am going wrong? Thank you.
#include <stdio.h>

#define N 5

__global__ void create(int **d_a){
    int i = threadIdx.x;
    int j = threadIdx.y;
    d_a[i][j] = 1;
}

void errorCheck(){
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess){
        // print the CUDA error message and exit
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        exit(-1);
    }
}

int main(){
    int **d_a, **a;
    a = (int**)malloc(N * sizeof(int*));
    for (int i = 0; i < N; i++){
        a[i] = (int*)malloc(N * sizeof(int));
    }
    cudaMalloc((void***)&d_a, N * sizeof(int*));
    for (int i = 0; i < N; i++){
        cudaMalloc((void**)&d_a, N * sizeof(int));
    }
    errorCheck();
    create<<<1, N>>>(d_a);
    errorCheck();
    cudaMemcpy(a, d_a, (N*N) * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++){
        for (int j = 0; j < N; j++){
            printf("%d", a[i][j]);
        }
        printf("\n");
    }
    cudaFree(d_a);
    free(a);
    return 0;
}
Is there something wrong with memory allocation or memcpy?
"Is there something wrong with memory allocation or memcpy?"
Yes, on both counts.
This:
cudaMalloc((void***)&d_a, N * sizeof(int*));
for (int i = 0; i < N; i++){
    cudaMalloc((void**)&d_a, N * sizeof(int));
}
would have to be done like this:
cudaMalloc((void***)&d_a, N * sizeof(int*));
for (int i = 0; i < N; i++){
    int *row;
    cudaMalloc((void**)&row, N * sizeof(int));
    cudaMemcpy(d_a + i, &row, sizeof(int*), cudaMemcpyHostToDevice);
}
And then this:
create<<<1, N>>>(d_a);
errorCheck();
cudaMemcpy(a, d_a, (N*N) * sizeof(int), cudaMemcpyDeviceToHost);
would have to be done like this:
create<<<1, dim3(N,N)>>>(d_a);
errorCheck();
for (int i = 0; i < N; i++) {
    int *row;
    cudaMemcpy(&row, d_a + i, sizeof(int*), cudaMemcpyDeviceToHost);
    cudaMemcpy(a[i], row, sizeof(int) * N, cudaMemcpyDeviceToHost);
}
[All code written in browser and not tested, use at own risk]
In short, you have decided to work with an array of pointers. This requires additional CUDA API operations because the row pointers in the GPU copy are not accessible on the host by standard assignment; you must use cudaMemcpy in every case.
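As an aside, a single flat N*N allocation avoids the pointer chasing entirely. A minimal sketch of that alternative (not the asker's original layout, just a common simplification):
#include <stdio.h>
#define N 5

// Index the flat buffer as row*N + col instead of a[row][col].
__global__ void create_flat(int *d_a){
    int i = threadIdx.x;
    int j = threadIdx.y;
    d_a[i * N + j] = 1;
}

int main(){
    int a[N * N];
    int *d_a;
    cudaMalloc(&d_a, N * N * sizeof(int));
    create_flat<<<1, dim3(N, N)>>>(d_a);
    // one memcpy brings the whole matrix back
    cudaMemcpy(a, d_a, N * N * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++){
        for (int j = 0; j < N; j++)
            printf("%d", a[i * N + j]);
        printf("\n");
    }
    cudaFree(d_a);
    return 0;
}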

System gets stuck when running matrix multiplication using CUDA

When I run this code on my system, after a few seconds the system gets stuck and I have to restart it. So my question is: what am I doing wrong here? Any suggestion will be appreciated.
__global__ void matMul(float* d_M, float* d_N, float* d_P, int width) {
    int row = blockIdx.y*width + threadIdx.y;
    int col = blockIdx.x*width + threadIdx.x;
    if (row < width && col < width) {
        float product_val = 0;
        for (int k = 0; k < width; k++) {
            product_val += d_M[row*width + k] * d_N[k*width + col];
        }
        d_P[row*width + col] = product_val;
    }
}
int main() {
    const int n = 9;
    float* d_M;
    float* d_N;
    float* d_P;
    cudaMallocManaged(&d_M, SIZE * sizeof(float));
    cudaMallocManaged(&d_N, SIZE * sizeof(float));
    cudaMallocManaged(&d_P, SIZE * sizeof(float));
    for (int i = 0; i < n; ++i) {
        d_P[i] = 0;
    }
    int count = 0;
    for (int i = 0; i < n; ++i) {
        d_N[i] = ++count;
    }
    count = 0;
    for (int i = 0; i < n; ++i) {
        d_M[i] = ++count;
    }
    matMul<<<1, n>>>(d_M, d_N, d_P, 3);
    cudaDeviceSynchronize();
    for (int i = 0; i < n; ++i) {
        printf("%f\n", d_P[i]);
    }
    cudaFree(d_N);
    cudaFree(d_M);
    cudaFree(d_P);
    return 0;
}
Assuming that by "my system gets stuck" you mean your program hits some kind of error, it is likely that you are accessing invalid memory.
This could happen at the higher indexes of your d_M and d_N iterations, when row*width + k indexes beyond the size of the memory you allocated with cudaMallocManaged.
It is always good practice in situations like these to add some error handling using calls such as cudaPeekAtLastError().
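A minimal sketch of that kind of error checking, wrapped in a hypothetical CHECK_CUDA macro (an assumption for illustration, not part of the CUDA API):
#include <stdio.h>
#include <stdlib.h>

// Hypothetical helper: print and exit on any pending CUDA error.
#define CHECK_CUDA(msg)                                               \
    do {                                                              \
        cudaError_t e = cudaPeekAtLastError();                        \
        if (e != cudaSuccess) {                                       \
            fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(e));  \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage around the launch in question:
// matMul<<<1, n>>>(d_M, d_N, d_P, 3);
// CHECK_CUDA("matMul launch");
// cudaDeviceSynchronize();
// CHECK_CUDA("matMul execution");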

CUDA transpose with more than one thread

I'm trying to transpose a square matrix using tiling (the blocks method) via CUDA. I have successfully done it, but only when using one thread per block, as below in the host function:
dim3 dimGrid((nEven + TILE_DIM - 1) / TILE_DIM, (nEven + TILE_DIM - 1) / TILE_DIM, 1);
dim3 dimBlock(1, 1, 1);
where nEven is the matrix size and TILE_DIM is the tile (block) size.
I have real trouble understanding how the threads work on the GPU, so I managed to write the kernel below, which works with only one thread per block:
__global__ void transposeMain(int *idata)
{
    __shared__ int tile[TILE_DIM][TILE_DIM];
    int yy = blockIdx.y * TILE_DIM + threadIdx.y;
    int xx = blockIdx.x * TILE_DIM + threadIdx.x;
    if (xx < nEven && yy < nEven)
    {
        for (int i = 0; i < TILE_DIM; i++)
            for (int j = 0; j < TILE_DIM; j++)
                tile[i][j] = idata[(i + xx)*nEven + (j + yy)];
        __syncthreads();
        for (int i = 0; i < TILE_DIM; i++)
            for (int j = 0; j < TILE_DIM; j++){
                int temp1 = tile[i][j];
                idata[(j + yy)*nEven + (i + xx)] = temp1;
            }
    }
}
Please help me understand how I can use more than one thread in my tiling; I feel I'm missing something. I have tried many ways, but it keeps going out of bounds in memory and gives wrong data.
Many thanks.
Each thread in a block represents a value in the range [0..TILE_DIM-1], in both the x and y dimensions. Thus, a single instruction working with xx and yy will cover the whole area of your tile. There is no need for the additional for loops.
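A minimal sketch of that point, writing to a separate output array (odata) to sidestep the read/write hazard of transposing in place; nEven and TILE_DIM are assumed as in the question:
__global__ void transposeMain(const int *idata, int *odata)
{
    __shared__ int tile[TILE_DIM][TILE_DIM];
    int xx = blockIdx.x * TILE_DIM + threadIdx.x;
    int yy = blockIdx.y * TILE_DIM + threadIdx.y;

    // each thread moves exactly one element: no inner loops needed
    if (xx < nEven && yy < nEven)
        tile[threadIdx.y][threadIdx.x] = idata[yy * nEven + xx];
    __syncthreads();

    // transposed block coordinates: block (bx,by) writes tile (by,bx)
    int tx = blockIdx.y * TILE_DIM + threadIdx.x;
    int ty = blockIdx.x * TILE_DIM + threadIdx.y;
    if (tx < nEven && ty < nEven)
        odata[ty * nEven + tx] = tile[threadIdx.x][threadIdx.y];
}
Launched with dimBlock(TILE_DIM, TILE_DIM, 1) and the same dimGrid as above, each block then moves one whole tile.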

CUDA DCT works only when blockDim.x is 1

I'm just starting to learn CUDA and know only very basic stuff. I'm trying to develop a CUDA program that does an 8x8 DCT using the matrix multiplication method. An 8x8 DCT coefficient matrix D is computed, and the DCT transform is then D'AD. Each thread computes 1 data point, and each block is 8x8. I wrote a sequential DCT and compared the results in an output file.
Here's the problem: when the number of blocks is 1xN, everything works fine. When the number of blocks is MxN (M is any number greater than 1), the kernel gives wrong results. I think the problem is in my block indexing, but I couldn't find it.
Could anyone offer some help? I know it's a very basic program, but I really need it.
Any comments are gratefully appreciated!
Thanks in advance!
#include <stdio.h>
#include <stdlib.h>
#include "types.h"
#include "cuda.h"

static int DCT_bases[64] = { 2896,  2896,  2896,  2896,  2896,  2896,  2896,  2896,
                             4017,  3406,  2276,   799,  -799, -2276, -3406, -4017,
                             3784,  1568, -1568, -3784, -3784, -1568,  1568,  3784,
                             3406,  -799, -4017, -2276,  2276,  4017,   799, -3406,
                             2896, -2896, -2896,  2896,  2896, -2896, -2896,  2896,
                             2276, -4017,   799,  3406, -3406,  -799,  4017, -2276,
                             1568, -3784,  3784, -1568, -1568,  3784, -3784,  1568,
                              799, -2276,  3406, -4017,  4017, -3406,  2276,  -799 };
__device__ __constant__ int dDCT_bases[64];

__global__ void cudaDCT2D(int *src, int width) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int k;
    int sum = 0;
    int dct_i = threadIdx.y;
    int dct_j = threadIdx.x;
    __shared__ int temp[8][8];
    temp[dct_i][dct_j] = src[i*width + j];
    __syncthreads();
    sum = 0;
    for (k = 0; k < 8; k++) {
        sum += temp[dct_i][k] * dDCT_bases[dct_j*8 + k];
    }
    __syncthreads();
    temp[dct_i][dct_j] = sum >> 13;
    __syncthreads();
    sum = 0;
    for (k = 0; k < 8; k++) {
        sum += dDCT_bases[dct_i*8 + k] * temp[k][dct_j];
    }
    __syncthreads();
    src[i*width + j] = sum >> 13;
}
void myDCT2D(int *src, int width, int height) {
    int bi, bj;
    int i, j, k;
    int sum = 0;
    int temp[64];
    for (bi = 0; bi < width / 8; bi++) {
        for (bj = 0; bj < height / 8; bj++) {
            for (i = 0; i < 8; i++) {
                for (j = 0; j < 8; j++) {
                    for (k = 0; k < 8; k++) {
                        sum += src[i*8 + k] * DCT_bases[j*8 + k];
                    }
                    temp[i*8 + j] = sum >> 13;
                    sum = 0;
                }
            }
            for (i = 0; i < 8; i++) {
                for (j = 0; j < 8; j++) {
                    for (k = 0; k < 8; k++) {
                        sum += DCT_bases[i*8 + k] * temp[k*8 + j];
                    }
                    src[i*8 + j] = sum >> 13;
                    sum = 0;
                }
            }
            src += 64;
        }
    }
}
int main(int argc, char *argv[])
{
    int *matrix;
    int *m0;
    int i, j;
    int *d_m;
    int *m1;
    FILE* fp;
    int width = 8;
    int height = 8;
    if (argc > 1) {
        width = atoi(argv[1]);
        height = atoi(argv[2]);
    }
    if (width % 8 || height % 8) {
        printf("Width and Height has to be multiple of 8!\n");
        getchar();
        return 0;
    }
    matrix = (int *) malloc(sizeof(int) * width * height);
    m0 = (int *) malloc(sizeof(int) * width * height);
    m1 = (int *) malloc(sizeof(int) * width * height);
    fp = fopen("cuda_test.txt", "w");
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            matrix[i*width + j] = rand() % 256;
            m0[i*width + j] = matrix[i*width + j];
            m1[i*width + j] = matrix[i*width + j];
            fprintf(fp, "%d ", m0[i*width + j]);
        }
        fprintf(fp, "\n");
    }
    fprintf(fp, "\n");
    cudaMalloc((void**) &d_m, sizeof(int) * width * height);
    cudaMemcpy(d_m, m0, sizeof(int) * width * height, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(dDCT_bases, DCT_bases, sizeof(DCT_bases));
    // printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    dim3 dimGrid(width / 8, height / 8);
    dim3 dimBlock(8, 8);
    cudaDCT2D<<<dimGrid, dimBlock>>>(d_m, width);
    cudaMemcpy(m0, d_m, sizeof(int) * width * height, cudaMemcpyDeviceToHost);
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            fprintf(fp, "%d ", m0[i*width + j]);
        }
        fprintf(fp, "\n");
    }
    fprintf(fp, "\n");
    myDCT2D(m1, width, height);
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            fprintf(fp, "%d ", m1[i*width + j]);
        }
        fprintf(fp, "\n");
    }
    fprintf(fp, "\n");
    free(matrix);
    free(m0);
    free(m1);
    cudaFree(d_m);
    return 0;
}
I found the answer myself.
In fact there is nothing wrong with the CUDA program; I was just interpreting the matrix in a different way.
In CUDA I use a 2-D block structure, so a 16x16 matrix is interpreted by CUDA this way:
[ M1_8x8  M2_8x8
  M3_8x8  M4_8x8 ]
But in my C test program I assume that the first 8x8 numbers form the first matrix, so it becomes this:
[ M1_16x4
  M2_16x4
  M3_16x4
  M4_16x4 ]
So the matrices are different! That's why the results are not the same!
I think this will only happen to starters like me... :(
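For reference, a sketch of how the CPU version could index the same 8x8 blocks the GPU sees, by striding with the full row width instead of treating each block as 64 consecutive ints (an illustration, not the asker's code):
// Fetch element (i, j) of the 8x8 block at block-row bi, block-column bj
// inside a width-strided matrix, matching the CUDA 2-D grid layout.
int blockElement(const int *src, int width, int bi, int bj, int i, int j) {
    return src[(bi * 8 + i) * width + (bj * 8 + j)];
}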

CUDA: calling a device function from a global function

I am trying to call a device function from a global function. This function only declares an array to be used by all threads. But my problem is that when I printed the array, its elements were not in the same order as declared. Is that because all the threads are creating the array again? I am confused about threads. If so, can I find out which thread runs first in the global function and allow only that one to declare the array for the others? Thanks.
Here is my function to create the array:
__device__ float myArray[20][20];

__device__ void calculation(int no){
    filterWidth = 3 + (2*no);
    filterHeight = 3 + (2*no);
    int arraySize = filterWidth;
    int middle = (arraySize - 1) / 2;
    int startIndex = middle;
    int stopIndex = middle;
    // at first, all values of the array are 0
    for (int i = 0; i < arraySize; i++)
        for (int j = 0; j < arraySize; j++)
        {
            myArray[i][j] = 0;
        }
    // until the middle line of the array, the required indexes are 1
    for (int i = 0; i < middle; i++)
    {
        for (int j = startIndex; j <= stopIndex; j++)
        { myArray[i][j] = 1; sum += 1; }
        startIndex -= 1;
        stopIndex += 1;
    }
    // for the middle line
    for (int i = 0; i < arraySize; i++)
    { myArray[middle][i] = 1; sum += 1; }
    // after the middle line of the array, the required indexes are 1
    startIndex += 1;
    stopIndex -= 1;
    for (int i = (middle + 1); i < arraySize; i++)
    {
        for (int j = startIndex; j <= stopIndex; j++)
        { myArray[i][j] = 1; sum += 1; }
        startIndex += 1;
        stopIndex -= 1;
    }
    filterFactor = 1.0f / sum;
}
And the global function:
__global__ void FilterKernel(Format24bppRgb* imageData)
{
    int tidX = threadIdx.x + blockIdx.x * blockDim.x;
    int tidY = threadIdx.y + blockIdx.y * blockDim.y;
    Colour Cpixel = Colour(imageData[tidX + tidY*imageWidth]);
    float depthPixel = Colour(depthData[tidX + tidY*imageWidth]).Red;
    float absoluteDistanceFromFocus = fabs(depthPixel - focusDepth);
    if (depthPixel == 0)
        return;
    Colour Cresult = Cpixel;
    for (int i = 0; i < 8; i++)
    {
        calculation(i);
        ...
        ...
    }
}
If you really want to select one thread to call the function while the rest wait for it, use __shared__ memory for the array created by the device function so that all threads in a block see the same one, and call it like this:
for (int i = 0; i < 8; i++)
{
    if (threadIdx.x == 0 && threadIdx.y == 0)
        calculation(i);
    __syncthreads();
    ...
}
Of course, this won't work between blocks: in a __global__ function, you have no control over the order in which blocks are computed.
Instead, if you can, you should do the initialization calculation (which only one thread needs to do) on the CPU and memcpy it to the GPU before launching your kernel. It looks like you'll use 8x the memory for your myArrays, but it will dramatically speed up your computation.
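A minimal sketch of that host-side approach, assuming the diamond pattern that calculation() builds (myFilters and h_filters are illustrative names, and the per-filter sum/filterFactor could be precomputed the same way):
#include <cstdlib>

// Device storage for all 8 precomputed filters (20x20 each, as in the question).
__device__ float myFilters[8][20][20];

int main() {
    static float h_filters[8][20][20] = {};
    // Build each diamond-shaped filter on the CPU, reproducing the pattern
    // that calculation() builds per thread on the GPU.
    for (int no = 0; no < 8; no++) {
        int arraySize = 3 + 2 * no;
        int middle = (arraySize - 1) / 2;
        for (int i = 0; i < arraySize; i++)
            for (int j = 0; j < arraySize; j++)
                if (abs(i - middle) + abs(j - middle) <= middle)
                    h_filters[no][i][j] = 1.0f;
    }
    // One copy before the kernel launch; every thread then just reads myFilters.
    cudaMemcpyToSymbol(myFilters, h_filters, sizeof(h_filters));
    // FilterKernel<<<grid, block>>>(...);  // the kernel would index myFilters[i][y][x]
    return 0;
}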