CUDA DCT works only when blockDim.x is 1 - cuda

I'm just starting learning CUDA and knows only very basic stuff. I'm trying to develop a CUDA program that does 8x8 DCT using matrix multiplication method. An 8x8 DCT coefficient matrix D is computed, and the DCT transform is then D'AD. Each thread computes 1 data point and each block is 8x8. I wrote a sequential DCT and compare the results in an output file.
Here's the problem. When the number of blocks is 1xN, everything works fine. When the number of blocks is MxN, (M is any number greater than 1), the kernel gives wrong result. I think the problem should be my block indexing, but I couldn't find the problem.
Could anyone offer some help? I know it's a very basic program, but I really need it.
Any comments are gratefully appreciated!
Thanks in advance!
#include <stdio.h>
#include <stdlib.h>
#include "types.h"
#include "cuda.h"
static int DCT_bases[64]= {2896, 2896, 2896, 2896, 2896, 2896, 2896, 2896,
4017, 3406, 2276, 799, -799, -2276, -3406, -4017,
3784, 1568, -1568, -3784, -3784, -1568, 1568, 3784,
3406, -799, -4017, -2276, 2276, 4017, 799, -3406,
2896, -2896, -2896, 2896, 2896, -2896, -2896, 2896,
2276, -4017, 799, 3406, -3406, -799, 4017, -2276,
1568, -3784, 3784, -1568, -1568, 3784, -3784, 1568,
799, -2276, 3406, -4017, 4017, -3406, 2276, -799 };
__device__ __constant__ int dDCT_bases[64];
__global__ void cudaDCT2D(int *src, int width) {
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
int k;
int sum = 0;
int dct_i = threadIdx.y;
int dct_j = threadIdx.x;
__shared__ int temp[8][8];
temp[dct_i][dct_j] = src[i*width+j];
__syncthreads();
sum = 0;
for (k=0; k<8; k++) {
sum += temp[dct_i][k] * dDCT_bases[dct_j*8+k];
}
__syncthreads();
temp[dct_i][dct_j] = sum >> 13;
__syncthreads();
sum = 0;
for (k = 0; k < 8; k++) {
sum += dDCT_bases[dct_i*8+k] * temp[k][dct_j];
}
__syncthreads();
src[i*width+j] = sum >> 13;
}
void myDCT2D(int *src, int width, int height) {
int bi, bj;
int i, j, k;
int sum = 0;
int temp[64];
for (bi = 0; bi < width / 8; bi++) {
for (bj = 0; bj < height / 8; bj++) {
for (i=0; i<8; i++) {
for (j=0; j<8; j++) {
for (k = 0; k < 8; k++) {
sum += src[i*8+k] * DCT_bases[j*8+k];
}
temp[i*8+j] = sum >> 13;
sum = 0;
}
}
for (i=0; i<8; i++) {
for (j=0; j<8; j++) {
for (k=0; k < 8; k++) {
sum += DCT_bases[i*8+k] * temp[k*8+j];
}
src[i*8+j] = sum >> 13;
sum = 0;
}
}
src += 64;
}
}
}
int main (int argc, char *argv[])
{
int *matrix;
int *m0;
int i, j;
int *d_m;
int *m1;
FILE* fp;
int width = 8;
int height = 8;
if (argc > 1) {
width = atoi(argv[1]);
height = atoi(argv[2]);
}
if (width % 8 || height % 8) {
printf("Width and Height has to be multiple of 8!\n");
getchar();
return 0;
}
matrix = (int *) malloc(sizeof(int) * width * height);
m0 = (int *) malloc(sizeof(int) * width * height);
m1 = (int *) malloc(sizeof(int) * width * height);
fp = fopen("cuda_test.txt", "w");
for (i=0; i< height; i++) {
for (j = 0; j < width; j++) {
matrix[i*width+j] = rand()% 256;
m0[i*width+j] = matrix[i*width+j];
m1[i*width+j] = matrix[i*width+j];
fprintf(fp,"%d ", m0[i*width+j]);
}
fprintf(fp,"\n");
}
fprintf(fp, "\n");
cudaMalloc((void**) &d_m, sizeof(int) * width * height);
cudaMemcpy(d_m, m0, sizeof(int) * width * height, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(dDCT_bases, DCT_bases, sizeof(DCT_bases));
// printf("%s\n", cudaGetErrorString(cudaGetLastError()));
dim3 dimGrid(width / 8, height / 8);
dim3 dimBlock(8,8);
cudaDCT2D<<<dimGrid,dimBlock>>> (d_m, width);
cudaMemcpy(m0, d_m, sizeof(int) * width * height, cudaMemcpyDeviceToHost);
for (i=0; i< height; i++) {
for (j = 0; j < width; j++) {
fprintf(fp,"%d ", m0[i*width+j]);
}
fprintf(fp,"\n");
}
fprintf(fp, "\n");
myDCT2D(m1, width, height);
for (i=0; i< height; i++) {
for (j = 0; j < width; j++) {
fprintf(fp,"%d ", m1[i*width+j]);
}
fprintf(fp,"\n");
}
fprintf(fp,"\n");
free(matrix);
free(m0);
free(m1);
cudaFree(d_m);
return 0;
}

I find the answer myself.
In fact there's nothing wrong with the cuda program, but i'm interpreting the matrix in different way.
In CUDA, I use a 2-D block structure, so a 16x16 matrix would be interpreted by cuda in this way:
[ M1_8x8 M2_8x8
M3_8x8 M4_8x8]
But in my C test program, I'm assuming that the first 8x8 numbers are in the first matrix, so it becomes this:
[M1 16x4
M2 16x4
M3 16x4
M4 16x4]
So the matrix is different! That's why the results are not the same!
I think it will only happen for starters like me.... :(

Related

Having an issue detecting problem in my code | CUDA c

I'm having a hard time understanding where is the bug in my code.
The purpose of the project is to multiply two matrices and compare the time between sequential and parallel.
When I'm printing the Matrices I see that the device matrix is basically empty.
Also, I treated the matrices as an array of size n*n .
Thanks!
//This program computes the multiplication of two Matrices GPU using CUDA
#include <stdio.h>
#include <cassert>
__global__ void matrixMul(int * m,int * n,int * p,int size)
{
//Calculate Row and Column
int row=threadIdx.y*blockDim.y+threadIdx.y;
int column=threadIdx.x*blockDim.x+threadIdx.x;
int p_sum=0;
for (int i = 0; i < size; i++)
{
p_sum += m[row*size + i] * n[i*size +column];
}
p[row*size + column] = p_sum;
}
void matrixMul_seq(int * m,int * n,int * p,int size){
for(int i = 0; i < size; i++){
for(int j = 0; j < size; j++){
for(int k = 0; k < size; k++){
p[i*size +j] += m[i*size +k] +n[k*size +j];
}
}
}
}
//Initialize matricies
void init_matricies(int * mat,int n){
for(int i = 0; i < n; i++) {
for (int j = 0; j < n; j++)
{
mat[i*n+j]=rand()%1024;
}
}
}
int main(int argc,char **argv)
{
//Set our problem Size(Default = 2^10 == 1024)
int n = 1<<10;
printf("Square Matrix of size:%d\n",n);
//Size in Bytes
size_t bytes=n*n*sizeof(bytes);
//Host matricies
int *h_m;
int *h_p;
int *h_n;
int *h_p_seq;
//Host matricies
int *d_m;
int *d_p;
int *d_n;
//Memory allocation for Host Matricies
h_m=(int*)malloc(bytes);
h_n=(int*)malloc(bytes);
h_p=(int*)malloc(bytes);
h_p_seq=(int*)malloc(bytes);
init_matricies(h_m,n);
init_matricies(h_n,n);
//Allocate memory on device side
cudaMalloc(&d_n, bytes);
cudaMalloc(&d_m, bytes);
cudaMalloc(&d_p, bytes);
//Copy data to Device
cudaMemcpy(d_m,h_m, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_n,h_n, bytes, cudaMemcpyHostToDevice);
int threads_per_block =16;
dim3 block_size(threads_per_block,threads_per_block);
dim3 grid_size( n / block_size.x , n / block_size.y);
printf("Grid size X:%d, Grid size y:%d\n",grid_size.x,grid_size.y);
printf("THE RESULT OF THE SIZES: 2^6 * 2^4 * 2^6 * 2^4 \n");
matrixMul <<<grid_size,block_size>>>(d_m,d_n,d_p,n);
matrixMul_seq(h_m,h_n,h_p_seq,n);
cudaMemcpy(h_p,d_p, bytes, cudaMemcpyDeviceToHost);
for(int i = 0; i < n; i++){
for(int j = 0; j < n; j++){
//printf("Grid size X:%d, Grid size y:%d\n",h_p[ n * i + j],h_p_seq[ n * i + j]);
assert(h_p[ n * i + j]==h_p_seq[ n * i + j]);
}
}
free(h_m);
free(h_p);
free(h_n);
free(h_p_seq);
cudaFree(d_m);
cudaFree(d_n);
cudaFree(d_p);
return 0;
}
You have a variety of problems in your code:
You are calculating kernel index variables incorrectly. This is incorrect:
int row=threadIdx.y*blockDim.y+threadIdx.y;
int column=threadIdx.x*blockDim.x+threadIdx.x;
it should be:
int row=blockIdx.y*blockDim.y+threadIdx.y;
int column=blockIdx.x*blockDim.x+threadIdx.x;
The matrix operations in your calculation functions don't match each other. Kernel:
p_sum += m[row*size + i] * n[i*size +column];
^
multiplication
host code:
p[i*size +j] += m[i*size +k] +n[k*size +j];
^
addition
we also observe, from above, that the host code is doing a summation to the output variable (+=), whereas the the kernel is doing an assignment to the output variable (=):
p[row*size + column] = p_sum;
This has implications for the next issue.
malloc doesn't initialize data. Since this operation is creating the output array that will be used by the host code, which is doing a summation to it, we must initialize this allocation to zero:
h_p_seq=(int*)malloc(bytes);
memset(h_p_seq, 0, bytes); // must add this line to initialize to zero
The calculation of the size of your arrays in bytes is too large. You have defined your arrays to be of type int. But your size calculation is like this:
size_t bytes=n*n*sizeof(bytes);
An int is a 4-byte quantity, whereas a size_t variable like bytes is an 8-byte quantity. This doesn't cause an actual problem, but is unnecessary. I would suggest changing it to:
size_t bytes=n*n*sizeof(int);
With the above items addressed, your code runs correctly for me.

System get stuck on running matrix multiplication using CUDA

When i'm running this code on my system, after some seconds my system get stuck and i have to restart system again. So my question is what's i'm doing wrong here? Any suggestion will appreciated.
__global__ void matMul(float* d_M, float* d_N, float* d_P, int width) {
int row = blockIdx.y*width + threadIdx.y;
int col = blockIdx.x*width + threadIdx.x;
if (row < width && col < width) {
float product_val = 0;
for (int k = 0; k < width; k++) {
product_val += d_M[row*width + k] * d_N[k*width + col];
}
d_P[row*width + col] = product_val;
}
}
int main() {
const int n = 9;
float* d_M;
float* d_N;
float* d_P;
cudaMallocManaged(&d_M, SIZE * sizeof(float));
cudaMallocManaged(&d_N, SIZE * sizeof(float));
cudaMallocManaged(&d_P, SIZE * sizeof(float));
for (int i = 0; i < n; ++i) {
d_P[i] = 0;
}
int count = 0;
for (int i = 0; i < n; ++i) {
d_N[i] = ++count;
}
count = 0;
for (int i = 0; i < n; ++i) {
d_M[i] = ++count;
}
matMul <<<1, n>>> (d_M, d_N, d_P, 3);
cudaDeviceSynchronize();
for (int i = 0; i < n; ++i) {
printf("%f\n", d_P[i]);
}
cudaFree(d_N);
cudaFree(d_M);
cudaFree(d_P);
return 0;
}
Assuming that when you mean your system gets stuck, you get some kind of error in your program, it's likely that you're accessing memory that is invalid.
This could be in the higher indexes of your d_M and d_N iterations when k + row*width is indexing beyond the size of memory that you've allocated in cudaMallocManaged.
It's always good practice in situations like these to add some error handling using commands such as cudaPeekatLastError().
This link might be helpful for implementing some debugging.

cuda copy data which dynamic malloc in kernel from device memory

I met a problem about using cudaMemcpy with cudaMemcpyDeviceToHost.
There is a struct which have a pointer int* a, It will malloc in the kernel function.
And then I need copy this int* a to host memory.
My question is: I didn't know how it can not work by using cudaMemcpy.
There my codes:
#include <cuda_runtime.h>
#include <stdio.h>
typedef struct { int n, m; int *a; } myst;
__global__ void xthread(myst *st)
{
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
myst *mst = &st[idx];
mst->n = idx;
mst->m = idx+1;
mst->a = (int *)malloc((mst->m)*sizeof(int));
mst->a[0] = idx;
}
int main(int argc,char **argv)
{
dim3 dimGrid(1);
dim3 dimBlock(2);
myst *mst = NULL;
myst *hst = (myst *)malloc(2 * sizeof(myst));
cudaMalloc(&mst, 2 * sizeof(myst));
xthread<<<dimGrid, dimBlock>>>(mst);
cudaDeviceSynchronize();
cudaMemcpy(&hst[0],&mst[0],sizeof(myst),cudaMemcpyDeviceToHost);
cudaMemcpy(&hst[1],&mst[1],sizeof(myst),cudaMemcpyDeviceToHost);
int *pInt1 = (int *)malloc((hst[0].m)*sizeof(int)) ;
int *pInt2 = (int *)malloc((hst[1].m)*sizeof(int)) ;
cudaMemcpy(pInt1, hst[0].a, (hst[0].m)*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(pInt2, hst[1].a, (hst[1].m)*sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\t%d\t%d\n",hst[0].n,hst[0].m, pInt1[0]);
printf("%d\t%d\t%d\n",hst[1].n,hst[1].m, pInt2[0]);
free(pInt1);
free(pInt2);
return 0;
}
The codes will go warning about "Cuda API error detected: cudaMemcpy returned (0xb)"
I saw a similar question : copy data which is allocated in device from device to host
But it seem that can not solve my problem.
Thx.
Alright, I work it out with a stupid way (-.-!!).
While return form the kernel function, I count how many space I have to malloc in Host and Device, and cudaMalloc again a big space . Next, in other kernel function named ythread, copy the data which in the Heap to the big space.
typedef struct { int n, m; int *a; } myst;
__global__ void xthread(myst *st) {
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
myst *mst = &st[idx];
mst->n = idx;
mst->m = idx + 1;
mst->a = (int *) malloc((mst->m) * sizeof(int));
for (int i = 0; i < mst->m; i++) {
mst->a[i] = idx + 900 + i * 10;
}
}
__global__ void ythread(myst *st, int *total_a) {
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
myst *mst = &st[idx];
int offset=0;
for(int i=0; i<idx; i++) {
offset += st[i].m;
}
for(int i=0; i<mst->m; i++) {
total_a[offset+i] = mst->a[i];
}
}
int main(int argc,char **argv) {
dim3 dimGrid(1);
dim3 dimBlock(2);
myst *mst = NULL;
cudaMalloc((void**)&mst, dimBlock.x * sizeof(myst));
xthread<<<dimGrid, dimBlock>>>(mst);
cudaDeviceSynchronize();
myst *hst = (myst *)malloc(dimBlock.x * sizeof(myst));
cudaMemcpy(hst, mst, dimBlock.x*sizeof(myst),cudaMemcpyDeviceToHost);
int t_size = 0;
for(int i=0; i<dimBlock.x; i++) {
t_size += hst[i].m;
}
printf("t_size:%d\n", t_size);
int * t_a_h = (int *)malloc(t_size*sizeof(int));
int * t_a_d = NULL;
cudaMalloc((void**)&t_a_d, t_size*sizeof(int));
ythread<<<dimGrid, dimBlock>>>(mst, t_a_d);
cudaDeviceSynchronize();
cudaMemcpy(t_a_h, t_a_d, t_size*sizeof(int),cudaMemcpyDeviceToHost);
for(int i=0; i<t_size; i++) {
printf("t_a_h[%d]:%d\n", i, t_a_h[i]);
}
free(t_a_h);
cudaFree(mst);
cudaFree(t_a_d);
return 0;
}
Emmmmmm, it work, but I think there is a better way to solve this problem.

How to create and use a 1D layered texture in CUDA

I am new to CUDA. I have figured out how to do 1D and 2D textures in CUDA. However, I am struggling with how to use a 1D layered texture. The output of my kernel which uses the texture is all zeros, which is definitely incorrect. However, I am not sure what I am doing wrong. I have serious doubts that I set up this texture correctly, but I checked for cuda errors everywhere and couldn't find any issues. Can someone show me how to correctly set up a 1D layered texture and use it. Here is my code. Thanks in advance:
// To Compile: nvcc backproj.cu -o backproj.out
// To Run: ./backproj.out
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#define pi acos(-1)
// 1D float textures
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;
// 1D interpolation kernel: Should be very similar to what you get if you used 1D interpolation on MATLAB
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int layer) {
unsigned int location_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (location_idx < numlocations) {
// Get the location you want to interpolate from the array
float loc2find = (float) d_locations[location_idx] + 0.5f;
// Read from texture and write to global memory
d_output[location_idx] = tex1DLayered(texRef, loc2find, layer);
}
}
// Host code
int main()
{
// Setup h_data and locations to interpolate from
const unsigned int len = 10;
const unsigned int numlayers = 3;
const unsigned int upsamp = 3;
const unsigned int loclen = 1 + (len - 1) * upsamp;
float idx_spacing = 1/(float)upsamp;
float h_data[len][numlayers], h_loc[loclen];
for (int i = 0; i < len; i++)
for (int j = 0; j < numlayers; j++)
h_data[i][j] = 1+cosf((float) pi*i/(j+1.0f));
for (int i = 0; i < loclen; i ++)
h_loc[i] = i*idx_spacing;
// Get the memory locations you want
float* d_loc;
cudaMalloc(&d_loc, loclen * sizeof(float));
cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice);
// Allocate CUDA array in device memory
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray* cuArray;
cudaMallocArray(&cuArray, &channelDesc, len, numlayers);
// Copy to device memory some data located at address h_data in host memory
cudaMemcpyToArray(cuArray, 0, 0, h_data, len * numlayers * sizeof(float), cudaMemcpyHostToDevice);
// Set texture reference parameters
texRef.addressMode[0] = cudaAddressModeBorder;
texRef.filterMode = cudaFilterModeLinear;
texRef.normalized = false;
// Bind the array to the texture reference
cudaBindTextureToArray(texRef, cuArray, channelDesc);
// Allocate result of transformation in device memory
float* d_output;
cudaMalloc(&d_output, loclen * sizeof(float));
// Invoke kernel
int thdsPerBlk = 256;
int blksPerGrid = (int) (loclen / thdsPerBlk) + 1;
printf("Threads Per Block: %d, Blocks Per Grid: %d\n", thdsPerBlk, blksPerGrid);
interp1Kernel <<<blksPerGrid, thdsPerBlk >>>(d_output, d_loc, loclen, 0);
// Print Results
printf("\n Original Indices \n");
for (int i = 0; i < len; i++) printf(" %d ", i);
printf("\n Original array \n");
for (int i = 0; i < len; i++) printf("%5.3f ", h_data[i][0]);
printf("\n Output Indices \n");
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("\n Output Array \n");
cudaMemcpy(h_loc, d_output, loclen * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("\n");
// Free device memory
cudaFreeArray(cuArray);
cudaFree(d_output);
return 0;
}
You must use cudaMalloc3DArray with the cudaArrayLayered flag set to allocate memory for layered textures. There is a complete example of layered texture usage in the toolkit samples which you can study to see how they work.
Unfortunately, the CUDA SDK only shows you how to do it when you have 2D layered texture. There is some more trickiness when it comes to 1D layered textures. It turns out you have to put a 0 into the second argument for make_cudaExtent when making the extentDesc as follows:
cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers); // <-- 0 height required for 1Dlayered
However, when using make_cudaExtent for mParams.extent for cudaMemcpy3D, you still need to put a 1 for the second argument:
mParams.extent = make_cudaExtent(len, 1, numlayers); // <<-- non zero height required for memcpy to do anything
Furthermore, there are some other non-obvious details such as the pitch for make_cudaPitchedPtr. So I have included my complete and functioning code for the 1D layered texture. I couldn't find an example of this anywhere. So hopefully this will help out others who are in the same boat:
// To Compile: nvcc layeredTexture1D.cu -o layeredTexture1D.out
// To Run: ./layeredTexture1D.out
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#define pi acos(-1)
// 1D float textures: x is for input values, y is for corresponding output values
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;
// 1D interpolation kernel: Should be very similar to what you get if you used 1D interpolation on MATLAB
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int numlayers) {
unsigned int location_idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int layer = blockIdx.y * blockDim.y + threadIdx.y;
if (location_idx < numlocations && layer < numlayers) {
// Get the location you want to interpolate from the array
float loc2find = (float)d_locations[location_idx] + 0.5f;
// Read from texture and write to global memory
d_output[location_idx + layer*numlocations] = tex1DLayered(texRef, loc2find, layer);
//printf("location=%d layer=%d loc2find=%f result=%f \n", location_idx, layer, loc2find, d_output[location_idx]);
}
}
// Host code
int main()
{
// Setup h_data and locations to interpolate from
const unsigned int len = 7;
const unsigned int numlayers = 3;
const unsigned int upsamp = 4;
const unsigned int loclen = 1 + (len - 1) * upsamp;
float idx_spacing = 1 / (float)upsamp;
float h_data[numlayers*len], h_loc[loclen];
for (int i = 0; i < len; i++)
for (int j = 0; j < numlayers; j++)
h_data[len*j + i] = 1 + cosf((float)pi*i / (j + 1.0f));
for (int i = 0; i < loclen; i++)
h_loc[i] = i*idx_spacing;
// Get the memory locations you want
float* d_loc;
cudaMalloc(&d_loc, loclen * sizeof(float));
cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice);
// Allocate CUDA array in device memory
cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers); // <-- 0 height required for 1Dlayered
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaMemcpy3DParms mParams = { 0 };
mParams.srcPtr = make_cudaPitchedPtr(h_data, len*sizeof(float), len, 1);
mParams.kind = cudaMemcpyHostToDevice;
mParams.extent = make_cudaExtent(len, 1, numlayers); // <<-- non zero height required for memcpy to do anything
cudaArray* cuArray;
cudaMalloc3DArray(&cuArray, &channelDesc, extentDesc, cudaArrayLayered);
mParams.dstArray = cuArray;
cudaMemcpy3D(&mParams);
// Set texture reference parameters
texRef.addressMode[0] = cudaAddressModeBorder;
texRef.filterMode = cudaFilterModeLinear;
texRef.normalized = false;
// Bind the array to the texture reference
cudaBindTextureToArray(texRef, cuArray, channelDesc);
// Allocate result of transformation in device memory
float *d_output;
cudaMalloc(&d_output, loclen * numlayers * sizeof(float));
float h_output[loclen * numlayers];
// Invoke kernel
dim3 dimBlock(16, 16, 1);
dim3 dimGrid((loclen + dimBlock.x - 1) / dimBlock.x,
(numlayers + dimBlock.y - 1) / dimBlock.y, 1);
interp1Kernel<<<dimGrid, dimBlock>>>(d_output, d_loc, loclen, numlayers);
// Print Results
printf("\n Original Indices \n");
for (int i = 0; i < len; i++) printf(" %d ", i);
printf("\n Original array \n");
for (int j = 0; j < numlayers; j++) {
for (int i = 0; i < len; i++) {
printf("%5.3f ", h_data[i + j*len]);
}
printf("\n");
}
printf("\n Output Indices \n");
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("\n Output Array \n");
cudaMemcpy(h_output, d_output, loclen * numlayers * sizeof(float), cudaMemcpyDeviceToHost);
for (int j = 0; j < numlayers; j++) {
for (int i = 0; i < loclen; i++) {
printf("%5.3f ", h_output[i + j*loclen]);
}
printf("\n");
}
printf("\n");
// Free device memory
cudaFreeArray(cuArray);
cudaFree(d_output);
return 0;
}

Complicated for loop to be ported to a CUDA kernel

I have the next for nested loop and I would like to port it to CUDA to be run on a GPU
int current=0;
int ptr=0;
for (int i=0; i < Nbeans; i++){
for(int j=0;j< NbeamletsPerbeam[i];j++){
current = j + ptr;
for(int k=0;k<Nmax;k++){
......
}
ptr+=NbeamletsPerbeam[i];
}
}
I would be very happy if any body has an idea of how to do it or how can be done.
We are talking about Nbeams=5, NbeamletsPerBeam around 200 each.
This is what I currently have but I am not sure it is right...
for (int i= blockIdx.x; i < d_params->Nbeams; i += gridDim.x){
for (int j= threadIdx.y; j < d_beamletsPerBeam[i]; j+= blockDim.y){
currentBeamlet= j+k;
for (int ivoxel= threadIdx.x; ivoxel < totalVoxels; ivoxel += blockDim.x){
I would suggest this idea. But you might need to do some minor modifications based on your code.
dim3 blocks(NoOfThreads, 1);
dim3 grid(Nbeans, 1);
kernel<<grid, blocks, 1>>()
__global__ kernel()
{
int noOfBlocks = ( NbeamletsPerbeam[blockIdx.x] + blockDim.x -1)/blockDim.x;
for(int j=0; j< noOfBlocks;j++){
// use threads and compute....
if( (threadIdx.x * j) < NbeamletsPerbeam[blockIdx.x]) {
current = (threadIdx.x * j) + ptr;
for(int k=0;k<Nmax;k++){
......
}
ptr+=NbeamletsPerbeam[blockIdx.x];
}
}
}
This should do the trick and gives you better parallelization.