CUDA copy jagged array from host to device - cuda

So I'm trying to copy a jagged array from host to device. First of all here is my current understanding of cudaMalloc and cudaMemcpy:
cudaMalloc takes a pointer to the pointer to the memory block.
cudaMemcpy takes a pointer to the memory block to copy to or from.
Correct me if I'm wrong please.
Now this is my code that doesn't work (compiles fine but no output):
__global__ void kernel(int** arr)
{
for (int i=0; i<3; i++)
printf("%d\n", arr[i][0]);
}
int main()
{
int arr[][3] = {{1},{2},{3}}; // 3 arrays, 1 element each
int **d_arr;
cudaMalloc((void**)(&d_arr), sizeof(int*)*3); // allocate for 3 int pointers
for (int i=0; i<3; i++)
{
cudaMalloc( (void**) &(d_arr[i]), sizeof(int) * 1 ); // allocate for 1 int in each int pointer
cudaMemcpy(d_arr[i], arr[i], sizeof(int) * 1, cudaMemcpyHostToDevice); // copy data
}
kernel<<<1,1>>>(d_arr);
cudaDeviceSynchronize();
cudaDeviceReset();
}
So what am I doing wrong here?
Cheers

I found out why, it's because cudaMalloc and cudaMemcpy expect pointers that exist on the host and not on the device.
In my for-loop I was trying to fill pointers that exist on the device, in code that runs on host !
The right way is to make an intermediate variable, a pointer on host that points to memory on the device, fill it with integers, then copy that pointer into the jagged array (the pointer on pointers) !
This is the correct version:
__global__ void kernel(int** arr)
{
for (int i=0; i<3; i++)
printf("%d\n", arr[i][0]);
}
int main()
{
int arr[][3] = {{1},{2},{3}}; // 3 arrays, 1 element each
int **d_arr;
cudaMalloc((void***)(&d_arr), sizeof(int*)*3); // allocate for 3 int pointers
for (int i=0; i<3; i++)
{
int* temp;
cudaMalloc( (void**) &(temp), sizeof(int) * 1 ); // allocate for 1 int in each int pointer
cudaMemcpy(temp, arr[i], sizeof(int) * 1, cudaMemcpyHostToDevice); // copy data
cudaMemcpy(d_arr+i, &temp, sizeof(int*), cudaMemcpyHostToDevice);
}
kernel<<<1,1>>>(d_arr);
cudaDeviceSynchronize();
cudaDeviceReset();
}

Your kernel calls printf(), which is used to be (until CC2.0) a host function. Everything ok here. ;)
cudaMemcpy((void*)d_arr, (void*)arr, sizeof(int*)*3, cudaMemcpyHostToDevice); copies the Memory adresses of your Arrays on the host to the device. That makes no sense. Since you now have pointers to host memory on the device.
You can not allocate 2d Arrays that particular way in CUDA. See http://www.stevenmarkford.com/allocating-2d-arrays-in-cuda/.

Related

How to access device 2D array global variable from host

I want to print d_t global 2D array variable using "printf" inside main method. But I got a compile warning saying that:
a __device__ variable "d_t" cannot be directly read in a host function
How can I copy global 2D array variable from device to host and then print the first column of each row?
__device__ double *d_t;
__device__ size_t d_gridPitch;
__global__ void kernelFunc()
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
double* rowt = (double*)((char *)d_t + i * d_gridPitch);
rowt[0] = rowt[0] + 40000;
}
int main()
{
int size = 16;
size_t d_pitchLoc;
double *d_tLoc;
cudaMallocPitch((void**)&d_tLoc, &d_pitchLoc, size * sizeof(double), size);
cudaMemset2D(d_tLoc, d_pitchLoc, 0, size * sizeof(double), size);
cudaMemcpyToSymbol(d_gridPitch, &d_pitchLoc, sizeof(int));
cudaMemcpyToSymbol(d_t, & d_tLoc, sizeof(d_tLoc));
kernelFunc<<<1,size>>>();
for(int i=0; i< size; i++){
double* rowt = (double*)((char *)d_t + i * d_gridPitch);
printf("%.0f, ",rowt[0]);
}
cudaDeviceReset();
return 0;
}
As indicated in comments, the cudaMemcpy2D API is designed for exactly this task. You must allocate or statically define a host memory buffer or container to act as storage for the data from the device, and then provide the pitch of that host buffer to the cudaMemcpy2D call. The API handles the pitch conversion without any further intervention on the caller side.
If you replace the print loop with something like this:
double* h_t = new double[size * size];
cudaMemcpy2D(h_t, size * sizeof(double), d_tLoc, d_pitchLoc,
size * sizeof(double), size, cudaMemcpyDeviceToHost);
for(int i=0, j=0; i< size; i++){
std::cout << h_t[i * size + j] << std::endl;
}
[Note I'm using iostream here for the printing. CUDA uses a C++ compiler for compiling host code and you should prefer iostream functions over cstdio because they are less error prone and support improve diagnostics on most platforms].
You can see that the API call form is very similar to the cudaMemset2D call that I provided for you in your last question.

loop unrolling with dynamic parallelism decrease the time performance

I have a simple program to calculate square root, loop unrolling was done as
loop unrolling
#include <stdio.h>
#include <cuda.h>
__global__ void square(float *a, int N,int idx);
// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
int idx = 2*(blockIdx.x * blockDim.x + threadIdx.x);
int n=N;
//printf("%d\n",n);
for(int q=0;q<2;q++)
{
if(N<2000)
{
arr[idx+q] = arr[idx+q] * arr[idx+q];
}
}
}
// main routine that executes on the host
int main(void)
{
clock_t start = clock(),diff;
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 1000; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 4;
//int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
first <<< 4, 128 >>> (a_d, N);
//cudaThreadSynchronize();
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
diff = clock() - start;
int msec = diff * 1000 / CLOCKS_PER_SEC;
printf("Time taken %d seconds %d milliseconds\n", msec/1000, msec%1000);
}
then realizing that the loop calculation can be minimized with dynamic parallelism .
unrolling with dynamic parallelism was implemented as
unrolling with dynamic parallelism
#include <stdio.h>
#include <cuda.h>
__global__ void square(float *a, int N,int idx);
// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
int idx = 2*(blockIdx.x * blockDim.x + threadIdx.x);
int n=N;
square <<< 1,2 >>> (arr, n,idx);
}
__global__ void square(float *a, int N,int idx)
{
int tdx = blockIdx.x * blockDim.x + threadIdx.x;
printf("%d\n",N);
if(N<2000)
{
a[tdx+idx] = a[tdx+idx] * a[tdx+idx];
}
}
// main routine that executes on the host
int main(void)
{
clock_t start = clock(),diff;
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 1000; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 4;
//int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
first <<< 4, 128 >>> (a_d, N);
//cudaThreadSynchronize();
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
diff = clock() - start;
int msec = diff * 1000 / CLOCKS_PER_SEC;
printf("Time taken %d seconds %d milliseconds\n", msec/1000, msec%1000);
}
the implementation of dynamic parallelism with unrolling takes more time for executio than only unrolling. Aren,t we suppose to improve execution time with dynamic parallelism in such case?
Dynamic parallelism is mainly useful in cases where you have parallelism that is dynamic. That is: cases where you don't know how much parallelism you're going to need until you've done some calculation. Rather than transfer data back to the host which is then instantly fed into parameterising another launch, you launch from within the kernel. In this pattern, with memcpys between kernel launches avoided, you'll see speedup.
In your example above this is not the case. You could have just launched twice as many threads from the host. There's nothing dynamic required as there's no parallelism available there that you didn't know about at the time of the first kernel launch.
Furthermore, performance requirements for kernels launched using dynamic parallelism are similar to that of those launched from the host. You have to launch a reasonable amount of work or the launch latency will dominate your computation time.

simple sorting using thrust not working

I have a cuda thrust program as
#include <stdio.h>
#include<iostream>
#include <cuda.h>
#include <thrust/sort.h>
// main routine that executes on the host
int main(void)
{
int *a_h, *a_d; // Pointer to host & device arrays
const int N = 10; // Number of elements in arrays
size_t size = N * sizeof(int);
a_h = (int *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size);// Allocate array on device
std::cout<<"enter the 10 numbers";
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++)
{
std::cin>>a_h[i];
}
for (int i=0; i<N; i++) printf("%d %d\n", i, a_h[i]);
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
thrust::sort(a_d, a_d + N);
// Do calculation on device:
cudaMemcpy(a_h, a_d, sizeof(int)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %d\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
}
but it is not running to give the desired output.
Are we supposed to use the host vector and device vector for sorting in thrust????
For device operations you should use either a device pointer, or a device_vector iterator, not raw pointers. Raw pointers (that point to host memory) can be used for operations on the host.
So if you modify your code as follows:
#include <thrust/device_ptr.h>
...
thrust::device_ptr<int> t_a(a_d); // add this line before the sort line
thrust::sort(t_a, t_a + N); // modify your sort line
I believe it will work for you.
You may wish to read the thrust quick start guide. In particular note this section:
You may wonder what happens when a "raw" pointer is used as an argument to a Thrust function. Like the STL, Thrust permits this usage and it will dispatch the host path of the algorithm. If the pointer in question is in fact a pointer to device memory then you'll need to wrap it with thrust::device_ptr before calling the function

cudaMemCpy2D getting crashed

I am trying to implement Sauvola Binarization in cuda.For this I have read the image in a 2d array in host and allocating memory for 2D array in device using pitch.After allocating the memory I am trying to copy the host 2D array to Device 2d Array using cudaMemcpy2D,it compiles fine but it crashes here on runtime.I am unable to understand where am I missing,Kindly suggest something.The code which I have written is as follows:
#include "BinMain.h"
#include "Binarization.h"
#include <stdlib.h>
#include <stdio.h>
#include <conio.h>
#include <cuda.h>
#include <cuda_runtime.h>
void printDevProp(cudaDeviceProp);
void CUDA_SAFE_CALL( cudaError_t);
int main()
{
//Read an IplImage in imgOriginal as grayscale
IplImage * imgOriginal = cvLoadImage("E:\\1.tiff",CV_LOAD_IMAGE_GRAYSCALE);
//Create a size variable of type CvSize for cvCreateImage Parameter
CvSize size = cvSize(imgOriginal->width,imgOriginal->height);
//create an image for storing the result image with same height and width as imgOriginal
IplImage * imgResult = cvCreateImage(size,imgOriginal->depth,imgOriginal- >nChannels);
//Create a 2D array for storing the pixels value of each of the pixel of imgOriginal grayscale image
int ** arrOriginal = (int **)malloc(imgOriginal->height * sizeof(int *));
for (int i = 0; i < imgOriginal->height; i++)
{
arrOriginal[i] = (int*)malloc(imgOriginal->width * sizeof(int));
}
//Create a 2D array for storing the returned device array
int ** arrReturn = (int **)malloc(imgOriginal->height * sizeof(int *));
for (int i = 0; i < imgOriginal->height; i++)
{
arrReturn[i] = (int*)malloc(imgOriginal->width * sizeof(int));
}
//Create a CvScalar variable to copy pixel values in 2D array (arrOriginal)
CvScalar s;
//Copying the pixl values
for(int j = 0;j<imgOriginal->height;j++)
{
for(int k =0;k<imgOriginal->width;k++)
{
s = cvGet2D(imgOriginal,j,k);
arrOriginal[j][k] = s.val[0];
}
}
//Cuda Device Property
int devCount;
cudaGetDeviceCount(&devCount);
printf("CUDA Device Query...\n");
printf("There are %d CUDA devices.\n", devCount);
// Iterate through devices
for (int i = 0; i < devCount; ++i)
{
// Get device properties
printf("\nCUDA Device #%d\n", i);
cudaDeviceProp devProp;
cudaGetDeviceProperties(&devProp, i);
printDevProp(devProp);
}
//Start the clock
clock_t start = clock();
//Allocating Device memory for 2D array using pitch
size_t host_orig_pitch = imgOriginal->width * sizeof(int)* imgOriginal->height; //host original array pitch in bytes
size_t dev_pitch; //device array pitch in bytes which will be used in cudaMallocPitch
size_t dev_pitchReturn; //device return array pitch in bytes
size_t host_ret_pitch = imgOriginal->width * sizeof(int)* imgOriginal->height; //host return array pitch in bytes
int * devArrOriginal; //device 2d array of original image
int * result; //device 2d array for returned array
int dynmicRange = 128; //Dynamic Range for calculating the threshold from sauvola's formula
//Allocating memory by using cudaMallocPitch
CUDA_SAFE_CALL(cudaMallocPitch((void**)&devArrOriginal,&dev_pitch,imgOriginal->width * sizeof(int),imgOriginal->height * sizeof(int)));
//Allocating memory for returned array
CUDA_SAFE_CALL(cudaMallocPitch((void**)&result,&dev_pitchReturn,imgOriginal->width * sizeof(int),imgOriginal->height * sizeof(int)));
//Copying 2D array from host memory to device mempry by using cudaMemCpy2D
CUDA_SAFE_CALL(cudaMemcpy2D((void*)devArrOriginal,dev_pitch,(void*)arrOriginal,host_orig_pitch,imgOriginal->width * sizeof(float),imgOriginal->height,cudaMemcpyHostToDevice));
int windowSize = 19; //Size of the window for calculating mean and variance
//Launching the kernel by calling myKernelLauncher function.
myKernelLauncher(devArrOriginal,result,windowSize,imgOriginal->width,imgOriginal- >height,dev_pitch,dynmicRange);
//Calling the sauvola binarization function by passing the parameters as
//1.arrOriginal 2D array 2.Original image height 3.Original image width
//int ** result = AdaptiveBinarization(arrOriginal,imgOriginal->height,imgOriginal- >width);//binarization(arrOriginal,imgOriginal->width,imgOriginal->height);
//
CUDA_SAFE_CALL(cudaMemcpy2D(arrReturn,host_ret_pitch,result,dev_pitchReturn,imgOriginal->width * sizeof(int),imgOriginal->height * sizeof(int),cudaMemcpyDeviceToHost));
//create a CvScalar variable to set the data in imgResult
CvScalar ss;
//Copy the pixel values from returned array to imgResult
for(int i=0;i<imgOriginal->height;i++)
{
for(int j=0;j<imgOriginal->width;j++)
{
ss = cvScalar(arrReturn[i][j]*255);
cvSet2D(imgResult,i,j,ss);
//k++; //No need for k if returned array is 2D
}
}
printf("Done \n");
//calculate and print the time elapsed
printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
//Create a windoe and show the resule image
cvNamedWindow("Result",CV_WINDOW_AUTOSIZE);
cvShowImage("Result",imgResult);
cvWaitKey(0);
getch();
//Release the various resources
cvReleaseImage(&imgResult);
cvReleaseImage(&imgOriginal);
cvDestroyWindow("Result");
for(int i = 0; i < imgOriginal->height; i++)
free(arrOriginal[i]);
free(arrOriginal);
free(result);
cudaFree(&devArrOriginal);
cudaFree(&result);
}
// Print device properties
void printDevProp(cudaDeviceProp devProp)
{
printf("Major revision number: %d\n", devProp.major);
printf("Minor revision number: %d\n", devProp.minor);
printf("Name: %s\n", devProp.name);
printf("Total global memory: %u\n", devProp.totalGlobalMem);
printf("Total shared memory per block: %u\n", devProp.sharedMemPerBlock);
printf("Total registers per block: %d\n", devProp.regsPerBlock);
printf("Warp size: %d\n", devProp.warpSize);
printf("Maximum memory pitch: %u\n", devProp.memPitch);
printf("Maximum threads per block: %d\n", devProp.maxThreadsPerBlock);
for (int i = 0; i < 3; ++i)
printf("Maximum dimension %d of block: %d\n", i, devProp.maxThreadsDim[i]);
for (int i = 0; i < 3; ++i)
printf("Maximum dimension %d of grid: %d\n", i, devProp.maxGridSize[i]);
printf("Clock rate: %d\n", devProp.clockRate);
printf("Total constant memory: %u\n", devProp.totalConstMem);
printf("Texture alignment: %u\n", devProp.textureAlignment);
printf("Concurrent copy and execution: %s\n", (devProp.deviceOverlap ? "Yes" : "No"));
printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount);
printf("Kernel execution timeout: %s\n", (devProp.kernelExecTimeoutEnabled ? "Yes" : "No"));
return;
}
/* Utility Macro : CUDA SAFE CALL */
void CUDA_SAFE_CALL( cudaError_t call)
{
cudaError_t ret = call;
switch(ret)
{
case cudaSuccess:
break;
default :
{
printf(" ERROR at line :%i.%d' ' %s\n",
__LINE__,ret,cudaGetErrorString(ret));
exit(-1);
break;
}
}
}
The flow of the code is as follows:
1. Create a 2D array in host from image and another array for returned array from kernel.
2. Allocate memory for a 2D array in device using CudaMallocPitch
3. Allocate memory for a 2d array which will be returned by kernel.
4. Copy the original 2d array from host to device array using cudaMemcpy2d.
5. Launch the Kernel.
6. Copy the returned device array to host array using cudaMemcpy2D.
The program is crashing while it reaches to 4th point.It is an unhandled exception stating "Unhandled exception at 0x773415de in SauvolaBinarization_CUDA_OpenCV.exe: 0xC0000005: Access violation reading location 0x01611778."
I think the problem must be while allocating the memory,but I am using the function first time and have no idea how it works,kindly suggest.
First of all, you're not calling "cudaMallocPitch" properly. The "height" parameter should represent the number of rows, so instead of :
imgOriginal->height * sizeof(int)
you should simply use:
imgOriginal->height
This is fine because the number of bytes per row is already contained in the "pitch" property. The main problem, however, lies with the way you allocate the memory for the host image. When you write:
//Create a 2D array for storing the pixels value of each of the pixel of imgOriginal grayscale image
int ** arrOriginal = (int **)malloc(imgOriginal->height * sizeof(int *));
for (int i = 0; i < imgOriginal->height; i++)
{
arrOriginal[i] = (int*)malloc(imgOriginal->width * sizeof(int));
}
you are effectively creating an array with pointers to arrays. The CUDA API call that you
're making:
CUDA_SAFE_CALL(cudaMemcpy2D((void*)devArrOriginal,dev_pitch,(void*)arrOriginal,host_orig_pitch,imgOriginal->width * sizeof(float),imgOriginal->height,cudaMemcpyHostToDevice));
expects that the input memory buffer is contiguous. So here's what will happen: the first row from the input image (totalling "imgOriginal->width * sizeof(float)" bytes) will be read starting with the address:
(void*)arrOriginal
However, the amount of valid data you have starting at that address is only "imgOriginal->height * sizeof(int *)" bytes. The two byte counts are very likely to be different, which will lead to the crash because you will end up reading from an unknown location.
To solve this, consider allocating "arrOriginal" as one contiguous block, such as:
int * arrOriginal = (int *)malloc(imgOriginal->height * imgOriginal->width * sizeof(int));
Also, in this case, your pitch should be:
"imgOriginal->width * sizeof(int)"

tex1Dfetch() does not record change in values

In the below code, I first bind the texture called ref to an array called gpu in the global memory. Then I call a function called getVal in which i first set the value of gpu[1] to 5 and then read it using the binded texture using tex1Dfetch(ref,1). However, in this case, tex1Dfetch() does not display the changed value of gpu[5], instead it displays the old value.
Then, I call another function called getagain which just reads tex1Dfetch(ref,1) again. However, this time i get the new value . I really do not understand that why in the first function I do not get the changed value.
#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
texture<int> ref;
__global__ void getVal(int *c, int *gpu){
gpu[1] = 5;
*c = tex1Dfetch(ref, 1); // returns old value, not 5
}
__global__ void getagain(int *c){
*c = tex1Dfetch(ref, 1); // returns new value !!!????
}
void main(){
int *gpu,*c;
int i,b[10];
for( i =0 ; i < 10; i++){
b[i] = i*3;
}
cudaMalloc((void**)&gpu, sizeof(int) * 10);
cudaBindTexture(NULL, ref, gpu,10*sizeof(int));
cudaMemcpy(gpu, b, 10 * sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void**)&c, sizeof(int));
//try changing value and reading using tex1dfetch
getVal<<<1,1>>>(c,gpu);
cudaMemcpy(&i, c,sizeof(int), cudaMemcpyDeviceToHost);
printf("the value returned by tex fetch is %d\n" , i);
cudaMemcpy(b, gpu,10*sizeof(int), cudaMemcpyDeviceToHost);
for( i =0 ; i < 10; i++){
printf("%d\n",b[i]);
}
getagain<<<1,1>>>(c);
cudaMemcpy(&i, c,sizeof(int), cudaMemcpyDeviceToHost);
printf("the value returned by tex fetch is %d\n" , i);
getchar();
}
Within the same kernel call, the texture cache does not maintain coherency with global memory. See section 3.2.10.4 of the CUDA 4.0 C Programming Guide. Coherency of the texture cache between consecutive kernel calls is achieved by the driver flushing the texture cache prior to launching a kernel.