I want to print the global 2D device array d_t using printf inside the main function, but I get a compile warning saying:
a __device__ variable "d_t" cannot be directly read in a host function
How can I copy a global 2D array variable from device to host and then print the first column of each row?
__device__ double *d_t;
__device__ size_t d_gridPitch;
__global__ void kernelFunc()
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
double* rowt = (double*)((char *)d_t + i * d_gridPitch);
rowt[0] = rowt[0] + 40000;
}
int main()
{
int size = 16;
size_t d_pitchLoc;
double *d_tLoc;
cudaMallocPitch((void**)&d_tLoc, &d_pitchLoc, size * sizeof(double), size);
cudaMemset2D(d_tLoc, d_pitchLoc, 0, size * sizeof(double), size);
cudaMemcpyToSymbol(d_gridPitch, &d_pitchLoc, sizeof(d_pitchLoc));
cudaMemcpyToSymbol(d_t, & d_tLoc, sizeof(d_tLoc));
kernelFunc<<<1,size>>>();
for(int i=0; i< size; i++){
double* rowt = (double*)((char *)d_t + i * d_gridPitch);
printf("%.0f, ",rowt[0]);
}
cudaDeviceReset();
return 0;
}
As indicated in comments, the cudaMemcpy2D API is designed for exactly this task. You must allocate or statically define a host memory buffer or container to act as storage for the data from the device, and then provide the pitch of that host buffer to the cudaMemcpy2D call. The API handles the pitch conversion without any further intervention on the caller side.
If you replace the print loop with something like this:
double* h_t = new double[size * size];
cudaMemcpy2D(h_t, size * sizeof(double), d_tLoc, d_pitchLoc,
        size * sizeof(double), size, cudaMemcpyDeviceToHost);
for(int i = 0; i < size; i++){
    std::cout << h_t[i * size] << std::endl; // first column of each row
}
delete[] h_t;
[Note I'm using iostream here for the printing. CUDA uses a C++ compiler for compiling host code, and you should prefer iostream functions over cstdio because they are less error prone and provide improved diagnostics on most platforms.]
You can see that the API call form is very similar to the cudaMemset2D call that I provided for you in your last question.
I am new to CUDA. I have written some simple code which copies a randomly initialized matrix to device memory, increments each matrix entry by one, and transfers it back to host memory.
There is no error while compiling or running the code, but it seems that the kernel does not launch: the values of the matrix entries are the same after launching the kernel.
Any idea what is happening there?
#include <iostream>
using namespace std;
#define SIZE 2
void print_matrix (int size, float *array);
void matrix_initialize(int size, float *array);
__global__ void LU(float * m, int size){
m[threadIdx.y*size + threadIdx.x] ++ ;
}
int main(){
srand(0);
//variables
float *a = new float[SIZE*SIZE];
dim3 blockdim(2,2,0);
dim3 griddim(1,0,0);
//initialize
matrix_initialize(SIZE, a);
print_matrix (SIZE, a);
//allocate space on device memory:
float * Ad;
int size = SIZE * SIZE;
cudaMalloc ((void **)&Ad, size);
//transfer data to device memory:
cudaMemcpy(Ad , a, size, cudaMemcpyHostToDevice);
//run the kernel
LU<<<griddim,blockdim>>>(Ad, SIZE);
// transfer the data back to the host memory
cudaMemcpy(a , Ad, size, cudaMemcpyDeviceToHost);
//test if running the kernel has changed the values
print_matrix (SIZE, a);
// free device memory :
cudaFree (Ad);
return 0;
}
void print_matrix (int size, float *array){
for (int i=0; i < size*size ; i++){
if(i % size == 0)
cout << endl;
cout << array [i] << " ";
}
}
void matrix_initialize(int size, float *array){
for (int i = 0; i< SIZE*SIZE; i++){
array[i] = rand()/(float) RAND_MAX;
}
}
Unused dimensions should be set to 1 instead of 0:
dim3 blockdim(2, 2, 1);
dim3 griddim(1, 1, 1);
Your code launches 1 x 0 x 0 = 0 blocks, of 2 x 2 x 0 = 0 threads each.
Your size calculation is wrong: your code does not take the array element size into account. It should be:
int size = SIZE * SIZE * sizeof(float);
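Putting both fixes together, a sketch of the corrected setup inside main (reusing the variables a and Ad from the code above) would be:
dim3 blockdim(2, 2, 1);                   // unused dimensions set to 1
dim3 griddim(1, 1, 1);
int size = SIZE * SIZE * sizeof(float);   // size in bytes, not elements
cudaMalloc((void **)&Ad, size);
cudaMemcpy(Ad, a, size, cudaMemcpyHostToDevice);
LU<<<griddim, blockdim>>>(Ad, SIZE);
cudaMemcpy(a, Ad, size, cudaMemcpyDeviceToHost);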
I have a simple program that squares the elements of an array; loop unrolling was done as follows:
loop unrolling
#include <stdio.h>
#include <cuda.h>
__global__ void square(float *a, int N,int idx);
// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
int idx = 2*(blockIdx.x * blockDim.x + threadIdx.x);
int n=N;
//printf("%d\n",n);
for(int q=0;q<2;q++)
{
if(N<2000)
{
arr[idx+q] = arr[idx+q] * arr[idx+q];
}
}
}
// main routine that executes on the host
int main(void)
{
clock_t start = clock(),diff;
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 1000; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 4;
//int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
first <<< 4, 128 >>> (a_d, N);
//cudaThreadSynchronize();
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
diff = clock() - start;
int msec = diff * 1000 / CLOCKS_PER_SEC;
printf("Time taken %d seconds %d milliseconds\n", msec/1000, msec%1000);
}
Then, realizing that the loop calculation could be minimized with dynamic parallelism, unrolling with dynamic parallelism was implemented as follows:
unrolling with dynamic parallelism
#include <stdio.h>
#include <cuda.h>
__global__ void square(float *a, int N,int idx);
// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
int idx = 2*(blockIdx.x * blockDim.x + threadIdx.x);
int n=N;
square <<< 1,2 >>> (arr, n,idx);
}
__global__ void square(float *a, int N,int idx)
{
int tdx = blockIdx.x * blockDim.x + threadIdx.x;
printf("%d\n",N);
if(N<2000)
{
a[tdx+idx] = a[tdx+idx] * a[tdx+idx];
}
}
// main routine that executes on the host
int main(void)
{
clock_t start = clock(),diff;
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 1000; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 4;
//int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
first <<< 4, 128 >>> (a_d, N);
//cudaThreadSynchronize();
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
diff = clock() - start;
int msec = diff * 1000 / CLOCKS_PER_SEC;
printf("Time taken %d seconds %d milliseconds\n", msec/1000, msec%1000);
}
The implementation of unrolling with dynamic parallelism takes more time to execute than unrolling alone. Aren't we supposed to improve execution time with dynamic parallelism in such a case?
Dynamic parallelism is mainly useful in cases where you have parallelism that is dynamic. That is: cases where you don't know how much parallelism you're going to need until you've done some calculation. Rather than transfer data back to the host which is then instantly fed into parameterising another launch, you launch from within the kernel. In this pattern, with memcpys between kernel launches avoided, you'll see speedup.
In your example above this is not the case. You could have just launched twice as many threads from the host. There's nothing dynamic required as there's no parallelism available there that you didn't know about at the time of the first kernel launch.
Furthermore, performance requirements for kernels launched using dynamic parallelism are similar to those of kernels launched from the host. You have to launch a reasonable amount of work, or the launch latency will dominate your computation time.
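To make the distinction concrete, here is a minimal sketch of a genuinely dynamic case (the kernel names, segment arrays, and sizes are hypothetical, not from the code above): the parent only discovers how much work each segment needs at runtime, so the child grids cannot be sized from the host beforehand. Dynamic parallelism requires a device of compute capability 3.5 or higher and compilation with relocatable device code (-rdc=true).
// Child kernel: squares n elements of its segment.
__global__ void child(float *seg, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        seg[i] = seg[i] * seg[i];
}
// Parent kernel: one thread per segment. The amount of work per
// segment (seg_count) is only known here, on the device, so each
// thread sizes and launches a child grid to match it.
__global__ void parent(float *data, const int *seg_offset, const int *seg_count, int num_segments)
{
    int s = blockIdx.x * blockDim.x + threadIdx.x;
    if (s < num_segments) {
        int n = seg_count[s];                 // work discovered at runtime
        if (n > 0) {
            int blocks = (n + 255) / 256;     // size the child grid to the work
            child<<<blocks, 256>>>(data + seg_offset[s], n);
        }
    }
}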
I was checking out this sum_reduction.cu example and tutorial and noticed that for certain problem sizes it doesn't work; e.g., it works with problem size n=2000 but not with n=3000. Apparently it always works with problem sizes that are a multiple of the block size, but neither the tutorial nor the example code states so. The question is: does this reduction algorithm only work for certain problem sizes? In the example they chose N=256k, which is even, a power of two, and also a multiple of the block size 512.
For self-containment, I paste the most important bits of (a template version of) the code here:
template<typename T>
__global__ void kernelSum(const T* __restrict__ input, T* __restrict__ per_block_results, const size_t n) {
extern __shared__ T sdata[];
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
// load input into __shared__ memory
T x = 0.0;
if (tid < n) {
x = input[tid];
}
sdata[threadIdx.x] = x;
__syncthreads();
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
if(threadIdx.x < offset) {
// add a partial sum upstream to our own
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
// wait until all threads in the block have
// updated their partial sums
__syncthreads();
}
// thread 0 writes the final result
if(threadIdx.x == 0) {
per_block_results[blockIdx.x] = sdata[0];
}
}
and to invoke the kernel:
// launch one kernel to compute, per-block, a partial sum
block_sum<double> <<<num_blocks,block_size,block_size * sizeof(double)>>>(d_input, d_partial_sums_and_total, num_elements);
// launch a single block to compute the sum of the partial sums
block_sum<double> <<<1,num_blocks,num_blocks * sizeof(double)>>>(d_partial_sums_and_total, d_partial_sums_and_total + num_blocks, num_blocks);
To my understanding, if the problem size is smaller than the block size, the statement T x = 0.0; ensures that the out-of-range elements are zeroed out, and thus it should work. But it doesn't?
UPDATE: I am sorry, the float/double thing was a typo made while preparing the question, and not the real problem.
1. The code you have posted is not consistent: your templated kernel is called kernelSum but you are invoking something called block_sum.
2. Furthermore, I don't believe your usage of the templated kernel function could possibly be correct as written:
block_sum<double> <<<num_blocks,block_size,block_size * sizeof(float)>>>(d_input, d_partial_sums_and_total, num_elements);
          ^                                                    ^
          |-------- these types are required to match ---------|
The kernel template is being instantiated with type double. Therefore it is expecting enough shared memory to store block_size double quantities, based on this line:
extern __shared__ T sdata[];
But you are only passing half of the required storage:
block_size * sizeof(float)
I believe that's going to give you unexpected results.
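With the template type and the shared-memory size agreeing, the first launch would instead be (as in the complete program further below):
block_sum<double> <<<num_blocks,block_size,block_size * sizeof(double)>>>(d_input, d_partial_sums_and_total, num_elements);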
3. The reduction as written does expect that the block dimension is a power of 2, due to this loop:
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
This is not likely to be an issue on the first kernel call, because you are probably choosing a power of two for the number of threads per block (block_size):
block_sum<double> <<<num_blocks,block_size,...
However, for the second kernel call, this will depend on whether num_blocks is a power of two, which depends on your grid calculations, which you haven't shown:
block_sum<double> <<<1,num_blocks,...
4. Finally, the first kernel launch will fail if num_blocks exceeds the limit for your device. This may happen for very large data sets, but probably not for size 3000, and it depends on your grid calculations, which you haven't shown.
Item 3 above is a difficult requirement to satisfy on the fly for arbitrary vector sizes. Therefore I would suggest an alternate reduction strategy to handle arbitrary sized vectors. For this I would suggest that you study the CUDA reduction sample code and presentation.
Here's a complete program, mostly based on the code you have shown, that has the above issues addressed, and seems to work for me for a size of 3000:
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 3000
#define nTPB 256
template<typename T>
__global__ void block_sum(const T* __restrict__ input, T* __restrict__ per_block_results, const size_t n) {
extern __shared__ T sdata[];
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
// load input into __shared__ memory
T x = 0.0;
if (tid < n) {
x = input[tid];
}
sdata[threadIdx.x] = x;
__syncthreads();
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
if(threadIdx.x < offset) {
// add a partial sum upstream to our own
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
// wait until all threads in the block have
// updated their partial sums
__syncthreads();
}
// thread 0 writes the final result
if(threadIdx.x == 0) {
per_block_results[blockIdx.x] = sdata[0];
}
}
int main(){
double *d_input, *d_partial_sums_and_total, *h_input, *h_partial_sums_and_total;
int num_elements=DSIZE;
int block_size = nTPB;
int num_blocks = (num_elements + block_size -1)/block_size;
// bump num_blocks up to the next power of 2
int done = 0;
int test_val = 1;
while (!done){
if (test_val >= num_blocks){
num_blocks = test_val;
done = 1;}
else test_val *= 2;
if (test_val > 65535) {printf("blocks failure\n"); exit(1);}
}
h_input = (double *)malloc(num_elements * sizeof(double));
h_partial_sums_and_total = (double *)malloc((num_blocks+1)*sizeof(double));
cudaMalloc((void **)&d_input, num_elements * sizeof(double));
cudaMalloc((void **)&d_partial_sums_and_total, (num_blocks+1)*sizeof(double));
double h_result = 0.0;
for (int i = 0; i < num_elements; i++) {
h_input[i] = rand()/(double)RAND_MAX;
h_result += h_input[i];}
cudaMemcpy(d_input, h_input, num_elements*sizeof(double), cudaMemcpyHostToDevice);
cudaMemset(d_partial_sums_and_total, 0, (num_blocks+1)*sizeof(double));
// launch one kernel to compute, per-block, a partial sum
block_sum<double> <<<num_blocks,block_size,block_size * sizeof(double)>>>(d_input, d_partial_sums_and_total, num_elements);
// launch a single block to compute the sum of the partial sums
block_sum<double> <<<1,num_blocks,num_blocks * sizeof(double)>>>(d_partial_sums_and_total, d_partial_sums_and_total + num_blocks, num_blocks);
cudaMemcpy(h_partial_sums_and_total, d_partial_sums_and_total, (num_blocks+1)*sizeof(double), cudaMemcpyDeviceToHost);
printf("host result = %lf\n", h_result);
printf("device result = %lf\n", h_partial_sums_and_total[num_blocks]);
}
For brevity/readability, I have dispensed with error checking in the above code. When having difficulty with CUDA code, you should always do proper CUDA error checking.
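As a minimal sketch of what that checking can look like (the macro name is my own, not part of the CUDA API), every runtime API call and kernel launch can be followed by something like:
// Report the most recent CUDA error, if any, and abort.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)
For a kernel launch, calling cudaCheckErrors("kernel launch") right after the launch catches configuration errors, and a cudaDeviceSynchronize() followed by another check catches errors raised during execution.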
Also, in the future, you will make it easier for others to help you if you post complete code to demonstrate what you are doing, as I have done above.
I have the following problem (keep in mind that I am fairly new to programming with CUDA):
I have a class called vec3f that is just like the float3 data type, but with overloaded operators and other vector functions. These functions are prefixed with __device__ __host__. Then, in my kernel, I do a nested for loop over the block_x and block_y indices and do something like:
//set up shared memory block
extern __shared__ vec3f share[];
vec3f *sh_pos = share;
vec3f *sh_velocity = &sh_pos[blockDim.x*blockDim.y];
sh_pos[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].position();
sh_velocity[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].velocity();
__syncthreads();
In the above code, oldParticles is a pointer to a class called particle that is being passed to the kernel. oldParticles is actually the underlying pointer of a thrust::device_vector (I'm not sure if this has something to do with it). Everything compiles okay, but when I run it I get the error:
libc++abi.dylib: terminate called throwing an exception
Abort trap: 6
Thanks for the replies. I think the error had to do with my not allocating room for the arguments being passed to my kernel. Doing the following in my host code fixed this error:
particle* particle_ptrs[2];
particle_ptrs[0] = thrust::raw_pointer_cast(&d_old_particles[0]);
particle_ptrs[1] = thrust::raw_pointer_cast(&d_new_particles[0]);
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) ) );
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) ) );
The kernel call is then,
force_kernel<<< grid,block,sharedMemSize >>>(particle_ptrs[0],particle_ptrs[1],time_step);
The issue that I am having now seems to be that I can't get data copied back to the host from the device. I think this has to do with my not being familiar with Thrust.
I'm doing a series of copies as follows:
//make a host vector assume this is initialized
thrust::host_vector<particle> h_particles;
thrust::device_vector<particle> d_old_particles, d_new_particles;
d_old_particles = h_particles;
//launch kernel as shown above
//with thrust vectors having been casted into their underlying pointers
//particle_ptrs[1] gets modified, and so shouldn't d_new_particles?
//copy back
h_particles = d_new_particles;
So I guess my question is: can I modify a Thrust device vector in a kernel (in this case particle_ptrs[0]), save the modification to another Thrust device vector in the kernel (in this case particle_ptrs[1]), and then, once I exit from the kernel, copy that to a host vector?
I still can't get this to work. I made a shorter example where I am having the same problem:
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include "vec3f.h"
const int BLOCK_SIZE = 8;
const int max_particles = 64;
const float dt = 0.01;
using namespace std;
//particle class
class particle {
public:
particle() :
_velocity(vec3f(0,0,0)), _position(vec3f(0,0,0)), _density(0.0) {
};
particle(const vec3f& pos, const vec3f& vel) :
_position(pos), _velocity(vel), _density(0.0) {
};
vec3f _velocity;
vec3f _position;
float _density;
};
//forward declaration of kernel func
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);
//global thrust vectors
thrust::host_vector<particle> h_parts;
thrust::device_vector<particle> old_parts, new_parts;
particle* particle_ptrs[2];
int main() {
//load host vector
for (int i =0; i<max_particles; i++) {
h_parts.push_back(particle(vec3f(0.5,0.5,0.5),vec3f(10,10,10)));
}
particle_ptrs[0] = thrust::raw_pointer_cast(&old_parts[0]);
particle_ptrs[1] = thrust::raw_pointer_cast(&new_parts[0]);
cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) );
cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) );
//copy host particles to old device particles...
old_parts = h_parts;
//kernel block and grid dimensions
dim3 block(BLOCK_SIZE,BLOCK_SIZE,1);
dim3 grid(int(sqrt(float(max_particles) / (float(block.x*block.y)))), int(sqrt(float(max_particles) / (float(block.x*block.y)))), 1);
kernel_func<<<block,grid>>>(particle_ptrs[0],particle_ptrs[1],dt);
//copy new device particles back to host particles
h_parts = new_parts;
for (int i =0; i<max_particles; i++) {
particle temp1 = h_parts[i];
cout << temp1._position << endl;
}
//delete thrust device vectors
old_parts.clear();
old_parts.shrink_to_fit();
new_parts.clear();
new_parts.shrink_to_fit();
return 0;
}
//kernel function
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
//get array position for 2d grid...
unsigned int arr_pos = y*blockDim.x*gridDim.x + x;
new_parts[arr_pos]._velocity = old_parts[arr_pos]._velocity * 10.0 * dt;
new_parts[arr_pos]._position = old_parts[arr_pos]._position * 10.0 * dt;
new_parts[arr_pos]._density = old_parts[arr_pos]._density * 10.0 * dt;
}
So the host vector has an initial position of (0.5,0.5,0.5) for all 64 particles. Then the kernel attempts to multiply that by 10 to give (5,5,5) as the position of all particles. But I don't see this when I cout the data; it is still just (0.5,0.5,0.5). Is there a problem with how I am allocating memory? Is there a problem with these lines:
//copy new device particles back to host particles
h_parts = new_parts;
What could be the issue? Thank you.
There are various problems with the code you have posted.
1. You have your block and grid variables reversed in your kernel invocation: grid comes first.
2. You should be doing CUDA error checking on your kernel calls and runtime API calls.
3. Your method of allocating storage using cudaMalloc on a pointer which has been raw-cast from an empty device vector is not sensible. The vector container has no knowledge that you did this "under the hood." Instead, you can directly allocate storage for the device vectors when you instantiate them, like:
thrust::device_vector<particle> old_parts(max_particles), new_parts(max_particles);
4. You say you're expecting 5,5,5, but your kernel is multiplying by 10 and then by dt, which is 0.01, so I believe the correct output is 0.05, 0.05, 0.05 (i.e., 0.5 * 10.0 * 0.01).
5. Your grid computation (int(sqrt...)), for an arbitrary max_particles, either is not guaranteed to produce enough blocks (if casting float to int truncates or rounds down) or will produce extra blocks (if it rounds up). The round-down case is bad, and we should handle it by using a ceil function or another grid computation method. The round-up case (which is what ceil will do) is OK, but we need to handle the fact that the grid may launch extra blocks/threads; we do that with a thread check in the kernel. There were other problems with the grid computation as well: we want to take the square root of max_particles, then divide it by the block dimension in a particular direction, to get the grid dimension in that direction.
Here's some code that I've modified with these changes in mind; it seems to produce the correct output (0.05, 0.05, 0.05). Note that I had to make some other changes because I don't have your "vec3f.h" header file handy, so I used float3 instead.
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <vector_functions.h>
const int BLOCK_SIZE = 8;
const int max_particles = 64;
const float dt = 0.01;
using namespace std;
//particle class
class particle {
public:
particle() :
_velocity(make_float3(0,0,0)), _position(make_float3(0,0,0)), _density(0.0)
{
};
particle(const float3& pos, const float3& vel) :
_position(pos), _velocity(vel), _density(0.0)
{
};
float3 _velocity;
float3 _position;
float _density;
};
//forward declaration of kernel func
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);
int main() {
//global thrust vectors
thrust::host_vector<particle> h_parts;
particle* particle_ptrs[2];
//load host vector
for (int i =0; i<max_particles; i++) {
h_parts.push_back(particle(make_float3(0.5,0.5,0.5),make_float3(10,10,10)));
}
//copy host particles to old device particles...
thrust::device_vector<particle> old_parts = h_parts;
thrust::device_vector<particle> new_parts(max_particles);
particle_ptrs[0] = thrust::raw_pointer_cast(&old_parts[0]);
particle_ptrs[1] = thrust::raw_pointer_cast(&new_parts[0]);
//kernel block and grid dimensions
dim3 block(BLOCK_SIZE,BLOCK_SIZE,1);
dim3 grid((int)ceil(sqrt(float(max_particles)) / (float(block.x))), (int)ceil(sqrt(float(max_particles)) / (float(block.y))), 1);
cout << "grid x: " << grid.x << " grid y: " << grid.y << endl;
kernel_func<<<grid,block>>>(particle_ptrs[0],particle_ptrs[1],dt);
//copy new device particles back to host particles
cudaDeviceSynchronize();
h_parts = new_parts;
for (int i =0; i<max_particles; i++) {
particle temp1 = h_parts[i];
cout << temp1._position.x << "," << temp1._position.y << "," << temp1._position.z << endl;
}
//delete thrust device vectors
old_parts.clear();
old_parts.shrink_to_fit();
new_parts.clear();
new_parts.shrink_to_fit();
return 0;
}
//kernel function
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
//get array position for 2d grid...
unsigned int arr_pos = y*blockDim.x*gridDim.x + x;
if (arr_pos < max_particles) {
new_parts[arr_pos]._velocity.x = old_parts[arr_pos]._velocity.x * 10.0 * dt;
new_parts[arr_pos]._velocity.y = old_parts[arr_pos]._velocity.y * 10.0 * dt;
new_parts[arr_pos]._velocity.z = old_parts[arr_pos]._velocity.z * 10.0 * dt;
new_parts[arr_pos]._position.x = old_parts[arr_pos]._position.x * 10.0 * dt;
new_parts[arr_pos]._position.y = old_parts[arr_pos]._position.y * 10.0 * dt;
new_parts[arr_pos]._position.z = old_parts[arr_pos]._position.z * 10.0 * dt;
new_parts[arr_pos]._density = old_parts[arr_pos]._density * 10.0 * dt;
}
}
I am trying to implement Sauvola binarization in CUDA. For this, I read the image into a 2D array on the host and allocate memory for a 2D array on the device using a pitch. After allocating the memory, I try to copy the host 2D array to the device 2D array using cudaMemcpy2D. It compiles fine but crashes at runtime. I am unable to understand what I am missing; kindly suggest something. The code which I have written is as follows:
#include "BinMain.h"
#include "Binarization.h"
#include <stdlib.h>
#include <stdio.h>
#include <conio.h>
#include <cuda.h>
#include <cuda_runtime.h>
void printDevProp(cudaDeviceProp);
void CUDA_SAFE_CALL( cudaError_t);
int main()
{
//Read an IplImage in imgOriginal as grayscale
IplImage * imgOriginal = cvLoadImage("E:\\1.tiff",CV_LOAD_IMAGE_GRAYSCALE);
//Create a size variable of type CvSize for cvCreateImage Parameter
CvSize size = cvSize(imgOriginal->width,imgOriginal->height);
//create an image for storing the result image with same height and width as imgOriginal
IplImage * imgResult = cvCreateImage(size,imgOriginal->depth,imgOriginal->nChannels);
//Create a 2D array for storing the pixels value of each of the pixel of imgOriginal grayscale image
int ** arrOriginal = (int **)malloc(imgOriginal->height * sizeof(int *));
for (int i = 0; i < imgOriginal->height; i++)
{
arrOriginal[i] = (int*)malloc(imgOriginal->width * sizeof(int));
}
//Create a 2D array for storing the returned device array
int ** arrReturn = (int **)malloc(imgOriginal->height * sizeof(int *));
for (int i = 0; i < imgOriginal->height; i++)
{
arrReturn[i] = (int*)malloc(imgOriginal->width * sizeof(int));
}
//Create a CvScalar variable to copy pixel values in 2D array (arrOriginal)
CvScalar s;
//Copying the pixel values
for(int j = 0;j<imgOriginal->height;j++)
{
for(int k =0;k<imgOriginal->width;k++)
{
s = cvGet2D(imgOriginal,j,k);
arrOriginal[j][k] = s.val[0];
}
}
//Cuda Device Property
int devCount;
cudaGetDeviceCount(&devCount);
printf("CUDA Device Query...\n");
printf("There are %d CUDA devices.\n", devCount);
// Iterate through devices
for (int i = 0; i < devCount; ++i)
{
// Get device properties
printf("\nCUDA Device #%d\n", i);
cudaDeviceProp devProp;
cudaGetDeviceProperties(&devProp, i);
printDevProp(devProp);
}
//Start the clock
clock_t start = clock();
//Allocating Device memory for 2D array using pitch
size_t host_orig_pitch = imgOriginal->width * sizeof(int)* imgOriginal->height; //host original array pitch in bytes
size_t dev_pitch; //device array pitch in bytes which will be used in cudaMallocPitch
size_t dev_pitchReturn; //device return array pitch in bytes
size_t host_ret_pitch = imgOriginal->width * sizeof(int)* imgOriginal->height; //host return array pitch in bytes
int * devArrOriginal; //device 2d array of original image
int * result; //device 2d array for returned array
int dynmicRange = 128; //Dynamic Range for calculating the threshold from sauvola's formula
//Allocating memory by using cudaMallocPitch
CUDA_SAFE_CALL(cudaMallocPitch((void**)&devArrOriginal,&dev_pitch,imgOriginal->width * sizeof(int),imgOriginal->height * sizeof(int)));
//Allocating memory for returned array
CUDA_SAFE_CALL(cudaMallocPitch((void**)&result,&dev_pitchReturn,imgOriginal->width * sizeof(int),imgOriginal->height * sizeof(int)));
//Copying 2D array from host memory to device mempry by using cudaMemCpy2D
CUDA_SAFE_CALL(cudaMemcpy2D((void*)devArrOriginal,dev_pitch,(void*)arrOriginal,host_orig_pitch,imgOriginal->width * sizeof(float),imgOriginal->height,cudaMemcpyHostToDevice));
int windowSize = 19; //Size of the window for calculating mean and variance
//Launching the kernel by calling myKernelLauncher function.
myKernelLauncher(devArrOriginal,result,windowSize,imgOriginal->width,imgOriginal->height,dev_pitch,dynmicRange);
//Calling the sauvola binarization function by passing the parameters as
//1.arrOriginal 2D array 2.Original image height 3.Original image width
//int ** result = AdaptiveBinarization(arrOriginal,imgOriginal->height,imgOriginal->width);//binarization(arrOriginal,imgOriginal->width,imgOriginal->height);
//
CUDA_SAFE_CALL(cudaMemcpy2D(arrReturn,host_ret_pitch,result,dev_pitchReturn,imgOriginal->width * sizeof(int),imgOriginal->height * sizeof(int),cudaMemcpyDeviceToHost));
//create a CvScalar variable to set the data in imgResult
CvScalar ss;
//Copy the pixel values from returned array to imgResult
for(int i=0;i<imgOriginal->height;i++)
{
for(int j=0;j<imgOriginal->width;j++)
{
ss = cvScalar(arrReturn[i][j]*255);
cvSet2D(imgResult,i,j,ss);
//k++; //No need for k if returned array is 2D
}
}
printf("Done \n");
//calculate and print the time elapsed
printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
//Create a window and show the result image
cvNamedWindow("Result",CV_WINDOW_AUTOSIZE);
cvShowImage("Result",imgResult);
cvWaitKey(0);
getch();
//Release the various resources
cvReleaseImage(&imgResult);
cvReleaseImage(&imgOriginal);
cvDestroyWindow("Result");
for(int i = 0; i < imgOriginal->height; i++)
free(arrOriginal[i]);
free(arrOriginal);
free(result);
cudaFree(&devArrOriginal);
cudaFree(&result);
}
// Print device properties
void printDevProp(cudaDeviceProp devProp)
{
printf("Major revision number: %d\n", devProp.major);
printf("Minor revision number: %d\n", devProp.minor);
printf("Name: %s\n", devProp.name);
printf("Total global memory: %u\n", devProp.totalGlobalMem);
printf("Total shared memory per block: %u\n", devProp.sharedMemPerBlock);
printf("Total registers per block: %d\n", devProp.regsPerBlock);
printf("Warp size: %d\n", devProp.warpSize);
printf("Maximum memory pitch: %u\n", devProp.memPitch);
printf("Maximum threads per block: %d\n", devProp.maxThreadsPerBlock);
for (int i = 0; i < 3; ++i)
printf("Maximum dimension %d of block: %d\n", i, devProp.maxThreadsDim[i]);
for (int i = 0; i < 3; ++i)
printf("Maximum dimension %d of grid: %d\n", i, devProp.maxGridSize[i]);
printf("Clock rate: %d\n", devProp.clockRate);
printf("Total constant memory: %u\n", devProp.totalConstMem);
printf("Texture alignment: %u\n", devProp.textureAlignment);
printf("Concurrent copy and execution: %s\n", (devProp.deviceOverlap ? "Yes" : "No"));
printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount);
printf("Kernel execution timeout: %s\n", (devProp.kernelExecTimeoutEnabled ? "Yes" : "No"));
return;
}
/* Utility Macro : CUDA SAFE CALL */
void CUDA_SAFE_CALL( cudaError_t call)
{
cudaError_t ret = call;
switch(ret)
{
case cudaSuccess:
break;
default :
{
printf(" ERROR at line :%i.%d' ' %s\n",
__LINE__,ret,cudaGetErrorString(ret));
exit(-1);
break;
}
}
}
The flow of the code is as follows:
1. Create a 2D array on the host from the image, and another array for the array returned by the kernel.
2. Allocate memory for a 2D array on the device using cudaMallocPitch.
3. Allocate memory for a 2D array which will be returned by the kernel.
4. Copy the original 2D array from the host to the device array using cudaMemcpy2D.
5. Launch the kernel.
6. Copy the returned device array to the host array using cudaMemcpy2D.
The program crashes when it reaches step 4, with an unhandled exception stating: "Unhandled exception at 0x773415de in SauvolaBinarization_CUDA_OpenCV.exe: 0xC0000005: Access violation reading location 0x01611778."
I think the problem must be in how I am allocating the memory, but I am using these functions for the first time and have no idea how they work; kindly suggest.
First of all, you're not calling "cudaMallocPitch" properly. The "height" parameter should represent the number of rows, so instead of:
imgOriginal->height * sizeof(int)
you should simply use:
imgOriginal->height
This is fine because the number of bytes per row is already contained in the "pitch" property. The main problem, however, lies with the way you allocate the memory for the host image. When you write:
//Create a 2D array for storing the pixels value of each of the pixel of imgOriginal grayscale image
int ** arrOriginal = (int **)malloc(imgOriginal->height * sizeof(int *));
for (int i = 0; i < imgOriginal->height; i++)
{
arrOriginal[i] = (int*)malloc(imgOriginal->width * sizeof(int));
}
you are effectively creating an array of pointers to arrays. The CUDA API call that you're making:
CUDA_SAFE_CALL(cudaMemcpy2D((void*)devArrOriginal,dev_pitch,(void*)arrOriginal,host_orig_pitch,imgOriginal->width * sizeof(float),imgOriginal->height,cudaMemcpyHostToDevice));
expects the input memory buffer to be contiguous. So here's what will happen: the first row of the input image (totalling "imgOriginal->width * sizeof(float)" bytes) will be read starting at the address:
(void*)arrOriginal
However, the amount of valid data you have starting at that address is only "imgOriginal->height * sizeof(int *)" bytes. The two byte counts are very likely to be different, which will lead to the crash because you will end up reading from an unknown location.
To solve this, consider allocating "arrOriginal" as one contiguous block, such as:
int * arrOriginal = (int *)malloc(imgOriginal->height * imgOriginal->width * sizeof(int));
Also, in this case, your pitch should be:
"imgOriginal->width * sizeof(int)"