Below is a simplified version of a problem I am trying to solve. Both code snippets compile, but #2 throws an "illegal memory access". Basically, if an array is encapsulated in a structure, passing a pointer to that structure to cudaMalloc creates all kinds of problems -- at least the way I do it. I am pretty sure this is because the address of dum in the code below is on the host, and so is not accessible inside the kernel. The problem is, I don't know how to create a device version of dum... For example, using cudaMalloc( (void**)&dum , sizeof(dummy) * 1 ) instead of the new dummy syntax below does not solve the problem. I think I am getting confused by the double pointer used by cudaMalloc.
Of course it may seem silly to encapsulate an array of double in a structure in this example, but in the actual code I really do need to do this.
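For reference, my mental model of the double pointer is the sketch below; please correct me if even this part is wrong:

// My understanding of the cudaMalloc double-pointer idiom (a sketch):
double *p;                                     // p itself lives in host memory
cudaMalloc( (void**)&p , sizeof(double) * n ); // writes a *device* address into p
// p now holds a device address: valid inside kernels,
// but not dereferenceable on the host

Anyway, here is the simplified code: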
struct dummy
{
    double *arr;
};

void allocate( dummy *dum , int n )
{
    cudaMalloc( (double**)&(dum->arr) , sizeof(double) * n );
}

__global__ void test( double val , dummy *dum , int n )
{
    printf( "test\n" );
    for( int ii = 0 ; ii < n ; ii++ )
        dum->arr[ii] = val;
}

__global__ void test2( double val , double *arr , int n )
{
    printf( "test\n" );
    for( int ii = 0 ; ii < n ; ii++ )
        arr[ii] = val;
}

int main()
{
    int n = 10;
    dummy *dum = new dummy;

    /* CODE 1: the piece of code below works */
    double *p;
    gpu_err_chk( cudaMalloc( &p , sizeof(double) * n ) );
    test2<<< 1 , 1 >>>( 123.0 , p , n );
    gpu_err_chk( cudaDeviceSynchronize() );

    /* CODE 2: the piece of code below does not... */
    allocate( dum , n );
    test<<< 1 , 1 >>>( 123.0 , dum , n );
    gpu_err_chk( cudaDeviceSynchronize() );

    return 1;
}
After digging through some examples in previous posts by Robert, I was able to re-write the code so that it works:
struct dummy
{
    double *arr;
};

__global__ void test( dummy *dum , int n )
{
    printf( "test\n" );
    for( int ii = 0 ; ii < n ; ii++ )
        printf( "dum->arr[%d] = %f\n" , ii , dum->arr[ii] );
}

int main()
{
    int n = 10;
    dummy *dum_d , *dum_h;

    srand( time(0) );

    dum_h = new dummy;
    dum_h->arr = new double[n];
    for( int ii = 0 ; ii < n ; ii++ ){
        dum_h->arr[ii] = double( rand() ) / RAND_MAX;
        printf( "reference data %d = %f\n" , ii , dum_h->arr[ii] );
    }

    cudaMalloc( &dum_d , sizeof(dummy) * 1 );
    cudaMemcpy( dum_d , dum_h , sizeof(dummy) * 1 , cudaMemcpyHostToDevice );

    double *tmp;
    cudaMalloc( &tmp , sizeof(double) * n );
    cudaMemcpy( &( dum_d->arr ) , &tmp , sizeof(double*) , cudaMemcpyHostToDevice ); // copy the device pointer tmp into the arr member of the device-side structure
    cudaMemcpy( tmp , dum_h->arr , sizeof(double) * n , cudaMemcpyHostToDevice );

    delete [] dum_h->arr;
    delete dum_h;

    test<<< 1 , 1 >>>( dum_d , n );
    gpu_err_chk( cudaDeviceSynchronize() );

    cudaFree( tmp );
    cudaFree( dum_d );

    return 1;
}
However, I am still confused about why this works. Does anyone have a visual diagram of what's going on? I am getting lost with the different pointers...
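My best attempt at sketching it myself (as comments; please correct me if this picture is wrong):

// host pointers                  device memory
// -------------                  -------------
// dum_d ---------------------->  [ dummy ]           <- from cudaMalloc( &dum_d , ... )
//                                   .arr ---------->  [ double x n ]  <- the tmp buffer
// tmp   ---------------------->  [ double x n ]      <- same buffer .arr points to
//
// dum_h --> [ dummy ] on host, its .arr --> [ double x n ] on host (both deleted after the copies)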
Moreover, there is one thing that really blows my mind: I can free tmp right before the kernel launch and the code still works, i.e.:
cudaFree( tmp );
test<<< 1 , 1 >>>( dum_d , n );
gpu_err_chk( cudaDeviceSynchronize() );
How is this the case? In my mind (clearly wrong), the device array containing the random values is gone...
Another point of confusion is that I can't free dum_d->arr directly (cudaFree(dum_d->arr)); this throws a segmentation fault.
I multiply each row of pB with each row of pA and put the max value into pC.
The problem is: in the inner loop, only the last row of receptors is taken as the "max value". As a result, the right column is totally wrong.
void TestCalcDotMax_2x5x3()
{
    const size_t m = 2; // nReceptors
    const size_t k = 5; // nSources
    const size_t n = 3; // nChemicals

    float pA[m * k] = { 1, 2, 3, 4, 5
                      , 2, 4, 6, 8, 2 };

    float pB[k * n] = { 9, 8, 7, 6, 5
                      , 4, 3, 2, 1, 9
                      , 8, 7, 6, 5, 4 };

    float expected[k * n] = { 18, 32, 42, 48, 25
                            ,  8, 12, 12,  8, 45
                            , 16, 28, 36, 40, 20 };

    float pC[k * n] = { 18, 32, 42, 48, 10
                      ,  8, 12, 12,  8, 18
                      , 16, 28, 36, 40,  8 };

    int rst = ::CalcDotMax( pA, pB, m, k, n, pC );
    CPPUNIT_ASSERT_EQUAL_MESSAGE( "passed processing", 0, rst );
}
// pDevB and pDevC have the same size
__global__ void KernelDotMax( const float* pDevA, const float* pDevB, const size_t m, const size_t k, float* pDevC )
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if( i < m )
    {
        for( size_t j = 0; j < k; j++ )
        {
            const float value = pDevA[ i * k + j ] * pDevB[j];
            if( value > pDevC[j] )
            {
                pDevC[j] = value;
            }
        }
    }
}
__host__ int CalcDotMax( const float* pA, const float* pB, int m, int k, int n, float* pC )
{
    int nbrCtas = m;
    int threadsPerCta = 64;
    if( nbrCtas >= 32 )
    {
        nbrCtas = 32;
        threadsPerCta = 64;
    }
    float* pDevA = nullptr;
    float* pDevB = nullptr;
    float* pDevC = nullptr;
    cudaError_t code = ::cudaMalloc( (void**)&pDevA, m * k * sizeof(float) );
    code = ::cudaMalloc( (void**)&pDevB, k * n * sizeof(float) );
    code = ::cudaMalloc( (void**)&pDevC, k * n * sizeof(float) );
    code = ::cudaMemcpy( pDevA, pA, m * k * sizeof(float), cudaMemcpyHostToDevice);
    code = ::cudaMemcpy( pDevB, pB, k * n * sizeof(float), cudaMemcpyHostToDevice);
    code = ::cudaMemcpy( pDevC, pC, k * n * sizeof(float), cudaMemcpyHostToDevice);
    for( size_t index = 0; index < n * k; index += k )
    {
        KernelDotMax<<<nbrCtas,threadsPerCta>>>( pDevA, &pDevB[index], m, k, &pDevC[index] );
    }
    code = ::cudaMemcpy( pC, pDevC, k * n * sizeof(float), cudaMemcpyDeviceToHost);
    code = ::cudaFree( pDevA );
    code = ::cudaFree( pDevB );
    code = ::cudaFree( pDevC );
    return 0;
}
Sorry, I missed at some point that you had edited your code.
The problem you are having is a race condition. In the failing case you are launching 2 blocks. The design of your algorithm is such that each block is operating on the same set of output elements (in pdevC). Therefore, since both blocks can execute simultaneously, both blocks can write to the same output elements simultaneously. This is a collision and there are two ways you can avoid it:
1) Redesign your algorithm to partition the work differently between blocks. Instead of each block checking all (or the same set of) the output elements against a particular set of inputs, have each block be responsible only for a portion of the output elements, checking against all the inputs. This is a common code refactoring operation that is done when converting a sequential/serial algorithm to one that runs in parallel.
2) Use atomic operations to prevent the collisions from happening. If your algorithm only has a small number of these types of collisions, it may be convenient and not very costly to use atomics. But when the algorithm uses atomics for every output element (perhaps multiple times, as in this case) it's probably better (for higher performance) to try to refactor the code as in method 1 above.
What follows is some code where I illustrate the second approach (because it is easier for me to write). There is no atomic function that provides an atomicMax operation on float, so I crafted my own, following the template given in the atomic functions documentation for creating arbitrary atomic operations using atomicCAS. That is what atomicMaxf is.
If you elect to use the first approach (recommended), I would point out that calling the kernel in a loop is probably not necessary for your algorithm. I would craft a new kernel that assigns one thread to each output point, and then computes all the necessary max operations on the various input points, in a loop (or nested loops) in the kernel. Since each thread is writing to one and only one unique output point, there is no possibility for write collisions between threads.
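A minimal, untested sketch of what that one-thread-per-output kernel could look like (illustrative only; it would replace both the host-side loop and the atomics):

// Sketch: one thread per output element of pDevC (k * n outputs total).
// Each thread scans all m rows, so every output has a single writer and
// there is no possibility of a write collision.
__global__ void KernelDotMaxPerOutput( const float* pDevA, const float* pDevB,
                                       const size_t m, const size_t k, const size_t n,
                                       float* pDevC )
{
    int j = blockDim.x * blockIdx.x + threadIdx.x;  // output index in [0, k*n)
    if( j < k * n )
    {
        const size_t col = j % k;          // position within one slice of k sources
        float best = pDevC[j];             // start from the host-initialized value
        for( size_t i = 0; i < m; i++ )    // check every receptor row
        {
            const float value = pDevA[ i * k + col ] * pDevB[j];
            if( value > best ) best = value;
        }
        pDevC[j] = best;                   // single writer per element: no race
    }
}

launched once over all outputs, e.g. KernelDotMaxPerOutput<<< (k*n + 255)/256, 256 >>>( pDevA, pDevB, m, k, n, pDevC );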
This code should provide correct results, anyway:
#include <stdio.h>

__device__ float atomicMaxf(float* address, float val)
{
    int *address_as_int = (int*)address;
    int old = *address_as_int, assumed;
    while (val > __int_as_float(old)) {
        assumed = old;
        old = atomicCAS(address_as_int, assumed,
                        __float_as_int(val));
    }
    return __int_as_float(old);
}

// pDevB and pDevC have the same size
__global__ void KernelDotMax( const float* pDevA, const float* pDevB, const size_t m, const size_t k, float* pDevC )
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if( i < m )
    {
        for( size_t j = 0; j < k; j++ )
        {
            const float value = pDevA[ i * k + j ] * pDevB[j];
            atomicMaxf(pDevC+j, value);
            // if( value > pDevC[j] )
            // {
            //     pDevC[j] = value;
            // }
        }
    }
}

__host__ int CalcDotMax( const float* pA, const float* pB, int m, int k, int n, float* pC )
{
    int nbrCtas = m;
    int threadsPerCta = 64;
    if( nbrCtas >= 32 )
    {
        nbrCtas = 32;
        threadsPerCta = 64;
    }
    float* pDevA = NULL;
    float* pDevB = NULL;
    float* pDevC = NULL;
    cudaError_t code = ::cudaMalloc( (void**)&pDevA, m * k * sizeof(float) );
    code = ::cudaMalloc( (void**)&pDevB, k * n * sizeof(float) );
    code = ::cudaMalloc( (void**)&pDevC, k * n * sizeof(float) );
    code = ::cudaMemcpy( pDevA, pA, m * k * sizeof(float), cudaMemcpyHostToDevice);
    code = ::cudaMemcpy( pDevB, pB, k * n * sizeof(float), cudaMemcpyHostToDevice);
    code = ::cudaMemcpy( pDevC, pC, k * n * sizeof(float), cudaMemcpyHostToDevice);
    for( size_t index = 0; index < n * k; index += k )
    {
        KernelDotMax<<<nbrCtas,threadsPerCta>>>( pDevA, &pDevB[index], m, k, &pDevC[index] );
    }
    code = ::cudaMemcpy( pC, pDevC, k * n * sizeof(float), cudaMemcpyDeviceToHost);
    code = ::cudaFree( pDevA );
    code = ::cudaFree( pDevB );
    code = ::cudaFree( pDevC );
    return 0;
}

void TestCalcDotMax_2x5x3()
{
    const size_t m = 2; // nReceptors
    const size_t k = 5; // nSources
    const size_t n = 3; // nChemicals

    float pA[m * k] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f
                      , 2.0f, 4.0f, 6.0f, 8.0f, 2.0f };

    float pB[k * n] = { 9.0f, 8.0f, 7.0f, 6.0f, 5.0f
                      , 4.0f, 3.0f, 2.0f, 1.0f, 9.0f
                      , 8.0f, 7.0f, 6.0f, 5.0f, 4.0f };

    float expected[k * n] = { 18.0f, 32.0f, 42.0f, 48.0f, 25.0f
                            ,  8.0f, 12.0f, 12.0f,  8.0f, 45.0f
                            , 16.0f, 28.0f, 36.0f, 40.0f, 20.0f };

    float pC[k * n] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
                      , 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
                      , 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };

    int rst = ::CalcDotMax( pA, pB, m, k, n, pC );
    printf("passed processing: %d \n", rst );
    for (int i=0; i<(k*n); i++)
        if (pC[i] != expected[i]) printf("mismatch at %d, should be: %f was: %f\n", i, expected[i], pC[i]);
}

int main(){
    TestCalcDotMax_2x5x3();
    return 0;
}
Thanks a lot - it works now. Is it possible to keep the index of the iteration [idx] at the moment of comparing? Like this:
struct ValIndex_t
{
    float value;
    int index;
};

__device__ float atomicMaxPare( float* address, float val, int* index, int idx )
{
    int *address_as_int = reinterpret_cast<int*>( address ); // assumes float is the same size as a 32-bit int
    int old = *address_as_int, assumed;
    while( val > ::__int_as_float(old) )
    {
        assumed = old;
        old = ::atomicCAS( address_as_int, assumed, ::__float_as_int(val) );
        *index = idx;
    }
    return ::__int_as_float(old);
}
__global__ void CudaPareDotMax( float* pDevA, const float* pDevB, ValIndex_t* pDevC, const size_t m, const size_t k, const size_t n )
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if( idx < m )
    {
        for( size_t row = 0; row < n; row++ )
        {
            for( size_t col = 0; col < k; col++ )
            {
                const size_t slice = col + row * k;
                const size_t index = slice + k * n * idx;
                pDevA[index] *= pDevB[ col + k * idx ];
                float& prvalue = (pDevC + slice )->value;
                int&   prindex = (pDevC + slice )->index;
                ::atomicMaxPare( &prvalue, pDevA[ index ], &prindex, idx );
            }
        }
    }
}
Or do I have to use another atomic function for the exchange? I don't quite understand how to capture the index exactly at the moment the value becomes the max. Thanks again.
I've got this kernel
__global__ void kernel1(int keep, int include, int width, int* d_Xco,
                        int* d_Xnum, bool* d_Xvalid, float* d_Xblas)
{
    int i, k;
    i = threadIdx.x + blockIdx.x * blockDim.x;
    if(i < keep){
        for(k = 0; k < include ; k++){
            int val = (d_Xblas[i*include + k] >= 1e5);
            int aux = d_Xnum[i];
            d_Xblas[i*include + k] *= (!val);
            d_Xco[i*width + aux] = k;
            d_Xnum[i] += val;
            d_Xvalid[i*include + k] = (!val);
        }
    }
}
launched with
int keep = 9000;
int include = 23000;
int width = 0.2*include;
int threads = 192;
int blocks = (keep+threads-1)/threads;
kernel1 <<< blocks,threads >>>( keep, include, width,
                                d_Xco, d_Xnum, d_Xvalid, d_Xblas );
This kernel1 works fine, but it is obviously not fully optimized. I thought it would be straightforward to eliminate the inner loop over k, but for some reason it doesn't work.
My first idea was:
__global__ void kernel2(int keep, int include, int width,
                        int* d_Xco, int* d_Xnum, bool* d_Xvalid,
                        float* d_Xblas)
{
    int i, k;
    i = threadIdx.x + blockIdx.x * blockDim.x;
    k = threadIdx.y + blockIdx.y * blockDim.y;
    if((i < keep) && (k < include) ) {
        int val = (d_Xblas[i*include + k] >= 1e5);
        int aux = d_Xnum[i];
        d_Xblas[i*include + k] *= (float)(!val);
        d_Xco[i*width + aux] = k;
        atomicAdd(&d_Xnum[i], val);
        d_Xvalid[i*include + k] = (!val);
    }
}
launched with a 2D grid:
int keep = 9000;
int include = 23000;
int width = 0.2*include;
int th = 32;
dim3 threads(th,th);
dim3 blocks ((keep+threads.x-1)/threads.x, (include+threads.y-1)/threads.y);
kernel2 <<< blocks,threads >>>( keep, include, width, d_Xco, d_Xnum,
d_Xvalid, d_Xblas );
Although I believe the idea is fine, it does not work and I am running out of ideas here. Could you please help me out? I also think the problem could be in d_Xco, which stores the positions k in a smaller array, pushed to the beginning of the array, so the order matters.
d_Xco
-------------------------------
| 2|3 |15 |4 |5 |5 | | | | | | .......
-------------------------------
In the original code, you have
for(k = 0; k < include ; k++){
    ...
    int aux = d_Xnum[i];
    ...
    d_Xco[i*width + aux] = k;
    ...
}
The index to the d_Xco array is not dependent on k and therefore writing to it each iteration is redundant. The final value will always be include-1. So, replace these two lines inside the k loop with one line outside the k loop:
d_Xco[i*width + d_Xnum[i]] = include - 1;
Once you do that, when you parallelize the k loop you will no longer have the race condition you currently have when many k threads assign different values to the same location in d_Xco concurrently (no guarantee of ordering).
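In kernel1 terms, the change would look something like this sketch (everything else kept as you had it):

// kernel1 with the redundant d_Xco store hoisted out of the k loop
if(i < keep){
    for(k = 0; k < include ; k++){
        int val = (d_Xblas[i*include + k] >= 1e5);
        d_Xblas[i*include + k] *= (!val);
        d_Xnum[i] += val;
        d_Xvalid[i*include + k] = (!val);
    }
    d_Xco[i*width + d_Xnum[i]] = include - 1;   // single store, after the loop
}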
I have a large character array in device global memory that is accessed in a coalesced manner by threads. I've read somewhere that I could speed up memory access by reading 4 or 16 chars in one memory transaction per thread. I believe I would have to use textures and the char4 or int4 structs. However, I can't find any documentation or examples on this. Could anyone here please provide a simple example or pointers to where I can learn more about this?
In my code I define the char array as
char *database = NULL;
cudaMalloc( (void**) &database, SIZE * sizeof(char) );
What would the definition be if I want to use textures and char4 (or int4)?
Thanks very much.
I finally figured out the answer to my own question. The definition with char4
would be
char4 *database = NULL;
cudaMalloc( (void**) &database, SIZE * sizeof(char4)/4 );
Don't need textures for this. The kernel speeds up by a factor of three with char4, but that reduces to two if I do loop unrolling. For the sake of completeness, my kernel is
__global__ void kernel(unsigned int jobs_todo, char* database, float* results ) {
    unsigned int id = threadIdx.x + blockIdx.x * blockDim.x;
    float A = 0; int i; char ch;
    if(id < jobs_todo) {
        for(i = 0; i < 1000; i += 1){
            ch = database[jobs_todo*i + id];
            if(ch == 'A') A++;
        }
        results[id] = A;
    }
}
And with char4 it is
__global__ void kernel4(unsigned int jobs_todo, char4* database, float* results ) {
    unsigned int id = threadIdx.x + blockIdx.x * blockDim.x;
    float A = 0; int i; char4 ch4;
    if(id < jobs_todo) {
        for(i = 0; i < 1000/4; i += 1){
            ch4 = database[jobs_todo*i + id];
            if(ch4.x == 'A') A++;
            if(ch4.y == 'A') A++;
            if(ch4.z == 'A') A++;
            if(ch4.w == 'A') A++;
        }
        results[id] = A;
    }
}
I also tried int4 but it's just .0002 seconds faster than the char4 time.
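For reference, an int4 version along the same lines might look like the hypothetical kernel16 below (a sketch, not the exact code I timed; it assumes the 1000-char stride is divisible by 16 and the buffer is 16-byte aligned):

// Sketch: read 16 chars per transaction through int4, then view the
// 16 loaded bytes as chars to do the comparisons
__global__ void kernel16(unsigned int jobs_todo, int4* database, float* results ) {
    unsigned int id = threadIdx.x + blockIdx.x * blockDim.x;
    float A = 0; int i;
    if(id < jobs_todo) {
        for(i = 0; i < 1000/16; i += 1){
            int4 v = database[jobs_todo*i + id];
            const char* c = (const char*)&v;   // the 16 bytes of this load
            for(int b = 0; b < 16; b++)
                if(c[b] == 'A') A++;
        }
        results[id] = A;
    }
}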
I have several lists of numbers in a file. For example,
.333, .324, .123 , .543, .00054
.2243, .333, .53343 , .4434
Now, I want to get the number of times each number occurs, using the GPU. I believe this will be faster on the GPU than on the CPU because each thread can process one list. What data structure should I use on the GPU to easily get these counts? For example, for the lists above, the answer will look as follows:
.333 = 2 times in entire file
.324 = 1 time
etc..
I'm looking for a general solution, not one that works only on devices with a specific compute capability.
Just writing the kernel suggested by Pavan to see if I have implemented it efficiently:
int uniqueEle = new_end.first - d_Values.begin();
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int)); // stores the count of each unique element
int TPB = 256;
int blocks = (uniqueEle + TPB - 1) / TPB;
// Cast d_Locations to a raw pointer called d_rawI
launch<<<blocks,TPB>>>(d_rawI, count, uniqueEle);

__global__ void launch(int *i, int* count, int n){
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    __shared__ int indexes[256];
    if(id < n ){
        indexes[threadIdx.x] = i[id];
        // handle the difference that spans two blocks
        if(threadIdx.x == 255 && id < n - 1){
            count[id] = i[id+1] - i[id];
        }
    }
    __syncthreads();
    if(id < n - 1){
        if(threadIdx.x < 255)
            count[id] = indexes[threadIdx.x+1] - indexes[threadIdx.x];
    }
}
Question: how do I modify this kernel so that it handles arrays of arbitrary size, i.e., the case where the total number of threads < the number of elements?
Here is how I would do it in MATLAB:
A = [.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434];
[values, locations] = unique(A); % Find unique values and their locations
counts = diff([0, locations]); % Find the count based on their locations
There is no easy way to do this in plain CUDA, but you can use existing libraries.
1) Thrust
It also ships with the CUDA toolkit as of CUDA 4.0.
The MATLAB code can be roughly translated into thrust by using the following functions. I am not too proficient with thrust, but I am just trying to give you an idea of which routines to look at.
float _A[] = {.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434};
int num = 9;
// Values vector (constructing a device_vector from host data copies it to the device)
thrust::device_vector<float> d_A(_A, _A + num);
// Need to sort to get same values together
thrust::stable_sort(d_A.begin(), d_A.end());
// Vector containing 0 to num-1
thrust::device_vector<int> d_I(num);
thrust::sequence(d_I.begin(), d_I.end());
// Output vectors for the unique values and their first locations
thrust::device_vector<float> d_Values(num);
thrust::device_vector<int> d_Locations(num);
// Find unique elements and where they first occur
thrust::pair<thrust::device_vector<float>::iterator,
             thrust::device_vector<int>::iterator> new_end;
new_end = thrust::unique_by_key_copy(d_A.begin(), d_A.end(), d_I.begin(),
                                     d_Values.begin(), d_Locations.begin());
You now have the locations of the first instance of each unique value. You can now launch a kernel to find the differences between adjacent elements from 0 to new_end in d_Locations. Subtract the final value from num to get the count for final location.
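Incidentally, the differencing can also be done without a custom kernel, e.g. with thrust::transform. A minimal sketch, assuming d_Locations was allocated with room for one extra sentinel entry:

int uniqueEle = new_end.first - d_Values.begin();
// Append the total element count as a sentinel so the last diff yields the final count
d_Locations[uniqueEle] = num;
thrust::device_vector<int> d_counts(uniqueEle);
// counts[i] = locations[i+1] - locations[i]
thrust::transform(d_Locations.begin() + 1, d_Locations.begin() + uniqueEle + 1,
                  d_Locations.begin(), d_counts.begin(), thrust::minus<int>());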
EDIT (Adding code that was provided over chat)
Here is how the difference code can be done:
#define MAX_BLOCKS 65535
#define roundup(A, B) (((A) + (B) - 1) / (B))

int uniqueEle = new_end.first - d_Values.begin();
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int));
int TPB = 256;
int num_blocks = roundup(uniqueEle, TPB);
int blocks_y = roundup(num_blocks, MAX_BLOCKS);
int blocks_x = roundup(num_blocks, blocks_y);
dim3 blocks(blocks_x, blocks_y);
kernel<<<blocks,TPB>>>(d_rawI, count, uniqueEle);

__global__ void kernel(const int *i, int* count, int n)
{
    int tx = threadIdx.x;
    int bid = blockIdx.y * gridDim.x + blockIdx.x;
    int id = blockDim.x * bid + tx;
    __shared__ int indexes[256];
    if (id < n) indexes[tx] = i[id];
    __syncthreads();
    if (id < n - 1) {
        if (tx < 255) count[id] = indexes[tx + 1] - indexes[tx];
        else count[id] = i[id + 1] - indexes[tx];
    }
    if (id == n - 1) count[id] = n - indexes[tx];
    return;
}
2) ArrayFire
This is an easy-to-use, free, array-based library.
You can do the following in ArrayFire.
using namespace af;
float h_A[] = {.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434};
int num = 9;
// Transfer data to device
array A(9, 1, h_A);
array values, locations, original;
// Find the unique values and locations
setunique(values, locations, original, A);
// Locations are 0 based, add 1.
// Add *num* at the end to find count of last value.
array counts = diff1(join(locations + 1, num));
Disclosure: I work for AccelerEyes, that develops this software.
To answer the latest addendum to this question - the diff kernel which would complete the thrust method proposed by Pavan could look something like this:
template<int blcksz>
__global__ void diffkernel(const int *i, int* count, const int n) {
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    int strd = blockDim.x * gridDim.x;
    int nmax = blcksz * ((n/blcksz) + ((n%blcksz>0) ? 1 : 0));
    __shared__ int indices[blcksz+1];

    for(; id<nmax; id+=strd) {
        // Data load
        indices[threadIdx.x] = (id < n) ? i[id] : n;
        if (threadIdx.x == (blcksz-1))
            indices[blcksz] = ((id+1) < n) ? i[id+1] : n;
        __syncthreads();

        // Differencing calculation
        int diff = indices[threadIdx.x+1] - indices[threadIdx.x];

        // Store
        if (id < n) count[id] = diff;
        __syncthreads();
    }
}
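A hypothetical launch for it, reusing the names from the earlier snippets (thanks to the grid-stride loop, any block count that fits the device will do; the block size must match the template parameter):

int TPB = 256;
int blocks = (uniqueEle + TPB - 1) / TPB;
if (blocks > 1024) blocks = 1024;   // cap it; the stride loop covers the remainder
diffkernel<256><<<blocks, TPB>>>(d_rawI, count, uniqueEle);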
Here is a solution:
__global__ void counter(float* a, int* b, int N)
{
    int idx = blockIdx.x*blockDim.x+threadIdx.x;
    if(idx < N)
    {
        float my = a[idx];
        int count = 0;
        for(int i=0; i < N; i++)
        {
            if(my == a[i])
                count++;
        }
        b[idx] = count;
    }
}

int main()
{
    int threads = 9;
    int blocks = 1;
    int N = blocks*threads;

    float* h_a;
    int* h_b;
    float* d_a;
    int* d_b;

    h_a = (float*)malloc(N*sizeof(float));
    h_b = (int*)malloc(N*sizeof(int));

    cudaMalloc((void**)&d_a,N*sizeof(float));
    cudaMalloc((void**)&d_b,N*sizeof(int));

    h_a[0]= .333f;
    h_a[1]= .324f;
    h_a[2]= .123f;
    h_a[3]= .543f;
    h_a[4]= .00054f;
    h_a[5]= .2243f;
    h_a[6]= .333f;
    h_a[7]= .53343f;
    h_a[8]= .4434f;

    cudaMemcpy(d_a,h_a,N*sizeof(float),cudaMemcpyHostToDevice);

    counter<<<blocks,threads>>>(d_a,d_b,N);

    cudaMemcpy(h_b,d_b,N*sizeof(int),cudaMemcpyDeviceToHost);

    for(int i=0; i < N; i++)
    {
        printf("%f = %d times\n",h_a[i],h_b[i]);
    }

    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_b);
    getchar();
    return 0;
}