I am trying to replace some Thrust calls with ArrayFire calls to check the performance.
I am not sure if I am using ArrayFire properly, because the results I am getting do not match at all.
The Thrust code I am using, for example, is:
cudaMalloc( (void**) &devRow, N * sizeof(float) );
... // devRow is filled
thrust::device_ptr<float> SlBegin( devRow );
for ( int i = 0; i < Slices; i++, SlBegin += PerSlElmts )  // one scan per slice
{
    thrust::inclusive_scan( SlBegin, SlBegin + PerSlElmts, SlBegin );
}
cudaMemcpy( theRow, devRow, N * sizeof(float), cudaMemcpyDeviceToHost );
// use theRow...
The ArrayFire version:
af::array SlBegin( N, devRow );
for ( int i = 0; i < Slices; i++, SlBegin += PerSlElmts )
{
    accum( SlBegin );
}
cudaMemcpy( theRow, devRow, N * sizeof(float), cudaMemcpyDeviceToHost );
// use theRow...
I am not sure how ArrayFire handles the copy in af::array SlBegin( N, devRow );. In Thrust we have a device pointer that wraps devRow as SlBegin, but what happens in ArrayFire?
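From the documentation, I suspect this constructor defaults to treating the pointer as host memory, so a device pointer may need the af::source flag. A sketch of what I think is needed (I am not sure this is right):
af::array SlBegin( N, devRow, afDevice ); // afDevice (my assumption): wrap the existing device buffer instead of copying from a host pointer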
Also, I wanted to ask about using gfor.
The ArrayFire webpage states:
Do not use this function directly; see GFOR: Parallel For-Loops.
And then for GFOR:
GFOR is disabled in the current version of ArrayFire
So, we can't use gfor?
---------UPDATE---------------------------
I have a small running example which shows the different results:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include "arrayfire.h"
#include <thrust/scan.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

__global__ void Kernel( const int N, float * const devRow )
{
    int i = threadIdx.x;
    if ( i < N )
        devRow[ i ] = i;
}

int main(){
    int N = 6;
    int Slices = 2;
    int PerSlElmts = 3;
    float * theRow = (float*) malloc( N * sizeof( float ) );
    for ( int i = 0; i < N; i++ )
        theRow[ i ] = 0;

    // raw pointer to device memory
    float * devRow;
    cudaMalloc( (void **) &devRow, N * sizeof( float ) );
    Kernel<<< 1, N >>>( N, devRow );
    cudaDeviceSynchronize();

    // wrap raw pointer with a device_ptr
    thrust::device_ptr<float> SlBegin( devRow );
    for ( int i = 0; i < Slices; i++, SlBegin += PerSlElmts )
        thrust::inclusive_scan( SlBegin, SlBegin + PerSlElmts, SlBegin );
    cudaMemcpy( theRow, devRow, N * sizeof(float), cudaMemcpyDeviceToHost );
    for ( int i = 0; i < N; i++ )
        printf("\n Thrust accum : %f", theRow[ i ] );

    //--------------------------------------------------------------------//
    Kernel<<< 1, N >>>( N, devRow );
    cudaDeviceSynchronize();
    af::array SlBeginFire( N, devRow );
    for ( int i = 0; i < Slices; i++, SlBeginFire += PerSlElmts )
        af::accum( SlBeginFire );
    SlBeginFire.host( theRow );
    for ( int i = 0; i < N; i++ )
        printf("\n Arrayfire accum : %f", theRow[ i ] );

    cudaFree( devRow );
    free( theRow );
    return 0;
}
It looks like you are trying to run a column-wise (0th-dim in ArrayFire) scan on a 2D array. Here is some code that you could use:
af::array SlBegin(N, devRow);
af::array result = accum(SlBegin, 0);
Here is a sample output:
A [5 3 1 1]
0.7402 0.4464 0.7762
0.9210 0.6673 0.2948
0.0390 0.1099 0.7140
0.9690 0.4702 0.3585
0.9251 0.5132 0.6814
accum(A, 0) [5 3 1 1]
0.7402 0.4464 0.7762
1.6612 1.1137 1.0709
1.7002 1.2236 1.7850
2.6692 1.6938 2.1435
3.5943 2.2070 2.8249
This runs an inclusive scan on each column independently.
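To tie this back to the slice layout in the question, a sketch (untested, assuming devRow really holds Slices contiguous columns of PerSlElmts elements, and using the af::source overload of the constructor):
// Wrap the existing device buffer as a PerSlElmts x Slices 2D array;
// afDevice (assumed here) avoids interpreting devRow as a host pointer.
af::array A( PerSlElmts, Slices, devRow, afDevice );
af::array result = af::accum( A, 0 ); // inclusive scan down each column
result.host( theRow );                // copy the scanned values back to the host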
As for gfor, it has been added to the open-source version of ArrayFire. As this code base is still in beta, improvements and fixes are taking place very rapidly, so keep a watch on our GitHub page.
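Once you are on a build where gfor is enabled, a batched accumulate looks roughly like this (a sketch from memory, untested; the shapes are example values, not from the question):
af::array A = af::randu( PerSlElmts, Slices );        // example input, my own assumption
af::array B = af::constant( 0, PerSlElmts, Slices );
gfor ( af::seq i, Slices ) {
    B( af::span, i ) = af::accum( A( af::span, i ) ); // all slices scanned in one batched pass
}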
I have a very big array in device memory, and I need to partition it into smaller parts. I wondered if I could use an array of arrays to access them by indices.
I tried to write the following code; however, it returns rubbish, which I think is due to undefined behavior. It reports no error, and I don't know if this approach is possible at all.
#include <stdio.h>
#include <assert.h>
#include <iostream>

inline
cudaError_t checkCuda(cudaError_t result) {
#if defined(DEBUG) || defined(_DEBUG)
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        assert(result == cudaSuccess);
    }
#endif
    return result;
}

__global__ void cudaVectorFill(int **array, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        array[0][i] = 1;
    else if (i < 2 * N)
        array[1][i - N] = 2;
    else if (i < 3 * N)
        array[2][i - 2 * N] = 3;
}

int main() {
    int N = 100000000;
    int **array = new int*[3];
    checkCuda( cudaMalloc(&array[0], N * sizeof(int)) );
    checkCuda( cudaMalloc(&array[1], N * sizeof(int)) );
    checkCuda( cudaMalloc(&array[2], N * sizeof(int)) );
    cudaVectorFill<<<(3 * N + 1023) / 1024, 1024>>>(array, N);
    checkCuda( cudaPeekAtLastError() );
    auto *host_array0 = new int[1];
    auto *host_array1 = new int[1];
    auto *host_array2 = new int[1];
    checkCuda( cudaMemcpy(host_array0, array[0], 1 * sizeof(int), cudaMemcpyDeviceToHost) );
    checkCuda( cudaMemcpy(host_array1, array[1], 1 * sizeof(int), cudaMemcpyDeviceToHost) );
    checkCuda( cudaMemcpy(host_array2, array[2], 1 * sizeof(int), cudaMemcpyDeviceToHost) );
    std::cout << *host_array0 << std::endl << *host_array1 << std::endl << *host_array2 << std::endl;
    return 0;
}
Output is something like:
707093096
707093104
707093112
Correct Output should be:
1
2
3
As noted in comments, if you are passing pointers to a GPU kernel, they have to be accessible to the GPU. That means you either explicitly allocate a device copy of the host array of device pointers and populate it on the device, or rely on managed or otherwise GPU-accessible host memory.
One approach that will probably work in this case is:
int N = 100000000;
int **array = new int*[3];
checkCuda( cudaMalloc(&array[0], N * sizeof(int)) );
checkCuda( cudaMalloc(&array[1], N * sizeof(int)) );
checkCuda( cudaMalloc(&array[2], N * sizeof(int)) );
int **array_d;
checkCuda( cudaMalloc(&array_d, 3 * sizeof(int*)) );
checkCuda( cudaMemcpy(array_d, array, 3 * sizeof(int*), cudaMemcpyHostToDevice) );
cudaVectorFill<<<(3 * N + 1023) / 1024, 1024>>>(array_d, N);
[Standard disclaimer, code written in browser, no guarantees implied or given, use at own risk]
i.e. after building array in host memory, make a copy in GPU memory and pass that GPU memory copy to your kernel. There might be other problems in your code, I haven't analyzed further than the first six lines.
FYI, I just found another approach for 2D allocation in device memory. See method 3 in this example for more information. So we can use something like:
int N = 100000000;
int **array;
cudaMallocManaged(&array, 3 * sizeof(int *));
cudaMallocManaged(&(array[0]), N * sizeof(int));
cudaMallocManaged(&(array[1]), N * sizeof(int));
cudaMallocManaged(&(array[2]), N * sizeof(int));
cudaVectorFill<<<(3 * N + 1023) / 1024, 1024>>>(array, N);
It also worked fine.
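One caveat worth noting: with managed memory, the host must not touch the buffers until the kernel has finished, so a synchronize is needed before any host-side reads. A small sketch:
cudaVectorFill<<<(3 * N + 1023) / 1024, 1024>>>(array, N);
checkCuda( cudaDeviceSynchronize() );  // wait for the kernel before reading managed memory on the host
std::cout << array[0][0] << " " << array[1][0] << " " << array[2][0] << std::endl;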
May I know why the following simple CUDA program fails on a device with CC 5.2?
#include <iostream>
#include <math.h>
#include <stdlib.h>

#define N 1

__global__ void vector_addition( int *a, int *b, int *c )
{
    int i = blockDim.x + blockIdx.x + threadIdx.x;
    if ( i < N )
        c[ i ] = a[ i ] + b[ i ];
}

int main()
{
    size_t bytes = N * sizeof( int );
    int *A = (int *)malloc(bytes);
    int *B = (int *)malloc(bytes);
    int *C = (int *)malloc(bytes);
    int *d_A, *d_B, *d_C;
    cudaMalloc( &d_A, bytes );
    cudaMalloc( &d_B, bytes );
    cudaMalloc( &d_C, bytes );
    for ( int i = 0; i < N; i++ ) {
        A[ i ] = 1; B[ i ] = 2; C[ i ] = 0;
    }
    cudaMemcpy( d_A, A, bytes, cudaMemcpyHostToDevice );
    cudaMemcpy( d_B, B, bytes, cudaMemcpyHostToDevice );
    int thr_per_blk = 1024;
    int blk_in_grid = ceil( float( N ) / thr_per_blk );
    vector_addition<<< blk_in_grid, thr_per_blk >>>( d_A, d_B, d_C );
    cudaMemcpy( C, d_C, bytes, cudaMemcpyDeviceToHost );
    for ( int i = 0; i < N; i++ ) {
        if ( C[ i ] != 3 ) {
            std::cout << "error\n";
        }
    }
    free( A ); free( B ); free( C );
    cudaFree( d_A ); cudaFree( d_B ); cudaFree( d_C );
    return 0;
}
The output is the error message.
This line in your kernel is not correct:
int i = blockDim.x + blockIdx.x + threadIdx.x;
That is not the proper way to generate a 1D index. It should be:
int i = blockDim.x * blockIdx.x + threadIdx.x;
With your incorrect indexing, the first thread, which should generate a 0 for a globally unique index, generates 1024+0+0 = 1024. This fails the if test in your kernel, so no threads actually do anything.
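As an aside, a grid-stride loop is a common way to write such a kernel so that the indexing works for any grid size. A sketch (not part of the original code):
__global__ void vector_addition( const int *a, const int *b, int *c )
{
    // Grid-stride loop: each thread starts at its globally unique index
    // and strides by the total number of threads in the grid.
    for ( int i = blockDim.x * blockIdx.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x )
        c[ i ] = a[ i ] + b[ i ];
}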
I want to do some Thrust operations but I am not sure how, exactly.
Right now, I am receiving an array full of zeros (the h_a array).
I have:
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/generate.h>

template <typename T>
struct square
{
    __host__ __device__
    T operator()( const T& x ) const
    {
        return x * x;
    }
};

int
main(
    int argc,
    const char * argv[] )
{
    const size_t NbOfPoints = 256;
    int BlocksPerGridX = 16;
    int BlocksPerGridY = 16;
    int ThreadsPerBlockX = 16;
    int ThreadsPerBlockY = 16;

    // generate random data on the host
    thrust::host_vector<float> h_Kx ( NbOfPoints );
    thrust::generate( h_Kx.begin(), h_Kx.end(), rand );
    thrust::host_vector<float> h_Ky ( NbOfPoints );
    thrust::generate( h_Ky.begin(), h_Ky.end(), rand );

    // transfer to device
    thrust::device_vector<float> dev_Kx = h_Kx;
    thrust::device_vector<float> dev_Ky = h_Ky;

    // create arrays for holding the number of threads per block in each dimension
    int *X, *Y;
    cudaMalloc( (void **) &X, ThreadsPerBlockX * BlocksPerGridX * sizeof(*X) );
    cudaMalloc( (void **) &Y, ThreadsPerBlockY * BlocksPerGridY * sizeof(*Y) );

    // wrap raw pointers with a device_ptr
    thrust::device_ptr<int> dev_X ( X );
    thrust::device_ptr<int> dev_Y ( Y );

    // use device_ptr in Thrust algorithms
    thrust::fill( dev_X, dev_X + ( ThreadsPerBlockX * BlocksPerGridX ), (int) 0 );
    thrust::fill( dev_Y, dev_Y + ( ThreadsPerBlockY * BlocksPerGridY ), (int) 0 );

    // setup arguments
    square<float> square_op;

    // create various vectors
    thrust::device_vector<int> distX ( NbOfPoints );
    thrust::device_vector<int> distY ( NbOfPoints );
    thrust::device_vector<unsigned int> Tmp ( NbOfPoints );
    thrust::host_vector<unsigned int> h_a ( NbOfPoints );
    thrust::device_vector<unsigned int> distXSquared ( NbOfPoints );
    thrust::device_vector<unsigned int> distYSquared ( NbOfPoints );

    // compute distX = dev_Kx - dev_X and distY = dev_Ky - dev_Y
    thrust::transform( dev_Kx.begin(), dev_Kx.begin(), dev_X, distX.begin(), thrust::minus<float>() );
    thrust::transform( dev_Ky.begin(), dev_Ky.begin(), dev_Y, distY.begin(), thrust::minus<float>() );

    // square distances
    thrust::transform( distX.begin(), distX.end(), distXSquared.begin(), square_op );
    thrust::transform( distY.begin(), distY.end(), distYSquared.begin(), square_op );

    // compute Tmp = distX + distY
    thrust::transform( distXSquared.begin(), distXSquared.begin(), distYSquared.begin(), Tmp.begin(), thrust::plus<unsigned int>() );
    thrust::copy( Tmp.begin(), Tmp.end(), h_a.begin() );
    for ( int i = 0; i < 5; i++ )
        printf("\n temp = %u", h_a[ i ] );
    return 0;
}
UPDATE:
Apart from the edits from Robert Crovella, you must also change to integers:
square<int> square_op;
thrust::transform( dev_Kx.begin(), dev_Kx.end(), dev_X , distX.begin() , thrust::minus<int>() );
thrust::transform( dev_Ky.begin(), dev_Ky.end(), dev_Y , distY.begin() , thrust::minus<int>() );
You've got several instances of doing zero-length transforms:
thrust::transform( dev_Kx.begin(), dev_Kx.begin(), dev_X , distX.begin() , thrust::minus<float>() );
thrust::transform( dev_Ky.begin(), dev_Ky.begin(), dev_Y , distY.begin() , thrust::minus<float>() );
and:
thrust::transform( distXSquared.begin() ,distXSquared.begin() , distYSquared.begin() , Tmp.begin() , thrust::plus<unsigned int>() );
Since the first two parameters to each of the above transforms are the same, the work being done is zero. Presumably you want the corresponding .end() iterators in the second position rather than .begin().
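For reference, a sketch of the three calls with the intended full ranges:
// Full ranges: .end() instead of repeating .begin()
thrust::transform( dev_Kx.begin(), dev_Kx.end(), dev_X, distX.begin(), thrust::minus<float>() );
thrust::transform( dev_Ky.begin(), dev_Ky.end(), dev_Y, distY.begin(), thrust::minus<float>() );
thrust::transform( distXSquared.begin(), distXSquared.end(), distYSquared.begin(), Tmp.begin(), thrust::plus<unsigned int>() );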
When I made those changes, I got non-zero values printed out. They are quite large, but you appear to be squaring large values, so I'm not sure what your intent is.
I multiply each row of pB with each row of pA and put the max value into pC.
The problem is that in the inner loop only the last row of receptors ends up as the "max value". As a result, the right-hand column is totally wrong.
void TestCalcDotMax_2x5x3()
{
    const size_t m = 2; // nReceptors
    const size_t k = 5; // nSources
    const size_t n = 3; // nChemicals
    float pA[m * k] = { 1, 2, 3, 4, 5
                      , 2, 4, 6, 8, 2 };
    float pB[k * n] = { 9, 8, 7, 6, 5
                      , 4, 3, 2, 1, 9
                      , 8, 7, 6, 5, 4 };
    float expected[k * n] = { 18, 32, 42, 48, 25
                            ,  8, 12, 12,  8, 45
                            , 16, 28, 36, 40, 20 };
    float pC[k * n] = { 18, 32, 42, 48, 10
                      ,  8, 12, 12,  8, 18
                      , 16, 28, 36, 40,  8 };
    int rst = ::CalcDotMax( pA, pB, m, k, n, pC );
    CPPUNIT_ASSERT_EQUAL_MESSAGE( "passed processing", 0, rst );
}
// pDevB and pDevC have the same size
__global__ void KernelDotMax( const float* pDevA, const float* pDevB, const size_t m, const size_t k, float* pDevC )
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if( i < m )
    {
        for( size_t j = 0; j < k; j++ )
        {
            const float value = pDevA[ i * k + j ] * pDevB[j];
            if( value > pDevC[j] )
            {
                pDevC[j] = value;
            }
        }
    }
}

__host__ int CalcDotMax( const float* pA, const float* pB, int m, int k, int n, float* pC, pfnMsg fnMsg )
{
    int nbrCtas = m;
    int threadsPerCta = 64;
    if( nbrCtas >= 32 )
    {
        nbrCtas = 32;
        threadsPerCta = 64;
    }
    float* pDevA = nullptr;
    float* pDevB = nullptr;
    float* pDevC = nullptr;
    cudaError_t code = ::cudaMalloc( (void**)&pDevA, m * k * sizeof(float) );
    code = ::cudaMalloc( (void**)&pDevB, k * n * sizeof(float) );
    code = ::cudaMalloc( (void**)&pDevC, k * n * sizeof(float) );
    code = ::cudaMemcpy( pDevA, pA, m * k * sizeof(float), cudaMemcpyHostToDevice );
    code = ::cudaMemcpy( pDevB, pB, k * n * sizeof(float), cudaMemcpyHostToDevice );
    code = ::cudaMemcpy( pDevC, pC, k * n * sizeof(float), cudaMemcpyHostToDevice );
    for( size_t index = 0; index < n * k; index += k )
    {
        KernelDotMax<<<nbrCtas,threadsPerCta>>>( pDevA, &pDevB[index], m, k, &pDevC[index] );
    }
    code = ::cudaMemcpy( pC, pDevC, k * n * sizeof(float), cudaMemcpyDeviceToHost );
    code = ::cudaFree( pDevA );
    code = ::cudaFree( pDevB );
    code = ::cudaFree( pDevC );
    return 0;
}
Sorry, I missed at some point that you had edited your code.
The problem you are having is a race condition. In the failing case you are launching 2 blocks. The design of your algorithm is such that each block is operating on the same set of output elements (in pdevC). Therefore, since both blocks can execute simultaneously, both blocks can write to the same output elements simultaneously. This is a collision and there are two ways you can avoid it:
1. Redesign your algorithm to partition the work differently between blocks. Instead of each block checking all (or the same set of) the output elements against a particular set of inputs, have each block be responsible only for a portion of the output elements, but checking against all the inputs. This is a common refactoring operation that is done when converting a sequential/serial algorithm to one that runs in parallel.
2. Use atomic operations to prevent the collisions from happening. If your algorithm only has a small number of these types of collisions, it may be convenient and not very costly to use atomics. But when the algorithm uses atomics for every output element (perhaps multiple times, as in this case), it's probably better (for higher performance) to try to refactor the code as in method 1 above.
What follows is some code where I illustrate the second approach (because it is easier for me to write). There is no atomic function that provides an atomicMax operation on float, so I crafted my own, following the template given in the atomic functions documentation for creating arbitrary atomic operations using atomicCAS. That is what atomicMaxf is.
If you elect to use the first approach (recommended), I would point out that calling the kernel in a loop is probably not necessary for your algorithm. I would craft a new kernel that assigns one thread to each output point, and then computes all the necessary max operations on the various input points, in a loop (or nested loops) in the kernel. Since each thread is writing to one and only one unique output point, there is no possibility for write collisions between threads.
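To illustrate that first approach, here is a minimal, untested sketch (the kernel name, the grid sizing, and the use of -FLT_MAX as the starting max are my own assumptions, not taken from your code):
#include <cfloat>

// One thread per output element of pDevC (which holds k * n floats),
// so each output has a unique writer and no atomics are needed.
__global__ void KernelDotMaxPerOutput( const float* pDevA, const float* pDevB,
                                       const size_t m, const size_t k, const size_t n,
                                       float* pDevC )
{
    size_t out = blockDim.x * blockIdx.x + threadIdx.x;
    if ( out < k * n )
    {
        size_t j = out % k;               // source position within the slice
        float best = -FLT_MAX;            // assumed starting value
        for ( size_t i = 0; i < m; i++ )  // scan every receptor row
        {
            const float value = pDevA[ i * k + j ] * pDevB[ out ];
            if ( value > best ) best = value;
        }
        pDevC[ out ] = best;
    }
}

// launched once over all outputs, e.g.:
// KernelDotMaxPerOutput<<< (k * n + 63) / 64, 64 >>>( pDevA, pDevB, m, k, n, pDevC );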
In any event, the atomics-based code below should provide correct results:
#include <stdio.h>

__device__ float atomicMaxf(float* address, float val)
{
    int *address_as_int = (int*)address;
    int old = *address_as_int, assumed;
    while (val > __int_as_float(old)) {
        assumed = old;
        old = atomicCAS(address_as_int, assumed,
                        __float_as_int(val));
    }
    return __int_as_float(old);
}

// pDevB and pDevC have the same size
__global__ void KernelDotMax( const float* pDevA, const float* pDevB, const size_t m, const size_t k, float* pDevC )
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if( i < m )
    {
        for( size_t j = 0; j < k; j++ )
        {
            const float value = pDevA[ i * k + j ] * pDevB[j];
            atomicMaxf(pDevC + j, value);
            // if( value > pDevC[j] )
            // {
            //     pDevC[j] = value;
            // }
        }
    }
}

__host__ int CalcDotMax( const float* pA, const float* pB, int m, int k, int n, float* pC )
{
    int nbrCtas = m;
    int threadsPerCta = 64;
    if( nbrCtas >= 32 )
    {
        nbrCtas = 32;
        threadsPerCta = 64;
    }
    float* pDevA = NULL;
    float* pDevB = NULL;
    float* pDevC = NULL;
    cudaError_t code = ::cudaMalloc( (void**)&pDevA, m * k * sizeof(float) );
    code = ::cudaMalloc( (void**)&pDevB, k * n * sizeof(float) );
    code = ::cudaMalloc( (void**)&pDevC, k * n * sizeof(float) );
    code = ::cudaMemcpy( pDevA, pA, m * k * sizeof(float), cudaMemcpyHostToDevice );
    code = ::cudaMemcpy( pDevB, pB, k * n * sizeof(float), cudaMemcpyHostToDevice );
    code = ::cudaMemcpy( pDevC, pC, k * n * sizeof(float), cudaMemcpyHostToDevice );
    for( size_t index = 0; index < n * k; index += k )
    {
        KernelDotMax<<<nbrCtas,threadsPerCta>>>( pDevA, &pDevB[index], m, k, &pDevC[index] );
    }
    code = ::cudaMemcpy( pC, pDevC, k * n * sizeof(float), cudaMemcpyDeviceToHost );
    code = ::cudaFree( pDevA );
    code = ::cudaFree( pDevB );
    code = ::cudaFree( pDevC );
    return 0;
}

void TestCalcDotMax_2x5x3()
{
    const size_t m = 2; // nReceptors
    const size_t k = 5; // nSources
    const size_t n = 3; // nChemicals
    float pA[m * k] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f
                      , 2.0f, 4.0f, 6.0f, 8.0f, 2.0f };
    float pB[k * n] = { 9.0f, 8.0f, 7.0f, 6.0f, 5.0f
                      , 4.0f, 3.0f, 2.0f, 1.0f, 9.0f
                      , 8.0f, 7.0f, 6.0f, 5.0f, 4.0f };
    float expected[k * n] = { 18.0f, 32.0f, 42.0f, 48.0f, 25.0f
                            ,  8.0f, 12.0f, 12.0f,  8.0f, 45.0f
                            , 16.0f, 28.0f, 36.0f, 40.0f, 20.0f };
    float pC[k * n] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
                      , 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
                      , 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
    int rst = ::CalcDotMax( pA, pB, m, k, n, pC );
    printf("passed processing: %d \n", rst );
    for (int i = 0; i < (k*n); i++)
        if (pC[i] != expected[i]) printf("mismatch at %d, should be: %f was: %f\n", i, expected[i], pC[i]);
}

int main(){
    TestCalcDotMax_2x5x3();
    return 0;
}
Thanks a lot - it works now. Is it possible to keep the index of the iteration [idx] at the moment of comparing? Like this:
struct ValIndex_t
{
    float value;
    int index;
};

__device__ float atomicMaxPare( float* address, float val, int* index, int idx )
{
    int *address_as_int = reinterpret_cast<int*>( address ); // assume that float has the size of a 32-bit integer
    int old = *address_as_int, assumed;
    while( val > ::__int_as_float(old) )
    {
        assumed = old;
        old = ::atomicCAS( address_as_int, assumed, ::__float_as_int(val) );
        *index = idx;
    }
    return ::__int_as_float(old);
}

__global__ void CudaPareDotMax( float* pDevA, const float* pDevB, ValIndex_t* pDevC, const size_t m, const size_t k, const size_t n )
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if( idx < m )
    {
        for( size_t row = 0; row < n; row++ )
        {
            for( size_t col = 0; col < k; col++ )
            {
                const size_t slice = col + row * k;
                const size_t index = slice + k * n * idx;
                pDevA[index] *= pDevB[ col + k * idx ];
                float& prvalue = ( pDevC + slice )->value;
                int& prindex = ( pDevC + slice )->index;
                ::atomicMaxPare( &prvalue, pDevA[ index ], &prindex, idx );
            }
        }
    }
}
Or do I have to use another atomic function for the exchange? I don't quite understand how to attach the index exactly at the moment the value becomes the max. Thanks again.
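I also wondered whether packing the value and index into one 64-bit word and swapping both with a single atomicCAS would work. A rough, untested sketch (the packing layout and function name are my own invention):
__device__ void atomicMaxValIdx( unsigned long long* address, float val, int idx )
{
    // My assumption: value in the high 32 bits, index in the low 32 bits,
    // so both members are replaced in one atomic operation.
    unsigned long long repl =
        ( (unsigned long long)(unsigned int)__float_as_int( val ) << 32 ) |
        (unsigned int)idx;
    unsigned long long old = *address, assumed;
    while ( val > __int_as_float( (int)( old >> 32 ) ) )
    {
        assumed = old;
        old = atomicCAS( address, assumed, repl );
    }
}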
Hi, I'm writing a simple program to practice working with texture memory. I just want to write my data into texture memory and write it back into global memory, but I cannot read out the values. Here is the code.
#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "HelloWorld.h"
#include "linearInterpolation_kernel4.cu"

using namespace std;
using std::cout;

const int blocksize = 16;

// forward declaration so main can call it
void ipLinearTexture2(float *A, float* B, float* result, float angle, int N);

__global__
void hello(char *a, int *b) {
    a[threadIdx.x] += b[threadIdx.x];
}

////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions

// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)

inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
    if( cudaSuccess != err) {
        printf("%s(%i) : CUDA Runtime API error %d: %s.\n", file, line, (int)err, cudaGetErrorString( err ) );
    }
}

// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)

inline void __getLastCudaError( const char *errorMessage, const char *file, const int line )
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err) {
        printf("%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
    }
}

int main()
{
    int N = 40;
    float *A;
    A = (float *) malloc(N*sizeof(float));
    float *B;
    B = (float *) malloc(N*sizeof(float));
    float *result;
    result = (float *) malloc(N*sizeof(float));
    float angle = 0.8f;
    for(int i = 0; i < N; i++){
        A[i] = i;   //(float)rand();
        B[i] = i+1; //(float)rand();
    }
    ipLinearTexture2(A, B, result, angle, N);
    float result2;
    result2 = (angle)*A[4] + (1-angle)*B[4];
    printf(" A %f B %f Result %f\n", A[4], B[4], result[4]);
    cout << result2 << endl;
    return 1;
}

void ipLinearTexture2(float *A, float* B, float* result, float angle, int N)
{
    float cuTime;
    int N2 = N * 2;
    float *dev_result;
    float **AB;
    AB = (float **) malloc( N * sizeof(float *) );
    if(AB)
    {
        for(int i = 0; i < N; i++)
        {
            AB[i] = (float *) malloc( 2 * sizeof(float *) );
        }
    }
    for (int i = 0; i < N; i++)
    {
        AB[i][0] = A[i];
        AB[i][1] = B[i];
    }
    cudaMalloc(&dev_result, N * sizeof(float));
    unsigned int size = N2 * sizeof(float);
    //cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaArray* cu_array;
    checkCudaErrors(cudaMallocArray( &cu_array, &channelDesc, N, 2 ));
    cudaMemcpy2DToArray(cu_array, 0, 0, AB, N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);
    // set texture parameters
    tex2.normalized = true;
    tex2.filterMode = cudaFilterModeLinear;
    tex2.addressMode[0] = cudaAddressModeWrap; //cudaAddressModeWrap;
    tex2.addressMode[1] = cudaAddressModeWrap; //cudaAddressModeClamp;
    checkCudaErrors(cudaBindTextureToArray( tex2, cu_array, channelDesc ));
    dim3 dimBlock(10, 1, 1);
    dim3 dimGrid((int)ceil((double)N*2/dimBlock.x), 1, 1);
    transformKernel4<<< 256, 256, 0 >>>( dev_result, N, 2, angle );
    checkCudaErrors(cudaMemcpy(result, dev_result, N * sizeof(float), cudaMemcpyDeviceToHost));
    cout << "==================================================" << endl;
    for (int i = 0; i < N; i++)
    {
        cout << result[i] << " on " << i << endl;
    }
    cout << "==================================================" << endl;
    checkCudaErrors(cudaUnbindTexture(tex2));
    checkCudaErrors(cudaFree(dev_result));
    checkCudaErrors(cudaFreeArray(cu_array));
}
And here is the kernel code:
#ifndef _SIMPLETEXTURE_KERNEL5_H_
#define _SIMPLETEXTURE_KERNEL5_H_

// Texture references
texture<float, 2, cudaReadModeElementType> tex2;

__global__ void
transformKernel4(float* g_odata, int width, int height, float theta)
{
    unsigned int xid = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yid = blockIdx.y * blockDim.y + threadIdx.y;
    if (xid >= width || yid >= height) return;
    float dx = 1.0f / (float)width;
    float dy = 1.0f / (float)height;
    float x = ((float)xid + 0.5f) * dx;
    float y = ((float)yid + 0.5f) * dy;
    float value = tex2D(tex2, x, y);
    printf("wert %f xid %i yid %i \n", value, xid, yid);
    g_odata[yid * width + xid] = value;
}

#endif // #ifndef _SIMPLETEXTURE_KERNEL5_H_
Can somebody tell me what I am doing wrong?
I have edited it to remove the first 2 logical mistakes. But why am I still not able to print out my data?
It was the wrong binding of the arrays. You cannot copy a multidimensional C array (an array of row pointers) to the device in one go. You have to use a one-dimensional array that represents the multidimensional one.
I can see 2 logical errors here.
The first one is the one pointed out by @asm: the output should be stored by calculating a linear index from the 2D x and y indices:
outputIndex = yid * width + xid;
The second one is that the memory allocation for the cudaArray structure is internally aligned. You should consider using the cudaMemcpy2DToArray function to avoid erroneous data copying:
cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);
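Putting the two answers together, a sketch (untested) of flattening AB into one contiguous buffer before the 2D copy:
// Flatten into row-major order: row 0 holds A, row 1 holds B.
float *ABflat = (float *) malloc( 2 * N * sizeof(float) );
for (int i = 0; i < N; i++)
{
    ABflat[i]     = A[i];
    ABflat[N + i] = B[i];
}
checkCudaErrors(cudaMemcpy2DToArray(cu_array, 0, 0, ABflat,
                                    N * sizeof(float),   // source pitch (bytes per row)
                                    N * sizeof(float),   // width in bytes
                                    2,                   // height (number of rows)
                                    cudaMemcpyHostToDevice));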