I want to do some thrust operations but I am not sure how exactly.
Right now , I am receiving am array full of zeros ( the h_a array)
I have :
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/generate.h>
template <typename T>
struct square
{
__host__ __device__
T operator()( const T& x ) const
{
return x * x;
}
};
int
main(
int argc,
const char * argv[] )
{
const size_t NbOfPoints = 256;
int BlocksPerGridX = 16;
int BlocksPerGridY = 16;
int ThreadsPerBlockX = 16;
int ThreadsPerBlockY = 16;
// generate random data on the host
thrust::host_vector<float> h_Kx ( NbOfPoints );
thrust::generate( h_Kx.begin(), h_Kx.end(), rand );
thrust::host_vector<float> h_Ky ( NbOfPoints );
thrust::generate( h_Ky.begin(), h_Ky.end(), rand );
// transfer to device
thrust::device_vector<float> dev_Kx = h_Kx;
thrust::device_vector<float> dev_Ky = h_Ky;
// create arrays for holding the number of threads per block in each dimension
int * X , * Y;
cudaMalloc((void **) &X, ThreadsPerBlockX * BlocksPerGridX * sizeof(*X) );
cudaMalloc((void **) &Y, ThreadsPerBlockY * BlocksPerGridY * sizeof(*Y) );
// wrap raw pointer with a device_ptr
thrust::device_ptr<int> dev_X ( X );
thrust::device_ptr<int> dev_Y ( Y );
// use device_ptr in Thrust algorithms
thrust::fill( dev_X, dev_X + ( ThreadsPerBlockX * BlocksPerGridX ) , (int) 0 );
thrust::fill( dev_Y, dev_Y + ( ThreadsPerBlockY * BlocksPerGridY ) , (int) 0 );
// setup arguments
square<float> square_op;
// create various vectors
thrust::device_vector<int> distX ( NbOfPoints );
thrust::device_vector<int> distY ( NbOfPoints );
thrust::device_vector<unsigned int> Tmp ( NbOfPoints );
thrust::host_vector<unsigned int> h_a ( NbOfPoints );
thrust::device_vector<unsigned int> distXSquared ( NbOfPoints );
thrust::device_vector<unsigned int> distYSquared ( NbOfPoints );
// compute distX = dev_Kx - dev_X and distY = dev_Ky - dev_Y
thrust::transform( dev_Kx.begin(), dev_Kx.begin(), dev_X , distX.begin() , thrust::minus<float>() );
thrust::transform( dev_Ky.begin(), dev_Ky.begin(), dev_Y , distY.begin() , thrust::minus<float>() );
//square distances
thrust::transform( distX.begin(), distX.end(), distXSquared.begin(), square_op );
thrust::transform( distY.begin(), distY.end(), distYSquared.begin(), square_op );
// compute Tmp = distX + distY
thrust::transform( distXSquared.begin() ,distXSquared.begin() , distYSquared.begin() , Tmp.begin() , thrust::plus<unsigned int>() );
thrust::copy( Tmp.begin(), Tmp.end(), h_a.begin() );
for ( int i = 0; i < 5; i ++ )
printf("\n temp = %u",h_a[ i ] );
return 0;
}
UPDATE:
Apart the edits from Robert Crovella , you must edit to integers:
square<int> square_op;
thrust::transform( dev_Kx.begin(), dev_Kx.end(), dev_X , distX.begin() , thrust::minus<int>() );
thrust::transform( dev_Ky.begin(), dev_Ky.end(), dev_Y , distY.begin() , thrust::minus<int>() );
You've got several instances of doing zero-length transforms:
thrust::transform( dev_Kx.begin(), dev_Kx.begin(), dev_X , distX.begin() , thrust::minus<float>() );
thrust::transform( dev_Ky.begin(), dev_Ky.begin(), dev_Y , distY.begin() , thrust::minus<float>() );
and:
thrust::transform( distXSquared.begin() ,distXSquared.begin() , distYSquared.begin() , Tmp.begin() , thrust::plus<unsigned int>() );
Since the first two parameters to each of the above transforms is the same, the work being done is zero. Presumably you want the corresponding .end() iterators in the second position rather than .begin()
When I make those changes, I got non-zero values printed out. They are quite large, but you appear to be squaring large values, so I'm not sure what your intent is.
Related
below is a simplified version of a problem that I am trying to solve. Both code snipets compile, but #2 throws an "illegal memory access". Basically, if an array is encapsulated in a structure, passing a pointer to that structure to cudaMalloc creates all kind of problems -- at least the way I do it. I am pretty sure this is due to the fact that the address of dum in the code below is on the host, and so is not accessible inside the kernel. Problem is, I don't know how to create a device version of dum... E.g., using cudaMalloc( (void**)&dum , sizeof(dummy) * 1 ) instead of the new dummy syntax below does not solve the problem. I think I am getting confused with the double pointer used by cudaMalloc.
Of course it may seem silly in this example to encapsulate an array of double in a structure, in the actual code I really need to do this though.
struct dummy
{
double *arr;
};
void allocate( dummy *dum , int n )
{
cudaMalloc( (double**)&(dum->arr) , sizeof(double) * n );
}
__global__ void test( double val , dummy *dum , int n )
{
printf( "test\n" );
for( int ii = 0 ; ii < n ; ii++ )
dum->arr[ii] = val;
}
__global__ void test2( double val , double *arr , int n )
{
printf( "test\n" );
for( int ii = 0 ; ii < n ; ii++ )
arr[ii] = val;
}
int main()
{
int n = 10;
dummy *dum = new dummy;
/* CODE 1: the piece of code below works */
double *p;
gpu_err_chk( cudaMalloc( &p , sizeof(double) * n ) );
test2<<< 1 , 1 >>>( 123.0 , p , n );
gpu_err_chk( cudaDeviceSynchronize() );
/* CODE 2: the piece of code below does not... */
allocate( dum , n );
test<<< 1 , 1 >>>( 123.0 , dum , n );
gpu_err_chk( cudaDeviceSynchronize() );
return 1;
}
After digging through some example in previous posts by Robert, I was able to re-write the code so that it works:
struct dummy
{
double *arr;
};
__global__ void test( dummy *dum , int n )
{
printf( "test\n" );
for( int ii = 0 ; ii < n ; ii++ )
printf( "dum->arr[%d] = %f\n" , ii , dum->arr[ii] );
}
int main()
{
int n = 10;
dummy *dum_d , *dum_h;
srand( time(0) );
dum_h = new dummy;
dum_h->arr = new double[n];
for( int ii = 0 ; ii < n ; ii++ ){
dum_h->arr[ii] = double( rand() ) / RAND_MAX;
printf( "reference data %d = %f\n" , ii , dum_h->arr[ii] );
}
cudaMalloc( &dum_d , sizeof(dummy) * 1 );
cudaMemcpy( dum_d , dum_h , sizeof(dummy) * 1 , cudaMemcpyHostToDevice );
double *tmp;
cudaMalloc( &tmp , sizeof(double) * n );
cudaMemcpy( &( dum_d->arr ) , &tmp , sizeof(double*) , cudaMemcpyHostToDevice ); // copy the pointer (host) to the device structre to a device pointer
cudaMemcpy( tmp , dum_h->arr , sizeof(double) * n , cudaMemcpyHostToDevice );
delete [] dum_h->arr;
delete dum_h;
test<<< 1 , 1 >>>( dum_d , n );
gpu_err_chk( cudaDeviceSynchronize() );
cudaFree( tmp );
cudaFree( dum_d );
return 1;
}
However, I am still confused why this works. Does anyone have a visual diagram of what's going on? I am getting lost with the different pointers...
Moreover, there is one thing that really blows my mind: I can free tmp right before the kernel launch and the code still works, i.e.:
cudaFree( tmp );
test<<< 1 , 1 >>>( dum_d , n );
gpu_err_chk( cudaDeviceSynchronize() );
How is this the case? In my mind (clearly wrong), the device array containing the random values is gone...
Another point of confusion is that I can't free dum_d->arr directly (freeCuda(dum_d->arr)), this throws a segmentation fault.
May I know why the following simple cuda program fails on a device with CC5.2?
#include <iostream>
#include <math.h>
#include <stdlib.h>
#define N 1
__global__ void vector_addition( int *a, int *b, int *c )
{
int i = blockDim.x + blockIdx.x + threadIdx.x;
if ( i < N )
c[ i ] = a[ i ] + b[ i ];
}
int main()
{
size_t bytes = N*sizeof( int );
int *A = (int *)malloc(bytes);
int *B = (int *)malloc(bytes);
int *C = (int *)malloc(bytes);
int *d_A, *d_B, *d_C;
cudaMalloc( &d_A, bytes );
cudaMalloc( &d_B, bytes );
cudaMalloc( &d_C, bytes );
for ( int i = 0; i < N; i++ ) {
A[ i ] = 1; B[ i ] = 2; C[ i ] = 0;
}
cudaMemcpy( d_A, A, bytes, cudaMemcpyHostToDevice );
cudaMemcpy( d_B, B, bytes, cudaMemcpyHostToDevice );
int thr_per_blk = 1024;
int blk_in_grid = ceil( float( N ) / thr_per_blk );
vector_addition<<< blk_in_grid, thr_per_blk >>>( d_A, d_B, d_C );
cudaMemcpy( C, d_C, bytes, cudaMemcpyDeviceToHost );
for ( int i = 0; i < N; i++ ) {
if ( C[ i ] != 3 ) {
std::cout << "error\n";
}
}
free( A ); free( B ); free( C );
cudaFree( d_A ); cudaFree( d_B ); cudaFree( d_C );
return 0;
}
The output is error message.
This line in your kernel is not correct:
int i = blockDim.x + blockIdx.x + threadIdx.x;
That is not the proper way to generate a 1D index. It should be:
int i = blockDim.x * blockIdx.x + threadIdx.x;
With your incorrect indexing, the first thread, which should generate a 0 for a globally unique index, generates 1024+0+0 = 1024. This fails the if test in your kernel, so no threads actually do anything.
When I try to use swap() in in the function partitionk(), I get the error
" error: request for member 'swap' in '(& num_list)->std::vector<_Tp, _Alloc>::operator[] >(((std::vector::size_type)endn))', which is of non-class type '__gnu_cxx::__alloc_traits >::value_type {aka int}'| " `
enter code here
#include <iostream>
#include <vector>
#include <unordered_map>
#include <cstdlib>
using namespace std;
int partitionk(vector<int>& num_list , int start, int endn ) {
int pindex = start;
int rand_num = rand( ) % endn;
num_list[endn].swap(num_list[rand_num]); // getting error
for (int i = 1 ; i < endn ; i++){
if ( num_list[i] < num_list[endn] ){
num_list[i].swap( num_list[pindex] ); // getting error
pindex += 1;
}
}
num_list[endn].swap(num_list[pindex]); // getting error
return pindex;
}
void quick_sort( vector<int>& num_list , int start, int endn ){
if (start >= endn) return ;
else{
int index = partitionk( num_list , start, endn ) ;
quick_sort( num_list , start, index );
quick_sort( num_list , index+1, endn );
}
}
int main()
{
vector <int> nums= {4,7,1,3,9,5};
quick_sort(nums , 0 , nums.size()-1 );
for (auto i : nums){
cout << i << " ";
}
}
Use std::swap():
std::swap(num_list[endn], num_list[num]];
The vector member swap() is meant for swapping entire vectors. And your use of swap() tries to call a swap member of a vector item, that is an int : there's no swap() for this type.
I am trying to replace some thrust calls to arrayfire to check the performance.
I am not sure if I am using properly arrayfire because the results I am taking do not match at all.
So , the thrust code for example I am using is:
cudaMalloc( (void**) &devRow, N * sizeof(float) );
...//devRow is filled
thrust::device_ptr<float> SlBegin( devRow );
for ( int i = 0; i < N; i++, SlBegin += PerSlElmts )
{
thrust::inclusive_scan( SlBegin, SlBegin + PerSlElmts, SlBegin );
}
cudaMemcpy( theRow, devRow, N * sizeof(float), cudaMemcpyDeviceToHost );
//use theRow...
Arrayfire:
af::array SlBegin( N , devRow );
for ( int i = 0;i < N; i++,SlBegin += PerSlElmts )
{
accum( SlBegin );
}
cudaMemcpy( theRow, devRow, N * sizeof(float), cudaMemcpyDeviceToHost );
//use theRow..
I am not sure how arrayfire handles the copy : af::array SlBegin( N , devRow ); .In thrust we have the device pointer which points from devRow to SlBegin , but in arrayfire..?
Also , I wanted to ask about using gfor .
In arrayfire webpage , it states that
Do not use this function directly; see GFOR: Parallel For-Loops.
And then for GFOR :
GFOR is disabled in the current version of ArrayFire
So , we can't use gfor?
---------UPDATE---------------------------
I have a small running example which shows the different results:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include "arrayfire.h"
#include <thrust/scan.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
__global__ void Kernel( const int N ,float * const devRow )
{
int i = threadIdx.x;
if ( i < N )
devRow[ i ] = i;
}
int main(){
int N = 6;
int Slices = 2;
int PerSlElmts = 3;
float * theRow = (float*) malloc ( N * sizeof( float ));
for ( int i = 0; i < N; i ++ )
theRow[ i ] = 0;
// raw pointer to device memory
float * devRow;
cudaMalloc( (void **) &devRow, N * sizeof( float ) );
Kernel<<< 1,N >>>( N , devRow );
cudaDeviceSynchronize();
// wrap raw pointer with a device_ptr
thrust::device_ptr<float> SlBegin( devRow );
for ( int i = 0; i < Slices; i++ , SlBegin += PerSlElmts )
thrust::inclusive_scan( SlBegin, SlBegin + PerSlElmts , SlBegin );
cudaMemcpy( theRow, devRow, N * sizeof(float), cudaMemcpyDeviceToHost );
for ( int i = 0; i < N; i++ )
printf("\n Thrust accum : %f",theRow[ i ] );
//--------------------------------------------------------------------//
Kernel<<< 1,N >>>( N , devRow );
cudaDeviceSynchronize();
af::array SlBeginFire( N, devRow );
for ( int i = 0; i < Slices; i++ , SlBeginFire += PerSlElmts )
af::accum( SlBeginFire );
SlBeginFire.host( theRow );
for ( int i = 0; i < N; i++ )
printf("\n Arrayfire accum : %f",theRow[ i ] );
cudaFree( devRow );
free( theRow );
return 0;
}
It looks like you are trying to run a column-wise (0th-dim in ArrayFire) scan on a 2D array. Here is some code that you could use:
af::array SlBegin(N, devRow);
af::array result = accum(SlBegin, 0);
Here is a sample output
A [5 3 1 1]
0.7402 0.4464 0.7762
0.9210 0.6673 0.2948
0.0390 0.1099 0.7140
0.9690 0.4702 0.3585
0.9251 0.5132 0.6814
accum(A, 0) [5 3 1 1]
0.7402 0.4464 0.7762
1.6612 1.1137 1.0709
1.7002 1.2236 1.7850
2.6692 1.6938 2.1435
3.5943 2.2070 2.8249
This runs and inclusive scan on each column independently.
As for gfor, it has been added to the Open Source version of ArrayFire. As this code base is still a beta, improvements and fixes are taking place very rapidly. So keep a watch on our github page.
Hi I'm writing a simple Program for practicing to work with texture memory. I Just want to write my data into Texture Memory and write it back into Global Memory. But i cannont read out the Values. Here is the code.
#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "HelloWorld.h"
#include "linearInterpolation_kernel4.cu"
using namespace std;
using std::cout;
const int blocksize = 16;
__global__
void hello(char *a, int *b) {
a[threadIdx.x] += b[threadIdx.x];
}
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
if( cudaSuccess != err) {
printf("%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
}
}
// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
inline void __getLastCudaError( const char *errorMessage, const char *file, const int line )
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
printf("%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", file, line, errorMessage, (int)err, cudaGetErrorString( err ) );
}
}
int main()
{
int N = 40;
float *A;
A = (float *) malloc(N*sizeof(float));
float *B;
B = (float *) malloc(N*sizeof(float));
float *result;
result = (float *) malloc(N*sizeof(float));
float angle = 0.8f;
for(int i = 0; i < N; i++){
A[i] = i; //(float)rand();
B[i] = i+1; //(float)rand();
}
ipLinearTexture2(A,B,result,angle,N);
float result2;
result2 = (angle)*A[4] + (1-angle)*B[4];
printf(" A %f B %f Result %f\n", A[4], B[4], result[4]);
cout << result2 << endl;
return 1;
}
void ipLinearTexture2(float *A, float* B, float* result, float angle, int N)
{
float cuTime;
int N2 = N * 2;
float *dev_result;
float **AB;
AB = (float **) malloc( N * sizeof(float *));
if(AB)
{
for(int i = 0; i < N; i++)
{
AB[i] = (float *) malloc( 2 * sizeof(float *));
}
}
for (int i = 0; i < N; i = i++)
{
AB[i][0] = A[i];
AB[i][1] = B[i];
}
cudaMalloc(&dev_result, N * sizeof(float));
unsigned int size = N2 * sizeof(float);
//cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaArray* cu_array;
checkCudaErrors(cudaMallocArray( &cu_array, &channelDesc,N,2));
cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);
// set texture parameters
tex2.normalized = true;
tex2.filterMode = cudaFilterModeLinear;
tex2.addressMode[0] = cudaAddressModeWrap; //cudaAddressModeWrap;
tex2.addressMode[1] = cudaAddressModeWrap; //cudaAddressModeClamp;
checkCudaErrors(cudaBindTextureToArray( tex2, cu_array, channelDesc));
dim3 dimBlock(10, 1, 1);
dim3 dimGrid((int)ceil((double)N*2/dimBlock.x), 1, 1);
transformKernel4<<< 256, 256, 0 >>>( dev_result, N, 2, angle);
checkCudaErrors(cudaMemcpy(result, dev_result, N * sizeof(float), cudaMemcpyDeviceToHost));
cout << "==================================================" << endl;
for (int i = 0 ; i < N ;i++)
{
cout << result[i] << " on " << i << endl;
}
cout << "==================================================" << endl;
checkCudaErrors(cudaUnbindTexture(tex));
checkCudaErrors(cudaFree(dev_result));
checkCudaErrors(cudaFreeArray(cu_array));
}
and here is the kernel code
#ifndef _SIMPLETEXTURE_KERNEL5_H_
#define _SIMPLETEXTURE_KERNEL5_H_
// Texture references
texture<float, 2, cudaReadModeElementType> tex2;
__global__ void
transformKernel4(float* g_odata, int width, int height, float theta)
{
unsigned int xid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int yid = blockIdx.y * blockDim.y + threadIdx.y;
if (xid >= width || yid >= height) return;
float dx = 1.0f / (float)width;
float dy = 1.0f / (float)height;
float x = ((float)xid + 0.5f) * dx;
float y = ((float)yid + 0.5f) * dy;
float value = tex2D(tex2, x , y);
printf("wert %f xid %i yid %i \n",value, xid, yid);
g_odata[yid * width + xid] = value;
}
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
Can somebody tell what i am doing wrong?
I have edited it to remove the first 2 logical mistake. Put why am I need able to print out my data?
It was the wrong binding of the Arrays. You can not use multidimensional Arrays in C that can be copied. You have to use a onedimensional array that respresents a multidimensional.
I can see 2 logical errors here.
The first one is the one pointed out by #asm.
The output should be stored by calculating linear index from 2D x and y indices.
outputIndex = yid * width + xid;
The second one is that the memory allocation for the cudaArray structure is internally aligned.
You should consider using cudaMemcpy2DToArray function to avoid erroneous data copying.
cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);