I am beginning to port an existing FFTW3 application to make use of the CUDA FFTW libraries. The initial stage is to simply replace the fftw3.h header with the cufft.h header and link the cufft libraries instead of the fftw3 libraries.
That is simple enough, and the code compiles with nvcc. However when I execute the code the application is unable to create a plan using the fftw_plan_guru_dft command (it just returns 0 instead of a valid plan).
Since there are no errors reported I am at a loss as to how I might debug this issue. cuda-gdb and gdb do not provide any further insight. They simply report
Error: Internal error reported by CUDA debugger API (error=7). The application cannot be further debugged.
UPDATE: So here is the minimum working example. As mentioned in my comment to Talonmies, this code is autogenerated by a scientific differential equation solver. So please excuse the function names etc.
#define real Re
#define imag Im
#include <complex>
#undef real
#undef imag
#include <cufftw.h>
#include <stdio.h>
int main(void) {
// Guru-interface planning parameters: one transform dimension, zero
// outer-loop dimensions (as emitted by the code generator).
int _transform_sizes_index = 1, _loop_sizes_index = 0;
fftw_iodim _transform_sizes[1], _loop_sizes[2];
// 128-point transform; input/output strides of 0 as generated.
_transform_sizes[0].n = 128;
_transform_sizes[0].is = 0;
_transform_sizes[0].os = 0;
fftw_complex _data_in[128] = {0.};
static fftw_plan _fftw_forward_plan = NULL;
// In-place forward DFT via the guru planner. Under cufftw this returns 0
// (no plan) because fftw_plan_guru_dft has only partial support there --
// which is the failure this example reproduces.
_fftw_forward_plan = fftw_plan_guru_dft(
_transform_sizes_index, _transform_sizes,
_loop_sizes_index, _loop_sizes,
reinterpret_cast<fftw_complex*>(_data_in),
reinterpret_cast<fftw_complex*>(_data_in),
FFTW_FORWARD, FFTW_PATIENT);
if (!_fftw_forward_plan)
printf("Error: Unable to create forward plan\n");
return 0;
}
Unless anyone else knows what I am doing wrong, it looks like this particular functionality of fftw3 may not be supported by cufftw.
As talonmies pointed out, the fftw_plan_guru_dft only has partial support in the cufftw library. The above example will run if you instead make use of the basic level fftw_plan_dft. More concretely
#define real Re
#define imag Im
#include <complex>
#undef real
#undef imag
#include <cufftw.h>
#include <stdio.h>
int main(void) {
// Same sizes as the guru version, but using the basic-interface planner,
// which cufftw does support. _loop_sizes_index is now unused.
int _transform_sizes_index = 1, _loop_sizes_index = 0;
int _transform_sizes[1] = {128};
fftw_complex _data_in[128] = {0.};
static fftw_plan _fftw_forward_plan = NULL;
// In-place forward 1-D DFT of 128 complex points.
_fftw_forward_plan = fftw_plan_dft(
_transform_sizes_index, _transform_sizes,
reinterpret_cast<fftw_complex*>(_data_in),
reinterpret_cast<fftw_complex*>(_data_in),
FFTW_FORWARD, FFTW_PATIENT);
if (!_fftw_forward_plan)
printf("Error: Unable to create forward plan\n");
return 0;
}
Related
In a CUDA C project, I would like to try and use the Thrust library in order to find the maximum element inside an array of floats. It seems like the Thrust function thrust::max_element() is what I need. The array on which I want to use this function is the result of a cuda kernel (which seems to work fine) and so it is already present in device memory when calling thrust::max_element().
I am not very familiar with the Thrust library but after looking at the documentation for thrust::max_element() and reading the answers to similar questions on this site, I thought I had grasped the working principles of this process. Unfortunately I get wrong results and it seems that I am not using the library functions correctly. Can somebody please tell me what is wrong in my code?
float* deviceArray;
float* max;   // bug (explained in the answer below): max never points to valid storage
int length = 1025;
*max = 0.0f;  // writes through an uninitialized pointer -- undefined behavior
size = (int) length*sizeof(float);  // NOTE(review): `size` is not declared in this fragment
cudaMalloc(&deviceArray, size);
cudaMemset(deviceArray, 0.0f, size);
// here I launch a cuda kernel which modifies deviceArray
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(deviceArray);
*max = *(thrust::max_element(d_ptr, d_ptr + length));
I use the following headers:
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
I keep getting zero values for *max even though I am sure that deviceArray contains non-zero values after running the kernel.
I am using nvcc as a compiler (CUDA 7.0) and I am running the code on a device with compute capability 3.5.
Any help would be much appreciated. Thanks.
This is not proper C code:
float* max;
int length = 1025;
*max = 0.0f;
You're not allowed to store data using a pointer (max) until you properly provide an allocation for that pointer (and set the pointer equal to the address of that allocation).
Apart from that, the rest of your code seems to work for me:
$ cat t990.cu
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <iostream>
int main(){
float* deviceArray;
float max, test;  // plain host floats -- no stray pointer this time
int length = 1025;
max = 0.0f;
test = 2.5f;
int size = (int) length*sizeof(float);
cudaMalloc(&deviceArray, size);
// cudaMemset takes an int byte value; 0.0f converts to 0, so this zero-fill works.
cudaMemset(deviceArray, 0.0f, size);
// Place a known non-zero value in the first element so max_element has
// something to find (stands in for the asker's kernel).
cudaMemcpy(deviceArray, &test, sizeof(float),cudaMemcpyHostToDevice);
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(deviceArray);
max = *(thrust::max_element(d_ptr, d_ptr + length));
std::cout << max << std::endl;
}
$ nvcc -o t990 t990.cu
$ ./t990
2.5
$
I want to implement the following program in CUDA.
1. In "main.cpp":
// A centroid whose coordinates live in a dynamically allocated array.
struct Center{
double * Data;   // coordinate values; length is given by `dimension`
int dimension;   // number of entries in Data
};
typedef struct Center Center;
//I allocate a pointer to M Center elements with cudaMalloc as follows
....
#include "kernel.cu"
....
center *V_dev;  // NOTE(review): presumably the `Center` type -- the lowercase `c` looks like a typo in the post
int M =100, n=4;
cudaStatus = cudaMalloc((void**)&V_dev,M*sizeof(Center));
Init<<<1,M>>>(V_dev, M, N); // one thread per Center; N (the dimension) is always known before calling. NOTE(review): the kernel declares (V, N, dimension), so M binds to the kernel's `N` here.
My "kernel.cu" file is something like this
#include "cuda_runtime.h"
#include"device_launch_parameters.h"
... //other include headers to allow my .cu file to know the Center type definition
// Initializes one Center struct per thread (launched <<<1,M>>>): each thread
// allocates its Data array on the device heap and zero-fills it.
// In-kernel malloc requires compiling for compute capability 2.0 or higher
// (see the answers below); the malloc return value is not checked here.
__global__ void Init(Center *V, int N, int dimension){
V[threadIdx.x].dimension = dimension;
V[threadIdx.x].Data = (double*)malloc(dimension*sizeof(double));
for(int i=0; i<dimension; i++)
V[threadIdx.x].Data[i] = 0; // any per-element initialization could go here in place of 0
}
I'm on visual studio 2008 and CUDA 5.0. When I Build my project, I've got these errors:
error: calling a __host__ function("malloc") from a __global__ function("Init") is not allowed.
I would like to know how I can make this work. (I know that 'malloc' and other CPU memory-allocation functions are not allowed for device memory.)
malloc is allowed in device code but you have to be compiling for a cc2.0 or greater target GPU.
Adjust your VS project settings to remove any GPU device settings like compute_10,sm_10 and replace it with compute_20,sm_20 or higher to match your GPU. (And, to run that code, your GPU needs to be cc2.0 or higher.)
You need the compiler parameter -arch=sm_20 and a GPU which supports it.
Thrust automatically selects the GPU backend when I provide an algorithm with iterators from thrust::device_vector, since the vector's data lives on the GPU. However, when I only provide thrust::counting_iterator parameters to an algorithm, how can I select which backend it executes on?
In the following invocation of thrust::find, there are no device_vector iterator arguments, so how does Thrust choose which backend (CPU, OMP, TBB, CUDA) to use?
How can I control on which backend this algorithm executes without using thrust::device_vector<> in this code?
thrust::counting_iterator<uint64_t> first(i);
thrust::counting_iterator<uint64_t> last = first + step_size;
// No device_vector iterators appear here, so nothing in the argument types
// pins the call to a backend -- exactly the ambiguity the question asks about.
auto iter = thrust::find(
thrust::make_transform_iterator(first, functor),
thrust::make_transform_iterator(last, functor),
true);
UPDATE 23.01.14. MSVS2012, CUDA5.5, Thrust 1.7:
Compile success!
#include <iostream>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/find.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>
// Predicate functor: true for odd values. Marked __host__ __device__ so it
// can be invoked from either backend.
struct is_odd : public thrust::unary_function<uint64_t, bool> {
__host__ __device__ bool operator()(uint64_t const& x) {
return x & 1;
}
};
int main() {
thrust::counting_iterator<uint64_t> first(0);
thrust::counting_iterator<uint64_t> last = first + 100;
// Passing thrust::device as the first argument (Thrust 1.7+) forces the
// device backend despite the otherwise-ambiguous iterator types.
auto iter = thrust::find(thrust::device,
thrust::make_transform_iterator(first, is_odd()),
thrust::make_transform_iterator(last, is_odd()),
true);
// Keep the console window open (MSVS convenience).
int bbb; std::cin >> bbb;
return 0;
}
Sometimes where a Thrust algorithm executes can be ambiguous, as in your counting_iterator example, because its associated "backend system" is thrust::any_system_tag (a counting_iterator can be dereferenced anywhere because it is not backed by data). In situations like this, Thrust will use the device backend. By default, this will be CUDA. However, you can explicitly control how execution happens in a couple of ways.
You can either explicitly specify the system through the template parameter as in ngimel's answer, or you can provide the thrust::device execution policy as the first argument to thrust::find in your example:
#include <thrust/execution_policy.h>
...
thrust::counting_iterator<uint64_t> first(i);
thrust::counting_iterator<uint64_t> last = first + step_size;
// thrust::device as the first argument selects the device backend explicitly
// (requires Thrust 1.7 or newer, as noted below).
auto iter = thrust::find(thrust::device,
thrust::make_transform_iterator(first, functor),
thrust::make_transform_iterator(last, functor),
true);
This technique requires Thrust 1.7 or better.
You have to specify the System template parameter when instantiating counting_iterator:
// Alternative: bake the backend into the iterator type itself via the second
// (System) template argument.
typedef thrust::device_system_tag System;
thrust::counting_iterator<uint64_t,System> first(i)
If you are using the current version of Thrust, please follow the way Jared Hoberock mentioned. But if you might use older versions (the system that you work at might have old version of CUDA) then the example below might help.
#include <thrust/version.h>
// Select the portable way to pin execution to the host backend depending on
// the Thrust version available at compile time.
// NOTE(review): only THRUST_MINOR_VERSION is compared -- presumably fine for
// the 1.x series this was written against; verify if the major version bumps.
#if THRUST_MINOR_VERSION > 6
#include <thrust/execution_policy.h>
#elif THRUST_MINOR_VERSION == 6
#include <thrust/iterator/retag.h>
#else
#endif
...
#if THRUST_MINOR_VERSION > 6
// Thrust >= 1.7: pass the thrust::host execution policy directly.
total =
thrust::transform_reduce(
thrust::host
, thrust::counting_iterator<unsigned int>(0)
, thrust::counting_iterator<unsigned int>(N)
, AFunctor(), 0, thrust::plus<unsigned int>());
#elif THRUST_MINOR_VERSION == 6
// Thrust 1.6: retag the iterators with the host system tag.
total =
thrust::transform_reduce(
thrust::retag<thrust::host_system_tag>(thrust::counting_iterator<unsigned int>(0))
, thrust::retag<thrust::host_system_tag>(thrust::counting_iterator<unsigned int>(N))
, AFunctor(), 0, thrust::plus<unsigned int>());
#else
// Older Thrust: pass the host_space_tag as a template argument.
// NOTE(review): this branch uses objectCount where the others use N --
// confirm they are the same quantity.
total =
thrust::transform_reduce(
thrust::counting_iterator<unsigned int, thrust::host_space_tag>(0)
, thrust::counting_iterator<unsigned int, thrust::host_space_tag>(objectCount)
, AFunctor(), 0, thrust::plus<unsigned int>());
#endif
#see Thrust: How to directly control where an algorithm invocation executes?
This is my first question. I gave up and will use a hand rolled functor for this, but I am curious as to how it is supposed to be done. The contrived example below is intended to resize all of the vectors in a vector to be of size 9, by filling them with nulls. The indicated line causes MinGW GCC 4.5.0 to spew a lot of template errors. I've tried several different permutations, but only posted the code that I consider to be "closest to correct" below. How should it be written? Note, I want to retain the two-argument version of resize.
#include <vector>
using std::vector;
#include <algorithm>
using std::for_each;
#include <tr1/functional>
using std::tr1::bind;
using std::tr1::placeholders::_1;
int main() {
vector<vector<void *> > stacked_vector(20);
// Attempt to resize every inner vector to 9 null elements via tr1::bind.
// Per the answer below, the int literal 0 is forwarded where a void* is
// expected, which is what triggers the template error spew.
for_each(stacked_vector.begin(),stacked_vector.end(),
bind(&std::vector<void *>::resize,_1,9,0/*NULL*/)); // voluminous error output
return 0;
}
Thank you very much for your input.
It's hard to say without seeing the error output (and frankly, even with it). However, try passing the NULL as a void* type: static_cast<void*>(0). Otherwise the object returned by bind tries to give an int value as the second parameter to resize.
Try this.
#include <functional>
#include <algorithm>
#include <iostream>
#include <vector>
int main()
{
typedef std::vector<int> vec_int;
typedef std::vector<vec_int> vec_vec_int;
// Do this to make the _1 placeholder work
using namespace std::placeholders;
static const int FIRST_DIM = 5;
static const int SECOND_DIM = 10;
static const int DEFAULT_VALUE = 66;
vec_vec_int v(FIRST_DIM);
// Resize every inner vector to SECOND_DIM copies of DEFAULT_VALUE using
// C++11 std::bind (the two-argument resize overload).
std::for_each(v.begin(), v.end(),
std::bind(&vec_int::resize, _1, SECOND_DIM, DEFAULT_VALUE));
// Prints 66: the last element of the last inner vector.
std::cout << v[4][9];
return (0);
}
If you do not want to specify the default value, you do not need to.
Is there a convenient way to use asserts within kernels invoked in device mode?
CUDA now has a native assert function. Use assert(...). If its argument is zero, it will stop kernel execution and return an error. (or trigger a breakpoint if in CUDA debugging.)
Make sure to include "assert.h". Also, this requires compute capability 2.x or higher, and is not supported on MacOS. For more details see CUDA C Programming Guide, Section B.16.
The programming guide also includes this example:
#include <assert.h>
// Demonstrates device-side assert: the first assert passes, the second
// fires and halts kernel execution.
__global__ void testAssert(void)
{
int is_one = 1;
int should_be_one = 0;
// This will have no effect
assert(is_one);
// This will halt kernel execution
assert(should_be_one);
}
int main(int argc, char* argv[])
{
testAssert<<<1,1>>>();
// The device-side assert failure surfaces at this synchronization point.
cudaDeviceSynchronize();
return 0;
}
// Hand-rolled fallback: bail out of the kernel thread when the condition fails.
// NOTE(review): without a do { ... } while (0) wrapper this expands to a bare
// `if`, which can pair with a following `else` unexpectedly; consider wrapping
// before reusing it.
#define MYASSERT(condition) \
if (!(condition)) { return; }
MYASSERT(condition);
if you need something fancier you can use cuPrintf() which is available from the CUDA site for registered developers.