I am using Thrust 1.8 and I get two compiler errors when I try to compile the following code:
#include <thrust/device_vector.h>
#include <thrust/functional.h>

int main(int argc, char* argv[])
{
    thrust::device_vector<bool> condition(100);
    thrust::device_vector<int> input(100);
    thrust::device_vector<float> result(100);
    float mean = 10.4f;
    thrust::transform(condition.begin(), condition.end(), input.begin(), result.begin(),
                      ( (thrust::placeholders::_1) ? (thrust::placeholders::_2) : (mean) ));
}
When I try to compile, I get the following compile-time errors:
(for placeholders::_1)
Error: Expression must be of bool type (or convertible to bool)
(for placeholders::_2)
Error: operand types are incompatible ("const thrust::detail::functional::actor<thrust::detail::functional::argument<1U>>" and "float")
How can I correct this?
You cannot use the placeholders the way you tried to, i.e. in combination with the ternary operator: placeholder expressions are built from overloaded operators, and ?: is not an operator that can be overloaded in C++.
Instead, you could create your own functor:
struct my_fun : public thrust::binary_function<bool,float,float>
{
    float mean;

    my_fun(float mean) : mean(mean) {}

    __host__ __device__
    float operator()(bool condition, float input) const
    {
        // select the input value where the condition holds, the mean otherwise
        float result = mean;
        if (condition)
        {
            result = input;
        }
        return result;
    }
};
...

thrust::transform(condition.begin(),
                  condition.end(),
                  input.begin(),
                  result.begin(),
                  my_fun(mean));
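For reference, here is a minimal compilable sketch of the whole program with this functor dropped in (the vector sizes and the mean value are taken from the question; note that the int elements of input are implicitly converted to float by the functor's parameter type):

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>

struct my_fun : public thrust::binary_function<bool,float,float>
{
    float mean;
    my_fun(float mean) : mean(mean) {}

    __host__ __device__
    float operator()(bool condition, float input) const
    {
        return condition ? input : mean;
    }
};

int main(int argc, char* argv[])
{
    thrust::device_vector<bool> condition(100);
    thrust::device_vector<int> input(100);
    thrust::device_vector<float> result(100);
    float mean = 10.4f;
    thrust::transform(condition.begin(), condition.end(),
                      input.begin(), result.begin(), my_fun(mean));
    return 0;
}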
In CUDA 9.2 I have something like this:
#ifdef __CUDA_ARCH__
struct Context { float n[4]; } context;
#else
typedef __m128 Context;
#endif

struct A { float k[2]; };
struct B { float q[4]; };

struct FTransform : thrust::unary_function<A, B>
{
    const Context context;
    FTransform(Context context) : context(context){}

    __device__ __host__ B operator()(const A& a) const
    {
        B b{{a.k[0], a.k[1], a.k[0]*context.n[0], a.k[1]*context.n[1]}};
        return b;
    }
};

void DoThrust(B* _bs, const Context& context, A* _as, uint32_t count)
{
    thrust::device_ptr<B> bs = thrust::device_pointer_cast(_bs);
    thrust::device_ptr<A> as = thrust::device_pointer_cast(_as);

    FTransform fTransform(context);
    auto first = thrust::make_transform_iterator(as, fTransform);
    auto last = thrust::make_transform_iterator(as + count, fTransform);
    thrust::copy(first, last, bs);
}

int main(int c, char **argv)
{
    const uint32_t Count = 4;
    Context context;

    A* as;
    B* bs;
    cudaMalloc(&as, Count*sizeof(A));
    cudaMalloc(&bs, Count*sizeof(B));

    A hostAs[Count];
    cudaMemcpy(as, hostAs, Count * sizeof(A), cudaMemcpyHostToDevice);

    DoThrust(bs, context, as, Count);

    B hostBs[Count];
    cudaMemcpy(hostBs, bs, Count * sizeof(B), cudaMemcpyDeviceToHost); // crash

    return 0;
}
int main(int c, char **argv)
{
const uint32_t Count = 4;
Context context;
A* as;
B* bs;
cudaMalloc(&as, Count*sizeof(A));
cudaMalloc(&bs, Count*sizeof(B));
A hostAs[Count];
cudaMemcpy(as, hostAs, Count * sizeof(A), cudaMemcpyHostToDevice);
DoThrust(bs, context, as, Count);
B hostBs[Count];
cudaMemcpy(hostBs, bs, Count * sizeof(B), cudaMemcpyDeviceToHost);//crash
return 0;
}
Then when I call a standard cudaMemcpy() later on the results, I get the exception "an illegal memory access was encountered".
If I replace the Thrust code with a non-Thrust equivalent, there is no error and everything works fine. With various combinations of trying to copy to device_vectors etc. I get different crashes that seem to be Thrust trying to release the device_ptrs for some reason, so maybe the problem is there?
== UPDATE ==
OK, that was confusing: it appears to be due to the FTransform functor's context member variable in my actual, more complicated case. Specifically this:
struct FTransform : thrust::unary_function<A, B>
{
#ifdef __CUDA_ARCH__
    struct Context { float v[4]; } context;
#else
    __m128 context;
#endif
    ...
};
So I guessed it was an alignment problem somehow, and in fact it is, as the following works:
#ifdef __CUDA_ARCH__
    struct __align__(16) Context { float v[4]; } context;
#else
    __m128 context;
#endif
The solution is to ensure that if you use aligned types as Thrust functor members (such as __m128 SSE types) that are copied to the GPU, they are declared with the same alignment during both NVCC's CPU and GPU compilation passes. Do not assume that a type will be fine just because it seems to naturally align to its equivalent in the other pass; otherwise bad, hard-to-understand things may happen.
So, for example, the __align__(16) is necessary in code like this:
struct FTransform : thrust::unary_function<A, B>
{
#ifdef __CUDA_ARCH__
    struct __align__(16) Context { float v[4]; } context;
#else
    __m128 context;
#endif

    FTransform(Context context) : context(context){}

    __device__ __host__ B operator()(const A& a) const; // function makes use of context
};
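One way to catch this kind of divergence at compile time (my own addition, not from the original post) is to static_assert the member's layout, so the device pass fails loudly if the __align__(16) is ever dropped:

#include <xmmintrin.h> // __m128, used by the host pass

struct FTransformChecked
{
#ifdef __CUDA_ARCH__
    struct __align__(16) Context { float v[4]; } context;
#else
    __m128 context;
#endif
    // both passes must agree on the member's size and alignment,
    // otherwise host and device see different struct layouts
    static_assert(sizeof(context) == 16, "Context size differs between passes");
    static_assert(alignof(decltype(context)) == 16, "Context alignment differs between passes");
};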
I have found some strange runtime behaviour while experimenting with function pointers in CUDA.
Goal
My goal is to make my function pointers choose which function to apply to two objects according to an internal property of the latter.
In short, I want to emulate C++ templates with a CUDA kernel - without actually using template arguments or switch clauses, but function pointers and class/struct members instead.
Approach
Define my custom objects struct customObj with one property (int type) that will emulate the arguments of a template.
Define a bunch of dummy functions (Sum(), Subtract(), etc) to choose from.
Keep the list of functions to apply (functionsList) and respective type members to look up (first_types, second_types) in __constant__ memory, such that function functionsList[i](obj1,obj2) is applied to objects with obj1.type == first_types[i] and obj2.type == second_types[i].
Working code
The following code has been compiled for Linux x86_64 with CUDA 5.0, on a GPU with compute capability 3.0 (GeForce GTX 670), and works.
#include <stdio.h>
#include <iostream>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

struct customObj
{
    int type;
    double d;

    // Constructors
    __device__ __host__ customObj() {}
    __device__ __host__ customObj(const int& _type, const double& _d) : type(_type), d(_d) {}
};
typedef void (*function_t)(customObj&, customObj&);
// Define a bunch of functions
__host__ __device__ void Sum(customObj& obj1, customObj& obj2) {printf("Sum chosen! d1 + d2 = %f\n", obj1.d + obj2.d);}
__host__ __device__ void Subtract(customObj& obj1, customObj& obj2) {printf("Subtract chosen! d1 - d2 = %f\n", obj1.d - obj2.d);}
__host__ __device__ void Multiply(customObj& obj1, customObj& obj2) {printf("Multiply chosen! d1 * d2 = %f\n", obj1.d * obj2.d);}
#define ARRAYLENGTH 3
__constant__ int first_type[ARRAYLENGTH] = {1, 2, 3};
__constant__ int second_type[ARRAYLENGTH] = {1, 1, 2};
__constant__ function_t functionsList[ARRAYLENGTH] = {Sum, Sum, Subtract};
// Kernel to loop through functions list
__global__ void choosefunction(customObj obj1, customObj obj2) {
    int i = 0;
    function_t f = NULL;
    do {
        if ((obj1.type == first_type[i]) && (obj2.type == second_type[i])) {
            f = functionsList[i];
            break;
        }
        i++;
    } while (i < ARRAYLENGTH);

    if (f == NULL) printf("No possible interaction!\n");
    else f(obj1, obj2);
}

int main() {
    customObj obj1(1, 5.2), obj2(1, 2.6);

    choosefunction<<<1,1>>>(obj1, obj2);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    return 0;
}
The issue
The problem I have found is that, as soon as I replace the datatype of the member int type and the related variables and functions (__constant__ int first_types[...] and so on), the code compiles but stops working!
If I change the datatype from int to char or int8_t, the memory checker throws error 4 on my call to cudaDeviceSynchronize().
If I change the datatype to unsigned short int, I get a hardware stack overflow.
So, has anybody had similar issues when working with __constant__ memory? I really have no clue what is going on. As far as I know, char and int8_t are built-in types 1 byte in length, while the size of int is 4 bytes, so maybe it is about data alignment, but I'm just guessing here. Besides, CUDA is supposed to support function pointers on the GPU since compute capability 2.0. Are there any special constraints for function pointers in __constant__ memory that I'm missing?
I was able to reproduce the problem (error 4, unspecified launch failure) with CUDA 5.0 on 64-bit RHEL 5.5, but not with CUDA 6.0.
Please update/upgrade to CUDA 6.
I was trying to make something like this (actually, I need to write some integration functions) in CUDA:
#include <iostream>
using namespace std;

float f1(float x) {
    return x * x;
}

float f2(float x) {
    return x;
}

void tabulate(float p_f(float)) {
    for (int i = 0; i != 10; ++i) {
        std::cout << p_f(i) << ' ';
    }
    std::cout << std::endl;
}

int main() {
    tabulate(f1);
    tabulate(f2);
    return 0;
}
output:
0 1 4 9 16 25 36 49 64 81
0 1 2 3 4 5 6 7 8 9
I tried the following but only got the error
Error: Function pointers and function template parameters are not supported in sm_1x.
float f1(float x) {
    return x;
}

__global__ void tabulate(float lower, float upper, float p_function(float), float* result) {
    for (lower; lower < upper; lower++) {
        *result = *result + p_function(lower);
    }
}

int main() {
    float res;
    float* dev_res;

    cudaMalloc( (void**)&dev_res, sizeof(float) );

    tabulate<<<1,1>>>(0.0, 5.0, f1, dev_res);
    cudaMemcpy(&res, dev_res, sizeof(float), cudaMemcpyDeviceToHost);

    printf("%f\n", res);
    /************************************************************************/
    scanf("%s");
    return 0;
}
To get rid of your compile error, you'll have to use -gencode arch=compute_20,code=sm_20 as a compiler argument when compiling your code. But then you'll likely have some runtime problems:
Taken from the CUDA Programming Guide http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#functions
Function pointers to __global__ functions are supported in host code, but not in device code.
Function pointers to __device__ functions are only supported in device code compiled for devices of compute capability 2.x and higher.
It is not allowed to take the address of a __device__ function in host code.
so you can have something like this (adapted from the "FunctionPointers" sample):
// your function pointer type - returns unsigned char, takes parameters of type unsigned char and float
typedef unsigned char(*pointFunction_t)(unsigned char, float);

// some device function to be pointed to
__device__ unsigned char
Threshold(unsigned char in, float thresh)
{
    ...
}

// pComputeThreshold is a device-side function pointer to your __device__ function
__device__ pointFunction_t pComputeThreshold = Threshold;

// the host-side function pointer to your __device__ function
pointFunction_t h_pointFunction;

// in host code: copy the function pointer to its host equivalent
cudaMemcpyFromSymbol(&h_pointFunction, pComputeThreshold, sizeof(pointFunction_t));
You can then pass the h_pointFunction as a parameter to your kernel, which can use it to call your __device__ function.
// your kernel taking your __device__ function pointer as a parameter
__global__ void kernel(pointFunction_t pPointOperation)
{
    unsigned char tmp;
    ...
    tmp = (*pPointOperation)(tmp, 150.0f);
    ...
}

// invoke the kernel in host code, passing in your host-side __device__ function pointer
kernel<<<...>>>(h_pointFunction);
Hopefully that made some sense. In all, it looks like you would have to change your f1 function to be a __device__ function and follow a similar procedure (the typedefs aren't necessary, but they make the code nicer) to get a valid function pointer on the host side to pass to your kernel; a sketch of that adaptation follows below. I'd also advise giving the FunctionPointers CUDA sample a look over.
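To make that concrete, here is a minimal sketch of how the asker's tabulate code could be reworked (the names op_t and d_f1 are mine; I also initialize *result first, which the original code skipped):

#include <cstdio>

typedef float (*op_t)(float);

// the function must be a __device__ function to be called from the kernel
__device__ float f1(float x) {
    return x;
}

// device-side function pointer symbol, initialized to f1
__device__ op_t d_f1 = f1;

__global__ void tabulate(float lower, float upper, op_t p_function, float* result) {
    for (; lower < upper; lower++) {
        *result = *result + p_function(lower);
    }
}

int main() {
    float res = 0.0f;
    float* dev_res;
    cudaMalloc((void**)&dev_res, sizeof(float));
    cudaMemcpy(dev_res, &res, sizeof(float), cudaMemcpyHostToDevice);

    // fetch the device-side function pointer into host code
    op_t h_f1;
    cudaMemcpyFromSymbol(&h_f1, d_f1, sizeof(op_t));

    tabulate<<<1,1>>>(0.0f, 5.0f, h_f1, dev_res);
    cudaMemcpy(&res, dev_res, sizeof(float), cudaMemcpyDeviceToHost);

    printf("%f\n", res); // 0+1+2+3+4 = 10
    cudaFree(dev_res);
    return 0;
}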
Even though you may be able to compile this code (see @Robert Crovella's answer), it will not work as written. You cannot pass function pointers from host code, as the host compiler has no way of figuring out the device function's address.
Here is a simple class I wrote, based on this question, for function pointers that are callable from within a kernel:
template <typename T>
struct cudaCallableFunctionPointer
{
public:
    cudaCallableFunctionPointer(T* f_)
    {
        // read the device-side function pointer symbol into host memory,
        // then copy its value into a device buffer the kernel can dereference
        T* host_ptr = (T*)malloc(sizeof(T));
        cudaMalloc((void**)&ptr, sizeof(T));
        cudaMemcpyFromSymbol(host_ptr, *f_, sizeof(T));
        cudaMemcpy(ptr, host_ptr, sizeof(T), cudaMemcpyHostToDevice);
        free(host_ptr); // allocated with malloc, so free (not cudaFree)
    }

    ~cudaCallableFunctionPointer()
    {
        cudaFree(ptr);
    }

    T* ptr;
};
you could use it like this:
__device__ double func1(double x)
{
    return x + 1.0f;
}

typedef double (*func)(double x);

// device-side symbol holding a pointer to func1
__device__ func f_ = func1;

__global__ void test_kernel(func* f)
{
    double x = (*f)(2.0);
    printf("%g\n", x);
}

int main()
{
    cudaCallableFunctionPointer<func> f(&f_);
    test_kernel<<<1, 1>>>(f.ptr);
    cudaDeviceSynchronize(); // flush the device-side printf before exiting
}
output:
3
I'm attempting to reduce the min and max of an array of values using Thrust and I seem to be stuck. Given an array of floats what I would like is to reduce their min and max values in one pass, but using thrust's reduce method I instead get the mother (or at least auntie) of all template compile errors.
My original code contains 5 lists of values spread over 2 float4 arrays that I want reduced, but I've boiled it down to this short example.
struct ReduceMinMax {
    __host__ __device__
    float2 operator()(float lhs, float rhs) {
        return make_float2(Min(lhs, rhs), Max(lhs, rhs));
    }
};

int main(int argc, char *argv[]) {
    thrust::device_vector<float> hat(4);
    hat[0] = 3;
    hat[1] = 5;
    hat[2] = 6;
    hat[3] = 1;

    ReduceMinMax binary_op_of_dooooom;
    thrust::reduce(hat.begin(), hat.end(), 4.0f, binary_op_of_dooooom);
}
If I instead split it into 2 reductions, it of course works. My question is then: is it possible to reduce both the min and max in one pass with Thrust, and how? If not, what is the most efficient way of achieving said reduction? Will a transform iterator help me (and if so, will the reduction then be a one-pass reduction)?
Some additional info:
I'm using Thrust 1.5 (as supplied by CUDA 4.2.7)
My actual code is using reduce_by_key, not just reduce.
I found transform_reduce while writing this question, but that one doesn't take keys into account.
As talonmies notes, your reduction does not compile because thrust::reduce expects the binary operator's argument types to match its result type, but ReduceMinMax's argument type is float, while its result type is float2.
thrust::minmax_element implements this operation directly, but if necessary you could instead implement your reduction with thrust::inner_product, which generalizes thrust::reduce:
#include <thrust/inner_product.h>
#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <cassert>

struct minmax_float
{
    __host__ __device__
    float2 operator()(float lhs, float rhs)
    {
        return make_float2(thrust::min(lhs, rhs), thrust::max(lhs, rhs));
    }
};

struct minmax_float2
{
    __host__ __device__
    float2 operator()(float2 lhs, float2 rhs)
    {
        return make_float2(thrust::min(lhs.x, rhs.x), thrust::max(lhs.y, rhs.y));
    }
};

float2 minmax1(const thrust::device_vector<float> &x)
{
    return thrust::inner_product(x.begin(), x.end(), x.begin(),
                                 make_float2(4.0f, 4.0f),
                                 minmax_float2(), minmax_float());
}

float2 minmax2(const thrust::device_vector<float> &x)
{
    using namespace thrust;
    pair<device_vector<float>::const_iterator, device_vector<float>::const_iterator> ptr_to_result;
    ptr_to_result = minmax_element(x.begin(), x.end());
    return make_float2(*ptr_to_result.first, *ptr_to_result.second);
}
int main()
{
    thrust::device_vector<float> hat(4);
    hat[0] = 3;
    hat[1] = 5;
    hat[2] = 6;
    hat[3] = 1;

    float2 result1 = minmax1(hat);
    float2 result2 = minmax2(hat);

    assert(result1.x == result2.x);
    assert(result1.y == result2.y);
}
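Since the question mentions thrust::transform_reduce, for what it's worth, here is a sketch of the same one-pass reduction written that way, reusing the minmax_float2 functor from the code above; the unary functor lifts each element to a (min, max) pair:

#include <thrust/transform_reduce.h>

// lift a single value to a (min, max) pair
struct float_to_float2
{
    __host__ __device__
    float2 operator()(float x) const
    {
        return make_float2(x, x);
    }
};

float2 minmax3(const thrust::device_vector<float> &x)
{
    return thrust::transform_reduce(x.begin(), x.end(), float_to_float2(),
                                    make_float2(4.0f, 4.0f), minmax_float2());
}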
I want to call the MessageBox() function in the following way:
1). load the needed library
2). get the function's address
3). call it
So, for such an aim, as I understand it, I should define a new type with all the argument types of the MessageBox function.
It returns an int and accepts: HWND, LPCSTR, LPCSTR, UINT.
So I defined a new type:
typedef int(__stdcall *msgbox)(HWND, LPCSTR, LPCSTR, UINT);
I have problems calling such a function. Does this way work for all functions, or only for exported ones?
How can I call MessageBox in exactly this way?
Full code:
#include <iostream>
#include <windows.h>
using namespace std;

typedef int(__stdcall *msgbox)(HWND, LPCSTR, LPCSTR, UINT);

int main(void)
{
    HINSTANCE__ *hModule = LoadLibrary(L"\\Windows\\System32\\User32.dll");
    msgbox *me = 0;
    if(hModule != 0)
    {
        me = (msgbox*)GetProcAddress(hModule, "MessageBox");
    }
    return 0;
}
Why are you declaring everything as a pointer?
LoadLibrary returns an HMODULE, not an HINSTANCE__ * (it will work with the latter but it's always better to adhere to the documentation).
Similarly, msgbox is typedef'd to a function pointer type, so me is a msgbox, not a msgbox *.
The reason why GetProcAddress fails is because user32.dll exports 2 functions, MessageBoxA and MessageBoxW. When you simply call MessageBox in your code, macros defined in Windows.h replace it with one of the 2 actual function names depending on whether you're compiling for UNICODE or not. But when you're trying to directly access the exported function as you are doing, you need to explicitly specify which one you're trying to get a pointer to.
#include <iostream>
#include <windows.h>

typedef int(__stdcall *msgbox)(HWND, LPCSTR, LPCSTR, UINT);

int main(void)
{
    HMODULE hModule = ::LoadLibrary(L"User32.dll");
    msgbox me = NULL;

    if( hModule != NULL ) {
        me = reinterpret_cast<msgbox>( ::GetProcAddress(hModule, "MessageBoxA") );
    }

    if( me != NULL ) {
        (*me)( NULL, "I'm a MessageBox", "Hello", MB_OK );
    }

    if( hModule != NULL ) {
        ::FreeLibrary( hModule );
    }
}
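And if you build for UNICODE, the same pattern works with the MessageBoxW export and wide strings; a short sketch (the msgboxW typedef name is mine):

#include <windows.h>

// wide-character variant: LPCWSTR parameters, "MessageBoxW" export
typedef int(__stdcall *msgboxW)(HWND, LPCWSTR, LPCWSTR, UINT);

int main(void)
{
    HMODULE hModule = ::LoadLibrary(L"User32.dll");
    if( hModule != NULL ) {
        msgboxW meW = reinterpret_cast<msgboxW>( ::GetProcAddress(hModule, "MessageBoxW") );
        if( meW != NULL ) {
            (*meW)( NULL, L"I'm a MessageBoxW", L"Hello", MB_OK );
        }
        ::FreeLibrary( hModule );
    }
}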
}