I wrote a __device__ function that uses a for loop. It works on a GTX460 card (compute capability 2.1), but not on a 9500GT (compute capability 1.1).
The function is roughly like this:
__device__ void myFuncD(float4 *myArray, float4 *result, uint index, uint foo, uint *here, uint *there)
{
uint j;
float4 myValue = myArray[index];
uint idxHere = here[foo];
uint idxThere = there[foo];
float4 temp;
for(j=idxHere;j<idxThere;j++){
temp = myArray[j];
//do things with myValue and temp, write result to *result
result->x += /* some calculations with myValue.x and temp.x */
result->y += /* some calculations with myValue.y and temp.y */
result->z += /* some calculations with myValue.z and temp.z */
}
}
__global__ void myKernelD(float4 *myArray, float4 *myResults, uint *here, uint *there)
{
uint index = blockDim.x*blockIdx.x+threadIdx.x;
float4 result = make_float4(0.0f,0.0f,0.0f,0.0f);
uint foo1, foo2, foo3, foo4;
//compute foo1, foo2, foo3, foo4 based on myArray[index]
myFuncD(myArray, &result, index, foo1, here, there);
myFuncD(myArray, &result, index, foo2, here, there);
myFuncD(myArray, &result, index, foo3, here, there);
myFuncD(myArray, &result, index, foo4, here, there);
myResults[index] = result;
}
On the GTX460, myResults has proper values, but on the 9500GT every component of its members is zero.
How can I achieve the same effect with a compute capability 1.1 device?
The user was launching the kernel with too many threads per block and was getting the error "too many resources requested for launch". Decreasing the threads per block allowed the kernel to launch.
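Compute capability 1.1 devices have a much smaller register file per block than Fermi-class parts, so a block size that fits on the GTX460 can exceed the resource budget on the 9500GT. A minimal sketch of catching this at launch time (the launch configuration names here are illustrative, not from the original code):
myKernelD<<<numBlocks, threadsPerBlock>>>(dev_array, dev_results, dev_here, dev_there);
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess)
    printf("Launch failed: %s\n", cudaGetErrorString(err)); // e.g. "too many resources requested for launch"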
I am implementing a circular buffer in global memory so that all threads can read/write data to the same buffer simultaneously. It is a very simple producer/consumer algorithm on the CPU, but I found something wrong in my CUDA code.
The circular buffer was defined as follows:
#define BLOCK_NUM 1024
#define THREAD_NUM 64
#define BUFFER_SIZE BLOCK_NUM*THREAD_NUM*10
struct Stack {
bool bDirty[BUFFER_SIZE];
unsigned int index;
unsigned int iStackSize;
};
The read device is implemented as
__device__ void read(Stack *pStack) {
unsigned int index = atomicDec(&pStack->index, BUFFER_SIZE-1);
if(--index >= BUFFER_SIZE)
index = BUFFER_SIZE - 1;
// check
if(pStack->bDirty[index] == false) {
printf("no data\n");
return;
}
//set read flag
pStack->bDirty[index] = false;
atomicSub(&pStack->iStackSize, 1);
}
The write device function is:
__device__ void write(Stack *pStack) {
unsigned int index = atomicInc(&pStack->index, BUFFER_SIZE - 1);
//check
if(pStack->bDirty[index] == true) {
printf("why dirty\n");
return;
}
pStack->bDirty[index] = true;
atomicAdd(&pStack->iStackSize, 1);
}
In order to test the read/write function in a more robust way, I write the following kernels:
__global__ void kernelWrite(Stack *pStack) {
if(threadIdx.x != 0) //make write less than thread number for testing purpose
write(pStack);
}
__global__ void kernelRead(Stack *pStack) {
read(pStack);
__syncthreads();
if(threadIdx.x % 3 != 0) // make write less than read
write(pStack);
__syncthreads();
}
In the main function, I used a dead loop to test if the read/write is atomic.
int main() {
Stack *pHostStack = (Stack*)malloc(sizeof(Stack));
Stack *pStack;
cudaMalloc(&pStack, sizeof(Stack));
cudaMemset(pStack, 0, sizeof(Stack));
while(true) { //dead loop
kernelWrite<<<BLOCK_NUM, THREAD_NUM>>>(pStack);
cudaDeviceSynchronize();
cudaMemcpy(pHostStack, pStack, sizeof(Stack), cudaMemcpyDeviceToHost);
while(pHostStack->iStackSize >= BLOCK_NUM*THREAD_NUM) {
kernelRead<<<BLOCK_NUM, THREAD_NUM>>>(pStack);
cudaDeviceSynchronize();
cudaMemcpy(pHostStack, pStack, sizeof(Stack), cudaMemcpyDeviceToHost);
}
}
return 0;
}
When I execute the above code, I get the error messages "why dirty" and "no data". What is wrong with the read/write logic?
By the way, I do not map the thread ID to a linear buffer address because in my application maybe only 10% of the threads write to the buffer; it is unpredictable/random.
The key problem is that the operation as a whole is not atomic: the index is updated atomically, but the subsequent check and update of bDirty for that slot is a separate step, so other threads can interleave between them. The weird thing is that when the total thread count is less than 4096, no error message is shown.
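One possible direction (a sketch only, not a verified fix) is to claim the slot and mark it dirty in a single atomic step, for example by storing the flags as unsigned int and using atomicCAS:
__device__ void write(Stack *pStack) {
    unsigned int index = atomicInc(&pStack->index, BUFFER_SIZE - 1);
    // claim the slot only if it is currently clean (0), marking it dirty (1) atomically
    if(atomicCAS(&pStack->bDirty[index], 0u, 1u) != 0u) {
        printf("why dirty\n");
        return;
    }
    atomicAdd(&pStack->iStackSize, 1);
}
This assumes bDirty is declared as unsigned int bDirty[BUFFER_SIZE]. Even then, a full producer/consumer queue needs more care, since the wrapping index can reach a slot that has not been consumed yet.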
I want to properly implement an inlined device function that fills a vector of dynamic size and returns it, like:
__device__ inline thrust::device_vector<double> make_array(double zeta, int l)
{
thrust::device_vector<double> ret;
int N =(int)(5*l+zeta); //the size of the array will depend on l and zeta, in a complex way...
// Make sure of sufficient memory allocation
ret.reserve(N);
// Resize array
ret.resize(N);
//fill it:
//for(int i=0;i<N;i++)
// ...;
return ret;
}
My goal is to use the content of the returned vector in another device function like:
__device__ inline double use_array(double zeta,int l)
{
thrust::device_vector<double> array = make_array(zeta, l);
double result = 0;
for(int i=0; i<array.size(); i++)
result += array[i];
return result;
}
How can I do this properly? My feeling is that a Thrust vector is designed for this type of task, but I want to do it the right way. What is the standard CUDA approach to this task?
thrust::device_vector is not usable in device code.
However, you can return a pointer to a dynamically allocated area, like so:
#include <assert.h>
template <typename T>
__device__ T* make_array(T zeta, int l)
{
int N =(int)(5*l+zeta); //the size of the array will depend on l and zeta, in a complex way...
T *ret = (T *)malloc(N*sizeof(T));
assert(ret != NULL); // error checking
//fill it:
//for(int i=0;i<N;i++)
// ret[i] = ...;
return ret;
}
The inline keyword should not be necessary. The compiler will aggressively inline functions wherever possible.
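Note that in-kernel malloc requires a device of compute capability 2.0 or higher, and large allocations may require raising the device heap size with cudaDeviceSetLimit(cudaLimitMallocHeapSize, ...). A minimal sketch of the calling side, assuming the caller recomputes the same N and frees the buffer when done (this use_array is illustrative, not part of the answer above):
__device__ double use_array(double zeta, int l)
{
    int N = (int)(5*l+zeta);          // same size computation as make_array
    double *arr = make_array(zeta, l);
    double result = 0.0;
    for(int i = 0; i < N; i++)
        result += arr[i];
    free(arr);                        // in-kernel malloc must be paired with in-kernel free
    return result;
}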
I am attempting to find the index of the first zero or negative value of an array using CUDA Thrust. The serial CPU code I am attempting to port to CUDA Thrust is the following:
for (int i = StartIndex; i <= ArrayLimitIndex; i++)
{
if (Array[i] <= 0) { DesiredIndex = i; break; }
}
I am thinking that the easiest way to do this on the GPU will be using the find_if function within the Thrust library.
The array is already on the GPU and I am attempting to search for the index on this array using Thrust as such:
struct less_than_or_eq_zero
{
__host__ __device__
bool operator() (double x)
{
return x <= 0;
}
};
thrust::device_vector<double>::iterator iter;
thrust::device_ptr<double> dev_ptr_Col46 = thrust::device_pointer_cast(dev_Col46);
iter = thrust::find_if(thrust::device, dev_ptr_Col46, dev_ptr_Col46 + size,less_than_or_eq_zero());
Now I would like to use the value of iter as an argument for my next kernel:
newKernel<<<size, 1>>>(*dev_array, iter)
where the newKernel definition is of the form:
__global__ void newKernel(double *dev_array, iter)
{
int x = blockIdx.x;
if(x <= iter)
{
//process data here...
}
}
I know that the code I have here is incorrect and I have a few questions regarding the use of iter. First, iter is a device_vector iterator. Is there any way I can make iter just a single value rather than an iterator? Also, once I have executed the find_if, how can I use the value of iter in my next kernel call?
Any help with this would be greatly appreciated.
Thanks
I'm summarizing the comments by talonmies and Jared Hoberock above, as well as the answer by Sebastian Dressler, in a fully compilable and executable example. The code calculates, with CUDA Thrust, the index of the first element of a vector satisfying a predicate (x <= 0. in this case). I hope it will be helpful for future readers.
#include <thrust/device_vector.h>
#include <stdio.h>
struct less_than_or_eq_zero
{
__host__ __device__ bool operator() (double x) { return x <= 0.; }
};
int main(void)
{
int N = 6;
thrust::device_vector<float> D(N);
D[0] = 3.;
D[1] = 2.3;
D[2] = -1.3;
D[3] = 0.;
D[4] = 3.;
D[5] = -44.;
thrust::device_vector<float>::iterator iter1 = D.begin();
thrust::device_vector<float>::iterator iter2 = thrust::find_if(D.begin(), D.begin() + N, less_than_or_eq_zero());
int d = thrust::distance(iter1, iter2);
printf("Index = %i\n",d);
getchar();
return 0;
}
As you do not use a device_vector in your kernel but a raw array, you have to pass it an index and not an iterator. You can obtain the index by using thrust::distance to calculate the distance between dev_ptr_Col46 and iter.
You'll also want to read the Thrust iterators documentation, where distance is documented.
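For example, a sketch built on your existing names (the int parameter added to the kernel is an assumption, not part of your original signature):
thrust::device_ptr<double> first = thrust::find_if(thrust::device, dev_ptr_Col46,
                                                   dev_ptr_Col46 + size, less_than_or_eq_zero());
int idx = thrust::distance(dev_ptr_Col46, first);  // index of the first element <= 0
newKernel<<<size, 1>>>(dev_array, idx);
with the kernel declared as
__global__ void newKernel(double *dev_array, int idx)
{
    int x = blockIdx.x;
    if(x <= idx)
    {
        //process data here...
    }
}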
Try this:
thrust::device_ptr<double> val_ptr = thrust::find_if(dev_ptr_Col46, dev_ptr_Col46 + size,less_than_or_eq_zero());
double * val = thrust::raw_pointer_cast(val_ptr);
newKernel<<<size, 1>>>(dev_array, val)
Your kernel will have to have signature
__global__ void newKernel(double * dev_array, double * val)
I have a float array that needs to be referenced many times on the device, so I believe the best place to store it is in __constant__ memory (using this reference). The array (or vector) will need to be written once at run-time during initialization, but read by multiple different functions many millions of times, so copying it to the kernel on each function call seems like a bad idea.
const int n = 32;
__constant__ float dev_x[n]; //the array in question
struct struct_max : public thrust::unary_function<float,float> {
float C;
struct_max(float _C) : C(_C) {}
__host__ __device__ float operator()(const float& x) const { return fmax(x,C);}
};
void foo(const thrust::host_vector<float> &, const float &);
int main() {
thrust::host_vector<float> x(n);
//magic happens populate x
cudaMemcpyToSymbol(dev_x,x.data(),n*sizeof(float));
foo(x,0.0);
return(0);
}
void foo(const thrust::host_vector<float> &input_host_x, const float &x0) {
thrust::device_vector<float> dev_sol(n);
thrust::host_vector<float> host_sol(n);
//this method works fine, but the memory transfer is unacceptable
thrust::device_vector<float> input_dev_vec(n);
input_dev_vec = input_host_x; //I want to avoid this
thrust::transform(input_dev_vec.begin(),input_dev_vec.end(),dev_sol.begin(),struct_max(x0));
host_sol = dev_sol; //this memory transfer for debugging
//this method compiles fine, but crashes at runtime
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(dev_x);
thrust::transform(dev_ptr,dev_ptr+n,dev_sol.begin(),struct_max(x0));
host_sol = dev_sol; //this line crashes
}
I tried adding a global thrust::device_vector<float> dev_x(n), but that also crashed at run-time, and it would be in __global__ memory rather than __constant__ memory anyway.
This can all be made to work if I just discard the thrust library, but is there a way to use the thrust library with globals and device constant memory?
Good question! You can't cast a __constant__ array as if it's a regular device pointer.
I will answer your question (after the line below), but first: this is a bad use of __constant__, and it isn't really what you want. The constant cache in CUDA is optimized for uniform access across threads in a warp. That means all threads in the warp access the same location at the same time. If each thread of the warp accesses a different constant memory location, then the accesses get serialized. So your access pattern, where consecutive threads access consecutive memory locations, will be 32 times slower than a uniform access. You should really just use device memory. If you need to write the data once, but read it many times, then just use a device_vector: initialize it once, and then read it many times.
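As a minimal sketch of that recommendation, using your original struct_max functor (keep the vector alive for the lifetime of the computation, e.g. constructed in main and passed to foo by reference rather than at file scope, since a file-scope device_vector has construction/destruction order problems):
// one host-to-device copy at initialization, kept alive for the whole run
thrust::device_vector<float> dev_x_vec = x;   // x is your host_vector
// ...then read it as many times as you like with no further transfers:
thrust::transform(dev_x_vec.begin(), dev_x_vec.end(),
                  dev_sol.begin(), struct_max(x0));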
To do what you asked, you can use a thrust::counting_iterator as the input to thrust::transform to generate a range of indices into your __constant__ array. Then your functor's operator() takes an int index operand rather than a float value operand, and does the lookup into constant memory.
(Note that this means your functor is now __device__ code only. You could easily overload the operator to take a float and call it differently on host data if you need portability.)
I modified your example to initialize the data and print the result to verify that it is correct.
#include <stdio.h>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
const int n = 32;
__constant__ float dev_x[n]; //the array in question
struct struct_max : public thrust::unary_function<float,float> {
float C;
struct_max(float _C) : C(_C) {}
// only works as a device function
__device__ float operator()(const int& i) const {
// use index into constant array
return fmax(dev_x[i],C);
}
};
void foo(const thrust::host_vector<float> &input_host_x, const float &x0) {
thrust::device_vector<float> dev_sol(n);
thrust::host_vector<float> host_sol(n);
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(dev_x);
thrust::transform(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(n),
dev_sol.begin(),
struct_max(x0));
host_sol = dev_sol; // copy the result back to the host (no longer crashes)
for (int i = 0; i < n; i++)
printf("%f\n", host_sol[i]);
}
int main() {
thrust::host_vector<float> x(n);
//magic happens populate x
for (int i = 0; i < n; i++) x[i] = rand() / (float)RAND_MAX;
cudaMemcpyToSymbol(dev_x,x.data(),n*sizeof(float));
foo(x, 0.5);
return(0);
}
This is my first attempt at a CUDA program. Here is what it's supposed to do:
1. Receive a 1D Pixel array from host memory.
2. Each Pixel is processed by one thread: it is thread-safe because only "val" is read and only "newval" is updated. Wait for sync.
3. Each Pixel is processed by one thread: copy "newval" to "val".
4. Write this array back to host memory.
5. Repeat 2-4 for several different frames.
What happens, however, is that only a couple of the roughly 32000 elements in the output arrays have sensible values; the rest are zero.
I've removed the calculations for brevity.
__global__ void kernel(Pixel *array, float dt)
{
const unsigned int tid = threadIdx.x;
Pixel *point = array + tid;
//DO A BUNCH OF CALCULATIONS ON PIXEL KIND OF LIKE THIS
point->newval = point->val + foo;
}
__global__ void copykernel(Pixel *array)
{
const unsigned int tid = threadIdx.x;
Pixel *point = array + tid;
//COPY THE NEWVALS OVER TO THE OLD VALS IN PREPARATION FOR THE NEXT FRAME
point->val = point->newval;
}
extern "C" bool runIt(const int argc, const char **argv, Pixel *inarray, Pixel **outarrays, int arraysize, int numframes, float dt)
{
int memsize = arraysize*sizeof(Pixel);
int i=0;
Pixel *array;
cudaMalloc((void **) &array, memsize);
cudaMemcpy(array, inarray, memsize, cudaMemcpyHostToDevice);
int numthreads = arraysize;
dim3 grid(1,1,1);
dim3 threads(numthreads,1,1);
for(i=0;i<numframes;i++)
{
kernel<<<grid, threads>>>((Pixel *) array, dt);
cudaThreadSynchronize();
copykernel<<<grid, threads>>>((Pixel *) array);
cudaThreadSynchronize();
cudaMemcpy(array, outarrays[i], memsize, cudaMemcpyDeviceToHost);
}
cudaFree(array);
return true;
}
I have a suspicion that I'm setting up the parameters for the device incorrectly, or else I'm getting one of the device-specific keywords wrong or forgetting a crucial step. Does anything jump out at you?
I don't think you can run that many threads per block, and even if you could, it's not a good idea. Try setting the number of threads per block to 256 (16x16 for 2D), then choosing the grid size based on your input size.
dim3 threads(256,1,1);
dim3 grid(arraysize/threads.x,1,1); //Careful of integer division, this is just for example
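To avoid the integer-division pitfall, a common sketch is to round the grid size up and bounds-check in the kernel (this adds an arraysize parameter that your kernels do not currently take):
int threadsPerBlock = 256;
int blocks = (arraysize + threadsPerBlock - 1) / threadsPerBlock;   // round up
kernel<<<blocks, threadsPerBlock>>>(array, dt, arraysize);

__global__ void kernel(Pixel *array, float dt, int arraysize)
{
    const unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
    if(tid >= arraysize) return;   // the last block may be partially full
    Pixel *point = array + tid;
    //...
}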
Also, your second copy is incorrect: the destination and source arguments are swapped. You need to switch array and outarrays[i]:
cudaMemcpy(outarrays[i], array, memsize, cudaMemcpyDeviceToHost);