How to resolve undefined reference errors to threadIdx.x, blockDim.x, and blockIdx.x in CUDA?

I'm a beginner working on a parallel list ranking algorithm in CUDA. When I compile with 'nvcc ParallelListRanking.cu -o ParallelListRanking', I get undefined reference errors to threadIdx.x, blockDim.x, and blockIdx.x, along with the message "ld returned 1 exit status". Adding #include "device_launch_parameters.h" and #include "cuda.h" did not help. Any ideas on how to resolve this?
#include <stdio.h>
#include <device_launch_parameters.h>
#include <cuda.h>

__global__ void set_up(unsigned long int n, unsigned long int *dS, unsigned long long int *dQR){
    unsigned long int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n) {
        dS[i] = i + 1;
        dS[n-1] = 0;
        if (dS[i] != 0) {
            dQR[i] = (dQR[i]%(1ull<<32)) + (1ull<<32) * 1ull;
        }
        else {
            dQR[i] = (dQR[i]%(1ull<<32)) + (1ull<<32) * 0ull;
        }
        dQR[i] = ((unsigned long long int) dS[i])%(1ull<<32) + (1ull<<32)*(dQR[i]/(1ull<<32));
    }
}

__global__ void update(unsigned long int n, unsigned long long int *dQR) {
    unsigned long int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n) {
        unsigned long int tempdQ = (unsigned long int) (dQR[i]%(1ull<<32));
        unsigned long int tempdQQ = (unsigned long int) (dQR[tempdQ]%(1ull<<32));
        if (tempdQ != 0 && tempdQQ != 0) {
            unsigned long long int tmpdQRQ = dQR[tempdQ];
            unsigned long int tempdR = (unsigned long int) dQR[i]/(1ull<<32);
            unsigned long long int tempdRQ = tmpdQRQ/(1ull<<32);
            dQR[i] = ((unsigned long long int) tempdQQ)%(1ull<<32) + (((unsigned long long int) tempdR) + tempdRQ)*(1ull<<32);
            tempdQ = tempdQQ;
            tempdQQ = (unsigned long int) dQR[tempdQ]%(1ull<<32);
        }
    }
}

int main() {
    unsigned long int n = 1000;
    unsigned long int *dS = (unsigned long int *) cudaMalloc((int **)n, sizeof(unsigned long int));
    unsigned long long int *dQR = (unsigned long long int *) cudaMalloc((int **)n, sizeof(unsigned long long int));
    unsigned long int i = threadIdx.x + blockDim.x * blockIdx.x;
    set_up<<< 100, 100 >>>(n, dS, dQR);
    for (int t = 1; t < n; t++) {
        update<<< 100, 100>>>(n, dQR);
    }
    printf("%lu", (unsigned long int) (dQR[i]%(1ull<<32)));
    printf("\n");
    printf("%lu", (unsigned long int) (dQR[i]/(1ull<<32)));
    cudaFree(dQR);
    cudaFree(dS);
    return 0;
}

You can't do this:
unsigned long int i = threadIdx.x + blockDim.x * blockIdx.x;
in host code (i.e. in main()).
Those variables are automatically defined in device code only (the routines decorated with __global__, for example).
The remainder of your host code from that point should not depend on variables like threadIdx.x because they are not defined in host code and make no sense to use there.
So just delete that line entirely (from main()) and come up with another method to use an index like that in your host code.
The only place you seem to use it is in your printf statements, so I guess if you are trying to print out all the variables you will need a loop in host code:
for (int i = 0; i < n; i++){
    printf("%lu", (unsigned long int) (dQR[i]%(1ull<<32)));
    printf("\n");
    printf("%lu", (unsigned long int) (dQR[i]/(1ull<<32)));
}
That is the proximate cause of the error you are reporting; however, your code has other issues.
This is not correct usage of cudaMalloc:
unsigned long int *dS =(unsigned long int *) cudaMalloc((int **)n, sizeof(unsigned long int));
you want to do something like this:
cudaMallocManaged(&dS, sizeof(unsigned long int)*n);
and likewise for dQR. (I'm switching from cudaMalloc to cudaMallocManaged here for a reason related to your attempt to print those variables from host code.)
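Putting those pieces together, a corrected main() might look something like the sketch below (untested; note the cudaDeviceSynchronize() call, which is needed before the host touches managed memory that the kernels have written):

int main() {
    unsigned long int n = 1000;
    unsigned long int *dS;
    unsigned long long int *dQR;
    // managed allocations are accessible from both device and host code
    cudaMallocManaged(&dS, sizeof(unsigned long int)*n);
    cudaMallocManaged(&dQR, sizeof(unsigned long long int)*n);
    set_up<<<100, 100>>>(n, dS, dQR);
    for (int t = 1; t < n; t++) {
        update<<<100, 100>>>(n, dQR);
    }
    // wait for the kernels to finish before reading the managed memory on the host
    cudaDeviceSynchronize();
    for (unsigned long int i = 0; i < n; i++) {
        printf("%lu\n", (unsigned long int) (dQR[i]%(1ull<<32)));
        printf("%lu\n", (unsigned long int) (dQR[i]/(1ull<<32)));
    }
    cudaFree(dQR);
    cudaFree(dS);
    return 0;
}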
There are likely other problems with your code. Before attempting anything like this I would suggest you have a firm grasp of how to write a proper, simple vector add application in CUDA.
To get a basic grasp of CUDA you might want to study a simple sample code like vectorAdd and perhaps study one of the introductory blogs.
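For reference, a minimal vector add along those lines might look like this (my own sketch using managed memory, not the exact vectorAdd sample code):

#include <cstdio>

__global__ void vectorAdd(const float *a, const float *b, float *c, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}

int main() {
    const int n = 1 << 20;
    float *a, *b, *c;
    cudaMallocManaged(&a, n*sizeof(float));
    cudaMallocManaged(&b, n*sizeof(float));
    cudaMallocManaged(&c, n*sizeof(float));
    for (int i = 0; i < n; i++) { a[i] = 1.0f; b[i] = 2.0f; }
    int threads = 256;
    int blocks = (n + threads - 1)/threads;   // enough blocks to cover all elements
    vectorAdd<<<blocks, threads>>>(a, b, c, n);
    cudaDeviceSynchronize();
    printf("c[0] = %f\n", c[0]);              // expect 3.0
    cudaFree(a); cudaFree(b); cudaFree(c);
    return 0;
}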

Related

Pack two types into one in CUDA

I'd like to pack two data types, specifically an int and a float into an unsigned long long int in CUDA.
I wrote something like this but I get compilation errors when unpacking:
__global__ void kernel() {
    // pack
    float positive_num = 5.1034786f;
    int index = 1024;
    unsigned long long int u_int_val = (unsigned long long int) positive_num << 32;
    u_int_val += index & 0xffff;
    // unpack
    positive_num = (float) u_int_val >> 32 ;
    index = u_int_val & 0xffff ;
    // check
    printf("positive_num: %f - index %i\n", positive_num, index);
}
The error I get:
error: expression must have integral or unscoped enum type // error line - positive_num = (float) ...
I have tried multiple approaches, even casting the memory address; either I get compilation errors or I simply don't get a precise round-trip conversion.
If it helps, the only assumption I can make is that both numbers are positive, hence float positive_num > 0.f; int index > 0;
The reason I need to pack two numbers into one is to embed everything (a float and an int) in a single atomic operation, to find the minimum of both, for instance.
If the need for heterogeneous packing and unpacking exists only in device code, one can use CUDA's device function intrinsics __float_as_int() and __int_as_float() to re-interpret a 32-bit float into a 32-bit int and vice versa. The packing of integers is unproblematic, simply shift the desired high-order part and OR the parts together.
For the same functionality in code that needs to work on both host and device, the canonical C++ way of re-interpreting floating-point data as integer data and vice versa is to use memcpy(); CUDA is a C++ derivative. This may or may not be as efficient as using the device intrinsics, which have no cost, as the 32-bit registers of the GPU can be used for both integer and floating-point data. It may be worthwhile to inspect the generated machine code (SASS) with cuobjdump --dump-sass.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEVICE_CODE_ONLY (0)

#if DEVICE_CODE_ONLY
__device__ unsigned long long int pack_float_int (float a, int b)
{
    return (((unsigned long long int)(unsigned int)__float_as_int (a)) << 32) |
           (unsigned long long int)(unsigned int)b;
}

__device__ void unpack_float_int (unsigned long long int a, float *f, int *i)
{
    *f = __int_as_float ((int)(unsigned int)(a >> 32));
    *i = (int)(unsigned int)a;
}
#else // DEVICE_CODE_ONLY
__host__ __device__ unsigned long long int pack_float_int (float a, int b)
{
    unsigned int t;
    memcpy (&t, &a, sizeof t);
    return ((unsigned long long int)t << 32) |
           (unsigned long long int)(unsigned int)b;
}

__host__ __device__ void unpack_float_int (unsigned long long int a, float *f, int *i)
{
    unsigned int t = (unsigned int)(a >> 32);
    memcpy (f, &t, sizeof (*f));
    *i = (int)(unsigned int)a;
}
#endif // DEVICE_CODE_ONLY

__global__ void kernel (float f, int i)
{
    unsigned long long int p;
    float uf;
    int ui;
    p = pack_float_int (f, i);
    printf ("GPU: packed (%15.8e, %d) into %016llx\n", f, i, p);
    unpack_float_int (p, &uf, &ui);
    printf ("GPU: unpacked %016llx into (%15.8e %d)\n", p, uf, ui);
}

int main (void)
{
    float f = 5.1034786f;
    int i = 1024;
    printf ("testing packing/unpacking: %s\n",
            DEVICE_CODE_ONLY ? "on device" : "on device and host");
    kernel<<<1,1>>> (f, i);
    cudaDeviceSynchronize();
#if !DEVICE_CODE_ONLY
    unsigned long long int p;
    float uf;
    int ui;
    p = pack_float_int (f, i);
    printf ("CPU: packed (%15.8e, %d) into %016llx\n", f, i, p);
    unpack_float_int (p, &uf, &ui);
    printf ("CPU: unpacked %016llx into (%15.8e %d)\n", p, uf, ui);
#endif // DEVICE_CODE_ONLY
    return EXIT_SUCCESS;
}
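As mentioned above, the generated SASS can be inspected with cuobjdump after compiling; for example (the file name pack.cu is just a placeholder for the code above):

$ nvcc -o pack pack.cu
$ cuobjdump --dump-sass pack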
Something like this should work:
__device__ unsigned long long pack(int a, float b){
    return (((unsigned long long)(*(reinterpret_cast<unsigned *>(&a))))<<32) + *(reinterpret_cast<unsigned *>(&b));
}
__device__ void unpack(unsigned long long val, int &a, float &b){
    unsigned mb = (unsigned)val;
    b = *(reinterpret_cast<float *>(&mb));
    unsigned ma = (unsigned)(val >> 32);
    a = *(reinterpret_cast<int *>(&ma));
}
(This is not really unique to CUDA. This is just C++ code, apart from the __device__ decorators.)
Example:
$ cat t2169.cu
#include <cstdio>

__device__ unsigned long long pack(int a, float b){
    return (((unsigned long long)(*(reinterpret_cast<unsigned *>(&a))))<<32) + *(reinterpret_cast<unsigned *>(&b));
}
__device__ void unpack(unsigned long long val, int &a, float &b){
    unsigned mb = (unsigned)val;
    b = *(reinterpret_cast<float *>(&mb));
    unsigned ma = (unsigned)(val >> 32);
    a = *(reinterpret_cast<int *>(&ma));
}
__global__ void k(int a, float b){
    unsigned long long val = pack(a,b);
    int ma;
    float mb;
    unpack(val, ma, mb);
    printf("a = %d, b = %f\n", ma, mb);
}
int main(){
    k<<<1,1>>>(-2, -1.3f);
    cudaDeviceSynchronize();
}
$ nvcc -o t2169 t2169.cu
$ compute-sanitizer ./t2169
========= COMPUTE-SANITIZER
a = -2, b = -1.300000
========= ERROR SUMMARY: 0 errors
$
That packs the int quantity in the upper 32-bits and the float quantity in the lower 32-bits.
If you want to reverse the storage order, you can just reverse the usage of a and b like this:
__device__ unsigned long long pack(int a, float b){
    return (((unsigned long long)(*(reinterpret_cast<unsigned *>(&b))))<<32) + *(reinterpret_cast<unsigned *>(&a));
}
__device__ void unpack(unsigned long long val, int &a, float &b){
    unsigned ma = (unsigned)val;
    a = *(reinterpret_cast<int *>(&ma));
    unsigned mb = (unsigned)(val >> 32);
    b = *(reinterpret_cast<float *>(&mb));
}

CUDA, how to find the first item in an array that makes a function maximal

In CUDA C++, I have a big array Arr of integers, and a function F: int -> int. I want to find the first index of an item in Arr that makes F maximal.
How can I write a kernel that always keeps the maximal value of F to compare with others, using atomics to avoid race conditions?
BTW, I wonder if I can use functions from the Thrust library for this purpose instead.
How can I write a kernel that always keeps the maximal value of F to compare with others, using atomics to avoid race conditions?
Based on your description, including usage of int, and a desire to use atomics, I would suggest using a custom atomic. This should work for arrays up to 4 billion elements:
$ cat t2154.cu
#include <iostream>

__device__ __host__ int get_int(unsigned long long val){return reinterpret_cast<int *>(&val)[0];}
__device__ __host__ unsigned get_uns(unsigned long long val){return reinterpret_cast<unsigned *>(&val)[1];}
__device__ bool done(int fval, int fval1, unsigned idx, unsigned idx1){
    if (fval > fval1) return true;
    if ((fval == fval1) && (idx <= idx1)) return true;
    return false;
}
__device__ unsigned long long my_custom_atomic(unsigned long long *addr, int fval, unsigned idx){
    unsigned long long old = *addr;
    while (!done(get_int(old),fval, get_uns(old), idx))
        old = atomicCAS(addr, old, ((((unsigned long long)idx)<<32)|fval));
    return old;
}
const int minidx = 256;
__device__ int f(int t){ return minidx - (t-minidx)*(t-minidx);}
__global__ void k(int *arr, unsigned long long *min, unsigned N){
    unsigned my_idx = blockIdx.x*blockDim.x+threadIdx.x;
    if (my_idx < N){
        int my_val = arr[my_idx];
        my_val = f(my_val);
        my_custom_atomic(min, my_val, my_idx);
    }
}
const unsigned my_N = 32768;
int main(){
    unsigned long long *min;
    cudaMallocManaged(&min, sizeof(min[0]));
    int *arr;
    cudaMallocManaged(&arr, sizeof(arr[0])*my_N);
    for (unsigned i = 0; i < my_N; i++) arr[i] = i;
    *min = 0xFFFFFFFF80000000ULL; // maximum unsigned index combined with minimum int value
    k<<<my_N/256, 256>>>(arr, min, my_N);
    cudaDeviceSynchronize();
    std::cout << " maximum val: " << get_int(*min) << " at index: " << get_uns(*min) << std::endl;
}
$ nvcc -o t2154 t2154.cu
$ compute-sanitizer ./t2154
========= COMPUTE-SANITIZER
maximum val: 256 at index: 256
========= ERROR SUMMARY: 0 errors
$
We are assembling and disassembling the 64-bit quantity as needed, and using the general method outlined in the programming guide for arbitrary atomics.
I wonder if I can use functions from the Thrust library for this purpose instead.
Yes, you can do this with a transform and a reduce operation in thrust. In fact thrust can combine these into a single algorithm call. Here is an example:
$ cat t2155.cu
#include <iostream>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <limits>

const int minidx = 256;
const size_t my_N = 32768;
struct f
{
    template <typename T>
    __host__ __device__ T operator()(T t) {
        T result = t;
        thrust::get<0>(result) = minidx - (thrust::get<0>(t) - minidx)*(thrust::get<0>(t) - minidx);
        return result;
    }
};
struct r
{
    template <typename T1, typename T2>
    __host__ __device__ T1 operator()(T1 &t1, T2 &t2){
        if (thrust::get<0>(t1) > thrust::get<0>(t2)) return t1;
        if (thrust::get<0>(t1) < thrust::get<0>(t2)) return t2;
        if (thrust::get<1>(t1) < thrust::get<1>(t2)) return t1;
        return t2;
    }
};
int main(){
    thrust::device_vector<int> arr(my_N);
    thrust::sequence(arr.begin(), arr.end());
    auto my_zip = thrust::make_zip_iterator(thrust::make_tuple(arr.begin(), thrust::counting_iterator<size_t>(0)));
    auto init = thrust::make_tuple(std::numeric_limits<int>::min(), std::numeric_limits<size_t>::max());
    auto result = thrust::transform_reduce(my_zip, my_zip+my_N, f(), init, r());
    std::cout << " maximum val: " << thrust::get<0>(result) << " at index: " << thrust::get<1>(result) << std::endl;
}
$ nvcc -o t2155 t2155.cu
$ compute-sanitizer ./t2155
========= COMPUTE-SANITIZER
maximum val: 256 at index: 256
========= ERROR SUMMARY: 0 errors
$
Notes:
Comparing those 2 implementations/examples, I know which one I would choose. The atomic method is brittle, type-limited, and size-limited. The thrust example could be adapted to handle types more flexibly (e.g. a function that returns a 64-bit type) and could be extended to handle beyond 4 billion elements.
The code here is just intended to be a possible roadmap. It's not thoroughly tested; bugs are always possible.
There is a strong correspondence between the two methods. The main() routines have almost a 1:1 correspondence, which hopefully you can identify. Furthermore the r() functor corresponds to the done() function, and the f() functor corresponds to the f() function.
Don't assume that you can readily/trivially increase my atomic example to 4 billion elements. The f() function I wrote would overflow/underflow an int variable. But with an appropriate data array and f() function, it should be possible to use up to 4 billion elements.
EDIT: As suggested in the comments below, we may be able to do a better job in the atomic case, by doing a threadblock level shared sweep reduction, followed by a single atomic per threadblock. Here is an example of that:
#include <iostream>

const int nTPB=512;
const unsigned long long initval = 0xFFFFFFFF80000000ULL; // maximum index and minimum int
__device__ __host__ int get_int(unsigned long long val){return reinterpret_cast<int *>(&val)[0];}
__device__ __host__ unsigned get_uns(unsigned long long val){return reinterpret_cast<unsigned *>(&val)[1];}
__device__ bool done(int fval, int fval1, unsigned idx, unsigned idx1){
    if (fval > fval1) return true;
    if ((fval == fval1) && (idx <= idx1)) return true;
    return false;
}
__device__ unsigned long long my_custom_atomic(unsigned long long *addr, int fval, unsigned idx){
    unsigned long long old = *addr;
    while (!done(get_int(old),fval, get_uns(old), idx))
        old = atomicCAS(addr, old, ((((unsigned long long)idx)<<32)|fval));
    return old;
}
const int minidx = 256;
__device__ int f(int t){ return minidx - (t-minidx)*(t-minidx);}
__device__ unsigned long long my_reduce(unsigned long long t1, unsigned long long t2){
    if (done(get_int(t1), get_int(t2), get_uns(t1), get_uns(t2))) return t1;
    return t2;
}
__global__ void k(int *arr, unsigned long long *min, unsigned N){
    __shared__ unsigned long long smem[nTPB];
    smem[threadIdx.x] = initval;
    for (unsigned int idx = blockIdx.x*blockDim.x+threadIdx.x; idx < N; idx+=gridDim.x*blockDim.x)
        smem[threadIdx.x] = my_reduce(smem[threadIdx.x], (((unsigned long long)idx)<<32)|f(arr[idx]));
    for (int t = nTPB>>1; t > 0; t>>=1){
        __syncthreads();
        if (threadIdx.x < t) smem[threadIdx.x] = my_reduce(smem[threadIdx.x], smem[threadIdx.x+t]);
    }
    if (!threadIdx.x) my_custom_atomic(min, get_int(smem[0]), get_uns(smem[0]));
}
const unsigned my_N = 32768;
int main(){
    unsigned long long *min;
    cudaMallocManaged(&min, sizeof(min[0]));
    int *arr;
    cudaMallocManaged(&arr, sizeof(arr[0])*my_N);
    for (unsigned i = 0; i < my_N; i++) arr[i] = i;
    arr[1024] = minidx;
    *min = initval;
    k<<<(my_N+nTPB-1)/nTPB, nTPB>>>(arr, min, my_N);
    cudaDeviceSynchronize();
    std::cout << " minimum val: " << get_int(*min) << " at index: " << get_uns(*min) << std::endl;
}

How to do atomic operation on structures inside CUDA kernel?

I have the piece of code below where I'm trying to increment the structure pointer SL. How can I perform that increment atomically and avoid the race condition? I don't care about the parallelization achieved in this case.
__global__ void insertKernel(struct SlabList* head_ref, int* new_key, int* new_val, int size, struct SlabList* SL, struct SlabList* temp){
    int id = blockIdx.x*blockDim.x + threadIdx.x;
    if(id<size/SLAB_SIZE){
        head_ref=NULL;
        struct SlabList* new_node = (struct SlabList*) malloc(sizeof(struct SlabList));
        for(int j=0;j<SLAB_SIZE;j++){
            new_node->key[j] = new_key[id*SLAB_SIZE+j];
            new_node->val[j]= new_val[id*SLAB_SIZE+j];
        }
        new_node->next = head_ref;
        memcpy(SL,new_node, size * sizeof(struct SlabList));
        head_ref = new_node;
        SL++; // How to perform this atomically?
    }
}
I looked into atomicInc and atomicAdd APIs of CUDA but was not able to proceed as they take different parameters.
By my reckoning, there are two operations which would only work correctly if performed atomically (without changes to the structure of the code): the increment of SL which you highlight, and the exchange of the head_ref pointer value as the tree expands.
If (and only if) you are using a 64-bit operating system, then something like this might work:
__global__ void insertKernel(struct SlabList* head_ref, int* new_key,
                             int* new_val, int size, struct SlabList* SL, struct SlabList* temp)
{
    int id = blockIdx.x*blockDim.x + threadIdx.x;
    if(id<size/SLAB_SIZE){
        struct SlabList* new_node = (struct SlabList*)malloc(sizeof(struct SlabList));
        SlabList* SLnew = (SlabList *)atomicAdd((unsigned long long *)&SL,
                                                sizeof(struct SlabList));
        SlabList* oldhead = (SlabList *)atomicExch((unsigned long long *)&head_ref,
                                                   (unsigned long long)new_node);
        for(int j=0;j<SLAB_SIZE;j++){
            new_node->key[j] = new_key[id*SLAB_SIZE+j];
            new_node->val[j] = new_val[id*SLAB_SIZE+j];
        }
        new_node->next = oldhead;
        memcpy(SLnew, new_node, sizeof(struct SlabList));
    }
}
[Note: never compiled or run, let alone tested. Use at own risk]

Atomic Operation failed in CUDA

As the compute capability is 2.1, the atomicAdd and atomicMax operations do not support double precision, so I defined both functions based on some answers on Stack Overflow.
It is strange that the atomicAdd function works well but the atomicMax doesn't; here is my code.
The test generates random numbers in each block, then sums the random numbers within each block to get a block sum; I want to test atomicAdd and atomicMax on the block sums.
#include <iostream>
#include <curand.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>

#define num_of_blocks 2
#define threads_per_block 2
#define tot_threads 4

__device__ double gsum[num_of_blocks];
__device__ double dev_sum;
__device__ double dev_max;

// set seed for random number generator
__global__ void initcuRand(curandState* globalState, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &globalState[idx]);
}

// atomicMax for double
__device__ double atomicMax_d(double* address, double val)
{
    unsigned long long int* address_as_i = (unsigned long long int*)address;
    unsigned long long int old = *address_as_i, assumed;
    do {
        assumed = old;
        old = ::atomicCAS(address_as_i, assumed, __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
    } while (assumed != old);
    return __longlong_as_double(old);
}

// atomicAdd for double
__device__ double atomicAdd_d(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do{
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
    }while(assumed != old);
    return __longlong_as_double(old);
}

__global__ void kernel(curandState *globalState){
    // global id
    int gidx = threadIdx.x + blockIdx.x * blockDim.x;
    // local id
    int lidx = threadIdx.x;
    // create shared memory to store seeds
    __shared__ curandState localState[tot_threads];
    __shared__ double srandnum[threads_per_block];
    // copy global seed to local
    localState[lidx] = globalState[gidx];
    // synchronize the local threads writing to the local memory cache
    __syncthreads();
    // generate random number from normal distribution in shared memory
    srandnum[lidx] = curand_normal(&localState[lidx]);
    __syncthreads();
    if(lidx == 0){srandnum[lidx] += srandnum[lidx + 1];} // sum of each block
    if(lidx == 0){gsum[blockIdx.x] = srandnum[lidx];}    // copy the sums back to global memory
    __threadfence();
    if( gidx < num_of_blocks){
        atomicAdd_d(&dev_sum, gsum[gidx]);
    }
    if( gidx < num_of_blocks){
        atomicMax_d(&dev_max, gsum[gidx]);
    }
    if( gidx == 0){
        printf("Sum is: %lf\n", dev_sum);
    }
    if( gidx == 1){
        printf("Max is: %lf\n", dev_max);
    }
}

int main(){
    // set seed on device
    curandState *globalState;
    cudaMalloc((void**)&globalState, tot_threads*sizeof(curandState));
    initcuRand<<<num_of_blocks, threads_per_block>>>(globalState, 1);
    // launch kernel
    kernel<<<num_of_blocks, threads_per_block>>>(globalState);
    double randnum[num_of_blocks];
    cudaMemcpyFromSymbol(randnum, gsum, num_of_blocks*sizeof(double), 0, cudaMemcpyDeviceToHost);
    std::cout << "Sum of each block:\n";
    for (int i = 0; i < num_of_blocks; ++i){
        std::cout << randnum[i] << std::endl;
    }
    cudaFree(globalState);
    return 0;
}
The result I get is
Sum is: -0.898329
Max is: 0.000000
Sum of each block:
-0.0152994
-0.88303
From the result, I know that the atomicAdd function works but the atomicMax function doesn't; I have no idea why. Thanks in advance.
You don't ever initialize dev_max or dev_sum. You can't sensibly do these types of atomic operations on them if they don't start with a known value.
Try something like this instead:
__device__ double dev_sum = 0.0;
__device__ double dev_max = -1e99;
and I think you'll be happier with the results.
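Alternatively (my own sketch, not part of the original answer), you could keep the uninitialized declarations and set the symbols from host code before the kernel launch, for example with cudaMemcpyToSymbol:

double h_sum = 0.0;
double h_max = -1e99;
// initialize the __device__ symbols before launching the kernel
cudaMemcpyToSymbol(dev_sum, &h_sum, sizeof(double));
cudaMemcpyToSymbol(dev_max, &h_max, sizeof(double));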

Optimizing a Very Simple Image Processing Kernel

I was hoping someone could give me a hand here. I've been getting my feet wet in CUDA, and wrote a simple kernel to negate an image. It works brilliantly and I'm pretty happy with it.
I guess my rather stupid question is... is there any way I could optimize this kernel? I tried to use shared memory; however, the number of pixels is 19224000.
I tried to just do __shared__ int sharedMem[19224000], which simply didn't run. I'm a little lost here, as a CUDA programmer could probably tell.
Here is my kernel:
__global__ void cuda_negate_image(int * new_array, int * old_array, int rows, int cols){
    int tIdx = threadIdx.x;
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int n = rows * cols;
    if (i < n)
        new_array[i] = -(old_array[i]) + 255;
}
Any help would be awesome!
There isn't much scope for optimisation here. For simple, memory-bound operations, the four golden rules are usually:
Coalesce memory reads and writes
Maximise byte per memory transaction when using coalesced memory access
Use the appropriate compiler heuristics to ensure that emitted code is optimal
Amortise thread scheduling and setup overhead by having each thread process multiple inputs, where practical. (Note this requires a different approach to execution grid parameter selection, i.e. size for the utilisation of your device, rather than the total amount of available work)
Apply those principles to your kernel and I get something like this:
__device__ __forceinline__ void negate(int &in, int &out)
{
    out = 255 - in;
}

__device__ __forceinline__ void negate(int2 &in, int2 & out)
{
    negate(in.x, out.x);
    negate(in.y, out.y);
}

__device__ __forceinline__ void negate(int4 &in, int4 & out)
{
    negate(in.x, out.x);
    negate(in.y, out.y);
    negate(in.z, out.z);
    negate(in.w, out.w);
}

template<typename T>
__global__ void cuda_negate_image(T * __restrict__ new_array, T * __restrict__ old_array, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    T oldval, newval;
    for(; i < n; i += stride) {
        oldval = old_array[i];
        negate(oldval, newval);
        new_array[i] = newval;
    }
}

template __global__ void cuda_negate_image<int>(int * __restrict__ new_array, int * __restrict__ old_array, int n);
template __global__ void cuda_negate_image<int2>(int2 * __restrict__ new_array, int2 * __restrict__ old_array, int n);
template __global__ void cuda_negate_image<int4>(int4 * __restrict__ new_array, int4 * __restrict__ old_array, int n);
Only benchmarking on your target hardware will tell you which version of the code is the fastest and whether this is even worth bothering with.
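For illustration only, a launch of the int4 instantiation might look something like the sketch below (my own example, not from the answer; it assumes the pixel count n is divisible by 4 and that the buffers come from cudaMalloc, so they are suitably aligned for int4):

void negate_image(int *new_array, int *old_array, int n)
{
    // size the grid for the device rather than for the total amount of work
    int device = 0, num_sms = 0;
    cudaGetDevice(&device);
    cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device);
    const int threads = 256;
    const int blocks = num_sms * 8;   // heuristic; tune by benchmarking
    // each int4 load/store covers four pixels, so pass n/4 elements
    cuda_negate_image<int4><<<blocks, threads>>>(
        reinterpret_cast<int4 *>(new_array),
        reinterpret_cast<int4 *>(old_array),
        n / 4);
    cudaDeviceSynchronize();
}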