I was hoping someone could give me a hand here. I've been getting my feet wet in CUDA, and wrote a simple kernel to negate an image. It works brilliantly and I'm pretty happy with it.
I guess my rather stupid question is... is there anyway I could optimize this kernel? I tried to use shared memory, however the number of pixels is 19224000.
I tried to just do __shared__ int sharedMem[19224000], which simply didn't run. I'm a little lost here, as a CUDA programmer could probably tell.
Here is my kernel:
__global__ void cuda_negate_image(int * new_array, int * old_array, int rows, int cols){
int tIdx = threadIdx.x;
int i = blockDim.x * blockIdx.x + threadIdx.x;
int n = rows * cols;
if (i < n)
new_array[i] = -(old_array[i]) + 255;
}
Any help would be awesome!
There isn't much scope for optimisation here. For simple, memory bound operations the four golden rules are usually:
Coalesce memory reads and writes
Maximise byte per memory transaction when using coalesced memory access
Use the appropriate compiler heuristics to ensure that emitted code is optimal
Amortise thread scheduling and setup overhead by having each thread process multiple inputs, where practical. (Note this requires a different approach to execution grid parameter selection, i.e. size for the utilisation of your device, rather than the total amount of available work)
Apply those principles to your kernel and I get something like this:
__device__ __forceinline__ void negate(int &in, int &out)
{
out = 255 - in;
}
__device__ __forceinline__ void negate(int2 &in, int2 & out)
{
negate(in.x, out.x);
negate(in.y, out.y);
}
__device__ __forceinline__ void negate(int4 &in, int4 & out)
{
negate(in.x, out.x);
negate(in.y, out.y);
negate(in.z, out.z);
negate(in.w, out.w);
}
template<typename T>
__global__ void cuda_negate_image(T * __restrict__ new_array, T * __restrict__ old_array, int n)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
T oldval, newval;
for(; i < n; i += stride) {
oldval = old_array[i];
negate(oldval, newval);
new_array[i] = newval;
}
}
template __global__ void cuda_negate_image<int>(int * __restrict__ new_array, int * __restrict__ old_array, int n);
template __global__ void cuda_negate_image<int2>(int2 * __restrict__ new_array, int2 * __restrict__ old_array, int n);
template __global__ void cuda_negate_image<int4>(int4 * __restrict__ new_array, int4 * __restrict__ old_array, int n);
Only benchmarking on your target hardware will tell you which version of the code is the fastest and whether this is even worth bothering with.
Related
I have a kernel that operates on complex numbers, and I am loading the values like this:
thrust::complex<float> x = X[tIdx];
where X is in global memory. When I profile this kernel with nvvp, I find that it is memory bandwidth-limited and the profiler suggests that I improve the memory access pattern:
Global Load L2 Transactions/Access=8, Ideal Transactions/Access=4
The disassembly confirms that this line is indeed split into two 32-bit loads, producing a strided access pattern:
LDG.E R9, [R16];
LDG.E R11, [R16+0x4];
How can I get this to compile into a single 64-bit load?
Potential solutions
I realize this is pretty closely related to this earlier question but the proposed solutions (change the global memory layout or use shared memory) seem less ideal than a 64-bit load.
The NVidia developer blog suggests reinterpret_cast to a vector data type such as float2, but I'm a little hazy about how this fits in with pointer aliasing rules.
I must also confess that this is somewhat of a theoretical question. For this particular kernel, I'm limited by the device memory bandwidth, so halving the # of L2 transactions shouldn't significantly improve the overall performance. But I anticipate working with more complex numbers in my future, and if there's a simple solution then I'd like to start using it now.
The basic problem here is that the compiler seems to need explicit alignment specifications for a type before it will generate vector load and store instructions. Consider the following trivial example:
class __align__(8) cplx0
{
public:
__device__ __host__ cplx0(float _re, float _img) : re(_re), img(_img) {};
float re, img;
};
class cplx1
{
public:
__device__ __host__ cplx1(float _re, float _img) : re(_re), img(_img) {};
float re, img;
};
template<typename T>
__global__ void memsetkernel(T* out, const T val, int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll 8
for(; tid < N; tid += stride) out[tid] = val;
}
template<typename T>
__global__ void memcpykernel(const T* __restrict__ in, T* __restrict__ out, int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll 8
for(; tid < N; tid += stride) out[tid] = in[tid];
}
template<typename T>
void memcpy(const T* in, T* out, int Nitems)
{
int nthreads = 1024;
int nblocks = 13 * 2; // GTX 970 with 13 SM
memcpykernel<T><<<nblocks, nthreads>>>(in, out, Nitems);
cudaDeviceSynchronize();
}
template<typename T>
void memset(T* in, const T value, int Nitems)
{
int nthreads = 1024;
int nblocks = 13 * 2; // GTX 970 with 13 SM
memsetkernel<T><<<nblocks, nthreads>>>(in, value, Nitems);
cudaDeviceSynchronize();
}
int main(void)
{
const int Nitems = 1 << 24;
typedef cplx0 fcomplex0;
typedef cplx1 fcomplex1;
{
fcomplex0* in;
fcomplex0* out;
cudaMalloc((void **)&in, Nitems * sizeof(fcomplex0));
cudaMalloc((void **)&out, Nitems * sizeof(fcomplex1));
for(int i=0; i<10; i++) {
memset<fcomplex0>(in, fcomplex0(1.0f,1.0f), Nitems);
memcpy<fcomplex0>(in, out, Nitems);
}
cudaFree(in);
cudaFree(out);
}
{
fcomplex1* in;
fcomplex1* out;
cudaMalloc((void **)&in, Nitems * sizeof(fcomplex1));
cudaMalloc((void **)&out, Nitems * sizeof(fcomplex1));
for(int i=0; i<10; i++) {
memset<fcomplex1>(in, fcomplex1(1.0f,1.0f), Nitems);
memcpy<fcomplex1>(in, out, Nitems);
cudaDeviceSynchronize();
}
cudaFree(in);
cudaFree(out);
}
cudaDeviceReset();
return 0;
}
Here we has two home-baked complex types, one with explicit alignment specifications, and one without. Otherwise they are identical. Putting them through a naïve mempcy and memset kernels in this test harness allows us to inspect the code generation behaviour of the toolchain for each type and benchmark the performance.
Firstly, the code. For cplx0 class, which has explicit 8-byte alignment, the compiler emits vectorized loads and stores in both kernels:
memcpykernel
ld.global.nc.v2.f32 {%f5, %f6}, [%rd17];
st.global.v2.f32 [%rd18], {%f5, %f6};
memsetkernel
st.global.v2.f32 [%rd11], {%f1, %f2};
whereas for the cplx1 case, it does not:
memcpykernel
ld.global.nc.f32 %f1, [%rd16];
ld.global.nc.f32 %f2, [%rd16+4];
st.global.f32 [%rd15+4], %f2;
st.global.f32 [%rd15], %f1;
memsetkernel
st.global.f32 [%rd11+4], %f2;
st.global.f32 [%rd11], %f1;
Looking at performance, there is a non-trivial difference in performance for the memset case (CUDA 8 release toolkit, GTX 970 with Linux 367.48 driver):
$ nvprof ./complex_types
==29074== NVPROF is profiling process 29074, command: ./complex_types
==29074== Profiling application: ./complex_types
==29074== Profiling result:
Time(%) Time Calls Avg Min Max Name
33.04% 19.264ms 10 1.9264ms 1.9238ms 1.9303ms void memcpykernel<cplx1>(cplx1 const *, cplx1*, int)
32.72% 19.080ms 10 1.9080ms 1.9055ms 1.9106ms void memcpykernel<cplx0>(cplx0 const *, cplx0*, int)
19.15% 11.165ms 10 1.1165ms 1.1120ms 1.1217ms void memsetkernel<cplx1>(cplx1*, cplx1, int)
15.09% 8.7985ms 10 879.85us 877.67us 884.13us void memsetkernel<cplx0>(cplx0*, cplx0, int)
The Thrust templated complex type does not have an explicit alignment definition (although it potentially could via specialization, although that would somewhat defeat the purpose). So your only choice here is to either make your own version of the Thrust type with explicit alignment, or use another complex type which does (like the cuComplex type which CUBLAS and CUFFT use).
I want to add 128-bit vectors with carry. My 128-bit version (addKernel128 in the code below) is twice slower than the basic 32-bit version (addKernel32 below).
Do I have memory coalescing problems ? How can I get better performance ?
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#define UADDO(c, a, b) asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
#define UADDC(c, a, b) asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
__global__ void addKernel32(unsigned int *c, const unsigned int *a, const unsigned int *b, const int size)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
while (tid < size)
{
c[tid] = a[tid] + b[tid];
tid += blockDim.x * gridDim.x;
}
}
__global__ void addKernel128(unsigned *c, const unsigned *a, const unsigned *b, const int size)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
while (tid < size / 4)
{
uint4 a4 = ((const uint4 *)a)[tid],
b4 = ((const uint4 *)b)[tid],
c4;
UADDO(c4.x, a4.x, b4.x)
UADDC(c4.y, a4.y, b4.y) // add with carry
UADDC(c4.z, a4.z, b4.z) // add with carry
UADDC(c4.w, a4.w, b4.w) // add with carry (no overflow checking for clarity)
((uint4 *)c)[tid] = c4;
tid += blockDim.x * gridDim.x;
}
}
int main()
{
const int size = 10000000; // 10 million
unsigned int *d_a, *d_b, *d_c;
cudaMalloc((void**)&d_a, size * sizeof(int));
cudaMalloc((void**)&d_b, size * sizeof(int));
cudaMalloc((void**)&d_c, size * sizeof(int));
cudaMemset(d_a, 1, size * sizeof(int)); // dummy init just for the example
cudaMemset(d_b, 2, size * sizeof(int)); // dummy init just for the example
cudaMemset(d_c, 0, size * sizeof(int));
int nbThreads = 512;
int nbBlocks = 1024; // for example
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
addKernel128<<<nbBlocks, nbThreads>>>(d_c, d_a, d_b, size);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float m = 0;
cudaEventElapsedTime(&m, start, stop);
cudaFree(d_c);
cudaFree(d_b);
cudaFree(d_a);
cudaDeviceReset();
printf("Elapsed = %g\n", m);
return 0;
}
Timing CUDA code on a WDDM GPU can be quite difficult for a variety of reasons. Most of these revolve around the fact that the GPU is being managed as a display device by Windows, and this can introduce a variety of artifacts into the timing. One example is that the windows driver and WDDM will batch work for the GPU, and may interleave display work in the middle of CUDA GPU work.
if possible, time your cuda code on linux, or else on a windows GPU
in TCC mode.
for performance, always build without the -G switch. In visual studio, this usually corresponds to building the release, not the debug version of the project.
To get a good performance comparison, it's usually advisable to do some "warm up runs" before actually measuring the timing results. These will eliminate "start-up" and other one-time measurement issues, are you are more likely to get sensible results. You may also wish to run your code a number of times and average the results.
It's also usually advisable to compile with an arch flag that corresponds to your GPU, so for example -arch=sm_20 for a cc2.0 GPU.
when I use this code in cuda it only increase a[0],a[1],a[2] other was 0 (didn't increased)
__global__ void inc2(int * a){
int i= threadIdx.x;
i%=10;
atomicAdd(&(a[i]),1);
}
when I write
__global__ void inc2(int * a){
int i= threadIdx.x;
i%=10;
atomicAdd(&(a[6]),1);
}
it didn't increase a[6]
what's wrong? sorry
all of the code is this
__global__ void inc2(int * a){
int i= threadIdx.x;
i%=10;
atomicAdd(&(a[6]),1);
}
int main()
{
//=============================================
int aaa[10]={0};
int *q;
cudaMalloc((void**)&q,100);
cudaMemcpy(q,aaa,10,cudaMemcpyHostToDevice);
inc2<<<100,100>>>(q);
cudaMemcpy(aaa,q,10,cudaMemcpyDeviceToHost);
printf("\n\n");
for(int i=0;i<10;i++){
printf("%d\t",aaa[i]);
}
cudaFree(q);
return 0;
}
First of all, you should use proper cuda error checking any time you are having trouble with a CUDA code.
You may be confused about the size parameters associated with functions like cudaMalloc or cudaMemcpy. They represent a size in bytes. So this:
cudaMemcpy(aaa,q,10,cudaMemcpyDeviceToHost);
only transfers 10 bytes, which is 2.5 int quantities. If you want to see the modified value of a[6], you're going to have to transfer more than the first 2 int quantities in a.
If you modify these lines:
cudaMemcpy(q,aaa,40,cudaMemcpyHostToDevice);
^^
and:
cudaMemcpy(aaa,q,40,cudaMemcpyDeviceToHost);
^^
I think you'll have better results.
What is the difference between the following two functions?
__device__ inline void comparator_volatile(volatile float &A, volatile float &B, uint dir) {
float t;
if ((A > B) == dir) {
t = A;
A = B;
B = t;
}
}
__device__ inline void comparator(float &A, float &B, uint dir) {
float t;
if ((A > B) == dir) {
t = A;
A = B;
B = t;
}
__threadfence();
}
Could anyone help me?
I implement BitonicSort in some different versions based on CUDA SDK version.
For ATOMIC version (bitonicSortAtomic), I tried to use __threadfence() in __syncblocks_atomic to maintain memory consistency. But it doesn't work (the output is incorrect). I have to call comparator_volatile instead of comparator, then I get correct result. Any idea?
The BitonicSort benchmark:
// (C) Copyright 2013, University of Illinois. All Rights Reserved
#include <stdlib.h>
#include <stdio.h>
#include "parboil.h"
#define THREADS 256
#define BLOCKS 32
#define NUM_VALS 2*THREADS*BLOCKS
__device__ volatile int mutex = 0;
__device__ inline void __syncblocks_atomic(int goal) {
__syncthreads();
// __threadfence();
int tx = threadIdx.x;
if (tx == 0) {
atomicAdd((int *)&mutex, 1);
while(g_mutex != goal) {}
}
__syncthreads();
}
__device__ inline void comparator(float &A, float &B, uint dir) {
float t;
if ((A > B) == dir) {
t = A;
A = B;
B = t;
}
}
__device__ inline void comparator_volatile(volatile float &A, volatile float &B, uint dir) {
float t;
if ((A > B) == dir) {
t = A;
A = B;
B = t;
}
}
#ifdef NAIVE
__global__ void bitonicSortNaive(float *src, int stride, int size) {
unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
uint dir = (tid & (size / 2)) == 0;
unsigned int pos = 2*tid - (tid & (stride - 1));
comparator(src[pos], src[pos+stride], dir);
}
#endif
#ifdef ATOMIC
__global__ void bitonicSortAtomic(float *src, int length) {
uint numBlocks = gridDim.x * gridDim.y * gridDim.z;
uint goalVal = 0;
uint tid = threadIdx.x + blockDim.x * blockIdx.x;
for(uint size=2; size<=length; size<<=1) {
for(uint stride=size>>1; stride>0; stride=stride>>1) {
uint dir = (tid & (size / 2)) == 0;
uint pos = 2*tid - (tid & (stride - 1));
comparator_volatile(src[pos], src[pos+stride], dir);
if(stride>THREADS || (stride==1 && size>=THREADS)) {
goalVal += numBlocks;
__syncblocks_atomic(goalVal);
}
else
__syncthreads();
} // end for stride
} // end for size
}
#endif
int main() {
printf("[BENCH] Bitonic Sort %d elements\n", NUM_VALS);
printf("[BENCH] Xuhao Chen <cxh#illinois.edu>\n");
#ifdef NAIVE
printf("[BENCH] Naive version\n");
#endif
#ifdef ATOMIC
printf("[BENCH] Atomic Barrier\n");
#endif
float *values = (float*) malloc( NUM_VALS * sizeof(float));
array_init(values, NUM_VALS);
float *dev_values;
size_t size = NUM_VALS * sizeof(float);
cudaMalloc((void**) &dev_values, size);
cudaMemcpy(dev_values, values, size, cudaMemcpyHostToDevice);
dim3 blocks(BLOCKS,1);
dim3 threads(THREADS,1);
cudaDeviceSynchronize();
#ifdef NAIVE
int j, k;
for (k = 2; k <= NUM_VALS; k <<= 1) {
for (j=k>>1; j>0; j=j>>1) {
bitonicSortNaive<<<blocks, threads>>>(dev_values, j, k);
}
}
#endif
#ifdef ATOMIC
bitonicSortAtomic<<<blocks, threads>>>(dev_values, NUM_VALS);
#endif
cudaDeviceSynchronize();
cudaMemcpy(values, dev_values, size, cudaMemcpyDeviceToHost);
cudaFree(dev_values);
free(values);
}
__syncblocks_atomic is a function to implement global barrier. Since there is inter-block communication, I have to keep data consistency.
The CUDA programming guide states:
If a variable located in
global or shared memory is declared as volatile, the compiler assumes that its value
can be changed or used at any time by another thread and therefore any reference to
this variable compiles to an actual memory read or write instruction.
This basically means that the memory will be flushed immediately as you assign a value to the variable, and will be fetched directly from the memory (with no cache) when you try to read its value.
In you first code sample, since both A and B are volatile, 6 actual memory instructions are generated. One read/write each time you use either A or B. The good point is that other threads will be able to see that modifications earlier, while they are made. The downside is that the execution will be slower, because the caches will be disabled.
In your second code sample, on the other side, the GPU is authorized to use caches to accelerate its execution, until the end of the function, when it's forced to issue a memory write. If both A and B are already cached, only 2 memory writes are issued. The downside is that other threads might only be able to see the changed value after the fence.
Another thing you should consider is that operations are not atomic.
If other threads try to access A and B while your function is executing, they might see a partial execution of the function, in both cases. In the second code sample, this is a bit less likely to happen, because the thread will probably use its cached value, and flush the final values at once (anyway, you should not rely on this).
Also, volatile works as a faster version of __threadfence() among threads in the same warp (because threads in a warp act synchronously).
I found some difficulty when I try to access a global array from function that's executed from device:
float globTemp[3][3] = "some value in here";
__device__ float* globTemp_d;
__global__ void compute(int *a, int w)
{
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int i = y*w+x;
if(x<3 && y<3)
a[i] = 1+globTemp_d[i];
}
int hostFunc(){
float *a_d;
cudaMalloc((void**)&a_d, 3*3*sizeof(int));
cudaMalloc((void**)&globTemp_d, 3*3*sizeof(int));
cudaMemcpy(globTemp_d,globTemp, 3*3*sizeof(float), cudaMemcpyHostToDevice);
compute<<<1,1>>>(a_d,3);
cudaMemcpy(a,a_d, 3*3*sizeof(float), cudaMemcpyDeviceToHost);
}
However, I get seg fault when i try to access globTemp_d[i]. Am I doing something wrong in here?
There are a variety of problems with your code:
Your grid is a 1D grid of 1D threadblocks (in fact you are launching a single block of 1 thread) but your kernel is written as if it were expecting a 2D threadblock structure (using .x and .y built-in variables). A single thread won't get the work done certainly, and a 1D threadblock won't work with your kernel code.
__device__ variables are not accessed with cudaMalloc and cudaMemcpy. We use a different set of API calls like cudaMemcpyToSymbol.
You're not doing any cuda error checking which is always recommended when you're having difficulty. You should do cuda error checking on both API calls and kernel calls.
You're mixing float variables (a_d ) with int variables in the kernel parameters (int *a) so I don't think this code would compile without at least a warning. And that can lead to strange behavior of course if you ignore it.
This is the closest I could come to your code while fixing all the errors:
#include <stdio.h>
__device__ float* globTemp_d;
__global__ void compute(float *a, int w)
{
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int i = (y*w)+x;
if((x<3) && (y<3))
a[i] = 1.0f+globTemp_d[i];
}
int main(){
float *a_d, *d_globTemp;
float globTemp[3][3] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f};
float a[(3*3)];
dim3 threads(3,3);
dim3 blocks(1);
cudaMalloc((void**)&a_d, 3*3*sizeof(float));
cudaMalloc((void**)&d_globTemp, 3*3*sizeof(float));
cudaMemcpy(d_globTemp,globTemp, 3*3*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(globTemp_d, &d_globTemp, sizeof(float *));
compute<<<blocks,threads>>>(a_d,3);
cudaMemcpy(a,a_d, 3*3*sizeof(float), cudaMemcpyDeviceToHost);
printf("results:\n");
for (int i = 0; i<(3*3); i++)
printf("a[%d] = %f\n", i, a[i]);
return 0;
}
This code can be simplified by dispensing with the __device__ variable and just passing d_globTemp as a parameter to the kernel, and using it in place of references to globTemp_d. However I did not make that simplification.