Optimizing memory access for complex numbers - cuda

I have a kernel that operates on complex numbers, and I am loading the values like this:
thrust::complex<float> x = X[tIdx];
where X is in global memory. When I profile this kernel with nvvp, I find that it is memory bandwidth-limited and the profiler suggests that I improve the memory access pattern:
Global Load L2 Transactions/Access=8, Ideal Transactions/Access=4
The disassembly confirms that this line is indeed split into two 32-bit loads, producing a strided access pattern:
LDG.E R9, [R16];
LDG.E R11, [R16+0x4];
How can I get this to compile into a single 64-bit load?
Potential solutions
I realize this is pretty closely related to this earlier question but the proposed solutions (change the global memory layout or use shared memory) seem less ideal than a 64-bit load.
The NVidia developer blog suggests reinterpret_cast to a vector data type such as float2, but I'm a little hazy about how this fits in with pointer aliasing rules.
I must also confess that this is somewhat of a theoretical question. For this particular kernel, I'm limited by the device memory bandwidth, so halving the # of L2 transactions shouldn't significantly improve the overall performance. But I anticipate working with more complex numbers in my future, and if there's a simple solution then I'd like to start using it now.

The basic problem here is that the compiler seems to need explicit alignment specifications for a type before it will generate vector load and store instructions. Consider the following trivial example:
// Home-made single-precision complex value. The type carries an explicit
// 8-byte alignment request through __align__(8).
class __align__(8) cplx0
{
public:
    // Real component followed by the imaginary component.
    float re, img;

    // Builds the value from its real and imaginary components.
    __host__ __device__ cplx0(float real, float imag)
        : re(real), img(imag)
    {
    }
};
// Home-made single-precision complex value declared without any explicit
// alignment specifier.
class cplx1
{
public:
    // Real component followed by the imaginary component.
    float re, img;

    // Builds the value from its real and imaginary components.
    __host__ __device__ cplx1(float real, float imag)
        : re(real), img(imag)
    {
    }
};
// Stores val into out[i] for every i in [0, N).
// Each thread begins at its global 1-D index and advances by the total number
// of launched threads, so the loop covers the range for any 1-D launch shape.
template<typename T>
__global__ void memsetkernel(T* out, const T val, int N)
{
    const int first = threadIdx.x + blockIdx.x * blockDim.x;
    const int step  = blockDim.x * gridDim.x;

#pragma unroll 8
    for (int idx = first; idx < N; idx += step) {
        out[idx] = val;
    }
}
// Copies in[i] to out[i] for every i in [0, N).
// Both pointers are declared __restrict__, and the source is const.
// Each thread begins at its global 1-D index and advances by the total number
// of launched threads.
template<typename T>
__global__ void memcpykernel(const T* __restrict__ in, T* __restrict__ out, int N)
{
    const int first = threadIdx.x + blockIdx.x * blockDim.x;
    const int step  = blockDim.x * gridDim.x;

#pragma unroll 8
    for (int idx = first; idx < N; idx += step) {
        out[idx] = in[idx];
    }
}
// Host-side wrapper: launches memcpykernel<T> with 26 blocks of 1024 threads
// to copy Nitems elements from in to out, then waits for the device to finish.
// Both pointers are handed straight to the kernel.
template<typename T>
void memcpy(const T* in, T* out, int Nitems)
{
    const int threadsPerBlock = 1024;
    const int blockCount = 13 * 2; // GTX 970 with 13 SM

    memcpykernel<T><<<blockCount, threadsPerBlock>>>(in, out, Nitems);
    cudaDeviceSynchronize();
}
// Host-side wrapper: launches memsetkernel<T> with 26 blocks of 1024 threads
// to store value into the first Nitems elements of in, then waits for the
// device to finish. The pointer is handed straight to the kernel.
template<typename T>
void memset(T* in, const T value, int Nitems)
{
    const int threadsPerBlock = 1024;
    const int blockCount = 13 * 2; // GTX 970 with 13 SM

    memsetkernel<T><<<blockCount, threadsPerBlock>>>(in, value, Nitems);
    cudaDeviceSynchronize();
}
// Benchmark driver. For each of the two complex types it allocates an input
// and an output buffer of Nitems elements and runs the memset/memcpy wrappers
// ten times, so the generated kernels can be compared under a profiler.
int main(void)
{
    const int Nitems = 1 << 24;
    typedef cplx0 fcomplex0;
    typedef cplx1 fcomplex1;

    // Pass 1: the explicitly aligned type.
    {
        fcomplex0* in;
        fcomplex0* out;
        cudaMalloc((void **)&in, Nitems * sizeof(fcomplex0));
        // Size the buffer by its own element type; the original used
        // sizeof(fcomplex1) here, a copy/paste slip.
        cudaMalloc((void **)&out, Nitems * sizeof(fcomplex0));
        for (int i = 0; i < 10; i++) {
            memset<fcomplex0>(in, fcomplex0(1.0f, 1.0f), Nitems);
            memcpy<fcomplex0>(in, out, Nitems);
        }
        cudaFree(in);
        cudaFree(out);
    }

    // Pass 2: the type without an alignment specifier.
    {
        fcomplex1* in;
        fcomplex1* out;
        cudaMalloc((void **)&in, Nitems * sizeof(fcomplex1));
        cudaMalloc((void **)&out, Nitems * sizeof(fcomplex1));
        for (int i = 0; i < 10; i++) {
            memset<fcomplex1>(in, fcomplex1(1.0f, 1.0f), Nitems);
            // The memcpy wrapper already ends with cudaDeviceSynchronize(),
            // so no extra synchronisation is needed in this loop.
            memcpy<fcomplex1>(in, out, Nitems);
        }
        cudaFree(in);
        cudaFree(out);
    }

    cudaDeviceReset();
    return 0;
}
Here we have two home-baked complex types, one with explicit alignment specifications, and one without. Otherwise they are identical. Putting them through naïve memcpy and memset kernels in this test harness allows us to inspect the code generation behaviour of the toolchain for each type and benchmark the performance.
Firstly, the code. For cplx0 class, which has explicit 8-byte alignment, the compiler emits vectorized loads and stores in both kernels:
memcpykernel
ld.global.nc.v2.f32 {%f5, %f6}, [%rd17];
st.global.v2.f32 [%rd18], {%f5, %f6};
memsetkernel
st.global.v2.f32 [%rd11], {%f1, %f2};
whereas for the cplx1 case, it does not:
memcpykernel
ld.global.nc.f32 %f1, [%rd16];
ld.global.nc.f32 %f2, [%rd16+4];
st.global.f32 [%rd15+4], %f2;
st.global.f32 [%rd15], %f1;
memsetkernel
st.global.f32 [%rd11+4], %f2;
st.global.f32 [%rd11], %f1;
Looking at performance, there is a non-trivial difference in performance for the memset case (CUDA 8 release toolkit, GTX 970 with Linux 367.48 driver):
$ nvprof ./complex_types
==29074== NVPROF is profiling process 29074, command: ./complex_types
==29074== Profiling application: ./complex_types
==29074== Profiling result:
Time(%) Time Calls Avg Min Max Name
33.04% 19.264ms 10 1.9264ms 1.9238ms 1.9303ms void memcpykernel<cplx1>(cplx1 const *, cplx1*, int)
32.72% 19.080ms 10 1.9080ms 1.9055ms 1.9106ms void memcpykernel<cplx0>(cplx0 const *, cplx0*, int)
19.15% 11.165ms 10 1.1165ms 1.1120ms 1.1217ms void memsetkernel<cplx1>(cplx1*, cplx1, int)
15.09% 8.7985ms 10 879.85us 877.67us 884.13us void memsetkernel<cplx0>(cplx0*, cplx0, int)
The Thrust templated complex type does not have an explicit alignment definition (it potentially could via specialization, but that would somewhat defeat the purpose). So your only choice here is to either make your own version of the Thrust type with explicit alignment, or use another complex type which does (like the cuComplex type which CUBLAS and CUFFT use).

Related

Compare Thrust fill with kernel launch speed [duplicate]

I want to add 128-bit vectors with carry. My 128-bit version (addKernel128 in the code below) is twice as slow as the basic 32-bit version (addKernel32 below).
Do I have memory coalescing problems ? How can I get better performance ?
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
// Inline-PTX helpers for chaining 32-bit unsigned additions: c = a + b.
// UADDO emits add.cc.u32 and is used as the first step of a chain.
// NOTE(review): per the PTX ISA the .cc suffix records the carry-out in the
// condition-code register — confirm against the PTX documentation.
// Both macros already end in ';', so call sites omit the trailing semicolon.
#define UADDO(c, a, b) asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
// UADDC emits addc.cc.u32 ("add with carry"): it is used after UADDO/UADDC so
// the carry from the previous step is included in this addition.
#define UADDC(c, a, b) asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
// Element-wise c[i] = a[i] + b[i] for every i in [0, size).
// Each thread starts at its global 1-D index and steps by the total number of
// launched threads until it runs past size.
__global__ void addKernel32(unsigned int *c, const unsigned int *a, const unsigned int *b, const int size)
{
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < size;
         idx += blockDim.x * gridDim.x)
    {
        c[idx] = a[idx] + b[idx];
    }
}
// Adds the inputs four 32-bit words at a time: each group of four consecutive
// words is loaded as one uint4, added word by word with the carry chained from
// .x through .w, and stored as one uint4.
// Only size / 4 groups are processed, so up to three trailing words are left
// untouched when size is not a multiple of 4.
// NOTE(review): assumes a, b and c are suitably aligned for uint4 access —
// confirm at the call site.
__global__ void addKernel128(unsigned *c, const unsigned *a, const unsigned *b, const int size)
{
    const uint4 *lhs = (const uint4 *)a;
    const uint4 *rhs = (const uint4 *)b;
    uint4 *sum = (uint4 *)c;
    const int groupCount = size / 4;

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < groupCount;
         idx += blockDim.x * gridDim.x)
    {
        uint4 x = lhs[idx];
        uint4 y = rhs[idx];
        uint4 r;
        // The four additions must stay in this order: each UADDC consumes the
        // carry of the statement before it. The macros supply their own ';'.
        UADDO(r.x, x.x, y.x)
        UADDC(r.y, x.y, y.y) // add with carry
        UADDC(r.z, x.z, y.z) // add with carry
        UADDC(r.w, x.w, y.w) // add with carry (no overflow checking for clarity)
        sum[idx] = r;
    }
}
// Timing harness: allocates three device arrays of `size` 32-bit words, fills
// them byte-wise with cudaMemset, times a single addKernel128 launch with CUDA
// events and prints the elapsed time reported by cudaEventElapsedTime.
int main()
{
    const int size = 10000000; // 10 million
    const size_t bytes = size * sizeof(int);

    unsigned int *d_a, *d_b, *d_c;
    cudaMalloc((void**)&d_a, bytes);
    cudaMalloc((void**)&d_b, bytes);
    cudaMalloc((void**)&d_c, bytes);

    // dummy init just for the example (cudaMemset writes the value per byte)
    cudaMemset(d_a, 1, bytes);
    cudaMemset(d_b, 2, bytes);
    cudaMemset(d_c, 0, bytes);

    const int nbThreads = 512;
    const int nbBlocks = 1024; // for example

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Bracket the launch with the two events.
    cudaEventRecord(start);
    addKernel128<<<nbBlocks, nbThreads>>>(d_c, d_a, d_b, size);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float elapsed = 0;
    cudaEventElapsedTime(&elapsed, start, stop);

    cudaFree(d_c);
    cudaFree(d_b);
    cudaFree(d_a);
    cudaDeviceReset();

    printf("Elapsed = %g\n", elapsed);
    return 0;
}
Timing CUDA code on a WDDM GPU can be quite difficult for a variety of reasons. Most of these revolve around the fact that the GPU is being managed as a display device by Windows, and this can introduce a variety of artifacts into the timing. One example is that the windows driver and WDDM will batch work for the GPU, and may interleave display work in the middle of CUDA GPU work.
if possible, time your cuda code on linux, or else on a windows GPU
in TCC mode.
for performance, always build without the -G switch. In visual studio, this usually corresponds to building the release, not the debug version of the project.
To get a good performance comparison, it's usually advisable to do some "warm up runs" before actually measuring the timing results. These will eliminate "start-up" and other one-time measurement issues, and you are more likely to get sensible results. You may also wish to run your code a number of times and average the results.
It's also usually advisable to compile with an arch flag that corresponds to your GPU, so for example -arch=sm_20 for a cc2.0 GPU.

Optimizing a Very Simple Image Processing Kernel

I was hoping someone could give me a hand here. I've been getting my feet wet in CUDA, and wrote a simple kernel to negate an image. It works brilliantly and I'm pretty happy with it.
I guess my rather stupid question is... is there any way I could optimize this kernel? I tried to use shared memory, however the number of pixels is 19224000.
I tried to just do __shared__ int sharedMem[19224000], which simply didn't run. I'm a little lost here, as a CUDA programmer could probably tell.
Here is my kernel:
// Negates an image stored as one int per pixel: new_array[i] = 255 - old_array[i]
// for every i in [0, rows * cols). One thread handles one pixel; threads whose
// global 1-D index falls outside the image do nothing.
// (The unused local `tIdx` from the original has been removed.)
__global__ void cuda_negate_image(int * new_array, int * old_array, int rows, int cols){
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    const int n = rows * cols;
    if (i < n)
        new_array[i] = 255 - old_array[i];
}
Any help would be awesome!
There isn't much scope for optimisation here. For simple, memory bound operations the four golden rules are usually:
Coalesce memory reads and writes
Maximise byte per memory transaction when using coalesced memory access
Use the appropriate compiler heuristics to ensure that emitted code is optimal
Amortise thread scheduling and setup overhead by having each thread process multiple inputs, where practical. (Note this requires a different approach to execution grid parameter selection, i.e. size for the utilisation of your device, rather than the total amount of available work)
Apply those principles to your kernel and I get something like this:
// Scalar case: writes 255 minus the input into out.
__device__ __forceinline__ void negate(int &in, int &out)
{
    out = 255 - in;
}

// Two-component case: applies the scalar negate to x and y independently.
__device__ __forceinline__ void negate(int2 &in, int2 & out)
{
    negate(in.x, out.x);
    negate(in.y, out.y);
}

// Four-component case: applies the scalar negate to x, y, z and w independently.
__device__ __forceinline__ void negate(int4 &in, int4 & out)
{
    negate(in.x, out.x);
    negate(in.y, out.y);
    negate(in.z, out.z);
    negate(in.w, out.w);
}
// Applies negate() to each of the n elements of old_array and stores the
// result at the same index of new_array. n counts elements of type T.
// Each thread starts at its global 1-D index and steps by the total number of
// launched threads, so one thread may process several elements.
template<typename T>
__global__ void cuda_negate_image(T * __restrict__ new_array, T * __restrict__ old_array, int n)
{
    const int step = blockDim.x * gridDim.x;

    for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n; idx += step) {
        T source = old_array[idx];
        T result;
        negate(source, result);
        new_array[idx] = result;
    }
}

// Explicit instantiations for the scalar, 2-wide and 4-wide element types.
template __global__ void cuda_negate_image<int>(int * __restrict__ new_array, int * __restrict__ old_array, int n);
template __global__ void cuda_negate_image<int2>(int2 * __restrict__ new_array, int2 * __restrict__ old_array, int n);
template __global__ void cuda_negate_image<int4>(int4 * __restrict__ new_array, int4 * __restrict__ old_array, int n);
Only benchmarking on your target hardware will tell you which version of the code is the fastest and whether this is even worth bothering with.

Simple CUDA Kernel Optimization

In the process of speeding up an application, I have a very simple kernel which does the type casting as shown below:
// Converts in[i] to float and stores it in out[i]; one element per thread.
// Threads whose global 1-D index is not below nElem return without writing.
__global__ void UChar2FloatKernel(float *out, unsigned char *in, int nElem){
    const unsigned int idx = threadIdx.x + (blockDim.x * blockIdx.x);
    if (idx >= nElem) {
        return;
    }
    out[idx] = (float) in[idx];
}
The global memory access is coalesced and in my understanding using shared memory will also not be beneficial as there are not multiple reads of the same memory. Does any one have any idea if there is any optimization which can be performed to speed up this kernel. The input and output data is already on the device, so no host to device memory copy will be required.
The single biggest optimisation you can perform on a code like that one is to use resident threads and increase the number of transactions each thread performs. While the CUDA block scheduling model is pretty lightweight, it isn't free, and launching a lot of blocks containing threads which do only a single memory load and a single memory store will accrue a lot of block scheduling overhead. So only launch as many blocks as will "fill" all the SMs of your GPU and have each thread do more work.
The second obvious optimization is switch to 128 byte memory transactions for loads, which should give you a tangible bandwidth utilization gain. On a Fermi or Kepler GPU this won't give as large a performance boost as on first and second generation hardware.
Putting this all together into a simple benchmark:
// Baseline kernel: each thread converts at most one element, out[i] = float(in[i]),
// and only when its global 1-D index is below nElem.
__global__
void UChar2FloatKernel(float *out, unsigned char *in, int nElem)
{
    const unsigned int idx = threadIdx.x + (blockDim.x * blockIdx.x);
    if (idx >= nElem) {
        return;
    }
    out[idx] = (float) in[idx];
}
// Same conversion as the baseline, but every thread loops: it starts at its
// global 1-D index and advances by the total number of launched threads, so a
// small grid can still cover all nElem elements.
__global__
void UChar2FloatKernel2(float *out,
                        const unsigned char *in,
                        int nElem)
{
    const unsigned int step = gridDim.x * blockDim.x;
    unsigned int idx = threadIdx.x + (blockDim.x * blockIdx.x);

    while (idx < nElem) {
        out[idx] = (float) in[idx];
        idx += step;
    }
}
// Vectorised variant: each loop iteration reads one uchar4 and writes one
// float4, converting four values at a time. nElem counts uchar4/float4
// groups, not individual bytes. Each thread starts at its global 1-D index
// and advances by the total number of launched threads.
__global__
void UChar2FloatKernel3(float4 *out,
                        const uchar4 *in,
                        int nElem)
{
    const unsigned int step = gridDim.x * blockDim.x;
    unsigned int idx = threadIdx.x + (blockDim.x * blockIdx.x);

    while (idx < nElem) {
        const uchar4 packed = in[idx];                                  // 32 bit load
        out[idx] = make_float4(packed.x, packed.y, packed.z, packed.w); // 128 bit store
        idx += step;
    }
}
// Benchmark driver for the three conversion kernels. It builds a host test
// pattern, uploads it, then launches each kernel five times so a profiler can
// compare them.
//
// Changes from the original: the host pattern is now copied to the device
// (previously the kernels read an uninitialised buffer), the host array is
// released, and the device buffers are freed before the reset. The upload
// adds one host-to-device copy to any profile of this program.
int main(void)
{
    const int n = 2 << 20;
    const size_t inBytes  = sizeof(unsigned char) * size_t(n);
    const size_t outBytes = sizeof(float) * size_t(n);

    // Host-side input pattern.
    unsigned char *a = new unsigned char[n];
    for(int i=0; i<n; i++) {
        a[i] = i%255;
    }

    unsigned char *a_;
    cudaMalloc((void **)&a_, inBytes);
    // Upload the pattern so the kernels convert defined data.
    cudaMemcpy(a_, a, inBytes, cudaMemcpyHostToDevice);
    delete [] a; // the blocking copy above has finished with the host array

    float *b_;
    cudaMalloc((void **)&b_, outBytes);
    cudaMemset(b_, 0, outBytes); // warmup

    // One element per thread: n/512 blocks.
    for(int i=0; i<5; i++)
    {
        dim3 blocksize(512);
        dim3 griddize(n/512);
        UChar2FloatKernel<<<griddize, blocksize>>>(b_, a_, n);
    }

    // Looping kernel on a small grid.
    for(int i=0; i<5; i++)
    {
        dim3 blocksize(512);
        dim3 griddize(8); // 4 blocks per SM
        UChar2FloatKernel2<<<griddize, blocksize>>>(b_, a_, n);
    }

    // Vectorised looping kernel: the element count is expressed in groups of 4.
    for(int i=0; i<5; i++)
    {
        dim3 blocksize(512);
        dim3 griddize(8); // 4 blocks per SM
        UChar2FloatKernel3<<<griddize, blocksize>>>((float4*)b_, (uchar4*)a_, n/4);
    }

    // Let the queued launches finish before releasing their buffers.
    cudaDeviceSynchronize();
    cudaFree(a_);
    cudaFree(b_);
    cudaDeviceReset();
    return 0;
}
gives me this on a small Fermi device:
>nvcc -m32 -Xptxas="-v" -arch=sm_21 cast.cu
cast.cu
tmpxft_000014c4_00000000-5_cast.cudafe1.gpu
tmpxft_000014c4_00000000-10_cast.cudafe2.gpu
cast.cu
ptxas : info : 0 bytes gmem
ptxas : info : Compiling entry function '_Z18UChar2FloatKernel2PfPKhi' for 'sm_2
1'
ptxas : info : Function properties for _Z18UChar2FloatKernel2PfPKhi
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas : info : Used 5 registers, 44 bytes cmem[0]
ptxas : info : Compiling entry function '_Z18UChar2FloatKernel3P6float4PK6uchar4
i' for 'sm_21'
ptxas : info : Function properties for _Z18UChar2FloatKernel3P6float4PK6uchar4i
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas : info : Used 8 registers, 44 bytes cmem[0]
ptxas : info : Compiling entry function '_Z17UChar2FloatKernelPfPhi' for 'sm_21'
ptxas : info : Function properties for _Z17UChar2FloatKernelPfPhi
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas : info : Used 3 registers, 44 bytes cmem[0]
tmpxft_000014c4_00000000-5_cast.cudafe1.cpp
tmpxft_000014c4_00000000-15_cast.ii
>nvprof a.exe
======== NVPROF is profiling a.exe...
======== Command: a.exe
======== Profiling result:
Time(%) Time Calls Avg Min Max Name
40.20 6.61ms 5 1.32ms 1.32ms 1.32ms UChar2FloatKernel(float*, unsigned char*, int)
29.43 4.84ms 5 968.32us 966.53us 969.46us UChar2FloatKernel2(float*, unsigned char const *, int)
26.35 4.33ms 5 867.00us 866.26us 868.10us UChar2FloatKernel3(float4*, uchar4 const *, int)
4.02 661.34us 1 661.34us 661.34us 661.34us [CUDA memset]
In the latter two kernels, using only 8 blocks gives a large speed up compared to 4096 blocks, which confirms the idea that multiple work items per thread is the best way to improve performance in this sort of memory bound, low instruction count kernel.
Here is a CPU version of the function and 4 GPU kernels. 3 kernels are from @talonmies' answer, and I have added kernel2, which utilizes vector data types only.
// CPU reference for comparison: converts the first n bytes of a to float and
// stores them in b. Does nothing when n is zero or negative.
void UChar2Float(unsigned char *a, float *b, const int n){
    int idx = 0;
    while (idx < n) {
        b[idx] = static_cast<float>(a[idx]);
        ++idx;
    }
}
// One element per thread: out[i] = float(in[i]) when the thread's global 1-D
// index is below nElem; other threads return without writing.
__global__ void UChar2FloatKernel1(float *out, const unsigned char *in, int nElem){
    const unsigned int idx = threadIdx.x + (blockDim.x * blockIdx.x);
    if (idx >= nElem) {
        return;
    }
    out[idx] = (float) in[idx];
}
// One uchar4 -> float4 conversion per thread, with no loop. nElem counts
// 4-wide groups; threads whose global 1-D index is not below nElem return
// without writing.
__global__ void UChar2FloatKernel2(float4 *out, const uchar4 *in, int nElem){
    const unsigned int idx = threadIdx.x + (blockDim.x * blockIdx.x);
    if (idx >= nElem) {
        return;
    }
    const uchar4 packed = in[idx];                                  // 32 bit load
    out[idx] = make_float4(packed.x, packed.y, packed.z, packed.w); // 128 bit store
}
// Scalar conversion with a per-thread loop: each thread starts at its global
// 1-D index and advances by the total number of launched threads until it
// reaches nElem.
__global__ void UChar2FloatKernel3(float *out, const unsigned char *in, int nElem) {
    const unsigned int step = gridDim.x * blockDim.x;
    unsigned int idx = threadIdx.x + (blockDim.x * blockIdx.x);

    while (idx < nElem) {
        out[idx] = (float) in[idx];
        idx += step;
    }
}
// Vectorised conversion with a per-thread loop: every iteration turns one
// uchar4 into one float4. nElem counts 4-wide groups. Each thread starts at
// its global 1-D index and advances by the total number of launched threads.
__global__ void UChar2FloatKernel4(float4 *out, const uchar4 *in, int nElem) {
    const unsigned int step = gridDim.x * blockDim.x;
    unsigned int idx = threadIdx.x + (blockDim.x * blockIdx.x);

    while (idx < nElem) {
        const uchar4 packed = in[idx];                                  // 32 bit load
        out[idx] = make_float4(packed.x, packed.y, packed.z, packed.w); // 128 bit store
        idx += step;
    }
}
On my Geforce GT 640, here are the timing results:
simpleKernel (cpu): 0.101463 seconds.
simpleKernel 1 (gpu): 0.007845 seconds.
simpleKernel 2 (gpu): 0.004914 seconds.
simpleKernel 3 (gpu): 0.005461 seconds.
simpleKernel 4 (gpu): 0.005461 seconds.
So we can see kernel2 which utilizes vector types only, is the winner. I have done these tests for (32 * 1024 * 768) elements. nvprof output is also shown below:
Time(%) Time Calls Avg Min Max Name
91.68% 442.45ms 4 110.61ms 107.43ms 119.51ms [CUDA memcpy DtoH]
3.76% 18.125ms 1 18.125ms 18.125ms 18.125ms [CUDA memcpy HtoD]
1.43% 6.8959ms 1 6.8959ms 6.8959ms 6.8959ms UChar2FloatKernel1(float*, unsigned char const *, int)
1.10% 5.3315ms 1 5.3315ms 5.3315ms 5.3315ms UChar2FloatKernel3(float*, unsigned char const *, int)
1.04% 5.0184ms 1 5.0184ms 5.0184ms 5.0184ms UChar2FloatKernel4(float4*, uchar4 const *, int)
0.99% 4.7816ms 1 4.7816ms 4.7816ms 4.7816ms UChar2FloatKernel2(float4*, uchar4 const *, int)
You can decorate the input array by the const __restrict__ qualifiers which notifies the compiler that the data is read-only and not aliased by any other pointer. In this way, the compiler will detect that the access is uniform and can optimise it by using one of the read-only caches (the constant cache or, on compute capability >=3.5, read-only data cache known as texture cache).
You can also decorate the output array by the __restrict__ qualifier to suggest the compiler other optimizations.
Finally, the recommendation by DarkZeros is worth to be followed.
You better write a vectorized version of your code, writing float4 into out at once.
This should be pretty straightforward if nElem happens to be a multiple of 4; otherwise, you might need to handle the remaining elements separately.

CUDA volatile and threadfence

What is the difference between the following two functions?
// Compare-and-swap step: exchanges A and B when (A > B) equals dir.
// A and B are volatile references, so each mention of them below is an actual
// memory access: two reads for the comparison, then two reads and two writes
// for the swap. The order of these statements is therefore part of the
// function's behaviour.
__device__ inline void comparator_volatile(volatile float &A, volatile float &B, uint dir) {
float t;
// Reads A and B once each for the test.
if ((A > B) == dir) {
// Reads A again into a local temporary.
t = A;
// Reads B, writes A.
A = B;
// Writes B from the temporary.
B = t;
}
}
// Compare-and-swap step on ordinary (non-volatile) references: exchanges A and
// B when (A > B) equals dir, then always issues __threadfence(), whether or
// not a swap took place.
__device__ inline void comparator(float &A, float &B, uint dir) {
    const bool shouldSwap = ((A > B) == dir);
    if (shouldSwap) {
        const float held = A;
        A = B;
        B = held;
    }
    __threadfence();
}
Could anyone help me?
I implement BitonicSort in some different versions based on CUDA SDK version.
For ATOMIC version (bitonicSortAtomic), I tried to use __threadfence() in __syncblocks_atomic to maintain memory consistency. But it doesn't work (the output is incorrect). I have to call comparator_volatile instead of comparator, then I get correct result. Any idea?
The BitonicSort benchmark:
// (C) Copyright 2013, University of Illinois. All Rights Reserved
#include <stdlib.h>
#include <stdio.h>
#include "parboil.h"
// Launch geometry of the benchmark: threads per block and number of blocks.
#define THREADS 256
#define BLOCKS 32
// Number of values to sort: twice the number of launched threads.
// The expansion is parenthesised so the macro stays one value inside larger
// expressions (e.g. x / NUM_VALS); existing uses are unaffected.
#define NUM_VALS (2*THREADS*BLOCKS)
// Arrival counter for the inter-block barrier below. It is volatile so the
// polling loop re-reads it from memory on every iteration. It is never reset
// here; callers pass an ever-growing goal instead.
__device__ volatile int mutex = 0;

// Inter-block barrier built on the counter above.
//   goal - counter value to wait for. The caller in this file raises it by
//          the number of blocks before every call.
// Thread 0 of each block adds one to the counter and spins until the counter
// equals goal; the two __syncthreads() calls hold the other threads of the
// block before and after that wait.
// NOTE(review): presumably every block of the grid must be running at the
// same time, otherwise the spin loop cannot finish — confirm for the launch
// configuration in use.
__device__ inline void __syncblocks_atomic(int goal) {
    __syncthreads();
    // __threadfence();   (left disabled, as in the original)
    if (threadIdx.x == 0) {
        atomicAdd((int *)&mutex, 1);
        // Fixed: the original polled `g_mutex`, which is not declared; the
        // counter being incremented above is `mutex`.
        while (mutex != goal) {}
    }
    __syncthreads();
}
// Compare-and-swap step on ordinary (non-volatile) references: exchanges A and
// B when (A > B) equals dir, and otherwise leaves both untouched.
__device__ inline void comparator(float &A, float &B, uint dir) {
    if ((A > B) != dir) {
        return;
    }
    const float held = A;
    A = B;
    B = held;
}
// Compare-and-swap step through volatile references: exchanges A and B when
// (A > B) equals dir. Because both parameters are volatile, every mention of
// A or B below is an actual memory access, so the statement order is part of
// the function's behaviour.
__device__ inline void comparator_volatile(volatile float &A, volatile float &B, uint dir) {
float t;
// Reads A and B once each for the test.
if ((A > B) == dir) {
// Reads A again into a local temporary.
t = A;
// Reads B, writes A.
A = B;
// Writes B from the temporary.
B = t;
}
}
#ifdef NAIVE
// One compare-and-swap step of the sort; the host launches it once per
// (size, stride) pair. Each thread handles the pair (pos, pos + stride),
// where pos = 2*tid - (tid & (stride - 1)). dir is 1 when the (size / 2) bit
// of the thread index is clear and 0 otherwise; comparator() swaps the pair
// when (src[pos] > src[pos + stride]) equals dir.
__global__ void bitonicSortNaive(float *src, int stride, int size) {
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int lower = 2*tid - (tid & (stride - 1));
    const uint dir = ((tid & (size / 2)) == 0);

    comparator(src[lower], src[lower + stride], dir);
}
#endif
#ifdef ATOMIC
// Whole sort in a single launch: the (size, stride) loops that the naive
// version runs on the host are executed inside the kernel.
// After every compare-and-swap step the threads synchronise. When
// stride > THREADS, or stride == 1 with size >= THREADS, the step ends with
// the inter-block barrier __syncblocks_atomic(); otherwise it ends with a
// plain __syncthreads(). The barrier goal grows by the number of blocks
// before each inter-block barrier.
__global__ void bitonicSortAtomic(float *src, int length) {
    const uint blockCount = gridDim.x * gridDim.y * gridDim.z;
    const uint tid = blockDim.x * blockIdx.x + threadIdx.x;
    uint barrierGoal = 0;

    for (uint size = 2; size <= length; size <<= 1) {
        // dir depends only on tid and size, so it is the same for every stride.
        const uint dir = ((tid & (size / 2)) == 0);

        for (uint stride = size >> 1; stride > 0; stride >>= 1) {
            const uint lower = 2*tid - (tid & (stride - 1));
            comparator_volatile(src[lower], src[lower + stride], dir);

            const bool useGridBarrier =
                stride > THREADS || (stride == 1 && size >= THREADS);
            if (!useGridBarrier) {
                __syncthreads();
                continue;
            }
            barrierGoal += blockCount;
            __syncblocks_atomic(barrierGoal);
        } // end for stride
    } // end for size
}
#endif
// Benchmark driver: prints a banner, initialises NUM_VALS floats on the host
// with array_init(), copies them to the device, runs whichever sort variant
// was selected at compile time (NAIVE and/or ATOMIC), copies the data back
// and releases both buffers. The result is not printed or checked here.
int main() {
    printf("[BENCH] Bitonic Sort %d elements\n", NUM_VALS);
    printf("[BENCH] Xuhao Chen <cxh#illinois.edu>\n");
#ifdef NAIVE
    printf("[BENCH] Naive version\n");
#endif
#ifdef ATOMIC
    printf("[BENCH] Atomic Barrier\n");
#endif

    // Host buffer and its device mirror share one byte count.
    size_t size = NUM_VALS * sizeof(float);
    float *values = (float*) malloc(size);
    array_init(values, NUM_VALS);

    float *dev_values;
    cudaMalloc((void**) &dev_values, size);
    cudaMemcpy(dev_values, values, size, cudaMemcpyHostToDevice);

    dim3 blocks(BLOCKS,1);
    dim3 threads(THREADS,1);
    cudaDeviceSynchronize();

#ifdef NAIVE
    // One kernel launch per (span, step) pair.
    for (int span = 2; span <= NUM_VALS; span <<= 1) {
        for (int step = span >> 1; step > 0; step >>= 1) {
            bitonicSortNaive<<<blocks, threads>>>(dev_values, step, span);
        }
    }
#endif
#ifdef ATOMIC
    // Single launch; the kernel iterates over all stages itself.
    bitonicSortAtomic<<<blocks, threads>>>(dev_values, NUM_VALS);
#endif

    cudaDeviceSynchronize();
    cudaMemcpy(values, dev_values, size, cudaMemcpyDeviceToHost);

    cudaFree(dev_values);
    free(values);
}
__syncblocks_atomic is a function to implement global barrier. Since there is inter-block communication, I have to keep data consistency.
The CUDA programming guide states:
If a variable located in
global or shared memory is declared as volatile, the compiler assumes that its value
can be changed or used at any time by another thread and therefore any reference to
this variable compiles to an actual memory read or write instruction.
This basically means that the memory will be flushed immediately as you assign a value to the variable, and will be fetched directly from the memory (with no cache) when you try to read its value.
In your first code sample, since both A and B are volatile, 6 actual memory instructions are generated: one read or write each time you use either A or B. The good point is that other threads will be able to see those modifications earlier, while they are made. The downside is that the execution will be slower, because the caches will be disabled.
In your second code sample, on the other side, the GPU is authorized to use caches to accelerate its execution, until the end of the function, when it's forced to issue a memory write. If both A and B are already cached, only 2 memory writes are issued. The downside is that other threads might only be able to see the changed value after the fence.
Another thing you should consider is that operations are not atomic.
If other threads try to access A and B while your function is executing, they might see a partial execution of the function, in both cases. In the second code sample, this is a bit less likely to happen, because the thread will probably use its cached value, and flush the final values at once (anyway, you should not rely on this).
Also, volatile works as a faster version of __threadfence() among threads in the same warp (because threads in a warp act synchronously).

cudaMalloc global array cause seg fault

I found some difficulty when I try to access a global array from function that's executed from device:
float globTemp[3][3] = "some value in here";
__device__ float* globTemp_d;
// For threads with x < 3 and y < 3, stores 1 + globTemp_d[i] into a[i], where
// i = y * w + x is the flattened 2-D thread index. The float sum is converted
// to int on assignment because a points to int.
__global__ void compute(int *a, int w)
{
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    const int i = row*w + col;

    if (col < 3 && row < 3) {
        a[i] = 1+globTemp_d[i];
    }
}
// Host driver from the question, kept exactly as posted. The notes below
// point at the places the accompanying answer identifies as problems.
int hostFunc(){
// Declared float*, yet compute() takes an int* for this argument.
float *a_d;
cudaMalloc((void**)&a_d, 3*3*sizeof(int));
// globTemp_d is a __device__ variable; it is used here (and in the copy
// below) as if it were an ordinary host pointer variable.
cudaMalloc((void**)&globTemp_d, 3*3*sizeof(int));
cudaMemcpy(globTemp_d,globTemp, 3*3*sizeof(float), cudaMemcpyHostToDevice);
// Launches a single block containing a single thread.
compute<<<1,1>>>(a_d,3);
// NOTE(review): `a` is not declared anywhere in the visible snippet.
cudaMemcpy(a,a_d, 3*3*sizeof(float), cudaMemcpyDeviceToHost);
// NOTE(review): no return statement although the function is declared int.
}
However, I get seg fault when i try to access globTemp_d[i]. Am I doing something wrong in here?
There are a variety of problems with your code:
Your grid is a 1D grid of 1D threadblocks (in fact you are launching a single block of 1 thread) but your kernel is written as if it were expecting a 2D threadblock structure (using .x and .y built-in variables). A single thread won't get the work done certainly, and a 1D threadblock won't work with your kernel code.
__device__ variables are not accessed with cudaMalloc and cudaMemcpy. We use a different set of API calls like cudaMemcpyToSymbol.
You're not doing any cuda error checking which is always recommended when you're having difficulty. You should do cuda error checking on both API calls and kernel calls.
You're mixing float variables (a_d ) with int variables in the kernel parameters (int *a) so I don't think this code would compile without at least a warning. And that can lead to strange behavior of course if you ignore it.
This is the closest I could come to your code while fixing all the errors:
#include <stdio.h>
__device__ float* globTemp_d;
// For threads with x < 3 and y < 3, stores 1.0f + globTemp_d[i] into a[i],
// where i = y * w + x is the flattened 2-D thread index. globTemp_d is the
// __device__ pointer variable declared just before this kernel.
__global__ void compute(float *a, int w)
{
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    const int i = (row*w) + col;

    if ((col < 3) && (row < 3)) {
        a[i] = 1.0f+globTemp_d[i];
    }
}
// Corrected driver: uploads a 3x3 table, stores the table's device address in
// the __device__ pointer globTemp_d with cudaMemcpyToSymbol, launches compute
// with one block of 3x3 threads, then copies the 9 results back and prints
// them.
int main(){
    const size_t tableBytes = 3*3*sizeof(float);

    float globTemp[3][3] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f};
    float a[(3*3)];

    float *a_d, *d_globTemp;
    cudaMalloc((void**)&a_d, tableBytes);
    cudaMalloc((void**)&d_globTemp, tableBytes);

    // Fill the device table, then write its address into the device symbol.
    cudaMemcpy(d_globTemp,globTemp, tableBytes, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(globTemp_d, &d_globTemp, sizeof(float *));

    dim3 threads(3,3);
    dim3 blocks(1);
    compute<<<blocks,threads>>>(a_d,3);

    cudaMemcpy(a,a_d, tableBytes, cudaMemcpyDeviceToHost);

    printf("results:\n");
    for (int idx = 0; idx < (3*3); ++idx) {
        printf("a[%d] = %f\n", idx, a[idx]);
    }
    return 0;
}
This code can be simplified by dispensing with the __device__ variable and just passing d_globTemp as a parameter to the kernel, and using it in place of references to globTemp_d. However I did not make that simplification.