CUDA volatile and threadfence - cuda

What is the difference between the following two functions?
// Compare-exchange on two volatile floats: the values are swapped whenever
// the outcome of (A > B) equals `dir`. Both references are volatile, so each
// use of A or B below is compiled to an actual memory read or write.
__device__ inline void comparator_volatile(volatile float &A, volatile float &B, uint dir) {
    // Nothing to do when the pair is already in the requested order.
    if ((A > B) != dir) {
        return;
    }
    float saved = A;
    A = B;
    B = saved;
}
// Compare-exchange on two floats followed by a memory fence: the values are
// swapped whenever the outcome of (A > B) equals `dir`, and __threadfence()
// is issued on every call, whether or not a swap happened.
__device__ inline void comparator(float &A, float &B, uint dir) {
    const bool swapNeeded = ((A > B) == dir);
    if (swapNeeded) {
        const float saved = A;
        A = B;
        B = saved;
    }
    __threadfence();
}
Could anyone help me?
I implemented several different versions of BitonicSort, based on the version in the CUDA SDK.
For ATOMIC version (bitonicSortAtomic), I tried to use __threadfence() in __syncblocks_atomic to maintain memory consistency. But it doesn't work (the output is incorrect). I have to call comparator_volatile instead of comparator, then I get correct result. Any idea?
The BitonicSort benchmark:
// (C) Copyright 2013, University of Illinois. All Rights Reserved
#include <stdlib.h>
#include <stdio.h>
#include "parboil.h"
// Launch geometry of the benchmark: threads per block and blocks per grid.
#define THREADS 256
#define BLOCKS 32
// Number of values to sort (each thread compares one pair, i.e. two values).
// Parenthesized so the macro behaves as a single operand in any expression,
// e.g. `x / NUM_VALS` or `x % NUM_VALS`.
#define NUM_VALS (2 * THREADS * BLOCKS)
// Arrival counter for the software grid barrier below. It is volatile so the
// spin loop re-reads it from memory on every iteration.
__device__ volatile int mutex = 0;

// Software barrier across all thread blocks, built on a global atomic counter.
//
// goal: value that `mutex` must reach before the calling block continues.
//       The counter is never reset here, so the caller has to pass a running
//       total (it grows by the number of blocks for every barrier executed).
//
// Must be called by every thread of the block, outside divergent control
// flow, because it contains __syncthreads().
// NOTE(review): the spin-wait can only finish if every block of the grid is
// resident on the device at the same time, and the counter keeps its value
// across kernel launches -- confirm both against the launch code.
__device__ inline void __syncblocks_atomic(int goal) {
    // Publish this thread's earlier global-memory writes device-wide before
    // the block announces its arrival at the barrier.
    __threadfence();
    __syncthreads();
    if (threadIdx.x == 0) {
        atomicAdd((int *)&mutex, 1);
        // Spin on the same counter that atomicAdd updates.
        while (mutex != goal) {}
    }
    // Hold the rest of the block until thread 0 has seen all blocks arrive.
    __syncthreads();
}
// Compare-exchange: swaps A and B whenever the outcome of (A > B) equals
// `dir`; otherwise leaves both untouched.
__device__ inline void comparator(float &A, float &B, uint dir) {
    const float first = A;
    const float second = B;
    if ((first > second) == dir) {
        A = second;
        B = first;
    }
}
// Volatile flavour of the compare-exchange: swaps A and B whenever the
// outcome of (A > B) equals `dir`. The volatile qualifiers force every use of
// A and B to be a real memory access.
__device__ inline void comparator_volatile(volatile float &A, volatile float &B, uint dir) {
    const bool swapNeeded = ((A > B) == dir);
    if (swapNeeded) {
        float held = A;
        A = B;
        B = held;
    }
}
#ifdef NAIVE
// One compare-exchange pass of the sort: every thread handles the pair
// (pos, pos + stride) of `src`.
//
// src:    device array being sorted in place
// stride: distance between the two elements of a pair
// size:   length of the sequences being merged; selects the direction
//
// There is no bounds check, so the launch has to provide exactly one thread
// per pair.
__global__ void bitonicSortNaive(float *src, int stride, int size) {
    const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int first = 2 * gid - (gid & (stride - 1));
    const uint direction = ((gid & (size / 2)) == 0);
    comparator(src[first], src[first + stride], direction);
}
#endif
#ifdef ATOMIC
// Whole sort in a single launch: iterates over all (size, stride) passes and
// separates them with either a block barrier or the atomic grid barrier.
//
// src:    device array sorted in place (accessed through volatile references)
// length: number of elements in src
//
// NOTE(review): the barrier choice compares stride against THREADS, which
// presumably assumes blockDim.x == THREADS and one thread per pair
// (length == 2 * total threads) -- confirm against the launch.
__global__ void bitonicSortAtomic(float *src, int length) {
    const uint blocksInGrid = gridDim.x * gridDim.y * gridDim.z;
    const uint gid = blockDim.x * blockIdx.x + threadIdx.x;
    uint barrierGoal = 0;  // running target for the arrival counter

    for (uint size = 2; size <= length; size <<= 1) {
        // The direction depends only on the thread and the merge size.
        const uint dir = ((gid & (size / 2)) == 0);

        for (uint stride = size >> 1; stride != 0; stride >>= 1) {
            const uint pos = 2 * gid - (gid & (stride - 1));
            comparator_volatile(src[pos], src[pos + stride], dir);

            const bool gridWide = (stride > THREADS) || (stride == 1 && size >= THREADS);
            if (gridWide) {
                barrierGoal += blocksInGrid;
                __syncblocks_atomic(barrierGoal);
            } else {
                __syncthreads();
            }
        } // end for stride
    } // end for size
}
#endif
// Stops the benchmark with a diagnostic when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "[BENCH] %s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Benchmark driver: fills a host array, copies it to the device, runs the
// variant selected at compile time (NAIVE or ATOMIC) and copies the result
// back. Returns 0 on success; exits with EXIT_FAILURE on any allocation or
// CUDA error instead of silently continuing with invalid data.
int main() {
    printf("[BENCH] Bitonic Sort %d elements\n", NUM_VALS);
    printf("[BENCH] Xuhao Chen <cxh#illinois.edu>\n");
#ifdef NAIVE
    printf("[BENCH] Naive version\n");
#endif
#ifdef ATOMIC
    printf("[BENCH] Atomic Barrier\n");
#endif
    size_t size = NUM_VALS * sizeof(float);
    float *values = (float*) malloc(size);
    if (values == NULL) {
        fprintf(stderr, "[BENCH] host allocation failed\n");
        return EXIT_FAILURE;
    }
    array_init(values, NUM_VALS);

    float *dev_values;
    checkCuda(cudaMalloc((void**) &dev_values, size), "cudaMalloc");
    checkCuda(cudaMemcpy(dev_values, values, size, cudaMemcpyHostToDevice), "host-to-device copy");
    dim3 blocks(BLOCKS,1);
    dim3 threads(THREADS,1);
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
#ifdef NAIVE
    // One launch per (size, stride) pass; the launches are ordered on the
    // default stream.
    for (int k = 2; k <= NUM_VALS; k <<= 1) {
        for (int j = k >> 1; j > 0; j = j >> 1) {
            bitonicSortNaive<<<blocks, threads>>>(dev_values, j, k);
            checkCuda(cudaGetLastError(), "bitonicSortNaive launch");
        }
    }
#endif
#ifdef ATOMIC
    bitonicSortAtomic<<<blocks, threads>>>(dev_values, NUM_VALS);
    checkCuda(cudaGetLastError(), "bitonicSortAtomic launch");
#endif
    // Errors raised while the kernels execute surface at this synchronization.
    checkCuda(cudaDeviceSynchronize(), "kernel execution");
    checkCuda(cudaMemcpy(values, dev_values, size, cudaMemcpyDeviceToHost), "device-to-host copy");
    checkCuda(cudaFree(dev_values), "cudaFree");
    free(values);
    return 0;
}
__syncblocks_atomic is a function to implement global barrier. Since there is inter-block communication, I have to keep data consistency.

The CUDA programming guide states:
If a variable located in
global or shared memory is declared as volatile, the compiler assumes that its value
can be changed or used at any time by another thread and therefore any reference to
this variable compiles to an actual memory read or write instruction.
This basically means that the memory will be flushed immediately as you assign a value to the variable, and will be fetched directly from the memory (with no cache) when you try to read its value.
In your first code sample, since both A and B are volatile, 6 actual memory instructions are generated: one read or write each time you use either A or B. The good point is that other threads will be able to see those modifications earlier, as they are made. The downside is that the execution will be slower, because the caches will be disabled.
In your second code sample, on the other side, the GPU is authorized to use caches to accelerate its execution, until the end of the function, when it's forced to issue a memory write. If both A and B are already cached, only 2 memory writes are issued. The downside is that other threads might only be able to see the changed value after the fence.
Another thing you should consider is that operations are not atomic.
If other threads try to access A and B while your function is executing, they might see a partial execution of the function, in both cases. In the second code sample, this is a bit less likely to happen, because the thread will probably use its cached value, and flush the final values at once (anyway, you should not rely on this).
Also, volatile works as a faster version of __threadfence() among threads in the same warp (because threads in a warp act synchronously).

Related

Optimizing memory access for complex numbers

I have a kernel that operates on complex numbers, and I am loading the values like this:
thrust::complex<float> x = X[tIdx];
where X is in global memory. When I profile this kernel with nvvp, I find that it is memory bandwidth-limited and the profiler suggests that I improve the memory access pattern:
Global Load L2 Transactions/Access=8, Ideal Transactions/Access=4
The disassembly confirms that this line is indeed split into two 32-bit loads, producing a strided access pattern:
LDG.E R9, [R16];
LDG.E R11, [R16+0x4];
How can I get this to compile into a single 64-bit load?
Potential solutions
I realize this is pretty closely related to this earlier question but the proposed solutions (change the global memory layout or use shared memory) seem less ideal than a 64-bit load.
The NVidia developer blog suggests reinterpret_cast to a vector data type such as float2, but I'm a little hazy about how this fits in with pointer aliasing rules.
I must also confess that this is somewhat of a theoretical question. For this particular kernel, I'm limited by the device memory bandwidth, so halving the # of L2 transactions shouldn't significantly improve the overall performance. But I anticipate working with more complex numbers in my future, and if there's a simple solution then I'd like to start using it now.
The basic problem here is that the compiler seems to need explicit alignment specifications for a type before it will generate vector load and store instructions. Consider the following trivial example:
// Home-made complex type made of two floats, declared with an explicit
// 8-byte alignment request.
class __align__(8) cplx0
{
public:
    float re, img;  // real and imaginary parts

    // Builds the value from its real and imaginary parts.
    __device__ __host__ cplx0(float _re, float _img)
        : re(_re), img(_img)
    {
    }
};
// Home-made complex type made of two floats, with no explicit alignment
// specification.
class cplx1
{
public:
    float re, img;  // real and imaginary parts

    // Builds the value from its real and imaginary parts.
    __device__ __host__ cplx1(float _re, float _img)
        : re(_re), img(_img)
    {
    }
};
// Fills out[0..N) with `val` using a grid-stride loop, so any grid size
// covers the whole range.
template<typename T>
__global__ void memsetkernel(T* out, const T val, int N)
{
    const int step = blockDim.x * gridDim.x;
    const int first = threadIdx.x + blockIdx.x * blockDim.x;
#pragma unroll 8
    for (int i = first; i < N; i += step) {
        out[i] = val;
    }
}
// Copies in[0..N) to out[0..N) using a grid-stride loop. Both pointers are
// __restrict__, i.e. the caller promises the two ranges do not alias.
template<typename T>
__global__ void memcpykernel(const T* __restrict__ in, T* __restrict__ out, int N)
{
    const int step = blockDim.x * gridDim.x;
    const int first = threadIdx.x + blockIdx.x * blockDim.x;
#pragma unroll 8
    for (int i = first; i < N; i += step) {
        out[i] = in[i];
    }
}
// Host wrapper: launches memcpykernel<T> over Nitems elements with a fixed
// launch configuration and waits for the device to finish.
template<typename T>
void memcpy(const T* in, T* out, int Nitems)
{
    const int nblocks = 13 * 2; // GTX 970 with 13 SM
    const int nthreads = 1024;
    memcpykernel<T><<<nblocks, nthreads>>>(in, out, Nitems);
    cudaDeviceSynchronize();
}
// Host wrapper: launches memsetkernel<T> to store `value` into Nitems
// elements with a fixed launch configuration and waits for the device to
// finish.
template<typename T>
void memset(T* in, const T value, int Nitems)
{
    const int nblocks = 13 * 2; // GTX 970 with 13 SM
    const int nthreads = 1024;
    memsetkernel<T><<<nblocks, nthreads>>>(in, value, Nitems);
    cudaDeviceSynchronize();
}
// Runs ten memset + memcpy rounds over freshly allocated device buffers of
// element type T. Both buffers are sized from sizeof(T), so the allocation
// always matches the type the kernels are instantiated with.
template<typename T>
static void exerciseType(int Nitems)
{
    T* in;
    T* out;
    cudaMalloc((void **)&in, Nitems * sizeof(T));
    cudaMalloc((void **)&out, Nitems * sizeof(T));
    for (int i = 0; i < 10; i++) {
        // Each wrapper synchronizes the device before returning.
        memset<T>(in, T(1.0f, 1.0f), Nitems);
        memcpy<T>(in, out, Nitems);
    }
    cudaFree(in);
    cudaFree(out);
}

// Test harness: exercises the memset and memcpy kernels first with the
// aligned complex type and then with the unaligned one, so the generated
// code and the timings of the two can be compared.
int main(void)
{
    const int Nitems = 1 << 24;
    typedef cplx0 fcomplex0;
    typedef cplx1 fcomplex1;

    exerciseType<fcomplex0>(Nitems);
    exerciseType<fcomplex1>(Nitems);

    cudaDeviceReset();
    return 0;
}
Here we have two home-baked complex types, one with explicit alignment specifications, and one without. Otherwise they are identical. Putting them through naïve memcpy and memset kernels in this test harness allows us to inspect the code generation behaviour of the toolchain for each type and benchmark the performance.
Firstly, the code. For cplx0 class, which has explicit 8-byte alignment, the compiler emits vectorized loads and stores in both kernels:
memcpykernel
ld.global.nc.v2.f32 {%f5, %f6}, [%rd17];
st.global.v2.f32 [%rd18], {%f5, %f6};
memsetkernel
st.global.v2.f32 [%rd11], {%f1, %f2};
whereas for the cplx1 case, it does not:
memcpykernel
ld.global.nc.f32 %f1, [%rd16];
ld.global.nc.f32 %f2, [%rd16+4];
st.global.f32 [%rd15+4], %f2;
st.global.f32 [%rd15], %f1;
memsetkernel
st.global.f32 [%rd11+4], %f2;
st.global.f32 [%rd11], %f1;
Looking at performance, there is a non-trivial difference in performance for the memset case (CUDA 8 release toolkit, GTX 970 with Linux 367.48 driver):
$ nvprof ./complex_types
==29074== NVPROF is profiling process 29074, command: ./complex_types
==29074== Profiling application: ./complex_types
==29074== Profiling result:
Time(%) Time Calls Avg Min Max Name
33.04% 19.264ms 10 1.9264ms 1.9238ms 1.9303ms void memcpykernel<cplx1>(cplx1 const *, cplx1*, int)
32.72% 19.080ms 10 1.9080ms 1.9055ms 1.9106ms void memcpykernel<cplx0>(cplx0 const *, cplx0*, int)
19.15% 11.165ms 10 1.1165ms 1.1120ms 1.1217ms void memsetkernel<cplx1>(cplx1*, cplx1, int)
15.09% 8.7985ms 10 879.85us 877.67us 884.13us void memsetkernel<cplx0>(cplx0*, cplx0, int)
The Thrust templated complex type does not have an explicit alignment definition (although it potentially could via specialization, although that would somewhat defeat the purpose). So your only choice here is to either make your own version of the Thrust type with explicit alignment, or use another complex type which does (like the cuComplex type which CUBLAS and CUFFT use).

Compare Thrust fill with kernel launch speed [duplicate]

I want to add 128-bit vectors with carry. My 128-bit version (addKernel128 in the code below) is twice slower than the basic 32-bit version (addKernel32 below).
Do I have memory coalescing problems ? How can I get better performance ?
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
// UADDO(c, a, b): emits the PTX instruction `add.cc.u32`, storing the 32-bit
// sum a + b in c. Per the PTX ISA the `.cc` suffix records the carry-out in
// the condition code register. The expansion already ends in ';', so call
// sites need no trailing semicolon.
#define UADDO(c, a, b) asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
// UADDC(c, a, b): emits the PTX instruction `addc.cc.u32`, the add-with-carry
// counterpart of UADDO, used for the upper words of a multi-word addition.
// NOTE(review): the carry is passed between separate asm statements; this
// relies on the compiler keeping them adjacent -- confirm in the generated
// code.
#define UADDC(c, a, b) asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
// Element-wise 32-bit addition c[i] = a[i] + b[i] for i in [0, size),
// implemented as a grid-stride loop.
__global__ void addKernel32(unsigned int *c, const unsigned int *a, const unsigned int *b, const int size)
{
    const unsigned int step = blockDim.x * gridDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += step)
    {
        c[i] = a[i] + b[i];
    }
}
// 128-bit addition with carry propagation: the arrays are read and written as
// uint4 groups, and the four 32-bit words of each group are added from .x up
// to .w through the UADDO/UADDC macros.
//
// Only size / 4 groups are processed, so up to three trailing 32-bit words
// are left untouched when size is not a multiple of 4.
// NOTE(review): the uint4 accesses presumably require 16-byte aligned
// pointers -- confirm for buffers that do not come straight from cudaMalloc.
__global__ void addKernel128(unsigned *c, const unsigned *a, const unsigned *b, const int size)
{
    const uint4 *lhs = (const uint4 *)a;
    const uint4 *rhs = (const uint4 *)b;
    uint4 *dst = (uint4 *)c;
    const unsigned int step = blockDim.x * gridDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size / 4; i += step)
    {
        uint4 a4 = lhs[i];
        uint4 b4 = rhs[i];
        uint4 sum;
        UADDO(sum.x, a4.x, b4.x)
        UADDC(sum.y, a4.y, b4.y) // add with carry
        UADDC(sum.z, a4.z, b4.z) // add with carry
        UADDC(sum.w, a4.w, b4.w) // add with carry (no overflow checking for clarity)
        dst[i] = sum;
    }
}
// Times one launch of addKernel128 with CUDA events and prints the elapsed
// time in milliseconds. Returns 0 on success and 1 when the kernel could not
// be launched or did not complete, in which case no timing is printed.
int main()
{
    const int size = 10000000; // 10 million
    unsigned int *d_a, *d_b, *d_c;
    cudaMalloc((void**)&d_a, size * sizeof(int));
    cudaMalloc((void**)&d_b, size * sizeof(int));
    cudaMalloc((void**)&d_c, size * sizeof(int));
    // cudaMemset writes the value into every byte, not every int.
    cudaMemset(d_a, 1, size * sizeof(int)); // dummy init just for the example
    cudaMemset(d_b, 2, size * sizeof(int)); // dummy init just for the example
    cudaMemset(d_c, 0, size * sizeof(int));
    int nbThreads = 512;
    int nbBlocks = 1024; // for example
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    addKernel128<<<nbBlocks, nbThreads>>>(d_c, d_a, d_b, size);
    cudaEventRecord(stop);
    // A bad launch configuration is reported by cudaGetLastError(); a fault
    // during execution is reported by the synchronization that follows.
    cudaError_t status = cudaGetLastError();
    const cudaError_t syncStatus = cudaEventSynchronize(stop);
    if (status == cudaSuccess)
        status = syncStatus;
    float m = 0;
    if (status == cudaSuccess)
        cudaEventElapsedTime(&m, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_c);
    cudaFree(d_b);
    cudaFree(d_a);
    cudaDeviceReset();
    if (status != cudaSuccess)
    {
        printf("addKernel128 failed: %s\n", cudaGetErrorString(status));
        return 1;
    }
    printf("Elapsed = %g\n", m);
    return 0;
}
Timing CUDA code on a WDDM GPU can be quite difficult for a variety of reasons. Most of these revolve around the fact that the GPU is being managed as a display device by Windows, and this can introduce a variety of artifacts into the timing. One example is that the windows driver and WDDM will batch work for the GPU, and may interleave display work in the middle of CUDA GPU work.
if possible, time your cuda code on linux, or else on a windows GPU
in TCC mode.
for performance, always build without the -G switch. In visual studio, this usually corresponds to building the release, not the debug version of the project.
To get a good performance comparison, it's usually advisable to do some "warm up runs" before actually measuring the timing results. These will eliminate "start-up" and other one-time measurement issues, and you are more likely to get sensible results. You may also wish to run your code a number of times and average the results.
It's also usually advisable to compile with an arch flag that corresponds to your GPU, so for example -arch=sm_20 for a cc2.0 GPU.

cudaMemcpy error from Device to Host

I am returning a two-dimensional structure after computation on a kernel, from device to host.
HANDLE_ERROR(cudaMemcpy(Pixel,Pixel_gpu,img_wd*img_ht*sizeof(pixel),cudaMemcpyDeviceToHost));
Pixel is declared on host, Pixel_gpu is allocated on device, as below:
// Excerpt of the allocations. Device side: a single allocation of
// img_wd*img_ht*sizeof(pixel) bytes.
**Pixel_gpu;
HANDLE_ERROR(cudaMalloc(&Pixel_gpu,img_wd*img_ht*sizeof(pixel)));
// Host side: an array of img_ht row pointers ...
pixel **Pixel = (pixel**)malloc((img_ht)*sizeof(pixel*));
// ... each pointing to its own separately allocated row of img_wd pixels.
for(int i=0;i<(img_ht);i++)
Pixel[i]=(pixel*)malloc((img_wd)*sizeof(pixel));
Using this I end up getting illegal memory access error.
Trying a similar memory alignment for result, doesn't help either.
// Alternative host buffer: one contiguous block of img_wd*img_ht pixels,
// the same byte count as the device allocation it is copied from.
pixel *Pixel_res = (pixel*)malloc(img_wd*img_ht*sizeof(pixel));
HANDLE_ERROR(cudaMemcpy(Pixel_res,Pixel_gpu,img_wd*img_ht*sizeof(pixel),cudaMemcpyDeviceToHost));
Kernel launching:
// Launch configuration: square thread blocks whose side is the truncated
// square root of the device's maximum threads per block.
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
int thread_block=sqrt(prop.maxThreadsPerBlock);
// Round the grid up with integer arithmetic. ceil(img_wd/thread_block) would
// divide as integers first and drop the partial block at the right and
// bottom edges of the image.
// NOTE(review): assumes img_wd and img_ht are integers -- confirm.
dim3 DimGrid((img_wd+thread_block-1)/thread_block,(img_ht+thread_block-1)/thread_block,1);
dim3 DimBlock(thread_block,thread_block,1);
//allocating gpu memory
// NOTE(review): these buffers are flat allocations of img_wd*img_ht pixels
// (and k floats), but they are typed pixel** / float** and the kernels index
// them as ptr[row][col]. The host `Pixel` is an array of row pointers, not a
// flat block of that size. Confirm the intended memory layout.
pixel **Pixel_tmp_gpu, **Pixel_gpu;
HANDLE_ERROR(cudaMalloc(&Pixel_tmp_gpu,img_wd*img_ht*sizeof(pixel)));
HANDLE_ERROR(cudaMalloc(&Pixel_gpu,img_wd*img_ht*sizeof(pixel)));
float **kernel0_gpu, **kernel1_gpu;
HANDLE_ERROR(cudaMalloc(&kernel0_gpu,k*1*sizeof(float)));
HANDLE_ERROR(cudaMalloc(&kernel1_gpu,1*k*sizeof(float)));
cout<<"memory allocated"<<endl;
//copying needed data
HANDLE_ERROR(cudaMemcpy(Pixel_tmp_gpu,Pixel_tmp,img_wd*img_ht*sizeof(pixel),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(Pixel_gpu,Pixel,img_wd*img_ht*sizeof(pixel),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(kernel0_gpu,kernel0,k*1*sizeof(float),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(kernel1_gpu,kernel1,1*k*sizeof(float),cudaMemcpyHostToDevice));
cout<<"memory transfers done"<<endl;
vertical_conv<<<DimGrid,DimBlock>>>(Pixel_gpu, Pixel_tmp_gpu,img_wd, img_ht,kernel0_gpu,k);
// Report an invalid launch configuration immediately.
HANDLE_ERROR(cudaGetLastError());
// Kernel launches are asynchronous, so these timestamps measure launch
// overhead rather than kernel execution time.
time_t vertical_convolution=time(NULL);
cout<<" vertical_convolution time: "<<double(vertical_convolution - reading_file)<<"sec"<<endl;
horizontal_conv<<<DimGrid,DimBlock>>>(Pixel_tmp_gpu, Pixel_gpu, img_wd, img_ht, kernel1_gpu, k);
HANDLE_ERROR(cudaGetLastError());
time_t horizontal_convolution=time(NULL);
cout<<" horizontal convolution time:" <<double(horizontal_convolution-vertical_convolution)<<" sec"<<endl;
// The blocking copy below also reports errors raised while the kernels ran.
pixel *Pixel_res = (pixel*)malloc(img_wd*img_ht*sizeof(pixel));
HANDLE_ERROR(cudaMemcpy(Pixel_res,Pixel_gpu,img_wd*img_ht*sizeof(pixel),cudaMemcpyDeviceToHost));
The functions used:
// One RGB sample: red, green and blue, one unsigned byte each.
struct pixel
{
    unsigned char r, g, b;
};
// Prints the CUDA error string together with the source location when `err`
// is not cudaSuccess. It only reports; execution continues afterwards.
static void HandleError( cudaError_t err, const char *file, int line ) {
    if (err == cudaSuccess) {
        return;
    }
    cout<<cudaGetErrorString(err)<<" in "<< file <<" at line "<< line<<endl;
}
// Wraps a CUDA runtime call so a failure is reported with the file and line
// of the call site.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
// Fetches the pixel at (x_coord, y_coord) with zero padding outside the image.
//
// Pixel_val:  image, indexed as Pixel_val[y][x]
// x_coord, y_coord: requested coordinates; may lie outside the image
// img_width, img_height: image dimensions
// Px:         output; receives the image pixel when the coordinates are
//             inside the image and a black pixel (0, 0, 0) otherwise
//
// Px is taken by reference so the result reaches the caller; callers pass a
// local pixel variable. Zero padding could be replaced by reflection for a
// better result.
__device__ void padding(pixel** Pixel_val, int x_coord, int y_coord, int img_width, int img_height, pixel &Px)
{
    const bool inside = x_coord >= 0 && x_coord < img_width &&
                        y_coord >= 0 && y_coord < img_height;
    if (inside) {
        Px = Pixel_val[y_coord][x_coord];
    } else {
        // Reset explicitly so a value from a previous call cannot leak through.
        Px.r = 0;
        Px.g = 0;
        Px.b = 0;
    }
}
The vertical convolution:
// Vertical pass of a separable convolution. Each thread handles one output
// pixel (row from the y launch dimension, column from x): for every tap l it
// calls padding() for the source position (col, row + l - (k-1)/2) and
// accumulates the channels of its local pixel weighted by kernel[l][0].
//
// Pixel_in / Pixel_out: images indexed as [row][col]
// img_wd, img_ht:       image dimensions; threads outside them do nothing
// kernel:               weights, read as kernel[l][0] for l in [0, k)
// k:                    number of taps
__global__ void vertical_conv(pixel** Pixel_in, pixel** Pixel_out,int img_wd, int img_ht, float** kernel, int k)
{
    const int col = blockIdx.x*blockDim.x + threadIdx.x;
    const int row = blockIdx.y*blockDim.y + threadIdx.y;
    if (row >= img_ht || col >= img_wd)
        return;

    pixel tap;
    tap.r = 0;
    tap.g = 0;
    tap.b = 0;

    float sum_r = 0, sum_g = 0, sum_b = 0;
    const int half = (k-1)/2;
    for (int l = 0; l < k; l++)
    {
        padding(Pixel_in, col, row + l - half, img_wd, img_ht, tap);
        const float weight = kernel[l][0];
        sum_r += tap.r * weight;
        sum_b += tap.b * weight;
        sum_g += tap.g * weight;
    }

    // The float sums are converted to unsigned char on assignment.
    Pixel_out[row][col].r = sum_r;
    Pixel_out[row][col].g = sum_g;
    Pixel_out[row][col].b = sum_b;
}
The horizontal convolution:
// Horizontal pass of a separable convolution. Each thread handles one output
// pixel (row from the y launch dimension, column from x): for every tap l it
// calls padding() for the source position (col + l - (k-1)/2, row) and
// accumulates the channels of its local pixel weighted by kernel[0][l].
//
// Pixel_in / Pixel_out: images indexed as [row][col]
// img_wd, img_ht:       image dimensions; threads outside them do nothing
// kernel:               weights, read as kernel[0][l] for l in [0, k)
// k:                    number of taps
__global__ void horizontal_conv(pixel** Pixel_in, pixel** Pixel_out, int img_wd, int img_ht, float** kernel, int k)
{
    const int col = blockIdx.x*blockDim.x + threadIdx.x;
    const int row = blockIdx.y*blockDim.y + threadIdx.y;
    if (row >= img_ht || col >= img_wd)
        return;

    pixel tap;
    tap.r = 0;
    tap.g = 0;
    tap.b = 0;

    //horizontal convolution
    float sum_r = 0, sum_g = 0, sum_b = 0;
    const int half = (k-1)/2;
    for (int l = 0; l < k; l++)
    {
        padding(Pixel_in, col + l - half, row, img_wd, img_ht, tap);
        const float weight = kernel[0][l];
        sum_r += tap.r * weight;
        sum_g += tap.g * weight;
        sum_b += tap.b * weight;
    }

    // The float sums are converted to unsigned char on assignment.
    Pixel_out[row][col].r = sum_r;
    Pixel_out[row][col].g = sum_g;
    Pixel_out[row][col].b = sum_b;
}
Can someone help me know what could be wrong here?
Pixel_gpu is one contiguous memory block, consisting of w*h elements of type pixel. Its size is
sizeOfDeviceMemory = img_wd * img_ht * sizeof(pixel)
In contrast to that, Pixel on the CPU side is an "array of pointers": The Pixel pointer points to h elements of type pixel*. Its size is
sizeOfHostMemory = img_ht * sizeof(pixel*)
Clearly, these sizes are different, and trying to write sizeOfDeviceMemory bytes to this pointer causes an illegal access.
Usually, you should allocate your memory on the host as one contiguous block as well:
pixel* Pixel = (pixel*)malloc(img_wd * img_ht * sizeof(pixel));
Then you can copy the memory to this pointer using the cudaMemcpy call that you already have.
If having a pixel* on the host is not OK for you, and you urgently need a pixel** (for example, to pass it to some other function), then you can create an "array of pointers" like you had before, but not allocate new memory for each row, but instead, let each pointer point to one "row" of the single, contiguous pixel block.

is there a way to do "saypx" in cuBLAS?

cublasSaxpy computes y' = a * x + y, where x and y are vectors and a is scalar.
It turns out I need to compute y' = a * y + x instead. I'm not seeing how to twist the cuBLAS library into doing that.
(Of course, I could compute y' = a * y, then y' = y' + x, but y' is read too often in that case. And I could write my own CUDA code to do it, but then it's likely not anywhere near as fast as the cuBLAS code. I'm just surprised there's no apparent way to do "saypx" directly.)
[Added] There are functions similar to "saxpby" in Intel's version of cblas, which would do what I need. But oddly enough, that's not in cuBLAS.
[Added #2] It looks like I can use the cudnnAddTensor function, with some aliasing of descriptors (I have a FilterDescriptor that points to the tensor, which AddTensor won't accept, but I should be able to alias a TensorDescriptor to the same memory and shape.)
There isn't a way I am aware of to do what you are asking in CUBLAS, nor in standard BLAS. What you have found in MKL is an extension added by Intel, but I don't recall seeing something similar in other host and accelerator BLAS implementations.
The good news is that your assertion that "I could write my own CUDA code to do it, but then it's likely not anywhere near as fast as the cuBLAS code", is untrue, at least for an operation as trivial as saxpy. Even a naïve implementation of saxpy will get very close to CUBLAS because there really aren't that many ways to read two arrays, perform an FMAD and write back the result. As long as you get memory coalescing correct, it is pretty simple to write performant code. For example:
#include <vector>
#include <algorithm>
#include <cassert>
#include <iostream>
#include <cmath>
#include "cublas_v2.h"
// Selects the per-element update applied by saxpy_kernel.
typedef enum
{
    AXPY,   // 0: y = y + a * x
    AXPBY   // 1: y = a * y + x
} saxpy_op_t;
// Scales y by a and adds x: returns a * y + x.
__device__ __host__ __inline__
float axpby_op(float y, float x, float a)
{
    return x + a * y;
}
// Scales x by a and adds it to y: returns y + a * x.
__device__ __host__ __inline__
float axpy_op(float y, float x, float a)
{
    return a * x + y;
}
// Strided view over an array: element idx of the view is p[pitch * idx].
// Usable from both host and device code.
template<typename T>
class pitched_accessor
{
public:
    // p_:     base pointer of the underlying array
    // pitch_: distance, in elements, between consecutive view elements
    __host__ __device__
    pitched_accessor(T *p_, size_t pitch_)
        : p(p_), pitch(pitch_)
    {
    }

    // Mutable access to view element idx.
    __host__ __device__
    T& operator[](size_t idx)
    {
        return *(p + pitch * idx);
    }

    // Read-only access to view element idx.
    __host__ __device__
    const T& operator[](size_t idx) const
    {
        return *(p + pitch * idx);
    }

private:
    T * p;
    size_t pitch;
};
// Updates y in place over N1 elements with a grid-stride loop. The template
// parameter picks the operation at compile time:
//   AXPY:  y[i] = y[i] + a * x[i]
//   AXPBY: y[i] = a * y[i] + x[i]
template<saxpy_op_t op>
__global__
void saxpy_kernel(pitched_accessor<float> y, pitched_accessor<float> x,
                  const float a, const unsigned int N1)
{
    const unsigned int step = gridDim.x * blockDim.x;
    const unsigned int first = threadIdx.x + blockIdx.x * blockDim.x;
#pragma unroll 8
    for (unsigned int i = first; i < N1; i += step) {
        if (op == AXPY) {
            y[i] = axpy_op(y[i], x[i], a);
        } else if (op == AXPBY) {
            y[i] = axpby_op(y[i], x[i], a);
        }
    }
}
// Launches the AXPBY flavour of saxpy_kernel (y = a * y + x) on N elements.
// x and y are device pointers; xinc and yinc are their element strides. The
// launch configuration is the one suggested by the occupancy API. The launch
// is asynchronous and the function does not wait for it.
__host__ void saxby(const unsigned int N, const float a,
                    float *x, int xinc, float *y, int yinc)
{
    int gridsize, blocksize;
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPBY>);

    pitched_accessor<float> yview(y, yinc);
    pitched_accessor<float> xview(x, xinc);
    saxpy_kernel<AXPBY><<<gridsize, blocksize>>>(yview, xview, a, N);
}
// Launches the AXPY flavour of saxpy_kernel (y = y + a * x) on N elements.
// x and y are device pointers; xinc and yinc are their element strides. The
// launch configuration is the one suggested by the occupancy API. The launch
// is asynchronous and the function does not wait for it.
__host__ void saxpy(const unsigned int N, const float a,
                    float *x, int xinc, float *y, int yinc)
{
    int gridsize, blocksize;
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPY>);

    pitched_accessor<float> yview(y, yinc);
    pitched_accessor<float> xview(x, xinc);
    saxpy_kernel<AXPY><<<gridsize, blocksize>>>(yview, xview, a, N);
}
// Asserts that every element of yhat lies within `tolerance` (absolute
// difference) of `result`. The check is done with assert(), so it is compiled
// out when NDEBUG is defined.
void check_result(std::vector<float> &yhat, float result, float tolerance=1e-5f)
{
    for (float value : yhat) {
        const float err = std::fabs(value - result);
        assert( err < tolerance );
    }
}
// Test driver: runs the custom AXPBY kernel, the custom AXPY kernel and
// cublasSaxpy on the same constant-filled inputs and checks each result
// against the value computed on the host. Returns the status of
// cudaDeviceReset() converted to int.
int main()
{
    const int N = 1<<22;
    const float a = 2.f, y0 = 1234.f, x0 = 532.f;
    const size_t sz = sizeof(float) * size_t(N);

    // Host inputs, every element set to the same constant.
    std::vector<float> x_h(N);
    std::vector<float> y_h(N);
    std::fill(x_h.begin(), x_h.end(), x0);
    std::fill(y_h.begin(), y_h.end(), y0);

    float *x_d, *y_d;
    cudaMalloc((void **)&x_d, sz);
    cudaMalloc((void **)&y_d, sz);
    cudaMemcpy(x_d, &x_h[0], sz, cudaMemcpyHostToDevice);

    // Case 1: custom kernel, y = a * y + x. y is re-uploaded before each case
    // because the kernels update it in place.
    {
        cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
        saxby(N, a, x_d, 1, y_d, 1);
        std::vector<float> yhat(N);
        cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
        const float expected = axpby_op(y0, x0, a);
        check_result(yhat, expected);
    }

    // Case 2: custom kernel, y = y + a * x.
    {
        cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
        saxpy(N, a, x_d, 1, y_d, 1);
        std::vector<float> yhat(N);
        cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
        const float expected = axpy_op(y0, x0, a);
        check_result(yhat, expected);
    }

    // Case 3: the cuBLAS routine, as a reference for the AXPY result.
    {
        cublasHandle_t handle;
        cublasCreate(&handle);
        cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
        cublasSaxpy(handle, N, &a, x_d, 1, y_d, 1);
        std::vector<float> yhat(N);
        cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
        const float expected = axpy_op(y0, x0, a);
        check_result(yhat, expected);
        cublasDestroy(handle);
    }

    return int(cudaDeviceReset());
}
This demonstrates that a very simple axpy kernel can be easily adapted to perform both the standard operation and the version you want, and run within 10% of the runtime of CUBLAS on the compute 5.2 device I tested it on:
$ nvcc -std=c++11 -arch=sm_52 -Xptxas="-v" -o saxby saxby.cu -lcublas
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
$ nvprof ./saxby
==26806== NVPROF is profiling process 26806, command: ./saxby
==26806== Profiling application: ./saxby
==26806== Profiling result:
Time(%) Time Calls Avg Min Max Name
54.06% 11.190ms 5 2.2381ms 960ns 2.9094ms [CUDA memcpy HtoD]
40.89% 8.4641ms 3 2.8214ms 2.8039ms 2.8310ms [CUDA memcpy DtoH]
1.73% 357.59us 1 357.59us 357.59us 357.59us void saxpy_kernel<saxpy_op_t=1>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
1.72% 355.15us 1 355.15us 355.15us 355.15us void saxpy_kernel<saxpy_op_t=0>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
1.60% 332.21us 1 332.21us 332.21us 332.21us void axpy_kernel_val<float, int=0>(cublasAxpyParamsVal<float>)

cudaMalloc global array cause seg fault

I found some difficulty when I try to access a global array from function that's executed from device:
// Host-side 3x3 source table; the initializer is a placeholder in this
// excerpt, not valid code.
float globTemp[3][3] = "some value in here";
// Device-side global pointer that the kernel dereferences. Being a
// __device__ variable, it lives in device memory.
__device__ float* globTemp_d;
// For every thread whose (x, y) position lies inside the 3x3 window, stores
// 1 + globTemp_d[y*w + x] into a[y*w + x]; the float sum is converted to int
// on assignment. Threads outside the window do nothing.
__global__ void compute(int *a, int w)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= 3 || y >= 3)
        return;

    const int i = y*w+x;
    a[i] = 1+globTemp_d[i];
}
// Uploads the 3x3 table, points the device symbol globTemp_d at it, runs
// compute on a 3x3 thread block and copies the nine results back into `a`.
// Returns 0 on success and 1 if the launch or the final copy failed.
//
// NOTE(review): `a` is declared outside this snippet; it is assumed to be a
// host buffer holding at least 3*3 ints -- confirm.
int hostFunc(){
    int *a_d;              // matches the kernel's `int *a` parameter
    float *globTemp_dev;   // device copy of the host table
    cudaMalloc((void**)&a_d, 3*3*sizeof(int));
    cudaMalloc((void**)&globTemp_dev, 3*3*sizeof(float));
    cudaMemcpy(globTemp_dev, globTemp, 3*3*sizeof(float), cudaMemcpyHostToDevice);
    // A __device__ variable cannot be written through cudaMalloc/cudaMemcpy
    // on its address; its pointer value is set with cudaMemcpyToSymbol.
    cudaMemcpyToSymbol(globTemp_d, &globTemp_dev, sizeof(float *));
    // The kernel uses both threadIdx.x and threadIdx.y, so it needs a 2D
    // block covering the 3x3 window.
    dim3 threads(3,3);
    dim3 blocks(1);
    compute<<<blocks,threads>>>(a_d,3);
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        status = cudaMemcpy(a, a_d, 3*3*sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(a_d);
    // globTemp_dev is left allocated because globTemp_d still points to it.
    return (status == cudaSuccess) ? 0 : 1;
}
However, I get seg fault when i try to access globTemp_d[i]. Am I doing something wrong in here?
There are a variety of problems with your code:
Your grid is a 1D grid of 1D threadblocks (in fact you are launching a single block of 1 thread) but your kernel is written as if it were expecting a 2D threadblock structure (using .x and .y built-in variables). A single thread won't get the work done certainly, and a 1D threadblock won't work with your kernel code.
__device__ variables are not accessed with cudaMalloc and cudaMemcpy. We use a different set of API calls like cudaMemcpyToSymbol.
You're not doing any cuda error checking which is always recommended when you're having difficulty. You should do cuda error checking on both API calls and kernel calls.
You're mixing float variables (a_d ) with int variables in the kernel parameters (int *a) so I don't think this code would compile without at least a warning. And that can lead to strange behavior of course if you ignore it.
This is the closest I could come to your code while fixing all the errors:
#include <stdio.h>
// Device-side global pointer to the input table; the host sets it with
// cudaMemcpyToSymbol before the kernel is launched.
__device__ float* globTemp_d;

// For every thread whose (x, y) position lies inside the 3x3 window, stores
// 1.0f + globTemp_d[y*w + x] into a[y*w + x]. Threads outside the window do
// nothing.
__global__ void compute(float *a, int w)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if ((x >= 3) || (y >= 3))
        return;

    const int i = (y*w)+x;
    a[i] = 1.0f+globTemp_d[i];
}
// Uploads a 3x3 table, publishes its device address through the globTemp_d
// symbol, runs compute on one 3x3 thread block and prints the nine results.
// Returns 0 on success; on a launch or copy failure it prints the CUDA error
// and returns 1 instead of printing uninitialized results.
int main(){
    float *a_d, *d_globTemp;
    float globTemp[3][3] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f};
    float a[(3*3)];
    dim3 threads(3,3);
    dim3 blocks(1);
    cudaMalloc((void**)&a_d, 3*3*sizeof(float));
    cudaMalloc((void**)&d_globTemp, 3*3*sizeof(float));
    cudaMemcpy(d_globTemp,globTemp, 3*3*sizeof(float), cudaMemcpyHostToDevice);
    // Store the device address of the table in the __device__ pointer.
    cudaMemcpyToSymbol(globTemp_d, &d_globTemp, sizeof(float *));
    compute<<<blocks,threads>>>(a_d,3);
    // Launch errors come from cudaGetLastError(); errors raised while the
    // kernel runs are returned by the blocking copy.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        status = cudaMemcpy(a,a_d, 3*3*sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(a_d);
    cudaFree(d_globTemp);
    if (status != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    printf("results:\n");
    for (int i = 0; i<(3*3); i++)
        printf("a[%d] = %f\n", i, a[i]);
    return 0;
}
This code can be simplified by dispensing with the __device__ variable and just passing d_globTemp as a parameter to the kernel, and using it in place of references to globTemp_d. However I did not make that simplification.