How can I get kernel start time in CUDA?

I am trying to get the kernel start time with the following code, but its output is not correct.
I want the start time of the kernel function relative to the start time of this program. That is, if we take the start of the program as time zero, at what time does the kernel start?
What is my mistake?
//headers
#include <sys/time.h>
//kernel function defined here
int main(){
struct timeval kernelStartTime;
//memcpys and ....
gettimeofday(&kernelStartTime, 0);
MatrixMulCUDA<32><<<matrixMulgrid, matrixMulthreads>>>(C, A, B, dimsA.x, dimsB.x);
cudaDeviceSynchronize();
float startTime = (1000000.0 * kernelStartTime.tv_sec + kernelStartTime.tv_usec)/1000.0;
//some codes
printf("kernel start time = %f", startTime);
return 0;
}
Output:
kernel start time = 1.63786e+12
Thanks.

The value you are printing is an absolute wall-clock timestamp: milliseconds since the Unix epoch, which is about 1.64e12 in late 2021 (a value that large cannot even be represented exactly in a float). To get a time relative to the program start, record a base time when the program begins and subtract it:
//headers
#include <sys/time.h>
//kernel function defined here
int main(){
struct timeval kernelStartTime, baseTime;
gettimeofday(&baseTime, 0);
//memcpys and ....
gettimeofday(&kernelStartTime, 0);
MatrixMulCUDA<32><<<matrixMulgrid, matrixMulthreads>>>(C, A, B, dimsA.x, dimsB.x);
cudaDeviceSynchronize();
float startTime = (1000000.0 * (kernelStartTime.tv_sec - baseTime.tv_sec) + (kernelStartTime.tv_usec - baseTime.tv_usec))/1000.0;
//some codes
printf("kernel start time = %f", startTime);
return 0;
}
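Note that gettimeofday() gives you the host-side timestamp of the launch call, not the moment the kernel actually begins executing on the device. If you instead want the start time on the GPU's own timeline, CUDA events are an option: an event recorded in the same stream immediately before the kernel completes approximately when the kernel is about to begin. A minimal, self-contained sketch (the trivial kernel and the <<<1,1>>> configuration are placeholders for your own):
#include <stdio.h>
#include <cuda_runtime.h>
__global__ void dummyKernel() { } // placeholder for your MatrixMulCUDA kernel
int main(){
cudaEvent_t base, beforeKernel;
cudaEventCreate(&base);
cudaEventCreate(&beforeKernel);
cudaEventRecord(base, 0); // "time zero", recorded at program start
//memcpys and ....
cudaEventRecord(beforeKernel, 0); // completes just as the kernel becomes ready to run
dummyKernel<<<1, 1>>>();
cudaDeviceSynchronize();
float msSinceBase = 0.0f;
cudaEventElapsedTime(&msSinceBase, base, beforeKernel);
printf("kernel start time = %f ms after base event\n", msSinceBase);
return 0;
}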

Related

creating 16-bit input to cufftXtMakePlanMany and workSize for 1 GPU

I need to compute an FFT on unsigned 8-bit integer data. Previously, I was using cufftPlanMany with cufftReal input and cufftComplex output, casting before and after the FFT to convert from unsigned 8-bit to cufftReal and then from cufftComplex to signed 8-bit.
It came to my attention that cuFFT has a nice option to run FFTs on half-precision data, which I hope improves the running time. According to the documentation, it currently doesn't support all of cudaDataType (it would be wonderful if it could in the future), but at least I can run it with 16-bit float (half precision) with the following signature:
cufftResult
cufftXtMakePlanMany(cufftHandle plan, int rank, long long int *n, long long int *inembed,
long long int istride, long long int idist, cudaDataType inputtype,
long long int *onembed, long long int ostride, long long int odist,
cudaDataType outputtype, long long int batch, size_t *workSize,
cudaDataType executiontype);
with data types for input, output and execution respectively as: CUDA_R_16F, CUDA_C_16F and CUDA_C_16F. It would be ideal if I could feed this cuFFT with my U8 data; is there any way of doing so? Otherwise, if the first cast from U8 to cufftReal is necessary, how can I convert my data from cufftReal to CUDA_R_16F and then from CUDA_C_16F? Is CUDA smart enough to cast the input from float to half precision, given that cufftExecR2C would ultimately be the same and there is no other function to be called for half precision?
The other question is about workSize, which is designed for multiple-GPU cases. Any idea how this size has to be calculated? (I have just 1 GPU.) Am I responsible for managing that buffer?
TL;DR: I can see two possible approaches here, one using a half-precision transform and one using a single-precision transform (perhaps with cuFFT callbacks). The choice between them may depend on a number of factors, such as the size of your transform, your control over the input data, and the GPU you are running on.
I'm not going to try to address the processing of the output data that you indicate here:
then from cufftComplex to signed 8bit.
since I don't know how to do that without more information. However, the processing of the input data in each case should illustrate how you could process the output data.
Using half-precision transforms
A few things to note here: you cannot (currently) use callbacks with half-precision transforms, and half-precision transforms can be more sensitive to input data characteristics (e.g. DC offset, transform size, etc.) than single- or double-precision transforms. Also, half-precision transforms for the most part require a Pascal or newer GPU (ignoring the Jetson family).
Because half-precision transforms don't support callbacks, we'll use "ordinary" host code to process the input data; you could also do this processing on the device prior to the transform, and the provided code outlines both possibilities. My "preprocessing" here is mostly just designed to prevent the 16-bit transform from overflowing. If you play around with this code you'll quickly see what an overflow looks like (inf and/or nan in the output).
$ cat t1961.cu
#include <cufft.h>
#include <stdio.h>
#include <stdlib.h>
#include <cufftXt.h>
#include <cuda_fp16.h>
#include <assert.h>
#include <iostream>
typedef half2 ctype;
typedef half rtype;
typedef unsigned char dtype;
long long sig_size = 1<<18;
const int amplitude = 127;
const float ramplitude = 1/(float)(4*amplitude);
__host__ __device__ half convert(int val){
return __float2half_rn((val - amplitude)*ramplitude);
}
__global__ void dev_convert(rtype *out, dtype *in, int sz){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < sz)
out[idx] = convert(in[idx]);
}
int main(){
//put 4x sine waves into a U8 array
dtype *my_data = (dtype *)malloc(sig_size*sizeof(dtype));
for (int i = 0; i < sig_size; i++) my_data[i] = amplitude*(sin((i*8*3.141592654f)/sig_size)+1.0);
rtype *d_idata;
ctype *d_odata;
cudaMalloc(&d_idata, sizeof(rtype)*sig_size);
#ifdef USE_HOST
rtype *h_idata = (rtype *)malloc(sig_size*sizeof(rtype));
// convert to 16 bit float non-offset suitable for cufft
for (int i = 0; i < sig_size; i++) h_idata[i] = convert(my_data[i]);
cudaMemcpy(d_idata, h_idata, sig_size*sizeof(rtype), cudaMemcpyHostToDevice);
#else
const int bs = 256;
dtype *d_mydata;
cudaMalloc(&d_mydata, sig_size*sizeof(dtype));
cudaMemcpy(d_mydata, my_data, sig_size*sizeof(dtype), cudaMemcpyHostToDevice);
dev_convert<<<(sig_size+bs-1)/bs, bs>>>(d_idata, d_mydata, sig_size);
#endif
cudaMalloc(&d_odata, sizeof(ctype)*(sig_size/2+1));
cufftHandle plan;
cufftResult r;
r = cufftCreate(&plan);
assert(r == CUFFT_SUCCESS);
size_t ws = 0;
r = cufftXtMakePlanMany(plan, 1, &sig_size, NULL, 1, 1, CUDA_R_16F, NULL, 1, 1, CUDA_C_16F, 1, &ws, CUDA_C_16F);
assert(r == CUFFT_SUCCESS);
r = cufftXtExec(plan, d_idata, d_odata, CUFFT_FORWARD); // warm-up
assert(r == CUFFT_SUCCESS);
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
cudaEventRecord(start);
r = cufftXtExec(plan, d_idata, d_odata, CUFFT_FORWARD);
assert(r == CUFFT_SUCCESS);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float et;
cudaEventElapsedTime(&et, start, stop);
printf("forward FFT time for %ld samples: %fms\n", sig_size, et);
ctype *h_odata = (ctype *)malloc((sig_size/2+1)*sizeof(ctype));
cudaMemcpy(h_odata, d_odata, (sig_size/2+1)*sizeof(ctype), cudaMemcpyDeviceToHost);
for (int i = 0; i < 8; i++)
std::cout << __half2float(h_odata[i].x) << " + " << __half2float(h_odata[i].y) << "i" << std::endl;
return 0;
}
$ nvcc -o t1961 t1961.cu -lcufft
$ ./t1961
forward FFT time for 262144 samples: 0.027520ms
-258 + 0i
0.00349998 + 0.00127506i
-0.000146866 + -0.000833511i
0.00140095 + -0.00501251i
-1.57031 + -32752i
-0.00198174 + 0.00856018i
0.00474548 + 0.00359917i
-0.00226784 + 0.00987244i
$
Using a single precision transform with a load callback
This in my view has a few benefits. It is not as subject to the overflow phenomenon as the half precision transforms are, and the (load) callback routine allows us to still operate on U8 input data.
$ cat t1962.cu
#include <cufft.h>
#include <stdio.h>
#include <stdlib.h>
#include <cufftXt.h>
#include <cuda_fp16.h>
#include <assert.h>
#include <iostream>
typedef cufftComplex ctype;
typedef cufftReal rtype;
typedef unsigned char dtype;
long long sig_size = 1<<18;
const int amplitude = 127;
const cufftReal ramplitude = 1/(float)(4*amplitude);
__device__ rtype convert(int val){
return (val - amplitude)*ramplitude;
}
__device__ rtype myOwnCallback(void *dataIn,
size_t offset,
void *callerInfo,
void *sharedPtr) {
rtype ret;
ret = convert(((dtype *)dataIn)[offset]);
return ret;
}
__device__ cufftCallbackLoadR myOwnCallbackPtr = myOwnCallback;
int main(){
cufftCallbackLoadR hostCopyOfCallbackPtr;
cudaMemcpyFromSymbol(&hostCopyOfCallbackPtr,
myOwnCallbackPtr,
sizeof(hostCopyOfCallbackPtr));
//put 4x sine waves into a U8 array
dtype *my_data = (dtype *)malloc(sig_size*sizeof(dtype));
for (int i = 0; i < sig_size; i++) my_data[i] = amplitude*(sin((i*8*3.141592654f)/sig_size)+1.0);
ctype *d_odata;
dtype *d_mydata;
cudaMalloc(&d_mydata, sig_size*sizeof(dtype));
cudaMemcpy(d_mydata, my_data, sig_size*sizeof(dtype), cudaMemcpyHostToDevice);
cudaMalloc(&d_odata, sizeof(ctype)*(sig_size/2+1));
cufftHandle plan;
cufftResult r;
r = cufftCreate(&plan);
assert(r == CUFFT_SUCCESS);
size_t ws = 0;
r = cufftXtMakePlanMany(plan, 1, &sig_size, NULL, 1, 1, CUDA_R_32F, NULL, 1, 1, CUDA_C_32F, 1, &ws, CUDA_C_32F);
assert(r == CUFFT_SUCCESS);
void *rps[] = {(void *)hostCopyOfCallbackPtr};
r = cufftXtSetCallback(plan, rps, CUFFT_CB_LD_REAL, NULL);
assert(r == CUFFT_SUCCESS);
r = cufftXtExec(plan, (cufftReal *)d_mydata, d_odata, CUFFT_FORWARD); // warm-up
assert(r == CUFFT_SUCCESS);
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
cudaEventRecord(start);
r = cufftXtExec(plan, (cufftReal *)d_mydata, d_odata, CUFFT_FORWARD);
assert(r == CUFFT_SUCCESS);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float et;
cudaEventElapsedTime(&et, start, stop);
printf("forward FFT time for %ld samples: %fms\n", sig_size, et);
ctype *h_odata = (ctype *)malloc((sig_size/2+1)*sizeof(ctype));
cudaMemcpy(h_odata, d_odata, (sig_size/2+1)*sizeof(ctype), cudaMemcpyDeviceToHost);
for (int i = 0; i < 8; i++)
std::cout << h_odata[i].x << " + " << h_odata[i].y << "i" << std::endl;
return 0;
}
$ nvcc -o t1962 t1962.cu -rdc=true -lcufft_static -lculibos
$ ./t1962
forward FFT time for 262144 samples: 0.031488ms
-257.969 + 0i
0.00344251 + 0.00137726i
-3.96543e-05 + -0.00106905i
0.0013994 + -0.00490045i
0.0331312 + -32759.4i
-0.00190887 + 0.00865401i
0.00454092 + 0.00368094i
-0.00219025 + 0.00983646i
$
Yes, the results are not numerically identical between the two transform types. It's not reasonable to expect that 16-bit and 32-bit floating-point calculations will be identical; in all probability the 32-bit calculations are "more accurate". For this sine-wave case, the terms I consider most important are the DC term and the magnitude spike at the fundamental, and those are numerically close to each other. The other terms are "in the noise". The timing results are not exactly comparable either, as the 16-bit case omits the cost of the kernel call that converts the data from U8 to F16. You can use a profiler or just refactor the code to get more comparable timing.
workSize can be ignored for the single-GPU case when using cufftXtMakePlanMany; otherwise, use the provided routines to determine workSize.
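For reference, even on one GPU the work-area size is reported back to you: the ws value passed to cufftXtMakePlanMany in the examples above is filled in with the size in bytes, and by default cuFFT allocates and manages that buffer itself. If you want to query it on an already-created plan, a small fragment (continuing from either example above) using the standard cufftGetSize query:
size_t ws_query = 0;
r = cufftGetSize(plan, &ws_query); // plan and r as in the examples above; actual work area size in bytes
assert(r == CUFFT_SUCCESS);
printf("cuFFT work area: %zu bytes\n", ws_query);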

CUDA: How to add a device function from host to an array declared in Device? [duplicate]

I would like to create a list of function pointers dynamically on the CPU (with some sort of push_back() method called from main()) and copy it to a GPU __constant__ or __device__ array, without needing to resort to static __device__ function pointers. I believe this question is related to my problem; however, my goal is to create the __host__ function pointer array iteratively and then copy it to the __constant__ function pointer array instead of initialising the latter on declaration.
A working code example with static function pointers (as seen here or here) would be:
common.h:
#ifndef COMMON_H
#define COMMON_H
#include <stdio.h>
#include <iostream>
#define num_functions 3
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
// fptr_t: Pointer to void function that takes two integer lvalues
typedef void (*fptr_t)(int&, int&);
// some examples of void(int&, int&) functions...
__device__ void Add(int &a, int &b) {printf("Add... %i + %i = %i\n", a, b, a+b);}
__device__ void Subtract(int &a, int &b) {printf("Subtract... %i - %i = %i\n", a, b, a-b);}
__device__ void Multiply(int &a, int &b) {printf("Multiply... %i * %i = %i\n", a, b, a*b);}
// List of function pointers in device memory
__constant__ fptr_t constant_fList[num_functions];
// Kernel called from main(): choose the function to apply whose index is equal to thread ID
__global__ void kernel(int a, int b) {
fptr_t f;
if (threadIdx.x < num_functions) {
f = constant_fList[threadIdx.x];
f(a,b);
}
}
#endif
main.cu:
#include "common.h"
// Static device function pointers
__device__ fptr_t p_Add = Add;
__device__ fptr_t p_Sub = Subtract;
__device__ fptr_t p_Mul = Multiply;
// Load function list to constant memory
void loadList_staticpointers() {
fptr_t h_fList[num_functions];
gpuErrchk( cudaMemcpyFromSymbol(&h_fList[0], p_Add, sizeof(fptr_t)) );
gpuErrchk( cudaMemcpyFromSymbol(&h_fList[1], p_Sub, sizeof(fptr_t)) );
gpuErrchk( cudaMemcpyFromSymbol(&h_fList[2], p_Mul, sizeof(fptr_t)) );
gpuErrchk( cudaMemcpyToSymbol(constant_fList, h_fList, num_functions * sizeof(fptr_t)) );
}
int main() {
loadList_staticpointers();
int a = 12, b = 15;
kernel<<<1,3>>>(a, b);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());
return 0;
}
Specs: GeForce GTX 670, compiled for -arch=sm_30, CUDA 6.5, Ubuntu 14.04
I wish to avoid the use of static device function pointers, as appending each function would require code maintenance on the user side: declaration of a new static pointer like p_Add or p_Mul, manipulation of void loadList_functionpointers(), etc. To make it clear, I am trying something like the following (crashing) code:
main_wrong.cu:
#include "common.h"
#include <vector>
// Global variable: list of function pointers in host memory
std::vector<fptr_t> vec_fList;
// Add function to functions list
void addFunc(fptr_t f) {vec_fList.push_back(f);}
// Upload the functions in the std::vector<fptr_t> to GPU memory
// Copies CPU-side pointers to constant_fList, therefore crashes on kernel call
void UploadVector() {
fptr_t* h_vpointer = vec_fList.data();
gpuErrchk( cudaMemcpyToSymbol(constant_fList, h_vpointer, vec_fList.size() * sizeof(fptr_t)) );
}
int main() {
addFunc(Add);
addFunc(Subtract);
addFunc(Multiply);
int a = 12, b = 15;
UploadVector();
kernel<<<1,3>>>(a, b); // Wrong to call a host-side function pointer from a kernel
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());
return 0;
}
My understanding is that function pointers pointing to host addresses are copied to the GPU and are unusable by the kernel, which needs pointers pointing to GPU addresses when the function f(a,b) is called. Populating a host-side array with device-side pointers would work for me with raw data (see this question) but not with function pointers. Trivial attempts with Unified Memory have failed as well... so far, I have only found static device-side pointers to work. Is there no other way to copy a dynamically created CPU array of function pointers onto the GPU?
If you can use C++11 (supported since CUDA 7), you could use the following to auto-generate the function table:
template <fptr_t... Functions>
__global__ void kernel(int a, int b)
{
constexpr auto num_f = sizeof...(Functions);
constexpr fptr_t table[] = { Functions... };
if (threadIdx.x < num_f)
{
fptr_t f = table[threadIdx.x];
f(a,b);
}
}
You would then call this kernel using
kernel<Add, Subtract, Multiply><<<1,3>>>(a, b);
Inspired by m.s.'s answer, I chose to pass the function pointer as a template parameter (this was in fact the key to solving my problem) and discovered that filling a __device__ array of function pointers dev_fList from the main() function iteratively, without the help of static function pointers, is indeed possible; C++11 compatibility is not even needed!
Here is a working example using a __device__ array in global memory. I have not tried its constant memory counterpart yet, but once the global memory array has been satisfactorily created, my guess is that a cudaMemcpyToSymbol(..., cudaMemcpyDeviceToDevice) should do the trick.
A kernel kernel() creates a GPU address for function pointer dev_f and copies the function f that was passed as a template argument. Since this is an iterative process from the CPU, only one thread (thread 0) is involved in this kernel, which is launched with configuration <<<1,1>>>. The static variable count_f takes care of indexing in dev_fList.
common.h:
#ifndef COMMON_H
#define COMMON_H
#include <stdio.h>
#include <iostream>
#define num_functions 3
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
// fptr_t: Pointer to void function that takes two integer lvalues
typedef void (*fptr_t)(int&, int&);
// some examples of void(int&, int&) functions...
__device__ void Add(int &a, int &b) {printf("Add... %i + %i = %i\n", a, b, a+b);}
__device__ void Subtract(int &a, int &b) {printf("Subtract... %i - %i = %i\n", a, b, a-b);}
__device__ void Multiply(int &a, int &b) {printf("Multiply... %i * %i = %i\n", a, b, a*b);}
// List of function pointers in device memory
// Note that, in my example, it resides in global memory space, not constant memory
__device__ fptr_t dev_fList[num_functions];
#endif
main.cu:
#include "common.h"
// Index in dev_fList[] == number of times addFunc<>() was launched
static int count_f = 0;
// Kernel that copies function f to the GPU
template<fptr_t f>
__global__ void kernel(int a, int b, int idx) {
fptr_t dev_f = f; // Create device function pointer
dev_fList[idx] = dev_f; // Populate the GPU array of function pointers
dev_fList[idx](a,b); // Make sure that the array was populated correctly
}
// Add function to functions list
template<fptr_t f>
void addFunc(const int &a, const int &b) {
if (count_f >= num_functions) {
std::cout << "Error: not enough memory statically allocated on device!\n";
exit(EXIT_FAILURE);
}
kernel<f><<<1,1>>>(a,b,count_f);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());
count_f++;
}
int main() {
int a = 12, b = 15;
addFunc<Add>(a,b);
addFunc<Subtract>(a,b);
addFunc<Multiply>(a,b);
return 0;
}
Edit: Added copy of the array of function pointers to constant memory
For what it's worth, here is how to copy our dev_fList array to constant memory:
In common.h:
__constant__ fptr_t cst_fList[num_functions];
__global__ void cst_test(int a, int b, int idx) {
if (threadIdx.x < idx) cst_fList[threadIdx.x](a,b);
}
In main.cu main() function, after all desired functions have been added:
fptr_t temp[num_functions]; // host staging buffer (a single uninitialized pointer, as originally written, would overflow)
gpuErrchk( cudaMemcpyFromSymbol(temp, dev_fList, count_f * sizeof(fptr_t)) );
gpuErrchk( cudaMemcpyToSymbol(cst_fList, temp, count_f * sizeof(fptr_t)) );
cst_test<<<1,count_f>>>(a,b, count_f);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());
It may look ugly as I understand that memory is transferred to the host via temp and then back to the device; more elegant suggestions are welcome.
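One way to avoid the host round trip entirely (my own sketch, not part of the original answer) is to fetch the device address of dev_fList with cudaGetSymbolAddress and then do a device-to-device copy into the constant symbol:
void *d_src = NULL;
gpuErrchk( cudaGetSymbolAddress(&d_src, dev_fList) ); // device address of the global-memory array
gpuErrchk( cudaMemcpyToSymbol(cst_fList, d_src, count_f * sizeof(fptr_t), 0, cudaMemcpyDeviceToDevice) );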
It is impossible to use dynamically created CUDA device function pointers (at least not without a crash or undefined behavior). The template-based solutions work at compile time; they are not dynamic. The CUDA device function pointer approaches you see everywhere need device symbols in global scope, which means that a device function pointer must already be declared for every function. It also means you cannot use normal C function pointers as references that are, for example, set at runtime. In summary, using CUDA device function pointers is questionable. Template-based approaches look user-friendly, but they are by definition not dynamic.
Example showing structure with function pointers:
This example shows a structure holding some function pointers. In normal C++ code, you can set and change these function pointers while the program is running (dynamically). With CUDA the example below is impossible, because the function pointers in the struct are not valid device symbols, which means they cannot be used with cudaMemcpyFromSymbol. To circumvent this, either the original function (the target of the function pointer) or a global CUDA device function pointer must be used. Neither is dynamic.
This is dynamic assignment:
typedef float (*pDistanceFu) (float, float);
typedef float (*pDecayFu) (float, float, float);
// In C++ you can set and reset the function pointer during run time whenever you want ..
struct DistFunction {
/*__host__ __device__*/ pDistanceFu distance; // uncomment for NVCC ..
/*__host__ __device__*/ pDecayFu rad_decay;
/*__host__ __device__*/ pDecayFu lrate_decay;
};
// you can do what you want ..
DistFunction foo, bar;
foo.distance = bar.distance;
// ..
This is how it would have to look with CUDA, but it fails because there is no valid device symbol:
pDistanceFu hDistance;
pDecayFu hRadDay;
pDecayFu hLRateDecay;
void DeviceAssign(DistFunction &dist) {
cudaMemcpyFromSymbol(&hDistance, dist.distance, sizeof(pDistanceFu) );
cudaMemcpyFromSymbol(&hRadDay, dist.rad_decay, sizeof(pDecayFu) );
cudaMemcpyFromSymbol(&hLRateDecay, dist.lrate_decay, sizeof(pDecayFu) );
dist.distance = hDistance;
dist.rad_decay = hRadDay;
dist.lrate_decay = hLRateDecay;
}
Here is the classical way, but note that it is no longer dynamic, because the device symbol must refer to the function itself, not to a pointer that may change at run time:
// .. and this would work
#ifdef __CUDACC__
__host__ __device__
#endif
inline float fcn_rad_decay (float sigma0, float T, float lambda) {
return std::floor(sigma0*exp(-T/lambda) + 0.5f);
}
__device__ pDecayFu pFoo = fcn_rad_decay; // the pointer must target the function itself; a host pointer is not possible (note: fcn_rad_decay matches pDecayFu, not pDistanceFu)
void DeviceAssign2(DistFunction &dist) {
cudaMemcpyFromSymbol(&hLRateDecay, &fcn_rad_decay, sizeof(pDecayFu) );
// the same:
// cudaMemcpyFromSymbol(&hLRateDecay, pFoo, sizeof(pDecayFu) );
// ..
dist.lrate_decay = hLRateDecay;
// ..
}

Measuring time like NVIDIA Visual Profiler [duplicate]

I need to time a CUDA kernel execution. The Best Practices Guide says that we can use either events or standard timing functions like clock() in Windows. My problem is that using these two functions gives me a totally different result.
In fact, the result given by events seems to be huge compared to the actual speed in practice.
What I actually need all this for is to be able to predict the running time of a computation by first running a reduced version of it on a smaller data set. Unfortunately, the results of this benchmark are totally unrealistic, being either too optimistic (clock()) or way too pessimistic (events).
You could do something along the lines of :
#include <sys/time.h>
struct timeval t1, t2;
gettimeofday(&t1, 0);
kernel_call<<<dimGrid, dimBlock, 0>>>();
HANDLE_ERROR(cudaDeviceSynchronize()); // cudaThreadSynchronize() is deprecated
gettimeofday(&t2, 0);
double time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000.0;
printf("Time to generate: %3.1f ms \n", time);
or:
float time;
cudaEvent_t start, stop;
HANDLE_ERROR( cudaEventCreate(&start) );
HANDLE_ERROR( cudaEventCreate(&stop) );
HANDLE_ERROR( cudaEventRecord(start, 0) );
kernel_call<<<dimGrid, dimBlock, 0>>>();
HANDLE_ERROR( cudaEventRecord(stop, 0) );
HANDLE_ERROR( cudaEventSynchronize(stop) );
HANDLE_ERROR( cudaEventElapsedTime(&time, start, stop) );
printf("Time to generate: %3.1f ms \n", time);
A satisfactory answer has already been given to your question.
I have constructed classes for timing C/C++ as well as CUDA operations and want to share them, hoping they could be helpful to other users. You will just need to add the four files reported below to your project and #include the two header files as
// --- Timing includes
#include "TimingCPU.h"
#include "TimingGPU.cuh"
The two classes can be used as follows.
Timing CPU section
TimingCPU timer_CPU;
timer_CPU.StartCounter();
CPU operations to be timed
std::cout << "CPU Timing = " << timer_CPU.GetCounter() << " ms" << std::endl;
Timing GPU section
TimingGPU timer_GPU;
timer_GPU.StartCounter();
GPU operations to be timed
std::cout << "GPU Timing = " << timer_GPU.GetCounter() << " ms" << std::endl;
In both cases, the timing is in milliseconds. The two classes can be used under both Linux and Windows.
Here are the 4 files:
TimingCPU.cpp
/**************/
/* TIMING CPU */
/**************/
#include "TimingCPU.h"
#ifdef __linux__
#include <sys/time.h>
#include <stdio.h>
TimingCPU::TimingCPU(): cur_time_(0) { StartCounter(); }
TimingCPU::~TimingCPU() { }
void TimingCPU::StartCounter()
{
struct timeval time;
if(gettimeofday( &time, 0 )) return;
cur_time_ = 1000000 * time.tv_sec + time.tv_usec;
}
double TimingCPU::GetCounter()
{
struct timeval time;
if(gettimeofday( &time, 0 )) return -1;
long cur_time = 1000000 * time.tv_sec + time.tv_usec;
double sec = (cur_time - cur_time_) / 1000000.0;
if(sec < 0) sec += 86400;
cur_time_ = cur_time;
return 1000.*sec;
}
#elif _WIN32 || _WIN64
#include <windows.h>
#include <iostream>
struct PrivateTimingCPU {
double PCFreq;
__int64 CounterStart;
};
// --- Default constructor
TimingCPU::TimingCPU() { privateTimingCPU = new PrivateTimingCPU; (*privateTimingCPU).PCFreq = 0.0; (*privateTimingCPU).CounterStart = 0; }
// --- Default destructor
TimingCPU::~TimingCPU() { }
// --- Starts the timing
void TimingCPU::StartCounter()
{
LARGE_INTEGER li;
if(!QueryPerformanceFrequency(&li)) std::cout << "QueryPerformanceFrequency failed!\n";
(*privateTimingCPU).PCFreq = double(li.QuadPart)/1000.0;
QueryPerformanceCounter(&li);
(*privateTimingCPU).CounterStart = li.QuadPart;
}
// --- Gets the timing counter in ms
double TimingCPU::GetCounter()
{
LARGE_INTEGER li;
QueryPerformanceCounter(&li);
return double(li.QuadPart-(*privateTimingCPU).CounterStart)/(*privateTimingCPU).PCFreq;
}
#endif
TimingCPU.h
// 1 microsecond accuracy
// Returns the time in milliseconds
#ifndef __TIMINGCPU_H__
#define __TIMINGCPU_H__
#ifdef __linux__
class TimingCPU {
private:
long cur_time_;
public:
TimingCPU();
~TimingCPU();
void StartCounter();
double GetCounter();
};
#elif _WIN32 || _WIN64
struct PrivateTimingCPU;
class TimingCPU
{
private:
PrivateTimingCPU *privateTimingCPU;
public:
TimingCPU();
~TimingCPU();
void StartCounter();
double GetCounter();
}; // TimingCPU class
#endif
#endif
TimingGPU.cu
/**************/
/* TIMING GPU */
/**************/
#include "TimingGPU.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
struct PrivateTimingGPU {
cudaEvent_t start;
cudaEvent_t stop;
};
// default constructor
TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; }
// default destructor
TimingGPU::~TimingGPU() { }
void TimingGPU::StartCounter()
{
cudaEventCreate(&((*privateTimingGPU).start));
cudaEventCreate(&((*privateTimingGPU).stop));
cudaEventRecord((*privateTimingGPU).start,0);
}
void TimingGPU::StartCounterFlags()
{
int eventflags = cudaEventBlockingSync;
cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags);
cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags);
cudaEventRecord((*privateTimingGPU).start,0);
}
// Gets the counter in ms
float TimingGPU::GetCounter()
{
float time;
cudaEventRecord((*privateTimingGPU).stop, 0);
cudaEventSynchronize((*privateTimingGPU).stop);
cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop);
return time;
}
TimingGPU.cuh
#ifndef __TIMING_CUH__
#define __TIMING_CUH__
/**************/
/* TIMING GPU */
/**************/
// Events are a part of CUDA API and provide a system independent way to measure execution times on CUDA devices with approximately 0.5
// microsecond precision.
struct PrivateTimingGPU;
class TimingGPU
{
private:
PrivateTimingGPU *privateTimingGPU;
public:
TimingGPU();
~TimingGPU();
void StartCounter();
void StartCounterFlags();
float GetCounter();
}; // TimingGPU class
#endif
There is an out-of-the-box GpuTimer struct for use:
#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__
struct GpuTimer
{
cudaEvent_t start;
cudaEvent_t stop;
GpuTimer()
{
cudaEventCreate(&start);
cudaEventCreate(&stop);
}
~GpuTimer()
{
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
void Start()
{
cudaEventRecord(start, 0);
}
void Stop()
{
cudaEventRecord(stop, 0);
}
float Elapsed()
{
float elapsed;
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
return elapsed;
}
};
#endif /* __GPU_TIMER_H__ */
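Usage is then straightforward; a minimal sketch (the kernel name and launch configuration are placeholders for your own):
GpuTimer timer;
timer.Start();
my_kernel<<<dimGrid, dimBlock>>>(/* args */); // my_kernel, dimGrid, dimBlock are placeholders
timer.Stop();
printf("Time to generate: %3.1f ms \n", timer.Elapsed());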
If you want to measure GPU time you pretty much have to use events. There's a great discussion thread on the do's and don'ts of timing your application over on the NVIDIA forums here.
You can use the Compute Visual Profiler, which will be great for your purpose. It measures the time of every CUDA function and tells you how many times you called it.

Counting occurrences of specific events in CUDA kernels

Problem
I am trying to find the best way to count how many times my program ends up in some specific branches of my CUDA kernels. The idea is that some events should nearly never happen, but since the data processed by the GPU is given by a numerical optimization solver, there may be some situations where ill-defined cases become more common. Thus, I want to be able to track/monitor these phenomena over multiple simulations to gather some global statistics later.
Possible idea
The most straightforward way to do this may be to use a structure dedicated to monitoring such occurrences. Then, when entering a monitored branch, we increment the associated counter using atomicAdd. At the end of the simulation, we copy the counters back to the host and store them for some future statistics processing.
In my case, the cost of using atomicAdd should not matter much, since I should not be entering those branches very often; but I may still want to monitor some of the more common branches later on, so what would be a better approach in that case? Since this is just for monitoring, I do not want the overhead to be too significant.
I guess I could also have one monitoring structure per block and do a sum at the end, since it should not use much global memory anyway (1 unsigned int per monitored branch).
Code example
#include <iostream>
#include <time.h>
#include <cuda.h>
#include <stdio.h>
#define CUDA_CHECK_ERROR() __cuda_check_errors(__FILE__, __LINE__)
#define CUDA_SAFE_CALL(err) __cuda_safe_call(err, __FILE__, __LINE__)
inline void __cuda_check_errors(const char *filename, const int line_number)
{
cudaError err = cudaDeviceSynchronize();
if(err != cudaSuccess)
{
printf("CUDA error %i at %s:%i: %s\n",
err, filename, line_number, cudaGetErrorString(err));
exit(-1);
}
}
inline void __cuda_safe_call(cudaError err, const char *filename, const int line_number)
{
if (err != cudaSuccess)
{
printf("CUDA error %i at %s:%i: %s\n",
err, filename, line_number, cudaGetErrorString(err));
exit(-1);
}
}
struct Stats
{
unsigned int even;
};
__global__ void test_kernel(int* A, int* B, Stats* stats)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int res = A[tid] + (int)tid;
if (res%2 == 0)
atomicAdd(&(stats->even), 1);
B[tid] = res;
}
int get_random_int(int min, int max)
{
return min + (rand() % (int)(max - min + 1));
}
void print_array(int* ar, unsigned int n)
{
for (unsigned int i = 0; i < n; ++i)
std::cout << ar[i] << " ";
std::cout << std::endl;
}
void print_stats(Stats* s)
{
std::cout << "even: " << s->even << std::endl;
}
int main()
{
// vector size
const unsigned int N = 10;
// device vectors
int *d_A, *d_B;
Stats *d_stats;
// host vectors
int *h_A, *h_B;
Stats *h_stats;
// allocate device memory
CUDA_SAFE_CALL(cudaMalloc(&d_A, N * sizeof(int)));
CUDA_SAFE_CALL(cudaMalloc(&d_B, N * sizeof(int)));
CUDA_SAFE_CALL(cudaMalloc(&d_stats, sizeof(Stats)));
// allocate host memory
h_A = new int[N];
h_B = new int[N];
h_stats = new Stats;
// initialize host data
srand(time(NULL));
for (unsigned int i = 0; i < N; ++i)
{
h_A[i] = get_random_int(0,10);
h_B[i] = 0;
}
memset(h_stats, 0, sizeof(Stats));
// copy data to the device
CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, N * sizeof(int), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_stats, h_stats, sizeof(Stats), cudaMemcpyHostToDevice));
// launch kernel
dim3 grid_size, block_size; // block_size defaults to (1,1,1), so this launches N blocks of one thread each
grid_size.x = N;
test_kernel<<<grid_size, block_size>>>(d_A, d_B, d_stats);
// copy result back to host
CUDA_SAFE_CALL(cudaMemcpy(h_B, d_B, N * sizeof(int), cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(h_stats, d_stats, sizeof(Stats), cudaMemcpyDeviceToHost));
print_array(h_B, N);
print_stats(h_stats);
// free device memory
CUDA_SAFE_CALL(cudaFree(d_A));
CUDA_SAFE_CALL(cudaFree(d_B));
CUDA_SAFE_CALL(cudaFree(d_stats));
// free host memory
delete [] h_A;
delete [] h_B;
delete h_stats;
}
Hardware/software information
The solution I am looking for should work for CC >= 2.0 devices and CUDA >= 5.0.
atomicAdd is one possibility, and I would probably go that route. If you do not use the return value of the atomicAdd call, the compiler will emit a reduction operation such as RED.E.ADD. Reduction is very fast as long as there are not many conflicts (I actually use it sometimes even if I do not need the operation to be atomic, because it can be quicker than loading a value from global memory, doing an arithmetic operation, and saving it back to global memory).
The second option is to use a profiler counter and analyze the result with the profiler. Please see the Profiler Counter Function documentation for more details.
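Regarding the per-block idea mentioned in the question: a common way to reduce pressure on the global counter if a monitored branch becomes hot is to aggregate in shared memory first and issue a single global atomicAdd per block. A sketch of that pattern (my own addition, reusing the Stats struct from the question; works on CC >= 2.0):
__global__ void test_kernel_blockagg(int* A, int* B, Stats* stats)
{
__shared__ unsigned int block_even;
if (threadIdx.x == 0) block_even = 0;
__syncthreads();
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int res = A[tid] + (int)tid;
if (res % 2 == 0)
atomicAdd(&block_even, 1); // shared-memory atomic: stays on-chip
B[tid] = res;
__syncthreads();
if (threadIdx.x == 0)
atomicAdd(&(stats->even), block_even); // one global atomic per block
}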