Read data in a proper way - cuda

I have a cpp file where I am creating an image and store the data to myOutput pointer:
int Rows = 80;
int Cols = 64;
for (int i = 0; i < Rows; i++ ){
for (int j = 0; j < Cols; j++ )
{
X = 1.0f * ((float) i - (float) Rows / 2) / (float) Rows;
Y = 2.0f * ((float) j - (float) Cols / 2) / (float) Cols;
.....
myOutput->Re = cosf( ......);
myOutput->Im = sinf(.......);
++myOutput;
}
}
Then , in cuda I am reading like:
int bx = blockIdx.x , by = blockIdx.y;
int tx = threadIdx.x , ty = threadIdx.y;
int RowIdx = ty + by * TILE_WIDTH;
int ColIdx = tx + bx * TILE_WIDTH;
Index = RowIdx * Cols + ColIdx;
//copy input data to shared memory
myshared[ty+1][tx+1] = *( devInputArray + Index );
(So , the myOutput generated from cpp is loaded in devInputArray).
Now , I want to process many images simultaneously.
So, in cpp ,the following additions must be made (for 2 images for example) :
int ImagesNb = 2;
for ( ImagesIdx = 0; ImagesIdx < ImagesNb; ImagesIdx++ ){
for (int i = 0; i < Rows; i++ ){
for (int j = 0; j < Cols; j++ )
{
X = (ImagesIdx + 1) * 1.0f * ((float) i - (float) Rows / 2) / (float) Rows;
Y = (ImagesIdx + 1) * 2.0f * ((float) j - (float) Cols / 2) / (float) Cols;
...
But , now I am not sure how to read the data from cuda.
I don't know how to take into account the number of images.
Before , I had a pointer which contained data (80 x 64) .
Now , it still contains the same dimension of every image but with more data.
I must change this:
Index = RowIdx * Cols + ColIdx;
//copy input data to shared memory
myshared[ty+1][tx+1] = *( devInputArray + Index );
but I can't figure how!
I hope it is clear!
UPDATED
I am trying something like this:
int bx = blockIdx.x , by = blockIdx.y , bz = blockIdx.z;
int tx = threadIdx.x , ty = threadIdx.y , tz = threadIdx.z;
int RowIdx = ty + by * TILE_WIDTH;
int ColIdx = tx + bx * TILE_WIDTH;
int ImagesIdx = tz + bz * blockDim.z;
Index = RowIdx * Cols + ColIdx + Rows * Cols * ImagesIdx
and :
dim3 dimGrid( ImagesNb * (Cols / TILE_WIDTH) , ImagesNb * (Rows / TILE_WIDTH) , ImagesNb);
dim3 dimBlock( TILE_WIDTH , TILE_WIDTH , 2);
but if I try for 2 images I am not getting right results..

To process several images at once, add an extra dimension to the shared-memory array so that it is also indexed by the image number.

Related

Shared Memory slows down the blurring operation compared to the one without shared memory

When I use shared memory on gaussian blur kernel, the execution time is slower than the one without shared memory. The code is as the following. Could you help me to resolve this issue?
The execution time for shared memory is 0.27 ms however, the execution time for the one without shared memory is 0.18 ms.
In addition to them the number of inactive threads is almost two times more than the one without shared memory.
/*
 * Horizontal (x-direction) Gaussian blur that caches the block's own pixels
 * in shared memory.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel; the grid may
 * overhang the image (threads outside it only take part in the barrier).
 * Shared memory: static, 1024 floats, one slot per thread of the block
 * (CUDA allows at most 1024 threads per block).
 *
 * in_channel      - input image, row-major, width*height floats
 * output_channel  - output image, same layout
 * gaussian_kernel - 2*k+1 filter taps; tap k is the centre
 * width, height   - image size in pixels
 * k               - filter radius
 *
 * Taps that fall outside the image row are skipped. Taps that fall outside
 * the block's cached span are fetched from global memory instead.
 */
__global__
void gaus_xdirection_shared(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    __shared__ float columns[1024];
    int b_row = threadIdx.y;
    int b_col = threadIdx.x;
    int b_width = blockDim.x;
    const bool in_image = (row < height && col < width);
    int p = row * width + col;
    int b_p = b_row * b_width + b_col; // block pixel = b_p
    //Load starts
    if(in_image){
        columns[b_p] = in_channel[p];
    }
    // The barrier must be reached by every thread of the block, so it lives
    // outside the bounds check (edge blocks contain out-of-image threads).
    __syncthreads();
    //Load ends
    if(in_image){
        float p_val = 0.0f;
        for(int i = -k; i < k+1; ++i){
            // Use the cached value only if the neighbour is inside this block
            // AND inside the image: slots owned by out-of-image threads were
            // never written and must not be read.
            if((b_col + i) > -1 && (b_col + i) < b_width && (col + i) < width){
                p_val += gaussian_kernel[k + i] * columns[b_p + i];
            }
            else{
                if((col + i) > -1 && (col + i) < width){
                    p_val += gaussian_kernel[k + i] * in_channel[p + i];
                }
            }
        }
        output_channel[p] = p_val;
    }
}
The blurring kernel in which shared memory is not used is as the following
/*
 * Horizontal (x-direction) Gaussian blur that reads every tap directly from
 * global memory (no shared-memory staging).
 *
 * One thread per pixel on a 2D launch; threads outside the image return
 * immediately. Taps whose source column lies outside [0, width) are skipped.
 *
 * in_channel      - input image, row-major, width*height floats
 * output_channel  - output image, same layout
 * gaussian_kernel - 2*k+1 filter taps; tap k is the centre
 */
__global__
void gaus_xdirection(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    const int col = blockDim.x * blockIdx.x + threadIdx.x;

    // Nothing to do for threads that overhang the image.
    if (row >= height || col >= width)
        return;

    const int p = row * width + col;
    float acc = 0.0f;
    // Walk the taps left to right (offset -k .. +k).
    for (int tap = 0; tap < 2 * k + 1; ++tap) {
        const int offset = tap - k;
        const int src_col = col + offset;
        if (src_col < 0 || src_col >= width)
            continue;
        acc += gaussian_kernel[tap] * in_channel[p + offset];
    }
    output_channel[p] = acc;
}
The problem is you are making ineffective use of shared memory. Replacing a few of the global loads with shared loads is not going to be sufficient. As a result, your else clause:
else{
if((col + i) > -1 && (col + i) < width){
p_val += gaussian_kernel[k + i] * in_channel[p + i];
}
is getting invoked too many times, and is drowning out any benefit of shared usage in the if clause.
Instead you want to arrange a shared memory tile in such a way that all the data can be retrieved from shared memory, after it is properly loaded.
The following is an example of how it could be done (in gaus_xdirection_shared_i):
$ cat t145.cu
/*
 * Horizontal (x-direction) Gaussian blur that caches only the block's own
 * pixels in shared memory (the asker's approach, kept for comparison).
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel; the grid may
 * overhang the image (threads outside it only take part in the barrier).
 * Shared memory: static, 1024 floats, one slot per thread of the block
 * (CUDA allows at most 1024 threads per block).
 *
 * in_channel      - input image, row-major, width*height floats
 * output_channel  - output image, same layout
 * gaussian_kernel - 2*k+1 filter taps; tap k is the centre
 * width, height   - image size in pixels
 * k               - filter radius
 */
__global__
void gaus_xdirection_shared(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    __shared__ float columns[1024];
    int b_row = threadIdx.y;
    int b_col = threadIdx.x;
    int b_width = blockDim.x;
    const bool in_image = (row < height && col < width);
    int p = row * width + col;
    int b_p = b_row * b_width + b_col; // block pixel = b_p
    //Load starts
    if(in_image){
        columns[b_p] = in_channel[p];
    }
    // Every thread of the block has to reach the barrier, so it is placed
    // outside the bounds check rather than inside it.
    __syncthreads();
    //Load ends
    if(in_image){
        float p_val = 0.0f;
        for(int i = -k; i < k+1; ++i){
            // The cached slot is valid only when the neighbour is inside this
            // block and inside the image; slots of out-of-image threads are
            // never written.
            if((b_col + i) > -1 && (b_col + i) < b_width && (col + i) < width){
                p_val += gaussian_kernel[k + i] * columns[b_p + i];
            }
            else{
                if((col + i) > -1 && (col + i) < width){
                    p_val += gaussian_kernel[k + i] * in_channel[p + i];
                }
            }
        }
        output_channel[p] = p_val;
    }
}
/*
 * Horizontal (x-direction) Gaussian blur served entirely from a shared-memory
 * tile that includes a k-pixel halo on both sides of the block.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel; the grid may
 * overhang the image.
 * Shared memory: dynamic, blockDim.y * (blockDim.x + 2*k) floats must be
 * passed as the third launch parameter.
 *
 * in_channel      - input image, row-major, width*height floats
 * output_channel  - output image, same layout
 * gaussian_kernel - 2*k+1 filter taps; tap k is the centre
 * width, height   - image size in pixels
 * k               - filter radius (any k >= 0; not limited to blockDim.x)
 *
 * Pixels outside the image are treated as zero, which gives the same result
 * as skipping those taps.
 */
__global__
void gaus_xdirection_shared_i(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    extern __shared__ float columns[];
    const int b_width = (int)blockDim.x + 2*k;          // tile row = block + both halos
    float *tile_row = columns + threadIdx.y * b_width;  // this thread's row of the tile
    // Image column that maps to tile_row[0] (left halo start; may be negative).
    const int first_col = (int)(blockDim.x * blockIdx.x) - k;

    //Load starts
    // Each thread fills tile slots threadIdx.x, threadIdx.x + blockDim.x, ...
    // so the whole row (centre and halos) is covered for any radius.
    for (int t = threadIdx.x; t < b_width; t += blockDim.x){
        const int src_col = first_col + t;
        float value = 0.0f;
        if (row < height && src_col >= 0 && src_col < width)
            value = in_channel[row * width + src_col];
        tile_row[t] = value;
    }
    __syncthreads();
    //Load ends

    if (row < height && col < width){
        // centre[i] is the pixel i columns away from this thread's pixel.
        const float *centre = tile_row + threadIdx.x + k;
        float sum = 0.0f;
        for (int i = -k; i < k+1; ++i)
            sum += gaussian_kernel[k+i] * centre[i];
        output_channel[row * width + col] = sum;
    }
}
/*
 * Baseline horizontal Gaussian blur: every tap is read from global memory.
 *
 * One thread per pixel on a 2D launch; threads that overhang the image exit
 * early. Taps whose source column is outside [0, width) contribute nothing.
 *
 * in_channel      - input image, row-major, width*height floats
 * output_channel  - output image, same layout
 * gaussian_kernel - 2*k+1 filter taps; tap k is the centre
 */
__global__
void gaus_xdirection(float *in_channel, float *output_channel, float *gaussian_kernel, const int width, const int height, int k){
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    if (row >= height || col >= width)
        return;

    const int p = row * width + col;
    float acc = 0.0f;
    // Taps are accumulated left to right (offset -k .. +k).
    for (int tap = 0; tap < 2 * k + 1; ++tap) {
        const int offset = tap - k;
        const int src_col = col + offset;
        if (src_col < 0 || src_col >= width)
            continue;
        acc += gaussian_kernel[tap] * in_channel[p + offset];
    }
    output_channel[p] = acc;
}
/*
 * Timing harness: runs the three blur kernels once each on a 1024x1024 image
 * with radius 7 so they can be compared in a profiler.
 *
 * The buffers are zero-filled (only run time matters here, not the output).
 * Returns 0 on success, 1 if any CUDA call or kernel fails.
 */
int main(){
    const int width = 1024;
    const int height = 1024;
    const int k = 7;
    const int th = 32;
    const size_t image_bytes = (size_t)width * height * sizeof(float);
    const size_t taps_bytes = (2*k+1) * sizeof(float);

    float *in_channel = 0;
    float *output_channel = 0;
    float *gaussian_kernel = 0;

    // Each step runs only if everything before it succeeded; the first
    // failure is kept in err and reported through the exit code.
    cudaError_t err = cudaMalloc(&in_channel, image_bytes);
    if (err == cudaSuccess) err = cudaMalloc(&output_channel, image_bytes);
    if (err == cudaSuccess) err = cudaMalloc(&gaussian_kernel, taps_bytes);
    // Give the kernels defined data to read instead of uninitialised memory.
    if (err == cudaSuccess) err = cudaMemset(in_channel, 0, image_bytes);
    if (err == cudaSuccess) err = cudaMemset(gaussian_kernel, 0, taps_bytes);

    if (err == cudaSuccess){
        dim3 b(th, th);
        dim3 g((width+b.x-1)/b.x,(height+b.y-1)/b.y);
        gaus_xdirection_shared<<<g,b>>>(in_channel, output_channel, gaussian_kernel, width, height,k);
        // Dynamic shared memory: one tile row of (th + 2*k) floats per block row.
        gaus_xdirection_shared_i<<<g,b,th*(th+2*k)*sizeof(float)>>>(in_channel, output_channel, gaussian_kernel, width, height,k);
        gaus_xdirection<<<g,b>>>(in_channel, output_channel, gaussian_kernel, width, height,k);
        err = cudaGetLastError();                 // launch-configuration errors
    }
    if (err == cudaSuccess) err = cudaDeviceSynchronize();   // in-kernel faults

    // cudaFree accepts a null pointer, so this is safe after a partial setup.
    cudaFree(gaussian_kernel);
    cudaFree(output_channel);
    cudaFree(in_channel);
    return err == cudaSuccess ? 0 : 1;
}
$ nvcc -o t145 t145.cu
$ cuda-memcheck ./t145
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t145
==27500== NVPROF is profiling process 27500, command: ./t145
==27500== Profiling application: ./t145
==27500== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 44.53% 1.0205ms 1 1.0205ms 1.0205ms 1.0205ms gaus_xdirection_shared(float*, float*, float*, int, int, int)
33.35% 764.46us 1 764.46us 764.46us 764.46us gaus_xdirection(float*, float*, float*, int, int, int)
22.12% 506.95us 1 506.95us 506.95us 506.95us gaus_xdirection_shared_i(float*, float*, float*, int, int, int)
API calls: 97.88% 141.58ms 3 47.192ms 115.32us 141.22ms cudaMalloc
1.58% 2.2808ms 1 2.2808ms 2.2808ms 2.2808ms cudaDeviceSynchronize
0.36% 514.21us 202 2.5450us 165ns 118.09us cuDeviceGetAttribute
0.10% 146.33us 2 73.166us 52.335us 93.998us cuDeviceTotalMem
0.04% 58.346us 2 29.173us 26.147us 32.199us cuDeviceGetName
0.03% 50.393us 3 16.797us 6.9170us 34.369us cudaLaunchKernel
0.01% 9.5440us 2 4.7720us 1.8600us 7.6840us cuDeviceGetPCIBusId
0.00% 1.3980us 3 466ns 279ns 801ns cuDeviceGetCount
0.00% 1.3100us 4 327ns 186ns 712ns cuDeviceGet
0.00% 564ns 2 282ns 237ns 327ns cuDeviceGetUuid
$
I have not carefully tested the above code, it may contain defects. But it should give you an idea of how to structure a larger shared memory tile, and it seems to run without runtime error, and it seems to be faster.

PyCUDA how to get the number of used registers per thread when launching the kernels?

I have a kernel, how can I get the number of used registers per thread when launching the kernels? I mean in a PyCuda way.
A simple example will be:
/*
 * Fills `matrix` with a sum of Gaussian blobs.
 *
 * One thread per matrix element on a 2D launch (x -> column, y -> row);
 * threads outside rows x cols do nothing.
 *
 * matrix  - output, row-major, rows*cols floats
 * pts     - num_pts blob centres (x, y)
 * sigma   - blob width used in the exponent's denominator 2*sigma^2
 */
__global__
void
make_blobs(float* matrix, float2 *pts, int num_pts, float sigma, int rows, int cols) {
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= cols || y >= rows)
        return;

    float acc = 0.f;
    for (int n = 0; n < num_pts; n++) {
        const float2 centre = pts[n];
        float x_0 = centre.x;
        float y_0 = centre.y;
        // Contribution of blob n at this element.
        acc += exp(-(pow(x - x_0, 2) + pow(y - y_0, 2)) / (2 * sigma*sigma));
    }
    matrix[y*cols + x] = acc;
}
Is there any way to get the number without crashing the program if the real number used has exceeded the max?
The above is OK, it does not exceed the max on my machine. I just want to get the number in a convenient way. Thanks!
PyCuda already provides this as part of the Cuda function object. The property is called pycuda.driver.Function.num_regs.
Below is a small example that shows how to use it:
import pycuda.autoinit
from pycuda.compiler import SourceModule
# CUDA C source of the kernel whose register usage we want to inspect.
BLOB_KERNEL_SOURCE = """
__global__ void
make_blobs(float* matrix, float2 *pts, int num_pts, float sigma, int rows, int cols) {
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x < cols && y < rows) {
int idx = y*cols + x;
float temp = 0.f;
for (int i = 0; i < num_pts; i++) {
float x_0 = pts[i].x;
float y_0 = pts[i].y;
temp += exp(-(pow(x - x_0, 2) + pow(y - y_0, 2)) / (2 * sigma*sigma));
}
matrix[idx] = temp;
}
}"""
# Compile the source, look the kernel up by name, and print the number of
# registers each thread of that kernel uses (no launch is needed for this).
blob_module = SourceModule(BLOB_KERNEL_SOURCE)
blob_function = blob_module.get_function("make_blobs")
print(blob_function.num_regs)
Note that you don't need to use SourceModule. You can also load the module from e.g. a cubin file. More details can be found in the documentation.

How can I convolution image in CUDA

I have a question about image convolution in CUDA. When I test it with a small matrix (16*16) everything is OK. But with a larger matrix, the result changes every time I run it.
I think problem is 2 for loops into kernel.
// 2D convolution of a row-major float image with a kernel_width x kernel_height mask.
// Each thread computes the output pixel (x, y) derived from its 2D block/thread index.
//   input      - source image, indexed as dY * img_width + dX
//   out        - destination image, indexed as y * img_width + x
//   kernelConv - mask weights, indexed as j * kernel_width + i
// Source coordinates that fall outside the image are clamped to the nearest edge pixel.
// NOTE(review): x and y themselves are never compared against img_width / img_height,
// so any thread launched beyond the image still executes the write at the end.
__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv,
int img_width, const int img_height,
const int kernel_width, const int kernel_height )
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
float sum = 0;
for ( int j = 0; j < kernel_height; j++ )
{
for ( int i = 0; i < kernel_width; i++ )
{
// Source pixel for mask element (i, j), with the mask centred on (x, y).
int dX = x + i - kernel_width / 2;
int dY = y + j - kernel_height / 2;
// Clamp the source coordinates into the image.
if ( dX < 0 )
dX = 0;
if ( dX >= img_width )
dX = img_width - 1;
if ( dY < 0 )
dY = 0;
if ( dY >= img_height )
dY = img_height - 1;
const int idMat = j * kernel_width + i;
const int idPixel = dY * img_width + dX;
sum += (float)input[idPixel] * kernelConv[idMat];
}
}
// The magnitude of the filter response is stored.
const int idOut = y * img_width + x;
out[idOut] = abs(sum);
}
// Runs image_convolution_kernel with a fixed 3x3 mask on a host image and
// prints the result.
//   input      - host image, row-major, img_width*img_height floats
//   output     - host buffer of the same size that receives the result
//   img_height - image height in pixels (note: height comes before width)
//   img_width  - image width in pixels
// On any CUDA failure a message is written to stderr, all device memory is
// released, and nothing is printed to stdout.
void image_convolution(float * input,float* output, int img_height, int img_width)
{
    if (img_width <= 0 || img_height <= 0)
        return;   // nothing to process, and a zero-sized grid cannot be launched

    const int kernel_height = 3;
    const int kernel_width = 3;
    float kernel[] ={ 0,-0.25,0,
                      -0.25,1,-0.25,
                      0,-0.25,0
                    };
    const size_t image_bytes = (size_t)img_width * img_height * sizeof(float);
    const size_t kernel_bytes = sizeof(kernel);

    float * d_input = NULL, * d_output = NULL, * d_kernel = NULL;

    // Each step runs only if all earlier ones succeeded; err keeps the first failure.
    cudaError_t err = cudaMalloc(&d_input, image_bytes);
    if (err == cudaSuccess) err = cudaMalloc(&d_output, image_bytes);
    if (err == cudaSuccess) err = cudaMalloc(&d_kernel, kernel_bytes);
    if (err == cudaSuccess) err = cudaMemcpy(d_input, input, image_bytes, cudaMemcpyHostToDevice);
    // The mask is copied straight from the local array; no heap copy is needed.
    if (err == cudaSuccess) err = cudaMemcpy(d_kernel, kernel, kernel_bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess)
    {
        dim3 blocksize(16,16);
        dim3 gridsize;
        // Round up so the grid covers images that are not a multiple of 16.
        gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
        gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
        image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height);
        err = cudaGetLastError();
    }
    // This blocking copy also surfaces errors raised while the kernel ran.
    if (err == cudaSuccess) err = cudaMemcpy(output, d_output, image_bytes, cudaMemcpyDeviceToHost);

    // cudaFree accepts a null pointer, so this is safe after a partial setup.
    cudaFree(d_kernel);
    cudaFree(d_output);
    cudaFree(d_input);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "image_convolution: CUDA error: %s\n", cudaGetErrorString(err));
        return;
    }

    for (int i=0; i < img_width*img_height; i++)
    {
        printf("%d, ",(int)output[i]);
    }
    printf("\n\n");
}
Here is my result, I test it with 24*24 image, I run it 2 time, and I also write simple function to compared the output.
And here is the result when I compare the outputs: there are 32 differences, at indices 240, 241, ...
You have made a fairly common error in your program. When you create a grid of threads like this:
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
you are intentionally creating (usually) extra threads in each dimension, so as to fully cover the problem space (i.e. image size). There is nothing wrong with this.
However, it means we will be launching extra threads, which are outside the valid image dimension. We must ensure that these threads do nothing. The usual approach is to add a thread check to the kernel, so that threads outside the valid image dimensions do nothing. Here's a modified kernel and fully worked example showing that change:
$ cat t1219.cu
#include <iostream>
#include <cstdlib>
const int iw = 1025;
const int ih = 1025;
const int rng = 10;
// 2D convolution of a row-major float image with a kernel_width x kernel_height
// mask; the absolute value of the response is stored.
// One thread per output pixel on a 2D launch. Threads outside the image exit
// immediately, so the grid may overhang the image. Source coordinates outside
// the image are clamped to the nearest edge pixel.
//   input      - source image, img_width*img_height floats
//   out        - destination image, same layout
//   kernelConv - mask weights, row-major
__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv,
                                         int img_width, const int img_height,
                                         const int kernel_width, const int kernel_height )
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;
    if ((x >= img_width) || (y >= img_height))   // thread check
        return;

    float acc = 0;
    for ( int my = 0; my < kernel_height; my++ )
    {
        // Source row for this mask row, clamped into the image.
        int srcY = y + my - kernel_height / 2;
        srcY = ( srcY < 0 ) ? 0 : srcY;
        srcY = ( srcY >= img_height ) ? img_height - 1 : srcY;

        for ( int mx = 0; mx < kernel_width; mx++ )
        {
            // Source column for this mask column, clamped into the image.
            int srcX = x + mx - kernel_width / 2;
            srcX = ( srcX < 0 ) ? 0 : srcX;
            srcX = ( srcX >= img_width ) ? img_width - 1 : srcX;

            acc += (float)input[srcY * img_width + srcX] * kernelConv[my * kernel_width + mx];
        }
    }
    out[y * img_width + x] = abs(acc);
}
// Runs image_convolution_kernel with a fixed 3x3 mask on a host image.
//   input      - host image, row-major, img_width*img_height floats
//   output     - host buffer of the same size that receives the result
//   img_height - image height in pixels (note: height comes before width)
//   img_width  - image width in pixels
// On any CUDA failure a message is written to std::cerr and `output` is left
// as it was; device memory is released on every path.
void image_convolution(float * input,float* output, int img_height, int img_width)
{
    if (img_width <= 0 || img_height <= 0)
        return;   // nothing to process, and a zero-sized grid cannot be launched

    const int kernel_height = 3;
    const int kernel_width = 3;
    float kernel[] ={ 0,-0.25,0,
                      -0.25,1,-0.25,
                      0,-0.25,0
                    };
    const size_t image_bytes = (size_t)img_width * img_height * sizeof(float);
    const size_t kernel_bytes = sizeof(kernel);

    float * d_input = NULL, * d_output = NULL, * d_kernel = NULL;

    // Each step runs only if all earlier ones succeeded; err keeps the first failure.
    cudaError_t err = cudaMalloc(&d_input, image_bytes);
    if (err == cudaSuccess) err = cudaMalloc(&d_output, image_bytes);
    if (err == cudaSuccess) err = cudaMalloc(&d_kernel, kernel_bytes);
    if (err == cudaSuccess) err = cudaMemcpy(d_input, input, image_bytes, cudaMemcpyHostToDevice);
    // The mask is copied straight from the local array; no heap copy is needed.
    if (err == cudaSuccess) err = cudaMemcpy(d_kernel, kernel, kernel_bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess)
    {
        dim3 blocksize(16,16);
        dim3 gridsize;
        // Round up so the grid covers images that are not a multiple of 16.
        gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
        gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
        image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height);
        err = cudaGetLastError();
    }
    // This blocking copy also surfaces errors raised while the kernel ran.
    if (err == cudaSuccess) err = cudaMemcpy(output, d_output, image_bytes, cudaMemcpyDeviceToHost);

    // cudaFree accepts a null pointer, so this is safe after a partial setup.
    cudaFree(d_kernel);
    cudaFree(d_output);
    cudaFree(d_input);

    if (err != cudaSuccess)
        std::cerr << "image_convolution: CUDA error: " << cudaGetErrorString(err) << std::endl;
}
// Self-check: convolves a random ih x iw image on the GPU and compares every
// interior pixel against the same 3x3 filter evaluated on the host.
// Returns 1 (after printing the first mismatch) on failure, 0 on success.
int main(){
    const int is = ih*iw;
    float *in = new float[is];
    float *out = new float[is];
    // Small integer inputs; out is pre-filled with -1 so untouched pixels stand out.
    for (int i = 0; i < is; i++) {
        in[i] = rand()%rng;
        out[i] = -1;
    }
    image_convolution(in,out, ih, iw);
    // Border pixels are skipped: only the interior is checked here.
    for (int iy = 1; iy < ih-1; iy++) {
        for (int ix = 1; ix < iw-1; ix++) {
            const int c = iy*iw + ix;
            // left, right, up, down neighbours weighted -0.25, centre weighted 1
            float temp = abs(-0.25 * (in[c - 1] + in[c + 1] + in[c - iw] + in[c + iw]) + in[c]);
            if (out[c] == temp)
                continue;
            std::cout << "mismatch x: " << ix << " y: " << iy << " was: " << out[c] << " should be: " << temp << std::endl;
            return 1;
        }
    }
    return 0;
}
$ nvcc -o t1219 t1219.cu
$ cuda-memcheck ./t1219
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
For image dimensions which are exact multiples of the block size (16,16) (which was true for my previous test case) this problem won't show up -- the code will work correctly. For all other test cases, we need such a thread check.

YUV to RGB Conversion Error

I'm currently working on an App, that transforms a RGB picture to YV12, converts it to NV12 and then back to RGB.
I'm getting the following error on my conversion:
http://www.pic-upload.de/view-21874004/ConversionError.jpg.html
So the left side is, what I want. A simple blue color. The right side displays what I'm getting. It looks like, there is way too much green in the conversion result.
Here's the code that transforms the rgb to yv12:
// Converts packed 4-byte-per-pixel RGB data (pData) into a planar luma/chroma buffer (yuv).
// Each thread handles the pixel with flat index i on a 1D launch.
// All sizes are derived from the launch configuration:
//   upos   = gridDim.x * blockDim.x   (total number of threads)
//   iwidth = gridDim.x                (used as the row length)
// NOTE(review): presumably one block per image column and one thread per pixel; confirm against the launch code.
__global__ void RGBtoYV12(unsigned char* yuv, unsigned char* pData)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
// 1.5 x the thread count; used below as the base index for the U writes.
int width = gridDim.x * blockDim.x * 1.5;
int iwidth = gridDim.x;
// 4 bytes per source pixel; only the first three are read.
int rgbID = i * 4;
int upos = blockDim.x * gridDim.x;
// 1.25 x the thread count; used below as the base index for the V writes.
int vpos = upos + upos / 4;
int col = i % iwidth;
int row = i / iwidth; // or, respectively, threadIdx.x;
int r = pData[rgbID], g = pData[rgbID+1], b = pData[rgbID+2];
//Y
// Luma as a weighted sum of r, g, b, truncated to 8 bits.
unsigned char y = 0.299 * r + 0.587 * g + 0.114 * b;
// The index decreases as row increases, i.e. rows are written from the end of the first upos bytes backwards.
yuv[upos - (row+1)*iwidth + col] = y;
// Chroma is written only when both the row index (i / gridDim.x) and i itself are even.
if ( !((i/gridDim.x)%2) && !(i%2))
{
//YV12
// NOTE(review): (b - y) and (r - y) can be negative and the result is stored into an unsigned char
// without an offset (the commented-out integer formulas add 128); confirm this is intended.
// U
yuv[width - ( (iwidth/2) * ((row/2)+1) - ((col/2)+1) )] = 0.493 * (b - y);//((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
// V
yuv[vpos - ( (iwidth/2) * ((row/2)+1) - ((col/2)+1) )] = 0.887 * (r - y); //((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
}
}
The conversion from NV12 is processed like this:
// Converts a luma plane followed by interleaved U/V samples (nv12) into packed 4-byte RGBA (rgba).
// Each thread handles pixel (ix, iy) on a 2D launch.
//   decodedPitch - row stride used to index the luma plane and the U/V rows
// The output row length and the offset of the U/V data are derived from the launch configuration
// (gridDim * blockDim). NOTE(review): presumably the grid matches the image size exactly; confirm.
__global__ void NV12toRGB(unsigned char* nv12, unsigned char* rgba, int decodedPitch)
{
int ix = blockIdx.x * blockDim.x + threadIdx.x;
int iy = blockIdx.y * blockDim.y + threadIdx.y;
// Luma index uses the pitch; the RGBA index uses the launched width, 4 bytes per pixel.
int i = iy * decodedPitch + ix;
int rgbStart = (iy * gridDim.x * blockDim.x + ix) * 4;
// One U/V pair is shared by each 2x2 group of pixels.
int quadX = (ix / 2);
int quadY = (iy / 2);
int uvAdr = decodedPitch / 2 * quadY + quadX;
// The U/V samples start after decodedPitch * (launched height) luma bytes.
int uvStart = decodedPitch * gridDim.y * blockDim.y;
int y = nv12[i];
// U and V are interleaved: U at the even offset, V at the following byte.
int u = nv12[uvStart + 2 * uvAdr];
int v = nv12[uvStart + 2 * uvAdr + 1];
// NOTE(review): u and v are used as read (0..255, no offset subtracted), and r, g, b are not
// clamped before being stored into unsigned char below; confirm against the encoder's convention.
// R
int r = y + 1.13983 * v;
// G
int g = y - 0.39393 * u - 0.58081 * v;
// B
int b = y + 2.028 * u;
rgba[rgbStart] = r;
rgba[rgbStart+1] = g;
rgba[rgbStart+2] = b;
// Fourth byte is set to 255 for every pixel.
rgba[rgbStart+3] = 255;
}
As you can see, I do the conversion with cuda on GPU. I think the indexation of the color-values is correct, but I don't know, what goes wrong with the color conversion. Any help or other conversion formulas, i could try out, would be much appreciated.
Greetings
You have floating point multiplications, but results declared as integers. You lose all the precision there. For example
int r = y + 1.13983 * v;
Replace this with
float r = y + 1.13983 * v;
There may be other issues as well, but this stands out.

Count the number of cycles in a CUDA kernel

How can I count the number of cycles performed by a function like the following. Should I count straight forward the number of sums and muls and divs? Where can I check how many cycles an addition takes in CUDA?
/*
 * Renders the Mandelbrot set into `image` (255 = in the set, 0 = escaped).
 *
 * Each thread starts at its own (x, y) and then advances by the total number
 * of launched threads in each dimension until the image is covered, so any
 * 2D launch configuration works.
 *
 * c_rows, c_cols and c_iterations are defined outside this function
 * (presumably __constant__ image size and iteration limit; not visible here).
 */
__global__
void mandelbrotSet_per_element(Grayscale *image){
    // Region of the complex plane mapped onto the image.
    const float minR = -2.0f, maxR = 1.0f;
    const float minI = -1.2f, maxI = minI + (maxR-minR) * c_rows / c_cols;
    // Step in the plane per pixel.
    const float realFactor = (maxR - minR) / (c_cols-1);
    const float imagFactor = (maxI - minI) / (c_rows-1);

    for (int y = blockDim.y * blockIdx.y + threadIdx.y; y < c_rows; y += blockDim.y * gridDim.y){
        for (int x = blockDim.x * blockIdx.x + threadIdx.x; x < c_cols; x += blockDim.x * gridDim.x){
            const float c_real = minR + x * realFactor;
            const float c_imag = maxI - y * imagFactor;
            float z_real = c_real;
            float z_imag = c_imag;

            // Iterate z = z^2 + c until |z|^2 exceeds 4 or the limit is reached.
            bool escaped = false;
            for (int k = 0; k < c_iterations && !escaped; k++){
                float z_real2 = z_real * z_real;
                float z_imag2 = z_imag * z_imag;
                if (z_real2 + z_imag2 > 4){
                    escaped = true;
                }
                else{
                    z_imag = 2 * z_real * z_imag + c_imag;
                    z_real = z_real2 - z_imag2 + c_real;
                }
            }

            if (escaped) image[y*c_cols+x] = 0;
            else image[y*c_cols+x] = 255;
        }
    }
}
Instruction throughput is described in the programming guide here
You can also try measuring a sequence of instructions using the native clock() function described here
The compiler tends to obscure actual counts of operations at the source code level (increasing or possibly decreasing apparent arithmetic intensity), so if you want to identify exactly what the machine is doing you may want to inspect the PTX (nvcc -ptx ...) or possibly the machine assembly level code, called SASS, which you can extract from an executable using the cuobjdump utility.