CUDA - DirectX 12 texture2D (in 1D array) interop

I'm trying to update, from CUDA, a texture used in DirectX 12. I may be missing something, but I have found no hints about it.
There is a permanently black area in the top-right corner of the image.
Only when R, G and B have the same value for all pixels do I get the expected result (modulo the first problem); otherwise I get unexpected artefacts, as if the array did not have the expected structure.
What am I missing?
Here is the creation of the texture:
{
TextureWidth = m_width;
TextureHeight = m_height;
auto nPixels = TextureWidth * TextureHeight * 3;
auto pixelBufferSize = sizeof(float)* nPixels;
D3D12_RESOURCE_DESC textureDesc{};
textureDesc.MipLevels = 1;
textureDesc.Format = DXGI_FORMAT_R32G32B32_FLOAT;
textureDesc.Width = TextureWidth;
textureDesc.Height = TextureHeight;
textureDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
textureDesc.DepthOrArraySize = 1;
textureDesc.SampleDesc.Count = 1;
textureDesc.SampleDesc.Quality = 0;
textureDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
ThrowIfFailed(m_device->CreateCommittedResource(&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), D3D12_HEAP_FLAG_SHARED,
&textureDesc, D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE, nullptr, IID_PPV_ARGS(&m_textureBuffer)));
NAME_D3D12_OBJECT(m_textureBuffer);
// Describe and create a SRV for the texture.
{
D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc{};
srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
srvDesc.Format = textureDesc.Format;
srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
srvDesc.Texture2D.MipLevels = 1;
m_device->CreateShaderResourceView(m_textureBuffer.Get(), &srvDesc, m_srvHeap->GetCPUDescriptorHandleForHeapStart());
NAME_D3D12_OBJECT(m_srvHeap);
}
// Share m_textureBuffer with cuda
{
HANDLE sharedHandle{};
WindowsSecurityAttributes windowsSecurityAttributes{};
LPCWSTR name{};
ThrowIfFailed(m_device->CreateSharedHandle(m_textureBuffer.Get(), &windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle));
D3D12_RESOURCE_ALLOCATION_INFO d3d12ResourceAllocationInfo;
d3d12ResourceAllocationInfo = m_device->GetResourceAllocationInfo(m_nodeMask, 1, &CD3DX12_RESOURCE_DESC::Buffer(pixelBufferSize));
auto actualSize = d3d12ResourceAllocationInfo.SizeInBytes;
cudaExternalMemoryHandleDesc externalMemoryHandleDesc;
memset(&externalMemoryHandleDesc, 0, sizeof(externalMemoryHandleDesc));
externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
externalMemoryHandleDesc.handle.win32.handle = sharedHandle;
externalMemoryHandleDesc.size = actualSize;
externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
checkCudaErrors(cudaImportExternalMemory(&m_externalMemory, &externalMemoryHandleDesc));
cudaExternalMemoryBufferDesc externalMemoryBufferDesc;
memset(&externalMemoryBufferDesc, 0, sizeof(externalMemoryBufferDesc));
externalMemoryBufferDesc.offset = 0;
externalMemoryBufferDesc.size = pixelBufferSize;
externalMemoryBufferDesc.flags = 0;
checkCudaErrors(cudaExternalMemoryGetMappedBuffer(&m_cudaDevVertptr, m_externalMemory, &externalMemoryBufferDesc));
RunKernel(TextureWidth, TextureHeight, (float*)m_cudaDevVertptr, m_streamToRun, 1.0f);
checkCudaErrors(cudaStreamSynchronize(m_streamToRun));
}
}
And here is the CUDA code for updating this texture:
int iDivUp(int a, int b) { return a % b != 0 ? a / b + 1 : a / b; }
__global__ void TextureKernel(float *pixels, unsigned int width, unsigned int height, float time)
{
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if (y < height && x < width)
{
auto pos = (y * width + x) * 3;
auto sint = __sinf(time) * 0.1f + 0.10f;
auto sintAlt = (x / 32) % 2 == 0 ? 1.0f : sint;
pixels[pos + 0] = sintAlt; //RED
pixels[pos + 1] = 0; // (x + y) % 2 == 0 ? 1.0f : __sinf(time) * 0.25f + 0.75f; //GREEN
pixels[pos + 2] = 0; // (x + y) % 2 == 0 ? 1.0f : 0.0f; //BLUE
//pixels[pos + 0] = __sinf(time + 0.) * 0.5f + 0.5f;
//pixels[pos + 1] = __sinf(time * 0.09) * 0.5f + 0.5f;
//pixels[pos + 2] = __sinf(time + 2) * 0.5f + 0.5f;
}
}
void RunKernel(size_t meshWidth, size_t meshHeight, float *texture_dev, cudaStream_t streamToRun, float animTime)
{
//dim3 block(16, 16, 1);
//dim3 grid(meshWidth / 16, meshHeight / 16, 1);
auto unit = 32;
dim3 threads(unit, unit);
dim3 grid(iDivUp(meshWidth, unit), iDivUp(meshHeight, unit));
TextureKernel <<<grid, threads, 0, streamToRun >>>(texture_dev, meshWidth, meshHeight, animTime);
getLastCudaError("TextureKernel execution failed.\n");
}
And an extract of the resulting image I get with this code:
And the full repo if needed:
https://github.com/mprevot/CudaD3D12Update
EDIT
Two problems occur here.
The first is the format of the texture: R32G32B32_FLOAT, while the RTV (?) actually expects R32G32B32A32_FLOAT. Matching everything to R32G32B32A32_FLOAT solves the weird color artefacts. The other way would be to match the RTV to an R32G32B32_FLOAT texture, but I don't see how.
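A minimal sketch of that first fix, matching everything to four 32-bit float channels (this assumes the CUDA side is also changed to write 4 floats per pixel, as in the answer below):
auto nPixels = TextureWidth * TextureHeight * 4; // RGBA instead of RGB
auto pixelBufferSize = sizeof(float) * nPixels;
textureDesc.Format = DXGI_FORMAT_R32G32B32A32_FLOAT;
srvDesc.Format = textureDesc.Format; // keep the SRV (and any RTV) in sync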
The second problem is working with cudaExternalMemoryGetMappedBuffer instead of cudaExternalMemoryGetMappedMipmappedArray; however, how to use it with the texture described by D3D12_RESOURCE_DESC textureDesc{} as well as with a 1D CUDA array (float*) is not clear yet.
I tried the following code (for a 1D mipmapped array), without success (cudaErrorInvalidValue).
auto textureSurface = TextureWidth * TextureHeight;
auto texturePixels = textureSurface * TextureChannels;
cudaExternalMemoryMipmappedArrayDesc cuTexDesc{};
cuTexDesc.numLevels = 1;
cuTexDesc.extent = make_cudaExtent(texturePixels, 0, 0);
cuTexDesc.formatDesc = cudaCreateChannelDesc<float>();
auto result = cudaMallocMipmappedArray(&cuMipArray[0], &cuTexDesc.formatDesc, cuTexDesc.extent, cuTexDesc.numLevels);

You assume that a 2D texture image with three channels of type float will have a simple row-wise linear memory layout. As demonstrated by your result, this is generally not true.
Textures are optimized for spatially-coherent access. Their memory layout is designed to keep things that are close in n-dimensional texture space close in memory. This cannot be achieved for anything with more than one dimension by a simple row-major memory layout. The exact memory layout of a particular texture image is generally not something you can assume to know or rely upon. It will depend on the GPU you're using (typically, the data will be stored in some way that employs things like tiling or Morton order, with padding in places to keep stuff aligned).
As you noticed yourself, what you want to do is use cudaExternalMemoryGetMappedMipmappedArray() to map a CUDA array (arrays are the CUDA analogue of texture images) to your external data coming from D3D12. The format of this CUDA array will have to match the format of the texture created in D3D12. You should then be able to use the texture or surface functions of the CUDA runtime API to access the texture image represented by this CUDA array…
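As an aside, for a read-only path the mapped array can also be wrapped in a texture object rather than a surface object; a rough sketch (assuming cuArray is the level-0 array obtained from the mapped mipmapped array, as in the code below):
cudaResourceDesc resDesc{};
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray; // level 0 of the mapped mipmapped array
cudaTextureDesc texDesc{};
texDesc.filterMode = cudaFilterModePoint;
texDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t tex{};
checkCudaErrors(cudaCreateTextureObject(&tex, &resDesc, &texDesc, nullptr));
// in a kernel: float4 v = tex2D<float4>(tex, x + 0.5f, y + 0.5f);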

The right thing to do is to import the texture as external memory, map it as a mipmapped array, get the level-0 CUDA array from it, use that array to create a CUDA surface, and then modify this surface in the CUDA kernel.
The import and mapping are done this way:
cudaExternalMemoryMipmappedArrayDesc cuExtmemMipDesc{};
cuExtmemMipDesc.extent = make_cudaExtent(texDesc.Width, texDesc.Height, 0);
cuExtmemMipDesc.formatDesc = cudaCreateChannelDesc<float4>();
cuExtmemMipDesc.numLevels = 1;
cuExtmemMipDesc.flags = cudaArraySurfaceLoadStore;
cudaMipmappedArray_t cuMipArray{};
checkCudaErrors(cudaExternalMemoryGetMappedMipmappedArray(&cuMipArray, m_externalMemory, &cuExtmemMipDesc));
cudaArray_t cuArray{};
checkCudaErrors(cudaGetMipmappedArrayLevel(&cuArray, cuMipArray, 0));
cudaResourceDesc cuResDesc{};
cuResDesc.resType = cudaResourceTypeArray;
cuResDesc.res.array.array = cuArray;
checkCudaErrors(cudaCreateSurfaceObject(&cuSurface, &cuResDesc));
// where cudaSurfaceObject_t cuSurface{};
The CUDA part looks like this:
int iDivUp(int a, int b) { return a % b != 0 ? a / b + 1 : a / b; }
__global__ void UpdateSurface(cudaSurfaceObject_t surf, unsigned int width, unsigned int height, float time)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y >= height || x >= width) return;
auto xVar = (float)x / (float)width;
auto yVar = (float)y / (float)height;
auto cost = __cosf(time) * 0.5f + 0.5f;
auto costx = __cosf(time) * 0.5f + xVar;
auto costy = __cosf(time) * 0.5f + yVar;
auto costxx = (__cosf(time) * 0.5f + 0.5f) * width;
auto costyy = (__cosf(time) * 0.5f + 0.5f) * height;
auto costxMany = __cosf(y * time) * 0.5f + yVar;
auto costyMany = __cosf((float)x/100 * time) * 0.5f + xVar;
auto margin = 1;
float4 pixel{};
if (y == 0) // paint the first row
pixel = make_float4(costyMany * 0.3, costyMany * 1, costyMany * 0.4, 1);
else if (y == height - 1) // paint the last row
pixel = make_float4(costyMany * 0.6, costyMany * 0.7, costyMany * 1, 1);
else if (x % 5 == 0) // paint a column of 1 pixel wide every 5 pixels
{
if (x > width / 2) // a certain color for the right half
pixel = make_float4(0.1, 0.5, costx * 1, 1);
else // another color for the left half
pixel = make_float4(costx * 1, 0.1, 0.2, 1);
}
else if (x > width - margin - 1 || x <= margin) // first and last columns
pixel = make_float4(costxMany, costxMany * 0.9, costxMany * 0.6, 1);
else // all the rest of the texture
pixel = make_float4(costx * 0.3, costx * 0.4, costx * 0.6, 1);
surf2Dwrite(pixel, surf, x * sizeof(float4), y); // the x offset is in bytes
}
void RunKernel(size_t textureW, size_t textureH, cudaSurfaceObject_t surfaceObject, cudaStream_t streamToRun, float animTime)
{
auto unit = 10;
dim3 threads(unit, unit);
dim3 grid(iDivUp(textureW, unit), iDivUp(textureH, unit));
UpdateSurface <<<grid, threads, 0, streamToRun >>> (surfaceObject, textureW, textureH, animTime);
getLastCudaError("UpdateSurface execution failed.\n");
}
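For completeness, a sketch of how the host side might drive this every frame (the frame loop itself is an assumption, not code from the repo; cuSurface and m_streamToRun are the objects created above, and animTime stands for whatever time value is passed in):
RunKernel(TextureWidth, TextureHeight, cuSurface, m_streamToRun, animTime);
checkCudaErrors(cudaStreamSynchronize(m_streamToRun)); // make sure the writes land before D3D12 samples the texture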
I updated the git repo to reflect those changes (https://github.com/mprevot/CudaD3D12Update)

Related

Applying a Color Transformation to every Pixel of a texture (Libgdx)

A game I am currently developing uses a 5x5 matrix to change the colors of the image on a per-pixel basis. I was wondering if anyone has developed an extremely fast algorithm for something like this.
For every Pixel(setPixel(sourcePixel * Matrix))
I have built my own algorithm for this by getting and setting pixels on a Pixmap, then drawing a new Pixmap from it by iterating over every pixel with set/get pixel. I have found a reasonably fast algorithm this way (150 million pixels in ~3 seconds), but I am considering another idea instead of using a Pixmap, and I am unsure how to implement it. Libgdx provides a FileHandle.readBytes() method that reads image files (in my case PNG) into byte arrays. My thought was, rather than creating a Pixmap for the source image, to read the byte array directly while iterating over the pixels. While iterating I would be drawing a new Pixmap anyway, so there really is no point in creating one for the base image in the first place. With tests I found that with my current algorithm 70% of the time is spent in Pixmap.getPixel(x, y), and I could bypass this by reading the byte array directly. I have looked online for PNG readers that work on byte arrays, but to no avail.
Note that I am unable to use ImageIO, since it is an Android-based game. Would reading the byte array data while iterating make it faster, and is it possible to do this?
In the code below, JList is basically a HashMap in this context
private static JList<Integer, Pixmap> colorShiftImage(Pixmap p, JList<Integer, float[][]> cms){
JList<float[][], Pixmap> tempList = new JList<>();
for(int i = cms.size() - 1; i > -1; --i){
tempList.add(cms.getInt(i), new Pixmap(p.getWidth(), p.getHeight(), Pixmap.Format.RGBA8888));
}
for(int y = p.getHeight() - 1; y > -1; --y){
for(int x = p.getWidth() - 1; x > -1; --x){
int v = p.getPixel(x, y);
if(v != 0) {
int r = ((v & 0xff000000) >>> 24);
int g = ((v & 0x00ff0000) >>> 16);
int b = ((v & 0x0000ff00) >>> 8);
int a = ((v & 0x000000ff));
for(int i = tempList.size() - 1; i > -1; --i) {
float[][] c = tempList.getIDList().get(i);
tempList.getInt(i).drawPixel(x, y, (((l((r * c[0][0]) + (c[1][0] * g) + (c[2][0] * b) + (c[3][0] * a) + c[4][0])) << 24)
| ((l((r * c[0][1]) + (c[1][1] * g) + (c[2][1] * b) + (c[3][1] * a) + c[4][1])) << 16)
| ((l((r * c[0][2]) + (c[1][2] * g) + (c[2][2] * b) + (c[3][2] * a) + c[4][2])) << 8)
| ((l((r * c[0][3]) + (c[1][3] * g) + (c[2][3] * b) + (c[3][3] * a) + c[4][3])))));
}
}
}
}
JList<Integer, Pixmap> returnL = new JList<>();
for(int i = tempList.size() - 1; i > - 1; --i){
returnL.add(cms.getIDList().get(i), tempList.getInt(i));
}
return returnL;
}
public static int l(float v){
if(v < 0)return 0;
else if(v > 255)return 255;
return (int) v;
}

Find max of matrix with window size in CUDA [duplicate]

I just started with CUDA, and I have a question.
I have an N*N matrix, and the window scale is 8x8. I want to subdivide this matrix into multiple sub-matrices and find the max value of each.
For example, if I have a 64*64 matrix, I get an 8x8 grid of small 8*8 matrices and one max value per window. Finally I save all the max values into a new array, but their order always changes. I want to find a solution that keeps them in the right order.
__global__ void calculate_emax_kernel(float emap[],float emax[], int img_height, int img_width,int windows_size)
{
int x_index = blockIdx.x*blockDim.x+threadIdx.x;
int y_index = blockIdx.y*blockDim.y+threadIdx.y;
int num_row_block = img_height/windows_size;
int num_col_block = img_width/windows_size;
__shared__ float window_elements[256];
__shared__ int counter;
__shared__ int emax_count;
if (threadIdx.x == 0) emax_count = 0;
__syncthreads();
int index;
int emax_idx = 0;
if(y_index >= img_height|| x_index >= img_width) return;
for(int i = 0; i < num_row_block; i++)
{
for(int j = 0; j < num_col_block; j++)
{
counter = 0;
if(y_index >= i*windows_size && y_index < (i+1)*windows_size
&& x_index >= j*windows_size && x_index < (j+1)*windows_size)
{
int idx = y_index*img_height + x_index;
index = atomicAdd(&counter, 1);
window_elements[index] = emap[idx];
__syncthreads();
// reduction
unsigned int k = (windows_size*windows_size)/2;
while(k != 0)
{
if(index < k)
{
window_elements[index] = fmaxf(window_elements[index], window_elements[index+k]);
}
k /= 2;
}
if(index == 0)
{
emax[i*num_row_block+j] = window_elements[index];
}
}
__syncthreads();
}
__syncthreads();
}
__syncthreads();
}
This is my configuration
void construct_emax(float *input,float *output, int img_height, int img_width)
{
int windows_size = 4;
float * d_input, * d_output;
cudaMalloc(&d_input, img_width*img_height*sizeof(float));
cudaMalloc(&d_output, img_width*img_height*sizeof(float));
cudaMemcpy(d_input, input, img_width*img_height*sizeof(float), cudaMemcpyHostToDevice);
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
calculate_emax_kernel<<<gridsize,blocksize>>>(d_input,d_output,img_height,img_width,windows_size);
}
With CUDA, parallel reduction is tricky; segmented parallel reduction is trickier. Now you are doing it in 2-D, and your segment/window is smaller than the thread block.
For large window size, I don't think it is a problem. You could use one thread block to reduce one window. For example if you have a 16x16 window, you could simply use 16x16 thread block. If you have even larger window size, for example 64x64, you could still use 16x16 thread block. First reduce the 64x64 window to 16x16 elements during data loading, then reduce to 1 scalar within the thread block.
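A minimal sketch of that two-step idea (assumptions: 64x64 windows, 16x16 thread blocks, one block per window, and image dimensions that are multiples of 64):
__global__ void window_max_64(const float *in, float *out, int width)
{
    __shared__ float smax[256];
    const int wx = blockIdx.x * 64; // window origin in the image
    const int wy = blockIdx.y * 64;
    const int tid = threadIdx.y * 16 + threadIdx.x;
    // step 1: reduce the 64x64 window to 16x16 partial maxima during loading;
    // each thread takes the max of its own 4x4 patch
    float m = -1e20f; // assumes all values are above -1e20, as in the demo below
    for (int dy = 0; dy < 4; ++dy)
        for (int dx = 0; dx < 4; ++dx)
        {
            int x = wx + threadIdx.x * 4 + dx;
            int y = wy + threadIdx.y * 4 + dy;
            m = fmaxf(m, in[y * width + x]);
        }
    smax[tid] = m;
    __syncthreads();
    // step 2: tree reduction of the 256 partial maxima to one scalar
    for (int s = 128; s > 0; s >>= 1)
    {
        if (tid < s) smax[tid] = fmaxf(smax[tid], smax[tid + s]);
        __syncthreads();
    }
    if (tid == 0) out[blockIdx.y * gridDim.x + blockIdx.x] = smax[0];
}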
For window sizes smaller than the block size, you will have to reduce multiple windows per thread block for higher performance. You could use your current block/grid configuration, where each 256-thread block (16x16) is responsible for 16 4x4 windows. But this will not be optimal, because each 32-thread warp is organized in two parts (2x16). This is not good for coalesced global memory access, and it is hard to map a 2x16 warp to one or more 4x4 windows for efficient parallel reduction.
Alternatively, I would suggest you use a 1-D thread block with 256 threads. Every m threads reduce one mxm window. Then you can use a 2-D grid to cover the whole image.
const int m = window_size;
dim3 blocksize(256);
dim3 gridsize((img_width+255)/256, (img_height+m-1)/m);
In the kernel function, you could
reduce each mxm window to a 1xm vector during global data loading;
use tree reduction method to reduce the 1xm vector to a scalar.
The following code is a conceptual demo that works when m is a power of 2 and m <= 32. You could further modify it for arbitrary m and better boundary checking.
#include <assert.h>
#include <cuda.h>
#include <thrust/device_vector.h>
__global__ void calculate_emax_kernel(const float* input, float* output,
int height, int width, int win_size,
int out_width) {
const int tid = threadIdx.x;
const int i = blockIdx.y * win_size;
const int j = blockIdx.x * 256 + tid;
const int win_id = j % win_size;
__shared__ float smax[256];
float tmax = -1e20;
if (j < width) {
for (int tile = 0; tile < win_size; tile++) {
if (i + tile < height) {
tmax = max(tmax, input[(i + tile) * width + j]);
}
}
}
smax[tid] = tmax;
for (int shift = win_size / 2; shift > 0; shift /= 2) {
if (win_id < shift) {
smax[tid] = max(smax[tid], smax[tid + shift]);
}
}
if (win_id == 0 && j < width) {
output[blockIdx.y * out_width + (j / win_size)] = smax[tid];
}
}
int main() {
const int height = 1024;
const int width = 1024;
const int m = 4;
thrust::device_vector<float> in(height * width);
thrust::device_vector<float> out(
((height + m - 1) / m) * ((width + m - 1) / m));
dim3 blocksize(256);
dim3 gridsize((width + 255) / 256, (height + m - 1) / m);
assert(m == 2 || m == 4 || m == 8 || m == 16 || m == 32);
calculate_emax_kernel<<<gridsize, blocksize>>>(
thrust::raw_pointer_cast(in.data()),
thrust::raw_pointer_cast(out.data()),
height, width, m, (width + m - 1) / m);
return 0;
}
In case you're willing to use a library, a few pointers:
NPP, NVIDIA's set of image-processing primitives:
https://docs.nvidia.com/cuda/npp/group__image__filter__max.html
CUB, a lower-level library (from NVIDIA / NVLabs) for other reduction operations and more granularity in how you use the hardware:
http://nvlabs.github.io/cub/

Upload data in shared memory for convolution kernel

I am having some difficulties understanding the batch loading referred to in the comments of the code below. In order to compute the convolution at a pixel, the mask, whose size is 5, must be centered on that pixel. The image is divided into tiles; after the convolution mask is applied, these become the final output tiles of size TILE_WIDTH*TILE_WIDTH. For the pixels that belong to the border of an output tile, the mask must borrow some pixels from the neighboring tile; when the tile lies at the border of the image, those borrowed values are assigned zero. These two steps are depicted in
if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
N_ds[destY][destX] = I[src];
else
N_ds[destY][destX] = 0;
For that reason the shared memory array has size TILE_WIDTH + Mask_width - 1 in each dimension. The following parts of the code are unclear to me.
The destY and destX indices.
What does dividing the flattened output index by the input tile width mean?
The srcY and srcX indices.
Why do the destY and destX indices take part in computing srcY and srcX?
srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius;
srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius;
Why do we use the offset TILE_WIDTH * TILE_WIDTH in the second loading?
Generally, what is the intuitive explanation of having two loadings?
Can all these questions be answered with an intuitive example based on the image below?
Thank you!
EDIT: Image added. In green are the output tiles, and in red is the mask centered on index 114. It is obvious that the mask borrows elements from different tiles.
Finally, this image refers to one channel.
Example: Based on the image below, I have tried to write out an example. The output tile has blockIdx.x=1 and blockIdx.y=1, and based on that destY=0 and destX=0. Also,
srcY = 1*6+0-3 = 3, srcX = 3 and src = (3*18+3)*3+0 = 171. Based on these calculations and the image example we do not have a match: the value that should be stored in the first shared memory position is the one with global index 57. What is wrong with the above calculations? Can anyone help please?
#define Mask_width 5
#define Mask_radius Mask_width/2
#define TILE_WIDTH 16
#define w (TILE_WIDTH + Mask_width - 1)
#define clamp(x) (min(max((x), 0.0), 1.0))
__global__ void convolution(float *I, const float* __restrict__ M, float *P,
int channels, int width, int height) {
__shared__ float N_ds[w][w];
int k;
for (k = 0; k < channels; k++) {
// First batch loading
int dest = threadIdx.y * TILE_WIDTH + threadIdx.x,
destY = dest / w, destX = dest % w,
srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius,
srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius,
src = (srcY * width + srcX) * channels + k;
if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
N_ds[destY][destX] = I[src];
else
N_ds[destY][destX] = 0;
// Second batch loading
dest = threadIdx.y * TILE_WIDTH + threadIdx.x + TILE_WIDTH * TILE_WIDTH;
destY = dest / w, destX = dest % w;
srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius;
srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius;
src = (srcY * width + srcX) * channels + k;
if (destY < w) {
if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
N_ds[destY][destX] = I[src];
else
N_ds[destY][destX] = 0;
}
__syncthreads();
float accum = 0;
int y, x;
for (y = 0; y < Mask_width; y++)
for (x = 0; x < Mask_width; x++)
accum += N_ds[threadIdx.y + y][threadIdx.x + x] * M[y * Mask_width + x];
y = blockIdx.y * TILE_WIDTH + threadIdx.y;
x = blockIdx.x * TILE_WIDTH + threadIdx.x;
if (y < height && x < width)
P[(y * width + x) * channels + k] = clamp(accum);
__syncthreads();
}
}
Your question is similar in concept to my first question on StackOverflow: Moving a (BS_X+1)*(BS_Y+1) global memory matrix by BS_X*BS_Y threads.
You are facing the following problem: each thread block of size TILE_WIDTHxTILE_WIDTH should fill a shared memory area of size (TILE_WIDTH + Mask_width - 1)x(TILE_WIDTH + Mask_width - 1).
4) Generally, what is the intuitive explanation of having two loadings?
Since the shared memory area (TILE_WIDTH + Mask_width - 1)x(TILE_WIDTH + Mask_width - 1) is larger than the block size TILE_WIDTHxTILE_WIDTH, and assuming it is smaller than 2xTILE_WIDTHxTILE_WIDTH, each thread has to move at most two elements from global memory to shared memory. This is the reason why you have a two-stage loading.
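Concretely, with TILE_WIDTH = 16 and Mask_width = 5 as in the code above, the shared memory tile holds (16 + 5 - 1) x (16 + 5 - 1) = 20 x 20 = 400 elements while the block provides 16 x 16 = 256 threads; since 256 < 400 <= 2 x 256 = 512, two loading stages are exactly enough.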
1) The destY and destX index. Dividing the output index by the input tile width what does it means?
This concerns the first loading stage, which loads TILE_WIDTHxTILE_WIDTH elements from global memory and fills the uppermost part of the shared memory area.
So, the operation
dest = threadIdx.y * TILE_WIDTH + threadIdx.x;
flattens the 2D coordinates of the generic thread while
destX = dest % w;
destY = dest / w;
performs the inverse operation, calculating the 2D coordinates of the generic thread with respect to the shared memory area.
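A quick worked example with TILE_WIDTH = 16 and w = 20: the thread with threadIdx.y = 3 and threadIdx.x = 5 gets dest = 3 * 16 + 5 = 53, hence destY = 53 / 20 = 2 and destX = 53 % 20 = 13. In other words, the shared memory slot a thread fills is deliberately decoupled from the thread's own (threadIdx.x, threadIdx.y) position in the block.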
2) The srcY and srcX indices. Why do the destY and destX indices take part in computing srcY and srcX?
srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius;
srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius;
(blockIdx.x * TILE_WIDTH, blockIdx.y * TILE_WIDTH) would be the coordinates of the global memory location if the block size and the shared memory size were the same. Since you are "borrowing" memory values also from neighboring tiles, you have to shift the above coordinates by (destX - Mask_radius, destY - Mask_radius).
3) Why in the second loading we use the offset TILE_WIDTH * TILE_WIDTH?
You have this offset because in the first loading stage you have already filled the "first" TILE_WIDTHxTILE_WIDTH locations of the shared memory.
EDIT
The picture below illustrates the correspondence between the flattened thread index dest and the shared memory locations. In the picture, the blue boxes represent the elements of the generic tile while the red boxes represent the elements of the neighboring tiles. The union of the blue and red boxes corresponds to the overall shared memory locations. As you can see, all 256 threads of a thread block are involved in filling the part of the shared memory above the green line, while only 145 are involved in filling the part below the green line. Now you should also understand the TILE_WIDTH x TILE_WIDTH offset.
Please notice that you have at most 2 memory loads per thread only due to the particular choice of your parameters. For example, if you have TILE_WIDTH = 8, then the number of threads in a thread block is 64, while the shared memory size is 12x12 = 144, which means that each thread would have to perform more than 2 shared memory writes, since 144/64 = 2.25.

tex object access always returns zero -- any ideas?

I'm running CUDA 5.0, with compute_30,sm_30 set, using a GTX 670.
I create a mipmapped array via:
cudaExtent size;
size.width = window_width; // 600
size.height = window_height; // 600
size.depth = 1;
int levels = getMipMapLevels(size);
levels = MIN(levels, 9); // 9
cudaChannelFormatDesc fp32;
fp32.f = cudaChannelFormatKindFloat;
fp32.x = fp32.y = fp32.z = fp32.w = 32;
cudaMipmappedArray_t A;
checkCuda(cudaMallocMipmappedArray(&A, &fp32, size, levels, cudaArraySurfaceLoadStore));
I load the first level of A with surf2Dwrite calls. I know that works, since I copy that array back to the host and dump it to an image file. I now wish to fill the other mip levels of A with their mipmaps. One iteration through that loop looks like this:
width >>= 1; width = MAX(1, width);
height >>= 1; height = MAX(1, height);
cudaArray_t from, to;
checkCuda(cudaGetMipmappedArrayLevel(&from, A, newlevel-1));
checkCuda(cudaGetMipmappedArrayLevel(&to, A, newlevel));
cudaTextureObject_t from_texture;
create_texture_object(from, true, &from_texture);
cudaSurfaceObject_t to_surface;
create_surface_object(to, &to_surface);
dim3 blocksize(16, 16, 1);
dim3 gridsize((width+blocksize.x-1)/blocksize.x,(height+blocksize.y-1)/blocksize.y, 1);
d_mipmap<<<gridsize, blocksize>>>(to_surface, from_texture, width, height);
checkCuda(cudaDeviceSynchronize());
checkCuda(cudaGetLastError());
uncreate_texture_object(&from_texture);
uncreate_surface_object(&to_surface);
The create_surface_object() code is known to work. Just in case, here's the create_texture_object() code:
static void create_texture_object(cudaArray_t tarray, bool filter_linear, cudaTextureObject_t *tobject)
{
assert(tarray && tobject);
// build the resource
cudaResourceDesc color_res;
memset(&color_res, 0, sizeof(cudaResourceDesc));
color_res.resType = cudaResourceTypeArray;
color_res.res.array.array = tarray;
// the texture descriptor
cudaTextureDesc texdesc;
memset(&texdesc, 0, sizeof(cudaTextureDesc));
texdesc.addressMode[0] = cudaAddressModeClamp;
texdesc.addressMode[1] = cudaAddressModeClamp;
texdesc.addressMode[2] = cudaAddressModeClamp;
texdesc.filterMode = filter_linear ? cudaFilterModeLinear : cudaFilterModePoint;
texdesc.normalizedCoords = 1;
checkCuda(cudaCreateTextureObject(tobject, &color_res, &texdesc, NULL));
}
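The create_surface_object() helper is not shown in the question; purely for reference, a minimal sketch of what such a helper typically looks like (an assumption, not the asker's actual code):
static void create_surface_object(cudaArray_t sarray, cudaSurfaceObject_t *sobject)
{
    assert(sarray && sobject);
    // build the resource
    cudaResourceDesc surf_res;
    memset(&surf_res, 0, sizeof(cudaResourceDesc));
    surf_res.resType = cudaResourceTypeArray;
    surf_res.res.array.array = sarray; // must come from an array created with cudaArraySurfaceLoadStore
    checkCuda(cudaCreateSurfaceObject(sobject, &surf_res));
}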
The d_mipmap device function is the following:
__global__ void
d_mipmap(cudaSurfaceObject_t out, cudaTextureObject_t in, int w, int h)
{
float x = blockIdx.x * blockDim.x + threadIdx.x;
float y = blockIdx.y * blockDim.y + threadIdx.y;
float dx = 1.0/float(w);
float dy = 1.0/float(h);
if ((x < w) && (y < h))
{
#if 0
float4 color =
(tex2D<float4>(in, (x + .25f) * dx, (y + .25f) * dy)) +
(tex2D<float4>(in, (x + .75f) * dx, (y + .25f) * dy)) +
(tex2D<float4>(in, (x + .25f) * dx, (y + .75f) * dy)) +
(tex2D<float4>(in, (x + .75f) * dx, (y + .75f) * dy));
color /= 4.0f;
surf2Dwrite(color, mipOutput, x * sizeof(float4), y);
#endif
float4 color0 = tex2D<float4>(in, (x + .25f) * dx, (y + .25f) * dy);
surf2Dwrite(color0, out, x * sizeof(float4), y);
}
}
That contains both the mipmap sampling code (disabled with #if 0) and debugging code.
The problem is, color0 is always uniformly zero, and I've been unable to understand why. I've changed the filtering to point (from linear) with no success. I've checked for errors. Nothing.
I am using CUDA/OpenGL interop here, but the mipmap generation is being done on CUDA arrays only.
I really really do not want to have to use texture references.
Any suggestions on where to look?
The bug turns out to be the use of cudaMipmappedArrays (either the array or the texture object -- I'm unable to tell which is broken.)
When I modify the code to use cudaArrays only, the texture reference starts working again.
Since the bindless texture program sample works, the bug appears to be limited to float32 channel mipmapped textures only. (I have a test program that shows the bug occurs with both 1 and 4 channel float32 mipmapped textures.)
I've reported the bug to Nvidia.

2D kernel calling and launch parameters for non-square matrix

I am attempting to port the following (simplified) nested loop to a CUDA 2D kernel. The sizes of NgS and NgO will increase with larger data sets; for now I just want to get this kernel to output the correct results for all values:
// macro that translates 2D [i][j] array indices to 1D flattened array indices
#define idx(i,j,lda) ( (j) + ((i)*(lda)) )
int NgS = 1859;
int NgO = 900;
// 1D flattened matrices have been initialized as:
Radio_cpu = new double [NgS*NgO];
Result_cpu = new double [NgS*NgO];
// ignoring the part where they are filled w/ data
for (m=0; m<NgO; m++) {
for (n=0; n<NgS; n++) {
Result_cpu[idx(n,m,NgO)] = k0*Radio_cpu[idx(n,m,NgO)];
}
}
The examples I have come across usually deal with square loops, and I have been unable to get the correct output for all the GPU array indices compared to the CPU version. Here is the host code calling the kernel:
dim3 dimBlock(16, 16);
dim3 dimGrid;
dimGrid.x = (NgO + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (NgS + dimBlock.y - 1) / dimBlock.y;
// Result_gpu and Radio_gpu are allocated versions of the CPU variables on GPU
trans<<<dimGrid,dimBlock>>>(NgO, NgS, k0, Radio_gpu, Result_gpu);
Here is the kernel:
__global__ void trans(int NgO, int NgS,
double k0, double * Radio, double * Result) {
int n = blockIdx.x * blockDim.x + threadIdx.x;
int m = blockIdx.y * blockDim.y + threadIdx.y;
if(n > NgS || m > NgO) return;
// map the two 2D indices to a single linear, 1D index
int grid_width = gridDim.x * blockDim.x;
int idxxx = m + (n * grid_width);
Result[idxxx] = k0 * Radio[idxxx];
}
With the current code, I proceeded to compare the Result_cpu variable with Result_gpu variable once copied back. When I cycle through the values I get:
// matches from NgS = 0...913
Result_gpu[NgS = 913][NgO = 0]: -56887.2
Result_cpu[Ngs = 913][NgO = 0]: -56887.2
// mismatches from NgS = 914...1858
Result_gpu[NgS = 914][NgO = 0]: -12.2352
Result_cpu[NgS = 914][NgO = 0]: 79448.6
This pattern is the same regardless of the value of NgO. I have been trying to figure out where I made a mistake by looking at various examples for a few hours and trying out changes, but so far this scheme has worked (minus the obvious issue at hand), whereas the others caused kernel invocation errors or left the GPU array uninitialized for all values. Since I clearly cannot see the mistake, I'd really appreciate it if someone could point me in the right direction towards a fix. I'm pretty sure it's right under my nose and I just can't see it.
In case it matters, I'm testing this code on a Kepler card, compiling with MSVC 2010, CUDA 4.2 and the 304.79 driver, and have compiled the code with both the arch=compute_20,code=sm_20 and arch=compute_30,code=compute_30 flags with no difference.
@vaca_loca: I tested the following kernel (it works for me also with non-square block dimensions):
__global__ void trans(int NgO, int NgS,
double k0, double * Radio, double * Result) {
int n = blockIdx.x * blockDim.x + threadIdx.x;
int m = blockIdx.y * blockDim.y + threadIdx.y;
if(n > NgO || m > NgS) return;
int ofs = m * NgO + n;
Result[ofs] = k0 * Radio[ofs];
}
void test() {
int NgS = 1859, NgO = 900;
int data_sz = NgS * NgO, bytes = data_sz * sizeof(double);
cudaSetDevice(0);
double *Radio_cpu = new double [data_sz*3],
*Result_cpu = Radio_cpu + data_sz,
*Result_gpu = Result_cpu + data_sz;
double k0 = -1.7961233;
srand48(time(NULL));
int i, j, n, m;
for(m=0; m<NgO; m++) {
for (n=0; n<NgS; n++) {
Radio_cpu[m + n*NgO] = lrand48() % 234234;
Result_cpu[m + n*NgO] = k0*Radio_cpu[m + n*NgO];
}
}
double *g_Radio, *g_Result;
cudaMalloc((void **)&g_Radio, bytes * 2);
g_Result = g_Radio + data_sz;
cudaMemcpy(g_Radio, Radio_cpu, bytes, cudaMemcpyHostToDevice);
dim3 dimBlock(16, 16);
dim3 dimGrid;
dimGrid.x = (NgO + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (NgS + dimBlock.y - 1) / dimBlock.y;
trans<<<dimGrid,dimBlock>>>(NgO, NgS, k0, g_Radio, g_Result);
cudaMemcpy(Result_gpu, g_Result, bytes, cudaMemcpyDeviceToHost);
for(m=0; m<NgO; m++) {
for (n=0; n<NgS; n++) {
double c1 = Result_cpu[m + n*NgO],
c2 = Result_gpu[m + n*NgO];
if(std::abs(c1-c2) > 1e-4)
printf("(%d;%d): %.7f %.7f\n", n, m, c1, c2);
}
}
cudaFree(g_Radio);
delete []Radio_cpu;
}
Though, in my opinion, accessing data from global memory using quads might not be very cache-friendly, since the access stride is pretty large. You might consider using 2D textures instead if it's critical for your algorithm to access data with 2D locality.
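If you try the texture route, note that textures cannot fetch double directly; a common workaround is to fetch the data as int2 and reassemble it with __hiloint2double(). A rough sketch under that assumption (g_Radio2D and radioTex are hypothetical names, and the data is assumed to be allocated with cudaMallocPitch):
size_t pitch;
double *g_Radio2D;
cudaMallocPitch(&g_Radio2D, &pitch, NgO * sizeof(double), NgS);
cudaMemcpy2D(g_Radio2D, pitch, Radio_cpu, NgO * sizeof(double),
             NgO * sizeof(double), NgS, cudaMemcpyHostToDevice);
cudaResourceDesc resDesc{};
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = g_Radio2D;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<int2>(); // doubles fetched as int2
resDesc.res.pitch2D.width = NgO;
resDesc.res.pitch2D.height = NgS;
resDesc.res.pitch2D.pitchInBytes = pitch;
cudaTextureDesc texDesc{};
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.filterMode = cudaFilterModePoint;
texDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t radioTex{};
cudaCreateTextureObject(&radioTex, &resDesc, &texDesc, nullptr);
// in the kernel:
// int2 v = tex2D<int2>(radioTex, n + 0.5f, m + 0.5f); // n = column, m = row
// double radio = __hiloint2double(v.y, v.x);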