YUV to RGB Conversion Error - CUDA

I'm currently working on an app that transforms an RGB picture to YV12, converts it to NV12, and then back to RGB.
I'm getting the following error in my conversion:
http://www.pic-upload.de/view-21874004/ConversionError.jpg.html
The left side is what I want: a simple blue color. The right side shows what I'm actually getting. It looks like there is way too much green in the conversion result.
Here's the kernel that transforms the RGB to YV12:
__global__ void RGBtoYV12(unsigned char* yuv, unsigned char* pData)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int width = gridDim.x * blockDim.x * 1.5;
    int iwidth = gridDim.x;
    int rgbID = i * 4;
    int upos = blockDim.x * gridDim.x;
    int vpos = upos + upos / 4;
    int col = i % iwidth;
    int row = i / iwidth; // resp. threadIdx.x;
    int r = pData[rgbID], g = pData[rgbID+1], b = pData[rgbID+2];
    // Y
    unsigned char y = 0.299 * r + 0.587 * g + 0.114 * b;
    yuv[upos - (row+1)*iwidth + col] = y;
    if ( !((i/gridDim.x)%2) && !(i%2))
    {
        //YV12
        // U
        yuv[width - ( (iwidth/2) * ((row/2)+1) - ((col/2)+1) )] = 0.493 * (b - y); //((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
        // V
        yuv[vpos - ( (iwidth/2) * ((row/2)+1) - ((col/2)+1) )] = 0.887 * (r - y); //((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
    }
}
The conversion from NV12 back to RGB is done like this:
__global__ void NV12toRGB(unsigned char* nv12, unsigned char* rgba, int decodedPitch)
{
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int i = iy * decodedPitch + ix;
    int rgbStart = (iy * gridDim.x * blockDim.x + ix) * 4;
    int quadX = (ix / 2);
    int quadY = (iy / 2);
    int uvAdr = decodedPitch / 2 * quadY + quadX;
    int uvStart = decodedPitch * gridDim.y * blockDim.y;
    int y = nv12[i];
    int u = nv12[uvStart + 2 * uvAdr];
    int v = nv12[uvStart + 2 * uvAdr + 1];
    // R
    int r = y + 1.13983 * v;
    // G
    int g = y - 0.39393 * u - 0.58081 * v;
    // B
    int b = y + 2.028 * u;
    rgba[rgbStart]   = r;
    rgba[rgbStart+1] = g;
    rgba[rgbStart+2] = b;
    rgba[rgbStart+3] = 255;
}
As you can see, I do the conversion with CUDA on the GPU. I think the indexing of the color values is correct, but I don't know what goes wrong in the color conversion. Any help, or other conversion formulas I could try, would be much appreciated.
Greetings

You have floating-point multiplications, but the results are declared as integers, so the fractional part is truncated and you lose precision there. For example
int r = y + 1.13983 * v;
Replace this with
float r = y + 1.13983 * v;
There may be other issues as well, but this stands out.
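As a minimal, untested sketch of that fix (the kernel name NV12toRGB_f and the clamp255 helper are ours, not part of the original code; the indexing from the question is kept as-is), the whole conversion can be done in float, clamping only at the final store, since the results can fall outside 0..255:
__device__ unsigned char clamp255(float v)
{
    // clamp to the valid 8-bit range before narrowing
    return (unsigned char)fminf(fmaxf(v, 0.0f), 255.0f);
}

__global__ void NV12toRGB_f(unsigned char* nv12, unsigned char* rgba, int decodedPitch)
{
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int i = iy * decodedPitch + ix;
    int rgbStart = (iy * gridDim.x * blockDim.x + ix) * 4;
    int uvAdr = decodedPitch / 2 * (iy / 2) + (ix / 2);
    int uvStart = decodedPitch * gridDim.y * blockDim.y;
    float y = nv12[i];
    float u = nv12[uvStart + 2 * uvAdr];
    float v = nv12[uvStart + 2 * uvAdr + 1];
    // keep full precision until the final store
    rgba[rgbStart]   = clamp255(y + 1.13983f * v);
    rgba[rgbStart+1] = clamp255(y - 0.39393f * u - 0.58081f * v);
    rgba[rgbStart+2] = clamp255(y + 2.028f * u);
    rgba[rgbStart+3] = 255;
}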

Related

PyCUDA: how to get the number of registers used per thread when launching a kernel?

I have a kernel; how can I get the number of registers used per thread when launching it? I mean in a PyCUDA way.
A simple example will be:
__global__ void
make_blobs(float* matrix, float2 *pts, int num_pts, float sigma, int rows, int cols) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x < cols && y < rows) {
        int idx = y*cols + x;
        float temp = 0.f;
        for (int i = 0; i < num_pts; i++) {
            float x_0 = pts[i].x;
            float y_0 = pts[i].y;
            temp += exp(-(pow(x - x_0, 2) + pow(y - y_0, 2)) / (2 * sigma*sigma));
        }
        matrix[idx] = temp;
    }
}
Is there any way to get the number without crashing the program if the actual count exceeds the maximum?
The kernel above is OK; it does not exceed the maximum on my machine. I just want to get the number in a convenient way. Thanks!
PyCUDA already provides this as part of the CUDA function object. The property is called pycuda.driver.Function.num_regs.
Below is a small example that shows how to use it:
import pycuda.autoinit
from pycuda.compiler import SourceModule
kernel_src = """
__global__ void
make_blobs(float* matrix, float2 *pts, int num_pts, float sigma, int rows, int cols) {
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x < cols && y < rows) {
int idx = y*cols + x;
float temp = 0.f;
for (int i = 0; i < num_pts; i++) {
float x_0 = pts[i].x;
float y_0 = pts[i].y;
temp += exp(-(pow(x - x_0, 2) + pow(y - y_0, 2)) / (2 * sigma*sigma));
}
matrix[idx] = temp;
}
}"""
compiledKernel = SourceModule(kernel_src)
make_blobs = compiledKernel.get_function("make_blobs")
print(make_blobs.num_regs)
Note that you don't need to use SourceModule. You can also load the module from e.g. a cubin file. More details can be found in the documentation.
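For instance, a minimal sketch of the cubin route (the file name kernel.cubin is hypothetical; the module is assumed to contain the make_blobs kernel from above):
import pycuda.autoinit
import pycuda.driver as drv

# load a precompiled module instead of compiling from source
mod = drv.module_from_file("kernel.cubin")
make_blobs = mod.get_function("make_blobs")
print(make_blobs.num_regs)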

How can I convolve an image in CUDA

I have a question about image convolution in CUDA. When I test it with a small matrix (16*16) everything is OK. But with a larger matrix, the result changes every time I run it.
I think the problem is the two for loops in the kernel.
__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv,
                                         int img_width, const int img_height,
                                         const int kernel_width, const int kernel_height )
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    float sum = 0;
    for ( int j = 0; j < kernel_height; j++ )
    {
        for ( int i = 0; i < kernel_width; i++ )
        {
            int dX = x + i - kernel_width / 2;
            int dY = y + j - kernel_height / 2;
            if ( dX < 0 )
                dX = 0;
            if ( dX >= img_width )
                dX = img_width - 1;
            if ( dY < 0 )
                dY = 0;
            if ( dY >= img_height )
                dY = img_height - 1;
            const int idMat = j * kernel_width + i;
            const int idPixel = dY * img_width + dX;
            sum += (float)input[idPixel] * kernelConv[idMat];
        }
    }
    const int idOut = y * img_width + x;
    out[idOut] = abs(sum);
}
void image_convolution(float * input,float* output, int img_height, int img_width)
{
    int kernel_height = 3;
    int kernel_width = 3;
    float kernel[] ={ 0,-0.25,0,
                      -0.25,1,-0.25,
                      0,-0.25,0
                    };
    float * mask = new float[kernel_height*kernel_width];
    for (int i = 0; i < kernel_height*kernel_width; i++)
    {
        mask[i] = kernel[i];
    }
    float * d_input, * d_output, * d_kernel;
    cudaMalloc(&d_input, img_width*img_height*sizeof(float));
    cudaMalloc(&d_output, img_width*img_height*sizeof(float));
    cudaMalloc(&d_kernel, kernel_height*kernel_width*sizeof(float));
    cudaMemcpy(d_input, input, img_width*img_height*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_kernel, mask, kernel_height*kernel_width*sizeof(float), cudaMemcpyHostToDevice);
    dim3 blocksize(16,16);
    dim3 gridsize;
    gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
    gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
    image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height);
    cudaMemcpy(output, d_output, img_width*img_height*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i=0; i < img_width*img_height; i++)
    {
        printf("%d, ",(int)output[i]);
    }
    printf("\n\n");
}
Here is my result. I tested with a 24*24 image and ran it twice, and I also wrote a simple function to compare the outputs.
When I compare the outputs there are 32 differences, at indices 240, 241, ...
You have made a fairly common error in your program. When you create a grid of threads like this:
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
you are intentionally creating (usually) extra threads in each dimension, so as to fully cover the problem space (i.e. image size). There is nothing wrong with this.
However, it means we will be launching extra threads, which are outside the valid image dimension. We must ensure that these threads do nothing. The usual approach is to add a thread check to the kernel, so that threads outside the valid image dimensions do nothing. Here's a modified kernel and fully worked example showing that change:
$ cat t1219.cu
#include <iostream>
#include <cstdlib>
const int iw = 1025;
const int ih = 1025;
const int rng = 10;
__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv,
                                         int img_width, const int img_height,
                                         const int kernel_width, const int kernel_height )
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if ((x < img_width) && (y < img_height)){ // thread check
        float sum = 0;
        for ( int j = 0; j < kernel_height; j++ )
        {
            for ( int i = 0; i < kernel_width; i++ )
            {
                int dX = x + i - kernel_width / 2;
                int dY = y + j - kernel_height / 2;
                if ( dX < 0 )
                    dX = 0;
                if ( dX >= img_width )
                    dX = img_width - 1;
                if ( dY < 0 )
                    dY = 0;
                if ( dY >= img_height )
                    dY = img_height - 1;
                const int idMat = j * kernel_width + i;
                const int idPixel = dY * img_width + dX;
                sum += (float)input[idPixel] * kernelConv[idMat];
            }
        }
        const int idOut = y * img_width + x;
        out[idOut] = abs(sum);
    }
}
void image_convolution(float * input,float* output, int img_height, int img_width)
{
    int kernel_height = 3;
    int kernel_width = 3;
    float kernel[] ={ 0,-0.25,0,
                      -0.25,1,-0.25,
                      0,-0.25,0
                    };
    float * mask = new float[kernel_height*kernel_width];
    for (int i = 0; i < kernel_height*kernel_width; i++)
    {
        mask[i] = kernel[i];
    }
    float * d_input, * d_output, * d_kernel;
    cudaMalloc(&d_input, img_width*img_height*sizeof(float));
    cudaMalloc(&d_output, img_width*img_height*sizeof(float));
    cudaMalloc(&d_kernel, kernel_height*kernel_width*sizeof(float));
    cudaMemcpy(d_input, input, img_width*img_height*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_kernel, mask, kernel_height*kernel_width*sizeof(float), cudaMemcpyHostToDevice);
    dim3 blocksize(16,16);
    dim3 gridsize;
    gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
    gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
    image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height);
    cudaMemcpy(output, d_output, img_width*img_height*sizeof(float), cudaMemcpyDeviceToHost);
}
int main(){
    float *in, *out;
    int is = ih*iw;
    in = new float[is];
    out = new float[is];
    for (int i = 0; i < is; i++) {in[i] = rand()%rng; out[i] = -1;}
    image_convolution(in,out, ih, iw);
    for (int iy = 1; iy < ih-1; iy++)
        for (int ix = 1; ix < iw-1; ix++){
            float temp = abs(-0.25 * (in[iy*iw + ix -1] + in[iy*iw + ix +1] + in[(iy-1)*iw + ix] + in[(iy+1)*iw + ix]) + in[iy*iw+ix]);
            if (out[iy*iw+ix] != temp) {std::cout << "mismatch x: " << ix << " y: " << iy << " was: " << out[iy*iw+ix] << " should be: " << temp << std::endl; return 1;}}
    return 0;
}
$ nvcc -o t1219 t1219.cu
$ cuda-memcheck ./t1219
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
For image dimensions which are exact multiples of the block size (16,16) (which was true for my previous test case) this problem won't show up -- the code will work correctly. For all other test cases, we need such a thread check.

Read data in a proper way

I have a cpp file where I create an image and store the data through the myOutput pointer:
int Rows = 80;
int Cols = 64;
for (int i = 0; i < Rows; i++ ){
    for (int j = 0; j < Cols; j++ )
    {
        X = 1.0f * ((float) i - (float) Rows / 2) / (float) Rows;
        Y = 2.0f * ((float) j - (float) Cols / 2) / (float) Cols;
        .....
        myOutput->Re = cosf( ......);
        myOutput->Im = sinf(.......);
        ++myOutput;
    }
}
Then, in CUDA, I read the data like this:
int bx = blockIdx.x , by = blockIdx.y;
int tx = threadIdx.x , ty = threadIdx.y;
int RowIdx = ty + by * TILE_WIDTH;
int ColIdx = tx + bx * TILE_WIDTH;
Index = RowIdx * Cols + ColIdx;
//copy input data to shared memory
myshared[ty+1][tx+1] = *( devInputArray + Index );
(So the myOutput generated by the cpp code is what gets loaded into devInputArray.)
Now I want to process many images simultaneously.
So, in the cpp code, the following additions must be made (for 2 images, for example):
int ImagesNb = 2;
for ( ImagesIdx = 0; ImagesIdx < ImagesNb; ImagesIdx++ ){
    for (int i = 0; i < Rows; i++ ){
        for (int j = 0; j < Cols; j++ )
        {
            X = (ImagesIdx + 1) * 1.0f * ((float) i - (float) Rows / 2) / (float) Rows;
            Y = (ImagesIdx + 1) * 2.0f * ((float) j - (float) Cols / 2) / (float) Cols;
            ...
But now I am not sure how to read the data in CUDA.
I don't know how to take the number of images into account.
Before, I had a pointer which held the data of one image (80 x 64).
Now it still holds images of the same dimensions, but more of them.
I must change this:
Index = RowIdx * Cols + ColIdx;
//copy input data to shared memory
myshared[ty+1][tx+1] = *( devInputArray + Index );
but I can't figure out how!
I hope that is clear!
UPDATED
I am trying something like this:
int bx = blockIdx.x , by = blockIdx.y , bz = blockIdx.z;
int tx = threadIdx.x , ty = threadIdx.y , tz = threadIdx.z;
int RowIdx = ty + by * TILE_WIDTH;
int ColIdx = tx + bx * TILE_WIDTH;
int ImagesIdx = tz + bz * blockDim.z;
Index = RowIdx * Cols + ColIdx + Rows * Cols * ImagesIdx;
and :
dim3 dimGrid( ImagesNb * (Cols / TILE_WIDTH) , ImagesNb * (Rows / TILE_WIDTH) , ImagesNb);
dim3 dimBlock( TILE_WIDTH , TILE_WIDTH , 2);
but if I try it with 2 images I am not getting the right results.
OK, to work on a number of images you must add an extra dimension to the shared variable, in order to hold the index of the image within the block.
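A minimal, untested sketch of what that could look like (the kernel name loadImages is ours; it assumes two images per block in z, so each image's tiles are covered exactly once):
#define TILE_WIDTH 16

__global__ void loadImages(float *devInputArray, int Rows, int Cols)
{
    // extra leading dimension: which of the block's images this tile belongs to
    __shared__ float myshared[2][TILE_WIDTH+2][TILE_WIDTH+2];
    int tx = threadIdx.x, ty = threadIdx.y, tz = threadIdx.z;
    int RowIdx = ty + blockIdx.y * TILE_WIDTH;
    int ColIdx = tx + blockIdx.x * TILE_WIDTH;
    int ImagesIdx = tz + blockIdx.z * blockDim.z;
    int Index = ImagesIdx * Rows * Cols + RowIdx * Cols + ColIdx;
    // copy this image's pixel into this image's tile
    myshared[tz][ty+1][tx+1] = devInputArray[Index];
    __syncthreads();
    // ... process myshared[tz] as before ...
}
launched with something like:
dim3 dimGrid(Cols / TILE_WIDTH, Rows / TILE_WIDTH, ImagesNb / 2);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 2);
Note that, unlike in the update above, the x and y grid dimensions are not multiplied by ImagesNb; only the z dimension scales with the number of images, otherwise most blocks would index out of bounds.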

Simultaneous computation and data load to shared memory: the case of tiled matrix-matrix multiplication

I want to write a matrix multiplication algorithm, based on the shared memory example from CUDA, that performs the computation and the data loads simultaneously. My code looks like this:
float As[BLOCK_SIZE][BLOCK_SIZE];
float Bs[BLOCK_SIZE][BLOCK_SIZE];
As[ty][tx] = A[aBegin + wA * ty + tx];
Bs[ty][tx] = B[bBegin + wB * ty + tx];
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
{
    __shared__ float A2s[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float B2s[BLOCK_SIZE][BLOCK_SIZE];
    A2s[ty][tx] = As[ty][tx];
    B2s[ty][tx] = Bs[ty][tx];
    __syncthreads();
    if (a+1 <= aEnd)
    {
        As[ty][tx] = A[a+1 + wA * ty + tx];
        Bs[ty][tx] = B[b+1 + wB * ty + tx];
    }
    #pragma unroll
    for (int k = 0; k < BLOCK_SIZE; ++k)
    {
        Csub += A2s[ty][k] * B2s[k][tx];
    }
    __syncthreads();
}
But it runs slower than the original solution, as the second data load is serialized with the computation instead of overlapping it. How can I make them run in parallel?
You should avoid moving the data of A and B into the local arrays As and Bs, namely
As[ty][tx] = A[aBegin + wA * ty + tx];
Bs[ty][tx] = B[bBegin + wB * ty + tx];
You could directly move them to shared memory A2s and B2s, namely
A2s[ty][tx] = A[aBegin + wA * ty + tx];
B2s[ty][tx] = B[bBegin + wB * ty + tx];
Also, the data loads
As[ty][tx] = A[a+1 + wA * ty + tx];
Bs[ty][tx] = B[b+1 + wB * ty + tx];
appear to go unused.
Finally, you should move the declaration of the shared memory arrays outside the for loop; a final assignment to the output matrix was also missing.
Try something like:
__global__ void TiledMatrixMultiplicationKernel(float* A, float* B, float* C, int Width)
{
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
    int bx = blockIdx.x; int by = blockIdx.y;
    int tx = threadIdx.x; int ty = threadIdx.y;
    int Row = by * BLOCK_SIZE + ty;
    int Col = bx * BLOCK_SIZE + tx;
    float Csub = 0;
    for (int m = 0; m < Width/BLOCK_SIZE; ++m) {
        // load one tile of A and one tile of B into shared memory
        As[ty][tx] = A[Row*Width + (m*BLOCK_SIZE + tx)];
        Bs[ty][tx] = B[Col + (m*BLOCK_SIZE + ty)*Width];
        __syncthreads();
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += As[ty][k] * Bs[k][tx];
        }
        __syncthreads(); // finish using the tiles before they are overwritten
    }
    C[Row*Width+Col] = Csub; // final assignment to the output matrix
}
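The kernel above is the standard tiled scheme. If the goal really is to overlap the loads of the next tile with the computation on the current one, double buffering is the usual pattern. Below is an untested sketch under the same assumptions (Width a multiple of BLOCK_SIZE); the kernel name and the buffer toggle are ours, and whether the overlap pays off depends on the compiler scheduling the global loads early:
#define BLOCK_SIZE 16

__global__ void TiledMatMulDoubleBuffered(const float* A, const float* B, float* C, int Width)
{
    // two buffers per input: compute on one while loading the other
    __shared__ float As[2][BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[2][BLOCK_SIZE][BLOCK_SIZE];
    int tx = threadIdx.x, ty = threadIdx.y;
    int Row = blockIdx.y * BLOCK_SIZE + ty;
    int Col = blockIdx.x * BLOCK_SIZE + tx;
    float Csub = 0.0f;
    int nTiles = Width / BLOCK_SIZE;
    int buf = 0;
    // preload the first tile
    As[buf][ty][tx] = A[Row*Width + tx];
    Bs[buf][ty][tx] = B[ty*Width + Col];
    __syncthreads();
    for (int m = 0; m < nTiles; ++m) {
        int next = buf ^ 1;
        if (m + 1 < nTiles) {
            // issue the loads for the next tile; there is no sync between
            // these stores and the loop below, so they can overlap the compute
            As[next][ty][tx] = A[Row*Width + (m+1)*BLOCK_SIZE + tx];
            Bs[next][ty][tx] = B[((m+1)*BLOCK_SIZE + ty)*Width + Col];
        }
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += As[buf][ty][k] * Bs[buf][k][tx];
        __syncthreads(); // next tile fully loaded before it is consumed
        buf = next;
    }
    C[Row*Width + Col] = Csub;
}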

Count the number of cycles in a CUDA kernel

How can I count the number of cycles performed by a function like the following? Should I simply count the number of additions, multiplications, and divisions? Where can I check how many cycles an addition takes in CUDA?
__global__
void mandelbrotSet_per_element(Grayscale *image){
    float minR = -2.0f, maxR = 1.0f;
    float minI = -1.2f, maxI = minI + (maxR-minR) * c_rows / c_cols;
    float realFactor = (maxR - minR) / (c_cols-1);
    float imagFactor = (maxI - minI) / (c_rows-1);
    bool isInSet;
    float c_real, c_imag, z_real, z_imag;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    while (y < c_rows){
        while (x < c_cols) {
            c_real = minR + x * realFactor;
            c_imag = maxI - y * imagFactor;
            z_real = c_real; z_imag = c_imag;
            isInSet = true;
            for (int k = 0; k < c_iterations; k++){
                float z_real2 = z_real * z_real;
                float z_imag2 = z_imag * z_imag;
                if (z_real2 + z_imag2 > 4){
                    isInSet = false;
                    break;
                }
                z_imag = 2 * z_real * z_imag + c_imag;
                z_real = z_real2 - z_imag2 + c_real;
            }
            if (isInSet) image[y*c_cols+x] = 255;
            else image[y*c_cols+x] = 0;
            x += blockDim.x * gridDim.x;
        }
        x = blockDim.x * blockIdx.x + threadIdx.x;
        y += blockDim.y * gridDim.y;
    }
}
Instruction throughput is described in the programming guide here.
You can also try measuring a sequence of instructions using the native clock() function described here.
The compiler tends to obscure actual counts of operations at the source code level (increasing or possibly decreasing apparent arithmetic intensity), so if you want to identify exactly what the machine is doing you may want to inspect the PTX (nvcc -ptx ...) or possibly the machine assembly level code, called SASS, which you can extract from an executable using the cuobjdump utility.
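As a minimal, untested sketch of the clock() approach (the kernel name cycleProbe and the timed loop are invented for illustration; the result is raw SM clock ticks for one thread, not a per-instruction count):
#include <cstdio>

__global__ void cycleProbe(long long *elapsed, float *sink)
{
    float z = threadIdx.x * 0.5f;
    long long start = clock64();
    for (int k = 0; k < 1000; k++)
        z = z * z + 0.25f;              // the instruction sequence being timed
    long long stop = clock64();
    sink[threadIdx.x] = z;              // keep the loop from being optimized away
    if (threadIdx.x == 0)
        *elapsed = stop - start;
}

int main()
{
    long long *d_elapsed, h_elapsed;
    float *d_sink;
    cudaMalloc(&d_elapsed, sizeof(long long));
    cudaMalloc(&d_sink, 32 * sizeof(float));
    cycleProbe<<<1, 32>>>(d_elapsed, d_sink);
    cudaMemcpy(&h_elapsed, d_elapsed, sizeof(long long), cudaMemcpyDeviceToHost);
    printf("elapsed cycles: %lld\n", h_elapsed);
    cudaFree(d_elapsed);
    cudaFree(d_sink);
    return 0;
}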