CUDA C - CRC32 - Finding unknown polynom and crcxor - program speed up - cuda

I was looking for questions related to my problem but only found questions regarding CRC32 reversing. My topic is a bit different.
I am a novice programmer and I have the following task. I have input data (3 strings of 4 bytes each). For this data, I know three checksums computed with a hash function similar to CRC32. However, it is not a standard CRC32: the polynomial and the crcxor parameter have unknown, non-default values.
So, for each 4-byte input, I calculate the CRC using every polynomial value from 0 to 0xFFFFFFFF and every value of the crcxor parameter from 0 to 0xFFFF. I wrote this program in CUDA C because it runs faster than on the CPU. This is my third CUDA C program, right after "Hello World" and "VectorAdd" :). Calculating all possible 0xFFFF x 0xFFFFFFFF variants takes about 5 hours on my NVIDIA GTX 1060 card.
I wanted to ask if it is possible to modify or optimize the following program code in order to do this task faster?
Ultimately, I would like to calculate 0xFFFFFFFF x 0xFFFFFFFF but I don't know yet if it can be done in a short time.
If anyone would like to have a look at my program code and provide valuable feedback, I would be extremely grateful.
#include <stdio.h>
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
// Bit-serial CRC of one 32-bit word, consumed most significant bit first.
// Truncated variant for the fixed parameters crcinit = 0, refin = 0,
// refout = 0, direct = 0.
//   data - message word; only its low 32 bits are read
//   poly - generator polynomial XORed into the register on feedback
//   cxor - value XORed into the register after the last message bit
// Returns the final register masked to 32 bits.
__device__ unsigned long calculate_crc(unsigned long data, unsigned long poly, unsigned long cxor)
{
    const unsigned long top_bit = 0x80000000;
    unsigned long reg = 0;

    // One step per message bit, from bit 31 down to bit 0.
    for (unsigned long mask = top_bit; mask != 0; mask >>= 1)
    {
        // Feedback is the bit about to be shifted out, toggled by the message bit.
        unsigned long feedback = reg & top_bit;
        reg <<= 1;
        if (data & mask)
            feedback ^= top_bit;
        if (feedback)
            reg ^= poly;
    }

    reg ^= cxor;
    // unsigned long may be wider than 32 bits; drop anything shifted above bit 31.
    return reg & 0xFFFFFFFF;
}
// Brute-force search for the CRC parameters (polynomial and final XOR) that
// reproduce three known checksums of three known 4-byte messages.
//
// Launch layout: a 3D grid. The global x index supplies the upper 16 bits of
// the candidate polynomial, the global y index the lower 16 bits, and the
// global z index the candidate crcxor. Each axis is walked with a stride of
// one whole grid, so a launch smaller than n x n x n still visits every
// candidate, while a launch that already covers the range performs exactly
// one iteration per thread. No shared memory is used.
//
//   n - exclusive upper bound applied to each of the three indices
//       (0x10000 searches 16 bits per axis)
//
// Matches and progress marks are reported with device-side printf.
__global__ void calculate_crc_parameters(unsigned long n)
{
    //Input data:
    const unsigned long data1 = 0x928F640C;
    const unsigned long data2 = 0x0121B30E;
    const unsigned long data3 = 0xCB652607;
    // calculated CRC for the above input data and for polynom 0xFF7A1DB7: crcinit: 0, crcxor: 0x00000000, refin: 0, refout: 0, direct: 0:
    // for these CRCs, the function should find the polynomial 0xFF7A1DB7 and crcxor = 0
    // finds it right away because crcxor = 0
    const unsigned long crc1 = 0x7076BCEB;
    const unsigned long crc2 = 0x1F719D7A;
    const unsigned long crc3 = 0x8369D986;
    // other example crc - for crcxor> 0
    // computed CRC for polynom 0xFF7A1DB7: crcinit: 0, crcxor: 0x000000FF, refin: 0, refout: 0, direct: 0:
    // for these CRCs, the function should find the polynomial 0xFF7A1DB7 and crcxor = 0x000000FF
    // Program find it after 1m 12sec.
    /*
    const unsigned long crc1 = 0x7076BC14;
    const unsigned long crc2 = 0x1F719D85;
    const unsigned long crc3 = 0x8369D979;
    */
    // computed CRC for polynom 0xFF7A1DB7: crcinit: 0, crcxor: 0x0000FFFE, refin: 0, refout: 0, direct: 0:
    // for these CRCs, the function should find the polynomial 0xFF7A1DB7 and crcxor = 0x0000FFFE
    // searches for 5 hours
    /*
    const unsigned long crc1 = 0x70764315;
    const unsigned long crc2 = 0x1F716284;
    const unsigned long crc3 = 0x83692678;
    */
    // CRCs - polynom 0xFF7A1DB7: crcinit: 0, crcxor: 0xFF7A1DB7, refin: 0, refout: 0, direct: 0:
    // no implementation for 8-byte crcxor yet - and it would count for a long time
    /*
    const unsigned long crc1 = 0x8F0CA15C;
    const unsigned long crc2 = 0xE00B80CD;
    const unsigned long crc3 = 0x7C13C431;
    */

    // 64-bit index arithmetic so that neither blockIdx * blockDim nor
    // index + stride can wrap around and stall or shorten the loops.
    const unsigned long long start_x  = (unsigned long long)blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned long long stride_x = (unsigned long long)blockDim.x * gridDim.x;
    const unsigned long long start_y  = (unsigned long long)blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned long long stride_y = (unsigned long long)blockDim.y * gridDim.y;
    const unsigned long long start_z  = (unsigned long long)blockIdx.z * blockDim.z + threadIdx.z;
    const unsigned long long stride_z = (unsigned long long)blockDim.z * gridDim.z;

    for (unsigned long long iz = start_z; iz < n; iz += stride_z)
    {
        // crcxor will take the values of the z index, that is from 0x0000 to 0xFFFF
        const unsigned long crcxor = (unsigned int)iz;
        for (unsigned long long iy = start_y; iy < n; iy += stride_y)
        {
            const unsigned int index_y = (unsigned int)iy;
            for (unsigned long long ix = start_x; ix < n; ix += stride_x)
            {
                const unsigned int index_x = (unsigned int)ix;
                // "gluing" the polynomial:
                // to get polynom e.g. 0xFF7A1DB7 we have to "glue it" with index_x and index_y
                // if index_x == 0xFF7A then LSH by 16 places and we get 0xFF7A0000
                // then xor from index_y: 0xFF7A0000 xor 0x00001DB7 and is 0xFF7A1DB7
                const unsigned long polynom = (index_x << 16) ^ index_y;

                // compute three checksums and compare them; if all three agree with
                // the known CRCs, display the parameters for which they were calculated
                if (calculate_crc(data1, polynom, crcxor) == crc1 &&
                    calculate_crc(data2, polynom, crcxor) == crc2 &&
                    calculate_crc(data3, polynom, crcxor) == crc3)
                {
                    // Both values fit in 32 bits; the casts make the arguments match
                    // %08X on platforms where unsigned long is 64 bits wide.
                    printf("\nCRC parameters found ---> polynom: 0x%08X, crcxor: 0x%08X\n",
                           (unsigned int)polynom, (unsigned int)crcxor);
                }

                // progress mark: 1m 12s from displaying # to the next #
                // if the # sign is displayed 256 times, this will be the end of the program
                if ((crcxor%0xFF==0)&&(polynom==0xFFFFFFFF)) printf("#");
            }
        }
    }
}
// Host entry point: launches the CRC parameter search over a 0x10000 x 0x10000
// (x, y) index space (crcxor fixed at 0 with the default 2D configuration) and
// waits for it to finish. Returns 1 if the launch or the execution failed.
int main(void)
{
    unsigned long N = 0x10000; // 0xFFFF + 0x01 = 65536dec
    ////////////////////////////////////////////////
    // for computing only in X and Y axes - for crcxor = zero all the time
    // 16 * 4096 = 0x10000 per axis. 256 threads per block fill whole warps;
    // a 4x4 block would leave half of every 32-lane warp idle.
    dim3 dimBlock(  16,   16, 1);
    dim3 dimGrid(4096, 4096, 1);
    ////////////////////////////////////////////////
    // for computing on the X, Y and Z axes, i.e. for crcxor taking values from the Z axis from 0 to 65535
    //dim3 dimBlock( 4, 4, 64); // 4 * 4 * 64 = 1024 --- maximum block size
    //dim3 dimGrid(16384, 16384, 1024); //uncomment this 2 lines for crcxor > 0
    // 4 4 64
    // * * *
    // 16384 16384 1024
    // = = =
    // 0x10000 0x10000 0x10000
    // x, y, and z will trigger 65,536 times each
    cudaProfilerStart();
    calculate_crc_parameters<<<dimGrid, dimBlock>>>(N);

    // A bad launch configuration is only reported through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Faults raised while the kernel runs surface at the next synchronizing call.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Stop profiling while the context still exists, then tear the device down.
    cudaProfilerStop();
    cudaDeviceReset();
    return 0;
}
I compile it in cmd by: nvcc name.cu -o name
I work on win10 with Cuda Toolkit 11.5
Card is NVIDIA GTX 1060.
Could the use of pointers or memory allocations somehow speed up this program?
I compute the test CRC values here.

Optimization should begin with the algorithm, as opposed to optimizing a painfully pointless brute-force approach.
You can factor the search for a polynomial and a final exclusive-or, doing the polynomial first, and then (trivially) finding the exclusive-or value. All you need to do is take the exclusive-or of two of your data values, and the find the polynomial that produces the exclusive-or of the two CRCs of those values, assuming a zero final exclusive or. You will need to try at least two pairs in order to narrow it down to one choice for the polynomial.
Once you have the polynomial, now compute the CRC on one of your data values, exclusive-or that with the desired CRC, and now you have your final exclusive-or value. No search needed for the second step.
The polynomial search is fast enough that you can just use your CPU. No GPU or CUDA or whatever is needed. It took 40 seconds on my three-year old laptop. You only need to try odd polynomials. Even polynomials are not valid.
Exclusive-oring the data and the CRCs also cancels the initial value. So you can find the polynomial this way for CRCs that have both a non-zero initial value and a non-zero final exclusive-or. However, in order to then solve for both the initial value and the final exclusive-or, you will need examples with different length messages, i.e. other than all four-byte messages. There are 2^32 possible combinations of initial value and final exclusive-or that will match any and all CRCs of four-byte messages.
As an aside, your CRC routine is needlessly complicated. See equivalent below. This prints poly = ff7a1db7, xor = 0000fffe:
#include <stdio.h>
#include <stdint.h>
/* CRC of a single 32-bit word with a zero initial value: the word is loaded
 * into the register and shifted out over 32 steps, XORing in `poly` whenever
 * a 1 leaves the top bit. `xor` is applied to the result at the end. */
uint32_t calculate_crc(uint32_t data, uint32_t poly, uint32_t xor) {
    int remaining = 32;
    while (remaining-- > 0) {
        uint32_t carry = data & 0x80000000;
        data <<= 1;
        if (carry)
            data ^= poly;
    }
    return data ^ xor;
}
/* Searches every odd 32-bit polynomial for one that reproduces the given
 * checksums, then derives the final exclusive-or from it.
 *
 * XORing two messages and their two CRCs cancels the final exclusive-or, so
 * candidates are tested against message/CRC differences with xor = 0. For
 * each match, prints the polynomial and the exclusive-or recovered from the
 * second message and its CRC. */
void findp(uint32_t data1, uint32_t data2, uint32_t data3,
           uint32_t crc1, uint32_t crc2, uint32_t crc3) {
    /* Untouched copy of the second pair, used to recover the final xor. */
    const uint32_t ref_data = data2;
    const uint32_t ref_crc = crc2;

    /* Pairwise differences, taken in the same order as the in-place updates
     * they replace: (1,3), (2,3), then 3 against the original 2. */
    const uint32_t diff_data_a = data1 ^ data3;
    const uint32_t diff_crc_a = crc1 ^ crc3;
    const uint32_t diff_data_b = data2 ^ data3;
    const uint32_t diff_crc_b = crc2 ^ crc3;
    const uint32_t diff_data_c = data3 ^ ref_data;
    const uint32_t diff_crc_c = crc3 ^ ref_crc;

    /* Odd polynomials only; adding 2 to 0xFFFFFFFF wraps back to 1 and stops. */
    uint32_t poly = 1;
    for (;;) {
        if (calculate_crc(diff_data_a, poly, 0) == diff_crc_a &&
            calculate_crc(diff_data_b, poly, 0) == diff_crc_b &&
            calculate_crc(diff_data_c, poly, 0) == diff_crc_c) {
            printf("poly = %08x, xor = %08x\n",
                   poly, calculate_crc(ref_data, poly, 0) ^ ref_crc);
        }
        poly += 2;
        if (poly == 1)
            break;
    }
}
/* Runs the polynomial search on the three sample messages and their CRCs. */
int main(void) {
    const uint32_t messages[3] = {0x928F640C, 0x0121B30E, 0xCB652607};
    const uint32_t checksums[3] = {0x70764315, 0x1F716284, 0x83692678};

    findp(messages[0], messages[1], messages[2],
          checksums[0], checksums[1], checksums[2]);
    return 0;
}
There is an even faster, in fact massively faster, approach by solving a set of linear equations over GF(2). However it would take me longer than 40 seconds to write that code, so this is where I would stop. Unless I had many, many of these CRCs to find. Or unless I was trying to find, for example, a 64-bit CRC polynomial.

Related

CUDA in-situ memory race issue for algorithms such as convolution or morphological dilation

I wrote a dilation kernel in CUDA and it works well when my input and my output images are different buffers, but I am facing what I understand to be a memory race issue when I call my kernel in an in-situ case, i.e. the input and the output buffers point to the same memory location.
I tried :
a. using cooperative groups,
b. using a mutex and an atomic addition but as suggested in this paper and in several sources on the web,
c. using a lock-free inter-block synchronization, the synchronization proposed in this same paper.
All my attempts failed because :
a. did not work because my input buffer is a const pointer and I have a compilation error when I have to cast it into a void* parameter (which makes sense), so I could not go further.
b. did not work because I faced a weird behaviour: I have 16x16 blocks, each with 32x32 threads. Synchronizing the blocks should increase the mutex to 256, but the program hangs after 48 atomic additions.
c. did not work because there seems to be no inter-block synchronization, although the code I took directly from the paper looks correct to me. I could slightly reduce the race effect by adding some __syncthreads() calls.
This is the dilation function ;
// Dilation kernel: stages input pixels into dynamic shared memory, waits at a
// barrier, then writes to pOutBuf[idx] the maximum of the shared values found
// at the structuring-element offsets around the calling thread.
//
// Grid/block layout: x and y are pixel coordinates, the z index selects a plane
// (planIdx). Shared memory: one dynamically sized buffer (third launch argument),
// indexed as sx + sy * localSizeX.
//   imgSizeInfo              - image sizes, unpacked by SPLIT_SIZES_FROM_STRUCT
//   syncArrayIn/syncArrayOut - flag arrays handed to __gpu_sync2 in the in-situ case
//   localSizeX/localSizeY    - dimensions of the shared tile
//   borderPolicyType, outOfImageValue - forwarded to copyDataToSharedMemory2d
//   seInfo                   - structuring element: paddings, offset count, offsets
//   pInBuf/pOutBuf           - input and output buffers; equal pointers select the
//                              grid-wide barrier instead of __syncthreads()
// NOTE(review): the dynamic shared size presumably has to be at least
// localSizeX * localSizeY * sizeof(T) — confirm at the launch site.
template <typename T>
__global__ void GenericDilate2dImg_knl(const ImageSizeInfo imgSizeInfo,
volatile int* syncArrayIn, volatile int* syncArrayOut,
const unsigned long localSizeX, const unsigned long localSizeY,
const int borderPolicyType, const T outOfImageValue,
const struct StructuringElementInfo seInfo,
const T* pInBuf, T* pOutBuf)
{
// Extract sizeX, sizeY, etc. from imgSizeInfo
SPLIT_SIZES_FROM_STRUCT(imgSizeInfo)
// Declare the shared buffer pSharedBuf
extern __shared__ char pSharedMem[];
T* pSharedBuf = reinterpret_cast<T*>(pSharedMem);
// Global pixel coordinates and plane index of this thread
const unsigned long x = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned long y = blockDim.y * blockIdx.y + threadIdx.y;
const unsigned long planIdx = blockDim.z * blockIdx.z + threadIdx.z;
const unsigned long nbPlans = sizeZ * sizeC * sizeT;
// Linear index of this thread's pixel in the output buffer
const unsigned long idx = x + y * sizeX + planIdx * sizeX*sizeY;
// Copy the input image data into shared memory
// NOTE(review): x, y and planIdx are built from these same launch dimensions,
// so this condition appears to hold for every thread — confirm it is intended.
if (x < blockDim.x * gridDim.x && y < blockDim.y * gridDim.y && planIdx < blockDim.z * gridDim.z) {
copyDataToSharedMemory2d(pInBuf, sizeX, sizeY, planIdx,
localSizeX, localSizeY,
seInfo._paddingX, seInfo._paddingY,
borderPolicyType, outOfImageValue,
pSharedBuf);
}
// Wait to ensure that the copy is terminated
if (pInBuf == pOutBuf) {
// Grid synchronization for in-situ case
// NOTE(review): this only ensures all reads precede all writes if every block
// of the grid reaches the barrier before any block proceeds — confirm that the
// whole grid can be resident on the device at once.
//__gpu_sync(gridDim.x * gridDim.y); // Use a mutex
__gpu_sync2(1, syncArrayIn, syncArrayOut); // Use a lock-free barrier
}
else
// The input and ouput buffers point to different data
// -> we simply need to synchronize the threads inside the block
__syncthreads();
// Compute the convolution for pixels inside the image
if (x < sizeX && y < sizeY && planIdx < nbPlans) {
// NOTE(review): starting at 0 means the result is never below 0; presumably
// pixel values are non-negative — confirm for signed or floating-point T.
T vMax = 0;
for (unsigned int curCoefIdx = 0; curCoefIdx < seInfo._nbOffsets; ++curCoefIdx) {
// Shared-tile coordinates of the neighbour selected by the current offset
const unsigned int sx = threadIdx.x + seInfo._paddingX + seInfo._pOffsetsX[curCoefIdx];
const unsigned int sy = threadIdx.y + seInfo._paddingY + seInfo._pOffsetsY[curCoefIdx];
const unsigned long sidx = sx + sy * localSizeX;
const T curVal = pSharedBuf[sidx];
vMax = (vMax > curVal ? vMax : curVal);
}
// Store the maximum found over the structuring element (no rounding is applied here)
pOutBuf[idx] = vMax;
}
}
My function to copy from global to shared memory is :
// Fills the shared tile pSharedBuf (localSizeX x localSizeY, row-major) with
// input pixels. The tile is covered in steps of one block: on every step each
// thread fetches at most one pixel through getPixel2d, which also resolves
// coordinates that fall outside the image.
//   pInBuf, sizeX, sizeY, planIdx     - source image, its dimensions, and plane
//   localSizeX, localSizeY            - dimensions of the shared tile
//   paddingX, paddingY                - shift between tile and global coordinates
//   borderPolicyType, outOfImageValue - forwarded to getPixel2d
//   pSharedBuf                        - destination tile
// The caller is responsible for the barrier that must follow before the tile
// is read by other threads.
template <typename T>
__device__ void copyDataToSharedMemory2d(const T* pInBuf,
    const unsigned long sizeX, const unsigned long sizeY, const unsigned long planIdx,
    const unsigned long localSizeX, const unsigned long localSizeY,
    const int paddingX, const int paddingY,
    const int borderPolicyType, const T outOfImageValue,
    T* pSharedBuf)
{
    // Global and in-block position of the calling thread.
    const int globalX = blockDim.x * blockIdx.x + threadIdx.x;
    const int globalY = blockDim.y * blockIdx.y + threadIdx.y;
    const int threadX = threadIdx.x;
    const int threadY = threadIdx.y;

    // One step moves by the size of the block in each direction.
    const unsigned int stepX = blockDim.x;
    const unsigned int stepY = blockDim.y;

    for (int tileY = 0; tileY < localSizeY; tileY += stepY) {
        const int sharedY = threadY + tileY;
        const int sourceY = globalY + tileY - paddingY;

        for (int tileX = 0; tileX < localSizeX; tileX += stepX) {
            const int sharedX = threadX + tileX;
            const int sourceX = globalX + tileX - paddingX;

            // Positions past the edge of the shared tile are skipped.
            if (sharedX >= localSizeX || sharedY >= localSizeY)
                continue;

            const int sharedIdx = sharedX + sharedY * localSizeX;
            pSharedBuf[sharedIdx] = getPixel2d(pInBuf, sizeX, sizeY, sourceX, sourceY, planIdx,
                                               borderPolicyType, outOfImageValue);
        }
    }
}
Where getPixel2d allows me to manage the data out of the image:
// Resolves one coordinate against an axis of length `size`.
// In-range coordinates are returned unchanged through `inside`. Out-of-range
// coordinates follow borderPolicyType: 0 -> the caller must use the constant
// out-of-image value (returns false); 1 -> clamp to the nearest border;
// 2 -> mirror about the border; any other value leaves the coordinate as is.
__device__ inline bool getPixel2d_resolveCoord(const int coord, const unsigned long size,
                                               const int borderPolicyType, int& inside)
{
    inside = coord;
    if (!(coord < 0 || coord >= size))
        return true;

    if (borderPolicyType == 0)
        return false;

    if (borderPolicyType == 1) {
        // Propagate the data found at the image border.
        if (coord < 0)
            inside = 0;
        else
            inside = size - 1;
    } else if (borderPolicyType == 2) {
        // Mirror effect.
        if (coord < 0)
            inside = -(coord + 1);
        else
            inside = size - ((coord - size) + 1);
    }
    return true;
}

// Reads pixel (x, y) of plane z from pInBuf (row-major, sizeX x sizeY per plane),
// applying the border policy separately to x and then to y when they fall outside
// the image. Returns outOfImageValue when policy 0 applies to either coordinate.
template <typename T>
__device__
T getPixel2d(const T* pInBuf,
    const unsigned long sizeX, const unsigned long sizeY,
    const int x, const int y, const int z,
    const int borderPolicyType, const T outOfImageValue)
{
    int column;
    if (!getPixel2d_resolveCoord(x, sizeX, borderPolicyType, column))
        return outOfImageValue;

    int row;
    if (!getPixel2d_resolveCoord(y, sizeY, borderPolicyType, row))
        return outOfImageValue;

    return pInBuf[column + row * sizeX + z * sizeX * sizeY];
}
and now, here are my inter-block synchronization functions :
// Using a mutex
// Global counter incremented once per block on every call to __gpu_sync.
__device__ volatile int g_mutex;
// Inter-block barrier built on the g_mutex counter: one thread per block adds 1,
// then spins until the counter has reached goalVal; the rest of the block waits
// at the closing __syncthreads().
//   goalVal - counter value at which the barrier is considered passed
// NOTE(review): g_mutex is not reset here, so a second barrier presumably needs a
// larger goalVal — confirm with the callers.
// NOTE(review): the spin loop can only finish if every block that has to increment
// the counter is running at the same time; blocks that are not yet scheduled cannot
// contribute — confirm the grid fits on the device.
__device__ void __gpu_sync(int goalVal) {
//thread ID in a block
// (built from the x and y thread indices only)
int tid_in_block = threadIdx.x * blockDim.y + threadIdx.y;
// only thread 0 is used for synchronization
if (tid_in_block == 0) {
atomicAdd((int*)&g_mutex, 1);
// Debug trace: block id, counter value read after the increment, target value
printf("[%d] %d Vs %d\n", blockIdx.x * gridDim.y + blockIdx.y, g_mutex, goalVal);
//only when all blocks add 1 to g_mutex
//will g_mutex equal to goalVal
while (g_mutex </*!=*/ goalVal) {
;//Do nothing here
}
}
// Hold the remaining threads of the block until thread 0 leaves the spin loop
__syncthreads();
}
// Lock-free barrier
// Inter-block barrier using two flag arrays with one entry per block:
//   1. thread 0 of every block writes goalVal to Arrayin[bid];
//   2. the block with bid == 1 collects: its thread i waits for Arrayin[i], then,
//      after a block-level barrier, writes goalVal to Arrayout[i];
//   3. thread 0 of every block spins on Arrayout[bid] before the block continues.
//   goalVal  - flag value that marks arrival and release for this barrier
//   Arrayin  - arrival flags, indexed by block id
//   Arrayout - release flags, indexed by block id
// NOTE(review): each collector thread handles a single index, so the block with
// bid == 1 needs at least gridDim.x * gridDim.y threads, and the grid needs a block
// with bid == 1; otherwise some Arrayout entries are never written — confirm.
// NOTE(review): like any spin barrier, this presumably requires all blocks to be
// resident at the same time — confirm for the launch configuration in use.
__device__ void __gpu_sync2(int goalVal, volatile int* Arrayin, volatile int* Arrayout) {
// thread ID in a block
int tid_in_blk = threadIdx.x * blockDim.y + threadIdx.y;
// Number of blocks and id of this block, from the x and y grid dimensions only
int nBlockNum = gridDim.x * gridDim.y;
int bid = blockIdx.x * gridDim.y + blockIdx.y;
// only thread 0 is used for synchronization
if (tid_in_blk == 0) {
Arrayin[bid] = goalVal;
}
// Collector block: wait for every arrival flag, then publish every release flag
if (bid == 1) {
if (tid_in_blk < nBlockNum) {
while (Arrayin[tid_in_blk] != goalVal) {
;//Do nothing here
}
}
// All arrival flags have been observed once the collector threads pass this point
__syncthreads();
if (tid_in_blk < nBlockNum) {
Arrayout[tid_in_blk] = goalVal;
}
}
// Every block waits for its own release flag
if (tid_in_blk == 0) {
while (Arrayout[bid] != goalVal) {
;//Do nothing here
}
}
__syncthreads();
}
The image I get for in-situ calculation is :
I used an 11x15 structuring element, and the size of the shared buffer is (nbThreadsPerBlock+2*paddingX) * (nbThreadsPerBlock+2*paddingY). The wrong result (shown by the arrows) appears at the top of some blocks, but always at the same location and with the same values. I'd expect a more random result from a memory race...
Is there a better approach to manage in-situ calculation or any reason that would prevent the grid synchronization to work?
EDIT
The size of the image I used is 510x509 and I run my code on a NVidia Quadro RTX 5000.
I would normally suggest minimal reproducible example for a question like this, as well as an indication of the GPU you are running on, but we can probably proceed without that. In short, what you are trying to do will not work reliably, as you've already discovered.
You have chosen a thread strategy of assigning one thread in your grid per output point:
pOutBuf[idx] = vMax;
which is sensible and fine. I imagine based on this:
I have 16x16 blocks, each with 32x32 threads.
that your input images are 512x512 (16x32 threads in each direction, one thread per output point).
And as you've already stated, you have 256 blocks (each of 1024 threads) in your grid. Furthermore, for the in-situ case, we can simplify your kernel to the following pseudo-code:
__global__ void GenericDilate2dImg_knl(...){
read_in_image();
grid_wide_sync();
write_out_image();
}
For such a methodology to work, then, the read_in_image() step must be able to read the entire image, before any writing occurs. However your methodology will not work in the general case, and evidently not on your specific GPU, either. In order to read in the entire image as per above, we must have every threadblock in the grid simultaneously resident on the SMs in your GPU. All 256 blocks need to be deposited, and running on an SM. But the GPU provides no inherent guarantees of such a thing. If your GPU has, for example 24 SMs in it, each of which can hold a maximum of 2048 threads, then your GPU would have a "running" or "instantaneous" capacity of 24*2048 threads, or 48 of your threadblocks. There would not be enough room for all 256 threadblocks to be running. Not only does your algorithm depend on that, but all 3 of your grid sync methods depend on that notion as well.
The fact that your 2nd grid sync method stops after 48 "atomic additions" suggested the example numbers above to me. It's a plausible proximal explanation for why that method may have failed that way: your GPU only allowed 48 of your threadblocks to be resident, and the other 208 threadblocks were waiting in the wings, not yet deposited on any SM, and therefore not allowing any of their threads to run. Those threads in those 208 threadblocks need to run to pick up the relevant input data, as well as to satisfy the requirements of the grid-wide sync. But they are not running, because they are waiting for room to open up on a SM. And room never opens up on a SM, because the full SMs have threadblocks that are waiting at the grid sync point. So you have deadlock.
This problem is not easily solvable in the general case. Any grid sync mechanism, including cooperative groups, has an inherent requirement that all threadblocks be actually simultaneously schedulable on your particular GPU. Therefore in the general case, where we don't know the data set size or the GPU we will be running on, the problem is quite difficult.
One possible approach is to divide your input data set into regions, and have your kernel process a region at a time. This may require multiple grid syncs, one to handle the in/out division in each region, and one to handle the progression of the kernel as it steps through regions. You would also have to handle the region edges carefully.
Another possible approach, if you know the specifics of the data set size and the GPU you are running on, is just to make sure you are running on a GPU "large enough" to handle the data set size. For example, an A100 GPU could probably have as many as 216 blocks simultaneously resident, so for that case you could handle a somewhat smaller image size, perhaps 14x32=448 height and 448 width dimensions.
Given that these approaches for in-place or in-situ work for this particular example require considerable complexity, I personally would be strongly motivated to use the methodology where output is different than input. That approach will likely run noticeably quicker as well. A grid wide sync is not a "free" construct from a performance perspective.

Sub-Matrix computations

I want to calculate the pair wise distance between two sub-matrices of a matrix. For example I have a matrix A (MxN) and two blocks of that matrix B1 (mxn) and B2 (kxt). More specifically, I want to calculate the distance of the B1(1,1) element from all the other elements of the B2 and to do this process for all the B1 elements. To be more clear the B1 and B2 may be not compact parts of the matrices and basically the information I know is the coordinates of the elements of B1 and B2 on the matrix A. Here is an example.
// Mean absolute difference between every element of B1 and every element of B2.
// CoordsofB1 / CoordsofB2 hold the x coordinates in their first nRows entries and
// the y coordinates in the following nRows entries; A is addressed column-major
// as A[x + y*nRowsofA].
for(int i = 0; i < nRowsCoordsB1 ; i++ ){//nRows of B1
    //CoordsofB1 is a nRowsB1x2 matrix that contains the element coordinates of the B1 sub matrix
    // The B1 element does not depend on j, so it is looked up once per outer iteration.
    a_x = CoordsofB1[ i ]; //take the x coord of the corresponding row i
    a_y = CoordsofB1[ i + nRowsCoordsB1 ]; //take the y coord of the corresponding row
    const int element1 = A[ a_x + a_y*nRowsofA ];
    for(int j = 0; j < nRowsCoordsB2 ; j++ ){//nRows of B2
        b_x = CoordsofB2[ j ];
        b_y = CoordsofB2[ j + nRowsCoordsB2 ];
        const int element2 = A[ b_x + b_y*nRowsofA ] ;
        sum +=abs( element1 - element2 ) ;
    }
}
// Normalize by the number of (B1, B2) element pairs.
*Output = sum/(float)(numberOfElementsofB1*numberOfElementsofB2);
Now I want to speedup computations with CUDA :) Because I am new in Cuda perspective I found it a little complicated. Since now I think that I have understand the logic of allocating block threads in Matrix level but here the fact that I have two different parts of the matrix with different size, CoordsofB1 and CoordsofB2, confuse me a little on how I can access them take the coordinates and use them in the hole matrix. I thought that we should work in A using constrains but I did not come with a clear thought.
Also the fact that in the end of the for loops the sum is divided with a quantity confuse me on who we would combined in the cuda translated code.
Any suggestions-snippets-examples-references would be great.
PS: the reason I use column-major ordering is because the code is evaluated in matlab.
UPDATE: Can we allocate thread block of size equal the size of the biggest sub matrix B1 or B2 and work with them using the correct conditions? I comment the last line because I was not sure about what to do with it. Any comments?
// One thread per row index r of the coordinate tables.
// NOTE(review): this pairs only the r-th element of B1 with the r-th element of B2,
// whereas the nested host loops visit every (i, j) pair; `sum` is presumably shared
// between threads and would then need a reduction or an atomic — confirm.
int r = blockDim.x * blockIdx.x + threadIdx.x; // rows
if( r < nRowsCoordsB1 ){
    a_x = CoordsofB1[ r ];
    a_y = CoordsofB1[ r + nRowsCoordsB1 ];
    // A stray ';' inside this condition was a syntax error and has been removed.
    if( r < nRowsCoordsB2 ){
        b_x = CoordsofB2[ r ];
        b_y = CoordsofB2[ r + nRowsCoordsB2 ];
        int element1 = A[ a_x + a_y*nRowsofA ];
        int element2 = A[ b_x + b_y*nRowsofA ] ;
        sum +=abs( element1 - element2 ) ;
    }
}
//*Output = sum/(float)(numberOfElementsofB1*numberOfElementsofB2);
Here a sketch
I have the coordinates of each element inside the B1 and B2 and I want to calculate the differences between the values in
[ (B1(1,1) - B2(1,1)) + (B1(1,1) - B2(1,2)) + ... + (B1(1,1) - B2(:,:)) ] +
[ (B1(1,2) - B2(1,1)) + (B1(1,2) - B2(1,2)) + ... + (B1(1,2) - B2(:,:)) ] +
[ (B1(:,:) - B2(1,1)) + (B1(:,:) - B2(1,2)) + ... + (B1(:,:) - B2(:,:)) ].
If I understand it correctly, what you are trying to do can be written in the following matlab code.
% Column vector of B1 replicated across length(B2(:)) columns.
rep_B1 = repmat(B1(:), 1, length(B2(:)));
% Row vector of B2 replicated down length(B1(:)) rows.
rep_B2 = repmat(B2(:)', length(B1(:)), 1);
% Absolute difference of every (B1, B2) element pair.
absdiff_B1B2 = abs(rep_B1 - rep_B2);
% Mean over all pairs.
Result = mean(absdiff_B1B2(:));
You will notice that before the reduction, there is a matrix absdiff_B1B2 of size length(B1(:)) x length(B2(:)), i.e. m*n x k*t (this matrix is never stored to global memory if you implement the above code in one CUDA kernel). You could partition this matrix into 16x16 sub-matrices and use one 256-thread block per sub-matrix to distribute the workload over the GPU.
On the other hand, you could use thrust to make your life easier.
Update
Since B1 and B2 are sub-matrices of A, you could first use cudaMemcpy2D() to copy them to linear space, then use a kernel to construct and then reduce the matrix absdiff_B1B2.
For the final normalization operation (last line of your code), you could do it on CPU.
Here's the code using thrust to show how to construct and reduce the matrix absdiff_B1B2 in a single kernel. However you will find that the construction procedure use no shared memory and is not optimized. Further optimization using shared mem will improve the performance.
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
// Binary functor returning |x - y|; usable from host and device code.
// operator() is const so the functor can also be invoked through a const
// object or a const reference.
template<typename T>
struct abs_diff
{
    inline __host__ __device__ T operator()(const T& x, const T& y) const
    {
        return abs(x - y);
    }
};
// Computes the mean absolute difference over all element pairs of B1 (m x n,
// filled with 1.0) and B2 (k x t, filled with 2.0) in a single inner_product
// call, and prints it.
int main()
{
    using namespace thrust::placeholders;

    const int m = 98;
    const int n = 87;
    int k = 76;
    int t = 65;

    const int b1Count = m * n;             // number of elements in B1
    const int pairCount = m * n * k * t;   // number of (B1, B2) element pairs

    thrust::device_vector<double> B1(m * n, 1.0);
    thrust::device_vector<double> B2(k * t, 2.0);

    // Pair index p = 0 .. pairCount-1 selects B1[p % b1Count] and B2[p / b1Count].
    thrust::counting_iterator<int> pairIndex(0);

    double result = thrust::inner_product(
        thrust::make_permutation_iterator(
            B1.begin(),
            thrust::make_transform_iterator(pairIndex, _1 % b1Count)),
        thrust::make_permutation_iterator(
            B1.begin(),
            thrust::make_transform_iterator(pairIndex, _1 % b1Count)) + pairCount,
        thrust::make_permutation_iterator(
            B2.begin(),
            thrust::make_transform_iterator(pairIndex, _1 / b1Count)),
        0.0,
        thrust::plus<double>(),   // accumulate the per-pair values
        abs_diff<double>());      // per-pair value: |B1 element - B2 element|

    result /= pairCount;
    std::cout << result << std::endl;
    return 0;
}
Perhaps the solution below using a 2D thread grid, could be an alternative to Eric's use of thrust to have some more insight to the problem.
The code snippet below is to illustrate the concept only. It is an untested code.
2D grid
Define a partial_distances matrix of size nRowsCoordsB1 X nRowsCoordsB2 that will contain all the involved absolute value differences between the elements of B1 and B2. In the main file you will have
// Fills partial_distances (nRowsCoordsB1 x nRowsCoordsB2, entry (i, j) stored at
// j*nRowsCoordsB1 + i) with |A(B1 element i) - A(B2 element j)|.
//
// Launch layout: 2D grid, x index -> element i of B1, y index -> element j of B2.
// The grid may be rounded up past the table sizes; surplus threads exit early.
//   partial_distances       - output, nRowsCoordsB1 * nRowsCoordsB2 ints
//   CoordsofB1, CoordsofB2  - coordinate tables: x coordinates in the first nRows
//                             entries, y coordinates in the following nRows entries
// NOTE(review): A and nRowsofA are not parameters; they are presumably visible at
// file scope in device code — confirm.
__global__ void distance_calculator(int* partial_distances, int* CoordsofB1, int* CoordsofB2, int nRowsCoordsB1, int nRowsCoordsB2) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    const int j = blockDim.y * blockIdx.y + threadIdx.y;

    // Without this guard the threads of the rounded-up grid tail would read and
    // write past the ends of the arrays.
    if (i >= nRowsCoordsB1 || j >= nRowsCoordsB2)
        return;

    const int a_x = CoordsofB1[i];
    const int a_y = CoordsofB1[i+nRowsCoordsB1];
    const int b_x = CoordsofB2[j];
    const int b_y = CoordsofB2[j+nRowsCoordsB2];
    partial_distances[j*nRowsCoordsB1+i] = abs(A[a_x+a_y*nRowsofA]-A[b_x+b_y*nRowsofA]);
}
// Integer division of a by b, plus one when the division leaves a remainder.
int iDivUp(int a, int b)
{
    const int quotient = a / b;
    if (a % b != 0)
        return quotient + 1;
    return quotient;
}
#define BLOCKSIZE 32
// Host driver: allocates the partial_distances matrix, launches
// distance_calculator on a 2D grid rounded up to whole BLOCKSIZE x BLOCKSIZE
// blocks, then runs the reduction step.
int main() {
    int* partial_distances; cudaMalloc((void**)&partial_distances,nRowsCoordsB1*nRowsCoordsB2*sizeof(int));
    // The launch below must use the name declared here.
    dim3 BlockSize(BLOCKSIZE,BLOCKSIZE);
    dim3 GridSize;
    GridSize.x = iDivUp(nRowsCoordsB1,BLOCKSIZE);
    GridSize.y = iDivUp(nRowsCoordsB2,BLOCKSIZE);
    distance_calculator<<<GridSize,BlockSize>>>(partial_distances,CoordsofB1,CoordsofB2,nRowsCoordsB1,nRowsCoordsB2);
    REDUCTION_STEP
    // Release the device buffer once the reduction has consumed it.
    cudaFree(partial_distances);
}
The REDUCTION_STEP could be implemented as the iterative call to a 1D reduction kernel to sum up all the elements corresponding to a particular element of B1.
An alternative would be to use dynamic parallelism to call the reduction routine directly within the kernel, but this is an option not suitable to the card you are using.

GPU gives no performance improvement in Julia set computation

I am trying to compare performance in CPU and GPU. I have
CPU : Intel® Core™ i5 CPU M 480 # 2.67GHz × 4
GPU : NVidia GeForce GT 420M
I can confirm that GPU is configured and works correctly with CUDA.
I am implementing Julia set computation. http://en.wikipedia.org/wiki/Julia_set
Basically for every pixel, if the co-ordinate is in the set it will paint it red
else paint it white.
Although, I get identical answer with both CPU and GPU but instead of getting a
performance improvement, I get a performance penalty by using GPU.
Running times
CPU : 0.052s
GPU : 0.784s
I am aware that transferring data from device to host can take up some time.
But still, how do I know if use of GPU is actually beneficial?
Here is the relevant GPU code
#include <stdio.h>
#include <cuda.h>
// Decides whether the point mapped from pixel (x, y) stays bounded under
// z -> z*z + c with c = -0.8 + 0.156i.
//   x, y           - pixel coordinates
//   maxX_2, maxY_2 - half the image width / height; the image centre maps to z = 0
//                    and pixel 0 maps to 0.8 on each axis
// Returns false as soon as |z| exceeds 1000 within 99 iterations, true otherwise.
//
// All literals carry the f suffix so the arithmetic stays in single precision; a
// bare 0.8 would promote the coordinate mapping to double. Results may therefore
// differ in the last bit from a reference that does this mapping in double.
__device__ bool isJulia( float x, float y, float maxX_2, float maxY_2 )
{
    float z_r = 0.8f * (maxX_2 - x) / maxX_2;
    float z_i = 0.8f * (maxY_2 - y) / maxY_2;
    const float c_r = -0.8f;
    const float c_i = 0.156f;
    for( int i=1 ; i<100 ; i++ )
    {
        const float tmp_r = z_r*z_r - z_i*z_i + c_r;
        const float tmp_i = 2.0f*z_r*z_i + c_i;
        z_r = tmp_r;
        z_i = tmp_i;
        if( sqrtf( z_r*z_r + z_i*z_i ) > 1000.0f )
            return false;
    }
    return true;
}
// Colours one RGB pixel per thread: red when isJulia() reports the point as
// bounded, white otherwise.
//
// Launch layout: 1D grid of 1D blocks; the block index is passed to isJulia as x
// and the thread index within the block as y. The pixel is stored at
// 3 * (blockIdx.x*blockDim.x + threadIdx.x).
//   im         - output image, 3 bytes per pixel, at least 3*dimx*dimy bytes
//   dimx, dimy - image dimensions
__global__ void kernel( unsigned char * im, int dimx, int dimy )
{
    //int tid = blockIdx.y*gridDim.x + blockIdx.x;
    const int pixel = blockIdx.x*blockDim.x + threadIdx.x;

    // Threads beyond the image (launch larger than dimx*dimy) must not write.
    if( pixel >= dimx*dimy )
        return;

    unsigned char * rgb = im + 3*pixel;
    const bool inside = isJulia((float)blockIdx.x, (float)threadIdx.x, (float)dimx/2, (float)dimy/2);

    // Red is set in both cases; green and blue are cleared only for points in the set.
    rgb[0] = 255;
    rgb[1] = inside ? 0 : 255;
    rgb[2] = inside ? 0 : 255;
}
// Host driver for the GPU version: renders the 768x768 image 10000 times on the
// device (benchmark loop), copies the last result back and writes it as a PPM.
// Returns 1 if an allocation or a CUDA call fails.
int main()
{
    int dimx=768, dimy=768;
    const size_t bytes = (size_t)3*dimx*dimy;
    //on cpu
    unsigned char * im = (unsigned char*) malloc( bytes );
    if( im == NULL )
    {
        fprintf( stderr, "host allocation failed\n" );
        return 1;
    }
    //on GPU
    unsigned char * im_dev;
    //allocate mem on GPU
    cudaError_t err = cudaMalloc( (void**)&im_dev, bytes );
    if( err != cudaSuccess )
    {
        fprintf( stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err) );
        free( im );
        return 1;
    }
    //launch kernel.
    for( int z=0 ; z<10000 ; z++ ) // loop for multiple times computation
    {
        kernel<<<dimx,dimy>>>(im_dev, dimx, dimy);
    }
    // Launch errors are only visible through cudaGetLastError(); execution errors
    // surface at the blocking copy below.
    err = cudaGetLastError();
    if( err == cudaSuccess )
        err = cudaMemcpy( im, im_dev, bytes, cudaMemcpyDeviceToHost );
    if( err != cudaSuccess )
    {
        fprintf( stderr, "GPU computation failed: %s\n", cudaGetErrorString(err) );
        free( im );
        cudaFree( im_dev );
        return 1;
    }
    writePPMImage( im, dimx, dimy, 3, "out_gpu.ppm" ); //assume this writes a ppm file
    free( im );
    cudaFree( im_dev );
    return 0;
}
Here is the CPU code
// Decides whether the point mapped from pixel (x, y) stays bounded under
// z -> z*z + c with c = -0.8 + 0.156i.
//   x, y           - pixel coordinates
//   maxX_2, maxY_2 - half the image width / height
// Returns false as soon as |z| exceeds 1000 within 99 iterations, true otherwise.
bool isJulia( float x, float y, float maxX_2, float maxY_2 )
{
    // Map the pixel to the complex plane: the image centre becomes z = 0.
    float re = 0.8 * (float) (maxX_2 - x) / maxX_2;
    float im = 0.8 * (float) (maxY_2 - y) / maxY_2;
    float c_re = -0.8;
    float c_im = 0.156;

    int step = 1;
    while( step < 100 )
    {
        float next_re = re*re - im*im + c_re;
        float next_im = 2*re*im + c_im;
        re = next_re;
        im = next_im;
        if( sqrt( re*re + im*im ) > 1000 )
            return false;
        step++;
    }
    return true;
}
#include <stdlib.h>
#include <stdio.h>
int main(void)
{
const int dimx = 768, dimy = 768;
int i, j;
unsigned char * data = new unsigned char[dimx*dimy*3];
**for( int z=0 ; z<10000 ; z++ ) // loop for multiple times computation**
{
for (j = 0; j < dimy; ++j)
{
for (i = 0; i < dimx; ++i)
{
if( isJulia(i,j,dimx/2,dimy/2) == true )
{
data[3*j*dimx + 3*i + 0] = (unsigned char)255; /* red */
data[3*j*dimx + 3*i + 1] = (unsigned char)0; /* green */
data[3*j*dimx + 3*i + 2] = (unsigned char)0; /* blue */
}
else
{
data[3*j*dimx + 3*i + 0] = (unsigned char)255; /* red */
data[3*j*dimx + 3*i + 1] = (unsigned char)255; /* green */
data[3*j*dimx + 3*i + 2] = (unsigned char)255; /* blue */
}
}
}
}
writePPMImage( data, dimx, dimy, 3, "out_cpu.ppm" ); //assume this writes a ppm file
delete [] data
return 0;
}
Further, following suggestions from @hyde, I have looped the computation-only part to generate 10,000 images. I am not writing all of those images out, though; only the computation is being timed.
Here are the running times
CPU : more than 10min and code still running
GPU : 1m 14.765s
Turning comments to answer:
To get relevant figures, you needs to calculate more than one image, so that execution time is seconds or tens of seconds at least. Also, including file saving time in results is going to add noise and hide the actual CPU vs GPU difference.
Another way to get real results is to select a Julia set which has lot points belonging to the set, then upping the iteration count so high it takes many seconds to calculate just one image. Then there is only one single calculation setup, so this is likely to be the most advantageous scenario for GPU/CUDA.
To measure how much overhead there is, change image size to 1x1 and iteration limit 1, and then calculate enough images that it takes at least a few seconds. In this scenario, GPU is likely significantly slower.
To get most relevant timings for your use case, select image size and iteration count you are really going to use, and then measure the image count, where both versions are equally fast. That will give you a rough rule-of-thumb to decide which you should use when.
Alternative approach for practical results, if you are going to get just one image: find the iteration limit for single worst-case image, where CPU and GPU are equally fast. If that many or more iterations would be advantageous, choose GPU, otherwise choose CPU.

Performing several 1D moving averages in parallel using CUDA Thrust

I'm not a programmer with any abilities. Just someone curious about CUDA and so I'm doing a little reading. I ran across an example of using Thrust to do a moving average:
Simple Moving Average Thrust Example
The example, such as it is, runs and mostly works correctly. However it's trivial in the sense that it only does one moving average operation.
How I would do say 352 of these moving average operations in parallel, all operating on the same data stream? In my mind the program flow might be:
Generate the data & send it to one CUDA core. (Same as existing code
but think lengths of 1000 or 10000 instead of 30)
Copy it from the CUDA core it's in to all of the the other 351 CUDA
cores in my GTX 465
Tell each CUDA core what number of data items to average over.
(4, 5, 6,..., 352, 353, 354)
Tell the device to run the average in each core in parallel
Read back the results from each core
I get that this code
// compute SMA using standard summation
simple_moving_average(data, w, averages);
makes it all happen, but how do I get Thrust to do many of these in parallel?
My interest here is about something like stock data. If I'm looking at GOOG prices I'd put that in the GPU using all cores and leave it there. I'd then be free to do lots of processing without loading the data anymore and just reading back results from each core. NOTE: I might not want to use GOOG in all cores. Some cores might be GOOG, others with some other symbol, but I'll get there later. I'm just thinking I don't want the stock data in global memory if there's enough room in each core.
I assume this is pretty straightforward for CUDA & Thrust?
Here is the possible way how to do this with arrayfire:
Note that I am NOT affiliated with this library whatsoever.
I am pretty sure this can also be done with thrust
but I found this one a lot simpler with arrayfire.
And if the library is free why can't I use it instead of thrust ?
In arrayfire you can use matrix to run several SMA operations in parallel:
// Problem dimensions for the batched simple-moving-average (SMA) example.
unsigned n_SMAs = 1000; // # of SMA indicators to evaluate
unsigned len = 2000; // # of stock prices per indicator
unsigned w = 6; // window size
// generate stock prices: [0..10]
// (one row per indicator, one column per price sample)
af::array data = af::randu(n_SMAs, len) * 10;
// compute inclusive prefix sums along columns of the matrix
af::array s = af::accum(data, 1);
// compute the average
// The difference of two prefix sums taken w columns apart is the sum of w
// consecutive prices; dividing by w turns that sum into the window average.
af::array avg = (s.cols(w, af::end) - s.cols(0, af::end - w)) / w;
// NOTE(review): presumably forces evaluation of the expression built above - confirm against the ArrayFire docs
af::eval(avg);
// Print the dimensions of the result followed by the averaged values.
std::cout << avg.dims() << "\n" << avg << "\n";
let me know if that's what you are looking for. This is how I understood your question: compute several SMA indicators in parallel
My understanding is that you are interested in the following two situations:
You have a long sequence of items and you want to calculate a certain number of averages, by averaging on different numbers of items, i.e., using different lengths for the moving average window. This is what I understand from your original question.
You have a series of sequences, stored consecutively in memory, and you want to average them in parallel with a fixed averaging window of size 2 * RADIUS + 1. This is what the ArrayFire code proposed by #asm does - you have accepted it.
Instead of using CUDA Thrust, I think it would be easier to write your own CUDA kernel to do the above operations. Below, a fully worked example that operates in the same way as the ArrayFire code proposed by #asm, thus covering case #2. Modifying it to cover case #1 would be straightforward.
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCK_SIZE_X 8
#define BLOCK_SIZE_Y 8
/*******************/
/* iDivUp FUNCTION */
/*******************/
// Integer division of a by b, bumped up by one whenever the division leaves
// a remainder. Used to size a grid so that it covers every element.
int iDivUp(int a, int b)
{
    const int quotient = a / b;
    if (a % b == 0)
        return quotient;
    return quotient + 1;
}
/********************/
/* CUDA ERROR CHECK */
/********************/
// Error-checking wrapper: forwards the status of the wrapped expression,
// together with the call site's file name and line number, to gpuAssert.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Reports a failed CUDA status on stderr and, unless abort is false,
// terminates the process using the status value as the exit code.
// A status of cudaSuccess is silently accepted.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return; // nothing to report
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (!abort)
        return; // caller asked to continue after the error was reported
    exit(code);
}
/**********/
/* KERNEL */
/**********/
/*
 * Sliding-window sum along each row of a row-major M x N matrix.
 *
 * For every element (row, col) the kernel adds up the 2 * RADIUS + 1 input
 * values centred on col within the same row; positions that fall outside the
 * row contribute 0. Note that the stored value is the window SUM - no
 * division by the window length is performed here.
 *
 * Launch layout: 2D blocks of exactly BLOCK_SIZE_X x BLOCK_SIZE_Y threads
 * (the shared tile and the halo indexing are sized with those constants),
 * with x covering columns and y covering rows. The grid may overhang the
 * matrix; overhanging threads still take part in the tile load and in the
 * barrier but never touch `out`.
 *
 * Shared memory: BLOCK_SIZE_Y * (BLOCK_SIZE_X + 2 * RADIUS) unsigned ints.
 *
 * in  - device pointer to M * N input values
 * out - device pointer to M * N output values
 * M   - number of rows
 * N   - number of columns (row length)
 */
__global__ void moving_average(unsigned int *in, unsigned int *out, unsigned int M, unsigned int N) {
    // Tile of the input: BLOCK_SIZE_X interior columns plus RADIUS halo
    // columns on each side.
    __shared__ unsigned int temp[BLOCK_SIZE_Y][BLOCK_SIZE_X + 2 * RADIUS];

    const unsigned int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int gindexy = threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int gindex  = gindexy * N + gindexx;
    const unsigned int lindexx = threadIdx.x + RADIUS;
    const unsigned int lindexy = threadIdx.y;

    const bool rowValid = (gindexy < M);
    // Number of matrix columns this block actually covers; smaller than
    // BLOCK_SIZE_X only for a partial block at the right edge.
    const unsigned int tileWidth = min((unsigned int)BLOCK_SIZE_X, N - blockIdx.x * BLOCK_SIZE_X);

    // --- Read input elements into shared memory
    temp[lindexy][lindexx] = (rowValid && (gindexx < N)) ? in[gindex] : 0;
    if (threadIdx.x < RADIUS) {
        // Left halo: the RADIUS columns preceding this tile.
        temp[lindexy][threadIdx.x] = (rowValid && (gindexx >= RADIUS) && (gindexx < (N + RADIUS))) ? in[gindex - RADIUS] : 0;
        // Right halo: the RADIUS columns following the covered part of the tile.
        temp[lindexy][threadIdx.x + RADIUS + tileWidth] = (rowValid && ((gindexx + tileWidth) < N)) ? in[gindex + tileWidth] : 0;
    }
    // Every thread of the block must reach this barrier, so out-of-range
    // threads are masked at the store below rather than returning early.
    __syncthreads();

    // --- Apply the stencil
    unsigned int result = 0;
    for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
        result += temp[lindexy][lindexx + offset];
    }

    // --- Store the result (only for threads that map to a real element;
    // an unguarded store would write past the end of `out` whenever the
    // grid overhangs the matrix)
    if (rowValid && (gindexx < N)) {
        out[gindex] = result;
    }
}
/********/
/* MAIN */
/********/
// Host driver: fills a small matrix with a constant, runs moving_average on
// it with BLOCK_SIZE_X x BLOCK_SIZE_Y thread blocks, and prints every output
// element.
int main() {
    const unsigned int numRows = 2;
    const unsigned int numCols = 4 + 2 * RADIUS;
    const unsigned int fillValue = 3;

    // Device buffers: input pre-filled with fillValue, output left default-initialised.
    thrust::device_vector<unsigned int> d_in(numRows * numCols, fillValue);
    thrust::device_vector<unsigned int> d_out(numRows * numCols);
    unsigned int *inPtr = thrust::raw_pointer_cast(d_in.data());
    unsigned int *outPtr = thrust::raw_pointer_cast(d_out.data());

    // Enough blocks to cover the matrix in both dimensions.
    dim3 BlockSize(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    dim3 GridSize(iDivUp(numCols, BLOCK_SIZE_X), iDivUp(numRows, BLOCK_SIZE_Y));

    moving_average<<<GridSize, BlockSize>>>(inPtr, outPtr, numRows, numCols);
    gpuErrchk(cudaPeekAtLastError());   // launch-time errors
    gpuErrchk(cudaDeviceSynchronize()); // errors raised while the kernel ran

    // Copy the results back to the host and print them row by row.
    thrust::host_vector<unsigned int> h_out = d_out;
    for (int j = 0; j < numRows; ++j) {
        for (int i = 0; i < numCols; ++i) {
            printf("Element j = %i; i = %i; h_out = %i\n", j, i, h_out[numCols*j+i]);
        }
    }
    return 0;
}

cuda multiplication

Serial code snippet looks like this:
int i, j;
for(j=0; j<ny; j++)
{
for(i=0; i<nx; i++)
{
x[i + j*nx] *= y[i];
}
}
I converted this to CUDA using this kernel:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int i,j;
for(tid = 0; tid <nx*ny; tid++)
{
j = tid/nx;
i = tid - j*nx;
x[tid] *= y[i];
}
However the GPU kernel does not give any speedup improvement? Any suggestions on a better solution?? Thanks in advance
If this is the serial code:
int i, j;
for(j=0; j<ny; j++)
{
for(i=0; i<nx; i++)
{
x[i + j*nx] *= y[i];
}
}
then you should be doing this:
// One-thread-per-element version of the serial loop: the thread with flat
// index tid scales x[tid] by y[i], where i is tid's column within a row of
// width nx.
// NOTE(review): y is neither a parameter nor declared in this snippet -
// presumably a device-visible pointer defined elsewhere; confirm.
// NOTE(review): there is no tid < nx*ny guard, so the launch must supply
// exactly nx*ny threads, otherwise x is accessed out of range.
__global__ void fn(float *x, int nx)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int j = tid/nx, i = tid - j * nx; // j = row, i = column within the row
x[tid] *= y[i];
}
// NOTE(review): nx*ny/B is a truncating integer division; when nx*ny is not a
// multiple of B the trailing elements are never processed.
fn<<<nx*ny/B, B>>>(x, nx); // with B = 256, 512, etc.
What you're doing is fairly bizarre: you're instructing each thread of the CUDA kernel to iterate over all values of tid between 0 and nx*ny, and compute the same function as your CPU version! Moreover, instead of just iterating over the indices, you're actually doing the loop less efficiently than you did for the CPU version; in other words, you do the same thing in each thread, just less efficiently, than you are doing in 1 thread on the CPU. It's no wonder that this is slower; it should be much, much slower. Your CUDA kernel is:
int **tid** = blockIdx.x * blockDim.x + threadIdx.x;
int i,j;
for(**tid** = 0; **tid** <nx*ny; **tid**++)
{
j = tid/nx;
i = tid - j*nx;
x[tid] *= y[i];
}
This does nx*ny iterations, same as your host code, for each thread; you lose all benefit of the parallelism, since each thread is doing the same thing; you would get the same performance using one thread on the GPU, and the same result!
If this is the verbatim code from your CUDA source file, you need to change it and redo the comparison; if this is code you have written to help explain what your code is doing for a lay non-CUDA audience, then you need to present your actual CUDA code so that we can see what's going on... as it is, the performance analysis I have done - the trivial one - is all you can expect.
Given your comment to this answer:
the nx * ny = 2205; so I used no. of blocks =
(nx*ny+(threads-1))/threads and threads = 64.
is implying you are intending to launch one thread per computation, the correct CUDA implementation would just be:
// Flat global thread index; each thread handles at most one element of x.
int tid = blockIdx.x * blockDim.x + threadIdx.x;
// Split the flat index into row j and column i of an nx-wide row.
int j = tid/nx;
int i = tid - j*nx;
// Threads whose index lies past the last element do nothing.
if (tid < (nx*ny))
x[tid] *= y[i];
If you were intending for each thread to compute more than one computation per kernel launch, then you would size the grid to "fill" each of the SM on the target GPU, not use the same number of threads as the input size, and then do something like:
// First element handled by this thread.
int tid = blockIdx.x * blockDim.x + threadIdx.x;
// Total number of threads in the grid; used as the loop stride.
int gsize = blockDim.x * gridDim.x;
int i,j;
// Grid-stride loop: this thread processes elements tid, tid + gsize, ...
// until the index runs past the nx*ny elements.
for(; tid <nx*ny; tid+=gsize)
{
j = tid/nx; // row
i = tid - j*nx; // column within the row
x[tid] *= y[i];
}
That would get you at least coalesced reads and writes to x, and remove the enormous number of redundant calculations in your posted version. There are a number of further optimizations that could be made, but it would require more information about the problem than has been supplied in the question and subsequent comments. Your indexing scheme contains an integer division and then an integer multiply-add per calculation. That is a lot of overhead for a single FLOP per input value. However, having said all of that, if the problem size I quoted is the actual problem size you are interested in, the GPU will never be faster than even a modest host CPU. You would require many orders of magnitude larger problems to realize useful speed up using the GPU for this sort of low arithmetic intensity operation.
How big is the block? It may be that the time needed to copy a small amount of data to the GPU and set up the environment is much longer than the calculation time.
Remember also that CUDA does a jit compile on the first run so to get accurate benchmarking you need to run it many times.
Try this using shared memory. One of the best implementations around:
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
// Plain descriptor of a matrix (or of a sub-matrix view into a larger one):
// it only carries dimensions, the row stride and a pointer to the storage.
typedef struct {
int width;  // extent along a row; MatMulKernel tiles A along this dimension
int height; // extent along a column
int stride; // In number of elements (distance between the starts of consecutive rows)
float *elements; // first element of the (sub-)matrix
} Matrix;
// Thread block size
#define BLOCK_SIZE 16
// Read the element at (row, col) of A, using A's stride to locate the row.
__device__ float GetElement(const Matrix A, int row, int col)
{
    const int offset = row * A.stride + col;
    return A.elements[offset];
}
// Write value into the element at (row, col) of A, using A's stride to
// locate the row. A is passed by value, but the write goes through its
// elements pointer and therefore lands in the underlying storage.
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    const int offset = row * A.stride + col;
    A.elements[offset] = value;
}
// Build a view onto one BLOCK_SIZE x BLOCK_SIZE tile of A: the tile that
// lies `row` tiles down and `col` tiles to the right of A's upper-left
// corner. The view shares A's storage and keeps A's stride; nothing is
// copied.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix tile;
    tile.stride = A.stride;
    tile.height = BLOCK_SIZE;
    tile.width = BLOCK_SIZE;
    // Offset of the tile's first element inside A's storage.
    const int firstElement = A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col;
    tile.elements = A.elements + firstElement;
    return tile;
}
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
// NOTE(review): the body below is only a placeholder - as written the
// function performs no work. Its comments list how the device-side copies of
// A, B and C are meant to be described (stride equal to width); the
// allocation, copy and kernel-launch code is not shown here.
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
// Same as in previous example, except the followings:
// d_A.width = d_A.stride = A.width;
// d_B.width = d_B.stride = B.width;
// d_C.width = d_C.stride = C.width;
}
// Matrix multiplication kernel called by MatMul()
// Tiled product C = A * B. Each thread block produces one
// BLOCK_SIZE x BLOCK_SIZE tile of C and each thread one element of that
// tile. The kernel walks over A.width / BLOCK_SIZE tile pairs, so it relies
// on the matrix dimensions being multiples of BLOCK_SIZE and on blocks of
// BLOCK_SIZE x BLOCK_SIZE threads.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Staging areas for the current tile of A and of B, shared by the block.
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Which tile of C this block owns, and which element inside that tile
    // this thread owns.
    const int tileRow = blockIdx.y;
    const int tileCol = blockIdx.x;
    const int ty = threadIdx.y;
    const int tx = threadIdx.x;

    // Running dot product for this thread's element of C.
    float acc = 0;

    const int numTiles = A.width / BLOCK_SIZE;
    for (int t = 0; t < numTiles; ++t)
    {
        // Tile t along A's block row pairs with tile t down B's block column.
        const Matrix aTile = GetSubMatrix(A, tileRow, t);
        const Matrix bTile = GetSubMatrix(B, t, tileCol);

        // Each thread copies one element of each tile into shared memory.
        As[ty][tx] = GetElement(aTile, ty, tx);
        Bs[ty][tx] = GetElement(bTile, ty, tx);

        // Both tiles must be fully loaded before any thread reads them.
        __syncthreads();

        // Partial dot product contributed by this pair of tiles.
        for (int k = 0; k < BLOCK_SIZE; ++k)
            acc += As[ty][k] * Bs[k][tx];

        // All threads must be done reading before the next iteration
        // overwrites the shared tiles.
        __syncthreads();
    }

    // Each thread writes its single element of the output tile.
    Matrix cTile = GetSubMatrix(C, tileRow, tileCol);
    SetElement(cTile, ty, tx, acc);
}