Related
#define TS 32
int num_devices = 0;
__global__ void shared_kernel(float* A, float* B, float* C, int M, int N, int K) {
int global_col = blockDim.x * blockIdx.x + threadIdx.x;
int global_row = blockDim.y * blockIdx.y + threadIdx.y;
int local_col = threadIdx.x;
int local_row = threadIdx.y;
if (global_row >= M || global_col >= N) return;
__shared__ float Asub[TS][TS];
__shared__ float Bsub[TS][TS];
const int num_tiles = K / TS;
float acc = 0;
for(int t = 0; t < num_tiles; t++){
const int t_row = TS * t + local_row;
const int t_col = TS * t + local_col;
Asub[local_row][local_col] = A[global_row * K + t_col];
Bsub[local_row][local_col] = B[t_row * N + global_col];
__syncthreads();
printf("[DEBUG] first sync threads, global_row: %d, global_col: %d\n", global_row, global_col);
for (int k = 0; k < K; ++k) {
acc += Asub[local_row][k] * Bsub[k][local_col];
}
__syncthreads();
printf("[DEBUG] second sync threads, global_row: %d, global_col: %d\n", global_row, global_col);
}
C[global_row * N + global_col] = acc;
}
static float *a_d, *b_d, *c_d;
void mat_mul(float *A, float *B, float *C, int M, int N, int K) {
cudaMemcpy(a_d, A, M * K * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(b_d, B, K * N * sizeof(float), cudaMemcpyHostToDevice);
dim3 blockDim(TS, TS);
dim3 gridDim(M/TS, N/TS);
shared_kernel<<<gridDim, blockDim>>>(a_d, b_d, c_d, M, N, K);
cudaMemcpy(C, c_d, M * N * sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
}
void mat_mul_init(float *A, float *B, float *C, int M, int N, int K) {
cudaGetDeviceCount(&num_devices);
cudaSetDevice(0);
cudaMalloc(&a_d, M * K * sizeof(float));
cudaMalloc(&b_d, K * N * sizeof(float));
cudaMalloc(&c_d, M * N * sizeof(float));
}
Above example is a matrix multiplication with shared memory.
I ran above kernel with dim3 blockDim(TS, TS) and dim3 gridDim(M/TS, N/TS) and M, N, K = 128.
I checked that float * C has zero value after launching kernel. Also, I found that only few of global_row are printed(from 37 to 81) after first __syncthreads(), and there is no printf DEBUG message after the second __syncthreads().
I suspect that __syncthreads() is causing the problem, but I don't know how to fix it. My code is almost the same as other matrix multiplication code in other site.
Would you give me some hint how to solve this?
Any time you are having trouble with a CUDA code, I recommend using proper CUDA error checking and run your code with compute-sanitizer or cuda-memcheck. For this type of analysis, it will be easier if you don't use in-kernel printf.
If you did that, you would see output like this:
========= Invalid __shared__ read of size 4
========= at 0x000002f0 in shared_kernel(float*, float*, float*, int, int, int)
========= by thread (0,2,0) in block (0,1,0)
========= Address 0x00002000 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
... (and more output)
So from that, we can see that your kernel is making invalid __shared__ read operations. Where is that happening in your kernel? You could use the methodology here to identify a specific line of code. However this is a fairly simple kernel, and there is only one line that is reading from shared memory, it is here:
for (int k = 0; k < K; ++k) {
acc += Asub[local_row][k] * Bsub[k][local_col]; // shared reads here
A quick inspection will show that if you let this loop iterate over a range of K=128, then you will index out of bounds here:
for (int k = 0; k < K; ++k) {
acc += Asub[local_row][k] * Bsub[k][local_col];
^ ^
when k is greater than 31, because this would exceed your shared array dimensions:
#define TS 32
__shared__ float Asub[TS][TS];
__shared__ float Bsub[TS][TS];
I'm not going to bother writing a fixed kernel/code for you, because as you've already pointed out, this topic is covered in many other places, and a canonical example is already provided in the programming guide.
FWIW, if i change your for-loop to this:
for (int k = 0; k < TS; ++k) {
then the run-time errors go away for me. cuda-memcheck reports no errors.
I multiplay each row from pB to each row from pA and put max value to pC.
The problem is: in internal loop the only last row of receptors taken as "max value". As result the right column is totally wrong.
void TestCalcDotMax_2x5x3()
{
const size_t m = 2; // nReceptors
const size_t k = 5; // nSources
const size_t n = 3; // nChemicals
float pA[m * k] = { 1, 2, 3, 4, 5
, 2, 4, 6, 8, 2};
float pB[k * n] = { 9, 8, 7, 6, 5
, 4, 3, 2, 1, 9
, 8, 7, 6, 5, 4 };
float expected[k * n] = { 18, 32, 42, 48, 25
, 8, 12, 12, 8, 45
,16, 28, 36, 40, 20 };
float pC[k * n] = { 18, 32, 42, 48, 10
, 8, 12, 12, 8, 18
,16, 28, 36, 40, 8 };
int rst = ::CalcDotMax( pA, pB, m, k, n, pC );
CPPUNIT_ASSERT_EQUAL_MESSAGE( "passed processing", 0, rst );
}
// pDevB and pDevC nave the same size
__global__ void KernelDotMax( const float* pDevA, const float* pDevB, const size_t m, const size_t k, float* pDevC )
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if( i < m )
{
for( size_t j = 0; j < k; j++ )
{
const float value = pDevA[ i * k + j ] * pDevB[j];
if( value > pDevC[j] )
{
pDevC[j] = value;
}
}
}
}
__host__ int CalcDotMax( const float* pA, const float* pB, int m, int k, int n, float* pC, pfnMsg fnMsg )
{
int nbrCtas = m;
int threadsPerCta = 64;
if( nbrCtas >= 32 )
{
nbrCtas = 32;
threadsPerCta = 64;
}
float* pDevA = nullptr;
float* pDevB = nullptr;
float* pDevC = nullptr;
cudaError_t code = ::cudaMalloc( (void**)&pDevA, m * k * sizeof(float) );
code = ::cudaMalloc( (void**)&pDevB, k * n * sizeof(float) );
code = ::cudaMalloc( (void**)&pDevC, k * n * sizeof(float) );
code = ::cudaMemcpy( pDevA, pA, m * k * sizeof(float), cudaMemcpyHostToDevice);
code = ::cudaMemcpy( pDevB, pB, k * n * sizeof(float), cudaMemcpyHostToDevice);
code = ::cudaMemcpy( pDevC, pC, k * n * sizeof(float), cudaMemcpyHostToDevice);
for( size_t index = 0; index < n * k; index += k )
{
KernelDotMax<<<nbrCtas,threadsPerCta>>>( pDevA, &pDevB[index], m, k, &pDevC[index] );
}
code = ::cudaMemcpy( pC, pDevC, k * n * sizeof(float), cudaMemcpyDeviceToHost);
code = ::cudaFree( pDevA );
code = ::cudaFree( pDevB );
code = ::cudaFree( pDevC );
return 0;
}
Sorry, I missed at some point that you had edited your code.
The problem you are having is a race condition. In the failing case you are launching 2 blocks. The design of your algorithm is such that each block is operating on the same set of output elements (in pdevC). Therefore, since both blocks can execute simultaneously, both blocks can write to the same output elements simultaneously. This is a collision and there are two ways you can avoid it:
redesign your algorithm to partition the work differently between
blocks. Instead of each block checking all (or the same set of) the output elements
against a particular set of inputs, have each block only be
responsible for a portion of the output elements but checking
against all the inputs. This is a common code refactoring
operation that is done when converting a sequential/serial
algorithm, to one that runs in parallel.
use atomic operations to prevent the collisions from happening. If your algorithm only has a small amount of these types of collisions, it may be convenient and not very costly to use atomics. But when the algorithm uses atomics for every output element (perhaps multiple times, as in this case) it's probably better (for higher performance) to try to refactor the code as in method 1 above.
What follows is some code where I illustrate the second approach (because it is easier for me to write). There is no atomic function that provides an atomicMax operation on float, so I crafted my own, following the template given in the atomic functions documentation for creating arbitrary atomic operations using atomicCAS. That is what atomicMaxf is.
If you elect to use the first approach (recommended), I would point out that calling the kernel in a loop is probably not necessary for your algorithm. I would craft a new kernel that assigns one thread to each output point, and then computes all the necessary max operations on the various input points, in a loop (or nested loops) in the kernel. Since each thread is writing to one and only one unique output point, there is no possibility for write collisions between threads.
This code should provide correct results, anyway:
#include <stdio.h>
__device__ float atomicMaxf(float* address, float val)
{
int *address_as_int =(int*)address;
int old = *address_as_int, assumed;
while (val > __int_as_float(old)) {
assumed = old;
old = atomicCAS(address_as_int, assumed,
__float_as_int(val));
}
return __int_as_float(old);
}
// pDevB and pDevC have the same size
__global__ void KernelDotMax( const float* pDevA, const float* pDevB, const size_t m, const size_t k, float* pDevC )
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if( i < m )
{
for( size_t j = 0; j < k; j++ )
{
const float value = pDevA[ i * k + j ] * pDevB[j];
atomicMaxf(pDevC+j, value);
// if( value > pDevC[j] )
// {
// pDevC[j] = value;
// }
}
}
}
__host__ int CalcDotMax( const float* pA, const float* pB, int m, int k, int n, float* pC )
{
int nbrCtas = m;
int threadsPerCta = 64;
if( nbrCtas >= 32 )
{
nbrCtas = 32;
threadsPerCta = 64;
}
float* pDevA = NULL;
float* pDevB = NULL;
float* pDevC = NULL;
cudaError_t code = ::cudaMalloc( (void**)&pDevA, m * k * sizeof(float) );
code = ::cudaMalloc( (void**)&pDevB, k * n * sizeof(float) );
code = ::cudaMalloc( (void**)&pDevC, k * n * sizeof(float) );
code = ::cudaMemcpy( pDevA, pA, m * k * sizeof(float), cudaMemcpyHostToDevice);
code = ::cudaMemcpy( pDevB, pB, k * n * sizeof(float), cudaMemcpyHostToDevice);
code = ::cudaMemcpy( pDevC, pC, k * n * sizeof(float), cudaMemcpyHostToDevice);
for( size_t index = 0; index < n * k; index += k )
{
KernelDotMax<<<nbrCtas,threadsPerCta>>>( pDevA, &pDevB[index], m, k, &pDevC[index] );
}
code = ::cudaMemcpy( pC, pDevC, k * n * sizeof(float), cudaMemcpyDeviceToHost);
code = ::cudaFree( pDevA );
code = ::cudaFree( pDevB );
code = ::cudaFree( pDevC );
return 0;
}
void TestCalcDotMax_2x5x3()
{
const size_t m = 2; // nReceptors
const size_t k = 5; // nSources
const size_t n = 3; // nChemicals
float pA[m * k] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f
, 2.0f, 4.0f, 6.0f, 8.0f, 2.0f};
float pB[k * n] = { 9.0f, 8.0f, 7.0f, 6.0f, 5.0f
, 4.0f, 3.0f, 2.0f, 1.0f, 9.0f
, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f };
float expected[k * n] = { 18.0f, 32.0f, 42.0f, 48.0f, 25.0f
, 8.0f, 12.0f, 12.0f, 8.0f, 45.0f
,16.0f, 28.0f, 36.0f, 40.0f, 20.0f };
float pC[k * n] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
int rst = ::CalcDotMax( pA, pB, m, k, n, pC );
printf("passed processing: %d \n", rst );
for (int i=0; i<(k*n); i++)
if (pC[i] != expected[i]) printf("mismatch at %d, should be: %f was: %f\n", i, expected[i], pC[i]);
}
int main(){
TestCalcDotMax_2x5x3();
return 0;
}
Thanks a lot - it works now. Is possible to keep the index of iteratiion [idx] at the moment of comparing? Like this:
struct ValIndex_t
{
float value;
int index;
};
__device__ float atomicMaxPare( float* address, float val, int* index, int idx )
{
int *address_as_int = reinterpret_cast<int*>( address->value ); // assume that float has size of integer 32 bit
int old = *address_as_int, assumed;
while( val > ::__int_as_float(old) )
{
assumed = old;
old = ::atomicCAS( address_as_int, assumed, ::__float_as_int(val) );
*index = idx;
}
return ::__int_as_float(old);
}
__global__ void CudaPareDotMax( float* pDevA, const float* pDevB, ValIndex_t* pDevC, const size_t m, const size_t k, const size_t n )
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if( idx < m )
{
for( size_t row = 0; row < n; row++ )
{
for( size_t col = 0; col < k; col++ )
{
const size_t slice = col + row * k;
const size_t index = slice + k * n * idx;
pDevA[index] *= pDevB[ col + k * idx ];
float& prvalue = (pDevC + slice )->value;
int& prindex = (pDevC + slice )->index;
::atomicMaxPare( &prvalue, pDevA[ index ], &prindex, idx );
}
}
}
}
Or I have to use another atomic function for exchange? Not quite understand how to join it exactly at the moment when value became max. Thanks again
The problem is simple: I have two matrices, A and B, that are M by N, where M >> N. I want to first take the transpose of A, and then multiply that by B (A^T * B) to put that into C, which is N by N. I have everything set up for A and B, but how do I call cublasSgemm properly without it returning the wrong answer?
I understand that cuBlas has a cublasOperation_t enum for transposing things beforehand, but somehow I'm not quite using it correctly. My matrices A and B are in row-major order, i.e. [ row1 ][ row2 ][ row3 ]..... in device memory. That means for A to be interpreted as A-transposed, BLAS needs to know my A is in column-major order. My current code looks like below:
float *A, *B, *C;
// initialize A, B, C as device arrays, fill them with values
// initialize m = num_row_A, n = num_row_B, and k = num_col_A;
// set lda = m, ldb = k, ldc = m;
// alpha = 1, beta = 0;
// set up cuBlas handle ...
cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
My questions:
Am I setting up m, k, n correctly?
What about lda, ldb, ldc?
Thanks!
Since cuBLAS always assume that the matrices are stored in column-major, you could either transpose your matrices first into colum-major by using cublas_geam(), or you could treat your matrix A, stored in row-major, as a new matrix AT stored in column-major. The matrix AT is actually the transpose of A. For B do the same thing. Then you could calculate matrix C stored in column-major by C=AT * BT^T
float* AT = A;
float* BT = B;
The leading dimension is a param related to the storage, which doesn't change no matter if you use the transpose flag CUBLAS_OP_T or not.
lda = num_col_A = num_row_AT = N;
ldb = num_col_B = num_row_BT = N;
ldc = num_row_C = N;
m and n in the cuBLAS GEMM routine are the #rows and #cols of the result matrix C,
m = num_row_C = num_row_AT = num_col_A = N;
n = num_col_C = num_row_BT = num_col_B = N;
k is the common dimension of A^T and B,
k = num_col_AT = num_row_B = M;
Then you could invoke the GEMM routine by
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k, &alpha, AT, lda, BT, ldb, &beta, C, ldc);
If you want the matrix C to be stored in row-major, you could calculate the CT stored in column-major with the formula CT = BT * AT^T by
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, n, m, k, &alpha, BT, ldb, AT, lda, &beta, CT, ldc);
Please note you don't have to swap m and n since C is a square matrix in this case.
I'm trying to write a matrix multiplication code in cuda, which is pretty similar to Nvidia's cuda programming guide, but it is not working. It is supposed to do C=alpha*A*B+beta*C , but for every A,B C remains unchanged.
__global__ void MatMulKernel(int m,int n,int k,double *A,double *B,double *C,double alpha,double beta)
{
double Ctemp = 0.0;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int ind;
for (ind = 0; ind < k; ++ind)
{
Ctemp += A[row+ind*m]*B[ind+col*k];
}
C[row+m*col] = alpha*Ctemp+beta*C[row+m*col];
//C[row+m*col] = Ctemp;
__syncthreads();
}
extern "C" void
local_mm_cuda (const int m, const int n, const int k, const double alpha,
const double *A, const int lda, const double *B, const int ldb,
const double beta, double *C, const int ldc)
{
int row, col;
/* Verify the sizes of lda, ldb, and ldc */
assert (lda >= m);
assert (ldb >= k);
assert (ldc >= m);
// allocating memory for device array
double *dA,*dB,*dC;
size_t sizeA = sizeof(double)*m*k;
size_t sizeB = sizeof(double)*n*k;
size_t sizeC = sizeof(double)*m*n;
cudaMalloc((void**)&dA,sizeA);
cudaMalloc((void**)&dB,sizeB);
cudaMalloc((void**)&dC,sizeC);
cudaMemcpy(dA, A, sizeA, cudaMemcpyHostToDevice);
cudaMemcpy(dB, B, sizeB, cudaMemcpyHostToDevice);
cudaMemcpy(dC, C, sizeC, cudaMemcpyHostToDevice);
// calling matrix multiplication kernal
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid( n/dimBlock.x, m/dimBlock.y);
MatMulKernel<<<dimGrid, dimBlock>>>(m,n,k,dA,dB,dC,alpha,beta);
cudaThreadSynchronize();
// saving C calculated back in C
cudaMemcpy(dC,C, sizeC,cudaMemcpyDeviceToHost);
cudaFree(dA);
cudaFree(dB);
cudaFree(dC);
}
Try to modify
"dim3 dimGrid( n/dimBlock.x, m/dimBlock.y);"
to
"dim3 dimGrid( (n+dimBlock.x-1)/dimBlock.x, (m+dimBlock.y-1)/dimBlock.y); "
I'm looking for some 1D problems in CUDA and HPC, e.g. Black Scholes.
By 1D problems, I mean problems in which all the work is done on 1D arrays. Although matrix multiplication can be expressed in this way, I want problems in which the basic problem is just 1D.
I am trying to develop a 1D library for CUDA and would need some benchmark problems to test it. I realize that a lot of real world problems are expressed as 2D, I would really like to see some real world 1D problems.
Thanks.
EDIT: Thanks for all the answers. It'll be great if the answers contain more HPC problems, e.g. Black Scholes, rather than just generic algorithms.
Thanks.
A common problem in parallel programing is a reduction: You are given an array of numbers and you have to compute a "prefix sum", that is, every element stores a sum of all preceidings elements (+ itself or not. I prefer inclusive).
It is fairly simple problem, but since it is often repeated many times in more complex algorithms, having that efficient is cruicial.
Another common problem is sorting.
There already some papers on that topic, take this one for example:
enter link description here
I think it is a good problem to start with, to solve bigger problems on top of it.
A simple problem you can use for 1 to 3 dimensions is the heat equation. There are several different numerical methods for solving it, some of them can be implementes in parallel.
A method that works at least with OpenMp and MPI is the finite difference method. I suppose if you combine it with a clever stencil you should be able to implement it efficently in Cuda C.
A classical 1D example is provided by the heat equation.
Below, I'm posting a concrete, fully worked CPU/GPU example on this topic exploiting the Jacobi solution scheme. Please, note that two time-step kernels are provided, one not using shared memory and one using shared memory.
#include <stdio.h>
#include <stdlib.h>
#include <thrust\device_vector.h>
#include "Utilities.cuh"
#define BLOCKSIZE 512
/****************************/
/* CPU CALCULATION FUNCTION */
/****************************/
void HeatEquation1DCPU(float * __restrict__ h_T, int *Niter, const float T0, const float Q_N_1, const float dx, const float k, const float rho,
const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
float *h_DeltaT = (float *)malloc(N * sizeof(float));
// --- Enforcing boundary condition at the left end.
*h_T = T0;
h_DeltaT[0] = 0.f;
float current_max;
do {
// --- Internal region between the two boundaries.
for (int i = 1; i < N - 1; i++) h_DeltaT[i] = dt * alpha * ((h_T[i - 1] + h_T[i + 1] - 2.f * h_T[i]) / (dx * dx));
// --- Enforcing boundary condition at the right end.
h_DeltaT[N - 1] = dt * 2.f * ((k * ((h_T[N - 2] - h_T[N - 1]) / dx) + Q_N_1) / (dx * rho * cp));
// --- Update the temperature and find the maximum DeltaT over all nodes
current_max = h_DeltaT[0]; // --- Remember: h_DeltaT[0] = 0
for (int i = 1; i < N; i++)
{
h_T[i] = h_T[i] + h_DeltaT[i]; // h_T[0] keeps
current_max = abs(h_DeltaT[i]) > current_max ? abs(h_DeltaT[i]) : current_max;
}
// --- Increase iteration counter
(*Niter)++;
} while (*Niter < maxIterNumber && current_max > maxErr);
delete [] h_DeltaT;
}
/**************************/
/* GPU CALCULATION KERNEL */
/**************************/
__global__ void HeatEquation1DGPU_IterationKernel(float * __restrict__ d_T, float * __restrict__ d_DeltaT, const float T0, const float Q_N_1, const float dx, const float k, const float rho,
const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < N) {
// --- Internal region between the two boundaries.
if ((tid > 0) && (tid < N - 1) ) d_DeltaT[tid] = dt * alpha *((d_T[tid - 1] + d_T[tid + 1] - 2.f * d_T[tid]) / (dx * dx));
// --- Enforcing boundary condition at the left end.
if (tid == 0) d_DeltaT[0] = 0.f;
// --- Enforcing boundary condition at the right end.
if (tid == N - 1) d_DeltaT[tid] = dt * 2.f * ((k * ((d_T[tid - 1] - d_T[tid]) / dx) + Q_N_1) / (dx * rho * cp));
// --- Update the temperature
d_T[tid] = d_T[tid] + d_DeltaT[tid];
d_DeltaT[tid] = abs(d_DeltaT[tid]);
}
}
__global__ void HeatEquation1DGPU_IterationSharedKernel(float * __restrict__ d_T, float * __restrict__ d_DeltaT, const float T0, const float Q_N_1, const float dx, const float k, const float rho,
const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
// --- Shared memory has 0, 1, ..., BLOCKSIZE - 1, BLOCKSIZE locations, so it has BLOCKSIZE locations + 2 (left and right) halo cells.
__shared__ float d_T_shared[BLOCKSIZE + 2]; // --- Need to know BLOCKSIZE beforehand
if (tid < N) {
// --- Load data from global memory to shared memory locations 1, 2, ..., BLOCKSIZE - 1
d_T_shared[threadIdx.x + 1] = d_T[tid];
// --- Left halo cell
if ((threadIdx.x == 0) && (tid > 0)) { d_T_shared[0] = d_T[tid - 1]; }
// --- Right halo cell
if ((threadIdx.x == blockDim.x - 1) && (tid < N - 1)) { d_T_shared[threadIdx.x + 2] = d_T[tid + 1]; }
__syncthreads();
// --- Internal region between the two boundaries.
if ((tid > 0) && (tid < N - 1) ) d_DeltaT[tid] = dt * alpha *((d_T_shared[threadIdx.x] + d_T_shared[threadIdx.x + 2] - 2.f * d_T_shared[threadIdx.x + 1]) / (dx * dx));
// --- Enforcing boundary condition at the left end.
if (tid == 0) d_DeltaT[0] = 0.f;
// --- Enforcing boundary condition at the right end.
if (tid == N - 1) d_DeltaT[tid] = dt * 2.f * ((k * ((d_T_shared[threadIdx.x] - d_T_shared[threadIdx.x + 1]) / dx) + Q_N_1) / (dx * rho * cp));
// --- Update the temperature
d_T[tid] = d_T[tid] + d_DeltaT[tid];
d_DeltaT[tid] = abs(d_DeltaT[tid]);
}
}
/****************************/
/* GPU CALCULATION FUNCTION */
/****************************/
void HeatEquation1DGPU(float * __restrict__ d_T, int *Niter, const float T0, const float Q_N_1, const float dx, const float k, const float rho,
const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
// --- Absolute values of DeltaT
float *d_DeltaT; gpuErrchk(cudaMalloc(&d_DeltaT, N * sizeof(float)));
// --- Enforcing boundary condition at the left end.
gpuErrchk(cudaMemcpy(d_T, &T0, sizeof(float), cudaMemcpyHostToDevice));
float current_max = 0.f;
do {
//HeatEquation1DGPU_IterationKernel<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_T, d_DeltaT, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);
HeatEquation1DGPU_IterationSharedKernel<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_T, d_DeltaT, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);
thrust::device_ptr<float> d = thrust::device_pointer_cast(d_DeltaT);
current_max = thrust::reduce(d, d + N, current_max, thrust::maximum<float>());
// --- Increase iteration counter
(*Niter)++;
} while (*Niter < maxIterNumber && current_max > maxErr);
gpuErrchk(cudaFree(d_DeltaT));
}
/********/
/* MAIN */
/********/
int main()
{
// --- See https://en.wikipedia.org/wiki/Thermal_diffusivity
// --- Parameters of the problem
const float k = 0.19f; // --- Thermal conductivity [W / (m * K)]
const float rho = 930.f; // --- Density [kg / m^3]
const float cp = 1340.f; // --- Specific heat capacity [J / (kg * K)]
const float alpha = k / (rho * cp); // --- Thermal diffusivity [m^2 / s]
const float length = 1.6f; // --- Total length of the domain [m]
const int N = 64 * BLOCKSIZE; // --- Number of grid points
const float dx = (length / (float)(N - 1));// --- Discretization step [m]
const float dt = (float)(dx * dx / (4.f * alpha));
// --- Time step [s]
const float T0 = 0.f; // --- Temperature at the first end of the domain [C]
const float Q_N_1 = 10.f; // --- Heat flux at the second end of the domain [W / m^2]
const float maxErr = 1.0e-5f; // --- Maximum admitted DeltaT
const int maxIterNumber = 10.0 / dt; // --- Number of overall time steps
/********************/
/* GPU CALCULATIONS */
/********************/
float *h_T_final_device = (float *)malloc(N * sizeof(float)); // --- Final "host-side" result of GPU calculations
int Niter_GPU = 0; // --- Iteration counter for GPU calculations
// --- Device temperature allocation and initialization
float *d_T; gpuErrchk(cudaMalloc(&d_T, N * sizeof(float)));
gpuErrchk(cudaMemset(d_T, 0, N * sizeof(float)));
// --- GPU calculations
HeatEquation1DGPU(d_T, &Niter_GPU, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);
// --- Transfer the GPU calculation results from device to host
gpuErrchk(cudaMemcpy(h_T_final_device, d_T, N * sizeof(float), cudaMemcpyDeviceToHost));
/********************/
/* CPU CALCULATIONS */
/********************/
// --- Host temperature allocation and initialization
float *h_T_final_host = (float *)malloc(N * sizeof(float));
memset(h_T_final_host, 0, N * sizeof(float));
int Niter_CPU = 0;
HeatEquation1DCPU(h_T_final_host, &Niter_CPU, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);
/************************/
/* CHECKING THE RESULTS */
/************************/
for (int i = 0; i < N; i++) {
printf("Node = %i; T_host = %3.10f; T_device = %3.10f\n", i, h_T_final_host[i], h_T_final_device[i]);
if (h_T_final_host[i] != h_T_final_device[i]) {
printf("Error at i = %i; T_host = %f; T_device = %f\n", i, h_T_final_host[i], h_T_final_device[i]);
return 0;
}
}
printf("Test passed!\n");
delete [] h_T_final_device;
gpuErrchk(cudaFree(d_T));
return 0;
}
Reduction (finding min, max or sum of array) and Sorting are best examples of 1D problems. There can be many variables of these algorithms like sorting on structures etc