Reducing memory hit in CUDA parallelization - cuda

I have a loop that I am trying to parallelize in CUDA. It goes something like this:
// Scratch buffer shared by all iterations; its contents at the start of an
// iteration are irrelevant to the computation.
float *buf = new float[buf_size]; // buf_size <= 100
// The counter must start at 0: reading an uninitialized `int j` is undefined behaviour.
for (int j = 0; j < N; j++) {
    caluculate_with(buf);
}
delete [] buf;
The nature of the loop is that the values in the buffer array at the beginning of each iteration do not matter, so the loop itself can be parallelized quite trivially.
But in CUDA, I now need a much larger buffer because of asynchronous call to kernel.
// Kernel: every thread runs caluculate_with on its own buf_size-float slice of buf_gpu.
__global__ void loop_kernel(float *buf_gpu) {
    // Thread index as produced by the index_gpu helper (defined elsewhere;
    // presumably a flat global thread index - confirm).
    const int thread_id = index_gpu(blockIdx, blockDim, threadIdx);
    float *slice = &buf_gpu[thread_id * buf_size];
    caluculate_with(slice);
}
....
// One buf_size-float slice per loop iteration, so the allocation grows linearly with N.
const size_t buf_gpu_bytes = sizeof(float) * N * buf_size;
float *buf_gpu;
cudaMalloc(&buf_gpu, buf_gpu_bytes);
loop_kernel<<<mesh, block>>>(buf_gpu);
cudaFree(buf_gpu);
}
Since each call to the kernel gets its own segment of the buffer, the buffer size now scales with the loop size N, which is obviously problematic. Instead of using (buffer size) amount of memory, I now have to allocate (buffer size * loop size). The GPU memory limit of my GTX590 is hit for a somewhat typical value of N in the problem I am working on.
EDIT: elaborate on my other attempt.
Since the buf_size is not too big, I also tried rewriting the kernel like this:
// Kernel variant: each thread takes its scratch buffer from the device heap instead of a
// pre-allocated global array. The heap is sized on the host via cudaLimitMallocHeapSize.
__global__ void loop_kernel() {
    float *buf = new float[buf_size];
    // In-kernel allocation signals heap exhaustion by returning a null pointer (documented
    // for device malloc(); device operator new is expected to behave the same - TODO confirm).
    // Dereferencing it would abort the whole launch, so bail out instead.
    // NOTE(review): a thread that returns here does no work; size the heap so this never triggers.
    if (buf == NULL) {
        return;
    }
    caluculate_with(buf);
    delete [] buf;
}
...
// Execute every CUDA call unconditionally and assert only on the stored status:
// an expression written inside assert() is not evaluated at all when NDEBUG is defined,
// which would silently drop the heap-size setup and the synchronization.
cudaError_t status = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 8*1024*1024);
assert(cudaSuccess == status);
loop_kernel<<<mesh,block>>>();
status = cudaGetLastError();       // reports launch-configuration errors
assert(cudaSuccess == status);
status = cudaDeviceSynchronize();  // reports errors raised while the kernel was running
assert(cudaSuccess == status);
(void)status;                      // keeps NDEBUG builds free of unused-variable warnings
The cudaDeviceSynchronize() assertion fails with return status 4. No idea what that means.

You haven't told us anything about calculate_with() so it's not clear if any of that is parallelizable, but that is certainly something that may be worth investigating.
One approach, however, is simply to limit your buffer size to what can be handled by GPU memory, and then call the kernel in a loop based on that buffer size:
// Kernel: every thread runs caluculate_with on its own buf_size-float slice of buf_gpu.
// The buffer only has to hold one slice per thread of a single launch.
__global__ void loop1_kernel(float *buf_gpu) {
    // Thread index as produced by the index_gpu helper (defined elsewhere).
    const int thread_id = index_gpu(blockIdx, blockDim, threadIdx);
    float *slice = &buf_gpu[thread_id * buf_size];
    caluculate_with(slice);
}
....
// Device buffer sized for one batch of num_buffs slices; it is reused by every launch.
float * buf_gpu;
cudaMalloc(&buf_gpu,sizeof(float)*num_buffs*buf_size);
// Run the work in N/num_buffs batches (the original loop header was missing a ')').
// NOTE(review): N/num_buffs truncates if both are integers, so any remainder is not
// processed - confirm N is a multiple of num_buffs or add a tail launch.
for (int j=0; j<(N/num_buffs); j++){
    // Launch the kernel defined in this answer (the original call named loop_kernel).
    loop1_kernel<<<mesh,block>>>(buf_gpu);
    // Blocking copy: it also guarantees the kernel has finished before the buffer is reused.
    cudaMemcpy(host_data, buf_gpu, (sizeof(float)*num_buffs*buf_size), cudaMemcpyDeviceToHost);
}
cudaFree(buf_gpu);
}
Obviously, the cudaMemcpy line only needs to be whatever data is actually produced that needs to be saved from the kernel operation.

Related

Can I run a CUDA device function without parallelization or calling it as part of a kernel?

I have a program that loads an image onto a CUDA device, analyzes it with cufft and some custom stuff, and updates a single number on the device which the host then queries as needed. The analysis is mostly parallelized, but the last step sums everything up (using thrust::reduce) for a couple final calculations that aren't parallel.
Once everything is reduced, there's nothing to parallelize, but I can't figure out how to just run a device function without calling it as its own tiny kernel with <<<1, 1>>>. That seems like a hack. Is there a better way to do this? Maybe a way to tell the parallelized kernel "just do these last lines once after the parallel part is finished"?
I feel like this must have been asked before, but I can't find it. Might just not know what to search for though.
Code snip below, I hope I didn't remove anything relevant:
// Device array passed to getDists (written) and setDist (reduced) by fftExec.
float *d_phs_deltas; // Allocated using cudaMalloc (data is on device)
// Device-resident result: accumulated in setDist, copied to the host by getZ.
__device__ float d_Z;
// Kernel: each thread stores one value into phs_deltas at its global 1D thread index.
// NOTE(review): the computation of phs_delta is elided in this listing, and no bounds
// check on i is visible - confirm the launch covers exactly the array length.
static __global__ void getDists(const cufftComplex* data, const bool* valid, float* phs_deltas)
{
// Global 1D thread index.
const int i = blockIdx.x*blockDim.x + threadIdx.x;
// Do stuff with the line indicated by index i
// ...
// Save result into array, gets reduced to single number in setDist
phs_deltas[i] = phs_delta;
}
// Kernel: adds phs2dst times the sum of phs_deltas[0 .. d_y) to the device variable d_Z.
// fftExec launches it as <<<1, 1>>>, i.e. with a single thread.
// NOTE(review): phs2dst and d_y are defined outside this listing.
static __global__ void setDist(const cufftComplex* data, const bool* valid, const float* phs_deltas)
{
// Final step; does it need to be it's own kernel if it only runs once??
// thrust::reduce is called from device code here, using the thrust::device policy.
d_Z += phs2dst * thrust::reduce(thrust::device, phs_deltas, phs_deltas + d_y);
// Save some other stuff to refer to next frame
// ...
}
// Host entry point for one frame: launches getDists and then setDist on the same
// (default) stream, so setDist runs after getDists has finished.
// NOTE(review): the upload/FFT steps are elided, and neither launch is followed by an
// error check in the visible code.
void fftExec(unsigned __int32 *host_data)
{
// Copy image to device, do FFT, etc
// ...
// Last parallel analysis step, sets d_phs_deltas
getDists<<<out_blocks, N_THREADS>>>(d_result, d_valid, d_phs_deltas);
// Should this be a serial part at the end of getDists somehow?
// Single-thread launch that performs the final reduction and updates d_Z.
setDist<<<1, 1>>>(d_result, d_valid, d_phs_deltas);
}
// d_Z is copied out only on request
// Copies the current value of the device variable d_Z into *Z on the host.
void getZ(float *Z)
{
    cudaMemcpyFromSymbol(Z, d_Z, sizeof(float));
}
Thank you!
There is no way to run a device function directly without launching a kernel. As pointed out in comments, there is a working example in the Programming Guide which shows how to use memory fence functions and an atomically incremented counter to signal that a given block is the last block:
// Number of blocks that have published their partial sum so far.
// The last block resets it to 0 so the next launch starts clean.
__device__ unsigned int count = 0;

// Single-launch sum: every block writes a partial sum to result[blockIdx.x]; the block
// that finishes last adds the partial sums up and stores the total in result[0].
//   array  - input values (N elements)
//   result - scratch/output array with at least gridDim.x elements
// calculatePartialSum and calculateTotalSum are helpers defined elsewhere.
__global__ void sum(const float* array, unsigned int N, volatile float* result)
{
    // Set by thread 0, read by the whole block after the barrier below.
    __shared__ bool isLastBlockDone;

    float partialSum = calculatePartialSum(array, N);

    if (threadIdx.x == 0) {
        result[blockIdx.x] = partialSum;

        // Thread 0 makes sure that the incrementation
        // of the "count" variable is only performed after
        // the partial sum has been written to global memory.
        __threadfence();

        // Thread 0 signals that it is done.
        unsigned int value = atomicInc(&count, gridDim.x);

        // Thread 0 determines if its block is the last
        // block to be done.
        isLastBlockDone = (value == (gridDim.x - 1));
    }

    // Synchronize to make sure that each thread reads
    // the correct value of isLastBlockDone.
    __syncthreads();

    if (isLastBlockDone) {
        // The last block sums the partial sums
        // stored in result[0 .. gridDim.x-1].
        // (The declaration below had been swallowed into the preceding comment,
        // leaving totalSum undeclared.)
        float totalSum = calculateTotalSum(result);

        if (threadIdx.x == 0) {
            // Thread 0 of last block stores the total sum
            // to global memory and resets the count
            // variable, so that the next kernel call
            // works properly.
            result[0] = totalSum;
            count = 0;
        }
    }
}
I would recommend benchmarking both ways and choosing which is faster. On most platforms kernel launch latency is only a few microseconds, so a short running kernel to finish an action after a long running kernel can be the most efficient way to get this done.

CUDA streams performance

I am currently learning CUDA streams through the computation of a dot product between two vectors. The ingredients are a kernel function that takes in vectors x and y and returns a vector result of size equal to the number of blocks, where each block contributes its own reduced sum.
I also have a host function dot_gpu that calls the kernel and reduces the vector result to the final dot product value.
The synchronous version does just this:
// copy to device (both input vectors, n elements each)
copy_to_device<double>(x_h, x_d, n);
copy_to_device<double>(y_h, y_d, n);
// kernel launch plus host-side reduction of the per-block results (see dot_gpu)
double result = dot_gpu(x_d, y_d, n, blockNum, blockSize);
while the async one goes like:
// One partial dot product per chunk; chunk c is processed on stream[c].
double result[numChunks];
for (int c = 0; c < numChunks; ++c) {
    const int offset = c * chunkSize;
    // upload this chunk of both vectors
    copy_to_device_async<double>(x_h + offset, x_d + offset, chunkSize, stream[c]);
    copy_to_device_async<double>(y_h + offset, y_d + offset, chunkSize, stream[c]);
    // kernel + host-side reduction for this chunk
    result[c] = dot_gpu(x_d + offset, y_d + offset, chunkSize, blockNum, blockSize, stream[c]);
}
// Combine the per-chunk results and release the streams.
for (int c = 0; c < numChunks; ++c) {
    finalResult = finalResult + result[c];
    cudaStreamDestroy(stream[c]);
}
I am getting worse performance when using streams and was trying to investigate the reasons. I tried to pipeline the downloads, kernel calls and uploads, but with no results.
// Launches dot_gpu_kernel on `stream`, copies the blockNum per-block partial sums back to
// the host, and returns their total. With ASYNC set, the copy is asynchronous into pinned
// memory and the function then waits on an event recorded in `stream`.
double dot_gpu(const double *x, const double* y, int n, int blockNum, int blockSize, cudaStream_t stream=NULL)
{
    // device-side array of per-block results
    double* d_partials = malloc_device<double>(blockNum);
    dot_gpu_kernel<<<blockNum, blockSize, blockSize * sizeof(double), stream>>>(x, y, d_partials, n);

#if ASYNC
    double* h_partials = malloc_host_pinned<double>(blockNum);
    copy_to_host_async<double>(d_partials, h_partials, blockNum, stream);
    // Wait until the copy queued on `stream` has completed.
    CudaEvent copyDone;
    copyDone.record(stream);
    copyDone.wait();
#else
    double* h_partials = malloc_host<double>(blockNum);
    copy_to_host<double>(d_partials, h_partials, blockNum);
#endif

    // final reduction on the host
    double dotProduct = 0.0;
    int block = 0;
    while (block < blockNum) {
        dotProduct += h_partials[block];
        ++block;
    }

    cudaFree(d_partials);
#if ASYNC
    cudaFreeHost(h_partials);
#else
    free(h_partials);
#endif
    return dotProduct;
}
My guess is that the problem is inside the dot_gpu() function, which does more than just call the kernel. Tell me if I understand the following stream execution correctly:
// Pseudocode (not compilable): per stream, queue upload -> kernel -> download.
foreach stream {
// host-to-device copy queued on this stream
cudaMemcpyAsync( device[stream], host[stream], ... stream );
// kernel queued on the same stream
LaunchKernel<<<...stream>>>( ... );
// device-to-host copy queued on the same stream
cudaMemcpyAsync( host[stream], device[stream], ... stream );
}
The host executes all the three instructions without being blocked, since cudaMemcpyAsync and kernel return immediately (however on the GPU they will execute sequentially as they are assigned to the same stream). So host goes on to the next stream (even if stream1 who knows what stage it is at, but who cares.. it's doing his job on the GPU, right?) and executes the three instructions again without being blocked.. and so on and so forth. However, my code blocks the host before it can process the next stream, somewhere inside the dot_gpu() function. Is it because I am allocating & freeing stuff, as well as reducing the array returned by the kernel to a single value?
Assuming your objectified CUDA interface does what the function and method names suggest, there are three reasons why work from subsequent calls to dot_gpu() might not overlap:
Your code explicitly blocks by recording an event and waiting for it.
If it weren't blocking for 1. already, your code would block on the pinned host side allocation and deallocation, as you suspected.
If your code weren't blocking for 2. already, work from subsequent calls to dot_gpu() might still not overlap depending on compute capability. Devices of compute capability 3.0 or lower do not reorder operations even if they are enqueued to different streams.
Even for devices of compute capability 3.5 and higher the number of streams whose operations can be reordered is limited by the CUDA_DEVICE_MAX_CONNECTIONS environment variable, which defaults to 8 and can be set to values as large as 32.

Finding min and max by the cuda kernel reduction

Here is my kernel code
typedef unsigned char Npp8u;
...
// Kernel Implementation
// Global results. They start at the identity of the respective operation so that the
// first atomicMin/atomicMax always wins.
// NOTE: static initializers apply once at module load; reset both values (e.g. with
// cudaMemcpyToSymbol) before launching the kernel a second time.
__device__ unsigned int min_device = 0xFFFFFFFFu;
__device__ unsigned int max_device = 0u;

// Finds the minimum and maximum of data[0 .. numEl) and folds them into
// min_device / max_device.
// Launch layout: 1D grid of 1D blocks with blockDim.x == BLOCKDIM, BLOCKDIM a power of
// two, and enough blocks to cover numEl elements.
__global__ void findMax_Min(Npp8u * data, int numEl){
    const int index = blockDim.x*blockIdx.x + threadIdx.x;
    const int shared_index = threadIdx.x;
    __shared__ Npp8u data_shared_min[BLOCKDIM];
    __shared__ Npp8u data_shared_max[BLOCKDIM];

    // Every thread fills its shared slot. Threads past the end of the data contribute
    // the neutral element (255 for min, 0 for max), which fits in a byte - the original
    // stored 9999 into an 8-bit slot and only for the min array.
    const bool in_range = index < numEl;
    data_shared_min[shared_index] = in_range ? data[index] : (Npp8u)255;
    data_shared_max[shared_index] = in_range ? data[index] : (Npp8u)0;
    // All barriers are outside divergent branches so every thread of the block reaches them.
    __syncthreads();

    // Tree reduction in shared memory.
    for (unsigned int stride = BLOCKDIM/2; stride > 0; stride >>= 1) {
        if (threadIdx.x < stride) {
            const Npp8u other_max = data_shared_max[shared_index + stride];
            const Npp8u other_min = data_shared_min[shared_index + stride];
            if (data_shared_max[shared_index] < other_max) data_shared_max[shared_index] = other_max;
            if (data_shared_min[shared_index] > other_min) data_shared_min[shared_index] = other_min;
        }
        __syncthreads();
    }

    // One atomic per block publishes the block's result.
    if (threadIdx.x == 0) {
        atomicMin(&min_device, (unsigned int)data_shared_min[0]);
        atomicMax(&max_device, (unsigned int)data_shared_max[0]);
    }
}
I have an image that is 512x512 and I want to find the min and max pixel values. data is the 1-D version of the image. This code works for max but not for min value. As I checked from matlab max value is 202 and min value is 10 but it finds 0 for the min value. Here is my kernel codes and memcpy calls
// Host driver: loads a PGM image, uploads it, runs findMax_Min over all pixels and
// prints the min/max read back from the device symbols.
// NOTE(review): none of the CUDA calls below has its return code checked.
int main(){
// Host parameter declarations.
Npp8u * imageHost;
int nWidth, nHeight, nMaxGray;
// Load image to the host.
std::cout << "Load PGM file." << std::endl;
imageHost = LoadPGM("lena_before.pgm", nWidth, nHeight, nMaxGray);
// Device parameter declarations.
Npp8u * imageDevice;
// Host destinations for the two device symbols; same type as min_device/max_device.
unsigned int max, min;
size_t size = sizeof(Npp8u)*nWidth*nHeight;
cudaMalloc((Npp8u**)&imageDevice, size);
cudaMemcpy(imageDevice, imageHost, size, cudaMemcpyHostToDevice);
int numPixels = nWidth*nHeight;
dim3 numThreads(BLOCKDIM);
// Round the block count up so a partial last block is still launched.
dim3 numBlocks(numPixels/BLOCKDIM + (numPixels%BLOCKDIM == 0 ? 0 : 1));
findMax_Min<<<numBlocks, numThreads>>>(imageDevice,numPixels);
// Read the results back from the __device__ variables.
cudaMemcpyFromSymbol(&max,max_device, sizeof(max_device), 0, cudaMemcpyDeviceToHost);
cudaMemcpyFromSymbol(&min,min_device, sizeof(min_device), 0, cudaMemcpyDeviceToHost);
printf("Min value for image : %i\n", min);
printf("Max value for image : %i\n", max);
...
Another interesting thing is that changing the order of the cudaMemcpy calls just after the kernel call also causes a malfunction: both values are then read as zero. I do not see the problem. Can anyone see what I am missing?
You might want to do cuda error checking. You might also want to initialize min_device to a large value and max_device to zero. There are other problems with your reduction method related to stride (what happens in the last block of an odd size image when you add stride to threadIdx.x, it may exceed the defined image range in shared memory) , but I don't think it matters for a 512x512 image. If min_device just happened to start out at zero, all of your atomicMin operations would always leave zero there.
You can try initializing min_device and max_device like this:
// Start the minimum above any 8-bit pixel value so the first atomicMin replaces it.
__device__ unsigned int min_device = 9999;
// Start the maximum at the smallest unsigned value so the first atomicMax replaces it.
__device__ unsigned int max_device = 0;
For the cudamemcpy calls at the end, you are copying 4 bytes (size of max_device) into a one-byte variable (Npp8u max) and likewise for min. So that's a problem. Since you're using pointers, the copy operation is definitely overwriting something that you don't intend. If the compiler stores the variables sequentially the way you have them defined, one copy operation is overwriting the other variable, which I think would explain the behavior you're seeing. If you created min and max as unsigned int quantities, I think this problem would go away.
EDIT: Since you haven't shown your actual block dimensions, it's possible that you still have a problem with your reduction. You might want to change this line:
if(threadIdx.x < stride){
To something like:
if((threadIdx.x < stride) && ((index + stride)< numEl)){
This or something like it should correct the hazard I mention in the first paragraph. I guess you're trying to account for the hazard using this line:
data_shared_min[shared_index] = 9999;
But there's no guarantee that line of code gets executed before the data element that it is setting in shared memory gets read by some other thread. I also don't know what happens when you assign a value of 9999 to a byte quantity, but it's probably not what you expect.

cuda shared memory overwrite?

I am trying to write a parallel prefix scan on cuda by following this tutorial -
I am trying the work-inefficient 'double buffered one' as explained in the tutorial.
This is what I have:
// Double-buffered naive inclusive scan, performed independently inside each block.
//   in - input array, overwritten with the scan result
//   d  - number of doubling iterations requested (e.g. log2 of the scan length)
//   N  - number of valid elements in `in`
// Launch requirements: 1D blocks, and 2 * blockDim.x * sizeof(int) bytes of dynamic
// shared memory passed as the third launch parameter.
// Shared memory is per block, so each block scans only its own blockDim.x elements;
// combining blocks needs an additional pass.
__global__ void prefixsum(int* in, int d, int N)
{
    // global element index (for `in`) and block-local index (for shared memory)
    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
    const int tid = threadIdx.x;

    // A kernel has a single dynamic shared-memory region: two `extern __shared__`
    // arrays would alias each other, so carve both buffers out of one array.
    extern __shared__ int scan_buf[];
    int* temp_in  = scan_buf;
    int* temp_out = scan_buf + blockDim.x;

    // copy data to shared memory; threads past the end contribute 0
    temp_in[tid]  = (idx < N) ? in[idx] : 0;
    temp_out[tid] = 0;
    // block until all threads copy
    __syncthreads();

    // offset doubles each iteration (1, 2, 4, ...). Once it reaches blockDim.x no thread
    // has a partner any more, so stopping there is equivalent and keeps the shift in range.
    // The loop condition is uniform across the block, so the barriers stay safe.
    for (int i = 1, offset = 1; i <= d && offset < (int)blockDim.x; i++, offset <<= 1)
    {
        if (tid >= offset)
        {
            // plain assignment: temp_out still holds the previous iteration's value
            temp_out[tid] = temp_in[tid - offset] + temp_in[tid];
        }
        else
        {
            // if the element is to remain unchanged, copy the same thing
            temp_out[tid] = temp_in[tid];
        }
        // block until all threads do this
        __syncthreads();
        // copy the result to temp_in for next iteration
        temp_in[tid] = temp_out[tid];
        // wait for all threads to do so
        __syncthreads();
    }

    // finally copy everything back to global memory
    if (idx < N)
    {
        in[idx] = temp_in[tid];
    }
}
Can you point out what's wrong with this? I have written comments for what I think should happen.
This is the kernel invocation -
prefixsum<<<dimGrid,dimBlock>>>(d_arr, log(SIZE)/log(2), N);
This is the grid and block allocations:
dim3 dimGrid(numBlocks);
dim3 dimBlock(numThreadsPerBlock);
The problem is that I don't get the correct output for any input that's more than 8 elements long.
I see two problems in your code
Problem 1: extern shared memory
Agh.... I hate extern __shared__ memory. The problem is that the compiler does not know how big the arrays are. As a result, they both point to the same piece of memory!
So, in your case: temp_in[5] and temp_out[5] refer to the same word in shared memory.
If you really want the extern __shared__ memory, you can manually offset the second array, for example something like this:
// Element count of the first array; it is also the offset of the second one.
size_t size = .... //the size of your array
// Single dynamic shared-memory array that backs both buffers.
extern __shared__ int memory[];
// First buffer starts at the beginning of the shared array...
int* temp_in=memory;
// ...and the second starts `size` ints later, so the two no longer alias.
int* temp_out=memory+size;
Problem 2: Shared array index
Shared memory is private for each block. That is, temp[0] in one block can be different than temp[0] in another block. However, you index it by blockIdx.x*blockDim.x + threadIdx.x as if the temp arrays were shared between the blocks.
Instead, you should most likely index your temp arrays just by threadIdx.x.
Of course, the in array is global and you index that one correctly (with idx).

CUDA: Shared memory over a large-ish 2D array

I had a simple CUDA problem for a class assignment, but the professor added an optional task to implement the same algorithm using shared memory instead. I was unable to finish it before the deadline (as in, the turn-in date was a week ago) but I'm still curious so now I'm going to ask the internet ;).
The basic assignment was to implement a bastardized version of a red-black successive over-relaxation both sequentially and in CUDA, make sure you got the same result in both and then compare the speedup. Like I said, doing it with shared memory was an optional +10% add-on.
I'm going to post my working version and pseudocode what I've attempted to do since I don't have the code in my hands at the moment, but I can update this later with the actual code if someone needs it.
Before anyone says it: Yes, I know using CUtil is lame, but it made the comparison and timers easier.
Working global memory version:
#include <stdlib.h>
#include <stdio.h>
#include <cutil_inline.h>
#define N 1024
// Global-memory stencil: for every interior cell, d_B receives the integer average of the
// four direct neighbours read from d_A. Border cells are not written.
__global__ void kernel(int *d_A, int *d_B) {
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

    // flatten (row, col) using the width of the launched grid
    const unsigned int index = row * (gridDim.x * blockDim.x) + col;

    // skip the first/last row and column
    const bool on_border = (col == 0) || (row == 0) || (col >= N-1) || (row >= N-1);
    if (on_border)
        return;

    const int neighbour_sum = d_A[index-1]+d_A[index+1]+d_A[index+N]+d_A[index-N];
    d_B[index] = neighbour_sum / 4;
}
// Host driver: fills A with random values, computes the 4-neighbour average stencil on
// the CPU (into B) and on the GPU (into h_B), compares both results and prints timings.
int main (int argc, char **argv) {
    // Static storage instead of automatics: the two N x N int arrays take 8 MiB, which can
    // exceed the default stack. Static zero-initialization also gives B's border (never
    // written by the CPU loop) the same zeros that d_B/h_B get from cudaMemset/memset, so
    // the full-array comparison below no longer reads uninitialized memory.
    static int A[N][N], B[N][N];
    int *d_A, *d_B; // These are the copies of A and B on the GPU
    int *h_B; // This is a host copy of the output of B from the GPU
    int i, j;
    int num_bytes = N * N * sizeof(int);

    // Input is randomly generated
    for(i=0;i<N;i++) {
        for(j=0;j<N;j++) {
            A[i][j] = rand()/1795831;
            //printf("%d\n",A[i][j]);
        }
    }

    // Time the sequential reference with CUDA events.
    cudaEvent_t start_event0, stop_event0;
    float elapsed_time0;
    CUDA_SAFE_CALL( cudaEventCreate(&start_event0) );
    CUDA_SAFE_CALL( cudaEventCreate(&stop_event0) );
    CUDA_SAFE_CALL( cudaEventRecord(start_event0, 0) );

    // sequential implementation of main computation
    for(i=1;i<N-1;i++) {
        for(j=1;j<N-1;j++) {
            B[i][j] = (A[i-1][j]+A[i+1][j]+A[i][j-1]+A[i][j+1])/4;
        }
    }

    CUDA_SAFE_CALL( cudaEventRecord(stop_event0, 0) );
    CUDA_SAFE_CALL( cudaEventSynchronize(stop_event0) );
    CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time0,start_event0, stop_event0) );

    h_B = (int *)malloc(num_bytes);
    if (h_B == NULL) {
        fprintf(stderr, "Out of host memory\n");
        return 1;
    }
    memset(h_B, 0, num_bytes);

    //ALLOCATE MEMORY FOR GPU COPIES OF A AND B
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_A, num_bytes) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_B, num_bytes) );
    CUDA_SAFE_CALL( cudaMemset(d_A, 0, num_bytes) );
    CUDA_SAFE_CALL( cudaMemset(d_B, 0, num_bytes) );

    //COPY A TO GPU
    CUDA_SAFE_CALL( cudaMemcpy(d_A, A, num_bytes, cudaMemcpyHostToDevice) );

    // create CUDA event handles for timing purposes
    cudaEvent_t start_event, stop_event;
    float elapsed_time;
    CUDA_SAFE_CALL( cudaEventCreate(&start_event) );
    CUDA_SAFE_CALL( cudaEventCreate(&stop_event) );
    CUDA_SAFE_CALL( cudaEventRecord(start_event, 0) );

    // Launch configuration: the grid must span exactly N threads in x because the kernel
    // mixes the launch width with N when addressing neighbours.
    dim3 block_size(256,1,1); //values experimentally determined to be fastest
    dim3 grid_size;
    grid_size.x = N / block_size.x;
    grid_size.y = N / block_size.y;
    kernel<<<grid_size,block_size>>>(d_A,d_B);
    CUDA_SAFE_CALL( cudaGetLastError() ); // launch-configuration errors

    CUDA_SAFE_CALL( cudaEventRecord(stop_event, 0) );
    CUDA_SAFE_CALL( cudaEventSynchronize(stop_event) );
    CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time,start_event, stop_event) );

    //COPY B BACK FROM GPU
    CUDA_SAFE_CALL( cudaMemcpy(h_B, d_B, num_bytes, cudaMemcpyDeviceToHost) );

    // Verify result is correct
    CUTBoolean res = cutComparei( (int *)B, (int *)h_B, N*N);
    printf("Test %s\n",(1 == res)?"Passed":"Failed");
    printf("Elapsed Time for Sequential: \t%.2f ms\n", elapsed_time0);
    printf("Elapsed Time for CUDA:\t%.2f ms\n", elapsed_time);
    printf("CUDA Speedup:\t%.2fx\n",(elapsed_time0/elapsed_time));

    // Release events, device buffers and the host copy.
    CUDA_SAFE_CALL( cudaEventDestroy(start_event0) );
    CUDA_SAFE_CALL( cudaEventDestroy(stop_event0) );
    CUDA_SAFE_CALL( cudaEventDestroy(start_event) );
    CUDA_SAFE_CALL( cudaEventDestroy(stop_event) );
    cudaFree(d_A);
    cudaFree(d_B);
    free(h_B);
    cutilDeviceReset();
    return 0;
}
For the shared memory version, this is what I've tried so far:
#define N 1024
// Pseudocode sketch of a shared-memory stencil: a block keeps three rows of (width + 2)
// ints in shared memory, selected by index_y % 3.
// NOTE(review): i, index and index_y are never declared in this listing.
__global__ void kernel(int *d_A, int *d_B, int width) {
//assuming width is 64 because that's the biggest number I can make it
//each MP has 48KB of shared mem, which is 12K ints, 32 threads/warp, so max 375 ints/thread?
__shared__ int A_sh[3][66];
//get x and y index and turn it into linear index
for(i=0; i < width+2; i++) //have to load 2 extra values due to the -1 and +1 in algo
A_sh[index_y%3][i] = d_A[index+i-1]; //so A_sh[index_y%3][0] is actually d_A[index-1]
__syncthreads(); //and hope that previous and next row have been loaded by other threads in the block?
//ignore boundary conditions because it's pseudocode
// NOTE(review): index_y%3-1 evaluates to -1 when index_y%3 == 0 and index_y%3+1 to 3 when
// index_y%3 == 2, both outside the three rows of A_sh; the sum is also not divided by 4
// as in the global-memory kernel.
for(i=0; i < width; i++)
d_B[index+i] = A_sh[index_y%3][i] + A_sh[index_y%3][i+2] + A_sh[index_y%3-1][i+1] + A_sh[index_y%3+1][i+1];
}
// Host side of the sketch: only the launch configuration differs from the working version.
main(){
//same init as above until threads/grid init
// 32 x 16 threads per block, 32 x 64 blocks.
dim3 threadsperblk(32,16);
dim3 numblks(32,64);
// width is passed as 64 here.
kernel<<<numblks,threadsperblk>>>(d_A,d_B,64);
//rest is the same
}
This shared mem code crashes ("launch failed due to unspecified error") since I haven't caught all the boundary conditions yet, but I'm not worried about that as much as finding the correct way to get things going. I feel that my code is way too complicated to be the correct path (especially compared to the SDK examples), but I also can't see another way to do it since my array doesn't fit into shared mem like all the examples I can find.
And frankly, I'm not sure it would be that much faster on my hardware (GTX 560 Ti - runs the global memory version in 0.121ms), but I need to prove it to myself first :P
Edit 2: For anyone who runs across this in the future, the code in the answer is a good starting point if you want to do some shared memory.
The key to getting the maximum out of these sort of stencil operators in CUDA is data re-usage. I have found that the best approach is usually to have each block "walk" through a dimension of the grid. After the block has loaded an initial tile of data into shared memory, only a single dimension (so row in a row-major order 2D problem ) needs to be read from global memory to have the necessary data in shared memory for the second and subsequent row calculations. The rest of the data can just be reused. To visualise how the shared memory buffer looks through the first four steps of this sort of algorithm:
Three "rows" (a,b,c) of the input grid are loaded to shared memory, and the stencil computed for row (b) and written to global memory
aaaaaaaaaaaaaaaa
bbbbbbbbbbbbbbbb
cccccccccccccccc
Another row (d) is loaded into the shared memory buffer, replacing row (a), and the calculations made for row (c) using a different stencil, reflecting where the row data is in shared memory
dddddddddddddddd
bbbbbbbbbbbbbbbb
cccccccccccccccc
Another row (e) is loaded into the shared memory buffer, replacing row (b), and the calculations made for row (d), using a different stencil from either step 1 or 2.
dddddddddddddddd
eeeeeeeeeeeeeeee
cccccccccccccccc
Another row (f) is loaded into the shared memory buffer, replacing row (c), and the calculations made for row (e). Now the data is back to the same layout as used in step 1, and the same stencil used in step 1 can be used.
dddddddddddddddd
eeeeeeeeeeeeeeee
ffffffffffffffff
The whole cycle repeats until the block has traversed the full column length of the input grid. The reason for using different stencils rather than shifting the data in the shared memory buffer is down to performance - shared memory only has about 1000 Gb/s bandwidth on Fermi, and the shifting of data will become a bottleneck in fully optimal code. You should try different buffer sizes, because you might find that smaller buffers allow for higher occupancy and improved kernel throughput.
EDIT: To give a concrete example of how that might be implemented:
// Copies one element from `in` to `out`. The thread in column 1 additionally copies the
// element to the left, and the thread in column `width` the element to the right, so the
// two edge threads also fetch the row's halo cells.
template<int width>
__device__ void rowfetch(int *in, int *out, int col)
{
    out[0] = in[0];
    if (col == 1) {
        out[-1] = in[-1];   // left halo
    }
    if (col == width) {
        out[1] = in[1];     // right halo
    }
}
// Skeleton of a stencil kernel in which each block walks down the rows of the grid while
// keeping a rolling window of three rows in shared memory.
//   in, out - input and output grids; lda is added to the row pointers to move one row down
//   nrows   - number of loop iterations (rows processed)
// NOTE(review): this is illustrative, not compilable as written - the declaration has no
// return type and uses the reserved word `operator` as its name, `result` is undeclared
// with a placeholder initializer, and the volatile buffer is passed to rowfetch's
// non-volatile int* parameter.
template<int width>
__global__ operator(int *in, int *out, int nrows, unsigned int lda)
{
// shared buffer holds three rows x (width+2) cols(threads)
__shared__ volatile int buffer [3][2+width];
// Global column handled by this thread.
int colid = threadIdx.x + blockIdx.x * blockDim.x;
// Column inside the shared buffer; shifted by one to leave room for the left halo cell.
int tid = threadIdx.x + 1;
int * rowpos = &in[colid], * outpos = &out[colid];
// load the first three rows (compiler will unroll loop)
for(int i=0; i<3; i++, rowpos+=lda) {
rowfetch<width>(rowpos, &buffer[i][tid], tid);
}
__syncthreads(); // shared memory loaded and all threads ready
int brow = 0; // brow is the next buffer row to load data onto
for(int i=0; i<nrows; i++, rowpos+=lda, outpos+=lda) {
// Do stencil calculations - use the value of brow to determine which
// stencil to use
result = ();
// write result to outpos
*outpos = result;
// Fetch another row
__syncthreads(); // Wait until all threads are done calculating
// Overwrite the oldest buffer row with the next input row.
rowfetch<width>(rowpos, &buffer[brow][tid], tid);
brow = (brow < 2) ? (brow+1) : 0; // Increment or roll brow over
__syncthreads(); // Wait until all threads have updated the buffer
}
}