Related
All the examples perform scans on arrays sized by some multiple of 32. The quickest examples use 256 or more threads with 4 or more elements assigned to each thread.
This means that if I had an array of size 450, then, presumably, I would have to pad it out to 512 and use 256 threads with 2 elements assigned to each.
However, in my particular instance, it is not feasible to have to pad out each array.
Is there an alternative solution to handle multiple oddly sized arrays? Is there a way to somehow specify a width?
OK, let's be more clear. This is a simplified example. Say I have 2 arrays: one array is simply a list of integer offsets into the second array, which contains the data. Each offset indicates the beginning of a separate set of data.
Each set of data is randomly sized. I get the data as a chunk from some other process, so there is no easy way to pad them. I want to run BlockScan on each offset from the same kernel.
Let your index (offset) array be idx[]. Let your data array be A[], let the result of the scan be in B[].
Scan the whole array A[], storing the output in B[].
For each offset idx[i] (for i > 0), take the value at index idx[i]-1 in B[], subtract from it the value at index idx[i-1]-1 in B[] (treated as zero when that index is negative), and then subtract that difference from the element at index idx[i] (not minus 1) in A[].
Rescan A to B.
As a simple example:
idx: 0 2 5
0 (original data A): 1 1 1 1 1 1 1 1
1 (first scan, B):   1 2 3 4 5 6 7 8
2 (adjusted A):      1 1 -1 1 1 -2 1 1
3 (second scan, B):  1 2 1 2 3 1 2 3
In the above example, the -1 in step 2 is computed as the scan value in step 1 at index (2-1) minus the scan value in step 1 at index (0-1) (assumed to be zero) which is then subtracted from the original data value. The -2 in step 2 is computed as the scan value in step 1 at index (5-1) minus the scan value in step 1 at index (2-1), subtracted from the original data value.
Here is an example:
$ cat t453.cu
#include <cub/cub.cuh>
#include <iostream>
template <int TPB, int IPT, typename T>
__global__ void k(T *data, int *idx, int n){
// Specialize BlockScan for a 1D block of TPB threads on type T
__shared__ T sdata[TPB*IPT*2];
sdata[threadIdx.x*IPT] = 1;
__syncthreads();
typedef cub::BlockScan<T, TPB> BlockScan;
// Allocate shared memory for BlockScan
__shared__ typename BlockScan::TempStorage temp_storage;
// Obtain a segment of consecutive items that are blocked across threads
int thread_data[IPT];
thread_data[0] = sdata[threadIdx.x*IPT];
// Collectively compute the block-wide inclusive prefix sum
BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
__syncthreads();
sdata[IPT*(threadIdx.x+TPB)] = thread_data[0];
if ((threadIdx.x < n) && (threadIdx.x > 0)) // assumes the first element of idx points to 0
sdata[idx[threadIdx.x]*IPT] -= (sdata[((idx[threadIdx.x]-1)+TPB)*IPT] - ((threadIdx.x == 1)?0:sdata[((idx[threadIdx.x-1]-1)+TPB)*IPT]));
__syncthreads();
thread_data[0] = sdata[threadIdx.x*IPT];
BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
__syncthreads();
data[threadIdx.x] = thread_data[0];
}
typedef int dtype;
const int nTPB = 256;
int main(){
int h_idx[] = {0, 4, 7, 32, 55, 99, 104, 200};
int n = sizeof(h_idx)/sizeof(h_idx[0]);
std::cout << "n = " << n << std::endl;
int *d_idx;
cudaMalloc(&d_idx, n*sizeof(d_idx[0]));
cudaMemcpy(d_idx, h_idx, n*sizeof(h_idx[0]), cudaMemcpyHostToDevice);
dtype *h_data, *d_data;
h_data = new dtype[nTPB];
cudaMalloc(&d_data, nTPB*sizeof(dtype));
k<nTPB, 1><<<1,nTPB>>>(d_data, d_idx, n);
cudaMemcpy(h_data, d_data, nTPB*sizeof(dtype), cudaMemcpyDeviceToHost);
dtype sum;
int idx = 0;
for (int i = 0; i < nTPB; i++){
if (i == h_idx[idx]) {sum = 0; idx++;}
sum++;
std::cout << "gpu: " << h_data[i] << " cpu: " << sum << std::endl;
}
}
$ nvcc -o t453 t453.cu
$ cuda-memcheck ./t453
========= CUDA-MEMCHECK
n = 8
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 26 cpu: 26
gpu: 27 cpu: 27
gpu: 28 cpu: 28
gpu: 29 cpu: 29
gpu: 30 cpu: 30
gpu: 31 cpu: 31
gpu: 32 cpu: 32
gpu: 33 cpu: 33
gpu: 34 cpu: 34
gpu: 35 cpu: 35
gpu: 36 cpu: 36
gpu: 37 cpu: 37
gpu: 38 cpu: 38
gpu: 39 cpu: 39
gpu: 40 cpu: 40
gpu: 41 cpu: 41
gpu: 42 cpu: 42
gpu: 43 cpu: 43
gpu: 44 cpu: 44
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 26 cpu: 26
gpu: 27 cpu: 27
gpu: 28 cpu: 28
gpu: 29 cpu: 29
gpu: 30 cpu: 30
gpu: 31 cpu: 31
gpu: 32 cpu: 32
gpu: 33 cpu: 33
gpu: 34 cpu: 34
gpu: 35 cpu: 35
gpu: 36 cpu: 36
gpu: 37 cpu: 37
gpu: 38 cpu: 38
gpu: 39 cpu: 39
gpu: 40 cpu: 40
gpu: 41 cpu: 41
gpu: 42 cpu: 42
gpu: 43 cpu: 43
gpu: 44 cpu: 44
gpu: 45 cpu: 45
gpu: 46 cpu: 46
gpu: 47 cpu: 47
gpu: 48 cpu: 48
gpu: 49 cpu: 49
gpu: 50 cpu: 50
gpu: 51 cpu: 51
gpu: 52 cpu: 52
gpu: 53 cpu: 53
gpu: 54 cpu: 54
gpu: 55 cpu: 55
gpu: 56 cpu: 56
gpu: 57 cpu: 57
gpu: 58 cpu: 58
gpu: 59 cpu: 59
gpu: 60 cpu: 60
gpu: 61 cpu: 61
gpu: 62 cpu: 62
gpu: 63 cpu: 63
gpu: 64 cpu: 64
gpu: 65 cpu: 65
gpu: 66 cpu: 66
gpu: 67 cpu: 67
gpu: 68 cpu: 68
gpu: 69 cpu: 69
gpu: 70 cpu: 70
gpu: 71 cpu: 71
gpu: 72 cpu: 72
gpu: 73 cpu: 73
gpu: 74 cpu: 74
gpu: 75 cpu: 75
gpu: 76 cpu: 76
gpu: 77 cpu: 77
gpu: 78 cpu: 78
gpu: 79 cpu: 79
gpu: 80 cpu: 80
gpu: 81 cpu: 81
gpu: 82 cpu: 82
gpu: 83 cpu: 83
gpu: 84 cpu: 84
gpu: 85 cpu: 85
gpu: 86 cpu: 86
gpu: 87 cpu: 87
gpu: 88 cpu: 88
gpu: 89 cpu: 89
gpu: 90 cpu: 90
gpu: 91 cpu: 91
gpu: 92 cpu: 92
gpu: 93 cpu: 93
gpu: 94 cpu: 94
gpu: 95 cpu: 95
gpu: 96 cpu: 96
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 26 cpu: 26
gpu: 27 cpu: 27
gpu: 28 cpu: 28
gpu: 29 cpu: 29
gpu: 30 cpu: 30
gpu: 31 cpu: 31
gpu: 32 cpu: 32
gpu: 33 cpu: 33
gpu: 34 cpu: 34
gpu: 35 cpu: 35
gpu: 36 cpu: 36
gpu: 37 cpu: 37
gpu: 38 cpu: 38
gpu: 39 cpu: 39
gpu: 40 cpu: 40
gpu: 41 cpu: 41
gpu: 42 cpu: 42
gpu: 43 cpu: 43
gpu: 44 cpu: 44
gpu: 45 cpu: 45
gpu: 46 cpu: 46
gpu: 47 cpu: 47
gpu: 48 cpu: 48
gpu: 49 cpu: 49
gpu: 50 cpu: 50
gpu: 51 cpu: 51
gpu: 52 cpu: 52
gpu: 53 cpu: 53
gpu: 54 cpu: 54
gpu: 55 cpu: 55
gpu: 56 cpu: 56
========= ERROR SUMMARY: 0 errors
$
This still requires you to pad the "end" of your array to the threadblock size. I'm assuming that should be possible based on your description; it's basically necessary for cub anyway, since cub expects to use every thread in your threadblock.
For larger arrays, the above method could be extended in a straightforward fashion to use DeviceScan. Step 1 is the first scan. Step 2 would be a separate kernel launch. Step 3 is the second scan.
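As a rough illustration of that extension, here is a minimal sketch (my own code, untested; the kernel and wrapper names are made up, and it assumes idx[0] == 0 just like the block-level example):
#include <cub/cub.cuh>
// step 2: at each segment start (skipping the first), subtract the running total
// of all preceding data from the first element of that segment
// (assumes idx[0] == 0, as in the block-level example)
__global__ void fixup(int *A, const int *B, const int *idx, int n){
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if ((i > 0) && (i < n))
    A[idx[i]] -= (B[idx[i]-1] - ((i == 1)?0:B[idx[i-1]-1]));
}
void segmented_scan(int *d_A, int *d_B, int *d_idx, int num_items, int n){
  void *d_temp = NULL; size_t temp_bytes = 0;
  // step 1: scan A into B (the first call only sizes the temporary storage)
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_A, d_B, num_items);
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_A, d_B, num_items);
  // step 2: fix up the segment boundaries in A (separate kernel launch)
  fixup<<<(n+255)/256, 256>>>(d_A, d_B, d_idx, n);
  // step 3: rescan A into B
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_A, d_B, num_items);
  cudaFree(d_temp);
}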
If you want to have each threadblock perform a scan on a segment, you don't need to pad each segment. You only need to pad the "end" of the array so that the last scan will be OK, and even this "pad" operation can be accomplished with a conditional load, instead of an actual pad operation. Here's an example:
$ cat t455.cu
#include <cub/cub.cuh>
#include <iostream>
template <int TPB, int IPT, typename T>
__global__ void k(T *data, int *idx){
int lidx = threadIdx.x;
// Specialize BlockScan for a 1D block of TPB threads on type T
typedef cub::BlockScan<T, TPB> BlockScan;
// Allocate shared memory for BlockScan
__shared__ typename BlockScan::TempStorage temp_storage;
// Obtain a segment of consecutive items that are blocked across threads
int thread_data[IPT];
thread_data[0] = ((lidx+idx[blockIdx.x])>=idx[blockIdx.x+1])?0:data[lidx+idx[blockIdx.x]];
// Collectively compute the block-wide inclusive prefix sum
BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
__syncthreads();
if ((lidx+idx[blockIdx.x]) < idx[blockIdx.x+1])
data[lidx+idx[blockIdx.x]] = thread_data[0];
}
typedef int dtype;
const int nTPB = 128; // sized with IPT to handle the largest segment
const int DS = 256;
int main(){
int h_idx[] = {0, 4, 7, 32, 55, 99, 104, 200, 256};
int n = sizeof(h_idx)/sizeof(h_idx[0]);
std::cout << "n = " << n << std::endl;
int *d_idx;
cudaMalloc(&d_idx, n*sizeof(d_idx[0]));
cudaMemcpy(d_idx, h_idx, n*sizeof(h_idx[0]), cudaMemcpyHostToDevice);
dtype *h_data, *d_data;
h_data = new dtype[DS];
for (int i = 0; i < DS; i++) h_data[i] = 1;
cudaMalloc(&d_data, DS*sizeof(dtype));
cudaMemcpy(d_data, h_data, DS*sizeof(h_data[0]), cudaMemcpyHostToDevice);
k<nTPB, 1><<<n-1,nTPB>>>(d_data, d_idx);
cudaMemcpy(h_data, d_data, DS*sizeof(dtype), cudaMemcpyDeviceToHost);
dtype sum;
int idx = 0;
for (int i = 0; i < DS; i++){
if (i == h_idx[idx]) {sum = 0; idx++;}
sum++;
std::cout << "gpu: " << h_data[i] << " cpu: " << sum << std::endl;
}
}
$ nvcc -o t455 t455.cu
$ cuda-memcheck ./t455
========= CUDA-MEMCHECK
n = 9
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 26 cpu: 26
gpu: 27 cpu: 27
gpu: 28 cpu: 28
gpu: 29 cpu: 29
gpu: 30 cpu: 30
gpu: 31 cpu: 31
gpu: 32 cpu: 32
gpu: 33 cpu: 33
gpu: 34 cpu: 34
gpu: 35 cpu: 35
gpu: 36 cpu: 36
gpu: 37 cpu: 37
gpu: 38 cpu: 38
gpu: 39 cpu: 39
gpu: 40 cpu: 40
gpu: 41 cpu: 41
gpu: 42 cpu: 42
gpu: 43 cpu: 43
gpu: 44 cpu: 44
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 26 cpu: 26
gpu: 27 cpu: 27
gpu: 28 cpu: 28
gpu: 29 cpu: 29
gpu: 30 cpu: 30
gpu: 31 cpu: 31
gpu: 32 cpu: 32
gpu: 33 cpu: 33
gpu: 34 cpu: 34
gpu: 35 cpu: 35
gpu: 36 cpu: 36
gpu: 37 cpu: 37
gpu: 38 cpu: 38
gpu: 39 cpu: 39
gpu: 40 cpu: 40
gpu: 41 cpu: 41
gpu: 42 cpu: 42
gpu: 43 cpu: 43
gpu: 44 cpu: 44
gpu: 45 cpu: 45
gpu: 46 cpu: 46
gpu: 47 cpu: 47
gpu: 48 cpu: 48
gpu: 49 cpu: 49
gpu: 50 cpu: 50
gpu: 51 cpu: 51
gpu: 52 cpu: 52
gpu: 53 cpu: 53
gpu: 54 cpu: 54
gpu: 55 cpu: 55
gpu: 56 cpu: 56
gpu: 57 cpu: 57
gpu: 58 cpu: 58
gpu: 59 cpu: 59
gpu: 60 cpu: 60
gpu: 61 cpu: 61
gpu: 62 cpu: 62
gpu: 63 cpu: 63
gpu: 64 cpu: 64
gpu: 65 cpu: 65
gpu: 66 cpu: 66
gpu: 67 cpu: 67
gpu: 68 cpu: 68
gpu: 69 cpu: 69
gpu: 70 cpu: 70
gpu: 71 cpu: 71
gpu: 72 cpu: 72
gpu: 73 cpu: 73
gpu: 74 cpu: 74
gpu: 75 cpu: 75
gpu: 76 cpu: 76
gpu: 77 cpu: 77
gpu: 78 cpu: 78
gpu: 79 cpu: 79
gpu: 80 cpu: 80
gpu: 81 cpu: 81
gpu: 82 cpu: 82
gpu: 83 cpu: 83
gpu: 84 cpu: 84
gpu: 85 cpu: 85
gpu: 86 cpu: 86
gpu: 87 cpu: 87
gpu: 88 cpu: 88
gpu: 89 cpu: 89
gpu: 90 cpu: 90
gpu: 91 cpu: 91
gpu: 92 cpu: 92
gpu: 93 cpu: 93
gpu: 94 cpu: 94
gpu: 95 cpu: 95
gpu: 96 cpu: 96
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 26 cpu: 26
gpu: 27 cpu: 27
gpu: 28 cpu: 28
gpu: 29 cpu: 29
gpu: 30 cpu: 30
gpu: 31 cpu: 31
gpu: 32 cpu: 32
gpu: 33 cpu: 33
gpu: 34 cpu: 34
gpu: 35 cpu: 35
gpu: 36 cpu: 36
gpu: 37 cpu: 37
gpu: 38 cpu: 38
gpu: 39 cpu: 39
gpu: 40 cpu: 40
gpu: 41 cpu: 41
gpu: 42 cpu: 42
gpu: 43 cpu: 43
gpu: 44 cpu: 44
gpu: 45 cpu: 45
gpu: 46 cpu: 46
gpu: 47 cpu: 47
gpu: 48 cpu: 48
gpu: 49 cpu: 49
gpu: 50 cpu: 50
gpu: 51 cpu: 51
gpu: 52 cpu: 52
gpu: 53 cpu: 53
gpu: 54 cpu: 54
gpu: 55 cpu: 55
gpu: 56 cpu: 56
========= ERROR SUMMARY: 0 errors
$
I want to implement a basic blocked load and warp transpose using CUDA 9.0's shuffle operations. I'm aware of the cub and trove implementations, but I'm restricted to compiling with nvrtc, and the standard header includes make those libraries difficult to accommodate. I'm not looking for anything fancy, just some integer, float and double shuffles on data whose dimensions are a power of 2.
Visualising an example with warp size 8, I want to go from:
correlation
0 1 2 3
lane 0 0 8 16 24
lane 1 1 9 17 25
lane 2 2 10 18 26
lane 3 3 11 19 27
lane 4 4 12 20 28
lane 5 5 13 21 29
lane 6 6 14 22 30
lane 7 7 15 23 31
to this structure:
correlation
0 1 2 3
lane 0 0 1 2 3
lane 1 8 9 10 11
lane 2 16 17 18 19
lane 3 24 25 26 27
lane 4 4 5 6 7
lane 5 12 13 14 15
lane 6 20 21 22 23
lane 7 28 29 30 31
I feel this should be really simple but I can't figure out what I've done incorrectly. I think that the basic transposition loop should look like:
int loads[ncorrs];
int values[ncorrs];
int lane_id = threadIdx.x & (warp_size - 1);
// 0 0 0 0 4 4 4 4 8 8 8 8 ....
int base_idx = lane_id & (warp_size - ncorrs);
// 0 1 2 3 0 1 2 3 0 1 2 3
int src_corr = lane_id & (ncorrs - 1);
for(int corr=0; corr < ncorrs; ++corr)
{
int src_lane = base_idx + corr;
values[corr] = __shfl_sync(mask, loads[src_corr],
src_lane, warp_size);
}
So given the example data above, if we're in lane 5, I expect that the following indexing should occur:
base_idx == 4;
src_corr == 1;
corr == [0, 1, 2, 3]
src_lane == [4, 5, 6, 7]
values == [12, 13, 14 15]
But instead the following is happening (33's are from later in the data):
correlation
0 1 2 3
lane 0 0 0 0 0
lane 1 4 4 4 4
lane 2 12 12 12 12
lane 3 16 16 16 16
lane 4 20 20 20 20
lane 5 24 24 24 24
lane 6 28 28 28 28
lane 7 33 33 33 33
What am I doing incorrectly? Full implementation for a warp size of 32:
#include <cstdlib>
#include <cstdio>
#include "cuda.h"
#define ncorr 4
#define warp_size 32
template <int ncorrs>
__global__ void kernel(
int * input,
int * output,
int N)
{
// This should provide 0 0 0 0 4 4 4 4 8 8 8 8 ...
#define base_idx(lane_id) (lane_id & (warp_size - ncorrs))
// This should provide 0 1 2 3 0 1 2 3 0 1 2 3
#define corr_idx(lane_id) (lane_id & (ncorrs - 1))
int n = blockIdx.x*blockDim.x + threadIdx.x;
int lane_id = threadIdx.x & (warp_size - 1);
if(n >= N)
{ return; }
// Input correlation handled by this thread
int src_corr = corr_idx(lane_id);
int mask = __activemask();
if(threadIdx.x == 0)
{ printf("mask %d\n", mask); }
int loads[ncorrs];
int values[ncorrs];
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{ loads[corr] = input[n + corr*N]; }
__syncthreads();
printf("[%d, %d] %d %d %d %d\n",
lane_id, base_idx(lane_id),
loads[0], loads[1],
loads[2], loads[3]);
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{
int src_lane = base_idx(lane_id) + corr;
values[corr] = __shfl_sync(mask, loads[src_corr],
src_lane, warp_size);
}
printf("[%d, %d] %d %d %d %d\n",
lane_id, base_idx(lane_id),
values[0], values[1],
values[2], values[3]);
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{ output[n + corr*N] = values[corr]; }
}
void print_data(int * data, int N)
{
for(int n=0; n < N; ++n)
{
printf("% -3d: ", n);
for(int c=0; c < ncorr; ++c)
{
printf("%d ", data[n*ncorr + c]);
}
printf("\n");
}
}
int main(void)
{
int * host_input;
int * host_output;
int * device_input;
int * device_output;
int N = 32;
host_input = (int *) malloc(sizeof(int)*N*ncorr);
host_output = (int *) malloc(sizeof(int)*N*ncorr);
printf("malloc done\n");
cudaMalloc((void **) &device_input, sizeof(int)*N*ncorr);
cudaMalloc((void **) &device_output, sizeof(int)*N*ncorr);
printf("cudaMalloc done\n");
for(int i=0; i < N*ncorr; ++i)
{ host_input[i] = i; }
print_data(host_input, N);
dim3 block(256, 1, 1);
dim3 grid((block.x + N - 1) / N, 1, 1);
cudaMemcpy(device_input, host_input,
sizeof(int)*N*ncorr, cudaMemcpyHostToDevice);
printf("memcpy done\n");
kernel<4> <<<grid, block>>> (device_input, device_output, N);
cudaMemcpy(host_output, device_output,
sizeof(int)*N*ncorr, cudaMemcpyDeviceToHost);
print_data(host_output, N);
cudaFree(device_input);
cudaFree(device_output);
free(host_input);
free(host_output);
}
Edit 1: Clarified that the visual example has a warp size of 8 while the full code caters for a warp size of 32
What am I doing incorrectly?
TL;DR: In short, you are transmitting the same input value to multiple output values. Here is one example, in this line of code:
values[corr] = __shfl_sync(mask, loads[src_corr],
src_lane, warp_size);
The quantity represented by loads[src_corr] is loop-invariant. Therefore you are transmitting that value to 4 warp lanes (over the 4 loop iterations) which means that value is occupying 4 output values (which is exactly what your printout data shows). That can't be right for a transpose.
Taking a somewhat longer view, with another example from your code:
I'm not sure I can read your mind, but possibly you may be confused about the warp shuffle operation. Possibly you have assumed that the destination lane can choose which value from the source lane loads[] array is desired. This is not the case. The destination lane only gets to select whatever is the value provided by the source lane. Let's take a look at your loop:
// This should provide 0 0 0 0 4 4 4 4 8 8 8 8 ...
#define base_idx(lane_id) (lane_id & (warp_size - ncorrs))
// This should provide 0 1 2 3 0 1 2 3 0 1 2 3
#define corr_idx(lane_id) (lane_id & (ncorrs - 1))
int n = blockIdx.x*blockDim.x + threadIdx.x;
int lane_id = threadIdx.x & (warp_size - 1);
...
// Input correlation handled by this thread
int src_corr = corr_idx(lane_id);
int mask = __activemask();
...
int loads[ncorrs];
int values[ncorrs];
...
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{
int src_lane = base_idx(lane_id) + corr;
values[corr] = __shfl_sync(mask, loads[src_corr], src_lane, warp_size);
}
On the first pass of the above loop, the src_lane for warp lanes 0, 1, 2, and 3 are all going to be 0. This is evident from the above excerpted code, or print it out if you're not sure. That means warp lanes 0-3 are going to be requesting whatever value is provided by warp lane 0. The value provided by warp lane 0 is loads[src_corr], but the interpretation of src_corr here is whatever value it has for warp lane 0. Therefore one and only one value will be distributed to warp lanes 0-3. This could not possibly be correct for a transpose; no input value shows up in 4 places in the output.
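As a small aside, the point is easy to demonstrate in isolation. The following toy program (my own, not derived from your code) has every lane request from source lane 0, and every lane therefore receives the identical value that lane 0 supplies:
#include <cstdio>
__global__ void shfl_demo(){
    int lane = threadIdx.x & 31;
    int my_val = lane * 10;                       // each lane holds a different value
    // every lane names source lane 0, so every lane receives lane 0's value (0)
    int got = __shfl_sync(0xFFFFFFFF, my_val, 0);
    printf("lane %2d received %d\n", lane, got);
}
int main(){
    shfl_demo<<<1,32>>>();
    cudaDeviceSynchronize();
    return 0;
}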
To fix this, we will need to modify the calculation of both src_lane and src_corr. We will also need to modify the storage location (index) per warp lane at each pass of the loop (I'm calling this new variable dest). We can think of src_lane as defining the target value that my thread will receive. We can think of src_corr as defining which of my values I will publish to some other thread on that loop iteration. dest is the location in my values[] array where I will store the currently received value. We can deduce the necessary pattern by carefully studying the relationship between the input values in loads[] and the desired output locations in values[], taking into account the appropriate warp lanes for source and destination. On the first pass of the loop, we desire this pattern:
warp lane: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
src_lane: 0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27 4 ... (where my data comes from)
src_corr: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 ... (which value I am transmitting)
dest: 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 ... (where I store the received value)
On the second pass of the loop, we desire this pattern:
warp lane: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
src_lane: 8 16 24 0 9 17 25 1 10 18 26 2 11 19 27 3 12 ... (where my data comes from)
src_corr: 3 3 3 3 3 3 3 3 0 0 0 0 0 0 0 0 1 ... (which value I am transmitting)
dest: 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 ... (where I store the received value)
with corresponding changes for the 3rd and 4th pass of the loop. If we realize those patterns in code for your shuffle loop, it could look something like this:
$ cat t352.cu
#include <cstdlib>
#include <cstdio>
#include <assert.h>
#define ncorr 4
#define warp_size 32
template <int ncorrs>
__global__ void kernel(
int * input,
int * output,
int N)
{
// This should provide 0 0 0 0 4 4 4 4 8 8 8 8 ...
#define base_idx(lane_id) (lane_id & (warp_size - ncorrs))
// This should provide 0 1 2 3 0 1 2 3 0 1 2 3
#define corr_idx(lane_id) (lane_id & (ncorrs - 1))
int n = blockIdx.x*blockDim.x + threadIdx.x;
int lane_id = threadIdx.x & (warp_size - 1);
if(n >= N)
{ return; }
// Input correlation handled by this thread
int mask = __activemask();
if(threadIdx.x == 0)
{ printf("mask %d\n", mask); }
int loads[ncorrs];
int values[ncorrs];
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{ loads[corr] = input[n + corr*N]; }
__syncthreads();
printf("[%d, %d] %d %d %d %d\n",
lane_id, base_idx(lane_id),
loads[0], loads[1],
loads[2], loads[3]);
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{
int src_lane = ((lane_id+corr)%ncorrs)*(warp_size/ncorrs) + (lane_id/ncorrs);
int src_corr = ((ncorrs-corr)+(lane_id/(warp_size/ncorrs)))%ncorrs;
int dest = (lane_id+corr)%ncorrs;
values[dest] = __shfl_sync(mask, loads[src_corr],
src_lane, warp_size);
}
printf("[%d, %d] %d %d %d %d\n",
lane_id, base_idx(lane_id),
values[0], values[1],
values[2], values[3]);
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{ output[n + corr*N] = values[corr]; }
}
void print_data(int * data, int N)
{
for(int n=0; n < N; ++n)
{
printf("% -3d: ", n);
for(int c=0; c < ncorr; ++c)
{
printf("%d ", data[n*ncorr + c]);
}
printf("\n");
}
}
int main(void)
{
int * host_input;
int * host_output;
int * device_input;
int * device_output;
int N = 32;
host_input = (int *) malloc(sizeof(int)*N*ncorr);
host_output = (int *) malloc(sizeof(int)*N*ncorr);
printf("malloc done\n");
cudaMalloc((void **) &device_input, sizeof(int)*N*ncorr);
cudaMalloc((void **) &device_output, sizeof(int)*N*ncorr);
printf("cudaMalloc done\n");
for(int i=0; i < N*ncorr; ++i)
{ host_input[i] = i; }
print_data(host_input, N);
dim3 block(256, 1, 1);
dim3 grid((block.x + N - 1) / N, 1, 1);
cudaMemcpy(device_input, host_input,
sizeof(int)*N*ncorr, cudaMemcpyHostToDevice);
printf("memcpy done\n");
kernel<4> <<<grid, block>>> (device_input, device_output, N);
cudaMemcpy(host_output, device_output,
sizeof(int)*N*ncorr, cudaMemcpyDeviceToHost);
print_data(host_output, N);
cudaFree(device_input);
cudaFree(device_output);
free(host_input);
free(host_output);
}
$ nvcc -o t352 t352.cu
$ cuda-memcheck ./t352
========= CUDA-MEMCHECK
malloc done
cudaMalloc done
0 : 0 1 2 3
1 : 4 5 6 7
2 : 8 9 10 11
3 : 12 13 14 15
4 : 16 17 18 19
5 : 20 21 22 23
6 : 24 25 26 27
7 : 28 29 30 31
8 : 32 33 34 35
9 : 36 37 38 39
10: 40 41 42 43
11: 44 45 46 47
12: 48 49 50 51
13: 52 53 54 55
14: 56 57 58 59
15: 60 61 62 63
16: 64 65 66 67
17: 68 69 70 71
18: 72 73 74 75
19: 76 77 78 79
20: 80 81 82 83
21: 84 85 86 87
22: 88 89 90 91
23: 92 93 94 95
24: 96 97 98 99
25: 100 101 102 103
26: 104 105 106 107
27: 108 109 110 111
28: 112 113 114 115
29: 116 117 118 119
30: 120 121 122 123
31: 124 125 126 127
memcpy done
mask -1
[0, 0] 0 32 64 96
[1, 0] 1 33 65 97
[2, 0] 2 34 66 98
[3, 0] 3 35 67 99
[4, 4] 4 36 68 100
[5, 4] 5 37 69 101
[6, 4] 6 38 70 102
[7, 4] 7 39 71 103
[8, 8] 8 40 72 104
[9, 8] 9 41 73 105
[10, 8] 10 42 74 106
[11, 8] 11 43 75 107
[12, 12] 12 44 76 108
[13, 12] 13 45 77 109
[14, 12] 14 46 78 110
[15, 12] 15 47 79 111
[16, 16] 16 48 80 112
[17, 16] 17 49 81 113
[18, 16] 18 50 82 114
[19, 16] 19 51 83 115
[20, 20] 20 52 84 116
[21, 20] 21 53 85 117
[22, 20] 22 54 86 118
[23, 20] 23 55 87 119
[24, 24] 24 56 88 120
[25, 24] 25 57 89 121
[26, 24] 26 58 90 122
[27, 24] 27 59 91 123
[28, 28] 28 60 92 124
[29, 28] 29 61 93 125
[30, 28] 30 62 94 126
[31, 28] 31 63 95 127
[0, 0] 0 8 16 24
[1, 0] 32 40 48 56
[2, 0] 64 72 80 88
[3, 0] 96 104 112 120
[4, 4] 1 9 17 25
[5, 4] 33 41 49 57
[6, 4] 65 73 81 89
[7, 4] 97 105 113 121
[8, 8] 2 10 18 26
[9, 8] 34 42 50 58
[10, 8] 66 74 82 90
[11, 8] 98 106 114 122
[12, 12] 3 11 19 27
[13, 12] 35 43 51 59
[14, 12] 67 75 83 91
[15, 12] 99 107 115 123
[16, 16] 4 12 20 28
[17, 16] 36 44 52 60
[18, 16] 68 76 84 92
[19, 16] 100 108 116 124
[20, 20] 5 13 21 29
[21, 20] 37 45 53 61
[22, 20] 69 77 85 93
[23, 20] 101 109 117 125
[24, 24] 6 14 22 30
[25, 24] 38 46 54 62
[26, 24] 70 78 86 94
[27, 24] 102 110 118 126
[28, 28] 7 15 23 31
[29, 28] 39 47 55 63
[30, 28] 71 79 87 95
[31, 28] 103 111 119 127
0 : 0 32 64 96
1 : 1 33 65 97
2 : 2 34 66 98
3 : 3 35 67 99
4 : 4 36 68 100
5 : 5 37 69 101
6 : 6 38 70 102
7 : 7 39 71 103
8 : 8 40 72 104
9 : 9 41 73 105
10: 10 42 74 106
11: 11 43 75 107
12: 12 44 76 108
13: 13 45 77 109
14: 14 46 78 110
15: 15 47 79 111
16: 16 48 80 112
17: 17 49 81 113
18: 18 50 82 114
19: 19 51 83 115
20: 20 52 84 116
21: 21 53 85 117
22: 22 54 86 118
23: 23 55 87 119
24: 24 56 88 120
25: 25 57 89 121
26: 26 58 90 122
27: 27 59 91 123
28: 28 60 92 124
29: 29 61 93 125
30: 30 62 94 126
31: 31 63 95 127
========= ERROR SUMMARY: 0 errors
$
I believe the above code fairly clearly demonstrates a 32x4 -> 4x32 transpose. I think it is "closest" to the code you presented. It does not do the set of 4x8 transposes you depicted in your diagrams.
I acknowledge that the calculations of src_corr, src_lane, and dest are not completely optimized. But they generate the correct indexing. I assume you can work out how to optimally generate those from the patterns you already have.
I think it's entirely possible the above code has bugs for other dimensions. I've not tried it on anything except the 32x4 case. Nevertheless, I think I have indicated what is fundamentally wrong with your code, and demonstrated a pathway to get to proper indexing.
A square matrix transpose up to 32x32 can be done at the warp level using a simpler method.
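For reference, one possible rotation-based sketch of such a warp-level 32x32 transpose (my own, untested, and not necessarily the exact method alluded to) looks like this:
// each lane of a fully active warp holds one 32-element row in row[];
// afterwards col[c] holds element 'lane' of original row c, i.e. each lane
// ends up holding one column of the original 32x32 matrix
__device__ void warp_transpose_32x32(const int (&row)[32], int (&col)[32]){
    int lane = threadIdx.x & 31;
    #pragma unroll
    for (int i = 0; i < 32; ++i){
        int src_lane = (lane + i) & 31;   // rotation: no two lanes name the same source
        // the source lane publishes its element (source_lane - i) & 31, which works
        // out to the requesting lane's index, so col[src_lane] = row_of_src_lane[lane]
        col[src_lane] = __shfl_sync(0xFFFFFFFF, row[(lane - i) & 31], src_lane);
    }
}
// note: the dynamic indexing of row[]/col[] may cause the compiler to place the
// arrays in local memory, so measure before relying on this for performance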
I have the following code that performs a tiled matrix transpose using shared memory to improve performance. The shared memory is padded with 1 column to avoid bank conflicts for a 32x32 thread block.
__global__ void transpose_tiled_padded(float *A, float *B, int n)
{
int i_in = blockDim.x*blockIdx.x + threadIdx.x;
int j_in = blockDim.y*blockIdx.y + threadIdx.y;
int i_out = blockDim.x*blockIdx.y + threadIdx.x;
int j_out = blockDim.y*blockIdx.x + threadIdx.y;
extern __shared__ float tile[];
// coalesced read of A rows to (padded) shared tile column (transpose)
tile[threadIdx.y + threadIdx.x*(blockDim.y+1)] = A[i_in + j_in*n];
__syncthreads();
// coalesced write from (padded) shared tile column to B rows
B[i_out + j_out*n] = tile[threadIdx.x + threadIdx.y*(blockDim.x+1)];
}
Running this code, I get 100% shared memory efficiency in the NVIDIA visual profiler, as I expect. But, when I run it with a 16x16 thread block, I only get 50% efficiency. Why is that? As far as I can tell, no thread in a warp reads from the same bank with this layout. Or am I mistaken?
Yes, you are mistaken.
Consider this (read) access for warp 0 in a 16x16 block:
tile[threadIdx.x + threadIdx.y*(blockDim.x+1)];
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
"index"
Here are the relevant calculations for each thread in the warp:
warp lane: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
threadIdx.x: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
threadIdx.y: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
"index": 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
bank: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 0
So we see that for this warp, the first and the last thread both read from bank 0. This results in a 2-way bank conflict, 2-way serialization, and 50% efficiency.
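If it helps to see it, the arithmetic above can be reproduced with a few lines of host code (purely illustrative) that print the bank touched by each lane of warp 0 for the 16x16 case with a row stride of blockDim.x+1 = 17 words:
#include <cstdio>
int main(){
    // warp 0 of a 16x16 block spans threadIdx.y = 0 and threadIdx.y = 1
    for (int lane = 0; lane < 32; ++lane){
        int tx = lane % 16;            // threadIdx.x
        int ty = lane / 16;            // threadIdx.y
        int index = tx + ty * 17;      // shared memory word index (stride 16+1)
        printf("lane %2d -> index %2d -> bank %2d\n", lane, index, index % 32);
    }
    return 0;
}
Lane 0 and lane 31 both land in bank 0, which is the 2-way conflict described above.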
I am working with CUDA and I am trying to stop my kernel's work (i.e. terminate all running threads) after a certain if block is hit. How can I do that? I am really stuck here.
The CUDA execution model doesn't allow for inter-block communication by design. That can make this sort of abort-on-condition operation difficult to achieve reliably without resorting to assert or trap type approaches, which can potentially result in context destruction and loss of data, which probably isn't what you want.
If your kernel design involves a small number of blocks with "resident" threads, then the only approach is some sort of atomic spinlock, which is hard to get to work reliably, and which will greatly degrade memory controller performance and achievable bandwidth.
If, on the other hand, your kernel design has rather large grids with a lot of blocks, and your main goal is to stop blocks which are not yet scheduled from running, then you could try something like this:
#include <iostream>
#include <vector>
__device__ unsigned int found_idx;
__global__ void setkernel(unsigned int *indata)
{
indata[115949] = 0xdeadbeef;
indata[119086] = 0xdeadbeef;
indata[60534] = 0xdeadbeef;
indata[37072] = 0xdeadbeef;
indata[163107] = 0xdeadbeef;
}
__global__ void searchkernel(unsigned int *indata, unsigned int *outdata)
{
if (found_idx > 0) {
return;
} else if (threadIdx.x == 0) {
outdata[blockIdx.x] = blockIdx.x;
};
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (indata[tid] == 0xdeadbeef) {
unsigned int oldval = atomicCAS(&found_idx, 0, 1+tid);
}
}
int main()
{
const unsigned int N = 1 << 19;
unsigned int* in_data;
cudaMalloc((void **)&in_data, sizeof(unsigned int) * size_t(N));
cudaMemset(in_data, 0, sizeof(unsigned int) * size_t(N));
setkernel<<<1,1>>>(in_data);
cudaDeviceSynchronize();
unsigned int block_size = 1024;
unsigned int grid_size = N / block_size;
unsigned int* out_data;
cudaMalloc((void **)&out_data, sizeof(unsigned int) * size_t(grid_size));
cudaMemset(out_data, 0xf0, sizeof(unsigned int) * size_t(grid_size));
const unsigned int zero = 0;
cudaMemcpyToSymbol(found_idx, &zero, sizeof(unsigned int));
searchkernel<<<grid_size, block_size>>>(in_data, out_data);
std::vector<unsigned int> output(grid_size);
cudaMemcpy(&output[0], out_data, sizeof(unsigned int) * size_t(grid_size), cudaMemcpyDeviceToHost);
cudaDeviceReset();
std::cout << "The following blocks did not run" << std::endl;
for(int i=0, j=0; i<grid_size; i++) {
if (output[i] == 0xf0f0f0f0) {
std::cout << " " << i;
if (j++ == 20) {
std::cout << std::endl;
j = 0;
}
}
}
std::cout << std::endl;
return 0;
}
Here I have a simple kernel which is searching for a magic word in a large array. To get the early exit behaviour, I use a single global word, which is set atomically by those threads which "win" or trigger the termination condition. Every new block checks the state of this global word, and if it is set, they return without doing any work.
If I compile and run this on a moderate sized Kepler device:
$ nvcc -arch=sm_30 -o blocking blocking.cu
$ ./blocking
The following blocks did not run
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272
273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335
336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356
357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398
399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419
420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440
441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461
462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482
483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503
504 505 506 507 508 509 510 511
you can see that a large number of blocks in the grid saw the change in the global word and terminated early without running the search code. This might be the best you can do without a severely invasive spinlock approach, which will greatly harm performance.
I assume you want to stop a running kernel (not a single thread).
The simplest approach (and the one that I suggest) is to set up a global memory flag which is tested by the kernel.
You can set the flag using cudaMemcpy() (or by writing to it directly from the host if you are using unified memory).
Like the following:
if (gm_flag) {
__threadfence(); // ensure store issued before trap
asm("trap;"); // kill kernel with error
}
asm("trap;") will stop all running threads.
Note that since compute capability 2.0 you can use assert() to terminate a kernel!
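As a rough sketch of that note (my own code, not a complete program), a failed device-side assert traps every thread that reaches it, and subsequent CUDA API calls report cudaErrorAssert:
#include <assert.h>
__device__ volatile int gm_flag;   // the flag from the snippet above, set elsewhere
__global__ void k_with_assert()
{
    // every thread that observes the flag set fails the assert, which aborts
    // the kernel; subsequent CUDA API calls report cudaErrorAssert
    assert(gm_flag == 0);
    // ... normal work ...
}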
A different approach could be the following (I haven't tried the code!)
__device__ bool go(int val){
return true;
}
__global__ void stopme(bool* flag, int* val, int size){
int idx= blockIdx.x *blockDim.x + threadIdx.x;
if(idx < size){
bool canContinue = true;
while(canContinue && (flag[0])){
printf("HELLO from %i\n",idx);
if(!(*flag)){
return;
}
else{
//do some computation
val[idx]++;
val[idx]%=100;
}
canContinue = go(val[idx]);
}
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main(void)
{
int size = 128;
int* h_val = (int*)malloc(sizeof(int)*size);
bool * h_flag = new bool;
*h_flag=true;
bool* d_flag;
cudaMalloc(&d_flag,sizeof(bool));
cudaMemcpy(d_flag,h_flag,1,cudaMemcpyHostToDevice);
int* d_val;
cudaMalloc(&d_val,sizeof(int)*size );
for(int i=0;i<size;i++){
h_val[i] = i;
}
cudaMemcpy(d_val,h_val,sizeof(int)*size,cudaMemcpyHostToDevice);
int BSIZE=32;
int nblocks =size/BSIZE;
printf("%i,%i",nblocks,BSIZE);
stopme<<<nblocks,BSIZE>>>(d_flag,d_val,size);
//--------------sleep for a while --------------------------
*h_flag=false;
cudaMemcpy(d_flag,h_flag,1,cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
gpuErrchk( cudaPeekAtLastError() );
printf("END\n");
}
where the kernel stopme keeps running until someone on the host side sets the flag to false. Note that your kernel could be much more complicated than this, and the effort to synchronize all threads in order to execute the return could be much greater (and can affect performance). Hope this helps.
I made this program to practice cudaMemcpy3D() and texture memory.
Here is the question: when I print out the tex3D data, it is not the same as the initial data. The value I get is ncrss times the initial value, and there are ncrss intervening values equal to 0 between them. If I set nsubs to 2 or larger, the factor becomes ncrss*nsubs and so does the interval.
Can you point out where I made the mistake? I think it is probably the make_cudaPitchedPtr at line 61, or the make_cudaExtent at line 56. It may also be related to the way the array is stored.
So I have come here for your help; I appreciate your comments and advice.
1 #include<stdio.h>
2 #include<stdlib.h>
3 #include<cuda_runtime.h>
4 #include<helper_functions.h>
5 #include<helper_cuda.h>
6 #ifndef MIN
7 #define MIN(A,B) ((A) < (B) ? (A) : (B))
8 #endif
9 #ifndef MAX
10 #define MAX(A,B) ((A) > (B) ? (A) : (B))
11 #endif
12
13 texture<float,cudaTextureType3D,cudaReadModeElementType> vel_tex;
14
15 __global__ void mckernel(int ntab)
16 {
17 const int biy=blockIdx.y;//sub
18 const int bix=blockIdx.x;//crs
19 const int tid=threadIdx.x;
20
21 float test;
22 test=tex3D(vel_tex,biy,bix,tid);
23 printf("test=%f,bix=%d,tid=%d\n",test,bix,tid);
24
25 }
26
27 int main()
28 {
29 int n=10;//208
30 int ntab=10;
31 int submin=1;
32 int crsmin=1;
33 int submax=1;
34 int crsmax=2;
35 int subinc=1;
36 int crsinc=1;
37
38 int ncrss,nsubs;
39 ncrss=(crsmax-crsmin)/crsinc + 1;
40 nsubs=(submax-submin)/subinc + 1;
41 dim3 BlockPerGrid(ncrss,nsubs,1);
42 dim3 ThreadPerBlock(n,1,1);
43
44 float vel[nsubs][ncrss][ntab];
45 int i,j,k;
46 for(i=0;i<nsubs;i++)
47 for(j=0;j<ncrss;j++)
48 for(k=0;k<ntab;k++)
49 vel[i][j][k]=k;
50 for(i=0;i<nsubs;i++)
51 for(j=0;j<ncrss;j++)
52 for(k=0;k<ntab;k++)
53 printf("vel[%d][%d][%d]=%f\n",i,j,k,vel[i][j][k]);
54
55 cudaChannelFormatDesc velchannelDesc=cudaCreateChannelDesc<float>();
56 cudaExtent velExtent=make_cudaExtent(nsubs,ncrss,ntab);
57 cudaArray *d_vel;
58 cudaMalloc3DArray(&d_vel,&velchannelDesc,velExtent);
59
60 cudaMemcpy3DParms velParms = {0};
61 velParms.srcPtr=make_cudaPitchedPtr((void*)vel,sizeof(float)*nsubs,nsubs,ncrss);
62 velParms.dstArray=d_vel;
63 velParms.extent=velExtent;
64 velParms.kind=cudaMemcpyHostToDevice;
65 cudaMemcpy3D(&velParms);
66
67 cudaBindTextureToArray(vel_tex,d_vel);
68
69 printf("kernel start\n");
70 cudaDeviceSynchronize();
71 mckernel<<<BlockPerGrid,ThreadPerBlock>>>(ntab);
72 printf("kernel end\n");
73
74 cudaUnbindTexture(vel_tex);
75 cudaFreeArray(d_vel);
76 cudaDeviceReset();
77 return 0 ;
78 }
Here is the printf output, with nsubs=1 and ncrss=2:
1 vel[0][0][0]=0.000000
2 vel[0][0][1]=1.000000
3 vel[0][0][2]=2.000000
4 vel[0][0][3]=3.000000
5 vel[0][0][4]=4.000000
6 vel[0][0][5]=5.000000
7 vel[0][0][6]=6.000000
8 vel[0][0][7]=7.000000
9 vel[0][0][8]=8.000000
10 vel[0][0][9]=9.000000
11 vel[0][1][0]=0.000000
12 vel[0][1][1]=1.000000
13 vel[0][1][2]=2.000000
14 vel[0][1][3]=3.000000
15 vel[0][1][4]=4.000000
16 vel[0][1][5]=5.000000
17 vel[0][1][6]=6.000000
18 vel[0][1][7]=7.000000
19 vel[0][1][8]=8.000000
20 vel[0][1][9]=9.000000
21 kernel start
22 kernel end
23 test=1.000000,bix=1,tid=0
24 test=3.000000,bix=1,tid=1
25 test=5.000000,bix=1,tid=2
26 test=7.000000,bix=1,tid=3
27 test=9.000000,bix=1,tid=4
28 test=1.000000,bix=1,tid=5
29 test=3.000000,bix=1,tid=6
30 test=5.000000,bix=1,tid=7
31 test=7.000000,bix=1,tid=8
32 test=9.000000,bix=1,tid=9
33 test=0.000000,bix=0,tid=0
34 test=2.000000,bix=0,tid=1
35 test=4.000000,bix=0,tid=2
36 test=6.000000,bix=0,tid=3
37 test=8.000000,bix=0,tid=4
38 test=0.000000,bix=0,tid=5
39 test=2.000000,bix=0,tid=6
40 test=4.000000,bix=0,tid=7
41 test=6.000000,bix=0,tid=8
42 test=8.000000,bix=0,tid=9
After a night of thinking, I found the problem.
The CUDA array dimensions are given as M[fastest][middle][slowest] (width, height, depth), while the C array is declared as M[slowest][middle][fastest].
So dim3(), make_cudaExtent(), and make_cudaPitchedPtr() should all be ordered with the fastest-varying dimension first, or at least be consistent with each other.
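For example, a corrected setup along those lines might look like the following sketch (my own, untested), with ntab as the width (fastest-varying), ncrss as the height, and nsubs as the depth:
cudaExtent velExtent = make_cudaExtent(ntab, ncrss, nsubs);   // (fastest, mid, slowest)
cudaArray *d_vel;
cudaMalloc3DArray(&d_vel, &velchannelDesc, velExtent);

cudaMemcpy3DParms velParms = {0};
velParms.srcPtr = make_cudaPitchedPtr((void*)vel,
                                      sizeof(float)*ntab,   // pitch in bytes of one row
                                      ntab,                 // width in elements
                                      ncrss);               // height in rows
velParms.dstArray = d_vel;
velParms.extent   = velExtent;
velParms.kind     = cudaMemcpyHostToDevice;
cudaMemcpy3D(&velParms);
// the kernel fetch is then indexed as (x = tab, y = crs, z = sub):
// float test = tex3D(vel_tex, tid, bix, biy);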