Related
All the examples perform scans on arrays sized by some multiple of 32. The quickest examples use 256 or more threads with 4 or more elements assigned to each thread.
This means that if I had an array of size 450, then, presumably, I would have to pad it out to 512 and use 256 threads with 2 elements assigned to each.
However, in my particular instance, it is not feasible to have to pad out each array.
Is there an alternative solution to handle multiple oddly sized arrays? Is there a way to somehow specify a width?
OK, let's be more clear. This is a simplified example. Say I have 2 arrays: one array is simply a list of integer offsets into the second array, which contains the data. Each offset indicates the beginning of a separate set of data.
Each set of data is randomly sized. I get the data as a chunk from some other process, so there is no easy way to pad them. I want to run BlockScan on each offset from the same kernel.
Let your index (offset) array be idx[]. Let your data array be A[], let the result of the scan be in B[].
Scan the whole array A[], storing the output in B[].
For each offset idx[i] (for i > 0), take the value at index idx[i]-1 in B[], subtract from it the value at index idx[i-1]-1 in B[] (treated as zero when that index is negative), and then subtract that difference from the element at index idx[i] (not minus 1) in A[].
Rescan A to B.
As a simple example:
idx: 0 2 5
0 (original data A): 1 1 1 1 1 1 1 1
1 (first scan, B):   1 2 3 4 5 6 7 8
2 (adjusted A):      1 1 -1 1 1 -2 1 1
3 (second scan, B):  1 2 1 2 3 1 2 3
In the above example, the -1 in step 2 is computed as the scan value in step 1 at index (2-1) minus the scan value in step 1 at index (0-1) (assumed to be zero) which is then subtracted from the original data value. The -2 in step 2 is computed as the scan value in step 1 at index (5-1) minus the scan value in step 1 at index (2-1), subtracted from the original data value.
Here is an example:
$ cat t453.cu
#include <cub/cub.cuh>
#include <iostream>
template <int TPB, int IPT, typename T>
__global__ void k(T *data, int *idx, int n){
// Specialize BlockScan for a 1D block of TPB threads on type T
__shared__ T sdata[TPB*IPT*2];
sdata[threadIdx.x*IPT] = 1;
__syncthreads();
typedef cub::BlockScan<T, TPB> BlockScan;
// Allocate shared memory for BlockScan
__shared__ typename BlockScan::TempStorage temp_storage;
// Obtain a segment of consecutive items that are blocked across threads
int thread_data[IPT];
thread_data[0] = sdata[threadIdx.x*IPT];
// Collectively compute the block-wide inclusive prefix sum
BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
__syncthreads();
sdata[IPT*(threadIdx.x+TPB)] = thread_data[0];
if ((threadIdx.x < n) && (threadIdx.x > 0)) // assumes the first element of idx points to 0
sdata[idx[threadIdx.x]*IPT] -= (sdata[((idx[threadIdx.x]-1)+TPB)*IPT] - ((threadIdx.x == 1)?0:sdata[((idx[threadIdx.x-1]-1)+TPB)*IPT]));
__syncthreads();
thread_data[0] = sdata[threadIdx.x*IPT];
BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
__syncthreads();
data[threadIdx.x] = thread_data[0];
}
typedef int dtype;
const int nTPB = 256;
int main(){
int h_idx[] = {0, 4, 7, 32, 55, 99, 104, 200};
int n = sizeof(h_idx)/sizeof(h_idx[0]);
std::cout << "n = " << n << std::endl;
int *d_idx;
cudaMalloc(&d_idx, n*sizeof(d_idx[0]));
cudaMemcpy(d_idx, h_idx, n*sizeof(h_idx[0]), cudaMemcpyHostToDevice);
dtype *h_data, *d_data;
h_data = new dtype[nTPB];
cudaMalloc(&d_data, nTPB*sizeof(dtype));
k<nTPB, 1><<<1,nTPB>>>(d_data, d_idx, n);
cudaMemcpy(h_data, d_data, nTPB*sizeof(dtype), cudaMemcpyDeviceToHost);
dtype sum;
int idx = 0;
for (int i = 0; i < nTPB; i++){
if (i == h_idx[idx]) {sum = 0; idx++;}
sum++;
std::cout << "gpu: " << h_data[i] << " cpu: " << sum << std::endl;
}
}
$ nvcc -o t453 t453.cu
$ cuda-memcheck ./t453
========= CUDA-MEMCHECK
n = 8
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 26 cpu: 26
gpu: 27 cpu: 27
gpu: 28 cpu: 28
gpu: 29 cpu: 29
gpu: 30 cpu: 30
gpu: 31 cpu: 31
gpu: 32 cpu: 32
gpu: 33 cpu: 33
gpu: 34 cpu: 34
gpu: 35 cpu: 35
gpu: 36 cpu: 36
gpu: 37 cpu: 37
gpu: 38 cpu: 38
gpu: 39 cpu: 39
gpu: 40 cpu: 40
gpu: 41 cpu: 41
gpu: 42 cpu: 42
gpu: 43 cpu: 43
gpu: 44 cpu: 44
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 26 cpu: 26
gpu: 27 cpu: 27
gpu: 28 cpu: 28
gpu: 29 cpu: 29
gpu: 30 cpu: 30
gpu: 31 cpu: 31
gpu: 32 cpu: 32
gpu: 33 cpu: 33
gpu: 34 cpu: 34
gpu: 35 cpu: 35
gpu: 36 cpu: 36
gpu: 37 cpu: 37
gpu: 38 cpu: 38
gpu: 39 cpu: 39
gpu: 40 cpu: 40
gpu: 41 cpu: 41
gpu: 42 cpu: 42
gpu: 43 cpu: 43
gpu: 44 cpu: 44
gpu: 45 cpu: 45
gpu: 46 cpu: 46
gpu: 47 cpu: 47
gpu: 48 cpu: 48
gpu: 49 cpu: 49
gpu: 50 cpu: 50
gpu: 51 cpu: 51
gpu: 52 cpu: 52
gpu: 53 cpu: 53
gpu: 54 cpu: 54
gpu: 55 cpu: 55
gpu: 56 cpu: 56
gpu: 57 cpu: 57
gpu: 58 cpu: 58
gpu: 59 cpu: 59
gpu: 60 cpu: 60
gpu: 61 cpu: 61
gpu: 62 cpu: 62
gpu: 63 cpu: 63
gpu: 64 cpu: 64
gpu: 65 cpu: 65
gpu: 66 cpu: 66
gpu: 67 cpu: 67
gpu: 68 cpu: 68
gpu: 69 cpu: 69
gpu: 70 cpu: 70
gpu: 71 cpu: 71
gpu: 72 cpu: 72
gpu: 73 cpu: 73
gpu: 74 cpu: 74
gpu: 75 cpu: 75
gpu: 76 cpu: 76
gpu: 77 cpu: 77
gpu: 78 cpu: 78
gpu: 79 cpu: 79
gpu: 80 cpu: 80
gpu: 81 cpu: 81
gpu: 82 cpu: 82
gpu: 83 cpu: 83
gpu: 84 cpu: 84
gpu: 85 cpu: 85
gpu: 86 cpu: 86
gpu: 87 cpu: 87
gpu: 88 cpu: 88
gpu: 89 cpu: 89
gpu: 90 cpu: 90
gpu: 91 cpu: 91
gpu: 92 cpu: 92
gpu: 93 cpu: 93
gpu: 94 cpu: 94
gpu: 95 cpu: 95
gpu: 96 cpu: 96
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 26 cpu: 26
gpu: 27 cpu: 27
gpu: 28 cpu: 28
gpu: 29 cpu: 29
gpu: 30 cpu: 30
gpu: 31 cpu: 31
gpu: 32 cpu: 32
gpu: 33 cpu: 33
gpu: 34 cpu: 34
gpu: 35 cpu: 35
gpu: 36 cpu: 36
gpu: 37 cpu: 37
gpu: 38 cpu: 38
gpu: 39 cpu: 39
gpu: 40 cpu: 40
gpu: 41 cpu: 41
gpu: 42 cpu: 42
gpu: 43 cpu: 43
gpu: 44 cpu: 44
gpu: 45 cpu: 45
gpu: 46 cpu: 46
gpu: 47 cpu: 47
gpu: 48 cpu: 48
gpu: 49 cpu: 49
gpu: 50 cpu: 50
gpu: 51 cpu: 51
gpu: 52 cpu: 52
gpu: 53 cpu: 53
gpu: 54 cpu: 54
gpu: 55 cpu: 55
gpu: 56 cpu: 56
========= ERROR SUMMARY: 0 errors
$
This still requires you to pad the "end" of your array to the threadblock size. I'm assuming that should be possible based on your description; it's basically necessary for cub anyway, since cub expects to use every thread in your threadblock.
For larger arrays, the above method could be extended in a straightforward fashion to use DeviceScan. Step 1 is the first scan. Step 2 would be a separate kernel launch. Step 3 is the second scan.
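As a rough illustration of that extension, here is a minimal sketch (my own code, untested; the kernel and wrapper names are made up, and it assumes idx[0] == 0 just like the block-level example):
#include <cub/cub.cuh>
// step 2: at each segment start (skipping the first), subtract the running total
// of all preceding data from the first element of that segment
// (assumes idx[0] == 0, as in the block-level example)
__global__ void fixup(int *A, const int *B, const int *idx, int n){
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if ((i > 0) && (i < n))
    A[idx[i]] -= (B[idx[i]-1] - ((i == 1)?0:B[idx[i-1]-1]));
}
void segmented_scan(int *d_A, int *d_B, int *d_idx, int num_items, int n){
  void *d_temp = NULL; size_t temp_bytes = 0;
  // step 1: scan A into B (the first call only sizes the temporary storage)
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_A, d_B, num_items);
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_A, d_B, num_items);
  // step 2: fix up the segment boundaries in A (separate kernel launch)
  fixup<<<(n+255)/256, 256>>>(d_A, d_B, d_idx, n);
  // step 3: rescan A into B
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_A, d_B, num_items);
  cudaFree(d_temp);
}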
If you want to have each threadblock perform a scan on a segment, you don't need to pad each segment. You only need to pad the "end" of the array so that the last scan will be OK, and even this "pad" operation can be accomplished with a conditional load, instead of an actual pad operation. Here's an example:
$ cat t455.cu
#include <cub/cub.cuh>
#include <iostream>
template <int TPB, int IPT, typename T>
__global__ void k(T *data, int *idx){
int lidx = threadIdx.x;
// Specialize BlockScan for a 1D block of TPB threads on type T
typedef cub::BlockScan<T, TPB> BlockScan;
// Allocate shared memory for BlockScan
__shared__ typename BlockScan::TempStorage temp_storage;
// Obtain a segment of consecutive items that are blocked across threads
int thread_data[IPT];
thread_data[0] = ((lidx+idx[blockIdx.x])>=idx[blockIdx.x+1])?0:data[lidx+idx[blockIdx.x]];
// Collectively compute the block-wide inclusive prefix sum
BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
__syncthreads();
if ((lidx+idx[blockIdx.x]) < idx[blockIdx.x+1])
data[lidx+idx[blockIdx.x]] = thread_data[0];
}
typedef int dtype;
const int nTPB = 128; // sized with IPT to handle the largest segment
const int DS = 256;
int main(){
int h_idx[] = {0, 4, 7, 32, 55, 99, 104, 200, 256};
int n = sizeof(h_idx)/sizeof(h_idx[0]);
std::cout << "n = " << n << std::endl;
int *d_idx;
cudaMalloc(&d_idx, n*sizeof(d_idx[0]));
cudaMemcpy(d_idx, h_idx, n*sizeof(h_idx[0]), cudaMemcpyHostToDevice);
dtype *h_data, *d_data;
h_data = new dtype[DS];
for (int i = 0; i < DS; i++) h_data[i] = 1;
cudaMalloc(&d_data, DS*sizeof(dtype));
cudaMemcpy(d_data, h_data, DS*sizeof(h_data[0]), cudaMemcpyHostToDevice);
k<nTPB, 1><<<n-1,nTPB>>>(d_data, d_idx);
cudaMemcpy(h_data, d_data, DS*sizeof(dtype), cudaMemcpyDeviceToHost);
dtype sum;
int idx = 0;
for (int i = 0; i < DS; i++){
if (i == h_idx[idx]) {sum = 0; idx++;}
sum++;
std::cout << "gpu: " << h_data[i] << " cpu: " << sum << std::endl;
}
}
$ nvcc -o t455 t455.cu
$ cuda-memcheck ./t455
========= CUDA-MEMCHECK
n = 9
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 26 cpu: 26
gpu: 27 cpu: 27
gpu: 28 cpu: 28
gpu: 29 cpu: 29
gpu: 30 cpu: 30
gpu: 31 cpu: 31
gpu: 32 cpu: 32
gpu: 33 cpu: 33
gpu: 34 cpu: 34
gpu: 35 cpu: 35
gpu: 36 cpu: 36
gpu: 37 cpu: 37
gpu: 38 cpu: 38
gpu: 39 cpu: 39
gpu: 40 cpu: 40
gpu: 41 cpu: 41
gpu: 42 cpu: 42
gpu: 43 cpu: 43
gpu: 44 cpu: 44
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 26 cpu: 26
gpu: 27 cpu: 27
gpu: 28 cpu: 28
gpu: 29 cpu: 29
gpu: 30 cpu: 30
gpu: 31 cpu: 31
gpu: 32 cpu: 32
gpu: 33 cpu: 33
gpu: 34 cpu: 34
gpu: 35 cpu: 35
gpu: 36 cpu: 36
gpu: 37 cpu: 37
gpu: 38 cpu: 38
gpu: 39 cpu: 39
gpu: 40 cpu: 40
gpu: 41 cpu: 41
gpu: 42 cpu: 42
gpu: 43 cpu: 43
gpu: 44 cpu: 44
gpu: 45 cpu: 45
gpu: 46 cpu: 46
gpu: 47 cpu: 47
gpu: 48 cpu: 48
gpu: 49 cpu: 49
gpu: 50 cpu: 50
gpu: 51 cpu: 51
gpu: 52 cpu: 52
gpu: 53 cpu: 53
gpu: 54 cpu: 54
gpu: 55 cpu: 55
gpu: 56 cpu: 56
gpu: 57 cpu: 57
gpu: 58 cpu: 58
gpu: 59 cpu: 59
gpu: 60 cpu: 60
gpu: 61 cpu: 61
gpu: 62 cpu: 62
gpu: 63 cpu: 63
gpu: 64 cpu: 64
gpu: 65 cpu: 65
gpu: 66 cpu: 66
gpu: 67 cpu: 67
gpu: 68 cpu: 68
gpu: 69 cpu: 69
gpu: 70 cpu: 70
gpu: 71 cpu: 71
gpu: 72 cpu: 72
gpu: 73 cpu: 73
gpu: 74 cpu: 74
gpu: 75 cpu: 75
gpu: 76 cpu: 76
gpu: 77 cpu: 77
gpu: 78 cpu: 78
gpu: 79 cpu: 79
gpu: 80 cpu: 80
gpu: 81 cpu: 81
gpu: 82 cpu: 82
gpu: 83 cpu: 83
gpu: 84 cpu: 84
gpu: 85 cpu: 85
gpu: 86 cpu: 86
gpu: 87 cpu: 87
gpu: 88 cpu: 88
gpu: 89 cpu: 89
gpu: 90 cpu: 90
gpu: 91 cpu: 91
gpu: 92 cpu: 92
gpu: 93 cpu: 93
gpu: 94 cpu: 94
gpu: 95 cpu: 95
gpu: 96 cpu: 96
gpu: 1 cpu: 1
gpu: 2 cpu: 2
gpu: 3 cpu: 3
gpu: 4 cpu: 4
gpu: 5 cpu: 5
gpu: 6 cpu: 6
gpu: 7 cpu: 7
gpu: 8 cpu: 8
gpu: 9 cpu: 9
gpu: 10 cpu: 10
gpu: 11 cpu: 11
gpu: 12 cpu: 12
gpu: 13 cpu: 13
gpu: 14 cpu: 14
gpu: 15 cpu: 15
gpu: 16 cpu: 16
gpu: 17 cpu: 17
gpu: 18 cpu: 18
gpu: 19 cpu: 19
gpu: 20 cpu: 20
gpu: 21 cpu: 21
gpu: 22 cpu: 22
gpu: 23 cpu: 23
gpu: 24 cpu: 24
gpu: 25 cpu: 25
gpu: 26 cpu: 26
gpu: 27 cpu: 27
gpu: 28 cpu: 28
gpu: 29 cpu: 29
gpu: 30 cpu: 30
gpu: 31 cpu: 31
gpu: 32 cpu: 32
gpu: 33 cpu: 33
gpu: 34 cpu: 34
gpu: 35 cpu: 35
gpu: 36 cpu: 36
gpu: 37 cpu: 37
gpu: 38 cpu: 38
gpu: 39 cpu: 39
gpu: 40 cpu: 40
gpu: 41 cpu: 41
gpu: 42 cpu: 42
gpu: 43 cpu: 43
gpu: 44 cpu: 44
gpu: 45 cpu: 45
gpu: 46 cpu: 46
gpu: 47 cpu: 47
gpu: 48 cpu: 48
gpu: 49 cpu: 49
gpu: 50 cpu: 50
gpu: 51 cpu: 51
gpu: 52 cpu: 52
gpu: 53 cpu: 53
gpu: 54 cpu: 54
gpu: 55 cpu: 55
gpu: 56 cpu: 56
========= ERROR SUMMARY: 0 errors
$
I want to implement a basic blocked load and warp transpose using CUDA 9.0's shuffle operations. I'm aware of the cub and trove implementations, but I'm restricted to compiling with nvrtc, and the standard header includes make those libraries difficult to accommodate. I'm not looking for anything fancy, just some integer, float and double shuffles on data whose dimensions are a power of 2.
Visualising an example with warp size 8, I want to go from:
correlation
0 1 2 3
lane 0 0 8 16 24
lane 1 1 9 17 25
lane 2 2 10 18 26
lane 3 3 11 19 27
lane 4 4 12 20 28
lane 5 5 13 21 29
lane 6 6 14 22 30
lane 7 7 15 23 31
to this structure:
correlation
0 1 2 3
lane 0 0 1 2 3
lane 1 8 9 10 11
lane 2 16 17 18 19
lane 3 24 25 26 27
lane 4 4 5 6 7
lane 5 12 13 14 15
lane 6 20 21 22 23
lane 7 28 29 30 31
I feel this should be really simple but I can't figure out what I've done incorrectly. I think that the basic transposition loop should look like:
int loads[ncorrs];
int values[ncorrs];
int lane_id = threadIdx.x & (warp_size - 1);
// 0 0 0 0 4 4 4 4 8 8 8 8 ....
int base_idx = lane_id & (warp_size - ncorrs);
// 0 1 2 3 0 1 2 3 0 1 2 3
int src_corr = lane_id & (ncorrs - 1);
for(int corr=0; corr < ncorrs; ++corr)
{
int src_lane = base_idx + corr;
values[corr] = __shfl_sync(mask, loads[src_corr],
src_lane, warp_size);
}
So given the example data above, if we're in lane 5, I expect that the following indexing should occur:
base_idx == 4;
src_corr == 1;
corr == [0, 1, 2, 3]
src_lane == [4, 5, 6, 7]
values == [12, 13, 14 15]
But instead the following is happening (33's are from later in the data):
correlation
0 1 2 3
lane 0 0 0 0 0
lane 1 4 4 4 4
lane 2 12 12 12 12
lane 3 16 16 16 16
lane 4 20 20 20 20
lane 5 24 24 24 24
lane 6 28 28 28 28
lane 7 33 33 33 33
What am I doing incorrectly? Full implementation for a warp size of 32:
#include <cstdlib>
#include <cstdio>
#include "cuda.h"
#define ncorr 4
#define warp_size 32
template <int ncorrs>
__global__ void kernel(
int * input,
int * output,
int N)
{
// This should provide 0 0 0 0 4 4 4 4 8 8 8 8 ...
#define base_idx(lane_id) (lane_id & (warp_size - ncorrs))
// This should provide 0 1 2 3 0 1 2 3 0 1 2 3
#define corr_idx(lane_id) (lane_id & (ncorrs - 1))
int n = blockIdx.x*blockDim.x + threadIdx.x;
int lane_id = threadIdx.x & (warp_size - 1);
if(n >= N)
{ return; }
// Input correlation handled by this thread
int src_corr = corr_idx(lane_id);
int mask = __activemask();
if(threadIdx.x == 0)
{ printf("mask %d\n", mask); }
int loads[ncorrs];
int values[ncorrs];
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{ loads[corr] = input[n + corr*N]; }
__syncthreads();
printf("[%d, %d] %d %d %d %d\n",
lane_id, base_idx(lane_id),
loads[0], loads[1],
loads[2], loads[3]);
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{
int src_lane = base_idx(lane_id) + corr;
values[corr] = __shfl_sync(mask, loads[src_corr],
src_lane, warp_size);
}
printf("[%d, %d] %d %d %d %d\n",
lane_id, base_idx(lane_id),
values[0], values[1],
values[2], values[3]);
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{ output[n + corr*N] = values[corr]; }
}
void print_data(int * data, int N)
{
for(int n=0; n < N; ++n)
{
printf("% -3d: ", n);
for(int c=0; c < ncorr; ++c)
{
printf("%d ", data[n*ncorr + c]);
}
printf("\n");
}
}
int main(void)
{
int * host_input;
int * host_output;
int * device_input;
int * device_output;
int N = 32;
host_input = (int *) malloc(sizeof(int)*N*ncorr);
host_output = (int *) malloc(sizeof(int)*N*ncorr);
printf("malloc done\n");
cudaMalloc((void **) &device_input, sizeof(int)*N*ncorr);
cudaMalloc((void **) &device_output, sizeof(int)*N*ncorr);
printf("cudaMalloc done\n");
for(int i=0; i < N*ncorr; ++i)
{ host_input[i] = i; }
print_data(host_input, N);
dim3 block(256, 1, 1);
dim3 grid((block.x + N - 1) / N, 1, 1);
cudaMemcpy(device_input, host_input,
sizeof(int)*N*ncorr, cudaMemcpyHostToDevice);
printf("memcpy done\n");
kernel<4> <<<grid, block>>> (device_input, device_output, N);
cudaMemcpy(host_output, device_output,
sizeof(int)*N*ncorr, cudaMemcpyDeviceToHost);
print_data(host_output, N);
cudaFree(device_input);
cudaFree(device_output);
free(host_input);
free(host_output);
}
Edit 1: Clarified that the visual example has a warp size of 8 while the full code caters for a warp size of 32
What am I doing incorrectly?
TL;DR: In short, you are transmitting the same input value to multiple output values. Here is one example, in this line of code:
values[corr] = __shfl_sync(mask, loads[src_corr],
src_lane, warp_size);
The quantity represented by loads[src_corr] is loop-invariant. Therefore you are transmitting that value to 4 warp lanes (over the 4 loop iterations) which means that value is occupying 4 output values (which is exactly what your printout data shows). That can't be right for a transpose.
Taking a somewhat longer view, with another example from your code:
I'm not sure I can read your mind, but possibly you may be confused about the warp shuffle operation. Possibly you have assumed that the destination lane can choose which value from the source lane loads[] array is desired. This is not the case. The destination lane only gets to select whatever is the value provided by the source lane. Let's take a look at your loop:
// This should provide 0 0 0 0 4 4 4 4 8 8 8 8 ...
#define base_idx(lane_id) (lane_id & (warp_size - ncorrs))
// This should provide 0 1 2 3 0 1 2 3 0 1 2 3
#define corr_idx(lane_id) (lane_id & (ncorrs - 1))
int n = blockIdx.x*blockDim.x + threadIdx.x;
int lane_id = threadIdx.x & (warp_size - 1);
...
// Input correlation handled by this thread
int src_corr = corr_idx(lane_id);
int mask = __activemask();
...
int loads[ncorrs];
int values[ncorrs];
...
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{
int src_lane = base_idx(lane_id) + corr;
values[corr] = __shfl_sync(mask, loads[src_corr], src_lane, warp_size);
}
On the first pass of the above loop, the src_lane for warp lanes 0, 1, 2, and 3 are all going to be 0. This is evident from the above excerpted code, or print it out if you're not sure. That means warp lanes 0-3 are going to be requesting whatever value is provided by warp lane 0. The value provided by warp lane 0 is loads[src_corr], but the interpretation of src_corr here is whatever value it has for warp lane 0. Therefore one and only one value will be distributed to warp lanes 0-3. This could not possibly be correct for a transpose; no input value shows up in 4 places in the output.
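As a small aside, the point is easy to demonstrate in isolation. The following toy program (my own, not derived from your code) has every lane request from source lane 0, and every lane therefore receives the identical value that lane 0 supplies:
#include <cstdio>
__global__ void shfl_demo(){
    int lane = threadIdx.x & 31;
    int my_val = lane * 10;                       // each lane holds a different value
    // every lane names source lane 0, so every lane receives lane 0's value (0)
    int got = __shfl_sync(0xFFFFFFFF, my_val, 0);
    printf("lane %2d received %d\n", lane, got);
}
int main(){
    shfl_demo<<<1,32>>>();
    cudaDeviceSynchronize();
    return 0;
}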
To fix this, we will need to modify the calculation of both src_lane and src_corr. We will also need to modify the storage location (index) per warp lane at each pass of the loop (I'm calling this new variable dest). We can think of src_lane as defining the target value that my thread will receive. We can think of src_corr as defining which of my values I will publish to some other thread on that loop iteration. dest is the location in my values[] array where I will store the currently received value. We can deduce the necessary pattern by carefully studying the relationship between the input values in loads[] and the desired output locations in values[], taking into account the appropriate warp lanes for source and destination. On the first pass of the loop, we desire this pattern:
warp lane: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
src_lane: 0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27 4 ... (where my data comes from)
src_corr: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 ... (which value I am transmitting)
dest: 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 ... (where I store the received value)
On the second pass of the loop, we desire this pattern:
warp lane: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
src_lane: 8 16 24 0 9 17 25 1 10 18 26 2 11 19 27 3 12 ... (where my data comes from)
src_corr: 3 3 3 3 3 3 3 3 0 0 0 0 0 0 0 0 1 ... (which value I am transmitting)
dest: 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 ... (where I store the received value)
with corresponding changes for the 3rd and 4th pass of the loop. If we realize those patterns in code for your shuffle loop, it could look something like this:
$ cat t352.cu
#include <cstdlib>
#include <cstdio>
#include <assert.h>
#define ncorr 4
#define warp_size 32
template <int ncorrs>
__global__ void kernel(
int * input,
int * output,
int N)
{
// This should provide 0 0 0 0 4 4 4 4 8 8 8 8 ...
#define base_idx(lane_id) (lane_id & (warp_size - ncorrs))
// This should provide 0 1 2 3 0 1 2 3 0 1 2 3
#define corr_idx(lane_id) (lane_id & (ncorrs - 1))
int n = blockIdx.x*blockDim.x + threadIdx.x;
int lane_id = threadIdx.x & (warp_size - 1);
if(n >= N)
{ return; }
// Input correlation handled by this thread
int mask = __activemask();
if(threadIdx.x == 0)
{ printf("mask %d\n", mask); }
int loads[ncorrs];
int values[ncorrs];
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{ loads[corr] = input[n + corr*N]; }
__syncthreads();
printf("[%d, %d] %d %d %d %d\n",
lane_id, base_idx(lane_id),
loads[0], loads[1],
loads[2], loads[3]);
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{
int src_lane = ((lane_id+corr)%ncorrs)*(warp_size/ncorrs) + (lane_id/ncorrs);
int src_corr = ((ncorrs-corr)+(lane_id/(warp_size/ncorrs)))%ncorrs;
int dest = (lane_id+corr)%ncorrs;
values[dest] = __shfl_sync(mask, loads[src_corr],
src_lane, warp_size);
}
printf("[%d, %d] %d %d %d %d\n",
lane_id, base_idx(lane_id),
values[0], values[1],
values[2], values[3]);
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{ output[n + corr*N] = values[corr]; }
}
void print_data(int * data, int N)
{
for(int n=0; n < N; ++n)
{
printf("% -3d: ", n);
for(int c=0; c < ncorr; ++c)
{
printf("%d ", data[n*ncorr + c]);
}
printf("\n");
}
}
int main(void)
{
int * host_input;
int * host_output;
int * device_input;
int * device_output;
int N = 32;
host_input = (int *) malloc(sizeof(int)*N*ncorr);
host_output = (int *) malloc(sizeof(int)*N*ncorr);
printf("malloc done\n");
cudaMalloc((void **) &device_input, sizeof(int)*N*ncorr);
cudaMalloc((void **) &device_output, sizeof(int)*N*ncorr);
printf("cudaMalloc done\n");
for(int i=0; i < N*ncorr; ++i)
{ host_input[i] = i; }
print_data(host_input, N);
dim3 block(256, 1, 1);
dim3 grid((block.x + N - 1) / N, 1, 1);
cudaMemcpy(device_input, host_input,
sizeof(int)*N*ncorr, cudaMemcpyHostToDevice);
printf("memcpy done\n");
kernel<4> <<<grid, block>>> (device_input, device_output, N);
cudaMemcpy(host_output, device_output,
sizeof(int)*N*ncorr, cudaMemcpyDeviceToHost);
print_data(host_output, N);
cudaFree(device_input);
cudaFree(device_output);
free(host_input);
free(host_output);
}
$ nvcc -o t352 t352.cu
$ cuda-memcheck ./t352
========= CUDA-MEMCHECK
malloc done
cudaMalloc done
0 : 0 1 2 3
1 : 4 5 6 7
2 : 8 9 10 11
3 : 12 13 14 15
4 : 16 17 18 19
5 : 20 21 22 23
6 : 24 25 26 27
7 : 28 29 30 31
8 : 32 33 34 35
9 : 36 37 38 39
10: 40 41 42 43
11: 44 45 46 47
12: 48 49 50 51
13: 52 53 54 55
14: 56 57 58 59
15: 60 61 62 63
16: 64 65 66 67
17: 68 69 70 71
18: 72 73 74 75
19: 76 77 78 79
20: 80 81 82 83
21: 84 85 86 87
22: 88 89 90 91
23: 92 93 94 95
24: 96 97 98 99
25: 100 101 102 103
26: 104 105 106 107
27: 108 109 110 111
28: 112 113 114 115
29: 116 117 118 119
30: 120 121 122 123
31: 124 125 126 127
memcpy done
mask -1
[0, 0] 0 32 64 96
[1, 0] 1 33 65 97
[2, 0] 2 34 66 98
[3, 0] 3 35 67 99
[4, 4] 4 36 68 100
[5, 4] 5 37 69 101
[6, 4] 6 38 70 102
[7, 4] 7 39 71 103
[8, 8] 8 40 72 104
[9, 8] 9 41 73 105
[10, 8] 10 42 74 106
[11, 8] 11 43 75 107
[12, 12] 12 44 76 108
[13, 12] 13 45 77 109
[14, 12] 14 46 78 110
[15, 12] 15 47 79 111
[16, 16] 16 48 80 112
[17, 16] 17 49 81 113
[18, 16] 18 50 82 114
[19, 16] 19 51 83 115
[20, 20] 20 52 84 116
[21, 20] 21 53 85 117
[22, 20] 22 54 86 118
[23, 20] 23 55 87 119
[24, 24] 24 56 88 120
[25, 24] 25 57 89 121
[26, 24] 26 58 90 122
[27, 24] 27 59 91 123
[28, 28] 28 60 92 124
[29, 28] 29 61 93 125
[30, 28] 30 62 94 126
[31, 28] 31 63 95 127
[0, 0] 0 8 16 24
[1, 0] 32 40 48 56
[2, 0] 64 72 80 88
[3, 0] 96 104 112 120
[4, 4] 1 9 17 25
[5, 4] 33 41 49 57
[6, 4] 65 73 81 89
[7, 4] 97 105 113 121
[8, 8] 2 10 18 26
[9, 8] 34 42 50 58
[10, 8] 66 74 82 90
[11, 8] 98 106 114 122
[12, 12] 3 11 19 27
[13, 12] 35 43 51 59
[14, 12] 67 75 83 91
[15, 12] 99 107 115 123
[16, 16] 4 12 20 28
[17, 16] 36 44 52 60
[18, 16] 68 76 84 92
[19, 16] 100 108 116 124
[20, 20] 5 13 21 29
[21, 20] 37 45 53 61
[22, 20] 69 77 85 93
[23, 20] 101 109 117 125
[24, 24] 6 14 22 30
[25, 24] 38 46 54 62
[26, 24] 70 78 86 94
[27, 24] 102 110 118 126
[28, 28] 7 15 23 31
[29, 28] 39 47 55 63
[30, 28] 71 79 87 95
[31, 28] 103 111 119 127
0 : 0 32 64 96
1 : 1 33 65 97
2 : 2 34 66 98
3 : 3 35 67 99
4 : 4 36 68 100
5 : 5 37 69 101
6 : 6 38 70 102
7 : 7 39 71 103
8 : 8 40 72 104
9 : 9 41 73 105
10: 10 42 74 106
11: 11 43 75 107
12: 12 44 76 108
13: 13 45 77 109
14: 14 46 78 110
15: 15 47 79 111
16: 16 48 80 112
17: 17 49 81 113
18: 18 50 82 114
19: 19 51 83 115
20: 20 52 84 116
21: 21 53 85 117
22: 22 54 86 118
23: 23 55 87 119
24: 24 56 88 120
25: 25 57 89 121
26: 26 58 90 122
27: 27 59 91 123
28: 28 60 92 124
29: 29 61 93 125
30: 30 62 94 126
31: 31 63 95 127
========= ERROR SUMMARY: 0 errors
$
I believe the above code fairly clearly demonstrates a 32x4 -> 4x32 transpose. I think it is "closest" to the code you presented. It does not do the set of 4x8 transposes you depicted in your diagrams.
I acknowledge that the calculations of src_corr, src_lane, and dest are not completely optimized. But they generate the correct indexing. I assume you can work out how to optimally generate those from the patterns you already have.
I think it's entirely possible the above code has bugs for other dimensions. I've not tried it on anything except the 32x4 case. Nevertheless, I think I have indicated what is fundamentally wrong with your code, and demonstrated a pathway to get to proper indexing.
A square matrix transpose up to 32x32 can be done at the warp level using a simpler method.
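For reference, one possible rotation-based sketch of such a warp-level 32x32 transpose (my own, untested, and not necessarily the exact method alluded to) looks like this:
// each lane of a fully active warp holds one 32-element row in row[];
// afterwards col[c] holds element 'lane' of original row c, i.e. each lane
// ends up holding one column of the original 32x32 matrix
__device__ void warp_transpose_32x32(const int (&row)[32], int (&col)[32]){
    int lane = threadIdx.x & 31;
    #pragma unroll
    for (int i = 0; i < 32; ++i){
        int src_lane = (lane + i) & 31;   // rotation: no two lanes name the same source
        // the source lane publishes its element (source_lane - i) & 31, which works
        // out to the requesting lane's index, so col[src_lane] = row_of_src_lane[lane]
        col[src_lane] = __shfl_sync(0xFFFFFFFF, row[(lane - i) & 31], src_lane);
    }
}
// note: the dynamic indexing of row[]/col[] may cause the compiler to place the
// arrays in local memory, so measure before relying on this for performance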
I have the following code that performs a tiled matrix transpose using shared memory to improve performance. The shared memory is padded with 1 column to avoid bank conflicts for a 32x32 thread block.
__global__ void transpose_tiled_padded(float *A, float *B, int n)
{
int i_in = blockDim.x*blockIdx.x + threadIdx.x;
int j_in = blockDim.y*blockIdx.y + threadIdx.y;
int i_out = blockDim.x*blockIdx.y + threadIdx.x;
int j_out = blockDim.y*blockIdx.x + threadIdx.y;
extern __shared__ float tile[];
// coalesced read of A rows to (padded) shared tile column (transpose)
tile[threadIdx.y + threadIdx.x*(blockDim.y+1)] = A[i_in + j_in*n];
__syncthreads();
// coalesced write from (padded) shared tile column to B rows
B[i_out + j_out*n] = tile[threadIdx.x + threadIdx.y*(blockDim.x+1)];
}
Running this code, I get 100% shared memory efficiency in the NVIDIA visual profiler, as I expect. But, when I run it with a 16x16 thread block, I only get 50% efficiency. Why is that? As far as I can tell, no thread in a warp reads from the same bank with this layout. Or am I mistaken?
Yes, you are mistaken.
Consider this (read) access for warp 0 in a 16x16 block:
tile[threadIdx.x + threadIdx.y*(blockDim.x+1)];
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
"index"
Here are the relevant calculations for each thread in the warp:
warp lane: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
threadIdx.x: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
threadIdx.y: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
"index": 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
bank: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 0
So we see that for this warp, the first and the last thread both read from bank 0. This results in a 2-way bank conflict, 2-way serialization, and 50% efficiency.
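If it helps to see it, the arithmetic above can be reproduced with a few lines of host code (purely illustrative) that print the bank touched by each lane of warp 0 for the 16x16 case with a row stride of blockDim.x+1 = 17 words:
#include <cstdio>
int main(){
    // warp 0 of a 16x16 block spans threadIdx.y = 0 and threadIdx.y = 1
    for (int lane = 0; lane < 32; ++lane){
        int tx = lane % 16;            // threadIdx.x
        int ty = lane / 16;            // threadIdx.y
        int index = tx + ty * 17;      // shared memory word index (stride 16+1)
        printf("lane %2d -> index %2d -> bank %2d\n", lane, index, index % 32);
    }
    return 0;
}
Lane 0 and lane 31 both land in bank 0, which is the 2-way conflict described above.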
I am working with CUDA and I am trying to stop my kernel's work (i.e. terminate all running threads) after a certain if block is hit. How can I do that? I am really stuck here.
The CUDA execution model doesn't allow for inter-block communication by design. That can make this sort of abort-on-condition operation difficult to achieve reliably without resorting to assert or trap type approaches, which can potentially result in context destruction and loss of data, which probably isn't what you want.
If your kernel design involves a small number of blocks with "resident" threads, then the only approach is some sort of atomic spinlock, which is hard to get to work reliably, and which will greatly degrade memory controller performance and achievable bandwidth.
If, on the other hand, your kernel design has rather large grids with a lot of blocks, and your main goal is to stop blocks which are not yet scheduled from running, then you could try something like this:
#include <iostream>
#include <vector>
__device__ unsigned int found_idx;
__global__ void setkernel(unsigned int *indata)
{
indata[115949] = 0xdeadbeef;
indata[119086] = 0xdeadbeef;
indata[60534] = 0xdeadbeef;
indata[37072] = 0xdeadbeef;
indata[163107] = 0xdeadbeef;
}
__global__ void searchkernel(unsigned int *indata, unsigned int *outdata)
{
if (found_idx > 0) {
return;
} else if (threadIdx.x == 0) {
outdata[blockIdx.x] = blockIdx.x;
};
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (indata[tid] == 0xdeadbeef) {
unsigned int oldval = atomicCAS(&found_idx, 0, 1+tid);
}
}
int main()
{
const unsigned int N = 1 << 19;
unsigned int* in_data;
cudaMalloc((void **)&in_data, sizeof(unsigned int) * size_t(N));
cudaMemset(in_data, 0, sizeof(unsigned int) * size_t(N));
setkernel<<<1,1>>>(in_data);
cudaDeviceSynchronize();
unsigned int block_size = 1024;
unsigned int grid_size = N / block_size;
unsigned int* out_data;
cudaMalloc((void **)&out_data, sizeof(unsigned int) * size_t(grid_size));
cudaMemset(out_data, 0xf0, sizeof(unsigned int) * size_t(grid_size));
const unsigned int zero = 0;
cudaMemcpyToSymbol(found_idx, &zero, sizeof(unsigned int));
searchkernel<<<grid_size, block_size>>>(in_data, out_data);
std::vector<unsigned int> output(grid_size);
cudaMemcpy(&output[0], out_data, sizeof(unsigned int) * size_t(grid_size), cudaMemcpyDeviceToHost);
cudaDeviceReset();
std::cout << "The following blocks did not run" << std::endl;
for(int i=0, j=0; i<grid_size; i++) {
if (output[i] == 0xf0f0f0f0) {
std::cout << " " << i;
if (j++ == 20) {
std::cout << std::endl;
j = 0;
}
}
}
std::cout << std::endl;
return 0;
}
Here I have a simple kernel which is searching for a magic word in a large array. To get the early exit behaviour, I use a single global word, which is set atomically by those threads which "win" or trigger the termination condition. Every new block checks the state of this global word, and if it is set, they return without doing any work.
If I compile and run this on a moderate sized Kepler device:
$ nvcc -arch=sm_30 -o blocking blocking.cu
$ ./blocking
The following blocks did not run
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272
273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335
336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356
357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398
399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419
420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440
441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461
462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482
483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503
504 505 506 507 508 509 510 511
you can see that a large number of blocks in the grid saw the change in the global word and terminated early without running the search code. This might be the best you can do without a severely invasive spinlock approach, which will greatly harm performance.
I assume you want to stop a running kernel (not a single thread).
The simplest approach (and the one that I suggest) is to set up a global memory flag which is tested by the kernel.
You can set the flag using cudaMemcpy() (or by writing to it directly from the host if you are using unified memory).
Like the following:
if (gm_flag) {
__threadfence(); // ensure store issued before trap
asm("trap;"); // kill kernel with error
}
asm("trap;") will stop all running threads.
Note that since compute capability 2.0 you can use assert() to terminate a kernel!
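As a rough sketch of that note (my own code, not a complete program), a failed device-side assert traps every thread that reaches it, and subsequent CUDA API calls report cudaErrorAssert:
#include <assert.h>
__device__ volatile int gm_flag;   // the flag from the snippet above, set elsewhere
__global__ void k_with_assert()
{
    // every thread that observes the flag set fails the assert, which aborts
    // the kernel; subsequent CUDA API calls report cudaErrorAssert
    assert(gm_flag == 0);
    // ... normal work ...
}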
A different approach could be the following (I haven't tried the code!)
__device__ bool go(int val){
return true;
}
__global__ void stopme(bool* flag, int* val, int size){
int idx= blockIdx.x *blockDim.x + threadIdx.x;
if(idx < size){
bool canContinue = true;
while(canContinue && (flag[0])){
printf("HELLO from %i\n",idx);
if(!(*flag)){
return;
}
else{
//do some computation
val[idx]++;
val[idx]%=100;
}
canContinue = go(val[idx]);
}
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main(void)
{
int size = 128;
int* h_val = (int*)malloc(sizeof(int)*size);
bool * h_flag = new bool;
*h_flag=true;
bool* d_flag;
cudaMalloc(&d_flag,sizeof(bool));
cudaMemcpy(d_flag,h_flag,1,cudaMemcpyHostToDevice);
int* d_val;
cudaMalloc(&d_val,sizeof(int)*size );
for(int i=0;i<size;i++){
h_val[i] = i;
}
cudaMemcpy(d_val,h_val,sizeof(int)*size,cudaMemcpyHostToDevice);
int BSIZE=32;
int nblocks =size/BSIZE;
printf("%i,%i",nblocks,BSIZE);
stopme<<<nblocks,BSIZE>>>(d_flag,d_val,size);
//--------------sleep for a while --------------------------
*h_flag=false;
cudaMemcpy(d_flag,h_flag,1,cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
gpuErrchk( cudaPeekAtLastError() );
printf("END\n");
}
where the kernel stopme keeps running until someone on the host side sets the flag to false. Note that your kernel could be much more complicated than this, and the effort to synchronize all threads in order to execute the return could be much greater (and can affect performance). Hope this helps.
I made this program to practice cudaMemcpy3D() and texture memory.
Here is the question: when I print out the tex3D data, it is not the same as the initial data. The value I get is ncrss times the initial value, and there are ncrss intervening values equal to 0 between them. If I set nsubs to 2 or larger, the factor becomes ncrss*nsubs and so does the interval.
Can you point out where I made the mistake? I think it is probably the make_cudaPitchedPtr at line 61, or the make_cudaExtent at line 56. It may also be related to the way the array is stored.
So I have come here for your help; I appreciate your comments and advice.
1 #include<stdio.h>
2 #include<stdlib.h>
3 #include<cuda_runtime.h>
4 #include<helper_functions.h>
5 #include<helper_cuda.h>
6 #ifndef MIN
7 #define MIN(A,B) ((A) < (B) ? (A) : (B))
8 #endif
9 #ifndef MAX
10 #define MAX(A,B) ((A) > (B) ? (A) : (B))
11 #endif
12
13 texture<float,cudaTextureType3D,cudaReadModeElementType> vel_tex;
14
15 __global__ void mckernel(int ntab)
16 {
17 const int biy=blockIdx.y;//sub
18 const int bix=blockIdx.x;//crs
19 const int tid=threadIdx.x;
20
21 float test;
22 test=tex3D(vel_tex,biy,bix,tid);
23 printf("test=%f,bix=%d,tid=%d\n",test,bix,tid);
24
25 }
26
27 int main()
28 {
29 int n=10;//208
30 int ntab=10;
31 int submin=1;
32 int crsmin=1;
33 int submax=1;
34 int crsmax=2;
35 int subinc=1;
36 int crsinc=1;
37
38 int ncrss,nsubs;
39 ncrss=(crsmax-crsmin)/crsinc + 1;
40 nsubs=(submax-submin)/subinc + 1;
41 dim3 BlockPerGrid(ncrss,nsubs,1);
42 dim3 ThreadPerBlock(n,1,1);
43
44 float vel[nsubs][ncrss][ntab];
45 int i,j,k;
46 for(i=0;i<nsubs;i++)
47 for(j=0;j<ncrss;j++)
48 for(k=0;k<ntab;k++)
49 vel[i][j][k]=k;
50 for(i=0;i<nsubs;i++)
51 for(j=0;j<ncrss;j++)
52 for(k=0;k<ntab;k++)
53 printf("vel[%d][%d][%d]=%f\n",i,j,k,vel[i][j][k]);
54
55 cudaChannelFormatDesc velchannelDesc=cudaCreateChannelDesc<float>();
56 cudaExtent velExtent=make_cudaExtent(nsubs,ncrss,ntab);
57 cudaArray *d_vel;
58 cudaMalloc3DArray(&d_vel,&velchannelDesc,velExtent);
59
60 cudaMemcpy3DParms velParms = {0};
61 velParms.srcPtr=make_cudaPitchedPtr((void*)vel,sizeof(float)*nsubs,nsubs,ncrss);
62 velParms.dstArray=d_vel;
63 velParms.extent=velExtent;
64 velParms.kind=cudaMemcpyHostToDevice;
65 cudaMemcpy3D(&velParms);
66
67 cudaBindTextureToArray(vel_tex,d_vel);
68
69 printf("kernel start\n");
70 cudaDeviceSynchronize();
71 mckernel<<<BlockPerGrid,ThreadPerBlock>>>(ntab);
72 printf("kernel end\n");
73
74 cudaUnbindTexture(vel_tex);
75 cudaFreeArray(d_vel);
76 cudaDeviceReset();
77 return 0 ;
78 }
Here is the printf output, with nsubs=1 and ncrss=2:
1 vel[0][0][0]=0.000000
2 vel[0][0][1]=1.000000
3 vel[0][0][2]=2.000000
4 vel[0][0][3]=3.000000
5 vel[0][0][4]=4.000000
6 vel[0][0][5]=5.000000
7 vel[0][0][6]=6.000000
8 vel[0][0][7]=7.000000
9 vel[0][0][8]=8.000000
10 vel[0][0][9]=9.000000
11 vel[0][1][0]=0.000000
12 vel[0][1][1]=1.000000
13 vel[0][1][2]=2.000000
14 vel[0][1][3]=3.000000
15 vel[0][1][4]=4.000000
16 vel[0][1][5]=5.000000
17 vel[0][1][6]=6.000000
18 vel[0][1][7]=7.000000
19 vel[0][1][8]=8.000000
20 vel[0][1][9]=9.000000
21 kernel start
22 kernel end
23 test=1.000000,bix=1,tid=0
24 test=3.000000,bix=1,tid=1
25 test=5.000000,bix=1,tid=2
26 test=7.000000,bix=1,tid=3
27 test=9.000000,bix=1,tid=4
28 test=1.000000,bix=1,tid=5
29 test=3.000000,bix=1,tid=6
30 test=5.000000,bix=1,tid=7
31 test=7.000000,bix=1,tid=8
32 test=9.000000,bix=1,tid=9
33 test=0.000000,bix=0,tid=0
34 test=2.000000,bix=0,tid=1
35 test=4.000000,bix=0,tid=2
36 test=6.000000,bix=0,tid=3
37 test=8.000000,bix=0,tid=4
38 test=0.000000,bix=0,tid=5
39 test=2.000000,bix=0,tid=6
40 test=4.000000,bix=0,tid=7
41 test=6.000000,bix=0,tid=8
42 test=8.000000,bix=0,tid=9
After a night of thinking, I found the problem.
The CUDA array dimensions are given as M[fastest][middle][slowest] (width, height, depth), while the C array is declared as M[slowest][middle][fastest].
So dim3(), make_cudaExtent(), and make_cudaPitchedPtr() should all be ordered with the fastest-varying dimension first, or at least be consistent with each other.
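For example, a corrected setup along those lines might look like the following sketch (my own, untested), with ntab as the width (fastest-varying), ncrss as the height, and nsubs as the depth:
cudaExtent velExtent = make_cudaExtent(ntab, ncrss, nsubs);   // (fastest, mid, slowest)
cudaArray *d_vel;
cudaMalloc3DArray(&d_vel, &velchannelDesc, velExtent);

cudaMemcpy3DParms velParms = {0};
velParms.srcPtr = make_cudaPitchedPtr((void*)vel,
                                      sizeof(float)*ntab,   // pitch in bytes of one row
                                      ntab,                 // width in elements
                                      ncrss);               // height in rows
velParms.dstArray = d_vel;
velParms.extent   = velExtent;
velParms.kind     = cudaMemcpyHostToDevice;
cudaMemcpy3D(&velParms);
// the kernel fetch is then indexed as (x = tab, y = crs, z = sub):
// float test = tex3D(vel_tex, tid, bix, biy);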