Why is the tex3D output data different from the initial data? - cuda

I wrote this program to practice cudaMemcpy3D() and texture memory.
Here comes the question: when I print out the tex3D data, it is not the same as the initial data. The values I get are ncrss times the initial values, and between each of them there are ncrss numbers that equal 0. If I set nsubs to 2 or some larger value, the factor becomes ncrss*nsubs and the interval becomes ncrss*nsubs as well.
Can you point out where I made the mistake? I think it is probably the make_cudaPitchedPtr at line 61 or the make_cudaExtent at line 56, and it may also be related to the way the array is stored.
So I have come here for your help; I appreciate your comments and advice.
1 #include<stdio.h>
2 #include<stdlib.h>
3 #include<cuda_runtime.h>
4 #include<helper_functions.h>
5 #include<helper_cuda.h>
6 #ifndef MIN
7 #define MIN(A,B) ((A) < (B) ? (A) : (B))
8 #endif
9 #ifndef MAX
10 #define MAX(A,B) ((A) > (B) ? (A) : (B))
11 #endif
12
13 texture<float,cudaTextureType3D,cudaReadModeElementType> vel_tex;
14
15 __global__ void mckernel(int ntab)
16 {
17 const int biy=blockIdx.y;//sub
18 const int bix=blockIdx.x;//crs
19 const int tid=threadIdx.x;
20
21 float test;
22 test=tex3D(vel_tex,biy,bix,tid);
23 printf("test=%f,bix=%d,tid=%d\n",test,bix,tid);
24
25 }
26
27 int main()
28 {
29 int n=10;//208
30 int ntab=10;
31 int submin=1;
32 int crsmin=1;
33 int submax=1;
34 int crsmax=2;
35 int subinc=1;
36 int crsinc=1;
37
38 int ncrss,nsubs;
39 ncrss=(crsmax-crsmin)/crsinc + 1;
40 nsubs=(submax-submin)/subinc + 1;
41 dim3 BlockPerGrid(ncrss,nsubs,1);
42 dim3 ThreadPerBlock(n,1,1);
43
44 float vel[nsubs][ncrss][ntab];
45 int i,j,k;
46 for(i=0;i<nsubs;i++)
47 for(j=0;j<ncrss;j++)
48 for(k=0;k<ntab;k++)
49 vel[i][j][k]=k;
50 for(i=0;i<nsubs;i++)
51 for(j=0;j<ncrss;j++)
52 for(k=0;k<ntab;k++)
53 printf("vel[%d][%d][%d]=%f\n",i,j,k,vel[i][j][k]);
54
55 cudaChannelFormatDesc velchannelDesc=cudaCreateChannelDesc<float>();
56 cudaExtent velExtent=make_cudaExtent(nsubs,ncrss,ntab);
57 cudaArray *d_vel;
58 cudaMalloc3DArray(&d_vel,&velchannelDesc,velExtent);
59
60 cudaMemcpy3DParms velParms = {0};
61 velParms.srcPtr=make_cudaPitchedPtr((void*)vel,sizeof(float)*nsubs,nsubs,ncrss);
62 velParms.dstArray=d_vel;
63 velParms.extent=velExtent;
64 velParms.kind=cudaMemcpyHostToDevice;
65 cudaMemcpy3D(&velParms);
66
67 cudaBindTextureToArray(vel_tex,d_vel);
68
69 printf("kernel start\n");
70 cudaDeviceSynchronize();
71 mckernel<<<BlockPerGrid,ThreadPerBlock>>>(ntab);
72 printf("kernel end\n");
73
74 cudaUnbindTexture(vel_tex);
75 cudaFreeArray(d_vel);
76 cudaDeviceReset();
77 return 0 ;
78 }
Here is the printf output, with nsubs=1 and ncrss=2:
vel[0][0][0]=0.000000
vel[0][0][1]=1.000000
vel[0][0][2]=2.000000
vel[0][0][3]=3.000000
vel[0][0][4]=4.000000
vel[0][0][5]=5.000000
vel[0][0][6]=6.000000
vel[0][0][7]=7.000000
vel[0][0][8]=8.000000
vel[0][0][9]=9.000000
vel[0][1][0]=0.000000
vel[0][1][1]=1.000000
vel[0][1][2]=2.000000
vel[0][1][3]=3.000000
vel[0][1][4]=4.000000
vel[0][1][5]=5.000000
vel[0][1][6]=6.000000
vel[0][1][7]=7.000000
vel[0][1][8]=8.000000
vel[0][1][9]=9.000000
kernel start
kernel end
test=1.000000,bix=1,tid=0
test=3.000000,bix=1,tid=1
test=5.000000,bix=1,tid=2
test=7.000000,bix=1,tid=3
test=9.000000,bix=1,tid=4
test=1.000000,bix=1,tid=5
test=3.000000,bix=1,tid=6
test=5.000000,bix=1,tid=7
test=7.000000,bix=1,tid=8
test=9.000000,bix=1,tid=9
test=0.000000,bix=0,tid=0
test=2.000000,bix=0,tid=1
test=4.000000,bix=0,tid=2
test=6.000000,bix=0,tid=3
test=8.000000,bix=0,tid=4
test=0.000000,bix=0,tid=5
test=2.000000,bix=0,tid=6
test=4.000000,bix=0,tid=7
test=6.000000,bix=0,tid=8
test=8.000000,bix=0,tid=9

After a night of thinking, I found the problem.
The CUDA array is addressed as M[fast][mid][slow] (x is the fastest-varying dimension), while the C array is declared as M[slow][mid][fast].
So dim3(), make_cudaExtent() and make_cudaPitchedPtr() should all be given the dimensions in the same fastest-to-slowest order, or at least be consistent with each other.
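For the record, here is a minimal sketch (not the full program) of the ordering that works for the vel[nsubs][ncrss][ntab] host array above: the fastest dimension (ntab) is given first to both make_cudaExtent() and make_cudaPitchedPtr(), and the texture fetch uses the fastest index first.
cudaExtent velExtent = make_cudaExtent(ntab, ncrss, nsubs);   // width = fastest dim, depth = slowest
cudaArray *d_vel;
cudaMalloc3DArray(&d_vel, &velchannelDesc, velExtent);

cudaMemcpy3DParms velParms = {0};
// pitch of one row in bytes, row width in elements, number of rows
velParms.srcPtr   = make_cudaPitchedPtr((void*)vel, ntab*sizeof(float), ntab, ncrss);
velParms.dstArray = d_vel;
velParms.extent   = velExtent;
velParms.kind     = cudaMemcpyHostToDevice;
cudaMemcpy3D(&velParms);

// in the kernel, fetch with the fastest index first:
// float test = tex3D(vel_tex, tid, bix, biy);   // corresponds to vel[biy][bix][tid]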

Related

Basic CUDA load and warp transpose

I want to implement a basic blocked load and warp transpose using CUDA 9.0's shuffle operations. I'm aware of the cub and trove implementations, but I'm restricted to compiling with nvrtc and the standard header includes make these libraries difficult to cater for. I'm not looking for anything fancy, just some integer, float and double shuffles on data with dimension a power of 2.
Visualising an example with warp size 8, I want to go from:
correlation
0 1 2 3
lane 0 0 8 16 24
lane 1 1 9 17 25
lane 2 2 10 18 26
lane 3 3 11 19 27
lane 4 4 12 20 28
lane 5 5 13 21 29
lane 6 6 14 22 30
lane 7 7 15 23 31
to this structure:
correlation
0 1 2 3
lane 0 0 1 2 3
lane 1 8 9 10 11
lane 2 16 17 18 19
lane 3 24 25 26 27
lane 4 4 5 6 7
lane 5 12 13 14 15
lane 6 20 21 22 23
lane 7 28 29 30 31
I feel this should be really simple but I can't figure out what I've done incorrectly. I think that the basic transposition loop should look like:
int loads[ncorrs];
int values[ncorrs];
int lane_id = threadIdx.x & (warp_size - 1);
// 0 0 0 0 4 4 4 4 8 8 8 8 ....
int base_idx = lane_id & (warp_size - ncorrs);
// 0 1 2 3 0 1 2 3 0 1 2 3
int src_corr = lane_id & (ncorrs - 1);
for(int corr=0; corr < ncorrs; ++corr)
{
int src_lane = base_idx + corr;
values[corr] = __shfl_sync(mask, loads[src_corr],
src_lane, warp_size);
}
So given the example data above, if we're in lane 5, I expect that the following indexing should occur:
base_idx == 4;
src_corr == 1;
corr == [0, 1, 2, 3]
src_lane == [4, 5, 6, 7]
values == [12, 13, 14 15]
But instead the following is happening (33's are from later in the data):
correlation
0 1 2 3
lane 0 0 0 0 0
lane 1 4 4 4 4
lane 2 12 12 12 12
lane 3 16 16 16 16
lane 4 20 20 20 20
lane 5 24 24 24 24
lane 6 28 28 28 28
lane 7 33 33 33 33
What am I doing incorrectly? Full implementation for a warp size of 32:
#include <cstdlib>
#include <cstdio>
#include "cuda.h"
#define ncorr 4
#define warp_size 32
template <int ncorrs>
__global__ void kernel(
int * input,
int * output,
int N)
{
// This should provide 0 0 0 0 4 4 4 4 8 8 8 8 ...
#define base_idx(lane_id) (lane_id & (warp_size - ncorrs))
// This should provide 0 1 2 3 0 1 2 3 0 1 2 3
#define corr_idx(lane_id) (lane_id & (ncorrs - 1))
int n = blockIdx.x*blockDim.x + threadIdx.x;
int lane_id = threadIdx.x & (warp_size - 1);
if(n >= N)
{ return; }
// Input correlation handled by this thread
int src_corr = corr_idx(lane_id);
int mask = __activemask();
if(threadIdx.x == 0)
{ printf("mask %d\n", mask); }
int loads[ncorrs];
int values[ncorrs];
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{ loads[corr] = input[n + corr*N]; }
__syncthreads();
printf("[%d, %d] %d %d %d %d\n",
lane_id, base_idx(lane_id),
loads[0], loads[1],
loads[2], loads[3]);
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{
int src_lane = base_idx(lane_id) + corr;
values[corr] = __shfl_sync(mask, loads[src_corr],
src_lane, warp_size);
}
printf("[%d, %d] %d %d %d %d\n",
lane_id, base_idx(lane_id),
values[0], values[1],
values[2], values[3]);
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{ output[n + corr*N] = values[corr]; }
}
void print_data(int * data, int N)
{
for(int n=0; n < N; ++n)
{
printf("% -3d: ", n);
for(int c=0; c < ncorr; ++c)
{
printf("%d ", data[n*ncorr + c]);
}
printf("\n");
}
}
int main(void)
{
int * host_input;
int * host_output;
int * device_input;
int * device_output;
int N = 32;
host_input = (int *) malloc(sizeof(int)*N*ncorr);
host_output = (int *) malloc(sizeof(int)*N*ncorr);
printf("malloc done\n");
cudaMalloc((void **) &device_input, sizeof(int)*N*ncorr);
cudaMalloc((void **) &device_output, sizeof(int)*N*ncorr);
printf("cudaMalloc done\n");
for(int i=0; i < N*ncorr; ++i)
{ host_input[i] = i; }
print_data(host_input, N);
dim3 block(256, 1, 1);
dim3 grid((block.x + N - 1) / N, 1, 1);
cudaMemcpy(device_input, host_input,
sizeof(int)*N*ncorr, cudaMemcpyHostToDevice);
printf("memcpy done\n");
kernel<4> <<<grid, block>>> (device_input, device_output, N);
cudaMemcpy(host_output, device_output,
sizeof(int)*N*ncorr, cudaMemcpyDeviceToHost);
print_data(host_output, N);
cudaFree(device_input);
cudaFree(device_output);
free(host_input);
free(host_output);
}
Edit 1: Clarified that the visual example has a warp size of 8 while the full code caters for a warp size of 32
What am I doing incorrectly?
TL;DR: In short, you are transmitting the same input value to multiple output values. Here is one example, in this line of code:
values[corr] = __shfl_sync(mask, loads[src_corr],
src_lane, warp_size);
The quantity represented by loads[src_corr] is loop-invariant. Therefore you are transmitting that value to 4 warp lanes (over the 4 loop iterations) which means that value is occupying 4 output values (which is exactly what your printout data shows). That can't be right for a transpose.
Taking a somewhat longer view, with another example from your code:
I'm not sure I can read your mind, but possibly you may be confused about the warp shuffle operation. Possibly you have assumed that the destination lane can choose which value from the source lane loads[] array is desired. This is not the case. The destination lane only gets to select whatever is the value provided by the source lane. Let's take a look at your loop:
// This should provide 0 0 0 0 4 4 4 4 8 8 8 8 ...
#define base_idx(lane_id) (lane_id & (warp_size - ncorrs))
// This should provide 0 1 2 3 0 1 2 3 0 1 2 3
#define corr_idx(lane_id) (lane_id & (ncorrs - 1))
int n = blockIdx.x*blockDim.x + threadIdx.x;
int lane_id = threadIdx.x & (warp_size - 1);
...
// Input correlation handled by this thread
int src_corr = corr_idx(lane_id);
int mask = __activemask();
...
int loads[ncorrs];
int values[ncorrs];
...
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{
int src_lane = base_idx(lane_id) + corr;
values[corr] = __shfl_sync(mask, loads[src_corr], src_lane, warp_size);
}
On the first pass of the above loop, the src_lane for warp lanes 0, 1, 2, and 3 are all going to be 0. This is evident from the above excerpted code, or print it out if you're not sure. That means warp lanes 0-3 are going to be requesting whatever value is provided by warp lane 0. The value provided by warp lane 0 is loads[src_corr], but the interpretation of src_corr here is whatever value it has for warp lane 0. Therefore one and only one value will be distributed to warp lanes 0-3. This could not possibly be correct for a transpose; no input value shows up in 4 places in the output.
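As an aside, here is a minimal standalone sketch (my assumptions: a full warp of 32 active threads and CUDA 9 or later; this is illustration code, not taken from your program) that isolates the shuffle semantics. Every lane asks for lane 0's value, so every lane receives the same thing, regardless of what it passed in itself:
#include <cstdio>

__global__ void shfl_demo()
{
    int lane = threadIdx.x & 31;
    int my_value = lane * 10;                  // each lane publishes its own value
    // every lane requests lane 0, so all 32 lanes receive 0
    int got = __shfl_sync(0xffffffffu, my_value, 0, 32);
    if (lane < 4) printf("lane %d received %d\n", lane, got);
}

int main()
{
    shfl_demo<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}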
To fix this, we will need to modify the calculation both of src_lane and src_corr. We will also need to modify the storage location (index) per warp lane, at each pass of the loop (I'm calling this new variable dest). We can think of src_lane as defining the target value that my thread will receive. We can think of src_corr as defining which of my values I will publish to some other thread, on that loop iteration. dest is the location in my values[] array where I will store the currently received value. We can deduce the necessary pattern by carefully studying the relationship between the input value in loads[] and the desired output location in values[], taking into account the appropriate warp lanes for source and destination. On the first pass of the loop, we desire this pattern:
warp lane: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
src_lane: 0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27 4 ... (where my data comes from)
src_corr: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 ... (which value I am transmitting)
dest: 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 ... (where I store the received value)
On the second pass of the loop, we desire this pattern:
warp lane: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
src_lane: 8 16 24 0 9 17 25 1 10 18 26 2 11 19 27 3 19 ... (where my data comes from)
src_corr: 3 3 3 3 3 3 3 3 0 0 0 0 0 0 0 0 1 ... (which value I am transmitting)
dest: 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 ... (where I store the received value)
with corresponding changes for the 3rd and 4th pass of the loop. If we realize those patterns in code for your shuffle loop, it could look something like this:
$ cat t352.cu
#include <cstdlib>
#include <cstdio>
#include <assert.h>
#define ncorr 4
#define warp_size 32
template <int ncorrs>
__global__ void kernel(
int * input,
int * output,
int N)
{
// This should provide 0 0 0 0 4 4 4 4 8 8 8 8 ...
#define base_idx(lane_id) (lane_id & (warp_size - ncorrs))
// This should provide 0 1 2 3 0 1 2 3 0 1 2 3
#define corr_idx(lane_id) (lane_id & (ncorrs - 1))
int n = blockIdx.x*blockDim.x + threadIdx.x;
int lane_id = threadIdx.x & (warp_size - 1);
if(n >= N)
{ return; }
// Input correlation handled by this thread
int mask = __activemask();
if(threadIdx.x == 0)
{ printf("mask %d\n", mask); }
int loads[ncorrs];
int values[ncorrs];
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{ loads[corr] = input[n + corr*N]; }
__syncthreads();
printf("[%d, %d] %d %d %d %d\n",
lane_id, base_idx(lane_id),
loads[0], loads[1],
loads[2], loads[3]);
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{
int src_lane = ((lane_id+corr)%ncorrs)*(warp_size/ncorrs) + (lane_id/ncorrs);
int src_corr = ((ncorrs-corr)+(lane_id/(warp_size/ncorrs)))%ncorrs;
int dest = (lane_id+corr)%ncorrs;
values[dest] = __shfl_sync(mask, loads[src_corr],
src_lane, warp_size);
}
printf("[%d, %d] %d %d %d %d\n",
lane_id, base_idx(lane_id),
values[0], values[1],
values[2], values[3]);
#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{ output[n + corr*N] = values[corr]; }
}
void print_data(int * data, int N)
{
for(int n=0; n < N; ++n)
{
printf("% -3d: ", n);
for(int c=0; c < ncorr; ++c)
{
printf("%d ", data[n*ncorr + c]);
}
printf("\n");
}
}
int main(void)
{
int * host_input;
int * host_output;
int * device_input;
int * device_output;
int N = 32;
host_input = (int *) malloc(sizeof(int)*N*ncorr);
host_output = (int *) malloc(sizeof(int)*N*ncorr);
printf("malloc done\n");
cudaMalloc((void **) &device_input, sizeof(int)*N*ncorr);
cudaMalloc((void **) &device_output, sizeof(int)*N*ncorr);
printf("cudaMalloc done\n");
for(int i=0; i < N*ncorr; ++i)
{ host_input[i] = i; }
print_data(host_input, N);
dim3 block(256, 1, 1);
dim3 grid((block.x + N - 1) / N, 1, 1);
cudaMemcpy(device_input, host_input,
sizeof(int)*N*ncorr, cudaMemcpyHostToDevice);
printf("memcpy done\n");
kernel<4> <<<grid, block>>> (device_input, device_output, N);
cudaMemcpy(host_output, device_output,
sizeof(int)*N*ncorr, cudaMemcpyDeviceToHost);
print_data(host_output, N);
cudaFree(device_input);
cudaFree(device_output);
free(host_input);
free(host_output);
}
$ nvcc -o t352 t352.cu
$ cuda-memcheck ./t352
========= CUDA-MEMCHECK
malloc done
cudaMalloc done
0 : 0 1 2 3
1 : 4 5 6 7
2 : 8 9 10 11
3 : 12 13 14 15
4 : 16 17 18 19
5 : 20 21 22 23
6 : 24 25 26 27
7 : 28 29 30 31
8 : 32 33 34 35
9 : 36 37 38 39
10: 40 41 42 43
11: 44 45 46 47
12: 48 49 50 51
13: 52 53 54 55
14: 56 57 58 59
15: 60 61 62 63
16: 64 65 66 67
17: 68 69 70 71
18: 72 73 74 75
19: 76 77 78 79
20: 80 81 82 83
21: 84 85 86 87
22: 88 89 90 91
23: 92 93 94 95
24: 96 97 98 99
25: 100 101 102 103
26: 104 105 106 107
27: 108 109 110 111
28: 112 113 114 115
29: 116 117 118 119
30: 120 121 122 123
31: 124 125 126 127
memcpy done
mask -1
[0, 0] 0 32 64 96
[1, 0] 1 33 65 97
[2, 0] 2 34 66 98
[3, 0] 3 35 67 99
[4, 4] 4 36 68 100
[5, 4] 5 37 69 101
[6, 4] 6 38 70 102
[7, 4] 7 39 71 103
[8, 8] 8 40 72 104
[9, 8] 9 41 73 105
[10, 8] 10 42 74 106
[11, 8] 11 43 75 107
[12, 12] 12 44 76 108
[13, 12] 13 45 77 109
[14, 12] 14 46 78 110
[15, 12] 15 47 79 111
[16, 16] 16 48 80 112
[17, 16] 17 49 81 113
[18, 16] 18 50 82 114
[19, 16] 19 51 83 115
[20, 20] 20 52 84 116
[21, 20] 21 53 85 117
[22, 20] 22 54 86 118
[23, 20] 23 55 87 119
[24, 24] 24 56 88 120
[25, 24] 25 57 89 121
[26, 24] 26 58 90 122
[27, 24] 27 59 91 123
[28, 28] 28 60 92 124
[29, 28] 29 61 93 125
[30, 28] 30 62 94 126
[31, 28] 31 63 95 127
[0, 0] 0 8 16 24
[1, 0] 32 40 48 56
[2, 0] 64 72 80 88
[3, 0] 96 104 112 120
[4, 4] 1 9 17 25
[5, 4] 33 41 49 57
[6, 4] 65 73 81 89
[7, 4] 97 105 113 121
[8, 8] 2 10 18 26
[9, 8] 34 42 50 58
[10, 8] 66 74 82 90
[11, 8] 98 106 114 122
[12, 12] 3 11 19 27
[13, 12] 35 43 51 59
[14, 12] 67 75 83 91
[15, 12] 99 107 115 123
[16, 16] 4 12 20 28
[17, 16] 36 44 52 60
[18, 16] 68 76 84 92
[19, 16] 100 108 116 124
[20, 20] 5 13 21 29
[21, 20] 37 45 53 61
[22, 20] 69 77 85 93
[23, 20] 101 109 117 125
[24, 24] 6 14 22 30
[25, 24] 38 46 54 62
[26, 24] 70 78 86 94
[27, 24] 102 110 118 126
[28, 28] 7 15 23 31
[29, 28] 39 47 55 63
[30, 28] 71 79 87 95
[31, 28] 103 111 119 127
0 : 0 32 64 96
1 : 1 33 65 97
2 : 2 34 66 98
3 : 3 35 67 99
4 : 4 36 68 100
5 : 5 37 69 101
6 : 6 38 70 102
7 : 7 39 71 103
8 : 8 40 72 104
9 : 9 41 73 105
10: 10 42 74 106
11: 11 43 75 107
12: 12 44 76 108
13: 13 45 77 109
14: 14 46 78 110
15: 15 47 79 111
16: 16 48 80 112
17: 17 49 81 113
18: 18 50 82 114
19: 19 51 83 115
20: 20 52 84 116
21: 21 53 85 117
22: 22 54 86 118
23: 23 55 87 119
24: 24 56 88 120
25: 25 57 89 121
26: 26 58 90 122
27: 27 59 91 123
28: 28 60 92 124
29: 29 61 93 125
30: 30 62 94 126
31: 31 63 95 127
========= ERROR SUMMARY: 0 errors
$
I believe the above code fairly clearly demonstrates a 32x4 -> 4x32 transpose. I think it is "closest" to the code you presented. It does not do the set of 4x8 transposes you depicted in your diagrams.
I acknowledge that the calculations of src_corr, src_lane, and dest are not completely optimized. But they generate the correct indexing. I assume you can work out how to optimally generate those from the patterns you already have.
I think it's entirely possible the above code has bugs for other dimensions. I've not tried it on anything except the 32x4 case. Nevertheless I think I have indicated what is fundamentally wrong with your code, and demonstrated a pathway to get to proper indexing.
A square matrix transpose up to 32x32 can be done at the warp level using a simpler method

CUDA shared memory efficiency at 50%?

I have the following code that performs a tiled matrix transpose using shared memory to improve performance. The shared memory is padded with 1 column to avoid bank conflict for a 32x32 thread block.
__global__ void transpose_tiled_padded(float *A, float *B, int n)
{
int i_in = blockDim.x*blockIdx.x + threadIdx.x;
int j_in = blockDim.y*blockIdx.y + threadIdx.y;
int i_out = blockDim.x*blockIdx.y + threadIdx.x;
int j_out = blockDim.y*blockIdx.x + threadIdx.y;
extern __shared__ float tile[];
// coalesced read of A rows to (padded) shared tile column (transpose)
tile[threadIdx.y + threadIdx.x*(blockDim.y+1)] = A[i_in + j_in*n];
__syncthreads();
// coalesced write from (padded) shared tile column to B rows
B[i_out + j_out*n] = tile[threadIdx.x + threadIdx.y*(blockDim.x+1)];
}
Running this code, I get 100% shared memory efficiency in the NVIDIA visual profiler, as I expect. But, when I run it with a 16x16 thread block, I only get 50% efficiency. Why is that? As far as I can tell, no thread in a warp reads from the same bank with this layout. Or am I mistaken?
Yes, you are mistaken.
Considering this (read) access for warp 0 in a 16x16 block:
tile[threadIdx.x + threadIdx.y*(blockDim.x+1)];
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
"index"
Here are the relevant calculations for each thread in the warp:
warp lane: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
threadIdx.x: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
threadIdx.y: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
"index": 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
bank: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 0
So we see that for this warp, the first and the last thread both read from bank 0. This results in a 2-way bank conflict, 2-way serialization, and 50% efficiency.
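If you want to see the mapping without the profiler, a small host-side sketch (assuming the usual layout of 32 banks of 4-byte words; this is illustration code, not part of the original kernel) reproduces the "index" and "bank" rows above for warp 0 of a 16x16 block:
#include <cstdio>

int main()
{
    const int blockDimX = 16, warpSize = 32, numBanks = 32;
    for (int lane = 0; lane < warpSize; ++lane) {
        int tx = lane % blockDimX;               // threadIdx.x in a 16x16 block
        int ty = lane / blockDimX;               // threadIdx.y
        int index = tx + ty * (blockDimX + 1);   // padded-tile index used in the read
        printf("lane %2d -> index %2d, bank %2d\n", lane, index, index % numBanks);
    }
    return 0;                                    // lane 0 and lane 31 both print bank 0
}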

Is it possible to terminate CUDA kernel from host? [duplicate]

I am working with CUDA and I am trying to stop my kernel's work (i.e. terminate all running threads) after a certain if block is hit. How can I do that? I am really stuck here.
The CUDA execution model doesn't allow for inter-block communication by design. That can make this sort of abort-on-condition operation difficult to achieve reliably without resorting to assert- or trap-type approaches, which can potentially result in context destruction and loss of data, which probably isn't what you want.
If your kernel design involves a small number of blocks with "resident" threads, then the only approach is some sort of atomic spinlock, which is hard to get to work reliably, and which will greatly degrade memory controller performance and achievable bandwidth.
If, on the other hand, your kernel design has rather large grids with a lot of blocks, and your main goal is to stop blocks which are not yet scheduled from running, then you could try something like this:
#include <iostream>
#include <vector>
__device__ unsigned int found_idx;
__global__ void setkernel(unsigned int *indata)
{
indata[115949] = 0xdeadbeef;
indata[119086] = 0xdeadbeef;
indata[60534] = 0xdeadbeef;
indata[37072] = 0xdeadbeef;
indata[163107] = 0xdeadbeef;
}
__global__ void searchkernel(unsigned int *indata, unsigned int *outdata)
{
if (found_idx > 0) {
return;
} else if (threadIdx.x == 0) {
outdata[blockIdx.x] = blockIdx.x;
};
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (indata[tid] == 0xdeadbeef) {
unsigned int oldval = atomicCAS(&found_idx, 0, 1+tid);
}
}
int main()
{
const unsigned int N = 1 << 19;
unsigned int* in_data;
cudaMalloc((void **)&in_data, sizeof(unsigned int) * size_t(N));
cudaMemset(in_data, 0, sizeof(unsigned int) * size_t(N));
setkernel<<<1,1>>>(in_data);
cudaDeviceSynchronize();
unsigned int block_size = 1024;
unsigned int grid_size = N / block_size;
unsigned int* out_data;
cudaMalloc((void **)&out_data, sizeof(unsigned int) * size_t(grid_size));
cudaMemset(out_data, 0xf0, sizeof(unsigned int) * size_t(grid_size));
const unsigned int zero = 0;
cudaMemcpyToSymbol(found_idx, &zero, sizeof(unsigned int));
searchkernel<<<grid_size, block_size>>>(in_data, out_data);
std::vector<unsigned int> output(grid_size);
cudaMemcpy(&output[0], out_data, sizeof(unsigned int) * size_t(grid_size), cudaMemcpyDeviceToHost);
cudaDeviceReset();
std::cout << "The following blocks did not run" << std::endl;
for(int i=0, j=0; i<grid_size; i++) {
if (output[i] == 0xf0f0f0f0) {
std::cout << " " << i;
if (j++ == 20) {
std::cout << std::endl;
j = 0;
}
}
}
std::cout << std::endl;
return 0;
}
Here I have a simple kernel which is searching for a magic word in a large array. To get the early-exit behaviour, I use a single global word, which is set atomically by those threads which "win" or trigger the termination condition. Every new block checks the state of this global word, and if it is set, its threads return without doing any work.
If I compile and run this on a moderate sized Kepler device:
$ nvcc -arch=sm_30 -o blocking blocking.cu
$ ./blocking
The following blocks did not run
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272
273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335
336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356
357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398
399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419
420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440
441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461
462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482
483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503
504 505 506 507 508 509 510 511
you can see that a large number of blocks in the grid saw the change in the global word and terminated early without running the search code. This might be the best you can do without a severely invasive spinlock approach, which would greatly harm performance.
I assume you want to stop a running kernel (not a single thread).
The simplest approach (and the one that I suggest) is to set up a global memory flag which is tested by the kernel.
You can set the flag using cudaMemcpy() (or directly, if using unified memory).
Like the following:
if (gm_flag) {
__threadfence(); // ensure store issued before trap
asm("trap;"); // kill kernel with error
}
ams("trap;") will stop all running thread
Note that since cuda 2.0 you can use assert() to terminate a kernel!
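For illustration, here is a minimal sketch of the assert() route (assumptions: a device of compute capability 2.x or later, and a hypothetical flag that is simply passed in already set, rather than being flipped while the kernel runs):
#include <cassert>
#include <cstdio>

__global__ void assert_demo(const int *flag)
{
    // a failing device-side assert traps the kernel and surfaces an error on the host
    assert(flag[0] == 0);
}

int main()
{
    int *d_flag, one = 1;
    cudaMalloc(&d_flag, sizeof(int));
    cudaMemcpy(d_flag, &one, sizeof(int), cudaMemcpyHostToDevice);
    assert_demo<<<4, 64>>>(d_flag);
    cudaError_t err = cudaDeviceSynchronize();   // reports the assert as an error
    printf("kernel returned: %s\n", cudaGetErrorString(err));
    cudaFree(d_flag);
    return 0;
}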
A different approach could be the following (I haven't tried the code!)
__device__ bool go(int val){
return true;
}
__global__ void stopme(bool* flag, int* val, int size){
int idx= blockIdx.x *blockDim.x + threadIdx.x;
if(idx < size){
bool canContinue = true;
while(canContinue && (flag[0])){
printf("HELLO from %i\n",idx);
if(!(*flag)){
return;
}
else{
//do some computation
val[idx]++;
val[idx]%=100;
}
canContinue = go(val[idx]);
}
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main(void)
{
int size = 128;
int* h_val = (int*)malloc(sizeof(int)*size);
bool * h_flag = new bool;
*h_flag=true;
bool* d_flag;
cudaMalloc(&d_flag,sizeof(bool));
cudaMemcpy(d_flag,h_flag,1,cudaMemcpyHostToDevice);
int* d_val;
cudaMalloc(&d_val,sizeof(int)*size );
for(int i=0;i<size;i++){
h_val[i] = i;
}
cudaMemcpy(d_val,h_val,sizeof(int)*size,cudaMemcpyHostToDevice);
int BSIZE=32;
int nblocks =size/BSIZE;
printf("%i,%i",nblocks,BSIZE);
stopme<<<nblocks,BSIZE>>>(d_flag,d_val,size);
//--------------sleep for a while --------------------------
*h_flag=false;
cudaMemcpy(d_flag,h_flag,1,cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
gpuErrchk( cudaPeekAtLastError() );
printf("END\n");
}
where the kernel stopme keeps running until someone on the host side sets the flag to false. Note that your kernel could be much more complicated than this, and the effort needed to synchronize all threads in order to execute the return could be much greater (and can affect performance). Hope this helps.
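One practical wrinkle with the sketch above: a plain cudaMemcpy issued on the legacy default stream will wait for the running kernel to finish, so the updated flag may never reach the device. A variant I would try instead (again untested here; it assumes the device supports mapped pinned memory, and worker is a stand-in kernel) lets the kernel poll a zero-copy flag that the host can write at any time:
#include <cstdio>

__global__ void worker(volatile int *flag)
{
    while (*flag == 0) { /* do some work */ }
}

int main()
{
    int *h_flag, *d_flag;
    // older setups may need cudaSetDeviceFlags(cudaDeviceMapHost) before this
    cudaHostAlloc(&h_flag, sizeof(int), cudaHostAllocMapped);
    *h_flag = 0;
    cudaHostGetDevicePointer((void **)&d_flag, h_flag, 0);
    worker<<<1, 32>>>(d_flag);
    //--------------sleep for a while --------------------------
    *h_flag = 1;                  // visible to the GPU through the mapped allocation
    cudaDeviceSynchronize();
    printf("kernel exited\n");
    cudaFreeHost(h_flag);
    return 0;
}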
More info here

How to apply dynamic pagination for duplicate records in MySQL

Eventid Scheduleid skill name
1 5 5 x
2 8 7 x
3 25 8 x
4 27 9 x
5 26 18 x
6 29 19 x
7 30 20 x
8 31 21 x
8 31 22 x
9 32 23 x
10 33 24 x
11 34 26 x
11 34 26 x
11 34 26 x
12 35 27 x
13 36 29 x
14 37 30 x
15 38 31 x
16 39 32 x
17 40 33 x
18 41 34 x
19 42 35 x
20 43 36 x
21 44 37 x
22 44 37 x
This is my data. I want to apply pagination such that rows with the same Eventid and Scheduleid count as a single record, and 10 such records are displayed per hit. So the first hit should return more than 10 physical rows: for example, event 8 contributes two rows with the same Eventid and Scheduleid but different skill values, which should count as one record, and the same happens for event 11 and again around events 21 and 22. Please suggest how to apply pagination for this requirement.
The expected output should look like this.
Calling the procedure with page 0 (page 0 meaning the offset for the first 10 records):
Eventid Scheduleid skill name
1 5 5 x
2 8 7 x
3 25 8 x
4 27 9 x
5 26 18 x
6 29 19 x
7 30 20 x
8 31 21 x
8 31 22 x
9 32 23 x
10 33 24 x
11 34 26 x
11 34 26 x
11 34 26 x
Now if the page is 1, the same procedure should return:
12 35 27 x
13 36 29 x
14 37 30 x
15 38 31 x
16 39 32 x
17 40 33 x
18 41 34 x
19 42 35 x
20 43 36 x
21 44 37 x
22 44 37 x

Code-golf: Output multiplication table to the Console

I recently pointed a student doing work experience to an article about dumping a multiplication table to the console. It used a nested for loop and multiplied the step value of each.
This looked like a .NET 2.0 approach. I was wondering, with the use of LINQ and extension methods, for example, how many lines of code it would take to achieve the same result.
Is the stackoverflow community up to the challenge?
The challenge:
In a console application, write code to generate a table like this example:
01 02 03 04 05 06 07 08 09
02 04 06 08 10 12 14 16 18
03 06 09 12 15 18 21 24 27
04 08 12 16 20 24 28 32 36
05 10 15 20 25 30 35 40 45
06 12 18 24 30 36 42 48 54
07 14 21 28 35 42 49 56 63
08 16 24 32 40 48 56 64 72
09 18 27 36 45 54 63 72 81
As this turned into a language-agnostic code-golf battle, I'll go with the community's decision about which is the best solution for the accepted answer.
There's been a lot of talk about the spec and the format that the table should be in. I purposefully added the 00 format, but the double new-line was originally only there because I didn't know how to format the text when creating the post!
J - 8 chars - 24 chars for proper format
*/~1+i.9
Gives:
1 2 3 4 5 6 7 8 9
2 4 6 8 10 12 14 16 18
3 6 9 12 15 18 21 24 27
4 8 12 16 20 24 28 32 36
5 10 15 20 25 30 35 40 45
6 12 18 24 30 36 42 48 54
7 14 21 28 35 42 49 56 63
8 16 24 32 40 48 56 64 72
9 18 27 36 45 54 63 72 81
This solution found by #earl:
'r(0)q( )3.'8!:2*/~1+i.9
Gives:
01 02 03 04 05 06 07 08 09
02 04 06 08 10 12 14 16 18
03 06 09 12 15 18 21 24 27
04 08 12 16 20 24 28 32 36
05 10 15 20 25 30 35 40 45
06 12 18 24 30 36 42 48 54
07 14 21 28 35 42 49 56 63
08 16 24 32 40 48 56 64 72
09 18 27 36 45 54 63 72 81
MATLAB - 10 characters
a=1:9;a'*a
... or 33 characters for stricter output format
a=1:9;disp(num2str(a'*a,'%.2d '))
Brainf**k - 185 chars
>---------[++++++++++>---------[+<[-<+>>+++++++++[->+>>---------[>-<++++++++++<]<[>]>>+<<<<]>[-<+>]<---------<]<[->+<]>>>>++++[-<++++>]<[->++>+++>+++<<<]>>>[.[-]<]<]++++++++++.[-<->]<+]
cat - 252 characters
01 02 03 04 05 06 07 08 09
02 04 06 08 10 12 14 16 18
03 06 09 12 15 18 21 24 27
04 08 12 16 20 24 28 32 36
05 10 15 20 25 30 35 40 45
06 12 18 24 30 36 42 48 54
07 14 21 28 35 42 49 56 63
08 16 24 32 40 48 56 64 72
09 18 27 36 45 54 63 72 81
Assuming that a trailing newline is wanted; otherwise, 251 chars.
* runs *
Python - 61 chars
r=range(1,10)
for y in r:print"%02d "*9%tuple(y*x for x in r)
C#
This is only 2 lines. It uses lambdas not extension methods
var nums = new List<int>() { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
nums.ForEach(n => { nums.ForEach(n2 => Console.Write((n * n2).ToString("00 "))); Console.WriteLine(); });
and of course it could be done in one long unreadable line
new List<int>() { 1, 2, 3, 4, 5, 6, 7, 8, 9 }.ForEach(n => { new List<int>() { 1, 2, 3, 4, 5, 6, 7, 8, 9 }.ForEach(n2 => Console.Write((n * n2).ToString("00 "))); Console.WriteLine(); });
all of this is assuming you consider a lambda one line?
K - 12 characters
Let's take the rosetta-stoning seriously, and compare Kdb+'s K4 with the canonical J solution (*/~1+i.9):
a*/:\:a:1+!9
1 2 3 4 5 6 7 8 9
2 4 6 8 10 12 14 16 18
3 6 9 12 15 18 21 24 27
4 8 12 16 20 24 28 32 36
5 10 15 20 25 30 35 40 45
6 12 18 24 30 36 42 48 54
7 14 21 28 35 42 49 56 63
8 16 24 32 40 48 56 64 72
9 18 27 36 45 54 63 72 81
J's "table" operator (/) equals the K "each-left each-right" (/:\:) idiom. We don't have J's extremely handy "reflexive" operator (~) in K, so we have to pass a as both left and right argument.
Fortran95 - 40 chars (beating perl by 4 chars!)
This solution does print the leading zeros as per the spec.
print"(9(i3.2))",((i*j,i=1,9),j=1,9);end
Oracle SQL, 103 characters:
select n, n*2, n*3, n*4, n*5, n*6, n*7, n*8, n*9 from (select rownum n from dual CONNECT BY LEVEL < 10)
C# - 117, 113, 99, 96, 95 89 characters
updated based on NickLarsen's idea
for(int x=0,y;++x<10;)
for(y=x;y<x*10;y+=x)
Console.Write(y.ToString(y<x*9?"00 ":"00 \n"));
99, 85, 82 81 characters
... If you don't care about the leading zeros and would allow tabs for alignment.
for(int x=0,y;++x<10;)
{
var w="";
for(y=1;++y<10;)
w+=x*y+" ";
Console.WriteLine(w);
}
COBOL - 218 chars -> 216 chars
PROGRAM-ID.P.DATA DIVISION.WORKING-STORAGE SECTION.
1 I PIC 9.
1 N PIC 99.
PROCEDURE DIVISION.PERFORM 9 TIMES
ADD 1 TO I
SET N TO I
PERFORM 9 TIMES
DISPLAY N' 'NO ADVANCING
ADD I TO N
END-PERFORM
DISPLAY''
END-PERFORM.
Edit
216 chars (probably a different compiler)
PROGRAM-ID.P.DATA DIVISION.WORKING-STORAGE SECTION.
1 I PIC 9.
1 N PIC 99.
PROCEDURE DIVISION.
PERFORM B 9 TIMES
STOP RUN.
B.
ADD 1 TO I
set N to I
PERFORM C 9 TIMES
DISPLAY''.
C.
DISPLAY N" "NO ADVANCING
Add I TO N.
Not really a one-liner, but the shortest LINQ I can think of:
var r = Enumerable.Range(1, 9);
foreach (var z in r.Select(n => r.Select(m => n * m)).Select(a => a.Select(b => b.ToString("00 "))))
{
foreach (var q in z)
Console.Write(q);
Console.WriteLine();
}
In response to combining this and SRuly's answer
Enumerable.Range(1,9).ToList().ForEach(n => { Enumerable.Range(1,9).ToList().ForEach(n2 => Console.Write((n * n2).ToString("00 "))); Console.WriteLine(); });
Ruby - 42 Chars (including one linebreak, interactive command line only)
This method is two lines of input and only works in irb (because irb gives us _), but shortens the previous method by a scant 2 characters.
1..9
_.map{|y|puts"%02d "*9%_.map{|x|x*y}}
Ruby - 44 Chars (tied with perl)
(a=1..9).map{|y|puts"%02d "*9%a.map{|x|x*y}}
Ruby - 46 Chars
9.times{|y|puts"%02d "*9%(1..9).map{|x|x*y+x}}
Ruby - 47 Chars
And back to a double loop
(1..9).map{|y|puts"%02d "*9%(1..9).map{|x|x*y}}
Ruby - 54 chars!
Using a single loop saves a couple of chars!
(9..89).map{|n|print"%02d "%(n/9*(x=n%9+1))+"\n"*(x/9)}
Ruby - 56 chars
9.times{|x|puts (1..9).map{|y|"%.2d"%(y+x*y)}.join(" ")}
Haskell — 85 84 79 chars
r=[1..9]
s x=['0'|x<=9]++show x
main=mapM putStrLn[unwords[s$x*y|x<-r]|y<-r]
If double spacing is required (89 81 chars),
r=[1..9]
s x=['0'|x<=9]++show x
main=mapM putStrLn['\n':unwords[s$x*y|x<-r]|y<-r]
F# - 61 chars:
for y=1 to 9 do(for x=1 to 9 do printf"%02d "(x*y));printfn""
If you prefer a more applicative/LINQ-y solution, then in 72 chars:
[1..9]|>Seq.iter(fun y->[1..9]|>Seq.iter((*)y>>printf"%02d ");printfn"")
c# - 125, 123 chars (2 lines):
var r=Enumerable.Range(1,9).ToList();
r.ForEach(n=>{var s="";r.ForEach(m=>s+=(n*m).ToString("00 "));Console.WriteLine(s);});
C - 97 79 characters
#define f(i){int i=0;while(i++<9)
main()f(x)f(y)printf("%.2d ",x*y);puts("");}}
Perl, 44 chars
(No hope of coming anywhere near J, but languages with matrix ops are in a class of their own here...)
for$n(1..9){printf"%3d"x9 .$/,map$n*$_,1..9}
R (very similar to Matlab on this level): 12 characters.
> 1:9%*%t(1:9)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] 1 2 3 4 5 6 7 8 9
[2,] 2 4 6 8 10 12 14 16 18
[3,] 3 6 9 12 15 18 21 24 27
[4,] 4 8 12 16 20 24 28 32 36
[5,] 5 10 15 20 25 30 35 40 45
[6,] 6 12 18 24 30 36 42 48 54
[7,] 7 14 21 28 35 42 49 56 63
[8,] 8 16 24 32 40 48 56 64 72
[9,] 9 18 27 36 45 54 63 72 81
PHP, 71 chars
for($x=0;++$x<10;print"\n"){for($y=0;++$y<10;){printf("%02d ",$x*$y);}}
Output:
$ php -r 'for($x=0;++$x<10;print"\n"){for($y=0;++$y<10;){printf("%02d ",$x*$y);}}'
01 02 03 04 05 06 07 08 09
02 04 06 08 10 12 14 16 18
03 06 09 12 15 18 21 24 27
04 08 12 16 20 24 28 32 36
05 10 15 20 25 30 35 40 45
06 12 18 24 30 36 42 48 54
07 14 21 28 35 42 49 56 63
08 16 24 32 40 48 56 64 72
09 18 27 36 45 54 63 72 81
C#, 135 chars, nice and clean:
var rg = Enumerable.Range(1, 9);
foreach (var rc in from r in rg
from c in rg
select (r * c).ToString("D2") + (c == 9 ? "\n\n" : " "))
Console.Write(rc);
PostgreSQL: 81 74 chars
select array(select generate_series(1,9)*x)from generate_series(1,9)as x;
Ruby - 56 chars :D
9.times{|a|9.times{|b|print"%02d "%((a+1)*(b+1))};puts;}
C - 66 Chars
This resolves the complaint about the second parameter of main :)
main(x){for(x=8;x++<89;)printf("%.2d%c",x/9*(x%9+1),x%9<8?32:10);}
C - 77 chars
Based on dreamlax's 97 char answer. His current answer somewhat resembles this one now :)
Compiles OK with gcc, and main(x,y) is fair game for golf I reckon
#define f(i){for(i=0;i++<9;)
main(x,y)f(x)f(y)printf("%.2d ",x*y);puts("");}}
XQuery 1.0 (96 bytes)
string-join(for$x in 1 to 9 return(for$y in 1 to 9 return concat(0[$x*$y<10],$x*$y,' '),'
'),'')
Run (with XQSharp) with:
xquery table.xq !method=text
Scala - 77 59 58 chars
print(1 to 9 map(p=>1 to 9 map(q=>"%02d "format(p*q))mkString)mkString("\n"))
Sorry, I had to do this, the Scala solution by Malax was way too readable...
[Edit] For comprehension seems to be the better choice:
for(p<-1 to 9;q<-{println;1 to 9})print("%02d "format p*q)
[Edit] A much longer solution, but without multiplication, and much more obfuscated:
val s=(1 to 9).toSeq
(s:\s){(p,q)=>println(q.map("%02d "format _)mkString)
q zip(s)map(t=>t._1+t._2)}
PHP, 62 chars
for(;$x++<9;print"\n",$y=0)while($y++<9)printf("%02d ",$x*$y);
Java - 155 137 chars
Update 1: replaced string building by direct printing. Saved 18 chars.
class M{public static void main(String[]a){for(int x,y=0,z=10;++y<z;System.out.println())for(x=0;++x<z;System.out.printf("%02d ",x*y));}}
More readable format:
class M{
public static void main(String[]a){
for(int x,y=0,z=10;++y<z;System.out.println())
for(x=0;++x<z;System.out.printf("%02d ",x*y));
}
}
Another attempt using C#/Linq with GroupJoin:
Console.Write(
String.Join(
Environment.NewLine,
Enumerable.Range(1, 9)
.GroupJoin(Enumerable.Range(1, 9), y => 0, x => 0, (y, xx) => String.Join(" ", xx.Select(x => x * y)))
.ToArray()));
Ruby — 47 chars
puts (a=1..9).map{|i|a.map{|j|"%2d"%(j*i)}*" "}
Output
1 2 3 4 5 6 7 8 9
2 4 6 8 10 12 14 16 18
3 6 9 12 15 18 21 24 27
4 8 12 16 20 24 28 32 36
5 10 15 20 25 30 35 40 45
6 12 18 24 30 36 42 48 54
7 14 21 28 35 42 49 56 63
8 16 24 32 40 48 56 64 72
9 18 27 36 45 54 63 72 81
(If we ignore spacing, it becomes 39: puts (a=1..9).map{|i|a.map{|j|j*i}*" "} And anyway, I feel like there's a bit of room for improvement with the wordy map stuff.)