Parallel histogram gives wrong results - cuda

I am trying to implement a character frequency program in C using CUDA, and I have an issue with the results. I think it is something related to thread synchronization, but I need help.
Output for 1 block and 1 thread per block:
>./char_freq.exe test.txt 1 1
file size is 115
0 = 0
1 = 0
2 = 0
3 = 0
4 = 0
5 = 0
6 = 0
7 = 0
8 = 0
9 = 0
10 = 0
11 = 0
12 = 0
13 = 0
14 = 0
15 = 0
16 = 0
17 = 0
18 = 0
19 = 0
20 = 0
21 = 0
22 = 0
23 = 0
24 = 0
25 = 0
26 = 0
27 = 0
28 = 0
29 = 0
30 = 0
31 = 0
32 = 0
33 = 0
34 = 0
35 = 0
36 = 0
37 = 0
38 = 0
39 = 0
40 = 0
41 = 0
42 = 0
43 = 0
44 = 0
45 = 0
46 = 0
47 = 0
48 = 0
49 = 0
50 = 0
51 = 0
52 = 1
53 = 1
54 = 1
55 = 0
56 = 0
57 = 0
58 = 0
59 = 0
60 = 0
61 = 0
62 = 0
63 = 0
64 = 0
65 = 0
66 = 0
67 = 0
68 = 0
69 = 0
70 = 0
71 = 0
72 = 0
73 = 0
74 = 0
75 = 0
76 = 0
77 = 0
78 = 0
79 = 0
80 = 0
81 = 0
82 = 0
83 = 0
84 = 0
85 = 0
86 = 0
87 = 0
88 = 0
89 = 0
90 = 0
91 = 0
92 = 0
93 = 0
94 = 0
95 = 0
96 = 0
97 = 0
98 = 2
99 = 2
100 = 9
101 = 1
102 = 14
103 = 7
104 = 18
105 = 1
106 = 14
107 = 20
108 = 0
109 = 0
110 = 1
111 = 1
112 = 0
113 = 0
114 = 5
115 = 8
116 = 3
117 = 0
118 = 0
119 = 0
120 = 0
121 = 6
122 = 0
123 = 0
124 = 0
125 = 0
126 = 0
127 = 0
N: 128, Blocks: 1, Threads: 1
Total time (ms): 0.143
Kernel time (ms): 0.046
Data transfer time(ms): 0.097
Output for 1 block and 5 threads per block:
>./char_freq.exe test.txt 1 5
file size is 115
0 = 0
1 = 0
2 = 0
3 = 0
4 = 0
5 = 0
6 = 0
7 = 0
8 = 0
9 = 0
10 = 0
11 = 0
12 = 0
13 = 0
14 = 0
15 = 0
16 = 0
17 = 0
18 = 0
19 = 0
20 = 0
21 = 0
22 = 0
23 = 0
24 = 0
25 = 0
26 = 0
27 = 0
28 = 0
29 = 0
30 = 0
31 = 0
32 = 0
33 = 0
34 = 0
35 = 0
36 = 0
37 = 0
38 = 0
39 = 0
40 = 0
41 = 0
42 = 0
43 = 0
44 = 0
45 = 0
46 = 0
47 = 0
48 = 0
49 = 0
50 = 0
51 = 0
52 = 1
53 = 1
54 = 1
55 = 0
56 = 0
57 = 0
58 = 0
59 = 0
60 = 0
61 = 0
62 = 0
63 = 0
64 = 0
65 = 0
66 = 0
67 = 0
68 = 0
69 = 0
70 = 0
71 = 0
72 = 0
73 = 0
74 = 0
75 = 0
76 = 0
77 = 0
78 = 0
79 = 0
80 = 0
81 = 0
82 = 0
83 = 0
84 = 0
85 = 0
86 = 0
87 = 0
88 = 0
89 = 0
90 = 0
91 = 0
92 = 0
93 = 0
94 = 0
95 = 0
96 = 0
97 = 0
98 = 2
99 = 2
100 = 9
101 = 1
102 = 12
103 = 7
104 = 13
105 = 1
106 = 11
107 = 12
108 = 0
109 = 0
111 = 1
112 = 0
113 = 0
114 = 5
115 = 7
116 = 3
117 = 0
118 = 0
119 = 0
120 = 0
121 = 6
122 = 0
123 = 0
124 = 0
125 = 0
126 = 0
127 = 0
N: 128, Blocks: 1, Threads: 5
Total time (ms): 0.157
Kernel time (ms): 0.048
Data transfer time(ms): 0.109
Why are the results different?
This is my code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define N 128
#define base 0
__global__ void char_freq(char *buffer, int *freq, int slice, int extra, int total_threads){
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int start = index * slice;
    int stop = start + slice;
    int i;
    if (index == (total_threads-1))
        stop += extra;
    __shared__ int local_freq[N];
    //initialize local_freq
    if(threadIdx.x == 0){
        memset(local_freq, 0, N*sizeof(int));
    }
    __syncthreads();
    for(i=start; i<stop; i++){
        local_freq[buffer[i] - base]++;
    }
    __syncthreads();
    for(i=0; i<N; i++){
        freq[i] += local_freq[i];
    }
    __syncthreads();
}
int main(int argc, char *argv[]){
FILE *pFile;
long file_size;
char * buffer;
char * filename;
size_t result;
int j, freq[N];
int slice, extra;
int total_blocks, threads_per_block, total_threads;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 4) {
printf ("Usage : %s <file_name> <blocks> <threads_per_block>\n", argv[0]);
return 1;
}
total_blocks = strtol(argv[2], NULL, 10);
threads_per_block = strtol(argv[3], NULL, 10);
total_threads = total_blocks*threads_per_block;
filename = argv[1];
pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File error\n"); return 2;}
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ld\n", file_size);
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory error\n"); return 3;}
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading error\n"); return 4;}
char *buffer_dev;
int *freq_dev;
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&freq_dev, N*sizeof(int));
cudaMemset(freq_dev,0,N*sizeof(int));
cudaEventRecord(total_start);
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaEventRecord(comp_start);
slice = file_size / total_threads;
extra = file_size % total_threads;
char_freq<<<total_blocks, threads_per_block>>>(buffer_dev, freq_dev, slice, extra, total_threads);
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
cudaMemcpy(freq, freq_dev, N*sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
cudaFree(buffer_dev);
cudaFree(freq_dev);
for (j=0; j<N; j++){
printf("%d = %d\n", j+base, freq[j]);
}
fclose (pFile);
free (buffer);
//GPU Timing
printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", N, total_blocks, total_threads);
printf("Total time (ms): %.3f\n", total_time);
printf("Kernel time (ms): %.3f\n", comp_time);
printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);
return 0;
}

The problem is a data race, not just missing synchronization: local_freq[buffer[i] - base]++ is a plain read-modify-write executed concurrently by all threads of the block, so increments get lost, and afterwards every thread (not just one) adds the whole local_freq array into the global freq with another non-atomic +=, which races both within the block and between blocks. Using atomics for both updates fixes this. The following should work (there might be typos, I have neither compiled nor run the code):
__global__ void char_freq(char *buffer, int *freq, int buffersize) {
    __shared__ int sh_freq[N];
    // block-stride loop over shared buffer
    for (int idx = threadIdx.x; idx < N; idx += blockDim.x) {
        sh_freq[idx] = 0;
    }
    __syncthreads();
    // grid-stride loop over global buffer
    const int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    const int grid_size = blockDim.x * gridDim.x;
    for (int idx = gtid; idx < buffersize; idx += grid_size) {
        atomicAdd(&sh_freq[buffer[idx] - base], 1);
    }
    __syncthreads();
    // block-stride loop over shared buffer
    for (int idx = threadIdx.x; idx < N; idx += blockDim.x) {
        atomicAdd(&freq[idx], sh_freq[idx]);
    }
}
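Since the kernel now only takes the buffer and its length, the host-side launch simplifies accordingly; a minimal sketch reusing the variable names from your main() (slice and extra are no longer needed):

cudaMemset(freq_dev, 0, N*sizeof(int));   // the global histogram must start from zero
char_freq<<<total_blocks, threads_per_block>>>(buffer_dev, freq_dev, file_size);
cudaMemcpy(freq, freq_dev, N*sizeof(int), cudaMemcpyDeviceToHost);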
For better performance take a look at the CUDA sample as mentioned in the comments.

I finally found a solution!
I used a shared temp array per block for optimization:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define N 128
#define base 0
__global__ void char_freq(char *buffer, int *freq, int buffersize){
    __shared__ int temp[N];
    int tid;
    //initialize the block-local histogram (shared memory is not zeroed automatically)
    for(tid = threadIdx.x; tid < N; tid += blockDim.x){
        temp[tid] = 0;
    }
    __syncthreads();
    //cyclic (grid-stride) calculation of block-local frequencies
    for(tid = blockIdx.x*blockDim.x + threadIdx.x; tid < buffersize; tid += gridDim.x*blockDim.x){
        atomicAdd(&temp[buffer[tid] - base], 1); //atomic update avoids lost increments
    }
    __syncthreads();
    //reduce the block-local temp results into the global freq array
    if(threadIdx.x == 0){
        int j;
        for(j=0; j<N; j++){
            atomicAdd(&freq[j], temp[j]);
        }
    }
}
int main(int argc, char *argv[]){
FILE *pFile;
long file_size;
char * buffer;
char * filename;
size_t result;
int j, freq[N];
int slice, extra;
int total_blocks, threads_per_block, total_threads;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 4) {
printf ("Usage : %s <file_name> <blocks> <threads_per_block>\n", argv[0]);
return 1;
}
total_blocks = strtol(argv[2], NULL, 10);
threads_per_block = strtol(argv[3], NULL, 10);
total_threads = total_blocks*threads_per_block;
filename = argv[1];
pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File error\n"); return 2;}
//obtain file size
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ld\n", file_size);
//allocate memory to contain the file
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory error\n"); return 3;}
//copy file data to buffer
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading error\n"); return 4;}
//Device arrays (GPU)
char *buffer_dev;
int *freq_dev;
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&freq_dev, N*sizeof(int));
cudaMemset(freq_dev,0,N*sizeof(int));
cudaEventRecord(total_start);
//Copy data from host (CPU) to device (GPU)
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaEventRecord(comp_start);
char_freq<<<total_blocks, threads_per_block>>>(buffer_dev, freq_dev, file_size);
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
//Copy result from device (GPU) to host (CPU)
cudaMemcpy(freq, freq_dev, N*sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
cudaFree(buffer_dev);
cudaFree(freq_dev);
//Print Result
for (j=0; j<N; j++){
printf("%d = %d\n", j+base, freq[j]);
}
fclose (pFile);
free (buffer);
//GPU Timing
printf("\n\n\nN: %d, Blocks: %d, Threads: %d\n", N, total_blocks, total_threads);
printf("Total time (ms): %.3f\n", total_time);
printf("Kernel time (ms): %.3f\n", comp_time);
printf("Data transfer time(ms): %.3f\n\n\n", total_time-comp_time);
return 0;
}

Related

Nested loops modulo permutation in cuda

I need to perform a function on triplets taken from an array and add the result to a Histogram, but I want to avoid permutations since the function is invariant under those [F(i,j,k) = F(j,i,k) and so on].
Normally I would code something like this:
def F(int i, int j, int k){
int temp_index;
/* Do something */
return temp_index;
}
for(int i=0;i<N;i++){
for(int j=i+1;j<N;j++){
for(int k=j+1;k<N;k++){
hist[F(i,j,k)]++;
}
}
}
As N is quite big (approx. 10^5), I would like to perform this on a GPU using CUDA.
I have written code to call this function on the GPU, but I have no idea how to prevent multiple calls with the same triple of indices. So far I launch CUDA with a 3-dimensional grid, like:
__global__ void compute_3pcf(float *theta, float *hist) {
int i,j,k;
i = blockIdx.x*blockDim.x + threadIdx.x;
j = blockIdx.y*blockDim.y + threadIdx.y;
k = blockIdx.z*blockDim.z + threadIdx.z;
if(i>=j || j>=k) return;
atomicAdd(&hist[F(i,j,k)],1);
}
int main(){
/*
Allocation of memory and cudaMemcpy
*/
dim3 grid((N+15)/16,(N+7)/8,(N+7)/8);
dim3 block(16,8,8);
//Launch on GPU
compute_3pcf<<<grid,block>>>(d_theta, d_hist);
}
However, now for each combination (i,j,k) a new thread is launched and then aborted, which seems very inefficient to me, as then only 1/6 of the threads perform the actual computation. What I would like to have is something like this:
__global__ void compute_3pcf(float *theta, float *hist) {
int i,j,k,idx;
idx = blockIdx.x*blockDim.x + threadIdx.x;
i = H_i(idx);
j = H_j(idx,i);
k = H_k(idx,j);
atomicAdd(&hist[F(i,j,k)],1);
}
int main(){
/*
Allocation of memory and cudaMemcpy
*/
long long int N_combinations = N*(N-1)*(N-2)/6;
long int grid = (N_combinations+1023)/1024;
int block = 1024;
//Launch on GPU
compute_3pcf<<<grid,block>>>(d_theta, d_hist);
}
However, I am unable to find the functions H_i, H_j, H_k. If anyone can tell me how I could solve or avoid this problem, I would be very thankful.
Edit: The histogram contains about 10^6 bins, so I cannot keep one histogram per block in shared memory, as in the CUDA example code. Instead, it lies in the global memory of the GPU.
[Disclaimer -- this is only a partial answer and a work in progress and answers a related problem, while only hinting at a solution to the actual question]
Before thinking about algorithms and code it is useful to understand the mathematical character of your problem. If we look at the output of your pseudocode in Python (and note that this includes the diagonal entries where the original question does not), we see this for the 5x5x5 case:
N = 5
x0 = np.zeros((N,N,N), dtype=np.int)
idx = 1
for i in range(0,N):
for j in range(i,N):
for k in range(j,N):
x0[i,j,k] = idx
idx += 1
print(x0)
we get:
[[[ 1 2 3 4 5]
[ 0 6 7 8 9]
[ 0 0 10 11 12]
[ 0 0 0 13 14]
[ 0 0 0 0 15]]
[[ 0 0 0 0 0]
[ 0 16 17 18 19]
[ 0 0 20 21 22]
[ 0 0 0 23 24]
[ 0 0 0 0 25]]
[[ 0 0 0 0 0]
[ 0 0 0 0 0]
[ 0 0 26 27 28]
[ 0 0 0 29 30]
[ 0 0 0 0 31]]
[[ 0 0 0 0 0]
[ 0 0 0 0 0]
[ 0 0 0 0 0]
[ 0 0 0 32 33]
[ 0 0 0 0 34]]
[[ 0 0 0 0 0]
[ 0 0 0 0 0]
[ 0 0 0 0 0]
[ 0 0 0 0 0]
[ 0 0 0 0 35]]]
i.e. the unique entries form a series of stacked upper triangular matrices of decreasing sizes. As identified in the comments, the number of non-zero entries is a tetrahedral number: in this case, for n = 5, the tetrahedral number Tr[5] = 5*(5+1)*(5+2)/6 = 35 entries, and the non-zero entries fill a tetrahedron-shaped region of the hypermatrix in three dimensions (best illustration here). And as noted in the original question, all the permutations of indices are functionally identical in the problem, meaning that there are six (3P3) functionally identical symmetric tetrahedral regions in the cubic hypermatrix. You can confirm this yourself:
x1 = np.zeros((N,N,N), dtype=np.int)
idx = 1
for i in range(0,N):
for j in range(0,N):
for k in range(0,N):
if (i <= j) and (j <= k):
x1[i,j,k] = idx
x1[i,k,j] = idx
x1[j,i,k] = idx
x1[j,k,i] = idx
x1[k,i,j] = idx
x1[k,j,i] = idx
idx += 1
print(x1)
which gives:
[[[ 1 2 3 4 5]
[ 2 6 7 8 9]
[ 3 7 10 11 12]
[ 4 8 11 13 14]
[ 5 9 12 14 15]]
[[ 2 6 7 8 9]
[ 6 16 17 18 19]
[ 7 17 20 21 22]
[ 8 18 21 23 24]
[ 9 19 22 24 25]]
[[ 3 7 10 11 12]
[ 7 17 20 21 22]
[10 20 26 27 28]
[11 21 27 29 30]
[12 22 28 30 31]]
[[ 4 8 11 13 14]
[ 8 18 21 23 24]
[11 21 27 29 30]
[13 23 29 32 33]
[14 24 30 33 34]]
[[ 5 9 12 14 15]
[ 9 19 22 24 25]
[12 22 28 30 31]
[14 24 30 33 34]
[15 25 31 34 35]]]
Here it should be obvious that you can slice the hypermatrix along any plane and get a symmetric matrix, and that it can be constructed by a set of reflections from any of the six permutations of the same basic tetrahedral hypermatrix.
That last part is important because I am now going to focus on another permutation from the one in your question. It is functionally the same (as shown above) but mathematically and graphically easier to visualize compared to the upper tetrahedron calculated by the original pseudocode in the question. Again some Python:
N = 5
nmax = N * (N+1) * (N+2) // 6
x= np.empty(nmax, dtype=object)
x2 = np.zeros((N,N,N), dtype=np.int)
idx = 1
for i in range(0,N):
for j in range(0,i+1):
for k in range(0,j+1):
x2[i,j,k] = idx
x[idx-1] = (i,j,k)
idx +=1
print(x)
print(x2)
which produces
[(0, 0, 0) (1, 0, 0) (1, 1, 0) (1, 1, 1) (2, 0, 0) (2, 1, 0) (2, 1, 1)
(2, 2, 0) (2, 2, 1) (2, 2, 2) (3, 0, 0) (3, 1, 0) (3, 1, 1) (3, 2, 0)
(3, 2, 1) (3, 2, 2) (3, 3, 0) (3, 3, 1) (3, 3, 2) (3, 3, 3) (4, 0, 0)
(4, 1, 0) (4, 1, 1) (4, 2, 0) (4, 2, 1) (4, 2, 2) (4, 3, 0) (4, 3, 1)
(4, 3, 2) (4, 3, 3) (4, 4, 0) (4, 4, 1) (4, 4, 2) (4, 4, 3) (4, 4, 4)]
[[[ 1 0 0 0 0]
[ 0 0 0 0 0]
[ 0 0 0 0 0]
[ 0 0 0 0 0]
[ 0 0 0 0 0]]
[[ 2 0 0 0 0]
[ 3 4 0 0 0]
[ 0 0 0 0 0]
[ 0 0 0 0 0]
[ 0 0 0 0 0]]
[[ 5 0 0 0 0]
[ 6 7 0 0 0]
[ 8 9 10 0 0]
[ 0 0 0 0 0]
[ 0 0 0 0 0]]
[[11 0 0 0 0]
[12 13 0 0 0]
[14 15 16 0 0]
[17 18 19 20 0]
[ 0 0 0 0 0]]
[[21 0 0 0 0]
[22 23 0 0 0]
[24 25 26 0 0]
[27 28 29 30 0]
[31 32 33 34 35]]]
You can see it is a transformation of the original code, with each "layer" of the tetrahedron built from a lower triangular matrix of increasing size, rather than upper triangular matrices of successively smaller size.
When you look at the tetrahedron produced by this permutation, it should be obvious that each lower triangular slice starts at a tetrahedral number within the linear array of indices, and each row within the lower triangular matrix starts at a triangular-number offset relative to the start of the matrix. The indexing scheme is, therefore:
idx(i,j,k) = (i*(i+1)*(i+2)/6) + (j*(j+1)/2) + k
when data is arranged so that the kth dimension is the fastest varying in memory, and ith the slowest.
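As a quick sanity check, that forward mapping can be verified against the same triple-nested loop as the Python snippet above; a small host-only sketch (my own, not part of the original answer):

#include <cassert>
#include <cstdio>

// forward index for the ordering 0 <= k <= j <= i < N described above
static size_t tet_idx(size_t i, size_t j, size_t k)
{
    return i*(i+1)*(i+2)/6 + j*(j+1)/2 + k;
}

int main()
{
    const size_t N = 5;
    size_t idx = 0;
    for (size_t i = 0; i < N; i++)
        for (size_t j = 0; j <= i; j++)
            for (size_t k = 0; k <= j; k++) {
                assert(tet_idx(i, j, k) == idx);   // formula matches the loop order
                idx++;
            }
    printf("checked %zu tuples, Tr[%zu] = %zu\n", idx, N, N*(N+1)*(N+2)/6);
    return 0;
}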
Now to the actual question. To calculate (i,j,k) from a given idx value would require calculating the integer cube root for i and the integer square root for j, which isn't particularly easy or performant and I would not imagine that it would offer any advantage over what you have now. However, if your implementation has a finite and known dimension a priori, you can use precalculated tetrahedral and triangular numbers and perform a lookup to replace the need to calculate roots.
A toy example:
#include <cstdio>
__constant__ unsigned int tetdata[100] =
{ 0, 1, 4, 10, 20, 35, 56, 84, 120, 165, 220, 286, 364, 455, 560, 680, 816, 969, 1140,
1330, 1540, 1771, 2024, 2300, 2600, 2925, 3276, 3654, 4060, 4495, 4960, 5456, 5984,
6545, 7140, 7770, 8436, 9139, 9880, 10660, 11480, 12341, 13244, 14190, 15180, 16215,
17296, 18424, 19600, 20825, 22100, 23426, 24804, 26235, 27720, 29260, 30856, 32509,
34220, 35990, 37820, 39711, 41664, 43680, 45760, 47905, 50116, 52394, 54740, 57155,
59640, 62196, 64824, 67525, 70300, 73150, 76076, 79079, 82160, 85320, 88560, 91881,
95284, 98770, 102340, 105995, 109736, 113564, 117480, 121485, 125580, 129766, 134044,
138415, 142880, 147440, 152096, 156849, 161700, 166650 };
__constant__ unsigned int tridata[100] =
{ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, 120,
136, 153, 171, 190, 210, 231, 253, 276, 300, 325, 351, 378, 406,
435, 465, 496, 528, 561, 595, 630, 666, 703, 741, 780, 820, 861,
903, 946, 990, 1035, 1081, 1128, 1176, 1225, 1275, 1326, 1378, 1431,
1485, 1540, 1596, 1653, 1711, 1770, 1830, 1891, 1953, 2016, 2080, 2145,
2211, 2278, 2346, 2415, 2485, 2556, 2628, 2701, 2775, 2850, 2926, 3003,
3081, 3160, 3240, 3321, 3403, 3486, 3570, 3655, 3741, 3828, 3916, 4005,
4095, 4186, 4278, 4371, 4465, 4560, 4656, 4753, 4851, 4950 };
__device__ unsigned int lookup(unsigned int&x, unsigned int n, const unsigned int* data)
{
int i=0;
while (n >= data[i]) i++;
x = data[i-1];
return i-1;
}
__device__ unsigned int tetnumber(unsigned int& x, unsigned int n) { return lookup(x, n, tetdata); }
__device__ unsigned int trinumber(unsigned int& x, unsigned int n) { return lookup(x, n, tridata); }
__global__ void kernel()
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int x;
unsigned int k = idx;
unsigned int i = tetnumber(x, k); k -= x;
unsigned int j = trinumber(x, k); k -= x;
printf("idx = %d, i=%d j=%d k=%d\n", idx, i, j, k);
}
int main(void)
{
cudaSetDevice(0);
kernel<<<1,35>>>();
cudaDeviceSynchronize();
cudaDeviceReset();
return 0;
}
which does the same thing as the Python (note the out-of-order print output):
$ nvcc -o tetrahedral tetrahedral.cu
avidday#marteil2:~/SO$ cuda-memcheck ./tetrahedral
========= CUDA-MEMCHECK
idx = 32, i=4 j=4 k=2
idx = 33, i=4 j=4 k=3
idx = 34, i=4 j=4 k=4
idx = 0, i=0 j=0 k=0
idx = 1, i=1 j=0 k=0
idx = 2, i=1 j=1 k=0
idx = 3, i=1 j=1 k=1
idx = 4, i=2 j=0 k=0
idx = 5, i=2 j=1 k=0
idx = 6, i=2 j=1 k=1
idx = 7, i=2 j=2 k=0
idx = 8, i=2 j=2 k=1
idx = 9, i=2 j=2 k=2
idx = 10, i=3 j=0 k=0
idx = 11, i=3 j=1 k=0
idx = 12, i=3 j=1 k=1
idx = 13, i=3 j=2 k=0
idx = 14, i=3 j=2 k=1
idx = 15, i=3 j=2 k=2
idx = 16, i=3 j=3 k=0
idx = 17, i=3 j=3 k=1
idx = 18, i=3 j=3 k=2
idx = 19, i=3 j=3 k=3
idx = 20, i=4 j=0 k=0
idx = 21, i=4 j=1 k=0
idx = 22, i=4 j=1 k=1
idx = 23, i=4 j=2 k=0
idx = 24, i=4 j=2 k=1
idx = 25, i=4 j=2 k=2
idx = 26, i=4 j=3 k=0
idx = 27, i=4 j=3 k=1
idx = 28, i=4 j=3 k=2
idx = 29, i=4 j=3 k=3
idx = 30, i=4 j=4 k=0
idx = 31, i=4 j=4 k=1
========= ERROR SUMMARY: 0 errors
Obviously the lookup function is only for demonstration purposes. At large sizes, either a binary search or a hash-based look-up would be much faster. But this at least demonstrates that it seems possible to do what you envisaged, even if the problem solved and the approach are subtly different from what you probably had in mind.
Note I have no formal mathematical proofs for anything in this answer and don't claim that any of the code or propositions here are correct. Buyer beware.
After some more thought, it is trivial to extend this approach via a hybrid search/calculation routine which is reasonably efficient:
#include <iostream>
#include <vector>
#include <cstdio>
typedef unsigned int uint;
__device__ __host__ ulong tetnum(uint n) { ulong n1(n); return n1 * (n1 + 1ull) * (n1 + 2ull) / 6ull; }
__device__ __host__ ulong trinum(uint n) { ulong n1(n); return n1 * (n1 + 1ull) / 2ull; }
typedef ulong (*Functor)(uint);
template<Functor F>
__device__ __host__ uint bounded(ulong& y, ulong x, uint n1=0, ulong y1=0)
{
uint n = n1;
y = y1;
while (x >= y1) {
y = y1;
n = n1++;
y1 = F(n1);
}
return n;
}
__constant__ uint idxvals[19] = {
0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384,
32768, 65536, 131072 };
__constant__ ulong tetvals[19] = {
0, 1, 4, 20, 120, 816, 5984, 45760, 357760, 2829056, 22500864, 179481600, 1433753600,
11461636096, 91659526144, 733141975040, 5864598896640, 46914643623936, 375308558925824 };
__constant__ ulong trivals[19] = {
0, 1, 3, 10, 36, 136, 528, 2080, 8256, 32896, 131328, 524800, 2098176, 8390656, 33558528,
134225920, 536887296, 2147516416, 8590000128 };
__device__ __host__ uint lookup(ulong& x, uint n, const uint* abscissa, const ulong* data)
{
uint i=0;
while (n >= data[i]) i++;
x = data[i-1];
return abscissa[i-1];
}
__device__ uint tetnumber(ulong& x, uint n)
{
ulong x0;
uint n0 = lookup(x0, n, idxvals, tetvals);
return bounded<tetnum>(x, n, n0, x0);
}
__device__ uint trinumber(ulong& x, uint n)
{
ulong x0;
uint n0 = lookup(x0, n, idxvals, trivals);
return bounded<trinum>(x, n, n0, x0);
}
__global__ void kernel(uint3 *results, ulong Nmax)
{
ulong idx = threadIdx.x + blockIdx.x * blockDim.x;
ulong gridStride = blockDim.x * gridDim.x;
for(; idx < Nmax; idx += gridStride) {
ulong x, k1 = idx;
uint3 tuple;
tuple.x = tetnumber(x, k1); k1 -= x;
tuple.y = trinumber(x, k1); k1 -= x;
tuple.z = (uint)k1;
results[idx] = tuple;
}
}
int main(void)
{
cudaSetDevice(0);
uint N = 500;
ulong Nmax = tetnum(N);
uint3* results_d; cudaMalloc(&results_d, Nmax * sizeof(uint3));
int gridsize, blocksize;
cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, kernel);
kernel<<<gridsize, blocksize>>>(results_d, Nmax);
cudaDeviceSynchronize();
std::vector<uint3> results(Nmax);
cudaMemcpy(&results[0], results_d, Nmax * sizeof(uint3), cudaMemcpyDeviceToHost);
cudaDeviceReset();
// Only uncomment this if you want to see 22 million lines of output
//for(auto const& idx : results) {
// std::cout << idx.x << " " << idx.y << " " << idx.z << std::endl;
//}
return 0;
}
which does this (be aware it will emit 21 million lines of output if you uncomment the last loop):
$ module load use.own cuda9.2
$ nvcc -std=c++11 -arch=sm_52 -o tetrahedral tetrahedral.cu
$ nvprof ./tetrahedral
==20673== NVPROF is profiling process 20673, command: ./tetrahedral
==20673== Profiling application: ./tetrahedral
==20673== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 78.85% 154.23ms 1 154.23ms 154.23ms 154.23ms kernel(uint3*, unsigned long)
21.15% 41.361ms 1 41.361ms 41.361ms 41.361ms [CUDA memcpy DtoH]
API calls: 41.73% 154.24ms 1 154.24ms 154.24ms 154.24ms cudaDeviceSynchronize
30.90% 114.22ms 1 114.22ms 114.22ms 114.22ms cudaMalloc
15.94% 58.903ms 1 58.903ms 58.903ms 58.903ms cudaDeviceReset
11.26% 41.604ms 1 41.604ms 41.604ms 41.604ms cudaMemcpy
0.11% 412.75us 96 4.2990us 275ns 177.45us cuDeviceGetAttribute
0.04% 129.46us 1 129.46us 129.46us 129.46us cuDeviceTotalMem
0.02% 55.616us 1 55.616us 55.616us 55.616us cuDeviceGetName
0.01% 32.919us 1 32.919us 32.919us 32.919us cudaLaunchKernel
0.00% 10.211us 1 10.211us 10.211us 10.211us cudaSetDevice
0.00% 5.7640us 1 5.7640us 5.7640us 5.7640us cudaFuncGetAttributes
0.00% 4.6690us 1 4.6690us 4.6690us 4.6690us cuDeviceGetPCIBusId
0.00% 2.8580us 4 714ns 393ns 1.3680us cudaDeviceGetAttribute
0.00% 2.8050us 3 935ns 371ns 2.0030us cuDeviceGetCount
0.00% 2.2780us 1 2.2780us 2.2780us 2.2780us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00% 1.6720us 1 1.6720us 1.6720us 1.6720us cudaGetDevice
0.00% 1.5450us 2 772ns 322ns 1.2230us cuDeviceGet
That code calculates and stores the unique (i,j,k) triples for a 500 x 500 x 500 search space (about 21 million values) in 150 milliseconds on my GTX970. Perhaps that is of some use to you.
One possible approach is given on this Wikipedia page ("Finding the k-combination for a given number") for a closed-form solution to convert a linear index into a unique C(n,3) combination.
However it will involve calculating square roots and cube roots, so it's "non-trivial". My rationale for even mentioning it is two-fold:
If the amount of work to be saved per-thread is substantial, then the additional burden this method proposes may be offset by that. However, for the example given, the amount of work per thread saved is just a few simple if-tests.
Processor trends are such that computation cost is dropping more rapidly than e.g. memory access cost. Since this approach involves no memory access, if future processor trends continue in this vein, this approach may become more palatable.
This approach is also distinguished by the fact that there is no iterative exhaustive table searching. However as indicated in the other answer, for the stipulations given there, it is almost certainly preferable to this approach, currently.
As indicated on the previously mentioned wiki page, the general approach will be to:
Find the largest C(n,3) number that is less than the current index (N). The n value associated with this C(n,3) number becomes the ordinal value of our first "choice" index n1.
Subtract the C(n,3) number from the current index. The process is repeated with the remainder and C(n,2). The n value associated with the maximum C(n,2) number that fits within our remainder becomes our second "choice" index n2.
The remainder is found from step 2, and this then identifies our final C(n,1) choice (C(n,1) = n = n3).
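Before looking for a closed form, these three steps can be written directly as a (slow, linear-search) reference implementation, which is handy for checking the closed-form code further down. A minimal host-side sketch; the helper names are my own:

#include <cstdio>

// C(n,3) and C(n,2) as 64-bit values
static unsigned long long c3(unsigned long long n) { return n*(n-1)*(n-2)/6; }
static unsigned long long c2(unsigned long long n) { return n*(n-1)/2; }

// decompose a linear index N into the unique triple n1 > n2 > n3 >= 0
// with N = C(n1,3) + C(n2,2) + C(n3,1)
static void decompose(unsigned long long N,
                      unsigned int &n1, unsigned int &n2, unsigned int &n3)
{
    n1 = 2; while (c3(n1 + 1) <= N) n1++;   // step 1: largest n1 with C(n1,3) <= N
    N -= c3(n1);
    n2 = 1; while (c2(n2 + 1) <= N) n2++;   // step 2: largest n2 with C(n2,2) <= N
    N -= c2(n2);
    n3 = (unsigned int)N;                   // step 3: C(n3,1) = n3
}

int main()
{
    // the first few indices reproduce the (n1,n2,n3) ordering used in the worked example below
    for (unsigned long long idx = 0; idx < 10; idx++) {
        unsigned int i, j, k;
        decompose(idx, i, j, k);
        printf("idx=%llu -> (%u,%u,%u)\n", idx, i, j, k);
    }
    return 0;
}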
In order to come up with a closed-form solution to step 1, we must:
identify the cubic equation associated with the relationship between N and C(N,3)
use the solution of the cubic polynomial to identify N (in floating point)
truncate the value N, to get our "largest" N
perform an integer search around this point, for the correct solution, to address floating-point issues
A similar process can be repeated for step 2 (quadratic) and step 3 (linear).
I don't intend to cover all the math in particular detail, however the solution of a cubic polynomial equation in closed form can be readily found on the web (such as here) and the derivation of the governing cubic equation for step 1 is straightforward. We simply use the formula for the total number of choices already given in the question, coupled with the particular thread index:
n(n-1)(n-2)/6 = N -> n(n-1)(n-2)/6 - N = 0
rearranging:
(n^3)/6 - (n^2)/2 + n/3 - N = 0
from this we can acquire the a,b,c,d coefficients to feed into our cubic solution method.
a = 1/6, b = -1/2, c = 1/3, d = -N
(Note that N here is effectively our globally unique 1D thread index. We are solving for n, which gives us our first "choice" index.)
Studying the formula for the solution of the cubic, we note that the only item that varies among threads is the d coefficient. This allows for reduction of some arithmetic at run-time.
What follows then is a worked example. It is not thoroughly tested, as my aim here is to identify a solution method, not a fully tested solution:
$ cat t1485.cu
#include <stdio.h>
#include <math.h>
typedef float ct;
const int STEP_DOWN = 2;
// only float or double template types allowed
template <typename ft>
struct CN3{
__host__ __device__
int3 operator()(size_t N){
int3 n;
if (N == 0) {n.x = 2; n.y = 1; n.z = 0; return n;}
if (N == 1) {n.x = 3; n.y = 1; n.z = 0; return n;}
if (N == 2) {n.x = 3; n.y = 2; n.z = 0; return n;}
if (N == 3) {n.x = 3; n.y = 2; n.z = 1; return n;}
if (N == 4) {n.x = 4; n.y = 1; n.z = 0; return n;}
ft x, x1;
// identify n.x from cubic
// compiler computed
const ft a = 1.0/6;
const ft b = -1.0/2;
const ft c = 1.0/3;
const ft p1 = (-1.0)*(b*b*b)/(27.0*a*a*a) + b*c/(6.0*a*a);
const ft p2 = c/(3.0*a) - (b*b)/(9.0*a*a);
const ft p3 = p2*p2*p2;
const ft p4 = b/(3.0*a);
// run-time computed
//const ft d = -N;
const ft q0 = N/(2.0*a); // really should adjust constant for float vs. double
const ft q1 = p1 + q0;
const ft q2 = q1*q1;
if (sizeof(ft)==4){
x1 = sqrtf(q2+p3);
x = cbrtf(q1+x1) + cbrtf(q1-x1) - p4;
n.x = truncf(x);}
else {
x1 = sqrt(q2+p3);
x = cbrt(q1+x1) + cbrt(q1-x1) - p4;
n.x = trunc(x);}
/// fix floating-point errors
size_t tn = n.x - STEP_DOWN;
while ((tn)*(tn-1)*(tn-2)/6 <= N) tn++;
n.x = tn-1;
// identify n.y from quadratic
// compiler computed
const ft qa = 1.0/2;
//const ft qb = -qa;
const ft p5 = 1.0/4;
const ft p6 = 2.0;
// run-time computed
N = N - (((size_t)n.x)*(n.x-1)*(n.x-2))/6;
if (sizeof(ft)==4){
x = qa + sqrtf(p5+p6*N);
n.y = truncf(x);}
else {
x = qa + sqrt(p5+p6*N);
n.y = trunc(x);}
/// fix floating-point errors
if ((n.y - STEP_DOWN) <= 0) tn = 0;
else tn = n.y - STEP_DOWN;
while ((((tn)*(tn-1))>>1) <= N) tn++;
n.y = tn-1;
// identify n3
n.z = N - ((((size_t)n.y)*(n.y-1))>>1);
return n;
}
};
template <typename T>
__global__ void test(T f, size_t maxn, int3 *res){
size_t idx = threadIdx.x+((size_t)blockDim.x)*blockIdx.x;
if (idx < maxn)
res[idx] = f(idx);
}
int3 get_next_C3(int3 prev){
int3 res = prev;
res.z++;
if (res.z >= res.y){
res.y++; res.z = 0;
if (res.y >= res.x){res.x++; res.y = 1; res.z = 0;}}
return res;
}
int main(int argc, char* argv[]){
size_t n = 1000000000;
if (argc > 1) n *= atoi(argv[1]);
const int nTPB = 256;
int3 *d_res;
cudaMalloc(&d_res, n*sizeof(int3));
test<<<(n+nTPB-1)/nTPB,nTPB>>>(CN3<ct>(), n, d_res);
int3 *h_gpu = new int3[n];
int3 temp;
temp.x = 2; temp.y = 1; temp.z = 0;
cudaMemcpy(h_gpu, d_res, n*sizeof(int3), cudaMemcpyDeviceToHost);
for (int i = 0; i < n; i++){
if ((temp.x != h_gpu[i].x) || (temp.y != h_gpu[i].y) || (temp.z != h_gpu[i].z))
{printf("mismatch at index %d: cpu: %d,%d,%d gpu: %d,%d,%d\n", i, temp.x,temp.y,temp.z, h_gpu[i].x, h_gpu[i].y, h_gpu[i].z); return 0;}
temp = get_next_C3(temp);}
}
$ nvcc -arch=sm_70 -o t1485 t1485.cu
$ cuda-memcheck ./t1485 2
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
[user2#dc10 misc]$ nvprof ./t1485
==6128== NVPROF is profiling process 6128, command: ./t1485
==6128== Profiling application: ./t1485
==6128== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 99.35% 4.81251s 1 4.81251s 4.81251s 4.81251s [CUDA memcpy DtoH]
0.65% 31.507ms 1 31.507ms 31.507ms 31.507ms void test<CN3<float>>(float, int, int3*)
API calls: 93.70% 4.84430s 1 4.84430s 4.84430s 4.84430s cudaMemcpy
6.09% 314.89ms 1 314.89ms 314.89ms 314.89ms cudaMalloc
0.11% 5.4296ms 4 1.3574ms 691.18us 3.3429ms cuDeviceTotalMem
0.10% 4.9644ms 388 12.794us 317ns 535.35us cuDeviceGetAttribute
0.01% 454.66us 4 113.66us 103.24us 134.26us cuDeviceGetName
0.00% 65.032us 1 65.032us 65.032us 65.032us cudaLaunchKernel
0.00% 24.906us 4 6.2260us 3.2890us 10.160us cuDeviceGetPCIBusId
0.00% 8.2490us 8 1.0310us 533ns 1.5980us cuDeviceGet
0.00% 5.9930us 3 1.9970us 381ns 3.8870us cuDeviceGetCount
0.00% 2.8160us 4 704ns 600ns 880ns cuDeviceGetUuid
$
Notes:
as indicated above I have tested it for accuracy up through the first 2 billion results
The implementation above accounts for the fact that the solution of the cubic and quadratic equations in floating point introduces errors. These errors are "fixed" by creating a local integer search around the starting point given by the floating-point calculations, to produce the correct answer.
As indicated, the kernel above runs in ~30ms on my Tesla V100 for 1 billion results (10^9). If the methodology could correctly scale to 10^15 results, I have no reason to assume it would not take at least 0.03*10^6 seconds, or over 8 hours(!)
I haven't run the test, but I suspect that a quick benchmark of the simple case proposed in the question of simply generating the full domain (10^15) and then throwing away the ~5/6 of the space that did not apply, would be quicker.
Out of curiosity, I created an alternate test case that tests 31 out of each 32 values, across a larger space.
Here is the code and test:
$ cat t1485.cu
#include <stdio.h>
#include <math.h>
typedef float ct;
const int nTPB = 1024;
const int STEP_DOWN = 2;
// only float or double template types allowed
template <typename ft>
struct CN3{
__host__ __device__
int3 operator()(size_t N){
int3 n;
if (N == 0) {n.x = 2; n.y = 1; n.z = 0; return n;}
if (N == 1) {n.x = 3; n.y = 1; n.z = 0; return n;}
if (N == 2) {n.x = 3; n.y = 2; n.z = 0; return n;}
if (N == 3) {n.x = 3; n.y = 2; n.z = 1; return n;}
if (N == 4) {n.x = 4; n.y = 1; n.z = 0; return n;}
ft x, x1;
// identify n.x from cubic
// compiler computed
const ft a = 1.0/6;
const ft b = -1.0/2;
const ft c = 1.0/3;
const ft p1 = (-1.0)*(b*b*b)/(27.0*a*a*a) + b*c/(6.0*a*a);
const ft p2 = c/(3.0*a) - (b*b)/(9.0*a*a);
const ft p3 = p2*p2*p2;
const ft p4 = b/(3.0*a);
// run-time computed
//const ft d = -N;
const ft q0 = N/(2.0*a); // really should adjust constant for float vs. double
const ft q1 = p1 + q0;
const ft q2 = q1*q1;
if (sizeof(ft)==4){
x1 = sqrtf(q2+p3);
x = cbrtf(q1+x1) + cbrtf(q1-x1) - p4;
n.x = truncf(x);}
else {
x1 = sqrt(q2+p3);
x = cbrt(q1+x1) + cbrt(q1-x1) - p4;
n.x = trunc(x);}
/// fix floating-point errors
size_t tn = n.x - STEP_DOWN;
while ((tn)*(tn-1)*(tn-2)/6 <= N) tn++;
n.x = tn-1;
// identify n.y from quadratic
// compiler computed
const ft qa = 1.0/2;
//const ft qb = -qa;
const ft p5 = 1.0/4;
const ft p6 = 2.0;
// run-time computed
N = N - (((size_t)n.x)*(n.x-1)*(n.x-2))/6;
if (sizeof(ft)==4){
x = qa + sqrtf(p5+p6*N);
n.y = truncf(x);}
else {
x = qa + sqrt(p5+p6*N);
n.y = trunc(x);}
/// fix floating-point errors
if ((n.y - STEP_DOWN) <= 0) tn = 0;
else tn = n.y - STEP_DOWN;
while ((((tn)*(tn-1))>>1) <= N) tn++;
n.y = tn-1;
// identify n3
n.z = N - ((((size_t)n.y)*(n.y-1))>>1);
return n;
}
};
__host__ __device__
int3 get_next_C3(int3 prev){
int3 res = prev;
res.z++;
if (res.z >= res.y){
res.y++; res.z = 0;
if (res.y >= res.x){res.x++; res.y = 1; res.z = 0;}}
return res;
}
template <typename T>
__global__ void test(T f){
size_t idx = threadIdx.x+((size_t)blockDim.x)*blockIdx.x;
size_t idy = threadIdx.y+((size_t)blockDim.y)*blockIdx.y;
size_t id = idx + idy*gridDim.x*blockDim.x;
int3 temp = f(id);
int3 temp2;
temp2.x = __shfl_up_sync(0xFFFFFFFF, temp.x, 1);
temp2.y = __shfl_up_sync(0xFFFFFFFF, temp.y, 1);
temp2.z = __shfl_up_sync(0xFFFFFFFF, temp.z, 1);
temp2 = get_next_C3(temp2);
if ((threadIdx.x & 31) != 0)
if ((temp.x != temp2.x) || (temp.y != temp2.y) || (temp.z != temp2.z)) printf("%lu,%d,%d,%d,%d,%d,%d\n", id, temp.x, temp.y, temp.z, temp2.x, temp2.y, temp2.z);
}
int main(int argc, char* argv[]){
const size_t nbx = 200000000ULL;
const int nby = 100;
dim3 block(nbx, nby, 1);
test<<<block,nTPB>>>(CN3<ct>());
cudaDeviceSynchronize();
cudaError_t e = cudaGetLastError();
if (e != cudaSuccess) {printf("CUDA error %s\n", cudaGetErrorString(e)); return 0;}
printf("tested space of size: %lu\n", nbx*nby*nTPB);
}
$ nvcc -arch=sm_70 -o t1485 t1485.cu
$ time ./t1485
tested space of size: 20480000000000
real 25m18.133s
user 18m4.804s
sys 7m12.782s
Here we see that the Tesla V100 took about 30 minutes to accuracy test a space of 20480000000000 results (about 2 * 10^13).

thread work if previously thread finished work (cuda) in same block

Hello, I am a beginner in CUDA programming. I use the lock.lock() function to wait for the previous thread to finish its work. This is my code:
#include "book.h"
#include <cuda.h>
#include <conio.h>
#include <iostream>
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#include <math.h>
#include <fstream>
#include <string>
#include <curand.h>
#include <curand_kernel.h>
#include "lock.h"
#define pop 10
#define gen 10
#define pg pop*gen
using namespace std;
__global__ void hold(Lock lock,float* a )
{
__shared__ int cache[gen];
int tid=blockIdx.x * blockDim.x+threadIdx.x;
int cacheIndex = threadIdx.x;
if(tid<gen)
{
a[tid]=7;//this number is an example, but in my case it is a random number
}
else
{
//cache[cacheIndex]=a[tid];
int temp;
if(tid%gen==0)
{
a[tid]=tid+4;//this is an example number, but in my case it is a random number if tid==tid%gen
temp=a[tid];
tid+=blockIdx.x*gridDim.x;
}
else
{
__syncthreads();
a[tid]=temp+1;//this must a[tid]=a[tid-1]+1;
temp=a[tid];
tid+=blockIdx.x*gridDim.x;
}
cache[cacheIndex]=temp;
__syncthreads();
for (int i=0;i<gen;i++)
{
if(cacheIndex==i)
{
lock. lock();
cache[cacheIndex]=temp;
lock.unlock();
}
}
}
}
int main()
{
float time;
float* a=new float [pg];
float *dev_a;
HANDLE_ERROR( cudaMalloc( (void**)&dev_a,pg *sizeof(int) ) );
Lock lock;
cudaEvent_t start, stop;
HANDLE_ERROR( cudaEventCreate(&start) );
HANDLE_ERROR( cudaEventCreate(&stop) );
HANDLE_ERROR( cudaEventRecord(start, 0) );
hold<<<pop,gen>>>(lock,dev_a);
HANDLE_ERROR( cudaMemcpy( a, dev_a,pg * sizeof(float),cudaMemcpyDeviceToHost ) );
HANDLE_ERROR( cudaEventRecord(stop, 0) );
HANDLE_ERROR( cudaEventSynchronize(stop) );
HANDLE_ERROR( cudaEventElapsedTime(&time, start, stop) );
for(int i=0;i<pop;i++)
{
for(int j=0;j<gen;j++)
{
cout<<a[(i*gen)+j]<<" ";
}
cout<<endl;
}
printf("hold: %3.1f ms \n", time);
HANDLE_ERROR(cudaFree(dev_a));
HANDLE_ERROR( cudaEventDestroy( start ) );
HANDLE_ERROR( cudaEventDestroy( stop ) );
system("pause");
return 0;
}
And this is the result:
7 7 7 7 7 7 7 7 7 7
14 0 0 0 0 0 0 0 0 0
24 0 0 0 0 0 0 0 0 0
34 0 0 0 0 0 0 0 0 0
44 0 0 0 0 0 0 0 0 0
54 0 0 0 0 0 0 0 0 0
64 0 0 0 0 0 0 0 0 0
74 0 0 0 0 0 0 0 0 0
84 0 0 0 0 0 0 0 0 0
94 0 0 0 0 0 0 0 0 0
My expected result:
7 7 7 7 7 7 7 7 7 7
14 15 16 17 18 19 20 21 22 23
24 25 26 27 28 29 23 31 32 33
34 35 36 37 38 39 40 41 42 43
44 45 46 47 48 49 50 51 52 53
54 55 56 57 58 59 60 61 62 63
64 65 66 67 68 69 70 71 72 73
74 75 76 77 78 79 80 81 82 83
84 85 86 87 88 89 90 91 92 93
94 95 96 97 98 99 100 101 102 103
Can anyone please help me correct my code? Thanks.
If you want help, it would be useful to point out that some of your code (e.g. lock.h and book.h) comes from the CUDA by Example book. These headers are not a standard part of CUDA, so if you don't indicate where they come from, it may be confusing.
I see the following issues in your code:
You are using a __syncthreads() in a conditional block where not all threads will meet the __syncthreads() barrier:
if(tid%gen==0)
{
...
}
else
{
__syncthreads(); // illegal
}
The usage of __syncthreads() in this way is illegal because not all threads will be able to reach the __syncthreads() barrier:
__syncthreads() is allowed in conditional code but only if the conditional evaluates identically across the entire thread block, otherwise the code execution is likely to hang or produce unintended side effects.
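One common way to avoid this is to keep every __syncthreads() outside the divergent branches and only make the work between the barriers conditional. A minimal sketch of the pattern (not a drop-in fix for your kernel; it assumes blockDim.x <= 256):

__global__ void pattern_example(int *a)
{
    __shared__ int cache[256];
    int tid = threadIdx.x;
    int value = 0;

    if (tid % 2 == 0)
        value = tid + 4;              // divergent work, but no barrier inside the branch

    cache[tid] = value;
    __syncthreads();                  // reached by every thread of the block

    if (tid % 2 != 0)
        value = cache[tid - 1] + 1;   // safe: the even thread's write is visible after the barrier

    a[blockIdx.x * blockDim.x + tid] = value;
}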
You are using the temp local variable without initializing it first:
a[tid]=temp+1;//this must a[tid]=a[tid-1]+1;
Note that temp is a thread-local variable. It is not shared amongst threads. Therefore the above line of code (for threads in the else block) is using an uninitialized value of temp.
The remainder of your kernel code:
cache[cacheIndex]=temp;
__syncthreads();
for (int i=0;i<gen;i++)
{
if(cacheIndex==i)
{
lock. lock();
cache[cacheIndex]=temp;
lock.unlock();
}
}
}
does nothing useful because it is updating shared memory locations (i.e. cache) which are never transferred back to the dev_a variable, i.e. global memory. Therefore none of this code could affect the results you print out.
It's difficult to follow what you are trying to accomplish in your code. However if you change this line (the uninitialized value):
int temp;
to this:
int temp=tid+3;
Your code will print out the data according to what you have shown.

Apparently wrong results in CUDA emulation

This is the code below:
#include <stdio.h>
#include <cuda.h>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#define _crt_nonstdc_no_deprecate
__global__ void Kernel(float *d_arr_i,float *d_arr_o)
{
int i = threadIdx.x;
int j = threadIdx.y;
int k = threadIdx.z;
float f = d_arr_i[i];
d_arr_o[i] = f * f;
printf("%d \n",f);
printf("x = %d & y = %d & z = %d \n",i,j,k);
}
int main ()
{
//printf("Hello C..!");
const unsigned int arr_s = 12;
float h_arr_i[arr_s];
for (int i = 0;i < arr_s;i++)
{
h_arr_i[i] = float(i);
}
for (int i = 0;i<arr_s;i++)
{
printf("input arr %d : %e \n",(int)i,h_arr_i[i]);
}
float h_arr_o[arr_s];
float* d_arr_i;
float* d_arr_o;
const unsigned int d_arr_s = arr_s*sizeof(float);
cudaMalloc((void**)&d_arr_i,d_arr_s);
cudaMalloc((void**)&d_arr_o,d_arr_s);
cudaMemcpy(d_arr_i,h_arr_i,d_arr_s,cudaMemcpyHostToDevice);
Kernel<<<1,arr_s>>>(d_arr_i,d_arr_o);
cudaMemcpy(h_arr_o,d_arr_o,d_arr_s,cudaMemcpyDeviceToHost);
printf("\n");
for (int i = 0;i < arr_s;i++)
{
printf("output arr : %d \n",h_arr_o[i]);
}
int d;
cudaDeviceProp c;
int e;
cudaGetDeviceProperties(&c,e);
printf("\n %e",e);
cudaGetDeviceCount(&d);
printf("\n %d \n",d);
cudaFree(d_arr_i);
cudaFree(d_arr_o);
system("Pause");
return 0;
}
In the above code, "output arr" should give out the square of each number in the array, but it does not.
Can somebody please explain why this error is taking place?
Some details: I'm running the code on:
CUDA 2.3 / emulation mode / without an NVIDIA GPU
OS: Windows 7 64-bit
Visual Studio 2005 SP1
You are using the wrong printf format specifier. This error occurs twice, once in the kernel, and once at the end of your code for the output printout.
Instead of %d you should use %f. When I make that change to your code, I get the correct results when running on a real GPU using CUDA 5.0:
$ ./t212
input arr 0 : 0.000000e+00
input arr 1 : 1.000000e+00
input arr 2 : 2.000000e+00
input arr 3 : 3.000000e+00
input arr 4 : 4.000000e+00
input arr 5 : 5.000000e+00
input arr 6 : 6.000000e+00
input arr 7 : 7.000000e+00
input arr 8 : 8.000000e+00
input arr 9 : 9.000000e+00
input arr 10 : 1.000000e+01
input arr 11 : 1.100000e+01
0.000000
1.000000
2.000000
3.000000
4.000000
5.000000
6.000000
7.000000
8.000000
9.000000
10.000000
11.000000
x = 0 & y = 0 & z = 0
x = 1 & y = 0 & z = 0
x = 2 & y = 0 & z = 0
x = 3 & y = 0 & z = 0
x = 4 & y = 0 & z = 0
x = 5 & y = 0 & z = 0
x = 6 & y = 0 & z = 0
x = 7 & y = 0 & z = 0
x = 8 & y = 0 & z = 0
x = 9 & y = 0 & z = 0
x = 10 & y = 0 & z = 0
x = 11 & y = 0 & z = 0
output arr : 0.000000
output arr : 1.000000
output arr : 4.000000
output arr : 9.000000
output arr : 16.000000
output arr : 25.000000
output arr : 36.000000
output arr : 49.000000
output arr : 64.000000
output arr : 81.000000
output arr : 100.000000
output arr : 121.000000
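For reference, the two corrected lines are simply (same code as in the question, only the format specifier changed from %d to %f):

printf("%f \n", f);                         // in the kernel: f is a float
printf("output arr : %f \n", h_arr_o[i]);   // in main: h_arr_o[i] is a float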

cudaMemset() - does it set bytes or integers?

From online documentation:
cudaError_t cudaMemset (void * devPtr, int value, size_t count )
Fills the first count bytes of the memory area pointed to by devPtr with the constant byte value value.
Parameters:
devPtr - Pointer to device memory
value - Value to set for each byte of specified memory
count - Size in bytes to set
This description doesn't appear to be correct as:
int *dJunk;
cudaMalloc((void**)&dJunk, 32*(sizeof(int));
cudaMemset(dJunk, 0x12, 32);
will set all 32 integers to 0x12, not 0x12121212. (Int vs Byte)
The description talks about setting bytes. Count and Value are described in terms of bytes. Notice count is of type size_t, and value is of type int. i.e. Set a byte-size to an int-value.
cudaMemset() is not mentioned in the programming guide.
I have to assume the behavior I am seeing is correct, and the documentation is bad.
Is there a better documentation source out there? (Where?)
Are other types supported? i.e. Would float *dJunk; work? Others?
The documentation is correct, and your interpretation of what cudaMemset does is wrong. The function really does set byte values. Your example sets the first 32 bytes to 0x12, not all 32 integers to 0x12, viz:
#include <cstdio>
int main(void)
{
const int n = 32;
const size_t sz = size_t(n) * sizeof(int);
int *dJunk;
cudaMalloc((void**)&dJunk, sz);
cudaMemset(dJunk, 0, sz);
cudaMemset(dJunk, 0x12, 32);
int *Junk = new int[n];
cudaMemcpy(Junk, dJunk, sz, cudaMemcpyDeviceToHost);
for(int i=0; i<n; i++) {
fprintf(stdout, "%d %x\n", i, Junk[i]);
}
cudaDeviceReset();
return 0;
}
produces
$ nvcc memset.cu
$ ./a.out
0 12121212
1 12121212
2 12121212
3 12121212
4 12121212
5 12121212
6 12121212
7 12121212
8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 0
18 0
19 0
20 0
21 0
22 0
23 0
24 0
25 0
26 0
27 0
28 0
29 0
30 0
31 0
i.e. all 128 bytes set to 0, then the first 32 bytes set to 0x12. Exactly as described by the documentation.
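To the last part of the question: because cudaMemset only writes a repeated byte pattern, it is really only convenient for values like 0 or 0xFF, regardless of the element type. To fill device memory with an arbitrary int or float value you can use thrust::fill, or write a trivial kernel yourself; a minimal sketch (my own illustration, the buffer name dJunk_f is hypothetical):

// fill n elements of any type with an arbitrary value
template <typename T>
__global__ void fill_kernel(T *p, T value, size_t n)
{
    size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
    if (i < n)
        p[i] = value;          // one element per thread
}

// usage: set 32 floats to 1.5f
// float *dJunk_f;  cudaMalloc((void**)&dJunk_f, 32*sizeof(float));
// fill_kernel<<<(32 + 255) / 256, 256>>>(dJunk_f, 1.5f, (size_t)32);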

scan-array CUDA

I'm trying to scan a simple array using CUDA, but it seems there is something wrong with the code below. I am trying to find what I am doing wrong, but I can't. Can anyone please help me?
#include <stdio.h>
#include <stdlib.h>
__global__ void prescan(int *g_odata, int *g_idata, int n){
extern __shared__ int temp[];// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid]; // load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1){ // build sum in place up the tree
__syncthreads();
if (thid < d){
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
if (thid == 0) { temp[n - 1] = 0; } // clear the last element
for (int d = 1; d < n; d *= 2){ // traverse down tree & build scan
offset >>= 1;
__syncthreads();
if (thid < d){
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
int main(int argc, char *argv[]){
int i;
int *input = 0;
int *output = 0;
int *g_idata = 0;
int *g_odata = 0;
int numblocks = 1;
int radix = 16;
input = (int*)malloc(numblocks*radix*sizeof(int));
output = (int*)malloc(numblocks*radix*sizeof(int));
cudaMalloc((void**)&g_idata, numblocks*radix*sizeof(int));
cudaMalloc((void**)&g_odata, numblocks*radix*sizeof(int));
for(i=0; i<numblocks*radix; i++){
input[i] = 1 + 2*i;
}
for(i=0; i<numblocks*radix; i++){
printf("%d ", input[i]);
}
cudaMemcpy(g_idata, input, numblocks*radix*sizeof(int), cudaMemcpyHostToDevice);
prescan<<<1,8>>>(g_odata, g_idata, numblocks*radix);
cudaThreadSynchronize();
cudaMemcpy(output, g_odata, numblocks*radix*sizeof(int), cudaMemcpyDeviceToHost);
for(i=0; i<numblocks*radix; i++){
printf("%d ", output[i]);
}
free(input);
free(output);
cudaFree(g_idata);
cudaFree(g_odata);
return 0;
}
The output is this: 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
I want to have this output: 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 0 1 4 9 16 25 36 49 64 81 100 121 144 169 196 225
Just go through this code to implement a scan in a parallel environment.
The algorithm I implemented here is the Hillis-Steele scan; note that, as written, it produces an inclusive scan. I implemented the algorithm using shared memory, which will improve the execution time for large data sets.
#include<stdio.h>
#include<math.h>
__global__ void scan(int *d_in, int *d_out, int n)
{
    extern __shared__ int sdata[];
    int i;
    int tid = threadIdx.x;
    sdata[tid] = d_in[tid];                 // load input into shared memory
    __syncthreads();
    for (i = 1; i < n; i <<= 1)
    {
        // read the partner value before any thread overwrites it in this step
        int val = (tid >= i) ? sdata[tid - i] : 0;
        __syncthreads();
        if (tid >= i)
        {
            sdata[tid] += val;
        }
        __syncthreads();
    }
    d_out[tid] = sdata[tid];                // write the inclusive scan result
}
int main()
{
int h_in[16],h_out[16];
int i,j;
for (i = 0; i < 16; i++)
h_in[i] = 2*i+1;
for (i = 0; i < 16; i++)
printf("%d ", h_in[i]);
int *d_in;
int *d_out;
cudaMalloc((void**)&d_in, sizeof(int)* 16);
cudaMalloc((void**)&d_out, sizeof(int)* 16);
cudaMemcpy(d_in, h_in, sizeof(int) * 16, cudaMemcpyHostToDevice);
scan <<<1, 16, sizeof(int)*16 >>>(d_in,d_out, 16);
cudaMemcpy(h_out, d_out, sizeof(int) * 16, cudaMemcpyDeviceToHost);
for (i = 0; i < 16; i++)
printf("%d ", h_out[i]);
return 0;
}
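The question asks for an exclusive scan (0 1 4 9 ...), while the kernel above produces the inclusive result (1 4 9 16 ...). One simple way to get the exclusive form is to shift the inclusive result right by one element and write the identity (0) into the first slot; a minimal sketch, assuming a separate output buffer d_excl allocated like d_out:

__global__ void inclusive_to_exclusive(const int *in, int *out, int n)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n)
        out[tid] = (tid == 0) ? 0 : in[tid - 1];   // shift right, identity in the first slot
}

// usage after the scan kernel:
// inclusive_to_exclusive<<<1, 16>>>(d_out, d_excl, 16);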