Related
I am new to cuda and am trying to parallelize a very simple program shown below that was inspired from this link: https://devblogs.nvidia.com/even-easier-introduction-cuda/
typedef struct{
int temp;
int newtemp;
int neighbors[20];
} S;
void add(int n, S * s){
for(int i = 0; i < n; i++){
int newTemp = 0;
for(int j = 0; j < 20; j++){
newTemp += s[s[i].neighbors[j]].temp;
}
newTemp /= 3;
s[i].newtemp = newTemp;
}
}
int main(int argc, char *argv[]){
int n = 1<<21;
S grid[n];
for(int i = 0; i < n; i++){
S tmp1;
tmp1.temp = rand();
for(int j = 0; j<20; j++){
tmp1.neighbors[j] = rand()%n;
}
grid[i] = tmp1;
}
struct timespec start, end;
double gettime_diff, time_diff;
clock_t t, starttime, endtime;
clock_gettime(CLOCK_REALTIME, &start);
t = clock();
time(&starttime);
add(n,grid);
for(int i = 0; i < n; i++){
grid[i].temp = grid[i].newtemp;
if(i%83940==1)printf("%d\n",grid[i].temp);
}
return 0;
}
I am not getting desired results however as when I am updating temp all the new values are 0. I think that the issue is because the array of structs I am passing to my add function cannot be accessed in device memory. I, however, am having a hard time figuring out how to fix this. I found this post on stackoverflow and am a little unsure what the suggested answer did to fix the issue: Array of structs of arrays CUDA C
The cuda code I have for reference is here:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define SIZE 1000
#define NS_PER_US 1000
typedef struct{
int temp;
int newtemp;
int neighbors[20];
} S;
__global__ void add(int n, S * s){
int index = threadIdx.x;
int stride = blockDim.x;
//printf("%d\n",(n-index)/stride);
//printf("%d\n",s[0].temp);
for(int i = index; i < n; i+=stride){
printf("%d\n",index);
int newTemp = 0;
for(int j = 0; j < 20; j++){
newTemp += s[s[i].neighbors[j]].temp;
}
printf("%d\n",index);
newTemp /= 3;
s[i].newtemp = newTemp;
}
}
int main(int argc, char *argv[]){
int *h_a;
int *d_a;
int num_blocks= 2;
int num_th_per_blk= 5;
int n = 1<<21;
S grid[n];
for(int i = 0; i < n; i++){
S tmp1;
tmp1.temp = rand();
for(int j = 0; j<20; j++){
tmp1.neighbors[j] = rand()%n;
}
grid[i] = tmp1;
}
struct timespec start, end;
double gettime_diff, time_diff;
clock_t t, starttime, endtime;
clock_gettime(CLOCK_REALTIME, &start);
t = clock();
time(&starttime);
size_t memSize;
memSize = num_blocks* num_th_per_blk* sizeof(int);
h_a= (int*) malloc(memSize);
cudaMallocManaged((void **)&grid, n * sizeof(S));
cudaMalloc( (void**) &d_a, memSize);
dim3 dimGrid(num_blocks);
dim3 dimBlock(num_th_per_blk);
add<<< dimGrid, dimBlock >>>(n,grid);
cudaMemcpy( h_a, d_a, memSize,cudaMemcpyDeviceToHost);
for(int i = 0; i < n; i++){
grid[i].temp = grid[i].newtemp;
if(i%83940==1)printf("%d\n",grid[i].newtemp);
}
clock_gettime(CLOCK_REALTIME, &end);
t = clock() - t;
time(&endtime);
gettime_diff = (double) ((end.tv_sec - start.tv_sec)*CLOCKS_PER_SEC) + (double)((end.tv_nsec - start.tv_nsec)/NS_PER_US);
time_diff = difftime(endtime, starttime);
printf("\ttime (clock_gettime) %f\n", gettime_diff);
printf("\ttime (clock) %f\n", ((float)t)/CLOCKS_PER_SEC);
printf("\ttime (time) %f\n", time_diff);
return 0;
}
I feel like there is a simple fix here that I am not seeing, or maybe I am missing a key concept. Whatever the case any help would be greatly appreciated.
There is actually a lot wrong in your code, so much so that it is easier to post a working version than point out all the individual mistakes:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define NS_PER_US 1000
typedef struct{
int temp;
int newtemp;
int neighbors[20];
} S;
__global__
void add(int n, S * s)
{
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for(int i = index; i < n; i+=stride){
int newTemp = 0;
for(int j = 0; j < 20; j++){
newTemp += s[s[i].neighbors[j]].temp;
}
newTemp /= 3;
s[i].newtemp = newTemp;
}
}
int main(int argc, char *argv[]){
int n = 1<<10;
S* grid;
cudaMallocManaged((void **)&grid, n * sizeof(S));
for(int i = 0; i < n; i++){
S tmp1;
tmp1.temp = rand()%n;
for(int j = 0; j<20; j++){
tmp1.neighbors[j] = rand()%n;
}
grid[i] = tmp1;
}
struct timespec start, end;
double gettime_diff, time_diff;
clock_t t, starttime, endtime;
clock_gettime(CLOCK_REALTIME, &start);
t = clock();
time(&starttime);
int num_th_per_blk= 32;
int num_blocks= (n / num_th_per_blk) + (n % num_th_per_blk > 0) ? 1 : 0;
dim3 dimGrid(num_blocks);
dim3 dimBlock(num_th_per_blk);
add<<< dimGrid, dimBlock >>>(n,grid);
cudaDeviceSynchronize();
for(int i = 0; i < n; i++){
grid[i].temp = grid[i].newtemp;
if(i%10==1)printf("%d %d\n",i,grid[i].temp);
}
clock_gettime(CLOCK_REALTIME, &end);
t = clock() - t;
time(&endtime);
gettime_diff = (double) ((end.tv_sec - start.tv_sec)*CLOCKS_PER_SEC) + (double)((end.tv_nsec - start.tv_nsec)/NS_PER_US);
time_diff = difftime(endtime, starttime);
printf("\ttime (clock_gettime) %f\n", gettime_diff);
printf("\ttime (clock) %f\n", ((float)t)/CLOCKS_PER_SEC);
printf("\ttime (time) %f\n", time_diff);
return 0;
}
The most egregious error is how you handle grid in the host code. Doing this:
S grid[n];
// code initializing grid
cudaMallocManaged((void **)&grid, n * sizeof(S));
is both illegal (you shouldn't try and set grid to another pointer value, it isn't a pointer), and nonsensical. cudaMallocManaged allocates new memory, so all you are doing is initializing grid, then throwing away all the carefully initialized memory and replacing it with uninitialized memory which you pass to the kernel. The kernel then operates on random data. Note also that the grid stride loop within the kernel is also incorrect, and both the original code and CUDA version potentially suffer from integer overflow due to how you initialize the temp members of the structure in both versions using rand().
I want to write a prefix scan for large arrays using the instruction in GPUgem, It's a homework for my parallel class. I did follow all the steps in the book but still my code's not working. I got it to work for array size 4096 but it's not working for larger arrays. Here is my code :
#include <stdio.h>
#include <sys/time.h>
#define THREADS 1024
typedef int mytype;
__global__ void phaseI(mytype *g_odata, mytype *g_idata, int n, mytype *aux)
{
__shared__ mytype temp[THREADS];
const int tid1 = threadIdx.x;
int offset = 1;
temp[2*tid1] = g_idata[2*tid1]; // load input into shared memory
temp[2*tid1+1] = g_idata[2*tid1+1];
for (int d = THREADS>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
__syncthreads();
if (tid1 == 0) {
aux[blockIdx.x] = temp[THREADS - 1];
temp[THREADS - 1] = 0;
}
for (int d = 1; d < THREADS; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
mytype t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
__global__ void phaseII(mytype *g_odata, mytype *aux, int n)
{
const int tid1 = threadIdx.x;
const int B = (n / THREADS);
int offset = 1;
for (int d = B>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
__syncthreads();
if (tid1 == 0 && blockIdx.x == 0) {
aux[B - 1] = 0;
}
for (int d = 1; d < B; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
mytype t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] += aux[blockIdx.x];
g_odata[2*thid+1] += aux[blockIdx.x];
}
int main(int argc, char *argv[])
{
if (argc != 2) {
printf("usage: %s n\n", argv[0]);
return -1;
}
const int n = atoi(argv[1]);
mytype *h_i, *d_i, *h_o, *d_o, *d_temp;
const int size = n * sizeof(mytype);
h_i = (mytype *)malloc(size);
h_o = (mytype *)malloc(size);
if ((h_i == NULL) || (h_o == NULL)) {
printf("malloc failed\n");
return -1;
}
for (int i = 0; i < n; i++) {
h_i[i] = i;
h_o[i] = 0;
}
cudaMalloc(&d_i, size);
cudaMalloc(&d_temp, (n / THREADS) );
cudaMalloc(&d_o, size);
cudaMemset(d_o, 0, size);
cudaMemset(d_temp, 0, (n / THREADS));
cudaMemcpy(d_i, h_i, size, cudaMemcpyHostToDevice);
int blocks = n / THREADS;
phaseI<<<blocks, THREADS / 2 >>>(d_o, d_i, n, d_temp);
phaseII<<<blocks, THREADS / 2>>>(d_o, d_temp, n);
cudaThreadSynchronize();
cudaMemcpy(h_o, d_o, size, cudaMemcpyDeviceToHost);
printf("\n");
for (int i = 0; i < n ; i++) {
printf(" %d", h_o[i]);
}
printf("\n\n");
return 0;
}
Does anyone have any idea what I'm doing wrong?
One possible error I see in your code is here:
aux[thid] = temp[THREADS];
If your temp array is temp[1024], as you say, and each block has 1024 threads, as you say, then if THREADS is 1024, temp[THREADS] will access your shared memory array out-of-bounds (one past the end.) An array of 1024 elements only has valid indices from 0 to 1023.
Beyond that, it seems like you're asking how to take the last element out of a shared memory array (temp) and place it in a position in a (presumably global) aux array, which has one element for each block.
Here's a fully worked example:
$ cat t831.cu
#include <stdio.h>
#define THREADS 1024
#define BLOCKS 20
__global__ void kernel(int *aux){
__shared__ int temp[THREADS];
temp[threadIdx.x] = threadIdx.x + blockIdx.x;
__syncthreads();
if (threadIdx.x == 0)
aux[blockIdx.x] = temp[THREADS-1];
}
int main(){
int *h_data, *d_data;
const int dsize = BLOCKS*sizeof(int);
h_data=(int *)malloc(dsize);
cudaMalloc(&d_data, dsize);
memset(h_data, 0, dsize);
cudaMemset(d_data, 0, dsize);
kernel<<<BLOCKS, THREADS>>>(d_data);
cudaMemcpy(h_data, d_data, dsize, cudaMemcpyDeviceToHost);
for (int i = 0; i < BLOCKS; i++) printf("%d, ", h_data[i]);
printf("\n");
return 0;
}
$ nvcc -o t831 t831.cu
$ cuda-memcheck ./t831
========= CUDA-MEMCHECK
1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042,
========= ERROR SUMMARY: 0 errors
$
Here I want to calculate the distance of each two points, and decide if they are neighbours. here is my simple code in cuda.
__global__ void calcNeighbors(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dis = 0.0f;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=0; i<N; i++) {
DataPoint p2 = points[i];
dis = 0;
dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
(p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
(p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[tid*N+i] = true;
} else {
neighbors[tid*N+i] = false;
}
}
tid += blockDim.x * gridDim.x;
}
}
The DataPoint is a struct is
typedef struct DataPoint {
float pfDimens[3];
} DataPoint;
so here i want to reduce the time, How can i do? I have tried to use memory coalesing and share memory, but i didn't get a good speed up?
===============use share memory==============
__global__ void calcNeighbors2(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
__shared__ DataPoint sharedpoints[threadsPerBlock];
int start = blockIdx.x * blockDim.x;
int len = start+threadIdx.x;
if (len < N) {
sharedpoints[threadIdx.x] = points[len];
}
len = imin(N, blockDim.x + start);
__syncthreads();
int tid = threadIdx.x;
float dis;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=start; i<len; i++) {
dis = 0;
dis += (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) * (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) +
(p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) * (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) +
(p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]) * (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[i*N+tid] = true;
} else {
neighbors[i*N+tid] = false;
}
}
tid += blockDim.x;
}
}
Here i changed the neighbors[tid*N+i] to neighbors[i*N+tid], it give me amlost 8x speed up on Tesla K10.G2.8GB. But when i use share memory to store some points, it is no use?
There are at least 4 ideas, some of which have already been stated in the comments:
Transform your point distance storage from AoS format:
struct DataPoint {
float pfDimens[3];
};
to SoA format:
struct DataPoint {
float pfDimens_x[NPTS];
float pfDimens_y[NPTS];
float pfDimens_z[NPTS];
};
this will enable full coalescing on loading of the data. In fact, to help with point 4 below, I would just switch to using 3 bare arrays, rather than a structure.
reduce the computation to (slightly less than) half:
for (int i=N-1; i>tid; i--) {
then, either in the thread code itself, or in the host, you can populate the other "half" of the output matrix by copying data.
Transpose the storage in your output matrix, so that you can write a storage operation like this:
neighbors[i*N+tid] = true;
which will nicely coalesce, as opposed to this:
neighbors[tid*N+i] = true;
which will not.
Since your input point data is read only, mark the kernel parameter appropriately:
const float * __restrict__ points_x, const float * __restrict__ points_y, const float * __restrict__ points_z
in some cases, and on some GPUs, this will often lead to a speed-up due to use of the read-only cache. If you really want to get aggressive with caching, and your data array is small enough (4K or less float points), you could put a copy of the point data in global memory as well as a copy in __constant__ memory, and load the "uniform" load you are doing here through constant memory:
DataPoint p2 = c_points[i];
thus you could perform the coalesced load through the read-only cache, the uniform load through the constant cache, and the coalesced store going to ordinary global memory.
On a K40c, on linux/CUDA 7, for N = 4096, the net effect of these changes appears to be about a 3.5x speedup, at the kernel level:
$ cat t749.cu
#include <stdio.h>
#define N 4096
// if N is 16K/3 or less, we can use constant
#define USE_CONSTANT
#define THRESH 0.2f
#define nTPB 256
#define nBLK (N/nTPB+1)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
struct DataPoint {
float pfDimens[3];
};
__global__ void calcNeighbors(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dis = 0.0f;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=0; i<N; i++) {
DataPoint p2 = points[i];
dis = 0;
dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
(p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
(p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[tid*N+i] = true;
} else {
neighbors[tid*N+i] = false;
}
}
tid += blockDim.x * gridDim.x;
}
}
#ifdef USE_CONSTANT
__constant__ float cpx[N];
__constant__ float cpy[N];
__constant__ float cpz[N];
#endif
__global__ void calcNeighbors2(const float * __restrict__ pts_x, const float * __restrict__ pts_y, const float * __restrict__ pts_z, const float doubleRadius, bool * __restrict__ neighbors) {
int tid = threadIdx.x+blockDim.x*blockIdx.x;
while (tid < N) {
float p1x = pts_x[tid];
float p1y = pts_y[tid];
float p1z = pts_z[tid];
for (int i = N-1; i > tid; i--){
float p2x, p2y, p2z;
#ifdef USE_CONSTANT
p2x = cpx[i];
p2y = cpy[i];
p2z = cpz[i];
#else
p2x = pts_x[i];
p2y = pts_y[i];
p2z = pts_z[i];
#endif
float dis = ((p1x-p2x)*(p1x-p2x)) + ((p1y-p2y)*(p1y-p2y)) + ((p1z-p2z)*(p1z-p2z));
neighbors[i*N+tid] = (dis <= doubleRadius);
}
tid += blockDim.x * gridDim.x;
}
}
int main(){
float *dx, *dy, *dz, *hx, *hy, *hz;
DataPoint *dp, *hp;
bool *dn, *hn1, *hn2;
hx =(float *)malloc(N*sizeof(float));
hy =(float *)malloc(N*sizeof(float));
hz =(float *)malloc(N*sizeof(float));
hp =(DataPoint *)malloc(N*sizeof(DataPoint));
hn1=(bool *)malloc(N*N*sizeof(bool));
hn2=(bool *)malloc(N*N*sizeof(bool));
cudaMalloc(&dx, N*sizeof(float));
cudaMalloc(&dy, N*sizeof(float));
cudaMalloc(&dz, N*sizeof(float));
cudaMalloc(&dp, N*sizeof(DataPoint));
cudaMalloc(&dn, N*N*sizeof(bool));
for (int i =0; i < N; i++){
hx[i] = rand()/(float)RAND_MAX;
hy[i] = rand()/(float)RAND_MAX;
hz[i] = rand()/(float)RAND_MAX;
hp[i].pfDimens[0] = hx[i];
hp[i].pfDimens[1] = hy[i];
hp[i].pfDimens[2] = hz[i];}
cudaMemcpy(dx, hx, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dy, hy, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dz, hz, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dp, hp, N*sizeof(DataPoint), cudaMemcpyHostToDevice);
// warm-up
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t1 = dtime_usec(0);
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 1 error");
t1 = dtime_usec(t1);
cudaMemcpy(hn1, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
// warm-up
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t2 = dtime_usec(0);
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 2 error");
t2 = dtime_usec(t2);
cudaMemcpy(hn2, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
cudaCheckErrors("some error");
printf("t1: %fs, t2: %fs\n", t1/(float)USECPSEC, t2/(float)USECPSEC);
// results validation
for (int i = 0; i < N; i++)
for (int j = i+1; j < N; j++)
if (hn1[i*N+j] != hn2[j*N+i]) {printf("mismatch at %d, %d, was: %d, should be: %d\n", i, j, hn2[j*N+i], hn1[i*N+j]); return 1;}
return 0;
}
$ nvcc -arch=sm_35 -o t749 t749.cu
$ ./t749
t1: 0.004903s, t2: 0.001395s
$
In the case of K40c, the limited number of blocks being launched above (16) is a significant impediment to performance, due to latency. If we comment out the USE_CONSTANT define, and change N to 16384, we observe an even higher speedup with the improved kernel:
$ ./t749
t1: 0.267107s, t2: 0.008209s
$
the resultant ~48 blocks being enough to approximately "fill" the K40c which has 15 SMs.
EDIT: now that you've posted a shared memory kernel, I added it to my test case as calcNeighbors3 and compared it's timing performance (as t3). It is almost as fast as my kernel, and it seems to provide the correct result (matches your original kernel) so I'm not sure what your concerns are.
Here's the updated code and test case:
$ cat t749.cu
#include <stdio.h>
#include <math.h>
#define imin(X,Y) ((X)<(Y))?(X):(Y)
#define N 32768
// if N is 16K/3 or less, we can use constant
// #define USE_CONSTANT
#define THRESH 0.2f
#define nTPB 256
#define nBLK (N/nTPB+1)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
struct DataPoint {
float pfDimens[3];
};
__global__ void calcNeighbors(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dis = 0.0f;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=0; i<N; i++) {
DataPoint p2 = points[i];
dis = 0;
dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
(p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
(p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[tid*N+i] = true;
} else {
neighbors[tid*N+i] = false;
}
}
tid += blockDim.x * gridDim.x;
}
}
#ifdef USE_CONSTANT
__constant__ float cpx[N];
__constant__ float cpy[N];
__constant__ float cpz[N];
#endif
__global__ void calcNeighbors2(const float * __restrict__ pts_x, const float * __restrict__ pts_y, const float * __restrict__ pts_z, const float doubleRadius, bool * __restrict__ neighbors) {
int tid = threadIdx.x+blockDim.x*blockIdx.x;
while (tid < N) {
float p1x = pts_x[tid];
float p1y = pts_y[tid];
float p1z = pts_z[tid];
for (int i = N-1; i > tid; i--){
float p2x, p2y, p2z;
#ifdef USE_CONSTANT
p2x = cpx[i];
p2y = cpy[i];
p2z = cpz[i];
#else
p2x = pts_x[i];
p2y = pts_y[i];
p2z = pts_z[i];
#endif
float dis = ((p1x-p2x)*(p1x-p2x)) + ((p1y-p2y)*(p1y-p2y)) + ((p1z-p2z)*(p1z-p2z));
neighbors[i*N+tid] = (dis <= doubleRadius);
}
tid += blockDim.x * gridDim.x;
}
}
__global__ void calcNeighbors3(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
__shared__ DataPoint sharedpoints[nTPB];
int start = blockIdx.x * blockDim.x;
int len = start+threadIdx.x;
if (len < N) {
sharedpoints[threadIdx.x] = points[len];
}
len = imin(N, blockDim.x + start);
__syncthreads();
int tid = threadIdx.x;
float dis;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=start; i<len; i++) {
dis = 0;
dis += (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) * (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) +
(p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) * (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) +
(p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]) * (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[i*N+tid] = true;
} else {
neighbors[i*N+tid] = false;
}
}
tid += blockDim.x;
}
}
int main(){
float *dx, *dy, *dz, *hx, *hy, *hz;
DataPoint *dp, *hp;
bool *dn, *hn1, *hn2, *hn3;
hx =(float *)malloc(N*sizeof(float));
hy =(float *)malloc(N*sizeof(float));
hz =(float *)malloc(N*sizeof(float));
hp =(DataPoint *)malloc(N*sizeof(DataPoint));
hn1=(bool *)malloc(N*N*sizeof(bool));
hn2=(bool *)malloc(N*N*sizeof(bool));
hn3=(bool *)malloc(N*N*sizeof(bool));
cudaMalloc(&dx, N*sizeof(float));
cudaMalloc(&dy, N*sizeof(float));
cudaMalloc(&dz, N*sizeof(float));
cudaMalloc(&dp, N*sizeof(DataPoint));
cudaMalloc(&dn, N*N*sizeof(bool));
for (int i =0; i < N; i++){
hx[i] = rand()/(float)RAND_MAX;
hy[i] = rand()/(float)RAND_MAX;
hz[i] = rand()/(float)RAND_MAX;
hp[i].pfDimens[0] = hx[i];
hp[i].pfDimens[1] = hy[i];
hp[i].pfDimens[2] = hz[i];}
cudaMemcpy(dx, hx, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dy, hy, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dz, hz, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dp, hp, N*sizeof(DataPoint), cudaMemcpyHostToDevice);
#ifdef USE_CONSTANT
cudaMemcpyToSymbol(cpx, hx, N*sizeof(float));
cudaMemcpyToSymbol(cpy, hy, N*sizeof(float));
cudaMemcpyToSymbol(cpz, hz, N*sizeof(float));
#endif
// warm-up
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t1 = dtime_usec(0);
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 1 error");
t1 = dtime_usec(t1);
cudaMemcpy(hn1, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
// warm-up
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t2 = dtime_usec(0);
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 2 error");
t2 = dtime_usec(t2);
cudaMemcpy(hn2, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
// warm-up
calcNeighbors3<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t3 = dtime_usec(0);
calcNeighbors3<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 3 error");
t3 = dtime_usec(t3);
cudaMemcpy(hn3, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
cudaCheckErrors("some error");
printf("t1: %fs, t2: %fs, t3: %fs\n", t1/(float)USECPSEC, t2/(float)USECPSEC, t3/(float)USECPSEC);
// results validation
for (int i = 0; i < N; i++)
for (int j = i+1; j < N; j++)
if (hn1[i*N+j] != hn2[j*N+i]) {printf("1:2 mismatch at %d, %d, was: %d, should be: %d\n", i, j, hn2[j*N+i], hn1[i*N+j]); return 1;}
for (int i = 0; i < N*N; i++)
if (hn1[i] != hn3[i]) {printf("1:3 mismatch at %d, was: %d, should be: %d\n", i, hn1[i], hn3[i]); return 1;}
return 0;
}
$ nvcc -arch=sm_35 -o t749 t749.cu
$ ./t749
t1: 1.260010s, t2: 0.022661s, t3: 0.029632s
$
For this test, I have changed the data set size to 32768 since that is closer to the range you care about. Your shared memory kernel shows about a 42x speedup over your original kernel, and my kernel shows about a 55x speedup, on my K40c.
I want to compute 'out = alpha * px + beta * py','px' and 'py' is array.*
I have a simple kernel:
__global__
void saxpyGPU2( float *out, const float *px, const float *py, size_t N, float alpha,float beta )
{
size_t i = blockDim.x*blockIdx.x + threadIdx.x;
while (i < N)
{
out[i] = alpha * px[i] + beta * py[i];
i += blockDim.x*gridDim.x;
}
}
It works, so I want to loop unroll.
The code in cuda-handbook is:
template<const int n>
__device__
void saxpy_unrolled(float *out, const float *px, const float *py, size_t N, float alpha,float beta)
{
float x[n], y[n];
size_t i;
for ( i = n*blockIdx.x*blockDim.x+threadIdx.x; i < N-n*blockDim.x*gridDim.x; i += n*blockDim.x*gridDim.x ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
x[j] = px[index];
y[j] = py[index];
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
out[index] = alpha*x[j]+beta* y[j];
}
}
// to avoid the (index<N) conditional in the inner loop,
// we left off some work at the end
for ( int j = 0; j < n; j++ ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) {
x[j] = px[index];
y[j] = py[index];
}
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) out[index] = alpha*x[j]+beta* y[j];
}
}
}
__global__
void saxpyGPU( float *out, const float *px, const float *py, size_t N, float alpha,float beta )
{
saxpy_unrolled<4>( out, px, py, N, alpha ,beta);
}
I don't understand in the second branch when i > N-n*blockDim.x*gridDim.x. why use a outer loop
for ( int j = 0; j < n; j++ ) {
for ( int j = 0; j < n; j++ )....}
And I test those two kernel , first one is OK, but second one I copy from the book is incorrect.
I initial two array while(i<1024) a[i] = i; b[i] = 10*i;i++, and I want to compute the c = alpha*a + beta*b use the two kernels above, but the result in the loop unrolled kernel is 4.3e8 for all element in c.
This my test code:
int main(){
int arraySize = 1024;
float* a =new float[arraySize];
float* b =new float[arraySize];
float* c =new float[arraySize];
for (int i =0;i<arraySize;i++)
{
a[i] = 1.0* i;
b[i] = 10.0*i;
c[i] = 0.0;
}
float* d_a;
float* d_b;
float* d_c;
cudaMalloc((void**)&d_a,sizeof(float)*arraySize);
cudaMemcpy(d_a,a,sizeof(float)*arraySize,cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_b,sizeof(float)*arraySize);
cudaMemcpy(d_b,b,sizeof(float)*arraySize,cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_c,sizeof(float)*arraySize);
for (int i=0;i<arraySize;i++)
{
c[i] = a[i] + b[i];
}
dim3 block_size(256,1,1);
dim3 grid_size((arraySize -1)/block_size.x+1,1,1);
float alpha = 1.0;
float beta = 1.0;
bool flag = true;
if(flag)
{
saxpyGPU<<<grid_size,block_size>>>(d_c,d_a,d_b,arraySize,alpha,beta);
float* temp = new float[arraySize];
cudaMemcpy(temp,d_c,arraySize*sizeof(float),cudaMemcpyDeviceToHost);
for (int i = 0;i<arraySize;i++)
{
cout<<(temp[i] - c[i])<<",";
}
}
else
{
saxpyGPU2<<<grid_size,block_size>>>(d_c,d_a,d_b,arraySize,alpha,beta);
cudaMemcpy(temp,d_c,arraySize*sizeof(float),cudaMemcpyDeviceToHost);
for (int i = 0;i<arraySize;i++)
{
cout<<(temp[i] - c[i])<<",";
}
Those two kernel show different result
The kernel code you posted is perfectly correct and produces the expected results. This can be demonstrated using the following code:
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
#include <vector>
#include <algorithm>
#include <cmath>
template<const int n>
__device__
void saxpy_unrolled(float *out, const float *px, const float *py,
size_t N, float alpha,float beta) {
float x[n], y[n];
size_t i;
for ( i = n*blockIdx.x*blockDim.x+threadIdx.x;
i < N-n*blockDim.x*gridDim.x;
i += n*blockDim.x*gridDim.x ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
x[j] = px[index];
y[j] = py[index];
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
out[index] = alpha*x[j]+beta* y[j];
}
}
for ( int j = 0; j < n; j++ ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) {
x[j] = px[index];
y[j] = py[index];
}
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) {
out[index] = alpha*x[j] + beta*y[j];
}
}
}
}
__global__
void saxpyGPU( float *out, const float *px, const float *py,
size_t N, float alpha,float beta ) {
saxpy_unrolled<4>( out, px, py, N, alpha ,beta);
}
struct prg {
float a, b;
__host__ __device__
prg(float _a=0.f, float _b=1.f) : a(_a), b(_b) {};
__host__ __device__
float operator()(const unsigned int n) const {
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> dist(a, b);
rng.discard(n);
return dist(rng);
}
};
int main(void) {
const int N = 100000;
const float alpha = 0.12345f, beta = 0.9876f;
prg gen(1.f, 2.f);
thrust::device_vector<float> x(N), y(N), z(N);
thrust::counting_iterator<unsigned int> iseqx(0);
thrust::counting_iterator<unsigned int> iseqy(N);
thrust::transform(iseqx, iseqx + N, x.begin(), gen);
thrust::transform(iseqy, iseqy + N, y.begin(), gen);
float *xp = thrust::raw_pointer_cast(&x[0]);
float *yp = thrust::raw_pointer_cast(&y[0]);
float *zp = thrust::raw_pointer_cast(&z[0]);
dim3 blockdim(128);
dim3 griddim(16);
saxpyGPU<<<griddim, blockdim>>>(zp, xp, yp, N, alpha, beta);
cudaDeviceSynchronize();
std::vector<float> xh(N), yh(N), zh(N);
thrust::copy(x.begin(), x.end(), xh.begin());
thrust::copy(y.begin(), y.end(), yh.begin());
thrust::copy(z.begin(), z.end(), zh.begin());
float maxabserr = -1.f, maxrelerr = -1.f;
for(int i=0; i<N; i++) {
float saxpyval = alpha * xh[i] + beta * yh[i];
float abserr = fabs(zh[i]-saxpyval);
float relerr = abserr / fmaxf(fabs(zh[i]), fabs(saxpyval));
maxabserr = fmaxf(abserr, maxabserr);
maxrelerr = fmaxf(relerr, maxrelerr);
}
std::cout.precision(10);
std::cout << "Maximum absolute error = " << maxabserr << std::endl;
std::cout << "Maximum relative error = " << maxrelerr << std::endl;
return 0;
}
which gives me the following:
$ nvcc -arch=sm_30 -o unrolled_saxpy unrolled_saxpy.cu
$ ./unrolled_saxpy
Maximum absolute error = 2.384185791e-07
Maximum relative error = 1.1920676e-07
If you (still) do not understand why the kernel is written as it is, follow what I showed you in your previous question and manually unroll the saxpy function. Start with n=1 and confirm it is functionally the same as the unrolled equivalent, and then try n=2, n=4, etc. to see what the action of loop unrolling is.
I've been trying to write a kernel in that calculates the sum of the inverse of the distance between N given points over N. A serial coda in C would be like
average = 0;
for(int i = 0; i < Np; i++){
for(int j = i + 1; j < Np; j++){
average += 1.0e0f/sqrtf((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
}
average = average/(float)N;
Where rx and ry are the x and y coordinates, respectively.
I generate the points via a kernel that uses random number generator. For the kernel, I used 128(256) threads per block for 4k(8k) points. On it every thread performs the inner above inner loop, then the results are passed to a reduce sum function, as follows
Generate points:
__global__ void InitRNG ( curandState * state, const int seed ){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
curand_init (seed, tIdx, 0, &state[tIdx]);
}
__global__
void SortPoints(float* X, float* Y,const int N, curandState *state){
float rdmn1, rdmn2;
unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
float range;
if(tIdx < N){
rdmn1 = curand_uniform(&state[tIdx]);
rdmn2 = curand_uniform(&state[tIdx]);
range = sqrtf(0.25e0f*N*rdmn1);
X[tIdx] = range*cosf(2.0e0f*pi*rdmn2);
Y[tIdx] = range*sinf(2.0e0f*pi*rdmn2);
}
}
Reduction:
__device__
float ReduceSum2(float In){
__shared__ float data[BlockSize];
unsigned int tIdx = threadIdx.x;
data[tIdx] = In;
__syncthreads();
for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){
if(tIdx < i){
data[tIdx] += data[tIdx + i];
}
__syncthreads();
}
return data[0];
}
Kernel:
__global__
void AvgDistance(float *X, float *Y, float *Avg, const int N){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
int bIdx = blockIdx.x;
float x , y;
float d = 0.0f;
if(tIdx < N){
for(int i = tIdx + 1; i < N ; i++){
x = X[tIdx] - X[i];
y = Y[tIdx] - Y[i];
d += 1.0e0f/(sqrtf(x*x + y*y));
}
__syncthreads();
Avg[bIdx] = ReduceSum2(d);
}
}
The kernel is configured and launched as follows:
dim3 threads(BlockSize,BlockSize);
dim3 blocks(ceil(Np/threads.x),ceil(Np/threads.y));
InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(float)>>>(d_rx,d_ry,d_Avg,Np);
Finally, I copy the data back to host and then perform the remaining sum:
Avg = new float[blocks.x];
CHECK(cudaMemcpy(Avg,d_Avg,blocks.x*sizeof(float),cudaMemcpyDeviceToHost),ERROR_CPY_DEVTOH);
float average = 0;
for(int i = 0; i < blocks.x; i++){
average += Avg[i];
}
average = average/(float)Np;
For 4k points, ok! the results are:
Average distance between points (via Kernel) = 108.615
Average distance between points (via CPU) = 110.191
In this case the sum may be performed in different order, causing both results to diverge from each other, I don't know...
But when it comes to 8k, the results are quiet different:
Average distance between points (via Kernel) = 153.63
Average distance between points (via CPU) = 131.471
To me it seems that both the kernel and the serial code are written the same way. What leads me to distrust the precision on CUDA calculation of floating point numbers. Does this make sense? Or are the access to global memory causing some conflicts when some threads load the same data from X and Y at the same time? Or the way I wrote the kernel is in some way 'wrong'(I mean, am I doing something that is causing both results to diverge from each other?).
Actually, from what I can tell, the problem seems to be on the CPU side. I created a sample code based on your code.
I was able to reproduce your results.
First I switched all instances of sinf, cosf, and sqrtf to their corresponding double versions. This made no difference in the results.
Next I included a typedef so I could easily switch the precision from float to double and back, replacing every relevant instance of float in the code with mytype which is my typedef.
When I run the code with typedef of float and a data size of 4096 I get these results:
GPU average = 108.294922
CPU average = 109.925285
When I run the code with typedef of double and a data size of 4096 I get these results:
GPU average = 108.294903
CPU average = 108.294903
When I run the code with typedef of float and a data size of 8192 I get these results:
GPU average = 153.447327
CPU average = 131.473526
When I run the code with typedef of double and a data size of 8192 I get these results:
GPU average = 153.447380
CPU average = 153.447380
There are at least 2 observations:
The GPU results don't vary between float and double, except in the 5th decimal place
The CPU results vary by 1-20% or so between float and double, but when double is selected, they line up exactly (to the 6th decimal place, anyway) with the GPU results.
Based on this, I believe the CPU is providing the variable, questionable behavior.
Here's my code for reference:
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#define DSIZE 8192
#define BlockSize 32
#define pi 3.14159f
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef double mytype;
__global__ void InitRNG ( curandState * state, const int seed ){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
curand_init (seed, tIdx, 0, &state[tIdx]);
}
__global__
void SortPoints(mytype* X, mytype* Y,const int N, curandState *state){
mytype rdmn1, rdmn2;
unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
mytype range;
if(tIdx < N){
rdmn1 = curand_uniform(&state[tIdx]);
rdmn2 = curand_uniform(&state[tIdx]);
range = sqrt(0.25e0f*N*rdmn1);
X[tIdx] = range*cos(2.0e0f*pi*rdmn2);
Y[tIdx] = range*sin(2.0e0f*pi*rdmn2);
}
}
__device__
mytype ReduceSum2(mytype In){
__shared__ mytype data[BlockSize];
unsigned int tIdx = threadIdx.x;
data[tIdx] = In;
__syncthreads();
for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){
if(tIdx < i){
data[tIdx] += data[tIdx + i];
}
__syncthreads();
}
return data[0];
}
__global__
void AvgDistance(mytype *X, mytype *Y, mytype *Avg, const int N){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
int bIdx = blockIdx.x;
mytype x , y;
mytype d = 0.0f;
if(tIdx < N){
for(int i = tIdx + 1; i < N ; i++){
x = X[tIdx] - X[i];
y = Y[tIdx] - Y[i];
d += 1.0e0f/(sqrt(x*x + y*y));
}
__syncthreads();
Avg[bIdx] = ReduceSum2(d);
}
}
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
mytype average = 0.0f;
for(int i = 0; i < size; i++){
for(int j = i + 1; j < size; j++){
average += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
}
average = average/(mytype)size;
return average;
}
int main() {
int Np = DSIZE;
mytype *rx, *ry, *d_rx, *d_ry, *d_Avg, *Avg;
curandState *d_state;
int seed = 1;
dim3 threads(BlockSize,BlockSize);
dim3 blocks((int)ceilf(Np/(float)threads.x),(int)ceilf(Np/(float)threads.y));
printf("number of blocks = %d\n", blocks.x);
printf("number of threads= %d\n", threads.x);
rx = (mytype *)malloc(DSIZE*sizeof(mytype));
if (rx == 0) {printf("malloc fail\n"); return 1;}
ry = (mytype *)malloc(DSIZE*sizeof(mytype));
if (ry == 0) {printf("malloc fail\n"); return 1;}
cudaMalloc((void**)&d_rx, DSIZE * sizeof(mytype));
cudaMalloc((void**)&d_ry, DSIZE * sizeof(mytype));
cudaMalloc((void**)&d_Avg, blocks.x * sizeof(mytype));
cudaMalloc((void**)&d_state, DSIZE * sizeof(curandState));
cudaCheckErrors("cudamalloc");
InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(mytype)>>>(d_rx,d_ry,d_Avg,Np);
cudaCheckErrors("kernels");
Avg = new mytype[blocks.x];
cudaMemcpy(Avg,d_Avg,blocks.x*sizeof(mytype),cudaMemcpyDeviceToHost);
cudaMemcpy(rx, d_rx, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
cudaMemcpy(ry, d_ry, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
cudaCheckErrors("cudamemcpy");
mytype average = 0;
for(int i = 0; i < blocks.x; i++){
average += Avg[i];
}
average = average/(mytype)Np;
printf("GPU average = %f\n", average);
average = cpu_avg(rx, ry, DSIZE);
printf("CPU average = %f\n", average);
return 0;
}
I am running on RHEL 5.5, CUDA 5.0, Intel Xeon X5560
compiled with:
nvcc -O3 -arch=sm_20 -lcurand -lm -o t93 t93.cu
EDIT:
After observing that the variability was on the CPU side, I found that I could eliminate most of the CPU variability by modifying your CPU averaging code like this:
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
mytype average = 0.0f;
mytype temp = 0.0f;
for(int i = 0; i < size; i++){
for(int j = i + 1; j < size; j++){
temp += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
average += temp/(mytype)size;
temp = 0.0f;
}
return average;
}
So I would say there's a problem with intermediate results on the CPU side. It's interesting that it doesn't show up on the GPU result. I suspect the reason for this is that the final summation of GPU averages is done on the CPU (therefore each individual GPU block result is scaled down by the size, e.g. 8192), and these may have an intermediate precision that is sufficient to survive until the final division. If you inlined the CPU average calculation, you may observe something different again.