I am new to cuda and am trying to parallelize a very simple program shown below that was inspired from this link: https://devblogs.nvidia.com/even-easier-introduction-cuda/
typedef struct{
int temp;
int newtemp;
int neighbors[20];
} S;
void add(int n, S * s){
for(int i = 0; i < n; i++){
int newTemp = 0;
for(int j = 0; j < 20; j++){
newTemp += s[s[i].neighbors[j]].temp;
}
newTemp /= 3;
s[i].newtemp = newTemp;
}
}
int main(int argc, char *argv[]){
int n = 1<<21;
S grid[n];
for(int i = 0; i < n; i++){
S tmp1;
tmp1.temp = rand();
for(int j = 0; j<20; j++){
tmp1.neighbors[j] = rand()%n;
}
grid[i] = tmp1;
}
struct timespec start, end;
double gettime_diff, time_diff;
clock_t t, starttime, endtime;
clock_gettime(CLOCK_REALTIME, &start);
t = clock();
time(&starttime);
add(n,grid);
for(int i = 0; i < n; i++){
grid[i].temp = grid[i].newtemp;
if(i%83940==1)printf("%d\n",grid[i].temp);
}
return 0;
}
I am not getting desired results however as when I am updating temp all the new values are 0. I think that the issue is because the array of structs I am passing to my add function cannot be accessed in device memory. I, however, am having a hard time figuring out how to fix this. I found this post on stackoverflow and am a little unsure what the suggested answer did to fix the issue: Array of structs of arrays CUDA C
The cuda code I have for reference is here:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define SIZE 1000
#define NS_PER_US 1000
typedef struct{
int temp;
int newtemp;
int neighbors[20];
} S;
__global__ void add(int n, S * s){
int index = threadIdx.x;
int stride = blockDim.x;
//printf("%d\n",(n-index)/stride);
//printf("%d\n",s[0].temp);
for(int i = index; i < n; i+=stride){
printf("%d\n",index);
int newTemp = 0;
for(int j = 0; j < 20; j++){
newTemp += s[s[i].neighbors[j]].temp;
}
printf("%d\n",index);
newTemp /= 3;
s[i].newtemp = newTemp;
}
}
int main(int argc, char *argv[]){
int *h_a;
int *d_a;
int num_blocks= 2;
int num_th_per_blk= 5;
int n = 1<<21;
S grid[n];
for(int i = 0; i < n; i++){
S tmp1;
tmp1.temp = rand();
for(int j = 0; j<20; j++){
tmp1.neighbors[j] = rand()%n;
}
grid[i] = tmp1;
}
struct timespec start, end;
double gettime_diff, time_diff;
clock_t t, starttime, endtime;
clock_gettime(CLOCK_REALTIME, &start);
t = clock();
time(&starttime);
size_t memSize;
memSize = num_blocks* num_th_per_blk* sizeof(int);
h_a= (int*) malloc(memSize);
cudaMallocManaged((void **)&grid, n * sizeof(S));
cudaMalloc( (void**) &d_a, memSize);
dim3 dimGrid(num_blocks);
dim3 dimBlock(num_th_per_blk);
add<<< dimGrid, dimBlock >>>(n,grid);
cudaMemcpy( h_a, d_a, memSize,cudaMemcpyDeviceToHost);
for(int i = 0; i < n; i++){
grid[i].temp = grid[i].newtemp;
if(i%83940==1)printf("%d\n",grid[i].newtemp);
}
clock_gettime(CLOCK_REALTIME, &end);
t = clock() - t;
time(&endtime);
gettime_diff = (double) ((end.tv_sec - start.tv_sec)*CLOCKS_PER_SEC) + (double)((end.tv_nsec - start.tv_nsec)/NS_PER_US);
time_diff = difftime(endtime, starttime);
printf("\ttime (clock_gettime) %f\n", gettime_diff);
printf("\ttime (clock) %f\n", ((float)t)/CLOCKS_PER_SEC);
printf("\ttime (time) %f\n", time_diff);
return 0;
}
I feel like there is a simple fix here that I am not seeing, or maybe I am missing a key concept. Whatever the case any help would be greatly appreciated.
There is actually a lot wrong in your code, so much so that it is easier to post a working version than point out all the individual mistakes:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define NS_PER_US 1000
typedef struct{
int temp;
int newtemp;
int neighbors[20];
} S;
__global__
void add(int n, S * s)
{
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for(int i = index; i < n; i+=stride){
int newTemp = 0;
for(int j = 0; j < 20; j++){
newTemp += s[s[i].neighbors[j]].temp;
}
newTemp /= 3;
s[i].newtemp = newTemp;
}
}
int main(int argc, char *argv[]){
int n = 1<<10;
S* grid;
cudaMallocManaged((void **)&grid, n * sizeof(S));
for(int i = 0; i < n; i++){
S tmp1;
tmp1.temp = rand()%n;
for(int j = 0; j<20; j++){
tmp1.neighbors[j] = rand()%n;
}
grid[i] = tmp1;
}
struct timespec start, end;
double gettime_diff, time_diff;
clock_t t, starttime, endtime;
clock_gettime(CLOCK_REALTIME, &start);
t = clock();
time(&starttime);
int num_th_per_blk= 32;
int num_blocks= (n / num_th_per_blk) + (n % num_th_per_blk > 0) ? 1 : 0;
dim3 dimGrid(num_blocks);
dim3 dimBlock(num_th_per_blk);
add<<< dimGrid, dimBlock >>>(n,grid);
cudaDeviceSynchronize();
for(int i = 0; i < n; i++){
grid[i].temp = grid[i].newtemp;
if(i%10==1)printf("%d %d\n",i,grid[i].temp);
}
clock_gettime(CLOCK_REALTIME, &end);
t = clock() - t;
time(&endtime);
gettime_diff = (double) ((end.tv_sec - start.tv_sec)*CLOCKS_PER_SEC) + (double)((end.tv_nsec - start.tv_nsec)/NS_PER_US);
time_diff = difftime(endtime, starttime);
printf("\ttime (clock_gettime) %f\n", gettime_diff);
printf("\ttime (clock) %f\n", ((float)t)/CLOCKS_PER_SEC);
printf("\ttime (time) %f\n", time_diff);
return 0;
}
The most egregious error is how you handle grid in the host code. Doing this:
S grid[n];
// code initializing grid
cudaMallocManaged((void **)&grid, n * sizeof(S));
is both illegal (you shouldn't try and set grid to another pointer value, it isn't a pointer), and nonsensical. cudaMallocManaged allocates new memory, so all you are doing is initializing grid, then throwing away all the carefully initialized memory and replacing it with uninitialized memory which you pass to the kernel. The kernel then operates on random data. Note also that the grid stride loop within the kernel is also incorrect, and both the original code and CUDA version potentially suffer from integer overflow due to how you initialize the temp members of the structure in both versions using rand().
Related
Let's say I have two arrays
A = {1, 2, 3}
and
B = {10,20,30,40,50}
I want to generate a new array which would have a size of
sizeof(A) * sizeof(B)
I want to replicate B sizeof(A) times, and on each repetition i, the resultant array should have A[i] added to it. So the result would be something like
{11,21,31,41,51,12,22,32,42,52,13,23,33,43,53}
This task can be interpreted as a 2-dimensional problem where the output array can be treated as a matrix of dimensions sizeof(A) times sizeof(B). In this way, we can use 2D CUDA indexing to achieve the desired functionality. A sample CUDA C++ code of this 2D implementation is shown below:
#include <iostream>
#include <cuda_runtime.h>
#include <cassert>
using namespace std;
__global__ void kernel_replicate(int* a, int* b, int* c, int alen, int blen, int clen)
{
const int ai = blockIdx.x * blockDim.x + threadIdx.x;
const int bi = blockIdx.y * blockDim.y + threadIdx.y;
if(ai<alen && bi<blen)
{
const int ci = ai * blen + bi;
c[ci] = a[ai] + b[bi];
}
}
void replicate_device(int* a, int* b, int* c, int alen, int blen, int clen)
{
dim3 block(16,16);
dim3 grid;
grid.x = (alen + block.x - 1) / block.x;
grid.y = (blen + block.y - 1) / block.y;
kernel_replicate<<<grid, block>>>(a,b,c,alen,blen,clen);
assert(cudaSuccess == cudaDeviceSynchronize());
}
void replicate(int* a, int* b, int* c, int alen, int blen, int clen)
{
int *ad, *bd, *cd;
size_t abytes = alen * sizeof(int);
size_t bbytes = blen * sizeof(int);
size_t cbytes = clen * sizeof(int);
cudaMalloc(&ad, abytes);
cudaMalloc(&bd, bbytes);
cudaMalloc(&cd, cbytes);
cudaMemcpy(ad,a, abytes, cudaMemcpyHostToDevice);
cudaMemcpy(bd,b, bbytes, cudaMemcpyHostToDevice);
replicate_device(ad,bd,cd, alen,blen,clen);
cudaMemcpy(c,cd, cbytes, cudaMemcpyDeviceToHost);
cudaFree(ad);
cudaFree(bd);
cudaFree(cd);
}
int main()
{
const int alen = 3;
const int blen = 5;
const int clen = alen * blen;
int A[alen] = {1,2,3};
int B[blen] = {10,20,30,40,50};
int C[clen] = {0};
replicate(A,B,C,alen, blen, clen);
for(int i=0; i<alen; i++)
{
cout<<A[i]<<" ";
}
cout<<endl;
for(int i=0; i<blen; i++)
{
cout<<B[i]<<" ";
}
cout<<endl;
for(int i=0; i<clen; i++)
{
cout<<C[i]<<" ";
}
cout<<endl;
return 0;
}
I want to write a prefix scan for large arrays using the instruction in GPUgem, It's a homework for my parallel class. I did follow all the steps in the book but still my code's not working. I got it to work for array size 4096 but it's not working for larger arrays. Here is my code :
#include <stdio.h>
#include <sys/time.h>
#define THREADS 1024
typedef int mytype;
__global__ void phaseI(mytype *g_odata, mytype *g_idata, int n, mytype *aux)
{
__shared__ mytype temp[THREADS];
const int tid1 = threadIdx.x;
int offset = 1;
temp[2*tid1] = g_idata[2*tid1]; // load input into shared memory
temp[2*tid1+1] = g_idata[2*tid1+1];
for (int d = THREADS>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
__syncthreads();
if (tid1 == 0) {
aux[blockIdx.x] = temp[THREADS - 1];
temp[THREADS - 1] = 0;
}
for (int d = 1; d < THREADS; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
mytype t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid]; // write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
__global__ void phaseII(mytype *g_odata, mytype *aux, int n)
{
const int tid1 = threadIdx.x;
const int B = (n / THREADS);
int offset = 1;
for (int d = B>>1; d > 0; d >>= 1) // build sum in place up the tree
{
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
__syncthreads();
if (tid1 == 0 && blockIdx.x == 0) {
aux[B - 1] = 0;
}
for (int d = 1; d < B; d *= 2) // traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (tid1 < d)
{
int ai = offset*(2*tid1+1)-1;
int bi = offset*(2*tid1+2)-1;
mytype t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] += aux[blockIdx.x];
g_odata[2*thid+1] += aux[blockIdx.x];
}
int main(int argc, char *argv[])
{
if (argc != 2) {
printf("usage: %s n\n", argv[0]);
return -1;
}
const int n = atoi(argv[1]);
mytype *h_i, *d_i, *h_o, *d_o, *d_temp;
const int size = n * sizeof(mytype);
h_i = (mytype *)malloc(size);
h_o = (mytype *)malloc(size);
if ((h_i == NULL) || (h_o == NULL)) {
printf("malloc failed\n");
return -1;
}
for (int i = 0; i < n; i++) {
h_i[i] = i;
h_o[i] = 0;
}
cudaMalloc(&d_i, size);
cudaMalloc(&d_temp, (n / THREADS) );
cudaMalloc(&d_o, size);
cudaMemset(d_o, 0, size);
cudaMemset(d_temp, 0, (n / THREADS));
cudaMemcpy(d_i, h_i, size, cudaMemcpyHostToDevice);
int blocks = n / THREADS;
phaseI<<<blocks, THREADS / 2 >>>(d_o, d_i, n, d_temp);
phaseII<<<blocks, THREADS / 2>>>(d_o, d_temp, n);
cudaThreadSynchronize();
cudaMemcpy(h_o, d_o, size, cudaMemcpyDeviceToHost);
printf("\n");
for (int i = 0; i < n ; i++) {
printf(" %d", h_o[i]);
}
printf("\n\n");
return 0;
}
Does anyone have any idea what I'm doing wrong?
One possible error I see in your code is here:
aux[thid] = temp[THREADS];
If your temp array is temp[1024], as you say, and each block has 1024 threads, as you say, then if THREADS is 1024, temp[THREADS] will access your shared memory array out-of-bounds (one past the end.) An array of 1024 elements only has valid indices from 0 to 1023.
Beyond that, it seems like you're asking how to take the last element out of a shared memory array (temp) and place it in a position in a (presumably global) aux array, which has one element for each block.
Here's a fully worked example:
$ cat t831.cu
#include <stdio.h>
#define THREADS 1024
#define BLOCKS 20
__global__ void kernel(int *aux){
__shared__ int temp[THREADS];
temp[threadIdx.x] = threadIdx.x + blockIdx.x;
__syncthreads();
if (threadIdx.x == 0)
aux[blockIdx.x] = temp[THREADS-1];
}
int main(){
int *h_data, *d_data;
const int dsize = BLOCKS*sizeof(int);
h_data=(int *)malloc(dsize);
cudaMalloc(&d_data, dsize);
memset(h_data, 0, dsize);
cudaMemset(d_data, 0, dsize);
kernel<<<BLOCKS, THREADS>>>(d_data);
cudaMemcpy(h_data, d_data, dsize, cudaMemcpyDeviceToHost);
for (int i = 0; i < BLOCKS; i++) printf("%d, ", h_data[i]);
printf("\n");
return 0;
}
$ nvcc -o t831 t831.cu
$ cuda-memcheck ./t831
========= CUDA-MEMCHECK
1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042,
========= ERROR SUMMARY: 0 errors
$
I want to compute 'out = alpha * px + beta * py','px' and 'py' is array.*
I have a simple kernel:
__global__
void saxpyGPU2( float *out, const float *px, const float *py, size_t N, float alpha,float beta )
{
size_t i = blockDim.x*blockIdx.x + threadIdx.x;
while (i < N)
{
out[i] = alpha * px[i] + beta * py[i];
i += blockDim.x*gridDim.x;
}
}
It works, so I want to loop unroll.
The code in cuda-handbook is:
template<const int n>
__device__
void saxpy_unrolled(float *out, const float *px, const float *py, size_t N, float alpha,float beta)
{
float x[n], y[n];
size_t i;
for ( i = n*blockIdx.x*blockDim.x+threadIdx.x; i < N-n*blockDim.x*gridDim.x; i += n*blockDim.x*gridDim.x ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
x[j] = px[index];
y[j] = py[index];
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
out[index] = alpha*x[j]+beta* y[j];
}
}
// to avoid the (index<N) conditional in the inner loop,
// we left off some work at the end
for ( int j = 0; j < n; j++ ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) {
x[j] = px[index];
y[j] = py[index];
}
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) out[index] = alpha*x[j]+beta* y[j];
}
}
}
__global__
void saxpyGPU( float *out, const float *px, const float *py, size_t N, float alpha,float beta )
{
saxpy_unrolled<4>( out, px, py, N, alpha ,beta);
}
I don't understand in the second branch when i > N-n*blockDim.x*gridDim.x. why use a outer loop
for ( int j = 0; j < n; j++ ) {
for ( int j = 0; j < n; j++ )....}
And I test those two kernel , first one is OK, but second one I copy from the book is incorrect.
I initial two array while(i<1024) a[i] = i; b[i] = 10*i;i++, and I want to compute the c = alpha*a + beta*b use the two kernels above, but the result in the loop unrolled kernel is 4.3e8 for all element in c.
This my test code:
int main(){
int arraySize = 1024;
float* a =new float[arraySize];
float* b =new float[arraySize];
float* c =new float[arraySize];
for (int i =0;i<arraySize;i++)
{
a[i] = 1.0* i;
b[i] = 10.0*i;
c[i] = 0.0;
}
float* d_a;
float* d_b;
float* d_c;
cudaMalloc((void**)&d_a,sizeof(float)*arraySize);
cudaMemcpy(d_a,a,sizeof(float)*arraySize,cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_b,sizeof(float)*arraySize);
cudaMemcpy(d_b,b,sizeof(float)*arraySize,cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_c,sizeof(float)*arraySize);
for (int i=0;i<arraySize;i++)
{
c[i] = a[i] + b[i];
}
dim3 block_size(256,1,1);
dim3 grid_size((arraySize -1)/block_size.x+1,1,1);
float alpha = 1.0;
float beta = 1.0;
bool flag = true;
if(flag)
{
saxpyGPU<<<grid_size,block_size>>>(d_c,d_a,d_b,arraySize,alpha,beta);
float* temp = new float[arraySize];
cudaMemcpy(temp,d_c,arraySize*sizeof(float),cudaMemcpyDeviceToHost);
for (int i = 0;i<arraySize;i++)
{
cout<<(temp[i] - c[i])<<",";
}
}
else
{
saxpyGPU2<<<grid_size,block_size>>>(d_c,d_a,d_b,arraySize,alpha,beta);
cudaMemcpy(temp,d_c,arraySize*sizeof(float),cudaMemcpyDeviceToHost);
for (int i = 0;i<arraySize;i++)
{
cout<<(temp[i] - c[i])<<",";
}
Those two kernel show different result
The kernel code you posted is perfectly correct and produces the expected results. This can be demonstrated using the following code:
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
#include <vector>
#include <algorithm>
#include <cmath>
template<const int n>
__device__
void saxpy_unrolled(float *out, const float *px, const float *py,
size_t N, float alpha,float beta) {
float x[n], y[n];
size_t i;
for ( i = n*blockIdx.x*blockDim.x+threadIdx.x;
i < N-n*blockDim.x*gridDim.x;
i += n*blockDim.x*gridDim.x ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
x[j] = px[index];
y[j] = py[index];
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
out[index] = alpha*x[j]+beta* y[j];
}
}
for ( int j = 0; j < n; j++ ) {
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) {
x[j] = px[index];
y[j] = py[index];
}
}
for ( int j = 0; j < n; j++ ) {
size_t index = i+j*blockDim.x;
if ( index<N ) {
out[index] = alpha*x[j] + beta*y[j];
}
}
}
}
__global__
void saxpyGPU( float *out, const float *px, const float *py,
size_t N, float alpha,float beta ) {
saxpy_unrolled<4>( out, px, py, N, alpha ,beta);
}
struct prg {
float a, b;
__host__ __device__
prg(float _a=0.f, float _b=1.f) : a(_a), b(_b) {};
__host__ __device__
float operator()(const unsigned int n) const {
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> dist(a, b);
rng.discard(n);
return dist(rng);
}
};
int main(void) {
const int N = 100000;
const float alpha = 0.12345f, beta = 0.9876f;
prg gen(1.f, 2.f);
thrust::device_vector<float> x(N), y(N), z(N);
thrust::counting_iterator<unsigned int> iseqx(0);
thrust::counting_iterator<unsigned int> iseqy(N);
thrust::transform(iseqx, iseqx + N, x.begin(), gen);
thrust::transform(iseqy, iseqy + N, y.begin(), gen);
float *xp = thrust::raw_pointer_cast(&x[0]);
float *yp = thrust::raw_pointer_cast(&y[0]);
float *zp = thrust::raw_pointer_cast(&z[0]);
dim3 blockdim(128);
dim3 griddim(16);
saxpyGPU<<<griddim, blockdim>>>(zp, xp, yp, N, alpha, beta);
cudaDeviceSynchronize();
std::vector<float> xh(N), yh(N), zh(N);
thrust::copy(x.begin(), x.end(), xh.begin());
thrust::copy(y.begin(), y.end(), yh.begin());
thrust::copy(z.begin(), z.end(), zh.begin());
float maxabserr = -1.f, maxrelerr = -1.f;
for(int i=0; i<N; i++) {
float saxpyval = alpha * xh[i] + beta * yh[i];
float abserr = fabs(zh[i]-saxpyval);
float relerr = abserr / fmaxf(fabs(zh[i]), fabs(saxpyval));
maxabserr = fmaxf(abserr, maxabserr);
maxrelerr = fmaxf(relerr, maxrelerr);
}
std::cout.precision(10);
std::cout << "Maximum absolute error = " << maxabserr << std::endl;
std::cout << "Maximum relative error = " << maxrelerr << std::endl;
return 0;
}
which gives me the following:
$ nvcc -arch=sm_30 -o unrolled_saxpy unrolled_saxpy.cu
$ ./unrolled_saxpy
Maximum absolute error = 2.384185791e-07
Maximum relative error = 1.1920676e-07
If you (still) do not understand why the kernel is written as it is, follow what I showed you in your previous question and manually unroll the saxpy function. Start with n=1 and confirm it is functionally the same as the unrolled equivalent, and then try n=2, n=4, etc. to see what the action of loop unrolling is.
I am new to CUDA, for the vector to find max value and its index I use CUDA
here its my code:
#include < cuda.h >
#include < stdio.h >
#include < time.h >
#include <iostream>
using namespace std;
#define tbp 256
#define nblocks 1
__global__ void kernel_max(int *a, int *d, int *index,int *idx)
{
__shared__ int sdata[tbp]; //"static" shared memory
int tid = threadIdx.x;
int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = a[i];
index[tid] = i;
__syncthreads();
for(int s=tbp/2 ; s >= 1 ; s=s/2)
{
if(tid < s)
{
if(sdata[tid] < sdata[tid + s])
{
sdata[tid] = sdata[tid + s];
index[tid] = index[tid+s];
__syncthreads();
}
__syncthreads();
}
__syncthreads();
}
__syncthreads();
if(tid == 0 )
{
d[blockIdx.x] = sdata[0];
idx[blockIdx.x] = index[0];
}
__syncthreads();
}
int main()
{
int i;
const int N=tbp*nblocks;
srand(time(NULL));
int *a;
a = (int*)malloc(N * sizeof(int));
int *d;
d = (int*)malloc(nblocks * sizeof(int));
int *index;
index = (int*)malloc(N * sizeof(int));
int *idx;
idx = (int*)malloc(nblocks * sizeof(int));
int *dev_a, *dev_d, *dev_index,*dev_idx;
cudaMalloc((void **) &dev_a, N*sizeof(int));
cudaMalloc((void **) &dev_d, nblocks*sizeof(int));
cudaMalloc((void **) &dev_index, N*sizeof(int));
cudaMalloc((void **) &dev_idx, nblocks*sizeof(int));
int mmm=0;
int ddd=0;
for( i = 0 ; i < N ; i++)
{
a[i] = rand()% 100 + 5;
index[i]=i;
//printf("%d\n",a[i]);
if(mmm<a[i])
{
mmm=a[i];
ddd=i;
}
}
printf("");
printf("");
printf("");
printf("");
cudaMemcpy(dev_a , a, N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(dev_index , index, N*sizeof(int),cudaMemcpyHostToDevice);
kernel_max <<< nblocks,tbp >>>(dev_a,dev_d,dev_index,dev_idx);
cudaMemcpy(d, dev_d, nblocks*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(index, dev_index, N*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(idx, dev_idx, nblocks*sizeof(int),cudaMemcpyDeviceToHost);
printf("cpu max= %d, gpu_max = %d ,cpu index: %d, gpu index: %d",mmm,d[0],ddd,idx[0]);
printf("\n");
if(ddd!=idx[0])
{
cout<<"index mismatch!damn!!"<<endl;
}
else
{
cout<<"congratulations!!"<<endl;
}
/*
for(i=0;i<N;i++)
cout<<*(index+i)<<endl;
*/
cudaFree(dev_a);
cudaFree(dev_d);
cudaFree(dev_index);
cudaFree(dev_idx);
free(a);
free(d);
free(index);
free(idx);
return 0;
}
The problem is that for the tbp < 128 it can get correct result both in value and index
when increase to 256,512,1024, the result will sometimes go wrong.
Can anyone given a explanation for this situation?Thanks.
Use another loop to deal with the index to avoid same max value with different index problem in this computation
int temp=0;
for(i=0;i<tbp;i++)
{
if(d[blockIdx.x]==a[i] && temp==0)
{temp = i;}
}
idx[0] = temp;
you need set int temp= -1 instead 0 to avoid the case of maximum value lcoated at 0.
using namespace std;
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
const int threadsPerBlock = 256;
const int N = 40000;
void generateArray(double *data, int count) {
for (int i = 0; i < count; i++)
data[i] = rand() / ((rand() + rand()) / 2.0 + 1);
}
double maxCPU(double *arr, int count) {
int max = arr[0];
for (int i = 0; i < count; i++)
if (arr[i] > max)
max = arr[i];
return max;
}
__global__ void MaxGPU(double *a, int count, double *result){
__shared__ double cache[threadsPerBlock];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int cacheIndex = threadIdx.x;
int temp = a[tid];
tid+= blockDim.x * gridDim.x;
while(tid < count){
if(a[tid] > temp)
temp = a[tid];
tid+= blockDim.x * gridDim.x;
}
cache[cacheIndex] = temp;
__syncthreads();
int i = blockDim.x/2;
while(i!=0){
if(cacheIndex < i)
if(cache[cacheIndex + i] > cache[cacheIndex])
cache[cacheIndex] = cache[cacheIndex + i];
__syncthreads();
i/=2;
}
if(cacheIndex == 0)
result[blockIdx.x] = cache[0];
}
int main(void) {
double *arr = new double[N], resultGPU;
generateArray(arr, N);
double *devA, *dev_partial_result;
double resultCPU = maxCPU(arr, N);
cudaMalloc((void**)&devA, N * sizeof(double));
cudaMalloc((void**)&dev_partial_result, 512 * sizeof(double));
cudaMemcpy(devA, arr, N * sizeof(double), cudaMemcpyHostToDevice);
MaxGPU<<<1, 256>>>(devA, N, dev_partial_result);
cudaMemcpy(&resultGPU, dev_partial_result,sizeof(double), cudaMemcpyDeviceToHost);
cout << "Max CPU: " << resultCPU << endl;
cout << "Max GPU: " << resultGPU << endl;
cudaFree(devA);
cudaFree(dev_partial_result);
delete [] arr;
return 0;
}
I wrote above code. I don't why but it only works with one block. It does not work with say, 256 or 512 blocks. Why? What's wrong?
Try change
double resultGPU; to
double* resultGPU = new double[blocks_count];
and
cudaMemcpy(&resultGPU, dev_partial_result,sizeof(double), cudaMemcpyDeviceToHost); to
cudaMemcpy(resultGPU, dev_partial_result,blocks_count*sizeof(double), cudaMemcpyDeviceToHost);