CUDA Kernel not returning values
I am working with a server that has multiple GPUs. I am using OpenMP to launch a kernel across the GPUs at once. The problem I am seeing is that the kernel does not seem to update the values in the thrust device vectors it is passed. The code below should set every element of the device vectors to 1, but instead all elements remain 0. The code compiles and runs, and reports that the kernel executed successfully.
I do not understand why this code is not behaving as expected.
#include <iostream>
#include <cmath>
#include <omp.h>
#include <vector>
#include <thrust/host_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/device_vector.h>
using namespace::std;
const long N_R1 = 100;
const long N_R2 = 100;
__global__ void kernel(long* ND, long* NR1,
float* a, float* b, float* c, float* d)
{
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + blockIdx.y * gridDim.x * gridDim.y * blockIdx.z )
* ( threadIdx.z * ( blockDim.x*blockDim.y ) ) + threadIdx.y
* blockDim.x + threadIdx.x;
//Values correspond to 2D array limits
long idxR1 = idx / ND[0];
long idxR2 = idx % ND[0];
if(idxR1 >= NR1[0] || idxR2 >= ND[0])
{
return;
}
a[idx] =1.0;
b[idx] =1.0;
c[idx] =1.0;
d[idx] =1.0;
}
void kernel_wrapper()
{
// GPU Count
int num_gpus = 0;
cudaGetDeviceCount(&num_gpus);
omp_set_num_threads(num_gpus);
//Calculate Dimensioning
long D_total = N_R1 * N_R2;
//Region 1 coordinates are loaded on to each GPU
//Region 2 coordinates are divided up onto GPUs
long R2_stride = ceil(float(N_R2)/float(num_gpus));
//Distance arrays need to be split into whole sections of region 1.
//(Distances size = N_R1 * N_R2) subset of distance size needs to be N_R1
long D_stride = R2_stride * N_R1;
#pragma omp parallel
{
// Get CPU thread number
long cpu_thread_id = omp_get_thread_num();
cudaSetDevice(cpu_thread_id);
// Set up Local Arrays for distance and potential
// Step 1: Calculate rough Array Limits
// If array spaces divide evenly between threads then beginnings and endings can be calculated below
long R2_begin = cpu_thread_id * R2_stride;
long D_begin = cpu_thread_id * D_stride;
long R2_end = R2_begin + R2_stride;
long D_end = D_begin + D_stride;
// Step 2: Check Ends are not out of bounds
// The last thread in the calculation is likely to have array sizings that are out of bounds
// if this is the case then the ends need to be clipped:
if(R2_end >= N_R2)
{
R2_end = N_R2;
}
if(D_end >= D_total)
{
D_end = D_total;
}
// Local array sizes are (end - begin)
long l_R2 = R2_end - R2_begin;
long l_D = D_end - D_begin;
float zero = 0.0;
// Create Region 2 potential components
thrust::host_vector<float > a(l_D,zero);
thrust::host_vector<float > b(l_D,zero);
thrust::host_vector<float > c(l_D,zero);
thrust::host_vector<float > d(l_D,zero);
long* p_NR1;
long nr1 = N_R1;
cudaMalloc( (void**)&p_NR1, sizeof(long) );
cudaMemcpy( p_NR1, &nr1, sizeof(long), cudaMemcpyHostToDevice);
long* p_NR2;
cudaMalloc( (void**)&p_NR2, sizeof(long) );
cudaMemcpy( p_NR2, &l_D, sizeof(long), cudaMemcpyHostToDevice);
//Generate Device Side Data for region 2 potential components
thrust::device_vector< float > d_a = a;
thrust::device_vector< float > d_b = b;
thrust::device_vector< float > d_c = c;
thrust::device_vector< float > d_d = d;
// Generate pointers to Device Side Data for region 2 potential components
float* p_a = thrust::raw_pointer_cast(d_a.data());
float* p_b = thrust::raw_pointer_cast(d_b.data());
float* p_c = thrust::raw_pointer_cast(d_c.data());
float* p_d = thrust::raw_pointer_cast(d_d.data());
dim3 blocks = N_R1;
dim3 threads = l_R2;
kernel<<<blocks,threads>>>(p_NR2, p_NR1,
p_a, p_b, p_c, p_d);
cudaDeviceSynchronize();
if(cudaGetLastError() == cudaSuccess)
{
cout << "Kernel Successful!" << cudaGetErrorString(cudaGetLastError()) << endl;
cin.ignore(1);
}
a = d_a;
b = d_b;
c = d_c;
d = d_d;
for(long j = 0; j != a.size(); j++)
{
cout << "a[" << j << "] = " << a[j] << endl;
}
for(long j = 0; j != b.size(); j++)
{
cout << "b[" << j << "] = " << b[j] << endl;
}
for(long j = 0; j != c.size(); j++)
{
cout << "c[" << j << "] = " << c[j] << endl;
}
for(long j = 0; j != c.size(); j++)
{
cout << "c[" << j << "] = " << c[j] << endl;
}
}
cin.ignore(1);
}
int main()
{
kernel_wrapper();
return 0;
}
Any help would be greatly appreciated.
Some of the output values are getting set to 1, some are not. The problem is due to this statement:
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + blockIdx.y * gridDim.x * gridDim.y * blockIdx.z )
* ( threadIdx.z * ( blockDim.x*blockDim.y ) ) + threadIdx.y
* blockDim.x + threadIdx.x;
That isn't what I would call a proper generic conversion of a 3D grid/block to a globally unique 1D index, which I assume is your intent. Let's pick one example to prove that it is broken. Suppose you are launching a 1D grid of 1D blocks (which is what you are doing). Then the (block,thread)Idx.y and .z variables are all zero; only blockIdx.x and threadIdx.x can take on non-zero values in that launch configuration.
In that case your expression reduces to:
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + 0 * gridDim.x * gridDim.y * 0 )
* ( 0 * ( blockDim.x*blockDim.y ) ) + 0
* blockDim.x + threadIdx.x;
i.e. it reduces to:
long idx = threadIdx.x;
So the first (block-size) elements of your arrays (a,b,c,d) are getting set properly, the rest are not. Since threadIdx.x is not unique from one block to the next, this is not a proper globally-unique thread ID, and therefore each block is writing the same output locations, rather than each taking care of a separate part of the array.
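For the 1D grid of 1D blocks you are actually launching, a minimal sketch of a correct index (not your original code) would be:
long idx = blockIdx.x * blockDim.x + threadIdx.x;  // unique across all blocks in a 1D launch
With that, each block covers its own contiguous chunk of the arrays.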
So what is a possible (correct) generic 3D-to-1D index conversion?
That is answered here (and probably other places). This answer actually only converts a 3D grid plus 1D block configuration to a globally-unique ID, but it is sufficient for demonstration purposes of what is wrong in this code.
When I replace your in-kernel calculation of idx with that code, your kernel populates all array entries with 1.0 according to my testing.
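For reference, the common 3D-grid, 1D-block form of that conversion looks like the sketch below; it follows the usual indexing formulas and may not be verbatim what the linked answer shows:
__device__ long getGlobalIdx_3D_1D()
{
    // number the blocks within the 3D grid, then offset by the thread within the block
    long blockId = blockIdx.x
                 + blockIdx.y * gridDim.x
                 + blockIdx.z * gridDim.x * gridDim.y;
    return blockId * blockDim.x + threadIdx.x;
}
For a fully generic 3D grid of 3D blocks, the multiplier becomes blockDim.x*blockDim.y*blockDim.z and the in-block offset becomes threadIdx.z*blockDim.y*blockDim.x + threadIdx.y*blockDim.x + threadIdx.x.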
Related
Search Minimum/Maximum from n Arrays parallel in CUDA (Reduction Problem)
Is there a performant way in CUDA to find the maximum/minimum across multiple arrays (which live in different structures) in parallel? The structures are laid out in Structure of Arrays format. A simple idea would be to assign each array to a thread block, which calculates the maximum/minimum using the parallel reduction approach. The problem here is the size of the shared memory, which is why I regard this approach as problematic. Another approach is to calculate every minimum/maximum separately for each array; I think this is too slow.

struct Cube {
    int* x;
    int* y;
    int* z;
    int size;
};

int main() {
    Cube* c1 = new Cube(); //c1 includes 100 Cubes (because of SOA)
    c1->x = new int[100];
    c1->y = new int[100];
    c1->z = new int[100];

    Cube* c2 = new Cube();
    c2->x = new int[1047];
    c2->y = new int[1047];
    c2->z = new int[1047];

    Cube* c3 = new Cube();
    c3->x = new int[5000];
    c3->y = new int[5000];
    c3->z = new int[5000];

    //My goal now is to find the smallest/largest x dimension of all cubes in c1, c2, ..., and cn,
    //with one Kernel launch.
    //So the smallest/largest x in c1, the smallest/largest x in c2 etc..
}

Does anyone know an efficient approach? Thanks.
A simple idea would be to assign each array to a thread block, which is used to calculate the maximum/minimum using the parallel reduction approach. The problem here is the size of the shared memory, which is why I regard this approach as critical. There is no problem with shared memory size. You may wish to review Mark Harris' canonical parallel reduction tutorial and look at the later methods to understand how we can use a loop to populate shared memory, reducing values into shared memory as we go. Once the input loop is completed, then we begin the block-sweep phase of the reduction. This doesn't impose any special requirements on the shared memory per block. Here's a worked example demonstrating both a thrust::reduce_by_key method (single call) and a CUDA block-segmented method (single kernel call): $ cat t1535.cu #include <iostream> #include <thrust/reduce.h> #include <thrust/copy.h> #include <thrust/device_vector.h> #include <thrust/host_vector.h> #include <thrust/iterator/constant_iterator.h> #include <thrust/iterator/discard_iterator.h> #include <thrust/iterator/zip_iterator.h> #include <thrust/functional.h> #include <cstdlib> #define IMAX(x,y) (x>y)?x:y #define IMIN(x,y) (x<y)?x:y typedef int dtype; const int ncubes = 3; struct Cube { dtype* x; dtype* y; dtype* z; int size; }; struct my_f { template <typename T1, typename T2> __host__ __device__ thrust::tuple<dtype,dtype> operator()(T1 t1, T2 t2){ thrust::tuple<dtype,dtype> r; thrust::get<0>(r) = IMAX(thrust::get<0>(t1),thrust::get<0>(t2)); thrust::get<1>(r) = IMIN(thrust::get<1>(t1),thrust::get<1>(t2)); return r; } }; const int MIN = -1; const int MAX = 0x7FFFFFFF; const int BS = 512; template <typename T> __global__ void block_segmented_minmax_reduce(const T * __restrict__ in, T * __restrict__ max, T * __restrict__ min, const size_t * __restrict__ slen){ __shared__ T smax[BS]; __shared__ T smin[BS]; size_t my_seg_start = slen[blockIdx.x]; size_t my_seg_size = slen[blockIdx.x+1] - my_seg_start; smax[threadIdx.x] = MIN; smin[threadIdx.x] = MAX; for (size_t idx = my_seg_start+threadIdx.x; idx < my_seg_size; idx += BS){ T my_val = in[idx]; smax[threadIdx.x] = IMAX(my_val, smax[threadIdx.x]); smin[threadIdx.x] = IMIN(my_val, smin[threadIdx.x]);} for (int s = BS>>1; s > 0; s>>=1){ __syncthreads(); if (threadIdx.x < s){ smax[threadIdx.x] = IMAX(smax[threadIdx.x], smax[threadIdx.x+s]); smin[threadIdx.x] = IMIN(smin[threadIdx.x], smin[threadIdx.x+s]);} } if (!threadIdx.x){ max[blockIdx.x] = smax[0]; min[blockIdx.x] = smin[0];} } int main() { // data setup Cube *c = new Cube[ncubes]; thrust::host_vector<size_t> csize(ncubes+1); csize[0] = 100; csize[1] = 1047; csize[2] = 5000; csize[3] = 0; c[0].x = new dtype[csize[0]]; c[1].x = new dtype[csize[1]]; c[2].x = new dtype[csize[2]]; size_t ctot = 0; for (int i = 0; i < ncubes; i++) ctot+=csize[i]; // method 1: thrust // concatenate thrust::host_vector<dtype> h_d(ctot); size_t start = 0; for (int i = 0; i < ncubes; i++) {thrust::copy_n(c[i].x, csize[i], h_d.begin()+start); start += csize[i];} for (size_t i = 0; i < ctot; i++) h_d[i] = rand(); thrust::device_vector<dtype> d_d = h_d; // build flag vector thrust::device_vector<int> d_f(d_d.size()); thrust::host_vector<size_t> coff(csize.size()); thrust::exclusive_scan(csize.begin(), csize.end(), coff.begin()); thrust::device_vector<size_t> d_coff = coff; thrust::scatter(thrust::constant_iterator<int>(1), thrust::constant_iterator<int>(1)+ncubes, d_coff.begin(), d_f.begin()); thrust::inclusive_scan(d_f.begin(), d_f.end(), d_f.begin()); // min/max 
reduction thrust::device_vector<dtype> d_max(ncubes); thrust::device_vector<dtype> d_min(ncubes); thrust::reduce_by_key(d_f.begin(), d_f.end(), thrust::make_zip_iterator(thrust::make_tuple(d_d.begin(), d_d.begin())), thrust::make_discard_iterator(), thrust::make_zip_iterator(thrust::make_tuple(d_max.begin(), d_min.begin())), thrust::equal_to<int>(), my_f()); thrust::host_vector<dtype> h_max = d_max; thrust::host_vector<dtype> h_min = d_min; std::cout << "Thrust Maxima: " <<std::endl; thrust::copy_n(h_max.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ",")); std::cout << std::endl << "Thrust Minima: " << std::endl; thrust::copy_n(h_min.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ",")); std::cout << std::endl; // method 2: CUDA kernel (block reduce) block_segmented_minmax_reduce<<<ncubes, BS>>>(thrust::raw_pointer_cast(d_d.data()), thrust::raw_pointer_cast(d_max.data()), thrust::raw_pointer_cast(d_min.data()), thrust::raw_pointer_cast(d_coff.data())); thrust::copy_n(d_max.begin(), ncubes, h_max.begin()); thrust::copy_n(d_min.begin(), ncubes, h_min.begin()); std::cout << "CUDA Maxima: " <<std::endl; thrust::copy_n(h_max.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ",")); std::cout << std::endl << "CUDA Minima: " << std::endl; thrust::copy_n(h_min.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ",")); std::cout << std::endl; return 0; } $ nvcc -o t1535 t1535.cu $ ./t1535 Thrust Maxima: 2145174067,2147469841,2146753918, Thrust Minima: 35005211,2416949,100669, CUDA Maxima: 2145174067,2147469841,2146753918, CUDA Minima: 35005211,2416949,100669, $ For a small number of Cube objects, the thrust method is likely to be faster. It will tend to make better use of medium to large GPUs than the block method will. For a large number of Cube objects, the block method should also be fairly efficient.
In cuda, is it possible to write dense array from sparse array with expected sequence?
There is an array1 that holds 0 or 1 for each thread in a thread block:
bool array1[]: [1, 1, 0, 0, 1, 1]
Each thread in the block accesses array1 using threadIdx.x. I need to build a shared, dense array2 in which each value is the ID of a thread whose array1 entry is 1:
__shared__ bool array2[] (thread ID) : [0, 1, 4, 5]
It seems that I at least need an atomicAdd() operation to index array2, and even with atomicAdd() I think it is hard to produce array2 in the ordered sequence above (0, 1, 4, 5). Is it possible to build array2 from array1 in CUDA (per thread block)?
You can use coalesced groups. Suppose the Boolean read by each thread is threadIsIn:

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

uint32_t tid = threadIdx.x;
const uint32_t warpLength = 32;
uint32_t warpIdx = tid / warpLength;
if (threadIsIn) {
    auto active = cg::coalesced_threads();
    uint32_t idx = active.thread_rank() + warpIdx * warpLength;
    array2[idx] = tid;
}

Edit: solution with multiple warps in a block. The first warp of the block prepares the shared array for the rest of the warps in the block, so the other warps have to wait for the first warp to finish.

thread_block block = this_thread_block();
uint32_t tid = threadIdx.x;
const uint32_t warpLength = 32;
uint32_t warpIdx = tid / warpLength;
uint32_t startIdx = 0;
uint32_t tidToWrite = tid;
uint32_t maxItr = blockSize / warpLength;
uint32_t itr = 0;
while (warpIdx == 0 && itr < maxItr) {
    auto warp = cg::coalesced_threads();
    auto warpMask = warp.ballot(threadIsIn); // the tid'th bit is set to 1 if threadIsIn is true for tid
    uint32_t trueThreadsSize = __popc(warpMask); // counts the number of bits that are set to 1
    if (threadIsIn) {
        auto active = cg::coalesced_threads();
        // active.size() has the same value as trueThreadsSize
        array2[startIdx + active.thread_rank()] = tidToWrite;
    }
    startIdx += trueThreadsSize;
    tidToWrite += warpLength;
    ++itr;
    arr1Idx += warpLength;
    threadIsIn = arr1[arr1Idx];
}
block.sync();
This is in a general category of problems called stream compaction. The canonical approach is to perform a prefix sum (scan operation) on a processed version of your data (converting the kept values to 1, the discarded values to 0), then use that prefix sum as the index to write to, in the output array. CUB provides a convenient block-level scan operation, so we don't have to write our own. Thereafter, the indexed copy is trivial: $ cat t1465.cu #include <cub/cub.cuh> #include <iostream> #include <cstdlib> const int nTPB = 1024; const int ds = nTPB; __global__ void BlockCompactKernel(bool *data, int *result, int *data_size) { // Specialize BlockScan for a 1D block of nTPB threads on type int typedef cub::BlockScan<int, nTPB> BlockScan; // Allocate shared memory for BlockScan __shared__ typename BlockScan::TempStorage temp_storage; // Obtain a segment of consecutive items that are blocked across threads int scan_data[1]; // load data bool tmp = data[threadIdx.x]; // process data scan_data[0] = (tmp)?1:0; // scan data // Collectively compute the block-wide exclusive prefix sum BlockScan(temp_storage).ExclusiveSum(scan_data, scan_data); // indexed copy if (tmp) result[scan_data[0]] = threadIdx.x; // optional: return result size if (threadIdx.x == nTPB-1) *data_size = scan_data[0] + ((tmp)?1:0); } int main(){ bool *d_data, *data = new bool[ds]; int data_size, *d_data_size, *d_result, *result = new int[ds]; cudaMalloc(&d_data_size, sizeof(d_data_size[0])); cudaMalloc(&d_result, ds*sizeof(d_result[0])); for (int i = 0; i < ds; i++) data[i] = (rand() > (RAND_MAX/2))?true:false; std::cout << "Original data:" << std::endl; for (int i=0; i < ds; i++) std::cout << (int)data[i] << ","; cudaMalloc(&d_data, ds*sizeof(d_data[0])); cudaMemcpy(d_data, data, ds*sizeof(d_data[0]), cudaMemcpyHostToDevice); BlockCompactKernel<<<1,nTPB>>>(d_data, d_result, d_data_size); cudaMemcpy(&data_size, d_data_size, sizeof(d_data_size[0]), cudaMemcpyDeviceToHost); cudaMemcpy(result, d_result, data_size*sizeof(d_result[0]), cudaMemcpyDeviceToHost); std::cout << std::endl << "Compacted data:" << std::endl; for (int i=0; i < data_size; i++) std::cout << result[i] << ","; std::cout << std::endl; } $ nvcc -o t1465 t1465.cu $ cuda-memcheck ./t1465 ========= CUDA-MEMCHECK Original data: 
1,0,1,1,1,0,0,1,0,1,0,1,0,1,1,1,1,1,0,1,0,0,0,1,0,0,0,0,1,0,1,1,1,0,1,1,0,1,0,1,1,1,0,1,0,0,1,1,0,1,1,0,0,1,1,0,0,0,0,0,0,1,1,1,0,1,0,1,1,1,1,0,0,1,1,1,0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,1,1,1,0,1,0,1,1,1,0,0,1,1,0,1,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,1,1,1,0,1,0,0,0,1,0,0,1,0,1,0,1,0,1,0,1,0,0,1,1,1,1,1,1,0,0,1,0,0,0,0,1,1,1,0,0,1,0,1,0,1,0,1,0,0,1,1,0,0,0,1,1,0,1,1,0,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,1,0,1,1,1,0,1,1,0,0,1,1,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,1,0,0,1,1,1,0,0,1,1,1,0,1,0,1,1,1,0,1,0,0,1,1,0,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,1,1,1,1,0,1,0,1,1,1,1,0,1,1,0,1,1,1,0,1,0,0,1,0,0,1,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,1,1,0,0,1,0,0,1,1,1,1,0,1,0,0,1,0,0,0,1,0,0,1,0,1,1,0,0,1,1,1,0,0,0,1,0,1,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,0,1,1,1,0,0,0,1,0,1,1,1,0,0,0,0,1,1,0,0,1,1,1,0,1,0,0,1,1,1,1,0,0,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,0,1,1,1,1,0,1,1,1,1,0,1,0,1,1,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,1,1,0,0,1,0,0,1,1,0,0,1,1,1,0,0,1,1,1,1,1,0,1,0,0,1,0,1,1,0,0,1,1,1,0,1,1,0,0,1,1,1,0,0,0,1,1,0,1,1,0,0,0,1,1,1,1,0,1,1,0,0,0,0,1,1,1,1,1,0,0,0,1,1,1,0,0,1,0,1,1,0,0,1,0,1,1,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,1,1,0,1,0,1,0,1,1,0,1,1,0,0,1,1,1,1,1,0,1,0,1,0,1,0,0,0,0,0,1,0,1,1,0,1,0,0,1,0,1,0,1,1,1,1,1,0,0,1,1,0,1,0,0,1,0,0,1,1,0,0,1,0,0,1,0,1,0,1,1,1,0,1,1,1,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,1,1,0,1,1,0,0,1,1,0,1,1,1,1,0,1,1,0,0,0,1,0,1,0,0,1,1,0,1,1,0,1,0,0,1,0,1,0,0,0,0,1,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,1,0,0,1,1,1,1,0,1,1,1,0,1,0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,1,1,1,0,1,0,0,1,0,0,0,0,0,1,1,0,1,1,1,0,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,0,1,0,0,0,0,0,1,0,0,1,1,0,1,1,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,0,1,1,1,0,0,1,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,1,1,1,0, Compacted data: 
0,2,3,4,7,9,11,13,14,15,16,17,19,23,28,30,31,32,34,35,37,39,40,41,43,46,47,49,50,53,54,61,62,63,65,67,68,69,70,73,74,75,77,78,80,83,84,87,89,90,91,92,93,95,97,98,99,102,103,105,106,108,110,116,119,123,124,125,126,128,132,135,137,139,141,143,146,147,148,149,150,151,154,159,160,161,164,166,168,170,173,174,178,179,181,182,184,186,187,189,190,191,192,193,195,196,197,198,199,200,201,202,203,204,207,208,210,212,214,219,221,222,223,225,226,229,230,233,237,238,240,244,246,249,250,251,254,255,256,258,260,261,262,264,267,268,272,273,274,276,280,282,286,287,288,289,291,293,294,295,296,298,299,301,302,303,305,308,311,315,316,318,320,321,329,330,331,332,333,337,338,343,349,350,352,353,356,357,358,360,362,366,367,368,370,374,375,378,379,382,383,386,391,392,397,398,401,402,403,404,407,410,411,412,413,415,418,422,425,427,428,431,432,433,437,439,440,441,448,450,455,457,458,459,460,461,462,464,466,467,468,469,470,473,474,475,479,481,482,483,488,489,492,493,494,496,499,500,501,502,505,506,507,508,509,511,512,513,515,516,517,518,519,520,521,522,524,525,526,527,528,529,531,534,535,536,537,539,540,541,542,544,546,547,548,549,552,554,556,563,564,565,566,569,572,573,576,577,578,581,582,583,584,585,587,590,592,593,596,597,598,600,601,604,605,606,610,611,613,614,618,619,620,621,623,624,629,630,631,632,633,637,638,639,642,644,645,648,650,651,652,653,658,662,667,668,670,677,678,682,683,685,687,689,690,692,693,696,697,698,699,700,702,704,706,712,714,715,717,720,722,724,725,726,727,728,731,732,734,737,740,741,744,747,749,751,752,753,755,756,757,761,762,763,764,765,766,767,775,776,777,782,786,787,789,790,793,794,796,797,798,799,801,802,806,808,811,812,814,815,817,820,822,827,829,830,832,833,835,836,839,847,851,852,853,854,855,858,860,863,864,865,866,868,869,870,872,876,878,879,880,881,882,883,884,885,886,887,888,890,891,895,896,897,899,902,908,909,911,912,913,916,917,918,920,921,922,923,924,926,927,928,929,932,938,941,942,944,945,950,952,954,955,961,964,968,973,975,976,977,980,981,983,985,986,987,989,990,991,994,996,999,1001,1002,1004,1008,1011,1014,1019,1020,1021,1022, ========= ERROR SUMMARY: 0 errors $
CUB reduction using 2D grid of blocks
I'm trying to make a sum using the CUB reduction method. The big problem is: I'm not sure how to return the values of each block to the Host when using 2-dimensional grids. #include <iostream> #include <math.h> #include <cub/block/block_reduce.cuh> #include <cub/block/block_load.cuh> #include <cub/block/block_store.cuh> #include <iomanip> #define nat 1024 #define BLOCK_SIZE 32 #define GRID_SIZE 32 struct frame { int natm; char title[100]; float conf[nat][3]; }; using namespace std; using namespace cub; __global__ void add(frame* s, float L, float rc, float* blocksum) { int i = blockDim.x*blockIdx.x + threadIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; float E=0.0, rij, dx, dy, dz; // Your calculations first so that each thread holds its result dx = fabs(s->conf[j][0] - s->conf[i][0]); dy = fabs(s->conf[j][1] - s->conf[i][1]); dz = fabs(s->conf[j][2] - s->conf[i][2]); dx = dx - round(dx/L)*L; dy = dy - round(dy/L)*L; dz = dz - round(dz/L)*L; rij = sqrt(dx*dx + dy*dy + dz*dz); if ((rij <= rc) && (rij > 0.0)) {E = (4*((1/pow(rij,12))-(1/pow(rij,6))));} // E = 1.0; __syncthreads(); // Block wise reduction so that one thread in each block holds sum of thread results typedef cub::BlockReduce<float, BLOCK_SIZE, BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; float aggregate = BlockReduce(temp_storage).Sum(E); if (threadIdx.x == 0 && threadIdx.y == 0) blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate; } int main(void) { frame * state = (frame*)malloc(sizeof(frame)); float *blocksum = (float*)malloc(GRID_SIZE*GRID_SIZE*sizeof(float)); state->natm = nat; //inicializando o numero de atomos; char name[] = "estado1"; strcpy(state->title,name); for (int i = 0; i < nat; i++) { state->conf[i][0] = i; state->conf[i][1] = i; state->conf[i][2] = i; } frame * d_state; float *d_blocksum; cudaMalloc((void**)&d_state, sizeof(frame)); cudaMalloc((void**)&d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float))); cudaMemcpy(d_state, state, sizeof(frame),cudaMemcpyHostToDevice); dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE); dim3 gridBlock(GRID_SIZE,GRID_SIZE); add<<<gridBlock,dimBlock>>>(d_state, 3000, 15, d_blocksum); cudaError_t status = cudaMemcpy(blocksum, d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost); float Etotal = 0.0; for (int k = 0; k < GRID_SIZE*GRID_SIZE; k++){ Etotal += blocksum[k]; } cout << endl << "energy: " << Etotal << endl; if (cudaSuccess != status) { cout << cudaGetErrorString(status) << endl; } // Free memory cudaFree(d_state); cudaFree(d_blocksum); return cudaThreadExit(); } What is happening is that if the value of GRID_SIZE is the same asBLOCK_SIZE, as written above. The calculation is correct. But if I change the value of GRID_SIZE, the result goes wrong. Which leads me to think that the error is in this code: blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate; The idea here is to return a 1D array, which contains the sum of each block. I do not intend to change the BLOCK_SIZE value, but the value of GRID_SIZE depends on the system I'm looking at, I intend to use values greater than 32 (always multiples of that). I looked for some example that use 2D grid with CUB, but did not find. I really new in CUDA program, maybe I'm making a mistake. edit: I put the complete code. For comparison, when I calculate these exact values for a serial program, it gives me energy: -297,121
Probably the main issue is that your output indexing is not correct. Here's a reduced version of your code demonstrating correct results for arbitrary GRID_SIZE: $ cat t1360.cu #include <stdio.h> #include <cub/cub.cuh> #define BLOCK_SIZE 32 #define GRID_SIZE 25 __global__ void add(float* blocksum) { float E = 1.0; // Block wise reduction so that one thread in each block holds sum of thread results typedef cub::BlockReduce<float, BLOCK_SIZE, cub::BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; float aggregate = BlockReduce(temp_storage).Sum(E); __syncthreads(); if (threadIdx.x == 0 && threadIdx.y == 0) blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate; } int main(){ float *d_result, *h_result; h_result = (float *)malloc(GRID_SIZE*GRID_SIZE*sizeof(float)); cudaMalloc(&d_result, GRID_SIZE*GRID_SIZE*sizeof(float)); dim3 grid = dim3(GRID_SIZE,GRID_SIZE); dim3 block = dim3(BLOCK_SIZE, BLOCK_SIZE); add<<<grid, block>>>(d_result); cudaMemcpy(h_result, d_result, GRID_SIZE*GRID_SIZE*sizeof(float), cudaMemcpyDeviceToHost); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) {printf("cuda error: %s\n", cudaGetErrorString(err)); return -1;} float result = 0; for (int i = 0; i < GRID_SIZE*GRID_SIZE; i++) result += h_result[i]; if (result != (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE)) printf("mismatch, should be: %f, was: %f\n", (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE), result); else printf("Success\n"); return 0; } $ nvcc -o t1360 t1360.cu $ ./t1360 Success $ The important change I made to your kernel code was in the output indexing: blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate; We want a simulated 2D index into an array that has width and height of GRID_SIZE consisting of one float quantity per point. Therefore the width of this array is given by gridDim.x (not blockDim). The gridDim variable gives the dimensions of the grid in terms of blocks - and this lines up exactly with how our results array is set up. Your posted code will fail if GRID_SIZE and BLOCK_SIZE are different (for example, if GRID_SIZE were smaller than BLOCK_SIZE, cuda-memcheck will show illegal accesses, and if GRID_SIZE is larger than BLOCK_SIZE then this indexing error will result in blocks overwriting each other's values in the output array) because of this mixup between blockDim and gridDim. Also note that float operations typically only have around 5 decimal digits of precision. So small differences in the 5th or 6th decimal place may be attributable to order of operations differences when doing floating-point arithmetic. You can prove this to yourself by switching to double arithmetic.
Solving tridiagonal linear systems in CUDA
I am trying to implement a tridiagonal system solver based on the Cyclic Reduction method on my GTS450. Cyclic Reduction is illustrated in this paper Y. Zhang, J. Cohen, J.D. Owens, "Fast Tridiagonal Solvers on GPU" However, whatever I do, my CUDA code is far slower than the sequential counterpart. My result for a total of 512 x 512 points is 7ms, however on my i7 3.4GHz it is 5ms. The GPU is not accelerating! Which could be the problem? #include "cutrid.cuh" __global__ void cutrid_RC_1b(double *a,double *b,double *c,double *d,double *x) { int idx_global=blockIdx.x*blockDim.x+threadIdx.x; int idx=threadIdx.x; __shared__ double asub[512]; __shared__ double bsub[512]; __shared__ double csub[512]; __shared__ double dsub[512]; double at=0; double bt=0; double ct=0; double dt=0; asub[idx]=a[idx_global]; bsub[idx]=b[idx_global]; csub[idx]=c[idx_global]; dsub[idx]=d[idx_global]; for(int stride=1;stride<N;stride*=2) { int margin_left,margin_right; margin_left=idx-stride; margin_right=idx+stride; at=(margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f; bt=bsub[idx]+((margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f) -((margin_right<512)?asub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f); ct=(margin_right<512)?(-csub[idx+stride]*asub[idx]/bsub[idx+stride]):0.f; dt=dsub[idx]+((margin_left>=0)?(-dsub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f) -((margin_right<512)?dsub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f); __syncthreads(); asub[idx]=at; bsub[idx]=bt; csub[idx]=ct; dsub[idx]=dt; __syncthreads(); } x[idx_global]=dsub[idx]/bsub[idx]; }/*}}}*/ I launched this kernel by cutrid_RC_1b<<<512,512>>>(d_a,d_b,d_c,d_d,d_x), and reached 100% device occupancy. This result has puzzled me for days. There is an improved version of my code: #include "cutrid.cuh" __global__ void cutrid_RC_1b(float *a,float *b,float *c,float *d,float *x) {/*{{{*/ int idx_global=blockIdx.x*blockDim.x+threadIdx.x; int idx=threadIdx.x; __shared__ float asub[512]; __shared__ float bsub[512]; __shared__ float csub[512]; __shared__ float dsub[512]; asub[idx]=a[idx_global]; bsub[idx]=b[idx_global]; csub[idx]=c[idx_global]; dsub[idx]=d[idx_global]; __syncthreads(); //Reduction for(int stride=1;stride<512;stride*=2) { int margin_left=(idx-stride); int margin_right=(idx+stride); if(margin_left<0) margin_left=0; if(margin_right>=512) margin_right=511; float tmp1 = asub[idx] / bsub[margin_left]; float tmp2 = csub[idx] / bsub[margin_right]; float tmp3 = dsub[margin_right]; float tmp4 = dsub[margin_left]; __syncthreads(); dsub[idx] = dsub[idx] - tmp4*tmp1-tmp3*tmp2; bsub[idx] = bsub[idx]-csub[margin_left]*tmp1-asub[margin_right]*tmp2; tmp3 = -csub[margin_right]; tmp4 = -asub[margin_left]; __syncthreads(); asub[idx] = tmp3*tmp1; csub[idx] = tmp4*tmp2; __syncthreads(); } x[idx_global]=dsub[idx]/bsub[idx]; }/*}}}*/ The speed is improved to 0.73ms on a Quadro k4000 for 512 x 512 system, however the code in the mentioned paper runs in 0.5ms on a GTX280.
Solving a tridiagonal system of equations is a challenging parallel problem since the classical solution scheme, i.e., Gaussian elimination, is inherently sequential. Cyclic Reduction consists of two phases:
Forward Reduction. The original system is split into two independent tridiagonal systems for two sets of unknowns, those with odd index and those with even index. Such systems can be solved independently, and this step can be seen as the first of a divide et impera scheme. The two smaller systems are split again in the same way into two subsystems each, and the process is repeated until a system of only 2 equations is reached.
Backward Substitution. The system of 2 equations is solved first. Then the divide et impera structure is climbed back up by solving the sub-systems independently on different cores.
I'm not sure (but correct me if I'm wrong) that your code will return consistent results. N does not appear to be defined. Also, you are accessing csub[idx-stride], but I'm not sure what that means when idx==0 and stride>1. Furthermore, you are using several conditional statements, essentially for boundary checking. Finally, your code lacks a proper thread structure capable of dealing with the mentioned divide et impera scheme, conceptually much like the one used in the CUDA SDK reduction samples.
As mentioned in one of my comments above, I remembered that at tridiagonalsolvers you can find an implementation of the Cyclic Reduction scheme for solving tridiagonal equation systems. Browsing the related Google pages, it seems that the code is maintained, among others, by the first author of the above paper (Yao Zhang). The code is copied and pasted below. Note that the boundary check is done only once (if (iRight >= systemSize) iRight = systemSize - 1;), thus limiting the number of conditional statements involved. Note also the thread structure capable of dealing with the divide et impera scheme.
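In formulas, one forward-reduction step with stride delta takes equation i, written as a[i]*x[i-delta] + b[i]*x[i] + c[i]*x[i+delta] = d[i], and subtracts k1 = a[i]/b[i-delta] times equation i-delta and k2 = c[i]/b[i+delta] times equation i+delta. This eliminates x[i-delta] and x[i+delta], and it is exactly what the tmp1/tmp2 updates in the code below compute:
a[i] <- -a[i-delta] * k1
b[i] <- b[i] - c[i-delta] * k1 - a[i+delta] * k2
c[i] <- -c[i+delta] * k2
d[i] <- d[i] - d[i-delta] * k1 - d[i+delta] * k2
The updated equation couples x[i-2*delta], x[i] and x[i+2*delta], so each step halves the number of active equations.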
The code by Zhang, Cohen and Owens __global__ void crKernel(T *d_a, T *d_b, T *d_c, T *d_d, T *d_x) { int thid = threadIdx.x; int blid = blockIdx.x; int stride = 1; int numThreads = blockDim.x; const unsigned int systemSize = blockDim.x * 2; int iteration = (int)log2(T(systemSize/2)); #ifdef GPU_PRINTF if (thid == 0 && blid == 0) printf("iteration = %d\n", iteration); #endif __syncthreads(); extern __shared__ char shared[]; T* a = (T*)shared; T* b = (T*)&a[systemSize]; T* c = (T*)&b[systemSize]; T* d = (T*)&c[systemSize]; T* x = (T*)&d[systemSize]; a[thid] = d_a[thid + blid * systemSize]; a[thid + blockDim.x] = d_a[thid + blockDim.x + blid * systemSize]; b[thid] = d_b[thid + blid * systemSize]; b[thid + blockDim.x] = d_b[thid + blockDim.x + blid * systemSize]; c[thid] = d_c[thid + blid * systemSize]; c[thid + blockDim.x] = d_c[thid + blockDim.x + blid * systemSize]; d[thid] = d_d[thid + blid * systemSize]; d[thid + blockDim.x] = d_d[thid + blockDim.x + blid * systemSize]; __syncthreads(); //forward elimination for (int j = 0; j <iteration; j++) { __syncthreads(); stride *= 2; int delta = stride/2; if (threadIdx.x < numThreads) { int i = stride * threadIdx.x + stride - 1; int iLeft = i - delta; int iRight = i + delta; if (iRight >= systemSize) iRight = systemSize - 1; T tmp1 = a[i] / b[iLeft]; T tmp2 = c[i] / b[iRight]; b[i] = b[i] - c[iLeft] * tmp1 - a[iRight] * tmp2; d[i] = d[i] - d[iLeft] * tmp1 - d[iRight] * tmp2; a[i] = -a[iLeft] * tmp1; c[i] = -c[iRight] * tmp2; } numThreads /= 2; } if (thid < 2) { int addr1 = stride - 1; int addr2 = 2 * stride - 1; T tmp3 = b[addr2]*b[addr1]-c[addr1]*a[addr2]; x[addr1] = (b[addr2]*d[addr1]-c[addr1]*d[addr2])/tmp3; x[addr2] = (d[addr2]*b[addr1]-d[addr1]*a[addr2])/tmp3; } // backward substitution numThreads = 2; for (int j = 0; j <iteration; j++) { int delta = stride/2; __syncthreads(); if (thid < numThreads) { int i = stride * thid + stride/2 - 1; if(i == delta - 1) x[i] = (d[i] - c[i]*x[i+delta])/b[i]; else x[i] = (d[i] - a[i]*x[i-delta] - c[i]*x[i+delta])/b[i]; } stride /= 2; numThreads *= 2; } __syncthreads(); d_x[thid + blid * systemSize] = x[thid]; d_x[thid + blockDim.x + blid * systemSize] = x[thid + blockDim.x]; }
I want to add a further answer to mention that tridiagonal systems can be easily solved in the framework of the cuSPARSE library by aid of the function cusparse<t>gtsv() cuSPARSE also provides cusparse<t>gtsv_nopivot() which, at variance with the first mentioned routine, does not perform pivoting. Both the above functions solve the same linear system with multiple right hand sides. A batched routine cusparse<t>gtsvStridedBatch() also exists which solves multiple linear systems. For all the above routines, the system matrix is fixed by simply specifying the lower diagonal, the main diagonal and the upper diagonal. Below, I'm reporting a fully worked out example using cusparse<t>gtsv() to solve a tridiagonal linear system. #include <stdio.h> #include <stdlib.h> #include <iostream> #include <assert.h> #include <cuda_runtime.h> #include <cusparse_v2.h> /********************/ /* CUDA ERROR CHECK */ /********************/ // --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) { exit(code); } } } extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); } /***************************/ /* CUSPARSE ERROR CHECKING */ /***************************/ static const char *_cusparseGetErrorEnum(cusparseStatus_t error) { switch (error) { case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS"; case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED"; case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED"; case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE"; case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH"; case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR"; case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED"; case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; } return "<unknown>"; } inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line) { if(CUSPARSE_STATUS_SUCCESS != err) { fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs",__FILE__, __LINE__,err, \ _cusparseGetErrorEnum(err)); \ cudaDeviceReset(); assert(0); \ } } extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); } /********/ /* MAIN */ /********/ int main() { // --- Initialize cuSPARSE cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle)); const int N = 5; // --- Size of the linear system // --- Lower diagonal, diagonal and upper diagonal of the system matrix double *h_ld = (double*)malloc(N * sizeof(double)); double *h_d = (double*)malloc(N * sizeof(double)); double *h_ud = (double*)malloc(N * sizeof(double)); h_ld[0] = 0.; h_ud[N-1] = 0.; for (int k = 0; k < N - 1; k++) { h_ld[k + 1] = -1.; h_ud[k] = -1.; } for (int k = 0; k < N; k++) h_d[k] = 2.; double *d_ld; gpuErrchk(cudaMalloc(&d_ld, N * sizeof(double))); double *d_d; gpuErrchk(cudaMalloc(&d_d, N * sizeof(double))); double *d_ud; gpuErrchk(cudaMalloc(&d_ud, N * sizeof(double))); gpuErrchk(cudaMemcpy(d_ld, h_ld, N * 
sizeof(double), cudaMemcpyHostToDevice)); gpuErrchk(cudaMemcpy(d_d, h_d, N * sizeof(double), cudaMemcpyHostToDevice)); gpuErrchk(cudaMemcpy(d_ud, h_ud, N * sizeof(double), cudaMemcpyHostToDevice)); // --- Allocating and defining dense host and device data vectors double *h_x = (double *)malloc(N * sizeof(double)); h_x[0] = 100.0; h_x[1] = 200.0; h_x[2] = 400.0; h_x[3] = 500.0; h_x[4] = 300.0; double *d_x; gpuErrchk(cudaMalloc(&d_x, N * sizeof(double))); gpuErrchk(cudaMemcpy(d_x, h_x, N * sizeof(double), cudaMemcpyHostToDevice)); // --- Allocating the host and device side result vector double *h_y = (double *)malloc(N * sizeof(double)); double *d_y; gpuErrchk(cudaMalloc(&d_y, N * sizeof(double))); cusparseSafeCall(cusparseDgtsv(handle, N, 1, d_ld, d_d, d_ud, d_x, N)); cudaMemcpy(h_x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost); for (int k=0; k<N; k++) printf("%f\n", h_x[k]); } At this gitHub repository, a comparison of different CUDA routines available in the cuSOLVER library for the solution of tridiagonal linear systems is reported.
Things I see: The first __syncthreads() seems redundant. There are repeated sub-expressions such as (-csub[idx-stride]*asub[idx]/bsub[idx-stride]) in your code; use intermediate variables to hold the result and reuse them, instead of making the GPU evaluate those expressions each time, for example as sketched below. Use the NVIDIA profiler to see where the issues are.
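A minimal sketch of that suggestion, applied to the body of the reduction loop in your first kernel. It keeps the original guards, so out-of-range neighbours still contribute zero; the names ratio_left/ratio_right and the neighbour temporaries are mine:

double ratio_left  = (margin_left  >= 0)  ? asub[idx] / bsub[idx - stride] : 0.0;  // shared by at, bt and dt
double ratio_right = (margin_right < 512) ? csub[idx] / bsub[idx + stride] : 0.0;  // shared by bt and dt
double c_left  = (margin_left  >= 0)  ? csub[idx - stride] : 0.0;
double d_left  = (margin_left  >= 0)  ? dsub[idx - stride] : 0.0;
double a_right = (margin_right < 512) ? asub[idx + stride] : 0.0;
double d_right = (margin_right < 512) ? dsub[idx + stride] : 0.0;

at = -c_left * ratio_left;
bt = bsub[idx] - c_left * ratio_left - a_right * ratio_right;
dt = dsub[idx] - d_left * ratio_left - d_right * ratio_right;
// ct divides asub[idx] by bsub[idx + stride] in your original code, so it keeps its own expression:
ct = (margin_right < 512) ? -csub[idx + stride] * asub[idx] / bsub[idx + stride] : 0.0;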
cuda addvectors memory intuitive explanation
I have the following code and #include <iostream> #include <cuda.h> #include <cuda_runtime.h> #include <ctime> #include <vector> #include <numeric> float random_float(void) { return static_cast<float>(rand()) / RAND_MAX; } std::vector<float> add(float alpha, std::vector<float>& v1, std::vector<float>& v2 ) { /*Do quick size check on vectors before proceeding*/ std::vector<float> result(v1.size()); for (unsigned int i = 0; i < result.size(); ++i) { result[i]=alpha*v1[i]+v2[i]; } return result; } __global__ void Addloop( int N, float alpha, float* x, float* y ) { int i; int i0 = blockIdx.x*blockDim.x + threadIdx.x; for( i = i0; i < N; i += blockDim.x*gridDim.x ) y[i] = alpha*x[i] + y[i]; /* if ( i0 < N ) y[i0] = alpha*x[i0] + y[i0]; */ } int main( int argc, char** argv ) { float alpha = 0.3; // create array of 256k elements int num_elements = 10;//1<<18; // generate random input on the host std::vector<float> h1_input(num_elements); std::vector<float> h2_input(num_elements); for(int i = 0; i < num_elements; ++i) { h1_input[i] = random_float(); h2_input[i] = random_float(); } for (std::vector<float>::iterator it = h1_input.begin() ; it != h1_input.end(); ++it) std::cout << ' ' << *it; std::cout << '\n'; for (std::vector<float>::iterator it = h2_input.begin() ; it != h2_input.end(); ++it) std::cout << ' ' << *it; std::cout << '\n'; std::vector<float> host_result;//(std::vector<float> h1_input, std::vector<float> h2_input ); host_result = add( alpha, h1_input, h2_input ); for (std::vector<float>::iterator it = host_result.begin() ; it != host_result.end(); ++it) std::cout << ' ' << *it; std::cout << '\n'; // move input to device memory float *d1_input = 0; cudaMalloc((void**)&d1_input, sizeof(float) * num_elements); cudaMemcpy(d1_input, &h1_input[0], sizeof(float) * num_elements, cudaMemcpyHostToDevice); float *d2_input = 0; cudaMalloc((void**)&d2_input, sizeof(float) * num_elements); cudaMemcpy(d2_input, &h2_input[0], sizeof(float) * num_elements, cudaMemcpyHostToDevice); Addloop<<<1,3>>>( num_elements, alpha, d1_input, d2_input ); // copy the result back to the host std::vector<float> device_result(num_elements); cudaMemcpy(&device_result[0], d2_input, sizeof(float) * num_elements, cudaMemcpyDeviceToHost); for (std::vector<float>::iterator it = device_result.begin() ; it != device_result.end(); ++it) std::cout << ' ' << *it; std::cout << '\n'; cudaFree(d1_input); cudaFree(d2_input); h1_input.clear(); h2_input.clear(); device_result.clear(); std::cout << "DONE! \n"; getchar(); return 0; } I am trying to understand the gpu memory access. The kernel, for reasons of simplicity, is launched as Addloop<<<1,3>>>. I am trying to understand how this code is working by imagining the for loops working on the gpu as instances. More specifically, I imagine the following instances but they do not help. Instance 1: for( i = 0; i < N; i += 3*1 ) // ( i += 0*1 --> i += 3*1 after Eric's comment) y[i] = alpha*x[i] + y[i]; Instance 2: for( i = 1; i < N; i += 3*1 ) y[i] = alpha*x[i] + y[i]; Instance 3: for( i = 3; i < N; i += 3*1 ) y[i] = alpha*x[i] + y[i]; Looking inside of every loop it does not make any sense in the logic of adding two vectors. Can some one help? The reason I am adopting this logic of instances is because it is working well in the case of the code inside the kernel which is in comments. If these thoughts are correct what would be the instances in case we have multiple blocks inside the grid? In other words what would be the i values and the update rates (+=updaterate) in some examples? 
PS: The kernel code was borrowed from here. UPDATE: After Eric's answer I think the execution for N = 15, i.e. the number of elements, goes like this (correct me if I am wrong): For instance 1 above, i = 0, 3, 6, 9, 12, which computes the corresponding y[i] values. For instance 2 above, i = 1, 4, 7, 10, 13, which computes the corresponding y[i] values. For instance 3 above, i = 2, 5, 8, 11, 14, which computes the remaining y[i] values.
Your blockDim.x is 3 and gridDim.x is 1 according to your setup <<<1,3>>>. So in each thread (what you call an instance) the update should be i += 3*1. With the for loop you can compute 15 elements using only 3 threads. Generally you can use a limited number of threads to do an "infinite" amount of work, and more work per thread can improve performance by reducing launch overhead and hiding instruction stalls. Another advantage is that you can use a fixed number of threads/blocks to handle work of various sizes, which requires less tuning.
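For example, with a hypothetical launch of <<<2,3>>> (2 blocks of 3 threads), the stride becomes blockDim.x*gridDim.x = 6 and each thread starts at its unique global index i0 = blockIdx.x*blockDim.x + threadIdx.x. A sketch of the same kernel, annotated for N = 15:

__global__ void Addloop( int N, float alpha, float* x, float* y )
{
    int i0 = blockIdx.x*blockDim.x + threadIdx.x;        // 0..5 for <<<2,3>>>
    for( int i = i0; i < N; i += blockDim.x*gridDim.x )  // stride = 3*2 = 6
        y[i] = alpha*x[i] + y[i];
}
// With N = 15 the six threads cover:
//   block 0, thread 0: i = 0, 6, 12
//   block 0, thread 1: i = 1, 7, 13
//   block 0, thread 2: i = 2, 8, 14
//   block 1, thread 0: i = 3, 9
//   block 1, thread 1: i = 4, 10
//   block 1, thread 2: i = 5, 11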