NVProf for NCCL program - cuda

When I profile an NCCL program with nvprof using --metrics all, the profiling result always looks like this:
==2781== NVPROF is profiling process 2781, command: ./nccl_example 2 16
==2781== Profiling application: ./nccl_example 2 16
==2781== Profiling result:
No events/metrics were profiled.
My simple nccl program
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "nccl.h"
// Minimal error-checking helpers. They are only defined here when the
// translation unit has not already provided its own versions.
#ifndef CUDACHECK
#define CUDACHECK(cmd) do {                                      \
    cudaError_t cudaStatus_ = (cmd);                             \
    if (cudaStatus_ != cudaSuccess) {                            \
        printf("Failed: Cuda error %s:%d '%s'\n", __FILE__,      \
               __LINE__, cudaGetErrorString(cudaStatus_));       \
        exit(EXIT_FAILURE);                                      \
    }                                                            \
} while (0)
#endif

#ifndef NCCLCHECK
#define NCCLCHECK(cmd) do {                                      \
    ncclResult_t ncclStatus_ = (cmd);                            \
    if (ncclStatus_ != ncclSuccess) {                            \
        printf("Failed: NCCL error %s:%d '%s'\n", __FILE__,      \
               __LINE__, ncclGetErrorString(ncclStatus_));       \
        exit(EXIT_FAILURE);                                      \
    }                                                            \
} while (0)
#endif

// Single-process, single-thread NCCL example: runs one float sum all-reduce
// across the devices listed in `devs`, then releases every resource it created.
// Command-line arguments are accepted but not used.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    // Devices taking part in the collective. The device count is derived from
    // this list so the list, the count and the communicator array cannot drift apart.
    const int devs[] = {0, 1, 2};
    const int nDev = (int)(sizeof(devs) / sizeof(devs[0]));
    ncclComm_t comms[sizeof(devs) / sizeof(devs[0])];

    const size_t size = 32 * 1024 * 1024;   // elements per buffer

    // Host-side tables holding one send buffer, receive buffer and stream per device.
    float** sendbuff = (float**)malloc(nDev * sizeof(float*));
    float** recvbuff = (float**)malloc(nDev * sizeof(float*));
    cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t) * nDev);
    if (sendbuff == NULL || recvbuff == NULL || s == NULL) {
        printf("Failed: host allocation\n");
        free(sendbuff);
        free(recvbuff);
        free(s);
        return EXIT_FAILURE;
    }

    // Allocate and initialize the device buffers on the same devices the
    // communicators will be created on (devs[i], not the loop index).
    for (int i = 0; i < nDev; ++i) {
        CUDACHECK(cudaSetDevice(devs[i]));
        CUDACHECK(cudaMalloc(sendbuff + i, size * sizeof(float)));
        CUDACHECK(cudaMalloc(recvbuff + i, size * sizeof(float)));
        // cudaMemset fills bytes: every byte of the send buffer becomes 0x01,
        // which is a fixed bit pattern per float, not the value 1.0f.
        CUDACHECK(cudaMemset(sendbuff[i], 1, size * sizeof(float)));
        CUDACHECK(cudaMemset(recvbuff[i], 0, size * sizeof(float)));
        CUDACHECK(cudaStreamCreate(s + i));
    }

    // Initialize one NCCL communicator per listed device.
    NCCLCHECK(ncclCommInitAll(comms, nDev, devs));

    // The group API is required when one thread drives several devices.
    NCCLCHECK(ncclGroupStart());
    for (int i = 0; i < nDev; ++i) {
        NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i],
                                size, ncclFloat, ncclSum, comms[i], s[i]));
    }
    NCCLCHECK(ncclGroupEnd());

    // Wait on every stream for the NCCL operation to complete.
    for (int i = 0; i < nDev; ++i) {
        CUDACHECK(cudaSetDevice(devs[i]));
        CUDACHECK(cudaStreamSynchronize(s[i]));
    }

    // Release device buffers and streams.
    for (int i = 0; i < nDev; ++i) {
        CUDACHECK(cudaSetDevice(devs[i]));
        CUDACHECK(cudaFree(sendbuff[i]));
        CUDACHECK(cudaFree(recvbuff[i]));
        CUDACHECK(cudaStreamDestroy(s[i]));
    }

    // Finalize NCCL.
    for (int i = 0; i < nDev; ++i) {
        NCCLCHECK(ncclCommDestroy(comms[i]));
    }

    // Release the host-side tables.
    free(sendbuff);
    free(recvbuff);
    free(s);

    printf("Success \n");
    return 0;
}
I need detailed metrics for the NCCL APIs so that I can get more insight into their performance.

That behavior is expected.
The events and metrics that are gathered by default pertain to CUDA device code activity. To see something that might be instructive, try profiling with the --print-gpu-trace switch (and remove --metrics all).
The documented "metrics" don't apply to the operations (data copying) that NCCL is doing. They apply to CUDA kernels (i.e. CUDA device code activity).
nvprof does seem to have metrics that can be collected for NVLink activity. To see these, on a system that is applicable (e.g. has NVLink), run a command such as:
nvprof --query-metrics
or
nvprof --query-metrics |grep -i nvlink

Related

Has cudaMalloc changed to be asynchronous?

I've read in other places that cudaMalloc will synchronize across kernels.
(e.g. will cudaMalloc synchronize host and device?)
However, I just tested this code out and based on what I'm seeing in the visual profiler, it seems like cudaMalloc is not synchronizing. if you add cudaFree into the loop, that does synchronize. I'm using CUDA 7.5. Does anyone know if cudaMalloc changed its behavior? Or am I missing some subtlety? Thanks very much!
// Repeatedly scales a local value by .9999999 for 1000000 iterations.
// The product lives only in a local variable and is never written to memory.
__global__ void slowKernel()
{
    float value = 5;
    int remaining = 1000000;
    while (remaining-- > 0) {
        value = value * .9999999;
    }
}
// Repeatedly scales a local value by .9999999 for 100000 iterations.
// The product lives only in a local variable and is never written to memory.
__global__ void fastKernel()
{
    float value = 5;
    int remaining = 100000;
    while (remaining-- > 0) {
        value = value * .9999999;
    }
}
// Experiment: launch slowKernel once in stream1, then alternate cudaMalloc
// calls with fastKernel launches in stream2, so a profiler timeline can show
// whether cudaMalloc waits for the kernel that was launched earlier.
// Nothing is synchronized, freed or destroyed inside this function.
void mallocSynchronize(){
cudaStream_t stream1, stream2;
cudaStreamCreate( &stream1 );
cudaStreamCreate( &stream2 );
// Launched first, in its own stream.
slowKernel <<<1, 1, 0, stream1 >>>();
int *dev_a = 0;
for( int i = 0; i < 10; i++ ){
// dev_a is overwritten on every iteration; earlier allocations are not released.
cudaMalloc( &dev_a, 4 * 1024 * 1024 );
fastKernel <<<1, 1, 0, stream2 >>>();
// cudaFree( dev_a ); // If you uncomment this, the second fastKernel launch will wait until slowKernel completes
}
}
Your methodology is flawed, but your conclusion looks correct to me (if you look at your profile data you should see that both the long and the short kernels take the same amount of time and run very quickly, because aggressive compiler optimisation is eliminating all the code in both cases).
I turned your example into something more reasonable
#include <time.h>
// Long-running benchmark kernel: scales a local value by .9999999 for
// 10000000 iterations.
//   output - only dereferenced when `write` is true
//   write  - when true, subtracts the loop result from *output (default: false)
// NOTE(review): the conditional store presumably exists so the compiler cannot
// discard the loop as dead code - confirm against the generated code.
__global__ void slowKernel(float *output, bool write=false)
{
float input = 5;
#pragma unroll
for( int i = 0; i < 10000000; i++ ){
input = input * .9999999;
}
// With the default write=false the pointer is never touched.
if (write) *output -= input;
}
// Short-running benchmark kernel: scales a local value by .9999999 for
// 100000 iterations.
//   output - only dereferenced when `write` is true
//   write  - when true, subtracts the loop result from *output (default: false)
// NOTE(review): the conditional store presumably exists so the compiler cannot
// discard the loop as dead code - confirm against the generated code.
__global__ void fastKernel(float *output, bool write=false)
{
float input = 5;
#pragma unroll
for( int i = 0; i < 100000; i++ ){
input = input * .9999999;
}
// With the default write=false the pointer is never touched.
if (write) *output -= input;
}
// Blocks the calling host thread for about `val` nanoseconds via POSIX nanosleep.
//   val - requested delay in nanoseconds; zero or negative values return at once.
// The delay is split into whole seconds and a nanosecond remainder because
// nanosleep rejects a tv_nsec outside [0, 999999999]; passing the raw value
// made every request of one second or more fail without sleeping.
// An interrupted sleep is not resumed.
void burntime(long val) {
    if (val <= 0) {
        return;
    }
    const long kNanosPerSecond = 1000000000L;
    struct timespec request;
    request.tv_sec = val / kNanosPerSecond;
    request.tv_nsec = val % kNanosPerSecond;
    nanosleep(&request, 0);
}
// Experiment: launch slowKernel in stream1, wait on the host, then alternate
// cudaMalloc calls with fastKernel launches in stream2 so a profiler timeline
// can show whether cudaMalloc overlaps with kernels that are still running.
// Both kernels receive a null pointer and rely on their default write=false.
// Nothing is synchronized, freed or destroyed inside this function.
void mallocSynchronize(){
cudaStream_t stream1, stream2;
cudaStreamCreate( &stream1 );
cudaStreamCreate( &stream2 );
// Size in bytes of each allocation made in the loop below.
const size_t sz = 1 << 21;
slowKernel <<<1, 1, 0, stream1 >>>((float *)(0));
burntime(500000000L); // 500ms wait - slowKernel around 1300ms
int *dev_a = 0;
for( int i = 0; i < 10; i++ ){
// dev_a is overwritten on every iteration; earlier allocations are not released.
cudaMalloc( &dev_a, sz );
fastKernel <<<1, 1, 0, stream2 >>>((float *)(0));
burntime(1000000L); // 1ms wait - fastKernel around 15ms
}
}
// Runs the cudaMalloc overlap experiment, waits for all device work, resets the
// device, and reports through the exit code whether the device work succeeded.
int main()
{
    mallocSynchronize();
    // Asynchronous kernel failures surface here; keep the status instead of dropping it.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    // Reset even after a failure, as the original always did before exiting.
    cudaDeviceReset();
    return (syncStatus == cudaSuccess) ? 0 : 1;
}
[note requires POSIX time functions so this won't run on Windows]
On a fairly fast Maxwell device (GTX970), I see that the cudaMalloc calls in the loop overlap with the still executing slowKernel call in the profile trace, and then with running fastKernel calls in the other stream. I was willing to accept the initial conclusion that minor timing variations could cause the effect you saw in your broken example. However, in this code, a 0.5 second time shift in synchronisation between the host and device traces seems very improbable. You might need to vary the duration of the burntime calls to get the same effect, depending on how fast your GPU is.
So this is a very long way of saying: yes, it looks like it is a non-synchronising call on Linux with CUDA 7.5 and a Maxwell device. I don't believe that has always been the case, but then again the documentation has never, as best as I can tell, said whether it should block/synchronize or not. I don't have access to older CUDA versions and supported hardware to see what this example would do with an older driver and a Fermi or Kepler device.

cuda file error "Invalid device function"

I have a GPU card GeForce GTX 295 and visual studio 2012 and cuda with version 6.5. I run a simple code like
#include "stdafx.h"
#include <stdio.h>
#include <cuda.h>
// Kernel that executes on the CUDA device
// Squares a[idx] in place for every thread whose global index idx is below N.
//   a - array to square in place
//   N - number of valid elements in a
__global__ void square_array(float *a, int N)
{
    const int element = blockIdx.x * blockDim.x + threadIdx.x;
    if (element >= N) {
        return;   // threads past the end of the array have nothing to do
    }
    const float current = a[element];
    a[element] = current * current;
}
// main routine that executes on the host
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: fills a small array, squares it on the device and prints it.
// Every CUDA call is checked, including the kernel launch itself, so a problem
// such as "invalid device function" is reported where it happens.
// Returns 0 on success and 1 on any failure.
int main(void)
{
    float *a_h = NULL;                 // host array
    float *a_d = NULL;                 // device array
    const int N = 10;                  // number of elements in the arrays
    const size_t size = N * sizeof(float);

    a_h = (float *)malloc(size);
    if (a_h == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    bool ok = cudaOk(cudaMalloc((void **) &a_d, size), "cudaMalloc");

    if (ok) {
        // Initialize the host array and copy it to the device.
        for (int i = 0; i < N; i++) a_h[i] = (float)i;
        ok = cudaOk(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice),
                    "cudaMemcpy (host to device)");
    }

    if (ok) {
        // Round the block count up so the last partial block is covered.
        const int block_size = 4;
        const int n_blocks = (N + block_size - 1) / block_size;
        square_array <<< n_blocks, block_size >>> (a_d, N);
        // A launch does not return a status; it has to be queried explicitly.
        ok = cudaOk(cudaGetLastError(), "square_array launch");
    }

    if (ok) {
        // This blocking copy also surfaces errors raised while the kernel ran.
        ok = cudaOk(cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");
    }

    if (ok) {
        for (int i = 0; i < N; i++)
            printf("%d %f\n", i, a_h[i]);
    }

    // Cleanup runs on every path; cudaFree accepts a null pointer.
    free(a_h);
    cudaFree(a_d);
    return ok ? 0 : 1;
}
In this code ,when I use command cudaGetLastError (void) after calling the kernel, at console window an error display "Invalid device function" .How can I get rid of it?
The sample codes of the CUDA toolkit 6.5 run successfully with Visual Studio 2012.
GTX 295 has compute capability 1.3 I believe. It may be worth checking your solution compiler settings to see whether you are not compiling the solution using something like compute_20,sm_20. If so, try to change these values to e.g. compute_10,sm_10, rebuild and see whether it helps. See here for details on setting these values.
EDIT:
According to njuffa and also CUDA documentation support for cc1.0 devices was removed in CUDA 6.5 so you'll have to use compute_13,sm_13.

Getting CUDA Thrust to use a CUDA stream of your choice

Looking at kernel launches within the code of CUDA Thrust, it seems they always use the default stream. Can I make Thrust use a stream of my choice? Am I missing something in the API?
I want to update the answer provided by talonmies following the release of Thrust 1.8 which introduces the possibility of indicating the CUDA execution stream as
thrust::cuda::par.on(stream)
see also
Thrust Release 1.8.0.
In the following, I'm recasting the example in
False dependency issue for the Fermi architecture
in terms of CUDA Thrust APIs.
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <thrust\device_vector.h>
#include <thrust\execution_policy.h>
#include "Utilities.cuh"
using namespace std;
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
// Binary functor usable from host and device code: returns the product of its
// two integer arguments.
struct BinaryOp
{
    __host__ __device__ int operator()(const int& o1, const int& o2)
    {
        const int product = o1 * o2;
        return product;
    }
};
// Squares N integers on the GPU with thrust::transform, splitting the data into
// NUM_STREAMS chunks that are copied, processed and copied back in separate
// CUDA streams ("breadth-first" issue order), then compares against a host result.
int main()
{
    const int N = 6000000;

    // --- Host side input data allocation and initialization. The buffer is
    //     registered as page-locked, which cudaMemcpyAsync needs to be asynchronous.
    int *h_in = new int[N];
    for (int i = 0; i < N; i++) h_in[i] = 5;
    gpuErrchk(cudaHostRegister(h_in, N * sizeof(int), cudaHostRegisterPortable));

    // --- Host side output data allocation and initialization, page-locked as well.
    int *h_out = new int[N];
    for (int i = 0; i < N; i++) h_out[i] = 0;
    gpuErrchk(cudaHostRegister(h_out, N * sizeof(int), cudaHostRegisterPortable));

    // --- Host side reference result
    int *h_checkResults = new int[N];
    for (int i = 0; i < N; i++) h_checkResults[i] = h_in[i] * h_in[i];

    // --- Device side input and output data allocation
    int *d_in = 0;
    gpuErrchk(cudaMalloc((void **)&d_in, N * sizeof(int)));
    int *d_out = 0;
    gpuErrchk(cudaMalloc((void **)&d_out, N * sizeof(int)));

    // Per-stream chunk. N is chosen as a multiple of NUM_STREAMS, so the chunks
    // cover the whole array.
    const int streamSize = N / NUM_STREAMS;
    const size_t streamMemSize = N * sizeof(int) / NUM_STREAMS;
    // Each chunk is processed as two transforms; the second takes whatever the
    // first half leaves, so an odd chunk size would not drop its last element.
    const int halfSize = streamSize / 2;

    // --- Create CUDA streams
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++)
        gpuErrchk(cudaStreamCreate(&streams[i]));

    /**************************/
    /* BREADTH-FIRST APPROACH */
    /**************************/
    // 1) all host-to-device copies
    for (int i = 0; i < NUM_STREAMS; i++) {
        const int offset = i * streamSize;
        gpuErrchk(cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize,
                                  cudaMemcpyHostToDevice, streams[i]));
    }

    // 2) all transforms, each bound to its chunk's stream through the execution policy
    for (int i = 0; i < NUM_STREAMS; i++) {
        const int offset = i * streamSize;
        thrust::device_ptr<int> src = thrust::device_pointer_cast(&d_in[offset]);
        thrust::device_ptr<int> dst = thrust::device_pointer_cast(&d_out[offset]);

        thrust::transform(thrust::cuda::par.on(streams[i]),
                          src, src + halfSize, src, dst, BinaryOp());
        thrust::transform(thrust::cuda::par.on(streams[i]),
                          src + halfSize, src + streamSize, src + halfSize,
                          dst + halfSize, BinaryOp());
    }

    // 3) all device-to-host copies
    for (int i = 0; i < NUM_STREAMS; i++) {
        const int offset = i * streamSize;
        gpuErrchk(cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize,
                                  cudaMemcpyDeviceToHost, streams[i]));
    }

    for (int i = 0; i < NUM_STREAMS; i++)
        gpuErrchk(cudaStreamSynchronize(streams[i]));
    gpuErrchk(cudaDeviceSynchronize());

    // --- Release resources
    gpuErrchk(cudaHostUnregister(h_in));
    gpuErrchk(cudaHostUnregister(h_out));
    gpuErrchk(cudaFree(d_in));
    gpuErrchk(cudaFree(d_out));
    for (int i = 0; i < NUM_STREAMS; i++)
        gpuErrchk(cudaStreamDestroy(streams[i]));
    gpuErrchk(cudaDeviceReset());

    // --- GPU output check. Absolute differences are accumulated so that
    //     positive and negative errors cannot cancel each other out.
    long long sum = 0;
    for (int i = 0; i < N; i++) {
        const long long diff = (long long)h_checkResults[i] - (long long)h_out[i];
        sum += (diff < 0) ? -diff : diff;
    }
    cout << "Error between CPU and GPU: " << sum << endl;

    delete[] h_in;
    delete[] h_out;
    delete[] h_checkResults;
    return 0;
}
The Utilities.cu and Utilities.cuh files needed to run such an example are maintained at this github page.
The Visual Profiler timeline shows the concurrency of CUDA Thrust operations and memory transfers
No you are not missing anything (at least up to the release snapshot which ships with CUDA 6.0).
The original Thrust tag based dispatch system deliberately abstracts all of the underlying CUDA API calls away, sacrificing some performance for ease of use and consistency (keep in mind that thrust has backends other than CUDA). If you want that level of flexibility, you will need to try another library (CUB, for example).
In versions since the CUDA 7.0 snapshot it has become possible to set a stream of choice for thrust operations via the execution policy and dispatch feature.

Cuda Summation per block. I get 0 returned to the sums. What is wrong?

I tried to implement a summation in CUDA, but I can't find what I did wrong here.
The sum is always returned 0. Can anyone help.
The shared tag defines the variable common in each block.
So i tried to sum one block at a time and finally sum up the result for overall sum.
But the sum doesnt work for block. And i am stuck.
Can anyone help.
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <stdlib.h>
//#define BLOCK_SIZE 32 // size of vectors
// Per-block sum: block b adds up the blockDim.x consecutive elements of i_data
// that its threads index and writes the total to sum[b].
//
// Launch requirements:
//   - 1-D blocks with blockDim.x <= 1024 (size of the static scratch array)
//   - the grid covers i_data exactly; the kernel has no length parameter, so
//     every launched thread reads one element
//
// The original version accumulated into a single shared float from several
// threads at once (a data race), selected threads with the grid-wide index
// (true only inside block 0), and let only global thread 0 store a result.
__global__ void add( float * i_data, float * sum){
    __shared__ float partial[1024];

    const unsigned int lane = threadIdx.x;
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Each thread stages its own element in shared memory.
    partial[lane] = i_data[tid];
    __syncthreads();

    // Pairwise reduction that also works when blockDim.x is not a power of two:
    // each round folds the upper part of the live range onto the lower part.
    // The loop bounds depend only on blockDim.x, so every thread of the block
    // reaches each __syncthreads().
    for (unsigned int live = blockDim.x; live > 1; ) {
        const unsigned int keep = (live + 1) / 2;
        if (lane < live - keep) {
            partial[lane] += partial[lane + keep];
        }
        __syncthreads();
        live = keep;
    }

    // One thread per block publishes that block's total.
    if (lane == 0)
        sum[blockIdx.x] = partial[0];
}
// Prints a message and terminates the program when a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        exit(EXIT_FAILURE);
    }
}

// Host driver: fills B groups of T floats (group i holds the value i), asks the
// device for one sum per block, then prints the data followed by each block's sum.
int main() {
    const int T = 10, B = 5;                 // threads per block and blocks per grid
    // The buffers hold floats, so they are sized with sizeof(float).
    const size_t sizeIN = T * B * sizeof(float);
    const size_t sizeOUT = B * sizeof(float);

    float *a = new float[T*B];               // host input
    float *b = new float[B];                 // host per-block sums
    float *dev_a = 0, *dev_b = 0;            // device copies of a and b

    for (int i = 0; i < B; i++) {
        for (int j = 0; j < T; j++) {
            a[i*T+j] = i;
        }
        b[i] = 0;
    }

    checkCuda(cudaMalloc((void **) &dev_a, sizeIN), "cudaMalloc(dev_a)");
    checkCuda(cudaMalloc((void **) &dev_b, sizeOUT), "cudaMalloc(dev_b)");
    checkCuda(cudaMemcpy(dev_a, a, sizeIN, cudaMemcpyHostToDevice), "cudaMemcpy(dev_a)");
    checkCuda(cudaMemcpy(dev_b, b, sizeOUT, cudaMemcpyHostToDevice), "cudaMemcpy(dev_b)");

    add<<< B, T >>> (dev_a, dev_b);
    // A launch does not return a status; it has to be queried explicitly.
    checkCuda(cudaGetLastError(), "add launch");

    // These blocking copies also surface errors raised while the kernel ran.
    checkCuda(cudaMemcpy(a, dev_a, sizeIN, cudaMemcpyDeviceToHost), "cudaMemcpy(a)");
    checkCuda(cudaMemcpy(b, dev_b, sizeOUT, cudaMemcpyDeviceToHost), "cudaMemcpy(b)");

    for (int i = 0; i < B; i++) {
        for (int j = 0; j < T; j++) {
            std::cout << a[i*T+j] << "\t";
            std::cout << std::endl;
        }
        std::cout << std::endl << std::endl << "sum is: " << b[i] << std::endl;
    }
    std::cout << std::endl << std::endl;

    checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
    checkCuda(cudaFree(dev_b), "cudaFree(dev_b)");
    // Memory obtained with new[] must be released with delete[], not free().
    delete[] a;
    delete[] b;
    return 0;
}
This is wrong in 2 ways:
if (tid = 0)
First, you should be doing a comparison == not an assignment =. I don't know why your compiler didn't warn you about this.
Second, tid is only zero for one thread in the entire grid:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
You want one thread in each block to write the block result out to global memory:
if (threadIdx.x == 0)
This is also a problem, similarly:
if (tid <= i)
This is only satisfied for threads in the first block. Beyond that, I have to start to guess at what you want. I guess you're trying to sum the values in each block. Your construction is not a parallel reduction, but to make the minimum changes to get it "functional" I would rewrite the end of your kernel like this:
// Serialized per-block accumulation (tail of the kernel only; s_data, i_data,
// tid and sum are declared in the part of the kernel that is not shown here).
// In iteration i only the thread with threadIdx.x == i adds its element, so
// s_data has a single writer per iteration.
// NOTE(review): assumes a 1-D block, so threadIdx.x is unique within the block.
for (int i = 0; i<blockDim.x; i++)
{
if (threadIdx.x == i)
{
//s_data[blockIdx.x]+ = s_data[tid] + s_data[i+tid];
s_data+= i_data[tid];
}
// Placed outside the if so that every thread of the block reaches the barrier.
__syncthreads();
}
// One thread per block stores that block's result.
if (threadIdx.x == 0)
sum[blockIdx.x]=s_data;
}
Although you didn't have any CUDA API errors, it's good practice to use proper cuda error checking and also run your code with cuda-memcheck any time you are having trouble with a cuda code.
I mentioned that your code above is not a classical reduction. It's just a suboptimal for-loop.
To learn about a CUDA parallel reduction, study the cuda sample code and the accompanying presentation, and there are many examples here on the CUDA tag on SO as well that you can search on.

Concurrency of CUDA default stream with created streams

I created streams in this way:
// Two explicitly created streams; the return values of cudaStreamCreate are not checked.
cudaStream_t stream0;
cudaStream_t stream1;
cudaStreamCreate( &stream0);
cudaStreamCreate( &stream1);
I run the kernel functions like
// No stream argument: this launch goes to the default stream.
singlecore<<<1,1>>>(devL2,1000);
// Explicit stream argument: this launch goes to stream0.
singlecore<<<1,1,0,stream0>>>(devL2,1000);
The two kernels are not executed concurrently. But if I execute the first kernel in stream1 as:
// Both launches name an explicitly created stream (stream1, then stream0).
singlecore<<<1,1,0,stream1>>>(devL2,1000);
singlecore<<<1,1,0,stream0>>>(devL2,1000);
they will execute concurrently.
I wonder whether a kernel launched in the default stream cannot be executed concurrently with kernels in other streams.
Yes there is a limitation on cuda commands issued to the default stream. Referring to the C programming guide section on implicit synchronization:
"Two commands from different streams cannot run concurrently if any one of the following operations is issued in-between them by the host thread:
...
•any CUDA command to the default stream,
"
So as a general rule of thumb, for overlapped copy and compute operations, it's easiest to program all such operations in a set of non-default streams. There's a bit of a loophole (which you've discovered) where it's possible to get overlap with commands issued in the default stream (and other streams), but it requires careful understanding of the restrictions between the default stream and other streams, as well as careful attention to the order in which you issue commands. A good example is explained in the C programming guide. Read all the way through the section on "overlapping behavior".
In your first example, the kernel issued to the default stream blocks execution of the kernel issued to the other stream. In your second example, you can have concurrency because the kernel issued to the non-default stream does not block the execution of the kernel issued to the default stream.
I want to update Robert Crovella's answer in the light of the newly issued CUDA 7.0 which, as of March 2015, is in the Release Candidate version.
With CUDA 7.0, default streams are regular streams in the sense that commands in the default stream may run concurrently with commands in non-default streams. A more detailed explanation of this new feature can be found at
CUDA 7 Streams Simplify Concurrency
This feature can be enabled simply by adding the --default-stream per-thread compilation option.
At the page linked above, an example worked out by Mark Harris can be found. Here, I want to resume the example I posted at False dependency issue for the Fermi architecture. In particular, in the new example below, although I'm creating 3 streams, I'm not using anymore the first one and adopting the default stream in its place.
This is the timeline produced without the --default-stream per-thread compilation option:
As you can see, the execution in the default stream does not exploit concurrency.
On the other hand, this is the timeline produced with the --default-stream per-thread compilation option:
As you can see now, the default stream execution overlaps with the other two streams execution.
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "Utilities.cuh"
using namespace std;
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
// Writes out[i] = in[i] * in[i] for indices below N. Each thread starts at its
// global index and advances by the total number of launched threads, so any
// launch configuration covers the whole range.
//   in  - input values
//   out - receives the squares
//   N   - number of elements to process
__global__ void kernel(const int *in, int *out, int N)
{
    const unsigned int step = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    while (idx < N) {
        const int v = in[idx];
        out[idx] = v * v;
        idx += step;
    }
}
// Squares N integers on the GPU in NUM_STREAMS chunks. Chunk 0 is issued to the
// default stream (streams[0] is created but deliberately left unused), the other
// chunks go to streams[1..]. Built with --default-stream per-thread, the default
// stream work can overlap with the other streams.
int main()
{
    const int N = 6000000;

    // --- Host side input data allocation and initialization. The buffer is
    //     registered as page-locked, which cudaMemcpyAsync needs to be asynchronous.
    int *h_in = new int[N];
    for (int i = 0; i < N; i++) h_in[i] = 5;
    gpuErrchk(cudaHostRegister(h_in, N * sizeof(int), cudaHostRegisterPortable));

    // --- Host side output data allocation and initialization, page-locked as well.
    int *h_out = new int[N];
    for (int i = 0; i < N; i++) h_out[i] = 0;
    gpuErrchk(cudaHostRegister(h_out, N * sizeof(int), cudaHostRegisterPortable));

    // --- Host side reference result
    int *h_checkResults = new int[N];
    for (int i = 0; i < N; i++) h_checkResults[i] = h_in[i] * h_in[i];

    // --- Device side input and output data allocation
    int *d_in = 0;
    gpuErrchk(cudaMalloc((void **)&d_in, N * sizeof(int)));
    int *d_out = 0;
    gpuErrchk(cudaMalloc((void **)&d_out, N * sizeof(int)));

    // Per-stream chunk. N is chosen as a multiple of NUM_STREAMS, so the chunks
    // cover the whole array.
    const int streamSize = N / NUM_STREAMS;
    const size_t streamMemSize = N * sizeof(int) / NUM_STREAMS;
    // Each chunk is processed by two launches; the second takes whatever the
    // first half leaves, so an odd chunk size would not drop its last element.
    const int halfSize = streamSize / 2;

    // --- Set kernel launch configuration
    dim3 nThreads = dim3(NUM_THREADS,1,1);
    dim3 nBlocks = dim3(NUM_BLOCKS, 1,1);
    dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

    // --- Create CUDA streams
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++)
        gpuErrchk(cudaStreamCreate(&streams[i]));

    /**************************/
    /* BREADTH-FIRST APPROACH */
    /**************************/
    // 1) Host-to-device copies: chunk 0 in the default stream, the rest in their own streams.
    gpuErrchk(cudaMemcpyAsync(&d_in[0], &h_in[0], streamMemSize, cudaMemcpyHostToDevice, 0));
    for (int i = 1; i < NUM_STREAMS; i++) {
        const int offset = i * streamSize;
        gpuErrchk(cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize,
                                  cudaMemcpyHostToDevice, streams[i]));
    }

    // 2) Kernels: chunk 0 in the default stream, the rest in their own streams.
    kernel<<<subKernelBlock, nThreads>>>(&d_in[0], &d_out[0], halfSize);
    kernel<<<subKernelBlock, nThreads>>>(&d_in[halfSize], &d_out[halfSize], streamSize - halfSize);
    for (int i = 1; i < NUM_STREAMS; i++) {
        const int offset = i * streamSize;
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], halfSize);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + halfSize], &d_out[offset + halfSize],
                                                           streamSize - halfSize);
    }
    // Launches do not return a status; query it once after issuing them all.
    gpuErrchk(cudaGetLastError());

    // 3) Device-to-host copies, each issued in the stream that produced the chunk.
    //    The original copied chunks 1.. a second time through the default stream;
    //    that copy was redundant and, with a per-thread default stream, not
    //    ordered after the kernels that write those chunks, so it was removed.
    for (int i = 1; i < NUM_STREAMS; i++) {
        const int offset = i * streamSize;
        gpuErrchk(cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize,
                                  cudaMemcpyDeviceToHost, streams[i]));
    }
    gpuErrchk(cudaMemcpyAsync(&h_out[0], &d_out[0], streamMemSize, cudaMemcpyDeviceToHost, 0));

    for (int i = 0; i < NUM_STREAMS; i++)
        gpuErrchk(cudaStreamSynchronize(streams[i]));
    // Also waits for the work issued to the default stream.
    gpuErrchk(cudaDeviceSynchronize());

    // --- Release resources
    gpuErrchk(cudaHostUnregister(h_in));
    gpuErrchk(cudaHostUnregister(h_out));
    gpuErrchk(cudaFree(d_in));
    gpuErrchk(cudaFree(d_out));
    for (int i = 0; i < NUM_STREAMS; i++)
        gpuErrchk(cudaStreamDestroy(streams[i]));
    gpuErrchk(cudaDeviceReset());

    // --- GPU output check. Absolute differences are accumulated so that
    //     positive and negative errors cannot cancel each other out.
    long long sum = 0;
    for (int i = 0; i < N; i++) {
        const long long diff = (long long)h_checkResults[i] - (long long)h_out[i];
        sum += (diff < 0) ? -diff : diff;
    }
    cout << "Error between CPU and GPU: " << sum << endl;

    delete[] h_in;
    delete[] h_out;
    delete[] h_checkResults;
    return 0;
}