CUDA pinned memory flushing from the device

CUDA 5, compute capability 3.5, VS 2012, 64-bit Windows Server 2012.
There is no shared memory access between threads, every thread is standalone.
I am using pinned memory with zero-copy. From the host, I can only read the values the device has written to the pinned memory after I issue a cudaDeviceSynchronize on the host.
I want to be able to:
Flush into the pinned memory as soon as the device has updated it.
Not block the device thread (maybe by copying asynchronously)
I tried calling __threadfence_system and __threadfence after each device write, but that did not flush the data to the host.
Below is a full CUDA sample that demonstrates my question:
#include <conio.h>
#include <cstdio>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    printf("Kernel %d: Before Writing in Kernel\n", tid);
    hResult[tid] = tid + 1;
    __threadfence_system();
    // expecting that the data is getting flushed to host here!
    printf("Kernel %d: After Writing in Kernel\n", tid);
    // time-waster for-loop (sleep)
    for (int timeWaster = 0; timeWaster < 100000000; timeWaster++);
}
void main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
    Kernel<<<1, blocks>>>(hResult);
    int filledElementsCounter = 0;
    // naive polling loop that could also be implemented
    // on another host thread
    while (filledElementsCounter < blocks)
    {
        // blocks until the value changes; this moves sequentially
        // while threads have no order (fine for this sample).
        while (hResult[filledElementsCounter] == 0);
        printf("%f\n", hResult[filledElementsCounter]);
        filledElementsCounter++;
    }
    cudaFreeHost((void *)hResult);
    system("pause");
}
Currently this sample waits indefinitely, as nothing written by the device becomes visible on the host unless I issue cudaDeviceSynchronize. The sample below works, but it is NOT what I want, as it defeats the purpose of asynchronous copying:
void main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
    Kernel<<<1, blocks>>>(hResult);
    cudaError_t error = cudaDeviceSynchronize();
    if (error != cudaSuccess) { throw; }
    for (int i = 0; i < blocks; i++)
    {
        printf("%f\n", hResult[i]);
    }
    cudaFreeHost((void *)hResult);
    system("pause");
}

I played with your code on CentOS 6.2 with CUDA 5.5 and a Tesla M2090 and can conclude this:
The problem that it does not work on your system must be a driver issue, and I suggest you try the TCC drivers.
Below is my code, which runs fine and does what you want. The values appear on the host side before the kernel ends. As you can see, I added some compute code to prevent the for loop from being removed by compiler optimizations. I also added a stream and a callback that is executed after all work in the stream has finished. The program outputs 1 2 and then does nothing for a long time, until stream finished... is printed to the console.
#include <iostream>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define SEC_CUDA_CALL(val) checkCall ( (val), #val, __FILE__, __LINE__ )

bool checkCall(cudaError_t result, char const* const func, const char *const file, int const line)
{
    if (result != cudaSuccess)
    {
        std::cout << "CUDA (runtime api) error: " << func << " failed! " << cudaGetErrorString(result) << " (" << result << ") " << file << ":" << line << std::endl;
    }
    return result != cudaSuccess;
}

class Callback
{
public:
    static void CUDART_CB dispatch(cudaStream_t stream, cudaError_t status, void *userData);
private:
    void call();
};

void CUDART_CB Callback::dispatch(cudaStream_t stream, cudaError_t status, void *userData)
{
    Callback* cb = (Callback*) userData;
    cb->call();
}

void Callback::call()
{
    std::cout << "stream finished..." << std::endl;
}

__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    hResult[tid] = tid + 1;
    __threadfence_system();
    float A = 0;
    for (int timeWaster = 0; timeWaster < 100000000; timeWaster++)
    {
        A = sin(cos(log(hResult[0] * hResult[1]))) + A;
        A = sqrt(A);
    }
}

int main(int argc, char* argv[])
{
    size_t blocks = 2;
    volatile float* hResult;
    SEC_CUDA_CALL(cudaHostAlloc((void**)&hResult, blocks*sizeof(float), cudaHostAllocMapped));
    cudaStream_t stream;
    SEC_CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    Callback obj;
    Kernel<<<1, blocks, 0, stream>>>(hResult);
    SEC_CUDA_CALL(cudaStreamAddCallback(stream, Callback::dispatch, &obj, 0));
    int filledElementsCounter = 0;
    while (filledElementsCounter < blocks)
    {
        while (hResult[filledElementsCounter] == 0);
        std::cout << hResult[filledElementsCounter] << std::endl;
        filledElementsCounter++;
    }
    SEC_CUDA_CALL(cudaStreamDestroy(stream));
    SEC_CUDA_CALL(cudaFreeHost((void *)hResult));
}
No call returned an error and cuda-memcheck didn't find any problems. This works as intended. You should really try the TCC driver.

You cannot pass the host pointer directly to the kernel. If you allocate host memory using cudaHostAlloc with the cudaHostAllocMapped flag, then you first have to retrieve the device pointer of the mapped host memory before you can use it in the kernel. Use cudaHostGetDevicePointer to get the device pointer of the mapped host memory.
float* hResult, *dResult;
cudaHostAlloc((void**)&hResult, blocks*sizeof(float), cudaHostAllocMapped);
cudaHostGetDevicePointer(&dResult, hResult, 0);
Kernel<<<1,blocks>>>(dResult);
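Depending on how the context is set up, mapped pinned memory may also need to be enabled on the device before the allocation. A sketch of the fuller sequence (assuming the default device is used):
cudaSetDeviceFlags(cudaDeviceMapHost);  // must be called before the CUDA context is created

float *hResult = 0, *dResult = 0;
cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
cudaHostGetDevicePointer(&dResult, hResult, 0);  // the flags argument must currently be 0
Kernel<<<1, blocks>>>(dResult);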

Calling __threadfence_system() will ensure that the write is visible to the system before proceeding, but the CPU will be caching the hResult variable, so you are just spinning on the old value in an infinite loop. Try marking hResult as volatile.

Related

terminate called after throwing an instance of 'thrust::system::system_error' what(): parallel_for failed: cudaErrorInvalidValue: invalid argument

I am trying to count the number of times curand_uniform() returns 1.0. However, I can't seem to get the following code to work for me:
#include <stdio.h>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>

using namespace std;

__global__
void counts(int length, int *sum, curandStatePhilox4_32_10_t* state) {
    int tempsum = int(0);
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    curandStatePhilox4_32_10_t localState = state[i];
    for (; i < length; i += blockDim.x * gridDim.x) {
        double thisnum = curand_uniform(&localState);
        if (thisnum == 1.0) {
            tempsum += 1;
        }
    }
    atomicAdd(sum, tempsum);
}

__global__
void curand_setup(curandStatePhilox4_32_10_t *state, long seed) {
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, id, 0, &state[id]);
}

int main(int argc, char *argv[]) {
    const int N = 1e5;
    int* count_h = 0;
    int* count_d;
    cudaMalloc(&count_d, sizeof(int));
    cudaMemcpy(count_d, count_h, sizeof(int), cudaMemcpyHostToDevice);
    int threads_per_block = 64;
    int Nblocks = 32*6;
    thrust::device_vector<curandStatePhilox4_32_10_t> d_state(Nblocks*threads_per_block);
    curand_setup<<<Nblocks, threads_per_block>>>(d_state.data().get(), time(0));
    counts<<<Nblocks, threads_per_block>>>(N, count_d, d_state.data().get());
    cudaMemcpy(count_h, count_d, sizeof(int), cudaMemcpyDeviceToHost);
    cout << count_h << endl;
    cudaFree(count_d);
    free(count_h);
}
I am getting the following terminal error (on Linux):
terminate called after throwing an instance of 'thrust::system::system_error'
what(): parallel_for failed: cudaErrorInvalidValue: invalid argument
Aborted (core dumped)
And I am compiling like this:
nvcc -Xcompiler "-fopenmp" -o test uniform_one_hit_count.cu
I don't understand this error message.
This line:
thrust::device_vector<curandStatePhilox4_32_10_t> d_state(Nblocks*threads_per_block);
is initializing a new vector on the device. When thrust does that, it calls the constructor for the object in use, in this case curandStatePhilox4_32_10_t, a struct whose definition is in /usr/local/cuda/include/curand_philox4x32_x.h (on Linux, anyway). Unfortunately, that struct definition doesn't provide any constructors decorated with __device__, and this is causing trouble for thrust.
A simple workaround would be to assemble the vector on the host and copy it to the device:
thrust::host_vector<curandStatePhilox4_32_10_t> h_state(Nblocks*threads_per_block);
thrust::device_vector<curandStatePhilox4_32_10_t> d_state = h_state;
Alternatively, just use cudaMalloc to allocate space:
curandStatePhilox4_32_10_t *d_state;
cudaMalloc(&d_state, (Nblocks*threads_per_block)*sizeof(d_state[0]));
You have at least one other problem as well. This is not actually providing a proper allocation of storage for what the pointer should be pointing to:
int* count_h = 0;
after that, you should do something like:
count_h = (int *)malloc(sizeof(int));
memset(count_h, 0, sizeof(int));
and on your print-out line, you most likely want to do this:
cout << count_h[0] << endl;
The other way to address the count_h issue would be to start with:
int count_h = 0;
and this would necessitate a different set of changes to your code (to the cudaMemcpy operations).
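A sketch of those changes (the cudaMemcpy calls now take the address of the stack variable, and the free(count_h) call goes away):
int count_h = 0;
int* count_d;
cudaMalloc(&count_d, sizeof(int));
cudaMemcpy(count_d, &count_h, sizeof(int), cudaMemcpyHostToDevice);
// ... kernel setup and launches unchanged ...
cudaMemcpy(&count_h, count_d, sizeof(int), cudaMemcpyDeviceToHost);
cout << count_h << endl;
cudaFree(count_d);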

Unified memory and struct with arrays

I have a big struct of arrays of structs in CUDA that is constant and read-only for my application. A quite simplified example would be:
struct Node{
    int* pos;
    int nPos;
};

struct Graph{
    Node * nodes;
    int nNode;
};
My kernels need to navigate this graph and query it. As you know, copying this struct to GPU memory with cudaMalloc and cudaMemcpy is just a lot of code, which unified memory is supposed to remove the need for.
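For reference, this is roughly the explicit deep copy I would like to avoid writing (a sketch based on the simplified structs above):
// sketch: explicit deep copy of the simplified Graph, the boilerplate UM should remove
Graph copyGraphToDevice(const Graph& h_graph)
{
    Graph d_graph = h_graph;                      // copies nNode by value
    cudaMalloc(&d_graph.nodes, h_graph.nNode * sizeof(Node));
    for (int i = 0; i < h_graph.nNode; i++){
        Node tmp = h_graph.nodes[i];              // copies nPos by value
        cudaMalloc(&tmp.pos, tmp.nPos * sizeof(int));
        cudaMemcpy(tmp.pos, h_graph.nodes[i].pos,
                   tmp.nPos * sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(&d_graph.nodes[i], &tmp, sizeof(Node),
                   cudaMemcpyHostToDevice);
    }
    return d_graph;                               // d_graph.nodes now points to device memory
}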
In my code, I generate the graph on the CPU and then, for testing, I designed the following kernel:
__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
};
being called as:
// using malloc for testing to make sure I know what I am doing
int * d_res,* h_res;
cudaMalloc((void **)&d_res,sizeof(int));
h_res=(int*)malloc(sizeof(int));
testKernel<<<1,1>>>(graph,d_res);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaMemcpy(h_res,d_res,sizeof(int),cudaMemcpyDeviceToHost));
with the error checks from here.
When I use the testKernel as is shown, it works fine, but if I change the kernel to:
__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nodes[0].nPos;
};
I get illegal memory access errors.
Is this because the unified memory does not handle this type of data correctly?
Is there a way to make sure I can avoid writing all the explicit copies to GPU memory?
Full MCVE:
#include <algorithm>
#include <stdio.h>
#include <cuda_runtime_api.h>
#include <cuda.h>

typedef struct node{
    int* pos;
    int nPos;
}Node;

typedef struct Graph{
    Node * nodes;
    int nNode;
}Graph;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
    // d_res[0] = graph.nodes[0].nPos; // Not working
};

int main(void){
    // fake data, this comes from another process
    Graph graph;
    graph.nodes = (Node*)malloc(2 * sizeof(Node));
    graph.nNode = 2;
    for (int i = 0; i < 2; i++){
        // They can have different sizes in the original code
        graph.nodes[i].pos = (int*)malloc(3 * sizeof(int));
        graph.nodes[i].pos[0] = 0;
        graph.nodes[i].pos[1] = 1;
        graph.nodes[i].pos[2] = 2;
        graph.nodes[i].nPos = 3;
    }
    printf("%d\n", graph.nNode); // Change to the kernel variable for comparison
    int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(int));
    h_res = (int*)malloc(sizeof(int));
    testKernel<<<1, 1>>>(graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));
    printf("%d", h_res[0]);
    return 0;
}
Your code isn't using CUDA unified memory. UM is not "automatic" in any way. It requires specific programming steps to take advantage of it and it has specific system requirements.
All of this is covered in the UM section of the programming guide.
Is there a way to make sure I can avoid writing all the explicit copies to GPU memory?
Proper use of UM should allow this. Here is a fully worked example. The only thing I have done is mechanically convert your malloc operations in host code to equivalent cudaMallocManaged operations.
$ cat t1389.cu
#include <algorithm>
#include <stdio.h>

typedef struct node{
    int* pos;
    int nPos;
}Node;

typedef struct Graph{
    Node * nodes;
    int nNode;
}Graph;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
    d_res[0] = graph.nodes[0].nPos; // works now that the graph uses managed allocations
};

int main(void){
    // fake data, this comes from another process
    Graph graph;
    cudaMallocManaged(&(graph.nodes), 2*sizeof(Node));
    graph.nNode = 2;
    for (int i = 0; i < 2; i++){
        // They can have different sizes in the original code
        cudaMallocManaged(&(graph.nodes[i].pos), 3 * sizeof(int));
        graph.nodes[i].pos[0] = 0;
        graph.nodes[i].pos[1] = 1;
        graph.nodes[i].pos[2] = 2;
        graph.nodes[i].nPos = 3;
    }
    printf("%d\n", graph.nNode); // Change to the kernel variable for comparison
    int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(int));
    h_res = (int*)malloc(sizeof(int));
    testKernel<<<1, 1>>>(graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));
    printf("%d", h_res[0]);
    return 0;
}
$ nvcc t1389.cu -o t1389
$ cuda-memcheck ./t1389
========= CUDA-MEMCHECK
2
3========= ERROR SUMMARY: 0 errors
$
UM has a number of system requirements that are documented. I'm not going to try to recite them all here. Primarily you need a cc3.0 or higher GPU. Your MCVE did not include any standard error checking, and I didn't try to add it. But if you still have problems with this code, be sure to use proper CUDA error checking and run it with cuda-memcheck.
If your entire data structure, including embedded pointers, is allocated using ordinary host allocators, and you have no control over that, then you won't be able to use it directly in a UM regime, without doing some sort of involved copying. The exception here would be on an IBM Power9 system as mentioned in section K.1.6 of the above linked programming guide section.
Before attempting to use a host allocator (e.g. malloc) with UM, you should first test the pageableMemoryAccessUsesHostPageTables property, as mentioned in that section.
That property currently won't be set on any system except a properly configured IBM Power9 system. No x86 system currently has this property set/available.
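That attribute can be queried through the runtime API; a minimal sketch (assuming device 0 and a CUDA version recent enough to expose the attribute):
int value = 0;
cudaDeviceGetAttribute(&value, cudaDevAttrPageableMemoryAccessUsesHostPageTables, 0);
if (value) {
    // ordinary malloc'd host memory can be accessed from device code on this system
}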

Checking currently residing entities in GPU memory

What would be the easiest way of checking which entities (and their sizes) allocated with cudaMalloc() currently reside on a GPU device? I want to find a memory leak inside a function: if it is called just once and exits, there is no memory leak (checked via cuda-memcheck), but if it is called multiple times the memory footprint gets bigger and bigger.
Nsight Visual Profiler seems too complex for what I ask and cuda-memcheck finds no leak!
There is no way to do this with the CUDA APIs. If you did want to do this, you would need to build your own instrumentation system which wraps the CUDA memory allocation/deallocation APIs that your code calls. The simplest implementation could look something like this:
#include <iostream>
#include <vector>
#include <algorithm>

typedef std::pair<void*, size_t> mrecord;

struct mymemory
{
    std::vector<mrecord> mstack;

    mymemory() {};
    cudaError_t cudaMalloc(void** p, size_t sz);
    cudaError_t cudaFree(void* p);
    void print_stack();
};

cudaError_t mymemory::cudaMalloc(void** p, size_t sz)
{
    cudaError_t ret = ::cudaMalloc(p, sz);
    if (ret == cudaSuccess) {
        mstack.push_back(mrecord(*p, sz));
    }
    return ret;
};

cudaError_t mymemory::cudaFree(void* p)
{
    cudaError_t ret = ::cudaFree(p);
    if (ret == cudaSuccess) {
        auto rit = std::find_if( mstack.begin(), mstack.end(),
                                 [&](const mrecord& r){ return r.first == p; } );
        if (rit != mstack.end()) {
            mstack.erase(rit);
        }
    }
    return ret;
};

void mymemory::print_stack()
{
    auto it = mstack.begin();
    for (; it != mstack.end(); ++it) {
        mrecord rec = *it;
        std::cout << rec.first << " : " << rec.second << std::endl;
    }
}

int main(void)
{
    const int nallocs = 10;
    void* pointers[nallocs];
    mymemory mdebug;
    for (int i = 0; i < nallocs; ++i) {
        mdebug.cudaMalloc(&pointers[i], 4 << i);
    }
    std::cout << "After Allocation" << std::endl;
    mdebug.print_stack();

    mdebug.cudaFree(pointers[1]);
    mdebug.cudaFree(pointers[7]);
    mdebug.cudaFree(pointers[8]);
    mdebug.cudaFree(0);
    std::cout << "After Deallocation" << std::endl;
    mdebug.print_stack();

    return 0;
}
[Warning: only very lightly tested and requires C++11 compiler support]
which would do this:
~/SO$ nvcc -std=c++11 -g -arch=sm_52 instrumentation.cu
~/SO$ ./a.out
After Allocation
0x705e40000 : 4
0x705e40200 : 8
0x705e40400 : 16
0x705e40600 : 32
0x705e40800 : 64
0x705e40a00 : 128
0x705e40c00 : 256
0x705e40e00 : 512
0x705e41000 : 1024
0x705f40000 : 2048
After Deallocation
0x705e40000 : 4
0x705e40400 : 16
0x705e40600 : 32
0x705e40800 : 64
0x705e40a00 : 128
0x705e40c00 : 256
0x705f40000 : 2048
This might be enough to understand which memory allocations are leaking. But be aware that memory management on the GPU isn't as predictable as you might believe it to be, and you need to be careful when diagnosing a memory leak just on the basis of the amount of free memory which the device reports at any given instant. See this question for some more details.
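For reference, the free-memory figure mentioned above can be queried from the runtime API like this:
size_t free_bytes = 0, total_bytes = 0;
cudaMemGetInfo(&free_bytes, &total_bytes);
std::cout << free_bytes << " of " << total_bytes
          << " bytes currently free on the device" << std::endl;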

Counting occurrences of specific events in CUDA kernels

Problem
I am trying to find the best way to count how many times my program ends up in some specific branches of my CUDA kernels. The idea is that some events should almost never happen, but since the data processed by the GPU is given by a numerical optimization solver, there may be some situations where ill-defined cases become more common. Thus, I want to be able to track/monitor these phenomena over multiple simulations to make some global statistics later.
Possible idea
The most straightforward way to do this may be to use a structure dedicated to monitoring such occurrences. Then, when entering a monitored branch, we increment the associated counter using atomicAdd. At the end of the simulation, we copy the counters back to the host and store them for some future statistics processing.
In my case, the cost of using atomicAdd should not matter much, since I should not be entering those branches very often, but I may want to monitor some of the more common branches later on, so what would be a better approach in that case? Since this is just for monitoring, I do not want the overhead to be too large.
I guess I could also have one monitoring structure per block and do a sum at the end, since it should not use much global memory anyway (1 unsigned int per monitored branch).
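A rough sketch of that per-block idea, based on the kernel in the code example below (one shared-memory counter per monitored branch, flushed to global memory once per block):
__global__ void test_kernel_blockwise(int* A, int* B, Stats* stats)
{
    __shared__ unsigned int block_even;           // one counter per monitored branch
    if (threadIdx.x == 0)
        block_even = 0;
    __syncthreads();

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int res = A[tid] + (int)tid;
    if (res % 2 == 0)
        atomicAdd(&block_even, 1);                // cheap shared-memory atomic
    B[tid] = res;

    __syncthreads();
    if (threadIdx.x == 0)
        atomicAdd(&(stats->even), block_even);    // one global atomic per block
}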
Code example
#include <iostream>
#include <time.h>
#include <string.h>
#include <cuda.h>
#include <stdio.h>

#define CUDA_CHECK_ERROR() __cuda_check_errors(__FILE__, __LINE__)
#define CUDA_SAFE_CALL(err) __cuda_safe_call(err, __FILE__, __LINE__)

inline void __cuda_check_errors(const char *filename, const int line_number)
{
    cudaError err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        printf("CUDA error %i at %s:%i: %s\n",
               err, filename, line_number, cudaGetErrorString(err));
        exit(-1);
    }
}

inline void __cuda_safe_call(cudaError err, const char *filename, const int line_number)
{
    if (err != cudaSuccess)
    {
        printf("CUDA error %i at %s:%i: %s\n",
               err, filename, line_number, cudaGetErrorString(err));
        exit(-1);
    }
}

struct Stats
{
    unsigned int even;
};

__global__ void test_kernel(int* A, int* B, Stats* stats)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int res = A[tid] + (int)tid;
    if (res % 2 == 0)
        atomicAdd(&(stats->even), 1);
    B[tid] = res;
}

int get_random_int(int min, int max)
{
    return min + (rand() % (int)(max - min + 1));
}

void print_array(int* ar, unsigned int n)
{
    for (unsigned int i = 0; i < n; ++i)
        std::cout << ar[i] << " ";
    std::cout << std::endl;
}

void print_stats(Stats* s)
{
    std::cout << "even: " << s->even << std::endl;
}

int main()
{
    // vector size
    const unsigned int N = 10;
    // device vectors
    int *d_A, *d_B;
    Stats *d_stats;
    // host vectors
    int *h_A, *h_B;
    Stats *h_stats;

    // allocate device memory
    CUDA_SAFE_CALL(cudaMalloc(&d_A, N * sizeof(int)));
    CUDA_SAFE_CALL(cudaMalloc(&d_B, N * sizeof(int)));
    CUDA_SAFE_CALL(cudaMalloc(&d_stats, sizeof(Stats)));

    // allocate host memory
    h_A = new int[N];
    h_B = new int[N];
    h_stats = new Stats;

    // initialize host data
    srand(time(NULL));
    for (unsigned int i = 0; i < N; ++i)
    {
        h_A[i] = get_random_int(0, 10);
        h_B[i] = 0;
    }
    memset(h_stats, 0, sizeof(Stats));

    // copy data to the device
    CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, N * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_stats, h_stats, sizeof(Stats), cudaMemcpyHostToDevice));

    // launch kernel
    dim3 grid_size, block_size;
    grid_size.x = N;
    test_kernel<<<grid_size, block_size>>>(d_A, d_B, d_stats);

    // copy result back to host
    CUDA_SAFE_CALL(cudaMemcpy(h_B, d_B, N * sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaMemcpy(h_stats, d_stats, sizeof(Stats), cudaMemcpyDeviceToHost));

    print_array(h_B, N);
    print_stats(h_stats);

    // free device memory
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_stats));

    // free host memory
    delete [] h_A;
    delete [] h_B;
    delete h_stats;
}
Hardware/software information
The solution I am looking for should work for CC >= 2.0 devices and CUDA >= 5.0.
The atomicAdd is one possibility and I would probably go that route. If you do not use the result of the atomicAdd function call, the compiler will emit a reduction operation such as RED.E.ADD. Reduction is very fast as long as there are not many conflicts happening (I actually use it sometimes even if I do not need the operation to be atomic, because it can be quicker than loading a value from global memory, doing an arithmetic operation and saving it back to global memory).
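A short illustration of the difference (a sketch; the generated SASS can be inspected with cuobjdump -sass to confirm which instruction is emitted):
__global__ void count_events(const int* A, Stats* stats)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (A[tid] % 2 == 0)
        atomicAdd(&(stats->even), 1);  // result unused: may compile to a RED instruction
    // unsigned int old = atomicAdd(&(stats->even), 1);  // result used: compiles to an ATOM instruction
}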
The second option you have is to use a profiler counter and analyze the result with the profiler. Please see the Profiler Counter Function section of the programming guide for more details.
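A sketch of how the monitored branch could use it instead of the atomicAdd (counter index 0 is an arbitrary choice; the count then shows up in the profiler rather than in the Stats struct):
__global__ void test_kernel_prof(int* A, int* B)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int res = A[tid] + (int)tid;
    if (res % 2 == 0)
        __prof_trigger(0);  // bumps a per-multiprocessor hardware counter (prof_trigger_00)
    B[tid] = res;
}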

cuda - code doesn't enter the kernel

I am trying to learn CUDA. I am trying to run a simple piece of code:
#include <stdlib.h>
#include <stdio.h>

__global__ void kernel(int *array)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    array[index] = 7;
}

int main(void)
{
    int num_elements = 256;
    int num_bytes = num_elements * sizeof(int);

    // pointers to host & device arrays
    int *device_array = 0;
    int *host_array = 0;

    // malloc a host array
    host_array = (int*)malloc(num_bytes);

    // cudaMalloc a device array
    cudaMalloc((void**)&device_array, num_bytes);

    int block_size = 128;
    int grid_size = num_elements / block_size;

    kernel<<<grid_size, block_size>>>(device_array);

    // download and inspect the result on the host:
    cudaMemcpy(host_array, device_array, num_bytes, cudaMemcpyDeviceToHost);

    // print out the result element by element
    for (int i = 0; i < num_elements; ++i)
    {
        printf("%d ", host_array[i]);
    }

    // deallocate memory
    free(host_array);
    cudaFree(device_array);
}
It is supposed to print 7s, but it prints 0s.
The statement kernel<<<grid_size,block_size>>>(device_array); doesn't seem to get executed.
It doesn't give any compilation error either.
Any help?
The code runs fine on my machine, but make sure you add cudaDeviceSynchronize and error checking after the kernel call.
Change the code as follows to check for errors:
kernel<<<grid_size,block_size>>>(device_array);

// wait until tasks are completed
cudaDeviceSynchronize();

// check for errors
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
    fprintf(stderr, "ERROR: %s \n", cudaGetErrorString(error));
}
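Since cudaMemcpy can also report errors from previous asynchronous launches, its return value is worth checking in the same way (a sketch reusing the pattern above):
cudaError_t copyError = cudaMemcpy(host_array, device_array, num_bytes, cudaMemcpyDeviceToHost);
if (copyError != cudaSuccess) {
    fprintf(stderr, "ERROR: %s \n", cudaGetErrorString(copyError));
}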