Contiguous Memory Allocation on GPU

Does cudaMalloc allocate contiguous chunks of memory (i.e., physical bytes next to each other)?
I have a piece of CUDA code that simply copies 128 bytes from global device memory to shared memory, using 32 threads. I am trying to find a way to guarantee that this transfer can be completed in one memory transaction of 128 bytes. If cudaMalloc allocates contiguous memory blocks, then it can easily be done.
Following is the code:
#include <iostream>
using namespace std;

#define SIZE 32        // size of the array to store in shared memory
#define NUMTHREADS 32

__global__ void copy(uint* memPointer)
{
    extern __shared__ uint bits[];
    int tid = threadIdx.x;
    bits[tid] = memPointer[tid];
}

int main()
{
    uint inputData[SIZE];
    uint* storedData;
    for (int i = 0; i < SIZE; i++) {
        inputData[i] = i;
    }

    cudaError_t e1 = cudaMalloc((void**)&storedData, sizeof(uint) * SIZE);
    if (e1 == cudaSuccess) {
        cudaError_t e3 = cudaMemcpy(storedData, inputData, sizeof(uint) * SIZE,
                                    cudaMemcpyHostToDevice);
        if (e3 == cudaSuccess) {
            copy<<<1, NUMTHREADS, SIZE * 4>>>(storedData);
            cudaError_t e6 = cudaFree(storedData);
            if (e6 != cudaSuccess) {
                cout << "Error freeing memory storedData" << e6 << endl;
            }
        }
        else {
            cout << "Failed to copy" << " " << e3 << endl;
        }
    }
    else {
        cout << "Failed to allocate memory" << " " << e1 << endl;
    }
    return 0;
}

Yes, cudaMalloc allocates contiguous chunks of memory. The "Matrix Transpose" example in the SDK (http://developer.nvidia.com/cuda-cc-sdk-code-samples) has a kernel called "copySharedMem" that does almost exactly what you're describing.
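For reference, here is a minimal sketch in the spirit of that copySharedMem kernel (an approximation, not the SDK source verbatim). Each of the 32 threads in a warp reads one consecutive 4-byte word, so the warp's request falls in a single aligned 128-byte segment and can be serviced in one transaction on hardware that supports it:
// Sketch: one warp stages 128 consecutive bytes (32 x 4-byte words) in shared memory.
// Assumes blockDim.x == 32 and that 'src' comes straight from cudaMalloc, so it is
// suitably aligned and the accesses below are fully coalesced.
__global__ void stage128(const unsigned int* __restrict__ src, unsigned int* dst)
{
    __shared__ unsigned int tile[32];
    int tid = threadIdx.x;
    tile[tid] = src[tid];     // consecutive words -> one 128-byte transaction
    __syncthreads();          // make the staged data visible to the whole block
    dst[tid] = tile[tid];     // write it back out (stand-in for real work)
}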

Related

Checking currently residing entities in GPU memory

What would be the easiest way of checking which entities (and their sizes) allocated with cudaMalloc() currently reside on a GPU device? I want to find a memory leak inside a function: if it is called just once and the program exits, there is no memory leak (checked via cuda-memcheck), but if it is called multiple times the memory footprint gets bigger and bigger.
Nsight Visual Profiler seems too complex for what I ask, and cuda-memcheck finds no leak!
There is no way to do this with the CUDA APIs. If you wanted to do this, you would need to build your own instrumentation layer that wraps the CUDA memory allocation/deallocation APIs your code calls. The simplest implementation could look something like this:
#include <iostream>
#include <vector>
#include <algorithm>

typedef std::pair<void*, size_t> mrecord;

struct mymemory
{
    std::vector<mrecord> mstack;

    mymemory() {};
    cudaError_t cudaMalloc(void** p, size_t sz);
    cudaError_t cudaFree(void* p);
    void print_stack();
};

cudaError_t mymemory::cudaMalloc(void** p, size_t sz)
{
    cudaError_t ret = ::cudaMalloc(p, sz);
    if (ret == cudaSuccess) {
        mstack.push_back(mrecord(*p, sz));
    }
    return ret;
};

cudaError_t mymemory::cudaFree(void* p)
{
    cudaError_t ret = ::cudaFree(p);
    if (ret == cudaSuccess) {
        auto rit = std::find_if( mstack.begin(), mstack.end(),
                                 [&](const mrecord& r){ return r.first == p; } );
        if (rit != mstack.end()) {
            mstack.erase(rit);
        }
    }
    return ret;
};

void mymemory::print_stack()
{
    auto it = mstack.begin();
    for(; it != mstack.end(); ++it) {
        mrecord rec = *it;
        std::cout << rec.first << " : " << rec.second << std::endl;
    }
}

int main(void)
{
    const int nallocs = 10;
    void* pointers[nallocs];

    mymemory mdebug;
    for(int i = 0; i < nallocs; ++i) {
        mdebug.cudaMalloc(&pointers[i], 4 << i);
    }
    std::cout << "After Allocation" << std::endl;
    mdebug.print_stack();

    mdebug.cudaFree(pointers[1]);
    mdebug.cudaFree(pointers[7]);
    mdebug.cudaFree(pointers[8]);
    mdebug.cudaFree(0);

    std::cout << "After Deallocation" << std::endl;
    mdebug.print_stack();

    return 0;
}
[Warning: only very lightly tested; requires C++11 compiler support]
which would do this:
~/SO$ nvcc -std=c++11 -g -arch=sm_52 instrumentation.cu
~/SO$ ./a.out
After Allocation
0x705e40000 : 4
0x705e40200 : 8
0x705e40400 : 16
0x705e40600 : 32
0x705e40800 : 64
0x705e40a00 : 128
0x705e40c00 : 256
0x705e40e00 : 512
0x705e41000 : 1024
0x705f40000 : 2048
After Deallocation
0x705e40000 : 4
0x705e40400 : 16
0x705e40600 : 32
0x705e40800 : 64
0x705e40a00 : 128
0x705e40c00 : 256
0x705f40000 : 2048
This might be enough to understand which memory allocations are leaking. But be aware that memory management on the GPU isn't as predictable as you might believe, and you need to be careful when diagnosing a memory leak purely on the basis of the amount of free memory the device reports at any given instant. See this question for some more details.
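As a cruder complement to the wrapper, you can also snapshot the free memory the runtime reports around repeated calls to the suspect function; given the caveat just mentioned, a growing delta is a hint rather than proof. A minimal sketch, where suspect_function is a hypothetical placeholder for your own code:
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical stand-in for the function suspected of leaking device memory.
void suspect_function()
{
    // ... the CUDA calls you suspect of leaking go here ...
}

int main()
{
    size_t free_before, free_after, total;
    cudaMemGetInfo(&free_before, &total);     // snapshot before

    for (int i = 0; i < 10; ++i) {
        suspect_function();
    }
    cudaDeviceSynchronize();                  // let queued work and deferred frees complete

    cudaMemGetInfo(&free_after, &total);      // snapshot after
    std::printf("free before: %zu, free after: %zu, growth: %lld bytes\n",
                free_before, free_after,
                (long long)free_before - (long long)free_after);
    return 0;
}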

Managing properly an array of results that is larger than the memory available at the GPU?

Having defined how to deal with errors:
static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
Normally, to store our results in an array d_results of type double and size N that can be allocated in GPU memory all at once, we transfer the data from the device to the host like so:
double *d_results;
HANDLE_ERROR(cudaMalloc(&d_results,N*sizeof(double)));
//Launch our kernel to do some computations and store the results in d_results
.....
// and transfer our data from the device to the host
vector<double> results(N);
cudaMemcpy(results.data(),d_results,N*sizeof(double),cudaMemcpyDeviceToHost);
But what if the second line (the cudaMalloc) fails because there is not enough memory to store all the results at once? How can I do the computations and transfer the results to the host properly? Is it mandatory to do the computation in batches? I would rather avoid manual batching. What is the standard approach to this situation in CUDA?
Batching is the best way to go. You can automate most of the batching process if you do something like this:
#include <assert.h>
#include <iostream>

int main()
{
    // Allocate 4 Gb array on host
    const size_t N = 1 << 30;
    int * data = new int[N];

    // Allocate as much memory as will fit on GPU
    size_t total_mem, free_mem;
    cudaMemGetInfo(&free_mem, &total_mem);

    const size_t MB = 1 << 20;
    cudaError_t status;
    int *buffer;
    size_t buffer_size = free_mem;
    for(; buffer_size > MB; buffer_size -= MB) {
        status = cudaMalloc((void **)&buffer, buffer_size);
        if (status == cudaSuccess)
            break;
    }
    std::cout << "Allocated " << buffer_size << " bytes on GPU" << std::endl;

    // Loop through host source data in batches
    std::cout << N << " items require processing" << std::endl;
    size_t batchN = buffer_size / sizeof(int);
    size_t remainN = N;
    int * dp = data;
    std::cout << "Using batch size " << batchN << std::endl;

    for(; remainN > 0; remainN -= batchN) {
        batchN = (remainN < batchN) ? remainN : batchN;
        size_t worksize = batchN * sizeof(int);
        std::cout << "Processing batch of size " << batchN;
        std::cout << "," << remainN << " items remaining" << std::endl;
        cudaMemcpy(buffer, dp, worksize, cudaMemcpyHostToDevice);
        cudaMemset(buffer, 0xff, worksize);
        cudaMemcpy(dp, buffer, worksize, cudaMemcpyDeviceToHost);
        dp += batchN;
    }

    for(size_t i = 0; i < N; i++) {
        assert(data[i] == 0xffffffff);
    }
    cudaDeviceReset();

    return 0;
}
Which is basically:
Allocate as much free memory as your device has
Iteratively process the input data on the GPU in buffer-sized chunks until everything is done
In the above code I have used cudaMemset as a proxy for a real kernel, but it gives you an idea of what is required. If you want to get fancier, you could use two buffers and streams (with registered/pinned host memory) and copy asynchronously to get compute/copy overlap, which will improve overall performance in non-trivial cases, as sketched below.
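To make that last point concrete, here is a minimal double-buffering sketch, not a drop-in implementation: it assumes data points to pinned (page-locked) host memory, e.g. from cudaHostRegister or cudaHostAlloc, and uses process_batch as a hypothetical stand-in for the real kernel. Two device buffers on two streams let the transfers of one batch overlap the kernel of the other:
// Hypothetical kernel standing in for the real per-batch computation.
__global__ void process_batch(int* buf, size_t n)
{
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) buf[i] = ~buf[i];   // placeholder work
}

// 'data' must point to pinned (page-locked) host memory for the async copies to overlap.
void process_all(int* data, size_t N, size_t batchN)
{
    int* dbuf[2];
    cudaStream_t stream[2];
    for (int b = 0; b < 2; ++b) {
        cudaMalloc(&dbuf[b], batchN * sizeof(int));
        cudaStreamCreate(&stream[b]);
    }

    size_t offset = 0;
    for (int b = 0; offset < N; offset += batchN, b ^= 1) {
        size_t thisN = (N - offset < batchN) ? (N - offset) : batchN;
        size_t bytes = thisN * sizeof(int);
        // Each stream works on its own buffer, so batch k+1's H2D copy can
        // overlap batch k's kernel and D2H copy.
        cudaMemcpyAsync(dbuf[b], data + offset, bytes, cudaMemcpyHostToDevice, stream[b]);
        process_batch<<<(thisN + 255) / 256, 256, 0, stream[b]>>>(dbuf[b], thisN);
        cudaMemcpyAsync(data + offset, dbuf[b], bytes, cudaMemcpyDeviceToHost, stream[b]);
    }

    for (int b = 0; b < 2; ++b) {
        cudaStreamSynchronize(stream[b]);
        cudaStreamDestroy(stream[b]);
        cudaFree(dbuf[b]);
    }
}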

How to get a self-written class, including dynamic memory, to the kernel and back?

I want to do light scattering simulations in 3D objects with CUDA; it's similar to ray tracing, so I need a vector class.
I tried to bring a class representing my photons to the device. This class includes a dynamically allocated vector class (self-written). It is not really necessary that it is dynamic, but I will have the same problem in another case.
The problem is that I get an unspecified launch failure if I try to modify the vector inside the kernel.
I think it could also be a problem with the copy constructors, or something similar. I haven't programmed C++/CUDA for a while.
I use a GTX 480 with compute capability 2.0 and CUDA 5.0.
Here is my main:
#include "photon.cuh"
#include "Container/vector3f.cu"
// Device code (Kernel, GPU)
__global__ void Sim(photon * l_x){
l_x->vec->m_x = l_x->vec->m_x +1;
l_x->vec->m_y = l_x->vec->m_y +1;
l_x->vec->m_z = l_x->vec->m_z +1;
}
// Host Code (CPU)
int main(int argc, char** argv)
{
photon *h_x,*d_x,*h_x2;
h_x = new photon();
//h_x->vec = new vector3f();
h_x->vec->m_x = 1;
h_x->vec->m_y = 2;
h_x->vec->m_z = 3;
std::cout << "Malloc" << std::endl;
h_x2 = (photon*)malloc(sizeof(photon));
cudaMalloc((void**)&d_x,sizeof(photon));
std::cout << "Cpy h-d" << std::endl;
cudaMemcpy(d_x,h_x,sizeof(photon),cudaMemcpyHostToDevice);
cudaError_t Err = cudaGetLastError();
if ( cudaSuccess != Err )
std::cout << cudaGetErrorString (Err) << std::endl;
std::cout << "Sim" << std::endl;
Sim<<<1, 1>>>(d_x);
cudaThreadSynchronize();
Err = cudaGetLastError();
if ( cudaSuccess != Err )
std::cout << cudaGetErrorString (Err) << std::endl;
std::cout << "CPY back" << std::endl;
cudaMemcpy(h_x2, d_x, sizeof(photon), cudaMemcpyDeviceToHost);
std::cout << h_x2->vec->m_x << std::endl;
std::cout << h_x2->vec->m_y << std::endl;
std::cout << h_x2->vec->m_z << std::endl;
cudaFree(d_x);
return 0;
}
The photon class (.cuh):
class photon {
public:
    vector3f *vec;
    __host__ __device__ photon();
    __host__ __device__ virtual ~photon();
    __host__ __device__ photon(const photon &other);
};
(.cu)
#include "photon.cuh"
#include "Container/vector3f.cu"
__host__ __device__ photon::photon(){
this->vec = new vector3f();}
__host__ __device__ photon::~photon(){
delete this->vec;}
__host__ __device__ photon::photon(const photon &rhs){
this->vec = new vector3f(*rhs.vec);}
And finally, the vector class:
class vector3f {
public:
    float m_x;
    float m_y;
    float m_z;

    __host__ __device__ vector3f(float l_x, float l_y, float l_z){
        this->m_x = l_x;
        this->m_y = l_y;
        this->m_z = l_z;
    }
    __host__ __device__ vector3f(const vector3f& l_vector){
        this->m_x = l_vector.m_x;
        this->m_y = l_vector.m_y;
        this->m_z = l_vector.m_z;
    }
    __host__ __device__ vector3f(){
        this->m_x = 0;
        this->m_y = 0;
        this->m_z = 0;
    }
};
The underlying problem is that the only time you instantiate your photon class anywhere is on the host, and you are copying that host instance directly to the device. That means that the device code is attempting to de-reference a host pointer on the GPU, which is illegal and produces the runtime error you are seeing. The CUDA APIs don't do any sort of magic deep copying, so you have to manage this yourself somehow.
The obvious solution is to redesign the photon class so that vec is stored by value rather than reference. Then the whole problem goes away (and the performance will be a lot better on the GPU because you remove a level of pointer indirection during memory access).
If you are fixated on having a pointer to vec, redesign the constructor so that it takes a pointer from a memory pool, and allocate a device pool for construction. If you pass a device pointer to the constructor, the resulting instance will have a pointer to valid device memory.
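To make the by-value option concrete, here is a minimal sketch (using the hypothetical names photon_v and vector3f_v so they don't clash with your classes). Because the struct no longer contains a pointer, a single cudaMemcpy is already a complete deep copy and the kernel can dereference it safely:
// Sketch: store the vector by value so the struct is trivially copyable to the device.
struct vector3f_v {
    float m_x, m_y, m_z;
};

struct photon_v {
    vector3f_v vec;          // by value: no device-side pointer to manage
};

__global__ void SimV(photon_v* p)
{
    p->vec.m_x += 1.0f;
    p->vec.m_y += 1.0f;
    p->vec.m_z += 1.0f;
}

// Host side: an ordinary struct copy now transfers everything the kernel needs.
// photon_v h = {{1.0f, 2.0f, 3.0f}};
// photon_v* d; cudaMalloc(&d, sizeof(photon_v));
// cudaMemcpy(d, &h, sizeof(photon_v), cudaMemcpyHostToDevice);
// SimV<<<1,1>>>(d);
// cudaMemcpy(&h, d, sizeof(photon_v), cudaMemcpyDeviceToHost);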

Counting occurrences of specific events in CUDA kernels

Problem
I am trying to find the best way to count how many times my program ends up in some specific branches of my CUDA kernels. The idea is that some events should almost never happen, but since the data processed by the GPU comes from a numerical optimization solver, there may be situations where ill-defined cases become more common. Thus, I want to be able to track/monitor these phenomena over multiple simulations to compute some global statistics later.
Possible idea
The most straightforward way to do this may be to use a structure dedicated to monitoring such occurrences. Then, when entering a monitored branch, we increment the associated counter using atomicAdd. At the end of the simulation, we copy the counters back to the host and store them for some future statistics processing.
In my case, the cost of using atomicAdd should not matter much since I should not be entering those branches very often, but I may still want to monitor some of the more common branches later on, so what would be a better approach then? Since this is just for monitoring, I do not want the overhead to be too high.
I guess I could also have one monitoring structure per block and do a sum at the end, since it should not use much global memory anyway (1 unsigned int per monitored branch).
Code example
#include <iostream>
#include <time.h>
#include <cuda.h>
#include <stdio.h>
#include <string.h>

#define CUDA_CHECK_ERROR() __cuda_check_errors(__FILE__, __LINE__)
#define CUDA_SAFE_CALL(err) __cuda_safe_call(err, __FILE__, __LINE__)

inline void __cuda_check_errors(const char *filename, const int line_number)
{
    cudaError err = cudaDeviceSynchronize();
    if(err != cudaSuccess)
    {
        printf("CUDA error %i at %s:%i: %s\n",
               err, filename, line_number, cudaGetErrorString(err));
        exit(-1);
    }
}

inline void __cuda_safe_call(cudaError err, const char *filename, const int line_number)
{
    if (err != cudaSuccess)
    {
        printf("CUDA error %i at %s:%i: %s\n",
               err, filename, line_number, cudaGetErrorString(err));
        exit(-1);
    }
}

struct Stats
{
    unsigned int even;
};

__global__ void test_kernel(int* A, int* B, Stats* stats)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int res = A[tid] + (int)tid;
    if (res % 2 == 0)
        atomicAdd(&(stats->even), 1);
    B[tid] = res;
}

int get_random_int(int min, int max)
{
    return min + (rand() % (int)(max - min + 1));
}

void print_array(int* ar, unsigned int n)
{
    for (unsigned int i = 0; i < n; ++i)
        std::cout << ar[i] << " ";
    std::cout << std::endl;
}

void print_stats(Stats* s)
{
    std::cout << "even: " << s->even << std::endl;
}

int main()
{
    // vector size
    const unsigned int N = 10;

    // device vectors
    int *d_A, *d_B;
    Stats *d_stats;

    // host vectors
    int *h_A, *h_B;
    Stats *h_stats;

    // allocate device memory
    CUDA_SAFE_CALL(cudaMalloc(&d_A, N * sizeof(int)));
    CUDA_SAFE_CALL(cudaMalloc(&d_B, N * sizeof(int)));
    CUDA_SAFE_CALL(cudaMalloc(&d_stats, sizeof(Stats)));

    // allocate host memory
    h_A = new int[N];
    h_B = new int[N];
    h_stats = new Stats;

    // initialize host data
    srand(time(NULL));
    for (unsigned int i = 0; i < N; ++i)
    {
        h_A[i] = get_random_int(0, 10);
        h_B[i] = 0;
    }
    memset(h_stats, 0, sizeof(Stats));

    // copy data to the device
    CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, N * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_stats, h_stats, sizeof(Stats), cudaMemcpyHostToDevice));

    // launch kernel
    dim3 grid_size, block_size;
    grid_size.x = N;
    test_kernel<<<grid_size, block_size>>>(d_A, d_B, d_stats);

    // copy result back to host
    CUDA_SAFE_CALL(cudaMemcpy(h_B, d_B, N * sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaMemcpy(h_stats, d_stats, sizeof(Stats), cudaMemcpyDeviceToHost));
    print_array(h_B, N);
    print_stats(h_stats);

    // free device memory
    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_stats));

    // free host memory
    delete [] h_A;
    delete [] h_B;
    delete h_stats;
}
Hardware/software information
The solution I am looking for should work for CC >= 2.0 devices and CUDA >= 5.0.
atomicAdd is one possibility and I would probably go that route. If you do not use the return value of the atomicAdd call, the compiler will emit a reduction operation such as RED.E.ADD. Reduction is very fast as long as there are not many conflicts (I actually use it sometimes even when I do not need the operation to be atomic, because it can be quicker than loading a value from global memory, doing an arithmetic operation, and storing back to global memory).
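If a monitored branch later becomes common enough for the global atomics to contend, a standard refinement (this is the per-block aggregation you mentioned, not something the answer above requires) is to count in shared memory and issue one global atomicAdd per block. A minimal sketch based on the kernel from the question:
// Sketch: per-block aggregation of the 'even' counter, one global atomic per block.
__global__ void test_kernel_aggregated(int* A, int* B, Stats* stats)
{
    __shared__ unsigned int block_even;
    if (threadIdx.x == 0)
        block_even = 0;
    __syncthreads();

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int res = A[tid] + (int)tid;
    if (res % 2 == 0)
        atomicAdd(&block_even, 1);               // cheap shared-memory atomic
    B[tid] = res;

    __syncthreads();
    if (threadIdx.x == 0 && block_even > 0)
        atomicAdd(&(stats->even), block_even);   // one global atomic per block
}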
The second option you have is to use a profiler counter and use the profiler to analyze the result. Please see Profiler Counter Function for more details.
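For the profiler-counter route, the device-side call is __prof_trigger; a minimal sketch (note that the counter is incremented once per warp that executes the call, not once per thread, and the values show up as prof_trigger_* entries in the profiler output rather than in memory you own):
// Sketch: use a hardware profiler counter instead of a counter in device memory.
__global__ void test_kernel_profiled(int* A, int* B)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int res = A[tid] + (int)tid;
    if (res % 2 == 0)
        __prof_trigger(0);   // bumps per-multiprocessor hardware counter 0, once per warp
    B[tid] = res;
}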

CUDA pinned memory flushing from the device

CUDA 5, device capability 3.5, VS 2012, 64-bit Windows Server 2012.
There is no shared memory access between threads; every thread is standalone.
I am using pinned memory with zero-copy. From the host, I can only read the pinned memory the device has written once I issue a cudaDeviceSynchronize on the host.
I want to be able to:
Flush into the pinned memory as soon as the device has updated it.
Not block the device thread (maybe by copying asynchronously)
I tried calling __threadfence_system and __threadfence after each device write, but that didn't flush.
Below is a full sample CUDA code that demonstrates my question:
#include <conio.h>
#include <cstdio>
#include <cstdlib>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    printf("Kernel %u: Before Writing in Kernel\n", tid);
    hResult[tid] = tid + 1;
    __threadfence_system();
    // expecting that the data is getting flushed to host here!
    printf("Kernel %u: After Writing in Kernel\n", tid);
    // time waster for-loop (sleep)
    for (int timeWater = 0; timeWater < 100000000; timeWater++);
}

int main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
    Kernel<<<1, blocks>>>(hResult);

    int filledElementsCounter = 0;
    // naive polling loop that could be implemented in another host thread
    while (filledElementsCounter < blocks)
    {
        // blocks until the value changes; this moves sequentially
        // while threads have no order (fine for this sample).
        while (hResult[filledElementsCounter] == 0);
        printf("%f\n", hResult[filledElementsCounter]);
        filledElementsCounter++;
    }
    cudaFreeHost((void *)hResult);
    system("pause");
    return 0;
}
Currently this sample will wait indefinitely as nothing is being read from the device unless I issue cudaDeviceSynchronize. The sample below works, but it is NOT what I want as it defeats the purpose of async copying:
int main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
    Kernel<<<1, blocks>>>(hResult);

    cudaError_t error = cudaDeviceSynchronize();
    if (error != cudaSuccess) { throw; }

    for (int i = 0; i < blocks; i++)
    {
        printf("%f\n", hResult[i]);
    }
    cudaFreeHost((void *)hResult);
    system("pause");
    return 0;
}
I played with your code on CentOS 6.2 with CUDA 5.5 and a Tesla M2090 and can conclude this:
The problem that it does not work on your system must be a driver issue, and I suggest that you get the TCC drivers.
I attached my code, which runs fine and does what you want. The values appear on the host side before the kernel ends. As you can see, I added some compute code to prevent the for loop from being removed by compiler optimizations. I added a stream and a callback that gets executed after all work in the stream has finished. The program outputs 1 2, then does nothing for a long time, until stream finished... is printed to the console.
#include <iostream>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define SEC_CUDA_CALL(val) checkCall ( (val), #val, __FILE__, __LINE__ )

bool checkCall(cudaError_t result, char const* const func, const char *const file, int const line)
{
    if (result != cudaSuccess)
    {
        std::cout << "CUDA (runtime api) error: " << func << " failed! "
                  << cudaGetErrorString(result) << " (" << result << ") "
                  << file << ":" << line << std::endl;
    }
    return result != cudaSuccess;
}

class Callback
{
public:
    static void CUDART_CB dispatch(cudaStream_t stream, cudaError_t status, void *userData);
private:
    void call();
};

void CUDART_CB Callback::dispatch(cudaStream_t stream, cudaError_t status, void *userData)
{
    Callback* cb = (Callback*) userData;
    cb->call();
}

void Callback::call()
{
    std::cout << "stream finished..." << std::endl;
}

__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    hResult[tid] = tid + 1;
    __threadfence_system();
    float A = 0;
    for (int timeWater = 0; timeWater < 100000000; timeWater++)
    {
        A = sin(cos(log(hResult[0] * hResult[1]))) + A;
        A = sqrt(A);
    }
}

int main(int argc, char* argv[])
{
    size_t blocks = 2;
    volatile float* hResult;
    SEC_CUDA_CALL(cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped));

    cudaStream_t stream;
    SEC_CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    Callback obj;
    Kernel<<<1, blocks, 0, stream>>>(hResult);
    SEC_CUDA_CALL(cudaStreamAddCallback(stream, Callback::dispatch, &obj, 0));

    int filledElementsCounter = 0;
    while (filledElementsCounter < blocks)
    {
        while (hResult[filledElementsCounter] == 0);
        std::cout << hResult[filledElementsCounter] << std::endl;
        filledElementsCounter++;
    }

    SEC_CUDA_CALL(cudaStreamDestroy(stream));
    SEC_CUDA_CALL(cudaFreeHost((void *)hResult));
}
No call returned an error and cuda-memcheck didn't find any problems. This works as intended. You should really try the TCC driver.
You cannot pass the host pointer directly to the kernel. If you allocate host memory using cudaHostAlloc with the cudaHostAllocMapped flag, then you first have to retrieve the device pointer of the mapped host memory before you can use it in the kernel. Use cudaHostGetDevicePointer to get the device pointer of the mapped host memory.
float *hResult, *dResult;
cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
cudaHostGetDevicePointer((void**)&dResult, (void*)hResult, 0);  // flags must currently be 0
Kernel<<<1, blocks>>>(dResult);
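As a sketch of the surrounding setup (this is the usual zero-copy recipe, not something stated in the answer above, and on UVA-capable platforms some of it may be handled for you): mapping host memory typically has to be enabled with cudaSetDeviceFlags(cudaDeviceMapHost) before the CUDA context is created, and it is worth checking that the device can map host memory at all:
// Sketch of the usual setup around cudaHostGetDevicePointer.
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
if (!prop.canMapHostMemory) {
    // device cannot map host memory: fall back to explicit cudaMemcpy
}
cudaSetDeviceFlags(cudaDeviceMapHost);   // typically must precede any other CUDA work on the device

float *hResult = NULL, *dResult = NULL;
cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
cudaHostGetDevicePointer((void**)&dResult, (void*)hResult, 0);
Kernel<<<1, blocks>>>(dResult);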
Calling __threadfence_system() will ensure that the write is visible to the system before proceeding, but your CPU will be caching the h_result variable and hence you're just spinning on the old value in an infinite loop. Try marking h_result as volatile.