I'm not sure of the best way to find the maximum number of threads available on my GPU.
I have the following code:
int deviceCount, device;
int gpuDeviceCount = 0;
struct cudaDeviceProp properties;
cudaError_t cudaResultCode = cudaGetDeviceCount(&deviceCount);
if (cudaResultCode != cudaSuccess)
    deviceCount = 0;
/* machines with no GPUs can still report one emulation device */
for (device = 0; device < deviceCount; ++device) {
    cudaGetDeviceProperties(&properties, device);
    if (properties.major != 9999) /* 9999 means emulation only */
        if (device == 0)
        {
            printf("multiProcessorCount %d\n", properties.multiProcessorCount);
            printf("maxThreadsPerMultiProcessor %d\n", properties.maxThreadsPerMultiProcessor);
        }
}
which returns:
multiProcessorCount 14
maxThreadsPerMultiProcessor 1536
It turns out the total is 14*1536 = 21504. I have a feeling that is too small (I have a Tesla M2070).
Your way of checking is correct. You can look at the NVIDIA CUDA SDK samples; the "deviceQuery" sample in the SDK shows how to report this properly.
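For reference, here is a minimal sketch of that calculation done per device (note that this figure is the number of threads that can be resident on the GPU at one time, not a limit on how many threads a grid may launch):

int deviceCount = 0;
cudaGetDeviceCount(&deviceCount);
for (int dev = 0; dev < deviceCount; ++dev) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, dev);
    printf("device %d: %d SMs x %d threads/SM = %d resident threads\n",
           dev, prop.multiProcessorCount, prop.maxThreadsPerMultiProcessor,
           prop.multiProcessorCount * prop.maxThreadsPerMultiProcessor);
}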
I am trying to understand how to synchronize a grid of threads with cudaLaunchCooperativeKernel.
https://developer.nvidia.com/blog/cooperative-groups/
I have a very simple kernel where two threads update an array, sync and both print the array:
#include <cassert>
#include <cstdio>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void kernel(float *buf){
    cg::grid_group grid = cg::this_grid();
    if(grid.thread_rank() < 2)
        buf[grid.thread_rank()] = 10 + grid.thread_rank();
    assert(grid.is_valid()); // ok!
    grid.sync();
    if(grid.thread_rank() < 2)
        printf("thread=%d: %g %g\n", (int)grid.thread_rank(), buf[0], buf[1]);
}
Instead of printing values (10,11) twice, I get:
thread=0: 10 0
thread=1: 0 11
All CUDA calls returned success, cuda-memcheck is happy, my card is a "GeForce RTX 2060 SUPER", and it does support cooperative kernel launches, which I checked with:
int supportsCoopLaunch = 0;
if( cudaSuccess != cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev) )
    throw std::runtime_error("Cooperative Launch is not supported on this machine configuration.");
I am confused... Why don't I see the synchronization?
This test is incorrect:
int supportsCoopLaunch = 0;
if( cudaSuccess != cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev) )
    throw std::runtime_error("Cooperative Launch is not supported on this machine configuration.");
The support (or lack thereof) is not communicated via the cudaError_t return value of the function; instead, it is communicated via the value placed in the supportsCoopLaunch variable. You would want to do something like:
int supportsCoopLaunch = 0;
cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev);
if( supportsCoopLaunch != 1)
    throw std::runtime_error("Cooperative Launch is not supported on this machine configuration.");
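Also, for reference, here is a minimal sketch of a cooperative launch (not your actual host code; the block size and the buffer allocation are illustrative assumptions). Grid-wide sync additionally requires that all blocks of the grid be resident at once, which is why the grid size is capped using the occupancy query:

int dev = 0;
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, dev);

int blockSize = 128;                     // illustrative choice
int blocksPerSM = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, kernel, blockSize, 0);
int gridSize = blocksPerSM * prop.multiProcessorCount;  // all blocks co-resident

float *d_buf = NULL;                     // one buffer shared by the whole grid
cudaMalloc(&d_buf, 2 * sizeof(float));
void *args[] = { &d_buf };
cudaLaunchCooperativeKernel((void*)kernel, dim3(gridSize), dim3(blockSize), args, 0, 0);
cudaDeviceSynchronize();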
I found the bug. The actual code was something like this:
__device__ void kernel(float *buf){ /* see the function body above */ }

__global__ void parent_kernel(){
    float buf[2]; // per-thread buffer!!! The kernel will not 'sync' it!
    kernel(buf);  // different threads get different buffers
}
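One way to fix it (a sketch, assuming the buffer is meant to be shared by the entire grid) is to allocate the buffer once on the host with cudaMalloc and pass the same pointer in, instead of declaring a per-thread local array:

__global__ void parent_kernel(float *buf){  // buf points to one cudaMalloc'd allocation
    kernel(buf);                            // every thread in the grid sees the same buffer
}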
I have the following simple code to find the available GPUs:
int * getFreeGpuList(int *numFree) {
    int * gpuList;
    int nDevices;
    int i, j = 0, count = 0;
    cudaGetDeviceCount(&nDevices);
    gpuList = (int *) malloc(nDevices * sizeof(int));
    for (i = 0; i < nDevices; ++i) {
        cudaSetDevice(i);
        size_t freeMem;
        size_t totalMem;
        cudaMemGetInfo(&freeMem, &totalMem);
        if (freeMem > .9 * totalMem) {
            gpuList[j] = i;
            count++;
            j++;
        }
    }
    *numFree = count;
    return gpuList;
}
The problem is that cudaMemGetInfo occupies some memory (~150MB in my case) on each GPU. This code is part of a bigger program that runs for a long time, and I often run several processes at the same time, so in the end the memory occupied by this function is significant. Could you please let me know how I can free the GPU memory occupied by cudaMemGetInfo? Thanks!
Based on the insight from talonmies above that cudaSetDevice creates a context and occupies some memory on the device, I found out that cudaDeviceReset "explicitly destroys and cleans up all resources associated with the current device in the current process" without affecting other processes on the same device.
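A minimal sketch of that workaround inside the loop of getFreeGpuList above (this assumes the process holds no live allocations on device i, since cudaDeviceReset destroys those too):

cudaSetDevice(i);
cudaMemGetInfo(&freeMem, &totalMem);  // implicitly creates a context on device i
cudaDeviceReset();                    // tear that context down again, releasing its memory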
Update Nov 26: if one wants to query GPU info, it is better to use the NVML library. In my experience it is much faster, and it does not take up device memory for simple memory and name queries.
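For what it's worth, here is a rough sketch of the NVML route (include nvml.h and link with -lnvidia-ml; error checking omitted). nvmlDeviceGetMemoryInfo reports free/total bytes without creating a CUDA context on the device:

#include <nvml.h>

unsigned int count = 0;
nvmlInit();
nvmlDeviceGetCount(&count);
for (unsigned int i = 0; i < count; ++i) {
    nvmlDevice_t dev;
    nvmlMemory_t mem;
    nvmlDeviceGetHandleByIndex(i, &dev);
    nvmlDeviceGetMemoryInfo(dev, &mem);   // mem.free / mem.total / mem.used, in bytes
    if (mem.free > 0.9 * mem.total) {
        /* device i is (nearly) free */
    }
}
nvmlShutdown();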
I found that a CUDA stream blocks when I launch lots of kernels (more than 1000). I am wondering whether there is any configuration I can change.
In my experiments, I launch a small kernel 10000 times. The kernel runs only briefly (about 190us). The first 1000 kernels launch very quickly, taking about 4-5us per launch. After that, launching becomes slow: it takes about 190us to launch each new kernel. The stream seems to wait for a previous kernel to complete, and the buffer appears to hold about 1000 kernels.
When I create 3 streams, each stream can launch 1000 kernels asynchronously.
I want to make this buffer bigger. I tried setting cudaLimitDevRuntimePendingLaunchCount, but it does not work. Is there any way?
#include <stdio.h>
#include "cuda_runtime.h"

#define CUDACHECK(cmd) do {                             \
    cudaError_t e = cmd;                                \
    if (e != cudaSuccess) {                             \
        printf("Failed: Cuda error %s:%d '%s'\n",       \
               __FILE__,__LINE__,cudaGetErrorString(e)); \
        exit(EXIT_FAILURE);                             \
    }                                                   \
} while (0)

// a dummy kernel for test
__global__ void add(float *a, int n) {
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = 0; i < n; i++) {
        a[id] = sqrt(a[id] + 1);
    }
}

int main(int argc, char* argv[])
{
    // managing 1 device
    int nDev = 1;
    int nStream = 1;
    int size = 32*1024*1024;

    // allocating and initializing device buffers
    float** buffer = (float**)malloc(nDev * sizeof(float*));
    cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev*nStream);

    for (int i = 0; i < nDev; ++i) {
        CUDACHECK(cudaSetDevice(i));
        // CUDACHECK(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, 10000));
        CUDACHECK(cudaMalloc(buffer + i, size * sizeof(float)));
        CUDACHECK(cudaMemset(buffer[i], 1, size * sizeof(float)));
        for (int j = 0; j < nStream; j++) {
            CUDACHECK(cudaStreamCreate(s+i*nStream+j));
        }
    }

    for (int i = 0; i < nDev; ++i) {
        CUDACHECK(cudaSetDevice(i));
        for (int j = 0; j < 10000; j++) {
            for (int k = 0; k < nStream; k++) {
                add<<<32, 1024, 0, s[i*nStream+k]>>>(buffer[i], 1000);
            }
        }
    }

    for (int i = 0; i < nDev; ++i) {
        CUDACHECK(cudaSetDevice(i));
        cudaDeviceSynchronize();
    }

    // free device buffers
    for (int i = 0; i < nDev; ++i) {
        CUDACHECK(cudaSetDevice(i));
        CUDACHECK(cudaFree(buffer[i]));
    }

    printf("Success \n");
    return 0;
}
Here are the nvprof results:
When I create 3 streams, the first 3000 kernels launch quickly and then launching becomes slow.
When I create 1 stream, the first 1000 kernels launch quickly and then launching becomes slow.
The behavior you are witnessing is expected behavior. If you search on the cuda tag for "queue" or "launch queue" you will find many other questions that refer to it. CUDA has a queue (apparently per-stream) that kernel launches go into. As long as the outstanding launch count is less than the queue depth, the launch process will be asynchronous.
However when the outstanding (i.e. uncompleted) launches exceed the queue depth, the launch process changes to a kind of synchronous behavior (although not synchronous in the usual sense). Specifically, when the outstanding number of kernel launches exceeds the queue depth, the launch process will block the CPU thread that is performing the next launch, until a launch slot opens in the queue (effectively means a kernel has retired at the other end of the queue).
You have no visibility into this (no way to query the number of slots open in the queue) nor any way to view or control the queue depth. Most of the information I'm reciting here is obtained by inspection; it is not formally published in CUDA documentation that I am aware of.
As already discussed in the comments, one possible approach to alleviate your concern around launches in a multi-device scenario is to launch breadth-first rather than depth-first. By this I mean that you should modify your launch loops so that you launch a kernel to device 0, then device 1, then device 2, etc. before launching the next kernel on device 0. This will give you the optimum performance in the sense that all GPUs will be engaged with processing, as early as possible in the launch sequence.
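As a sketch (reusing the variables from the code above and assuming nDev > 1), breadth-first ordering simply swaps the loop nesting so that each device receives its next kernel before any device receives the one after:

for (int j = 0; j < 10000; j++) {          // outer loop over launches
    for (int i = 0; i < nDev; ++i) {       // inner loop over devices
        CUDACHECK(cudaSetDevice(i));
        for (int k = 0; k < nStream; k++) {
            add<<<32, 1024, 0, s[i*nStream+k]>>>(buffer[i], 1000);
        }
    }
}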
If you'd like to see changes in CUDA behavior or documentation, the general suggestion is to become a registered developer at developer.nvidia.com, then log into your account there and file a bug, using the bug filing process accessible by clicking on your account name in the upper right hand corner.
Suppose I have 8 blocks of 32 threads each running on a GTX 970. Each block writes either all 1's or all 0's to an array of length 32 in global memory, where thread 0 in a block writes to position 0 in the array.
Now, to write the actual values, atomicExch is used, exchanging the current value in the array with the value that the block attempts to write. Because of SIMD, the atomic operation, and the fact that a warp executes in lockstep, I would expect the array, at any point in time, to contain only 1's or only 0's, never a mix of the two.
However, while running code like this there are several cases where, at some point in time, the array contains a mix of 0's and 1's. This appears to indicate that atomic operations are not executed per warp, but are instead scheduled using some other scheme.
From other sources I have not really found a conclusive write-up detailing the scheduling of atomic operations across different warps (please correct me if I'm wrong), so I was wondering if there is any information on this topic. Since I need to write many small vectors, each consisting of several 32-bit integers, atomically to global memory, an atomic operation that is guaranteed to write a single vector atomically is obviously very important.
For those wondering, the code I wrote was executed on a GTX 970 and compiled for compute capability 5.2, using CUDA 8.0.
The atomic instructions, like all instructions, are scheduled per warp. However there is an unspecified pipeline associated with atomics, and the scheduled instruction flow through the pipeline is not guaranteed to be executed in lockstep, for every thread, for every stage through the pipeline. This gives rise to the possibility for your observations.
I believe a simple thought experiment will demonstrate that this must be true: what if 2 threads in the same warp targeted the same location? Clearly every aspect of the processing could not proceed in lockstep. We could extend this thought experiment to the case where we have multiple issue per clock within an SM, and even across SMs, as additional examples.
If the vector length were short enough (16 bytes or less) then it should be possible to accomplish this ("atomic update") simply by having a thread in a warp write an appropriate vector-type quantity, e.g. int4. As long as all threads (regardless of where they are in the grid) are attempting to update a naturally aligned location, the write should not be corrupted by other writes.
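For example, a minimal sketch (my illustration, not OP's code): as long as the destination is naturally aligned to 16 bytes, a single thread can write the whole vector with one 16-byte store:

__global__ void write_vec(int4 *out, int val) {
    // one thread issues a single 16-byte store; other aligned 16-byte stores
    // to the same location cannot interleave with it word by word
    if (threadIdx.x == 0)
        out[blockIdx.x] = make_int4(val, val, val, val);
}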
However, after discussion in the comments, it seems that OP's goal is to be able to have a warp or threadblock update a vector of some length, without interference from other warps or threadblocks. It seems to me that really what is desired is access control (so that only one warp or threadblock is updating a particular vector at a time) and OP had some code that wasn't working as desired.
This access control can be enforced using an ordinary atomic operation (atomicCAS in the example below) to permit only one "producer" to update a vector at a time.
What follows is an example producer-consumer code, where there are multiple threadblocks that are updating a range of vectors. Each vector "slot" has a "slot control" variable, which is atomically updated to indicate:
vector is empty
vector is being filled
vector is filled, ready for "consumption"
With this 3-level scheme, we can allow ordinary access to the vector by both the consumer and the multiple producer workers, with a single ordinary atomic variable as the access-control mechanism. Here is an example code:
#include <assert.h>
#include <iostream>
#include <stdio.h>

const int num_slots = 256;
const int slot_length = 32;
const int max_act = 65536;
const int slot_full = 2;
const int slot_filling = 1;
const int slot_empty = 0;
const int max_sm = 64; // needs to be greater than the maximum number of SMs for any GPU that it will be run on

__device__ int slot_control[num_slots] = {0};
__device__ int slots[num_slots*slot_length];
__device__ int observations[max_sm] = {0}; // reported by consumer
__device__ int actives[max_sm] = {0};      // reported by producers
__device__ int correct = 0;
__device__ int block_id = 0;
__device__ volatile int restricted_sm = -1;
__device__ int num_act = 0;

static __device__ __inline__ int __mysmid(){
  int smid;
  asm volatile("mov.u32 %0, %%smid;" : "=r"(smid));
  return smid;}

// this code won't work on a GPU with a single SM!
__global__ void kernel(){
  __shared__ volatile int done, update, next_slot;
  int my_block_id = atomicAdd(&block_id, 1);
  int my_sm = __mysmid();
  if (my_block_id == 0){
    if (!threadIdx.x){
      restricted_sm = my_sm;
      __threadfence();
      // I am "block 0" and process the vectors, checking for coherency
      // "consumer"
      next_slot = 0;
      volatile int *vslot_control = slot_control;
      volatile int *vslots = slots;
      int scount = 0;
      while(scount < max_act){
        if (vslot_control[next_slot] == slot_full){
          scount++;
          int slot_val = vslots[next_slot*slot_length];
          for (int i = 1; i < slot_length; i++) if (slot_val != vslots[next_slot*slot_length+i]) { assert(0); /* badness - incoherence */}
          observations[slot_val]++;
          vslot_control[next_slot] = slot_empty;
          correct++;
          __threadfence();
          }
        next_slot++;
        if (next_slot >= num_slots) next_slot = 0;
        }
      }}
  else {
    // "producer"
    while (restricted_sm < 0); // wait for signaling
    if (my_sm == restricted_sm) return;
    next_slot = 0;
    done = 0;
    __syncthreads();
    while (!done) {
      if (!threadIdx.x){
        while (atomicCAS(slot_control+next_slot, slot_empty, slot_filling) > slot_empty) {
          next_slot++;
          if (next_slot >= num_slots) next_slot = 0;}
        // we grabbed an empty slot, fill it with my_sm
        if (atomicAdd(&num_act, 1) < max_act) update = 1;
        else {done = 1; update = 0;}
        }
      __syncthreads();
      if (update) slots[next_slot*slot_length+threadIdx.x] = my_sm;
      __threadfence(); //enforce ordering
      if ((update) && (!threadIdx.x)){
        slot_control[next_slot] = 2; // mark slot full
        atomicAdd(actives+my_sm, 1);}
      __syncthreads();
      }
    }
}

int main(){
  kernel<<<256, slot_length>>>();
  cudaDeviceSynchronize();
  cudaError_t res = cudaGetLastError();
  if (res != cudaSuccess) printf("kernel failure: %d\n", (int)res);
  int *h_obs = new int[max_sm];
  int *h_act = new int[max_sm];
  int h_correct;
  cudaMemcpyFromSymbol(h_obs, observations, sizeof(int)*max_sm);
  cudaMemcpyFromSymbol(h_act, actives, sizeof(int)*max_sm);
  cudaMemcpyFromSymbol(&h_correct, correct, sizeof(int));
  int h_total_act = 0;
  int h_total_obs = 0;
  for (int i = 0; i < max_sm; i++){
    std::cout << h_act[i] << "," << h_obs[i] << " ";
    h_total_act += h_act[i];
    h_total_obs += h_obs[i];}
  std::cout << std::endl << h_total_act << "," << h_total_obs << "," << h_correct << std::endl;
}
I don't claim this code to be defect-free for any use case. It is offered to demonstrate the workability of a concept, not as production-ready code. It seems to work for me on Linux, on a couple of different systems I tested it on. It should not be run on GPUs that have only a single SM, as one SM is reserved for the consumer and the remaining SMs are used by the producers.
I have a computer with 2 GPUs. I wrote a CUDA C program and I need to tell it somehow that I want to run it on just 1 of the 2 graphics cards; what is the command I need to type and how should I use it? I believe it is somehow related to cudaSetDevice, but I can't really find out how to use it.
It should be pretty clear from the documentation of cudaSetDevice, but let me provide the following code snippet.
bool IsGpuAvailable()
{
    int devicesCount;
    cudaGetDeviceCount(&devicesCount);
    for(int deviceIndex = 0; deviceIndex < devicesCount; ++deviceIndex)
    {
        cudaDeviceProp deviceProperties;
        cudaGetDeviceProperties(&deviceProperties, deviceIndex);
        if (deviceProperties.major >= 2
            && deviceProperties.minor >= 0)
        {
            cudaSetDevice(deviceIndex);
            return true;
        }
    }
    return false;
}
This is how I iterated through all available GPUs (cudaGetDeviceCount), looking for the first one with a compute capability of at least 2.0. If such a device was found, I used cudaSetDevice so that all the CUDA computations were executed on that particular device. Without calling cudaSetDevice your CUDA app would execute on the first GPU, i.e. the one with deviceIndex == 0, but which particular GPU that is depends on which GPU is in which PCIe slot.
EDIT:
After clarifying your question in the comments, it seems to me that choosing the device based on its name should suit you. If you are unsure of your actual GPU names, run this code, which prints the names of all your GPUs to the console:
int devicesCount;
cudaGetDeviceCount(&devicesCount);
for(int deviceIndex = 0; deviceIndex < devicesCount; ++deviceIndex)
{
    cudaDeviceProp deviceProperties;
    cudaGetDeviceProperties(&deviceProperties, deviceIndex);
    cout << deviceProperties.name << endl;
}
After that, choose the name of the GPU that you want to use for computations; let's say it is "GTX XYZ". Call the following method from your main function; thanks to it, all CUDA kernels will be executed on the device named "GTX XYZ". You should also check the return value - true if a device with that name is found, false otherwise:
bool SetGPU()
{
    int devicesCount;
    cudaGetDeviceCount(&devicesCount);
    string desiredDeviceName = "GTX XYZ";
    for(int deviceIndex = 0; deviceIndex < devicesCount; ++deviceIndex)
    {
        cudaDeviceProp deviceProperties;
        cudaGetDeviceProperties(&deviceProperties, deviceIndex);
        if (deviceProperties.name == desiredDeviceName)
        {
            cudaSetDevice(deviceIndex);
            return true;
        }
    }
    return false;
}
Of course, you have to change the value of the desiredDeviceName variable to the desired value.
Searching more carefully on the internet, I found these lines of code, which select the GPU with the most multiprocessors among all the devices installed in the PC.
int num_devices, device;
cudaGetDeviceCount(&num_devices);
if (num_devices > 1) {
    int max_multiprocessors = 0, max_device = 0;
    for (device = 0; device < num_devices; device++) {
        cudaDeviceProp properties;
        cudaGetDeviceProperties(&properties, device);
        if (max_multiprocessors < properties.multiProcessorCount) {
            max_multiprocessors = properties.multiProcessorCount;
            max_device = device;
        }
    }
    cudaSetDevice(max_device);
}