Cuda get gpu load percent - cuda

I want to calculate the GPU load. How get gpu load percent in cuda?
enter image description here

http://eliang.blogspot.com.by/2011/05/getting-nvidia-gpu-usage-in-c.html?m=1
//
// Getting Nvidia GPU Usage
//
// Reference: Open Hardware Monitor (http://code.google.com/p/open-hardware-monitor)
//
#include <windows.h>
#include <iostream>
// magic numbers, do not change them
#define NVAPI_MAX_PHYSICAL_GPUS 64
#define NVAPI_MAX_USAGES_PER_GPU 34
// function pointer types
typedef int *(*NvAPI_QueryInterface_t)(unsigned int offset);
typedef int (*NvAPI_Initialize_t)();
typedef int (*NvAPI_EnumPhysicalGPUs_t)(int **handles, int *count);
typedef int (*NvAPI_GPU_GetUsages_t)(int *handle, unsigned int *usages);
int main()
{
HMODULE hmod = LoadLibraryA("nvapi.dll");
if (hmod == NULL)
{
std::cerr << "Couldn't find nvapi.dll" << std::endl;
return 1;
}
// nvapi.dll internal function pointers
NvAPI_QueryInterface_t NvAPI_QueryInterface = NULL;
NvAPI_Initialize_t NvAPI_Initialize = NULL;
NvAPI_EnumPhysicalGPUs_t NvAPI_EnumPhysicalGPUs = NULL;
NvAPI_GPU_GetUsages_t NvAPI_GPU_GetUsages = NULL;
// nvapi_QueryInterface is a function used to retrieve other internal functions in nvapi.dll
NvAPI_QueryInterface = (NvAPI_QueryInterface_t) GetProcAddress(hmod, "nvapi_QueryInterface");
// some useful internal functions that aren't exported by nvapi.dll
NvAPI_Initialize = (NvAPI_Initialize_t) (*NvAPI_QueryInterface)(0x0150E828);
NvAPI_EnumPhysicalGPUs = (NvAPI_EnumPhysicalGPUs_t) (*NvAPI_QueryInterface)(0xE5AC921F);
NvAPI_GPU_GetUsages = (NvAPI_GPU_GetUsages_t) (*NvAPI_QueryInterface)(0x189A1FDF);
if (NvAPI_Initialize == NULL || NvAPI_EnumPhysicalGPUs == NULL ||
NvAPI_EnumPhysicalGPUs == NULL || NvAPI_GPU_GetUsages == NULL)
{
std::cerr << "Couldn't get functions in nvapi.dll" << std::endl;
return 2;
}
// initialize NvAPI library, call it once before calling any other NvAPI functions
(*NvAPI_Initialize)();
int gpuCount = 0;
int *gpuHandles[NVAPI_MAX_PHYSICAL_GPUS] = { NULL };
unsigned int gpuUsages[NVAPI_MAX_USAGES_PER_GPU] = { 0 };
// gpuUsages[0] must be this value, otherwise NvAPI_GPU_GetUsages won't work
gpuUsages[0] = (NVAPI_MAX_USAGES_PER_GPU * 4) | 0x10000;
(*NvAPI_EnumPhysicalGPUs)(gpuHandles, &gpuCount);
// print GPU usage every second
for (int i = 0; i < 100; i++)
{
(*NvAPI_GPU_GetUsages)(gpuHandles[0], gpuUsages);
int usage = gpuUsages[3];
std::cout << "GPU Usage: " << usage << std::endl;
Sleep(1000);
}
return 0;
}

Related

Functions returning pointers not printing correctly

When I run the program shown below, it prints the address correctly inside the function (fun), but outside the function is always prints zero
#include <stdio.h>
int *fun();
main()
{
int *p;
printf("\n %u", p);
p = fun();
printf("\n %u", fun());
}
int *fun()
{
int i = 20;
printf("\n %u", &i);
return (&i);
}

Does bool variable in kernel need to be synchronized

I have a kernel consisting of a for loop that searches through an array for a specific int value. I'm using a grid block of 256 threads to do this. However, when one thread finds the value, I want to let the other threads know to exit. Currently I'm using a boolean flag, but I'm not sure if its working properly. My concern is synchronization.
__device__ bool found;
__global__
void search()
{
for(int i = threadIdx.x; i<1000000; i += stride)
{
if(found == true)
{
break;
}
else if(arr[i] = x)
{
found = true;
break;
}
}
}
int main()
{
bool flag = false;
cudaMemcpyToSymbol(found, &flag, sizeof(bool), 0,cudaMemcpyHostToDevice);
}
As pointed out in comments, you can probably achieve what you want by declaring the global device flag to be volatile, which will inhibit caching, and by using a memory fence function. There really isn't a global synchronization primitive which would do want you want other than the new grid synchronization mechanism introduced in CUDA 9 and new hardware, but that probably isn't necessary in this case. Turning your pseudocode into a toy example:
#include <iostream>
#include <thrust/device_vector.h>
__device__ volatile bool found;
__device__ volatile size_t idx;
template<bool docheck>
__global__
void search(const int* arr, int x, size_t N)
{
size_t i = threadIdx.x + blockIdx.x * blockDim.x;
size_t stride = blockDim.x * gridDim.x;
for(; (i<N) && (!found); i += stride)
{
if(arr[i] == x)
{
if (docheck) found = true;
idx = i;
__threadfence();
break;
}
}
}
int main()
{
const size_t N = 1 << 24;
const size_t findidx = 280270;
const int findval = 0xdeadbeef;
thrust::device_vector<int> data(N,1);
data[findidx] = findval;
bool flag = false;
size_t zero = 0;
{
cudaMemcpyToSymbol(found, &flag, sizeof(bool));
cudaMemcpyToSymbol(idx, &zero, sizeof(size_t));
int blocks, threads;
cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<false>);
search<false><<<blocks, threads>>>(thrust::raw_pointer_cast(data.data()), findval, N);
cudaDeviceSynchronize();
size_t result = 0;
cudaMemcpyFromSymbol(&result, idx, sizeof(size_t));
std::cout << "result = " << result << std::endl;
}
{
cudaMemcpyToSymbol(found, &flag, sizeof(bool));
cudaMemcpyToSymbol(idx, &zero, sizeof(size_t));
int blocks, threads;
cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<true>);
search<true><<<blocks, threads>>>(thrust::raw_pointer_cast(data.data()), findval, N);
cudaDeviceSynchronize();
size_t result = 0;
cudaMemcpyFromSymbol(&result, idx, sizeof(size_t));
std::cout << "result = " << result << std::endl;
}
return 0;
}
and profiling it gives the following:
$ nvcc -arch=sm_52 -o notify notify.cu
$ nvprof ./notify
==3916== NVPROF is profiling process 3916, command: ./notify
result = 280270
result = 280270
==3916== Profiling application: ./notify
==3916== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 78.00% 1.6773ms 1 1.6773ms 1.6773ms 1.6773ms void search<bool=0>(int const *, int, unsigned long)
19.93% 428.63us 1 428.63us 428.63us 428.63us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
1.82% 39.199us 1 39.199us 39.199us 39.199us void search<bool=1>(int const *, int, unsigned long)
As you can see, the version which sets the found flag completes the search in 40 microseconds, whereas the version which does not set the flag takes 1.7 milliseconds. Given that the kernel is run with the maximum number of resident blocks in both cases, we can conclude that the early exit mechanism worked correctly and running blocks detected that the required value had been found.

Using host class member pointing to device memory in device code

I want to have an instance of a Container class allocating some device and host memory on initialization. I want to use the allocated memory in device code, without passing the actual pointer (for API reasons).
How do I create a global __device__ pointer to the member pointing to the device memory? I am happy to use thrust if that helps.
Here is a small example:
#include <iostream>
struct Container {
int *h_int = (int*)malloc(4*sizeof(int));
int *d_int;
Container() {
h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
cudaMalloc(&d_int, 4*sizeof(int));
memcpyHostToDevice();
}
void memcpyHostToDevice() {
cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
}
void memcpyDeviceToHost() {
cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
}
};
Container stuff;
__device__ auto d_int = &stuff.d_int; // How do I get that right?
__global__ void edit() { // To keep the API simple I do not want to pass the pointer
auto i = blockIdx.x*blockDim.x + threadIdx.x;
d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}
int main(int argc, char const *argv[]) {
edit<<<4, 1>>>();
stuff.memcpyDeviceToHost();
std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
return 0;
}
There are two problems here:
You can't statically inititalize a __device__ variable in the way you are trying to (and the value you are trying to apply isn't correct either). The CUDA runtime API contains a function for initialising global scope device symbols. Use that instead.
Your global scope declaration of stuff shouldn't work either for a number of subtle reasons discussed here (it is technically undefined behaviour). Declare it at main scope instead.
Putting these two things together should lead your to do something like this instead:
__device__ int* d_int;
// ...
int main(int argc, char const *argv[]) {
Container stuff;
cudaMemcpyToSymbol(d_int, &stuff.dint, sizeof(int*));
edit<<<4, 1>>>();
// ...
Here is a fully worked example:
$ cat t1199.cu
#include <iostream>
struct Container {
int *h_int = (int*)malloc(4*sizeof(int));
int *d_int;
Container() {
h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
cudaMalloc(&d_int, 4*sizeof(int));
memcpyHostToDevice();
}
void memcpyHostToDevice() {
cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
}
void memcpyDeviceToHost() {
cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
}
};
//Container stuff;
__device__ int *d_int; // = &stuff.d_int; // How do I get that right?
__global__ void edit() { // To keep the API simple I do not want to pass the pointer
auto i = blockIdx.x*blockDim.x + threadIdx.x;
d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}
int main(int argc, char const *argv[]) {
Container stuff;
cudaMemcpyToSymbol(d_int, &stuff.d_int, sizeof(int *));
edit<<<4, 1>>>();
stuff.memcpyDeviceToHost();
std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
return 0;
}
$ nvcc -std=c++11 -o t1199 t1199.cu
$ cuda-memcheck ./t1199
========= CUDA-MEMCHECK
1337
========= ERROR SUMMARY: 0 errors
$

Error memcpy in device code

I write a code that get first _var positions of a vector of possibilities (i.e., matrix _size*_var with _var=3 and _size=27) and calling this function in my kernel (32 threads, ie, each has an object) but I do not get any return value of the function neither the NULL pointer.
The program exit without error but the printf lines in the kernel is not executed or displayed (even compiled with sm_20 or higher) as if the program stopped before.
dataIntern.h:
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define _MIN -1
#define _MAX 1
#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif
template <class a_type>
class dataIntern{
private:
a_type *possibilities;
int _assign;
int _size;
int _var;
int _maxsize;
public:
CUDA_CALLABLE_MEMBER dataIntern(){
}
CUDA_CALLABLE_MEMBER dataIntern(int var){
_var = var;
_size = (int)pow(3.0, (double)_var);
_maxsize = _size * _var;
_assign = 1;
possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
if(!possibilities){
exit(1);
}
createTable();
}
CUDA_CALLABLE_MEMBER void createTable(){
int i, j, k, limit, pos;
a_type value;
if(_assign == 1){
for(i=0; i<_var; i++){
#ifdef __CUDA_ARCH__
limit = (int)pow(3.0, _var-i-1);
#else
limit = (int)pow(3.0, (double)_var-i-1);
#endif
value = (a_type)_MIN;
k = 0;
for(j=0; j<_size; j++){
pos = _var*j+i;
if(k >= limit){
value++;
if(value > _MAX){
value = (a_type)_MIN;
}
k = 0;
}
possibilities[pos] = value;
k++;
}
}
}
}
CUDA_CALLABLE_MEMBER void print(){
int i;
printf("Printing.\n");
if(_assign == 1){
for(i=0; i<_size*_var; i++){
printf("%d ", possibilities[i]);
if(i%_var == _var-1){
printf("\n");
}
}
}
else{
printf("Not assigned.\n");
}
}
CUDA_CALLABLE_MEMBER void retify(int posChanged, a_type valueRetified){
int i, pos, count, initpos, attrib;
a_type *newnode;
a_type *newlist = NULL, *morelist = NULL;
pos = posChanged;
initpos = 0;
count = 0;
if(_assign == 1){
attrib = 0;
newnode = (a_type*)malloc(_var*sizeof(a_type));
for(i=0; i<_size; i++){
if(possibilities[pos] == valueRetified){
memcpy(newnode, &possibilities[i*_var], _var*sizeof(a_type));
count++;
if(newlist!=NULL){
morelist = (a_type*)malloc(count*_var*sizeof(a_type));
memcpy(morelist, newlist, (count-1)*_var*sizeof(a_type));
}
newlist = (a_type*)malloc(count*_var*sizeof(a_type));
memcpy(newlist, morelist, (count-1)*_var*sizeof(a_type));
memcpy(&newlist[initpos], newnode, _var*sizeof(a_type));
initpos+=_var;
attrib = 1;
}
pos+=_var;
}
if(attrib == 1){
_size = count;
possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
if(possibilities == NULL){
printf("Allocation fail in newlist retify.\n");
exit(1);
}
memcpy(possibilities, newlist, _size*_var*sizeof(a_type));
}
else{
_assign = 0;
}
}
}
CUDA_CALLABLE_MEMBER a_type* unstack(){
a_type* solution = NULL, *backup = NULL;
if(_assign == 1){
if(_size>0){
backup = (a_type*)malloc(_var*_size*sizeof(a_type));
if(backup == NULL){
printf("Erro to alloc backup pointer on unstack function in data intern\n");
return NULL;
}
solution = (a_type*)malloc(_var*sizeof(a_type));
if(solution == NULL){
printf("Erro to alloc solution pointer on unstack function in data intern\n");
return NULL;
}
memcpy(backup, possibilities, _size*_var*sizeof(a_type));
memcpy(solution, possibilities, _var*sizeof(a_type));
free(possibilities);
_size--;
possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
if(possibilities == NULL){
printf("Error to realloc possibilities pointer in data intern\n");
return NULL;
}
memcpy(possibilities, &backup[_var], _size*_var*sizeof(a_type));
free(backup);
return solution;
}
}
return NULL;
}
CUDA_CALLABLE_MEMBER int get_size(){
return _size;
}
CUDA_CALLABLE_MEMBER ~dataIntern(){
_assign = 0;
if(possibilities)
free(possibilities);
}
};
deviceCode.h:
#ifndef DEVICECODE_H
#define DEVICECODE_H
void CallingInMain();
__global__ void kernel();
#endif
deviceCode.cu:
#include "deviceCode.h"
#include "dataIntern.h"
#include <iostream>
#include <stdio.h>
//I declared like this to my kernel:
__global__ void kernel(){
__shared__ dataIntern<int> data[32];
int *vetor;
vetor = NULL;
data[threadIdx.x] = dataIntern<int>(3);
//_var == 3 in the class above
vetor = (int*)malloc(sizeof(int)*3);
vetor = data[threadIdx.x].unstack();
while(vetor!=NULL){
//never past here
printf("%d %d %d %d\n", threadIdx.x, vetor[0], vetor[1], vetor[2]);
vetor = data[threadIdx.x].unstack();
}
//neither here in if or else
if(vetor)
printf("Not null\n");
else
printf("Null final\n");
free(vetor);
}
void CallingInMain(){
kernel<<<1, 32>>>();
cudaDeviceSynchronize();
}
main.cu:
#include <iostream>
#include <stdio.h>
#ifndef deviceCode_H
#include "deviceCode.h"
#endif
int main(int argc, char* argv[]){
CallingInMain();
return 0;
}
Some colleagues pointed out to me that your code seems to have an error in it.
Consider this line in your kernel:
data[threadIdx.x] = dataIntern<int>(3);
This line instantiates a temporary dataIntern<int> object, runs the constructor with a value of 3 on it, and then does a copy from that object to the storage in data[threadIdx.x]. Note that the constructor performs a malloc operation:
CUDA_CALLABLE_MEMBER dataIntern(int var){
...
possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
But since the original object is temporary, the C++ standard allows the object to be deleted at the termination of the statement, i.e. at the semicolon here:
data[threadIdx.x] = dataIntern<int>(3);
^
after the copy-construction process is complete. But the deletion of the object triggers the destructor, which does a free operation on possibilities:
CUDA_CALLABLE_MEMBER ~dataIntern(){
_assign = 0;
if(possibilities)
free(possibilities);
}
Therefore usage of the pointer so allocated subsequent to this line of code:
data[threadIdx.x] = dataIntern<int>(3);
such as in unstack here:
vetor = data[threadIdx.x].unstack();
will be invalid.
This is a violation of C++ programming rules, and the error is not specific to CUDA.

cublassgemm for row-major matrix

I really tried to implement a function in C to multiply to row-major matrix in cublas. I don't know where I mistaking.
In the function below A, B and C are pointers to an row matrix correctly
allocated.
I'd like to keep the option of translate a matrix before perform the product.
The function below is not working.
void matrixMul(cublasHandle_t handle,float *A,float *B,float *C, int m,int n,int k,int transA,int transB){
cublasStatus_t stat ; // CUBLAS functions status
float alfa = 1;
float beta = 0;
int
ma = transA ? n:m,
na = transA ? m:n,
nb = transB ? k:n,
mb = transB ? n:k;
if(na!=mb){
puts("Something wrong");
}
//(mb,nb)(ma,na) = (mb,na)
stat= cublasSgemm_v2(handle, (cublasOperation_t) transB, (cublasOperation_t)transA,
nb,ma,mb,&alfa,
B,k,
A,n,&beta,
C,m);
switch (stat) {
case CUBLAS_STATUS_SUCCESS:
puts("Sucess");
break;
default:
printf(">>>>ERRO %d<<<<\n",stat);
break;
}
}
The entire source code
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA Samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// CUDA and CUBLAS functions
#include <helper_functions.h>
void getFromDevice(float *h_A,float *d_A,int size){
//printf("Copy input data from the host memory to the CUDA device\n");
cudaError_t err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
//A = (m,n)
//B = (n,k)
//C = (m,k)
void matrixMul(cublasHandle_t handle,float *A,float *B,float *C, int m,int n,int k,int transA,int transB){
cublasStatus_t stat ; // CUBLAS functions status
float alfa = 1;
float beta = 0;
int
ma = transA ? n:m,
na = transA ? m:n,
nb = transB ? k:n,
mb = transB ? n:k;
if(na!=mb){
puts("Something wrong");
}
//(mb,nb)(ma,na) = (mb,na)
stat= cublasSgemm_v2(handle, (cublasOperation_t) transB, (cublasOperation_t)transA,
nb,ma,mb,&alfa,
B,k,
A,n,&beta,
C,m);
switch (stat) {
case CUBLAS_STATUS_SUCCESS:
puts("Sucess");
break;
default:
printf(">>>>ERRO %d<<<<\n",stat);
break;
}
}
float *mallocfDevice(int size){
float *d_C = NULL;
cudaError_t err = cudaMalloc((void **)&d_C, size * sizeof(float));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}else{
size_t freeM, total;
cudaMemGetInfo ( &freeM, &total);
printf("MEM:%.3f\n",freeM,total,100 - ((double)freeM/total)*100 );
}
return d_C;
}
void printHostMatrix(int nl, int nc, float *h_s){
for(int j = 0; j < nl ; j++) {
for(int i = 0; i < (nc) ; i++){
int idx = j*nc + i;
printf("%.2f ", h_s[idx]);
}
printf("\n");
}
}
void printfDeviceMatrix(float *d_s,int m, int p){
float *h_s =(float*) malloc(sizeof(float)*m*p);
getFromDevice(h_s,d_s,sizeof(float)*m*p);
printHostMatrix(m,p,h_s);
free(h_s);
}
void sendTofDevice(float *h_A,float *d_A,int size){
//printf("Copy input data from the host memory to the CUDA device\n");
cudaError_t err = cudaMemcpy(d_A, h_A, size*sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
int main(int argc,char **argv){
int ma = 2,
na = 3,
mb = 3,
nb = 2;
float A[] = { 1,2,3,
4,5,6};
float B[] = {7, 8,
9,10,
11,12};
float *C = new float[ma*nb];
float *d_a = mallocfDevice(ma*mb),
*d_b = mallocfDevice(mb*nb),
*d_c = mallocfDevice(ma*nb);
sendTofDevice(A,d_a,ma*na);
sendTofDevice(B,d_b,mb*nb);
cublasHandle_t handle ; // CUBLAS context
cublasCreate (&handle );
puts("A");
printfDeviceMatrix(d_a,ma,na);
puts("B");
printfDeviceMatrix(d_b,mb,nb);
matrixMul(handle, d_a,d_b,d_c,
ma,na,nb,0,0);
puts("AB=C");
printfDeviceMatrix(d_c,ma,nb);
}
CUBLAS assumes that the matrix in the device is stored in column major:
"
where α and β are scalars, and A , B and C are matrices stored in column-major format with dimensions op ( A ) m × k , op ( B ) k × n and C m × n , respectively. Also, for matrix A
Read more at: http://docs.nvidia.com/cuda/cublas/index.html#ixzz3mSDJTWrM "
That means the matrix needs to be treated as differently on the device than on the host.