How to create out-of-tree QEMU devices?

Two possible mechanisms come to mind:
IPC like the existing QMP and QAPI
QEMU loads a shared library plugin that contains the model
Required capabilities (all of these are of course possible through the C API, but not necessarily through IPC APIs):
inject interrupts
register callbacks for register access
modify main memory
Why I want this:
use QEMU as a submodule and leave its source untouched
additional advantages only present for IPC methods:
write the models in any language I want
use a non-GPL license for my device
I'm aware of in-tree devices, as explained at How to add a new device in QEMU source code?, which are the traditional way of doing things.
What I've found so far:
interrupts: could only find NMI generation with the nmi monitor command
IO ports: IO possible with i and o monitor commands, so I'm fine there
main memory:
the ideal solution would be to map memory to host directly, but that seems hard:
http://kvm.vger.kernel.narkive.com/rto1dDqn/sharing-variables-memory-between-host-and-guest
https://www.linux-kvm.org/images/e/e8/0.11.Nahanni-CamMacdonell.pdf
http://www.fp7-save.eu/papers/SCALCOM2016.pdf
memory read is possible through the x and xp monitor commands
could not find how to write to memory with monitor commands, but the GDB stub API appears to support it, so it should not be too hard to implement
The closest working piece of code I could find was: https://github.com/texane/vpcie , which serializes PCI on both sides and sends it through QEMU's TCP API. But this is less efficient and more intrusive, as it requires extra setup on both guest and host.

This creates an out-of-tree PCI device; it just makes the device show up in lspci. It should ease faster PCI driver development, since it acts as a kernel module.
Can we extend this to have functionality similar to QEMU's edu-pci device?
https://github.com/alokprasad/pci-hacking/blob/master/ksrc/virtual_pcinet/virtual_pci.c
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysfs.h>
#include <linux/fs.h>
#include <linux/kobject.h>
#include <linux/device.h>
#include <linux/proc_fs.h>
#include <linux/types.h>
#include <linux/pci.h>
#include <linux/version.h>
#include <linux/kernel.h>

#define PCI_VENDOR_ID_XTREME 0x15b3
#define PCI_DEVICE_ID_XTREME_VNIC 0x1450

static struct pci_bus *vbus;
static struct pci_sysdata *sysdata;

static DEFINE_PCI_DEVICE_TABLE(vpci_dev_table) = {
    {PCI_DEVICE(PCI_VENDOR_ID_XTREME, PCI_DEVICE_ID_XTREME_VNIC)},
    {0}
};
MODULE_DEVICE_TABLE(pci, vpci_dev_table);

/* Fake config-space reads: answer every query with hard-coded values. */
int vpci_read(struct pci_bus *bus, unsigned int devfn, int where,
              int size, u32 *val)
{
    switch (where) {
    case PCI_VENDOR_ID:
        *val = PCI_VENDOR_ID_XTREME | PCI_DEVICE_ID_XTREME_VNIC << 16;
        /* our id */
        break;
    case PCI_COMMAND:
        *val = 0;
        break;
    case PCI_HEADER_TYPE:
        *val = PCI_HEADER_TYPE_NORMAL;
        break;
    case PCI_STATUS:
        *val = 0;
        break;
    case PCI_CLASS_REVISION:
        *val = (4 << 24) | (0 << 16) | 1;
        /* network class, ethernet controller, revision 1 */ /* 2 or 4 */
        break;
    case PCI_INTERRUPT_PIN:
        *val = 0;
        break;
    case PCI_SUBSYSTEM_VENDOR_ID:
        *val = 0;
        break;
    case PCI_SUBSYSTEM_ID:
        *val = 0;
        break;
    default:
        *val = 0;
        /* sensible default */
    }
    return 0;
}

/* Config-space writes are accepted and discarded. */
int vpci_write(struct pci_bus *bus, unsigned int devfn, int where,
               int size, u32 val)
{
    switch (where) {
    case PCI_BASE_ADDRESS_0:
    case PCI_BASE_ADDRESS_1:
    case PCI_BASE_ADDRESS_2:
    case PCI_BASE_ADDRESS_3:
    case PCI_BASE_ADDRESS_4:
    case PCI_BASE_ADDRESS_5:
        break;
    }
    return 0;
}

struct pci_ops vpci_ops = {
    .read = vpci_read,
    .write = vpci_write
};

void vpci_remove_vnic(void)
{
    struct pci_dev *pcidev = NULL;

    if (vbus == NULL)
        return;
    /* Note: pcidev is never looked up here, so as written this passes NULL. */
    pci_remove_bus_device(pcidev);
    pci_dev_put(pcidev);
}
EXPORT_SYMBOL(vpci_remove_vnic);

void vpci_vdev_remove(struct pci_dev *dev)
{
}

static struct pci_driver vpci_vdev_driver = {
    .name = "Xtreme-Virtual-NIC1",
    .id_table = vpci_dev_table,
    .remove = vpci_vdev_remove
};

/* Create a virtual PCI bus backed by vpci_ops and scan it for the fake device. */
int vpci_bus_init(void)
{
    struct pci_dev *pcidev = NULL;

    sysdata = kzalloc(sizeof(void *), GFP_KERNEL);
    vbus = pci_scan_bus_parented(NULL, 2, &vpci_ops, sysdata);
    //vbus = pci_create_root_bus(NULL, i, &vpci_ops, sysdata, NULL);
    //if (vbus != NULL)
    //    break;
    memset(sysdata, 0, sizeof(void *));
    if (vbus == NULL) {
        kfree(sysdata);
        return -EINVAL;
    }
    if (pci_register_driver(&vpci_vdev_driver) < 0) {
        pci_remove_bus(vbus);
        vbus = NULL;
        return -EINVAL;
    }
    pcidev = pci_scan_single_device(vbus, 0);
    if (pcidev == NULL)
        return 0;
    else
        pci_dev_get(pcidev);
    pci_bus_add_devices(vbus);
    return 0;
}

void vpci_bus_remove(void)
{
    if (vbus) {
        pci_unregister_driver(&vpci_vdev_driver);
        device_unregister(vbus->bridge);
        pci_remove_bus(vbus);
        kfree(sysdata);
        vbus = NULL;
    }
}

static int __init pci_init(void)
{
    printk(KERN_INFO "module loaded\n");
    vpci_bus_init();
    return 0;
}

static void __exit pci_exit(void)
{
    printk(KERN_ALERT "unregister PCI Device\n");
    pci_unregister_driver(&vpci_vdev_driver);
}

module_init(pci_init);
module_exit(pci_exit);
MODULE_LICENSE("GPL");

There is at least one fork of QEMU I'm aware of that offers shared-library plugins... but it is a fork of QEMU 4.0.
https://github.com/cromulencellc/qemu-shoggoth
It is possible to build out-of-tree plugins with this fork, though it's not documented.

On Nov 11 2019, Peter Maydell, a major QEMU contributor, commented on another Stack Overflow question that:
Device plugins are specifically off the menu, because upstream does not want to provide a nice easy mechanism for people to use to have out-of-tree non-GPL/closed-source devices.
So it seems that the QEMU developers opposed this idea as of that point in time. It is still worth learning about the QEMU plugin system, though, which can come in handy for related applications: How to count the number of guest instructions QEMU executed from the beginning to the end of a run?
This is a shame. Imagine if the Linux kernel didn't have a kernel module interface! I suggest that QEMU expose such an interface but leave it explicitly unstable, so that it doesn't impose a maintenance burden on developers, with the upside that out-of-tree device authors would have less painful rebases.
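For reference, here is a minimal sketch of what such an instrumentation plugin looks like (this is the upstream TCG plugin API for observing guest execution, not a device model; build details vary by QEMU version, and the plugin name used below is just an example):
#include <stdio.h>
#include <inttypes.h>
#include <qemu-plugin.h>

QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;

/* Simple counter; not per-vCPU, so only exact for single-core guests. */
static uint64_t insn_count;

static void vcpu_insn_exec(unsigned int vcpu_index, void *userdata)
{
    insn_count++;
}

static void vcpu_tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb *tb)
{
    size_t n = qemu_plugin_tb_n_insns(tb);
    for (size_t i = 0; i < n; i++) {
        struct qemu_plugin_insn *insn = qemu_plugin_tb_get_insn(tb, i);
        qemu_plugin_register_vcpu_insn_exec_cb(insn, vcpu_insn_exec,
                                               QEMU_PLUGIN_CB_NO_REGS, NULL);
    }
}

static void plugin_exit(qemu_plugin_id_t id, void *p)
{
    fprintf(stderr, "executed %" PRIu64 " instructions\n", insn_count);
}

QEMU_PLUGIN_EXPORT int qemu_plugin_install(qemu_plugin_id_t id,
                                           const qemu_info_t *info,
                                           int argc, char **argv)
{
    qemu_plugin_register_vcpu_tb_trans_cb(id, vcpu_tb_trans);
    qemu_plugin_register_atexit_cb(id, plugin_exit, NULL);
    return 0;
}
Built as a shared object and loaded with something like -plugin ./libinsncount.so on the QEMU command line, it can count and trace guest instructions, but it cannot register a device, claim MMIO regions, or inject interrupts, which is exactly the gap discussed above.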

Related

CUDA cudaLaunchCooperativeKernel and grid synchronization

I am trying to understand how to synchronize a grid of threads with cudaLaunchCooperativeKernel.
https://developer.nvidia.com/blog/cooperative-groups/
I have a very simple kernel where two threads update an array, sync and both print the array:
#include <cstdio>
#include <cassert>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void kernel(float *buf){
    cg::grid_group grid = cg::this_grid();
    if (grid.thread_rank() < 2)
        buf[grid.thread_rank()] = 10 + grid.thread_rank();
    assert(grid.is_valid()); // ok!
    grid.sync();
    if (grid.thread_rank() < 2)
        printf("thread=%d: %g %g\n", (int)grid.thread_rank(), buf[0], buf[1]);
}
Instead of printing values (10,11) twice, I get:
thread=0: 10 0
thread=1: 0 11
All CUDA calls were fine, cuda-memcheck is happy, and my card is a "GeForce RTX 2060 SUPER", which does support cooperative kernel launches, checked with:
int supportsCoopLaunch = 0;
if( cudaSuccess != cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev) )
throw std::runtime_error("Cooperative Launch is not supported on this machine configuration.");
I am confused... Why don't I see the synchronization?
This test is incorrect:
int supportsCoopLaunch = 0;
if( cudaSuccess != cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev) )
throw std::runtime_error("Cooperative Launch is not supported on this machine configuration.");
The support (or lack thereof) is not communicated via the cudaError_t return value of the function; instead, it is communicated via the value placed in the supportsCoopLaunch variable. You would want to do something like:
int supportsCoopLaunch = 0;
cudaDeviceGetAttribute(&supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev);
if( supportsCoopLaunch != 1)
throw std::runtime_error("Cooperative Launch is not supported on this machine configuration.");
I found the bug. The actual code was something like this:
__device__ void kernel(float *buf){ /* see the function body above */ }

__global__ void parent_kernel(){
    float buf[2]; // per-thread local buffer!!! grid.sync() will not make it shared!
    kernel(buf);  // different threads get different buffers
}
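For completeness, a hedged sketch of a correct setup, assuming the __global__ kernel(float *buf) from the question: the buffer must live in memory visible to the whole grid (e.g. allocated with cudaMalloc), the kernel must be launched with cudaLaunchCooperativeKernel, and grid-wide sync also requires compiling with -rdc=true. The 2x1 launch shape below is just an example:
int main()
{
    float *d_buf = nullptr;
    cudaMalloc(&d_buf, 2 * sizeof(float));      // grid-visible buffer, not a per-thread array
    void *args[] = { &d_buf };
    // 2 blocks x 1 thread covers thread_rank() 0 and 1; any co-resident shape works.
    cudaLaunchCooperativeKernel((void *)kernel, dim3(2), dim3(1), args);
    cudaDeviceSynchronize();
    cudaFree(d_buf);
    return 0;
}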

Aggregate results from CUDA threads

I am solving the minimal dominant set problem on CUDA. Every thread finds some local candidate result and I need to find the best one. I am using __device__ variables for the global result (dev_bestConfig and dev_bestValue).
I need to do something like this:
__device__ configType dev_bestConfig = 0;
__device__ int dev_bestValue = INT_MAX;

__device__ void findMinimalDominantSet(int count, const int *matrix, Lock &lock)
{
    // here is some algorithm that finds local bestValue and bestConfig

    // set device variables
    if (bestValue < dev_bestValue)
    {
        dev_bestValue = bestValue;
        dev_bestConfig = bestConfig;
    }
}
I know that this does not work because multiple threads access the memory at the same time, so I use this critical section:
// set device variables
bool isSet = false;
do
{
    if (isSet = (atomicCAS(lock.mutex, 0, 1) == 0))
    {
        // critical section goes here
        if (bestValue < dev_bestValue)
        {
            dev_bestValue = bestValue;
            dev_bestConfig = bestConfig;
        }
    }
    if (isSet)
    {
        *lock.mutex = 0;
    }
} while (!isSet);
This actually works as expected, but it is really slow. For example, without this critical section it takes 0.1 seconds, and with this critical section it takes 1.8 seconds.
What can I do differently to make it faster?
I actually avoided any critical sections and locking in the end. I saved the local results to an array and then searched it for the best one. The search can be done sequentially or by parallel reduction, as in the sketch below.
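A minimal sketch of that lock-free approach, assuming an integer-like configType; the names findCandidates, pickBest, localValues and localConfigs are illustrative, not from the original code:
#include <cstdio>
#include <climits>

typedef unsigned long long configType;  // assumption: configType is an integer-like type

// Phase 1: every thread writes its local candidate into its own slot -- no locking needed.
__global__ void findCandidates(int *localValues, configType *localConfigs, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Stand-in for the real per-thread search over the dominant-set space:
    int bestValue = (int)((tid * 2654435761u) % 1000u);
    configType bestConfig = tid;
    localValues[tid] = bestValue;
    localConfigs[tid] = bestConfig;
}

// Phase 2: a single thread scans the per-thread results (a parallel reduction
// would work just as well and scales better).
__global__ void pickBest(const int *localValues, const configType *localConfigs,
                         int n, int *bestValue, configType *bestConfig)
{
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        int v = INT_MAX;
        configType c = 0;
        for (int i = 0; i < n; ++i) {
            if (localValues[i] < v) { v = localValues[i]; c = localConfigs[i]; }
        }
        *bestValue = v;
        *bestConfig = c;
    }
}
The two kernels replace the atomicCAS spin lock entirely: each thread performs one coalesced store, and the final pass reads the n candidates once.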

How to select a GPU with CUDA?

I have a computer with 2 GPUs; I wrote a CUDA C program and I need to tell it somehow that I want to run it on just 1 of the 2 graphics cards; what is the command I need to type and how should I use it? I believe it is somehow related to cudaSetDevice, but I can't really figure out how to use it.
It should be pretty much clear from the documentation of cudaSetDevice, but let me provide the following code snippet.
bool IsGpuAvailable()
{
    int devicesCount;
    cudaGetDeviceCount(&devicesCount);
    for (int deviceIndex = 0; deviceIndex < devicesCount; ++deviceIndex)
    {
        cudaDeviceProp deviceProperties;
        cudaGetDeviceProperties(&deviceProperties, deviceIndex);
        if (deviceProperties.major >= 2
            && deviceProperties.minor >= 0)
        {
            cudaSetDevice(deviceIndex);
            return true;
        }
    }
    return false;
}
This is how I iterated through all available GPUs (cudaGetDeviceCount) looking for the first one with a compute capability of at least 2.0. If such a device was found, I used cudaSetDevice so that all CUDA computations were executed on that particular device. Without calling cudaSetDevice, your CUDA app would execute on the first GPU, i.e. the one with deviceIndex == 0, but which particular GPU that is depends on which GPU is in which PCIe slot.
EDIT:
After clarifying your question in the comments, it seems to me that choosing the device based on its name should suit you. If you are unsure about your actual GPU names, run this code, which will print the names of all your GPUs to the console:
int devicesCount;
cudaGetDeviceCount(&devicesCount);
for (int deviceIndex = 0; deviceIndex < devicesCount; ++deviceIndex)
{
    cudaDeviceProp deviceProperties;
    cudaGetDeviceProperties(&deviceProperties, deviceIndex);
    cout << deviceProperties.name << endl;
}
After that, choose the name of the GPU that you want to use for computations, let's say it is "GTX XYZ". Call the following method from your main method; thanks to it, all CUDA kernels will be executed on the device named "GTX XYZ". You should also check the return value - true if a device with that name is found, false otherwise:
bool SetGPU()
{
    int devicesCount;
    cudaGetDeviceCount(&devicesCount);
    string desiredDeviceName = "GTX XYZ";
    for (int deviceIndex = 0; deviceIndex < devicesCount; ++deviceIndex)
    {
        cudaDeviceProp deviceProperties;
        cudaGetDeviceProperties(&deviceProperties, deviceIndex);
        if (deviceProperties.name == desiredDeviceName)
        {
            cudaSetDevice(deviceIndex);
            return true;
        }
    }
    return false;
}
Of course you have to change the value of the desiredDeviceName variable to the desired value.
Searching more carefully on the internet, I found these lines of code, which select the GPU with the most multiprocessors among all the devices installed in the PC.
int num_devices, device;
cudaGetDeviceCount(&num_devices);
if (num_devices > 1) {
    int max_multiprocessors = 0, max_device = 0;
    for (device = 0; device < num_devices; device++) {
        cudaDeviceProp properties;
        cudaGetDeviceProperties(&properties, device);
        if (max_multiprocessors < properties.multiProcessorCount) {
            max_multiprocessors = properties.multiProcessorCount;
            max_device = device;
        }
    }
    cudaSetDevice(max_device);
}

Issue with periodic discrepancies in cuFFT-FFTW complex-to-real transformations

For my thesis, I have to optimize a special MPI Navier-Stokes solver program with CUDA. The original program uses FFTW for solving several PDEs. In detail, several upper triangular matrices are Fourier transformed in two dimensions, but handled as one-dimensional arrays. For the moment, I'm struggling with parts of the original code (N is always set to 64):
Original:
// Does the complex-to-real in-place FFT and normalizes
void fftC2R(double complex *arr) {
    fftw_execute_dft_c2r(plan_c2r, (fftw_complex*)arr, (double*)arr);
    // Currently ignored: normalization
    /* for(int i=0; i<N*(N/2+1); i++)
           arr[i] /= (double complex)sqrt((double complex)(N*N)); */
}

void doTimeStepETDRK2_nonlin_original() {
    // calc velocity
    ux[0] = 0;
    uy[0] = 0;
    for(int i=1; i<N*(N/2+1); i++) {
        ux[i] =  I*kvec[1][i]*qvec[i] / kvec[2][i];
        uy[i] = -I*kvec[0][i]*qvec[i] / kvec[2][i];
    }
    fftC2R(ux);
    fftC2R(uy);
    //do some stuff here...
    //...
    return;
}
Where ux and uy are allocated as (double complex arrays):
ux = (double complex*)fftw_malloc(N*(N/2+1) * sizeof(double complex));
uy = (double complex*)fftw_malloc(N*(N/2+1) * sizeof(double complex));
The fft-plan is created as:
plan_c2r = fftw_plan_dft_c2r_2d(N, N,(fftw_complex*) qvec, (double*)qvec, FFTW_ESTIMATE);
Where qvec is allocated the same way as ux and uy and has data type double complex.
Here are the relevant parts of the CUDA code:
NN2_VecSetZero_and_init<<<block_size,grid_size>>>();
cudaSafeCall(cudaDeviceSynchronize());
cudaSafeCall(cudaGetLastError());
int err = (int)cufftExecZ2D(cu_plan_c2r, (cufftDoubleComplex*)sym_ux, (cufftDoubleReal*)sym_ux);
if (err != CUFFT_SUCCESS) {
    exit(EXIT_FAILURE);
    return;
}
err = (int)cufftExecZ2D(cu_plan_c2r, (cufftDoubleComplex*)sym_uy, (cufftDoubleReal*)sym_uy);
if (err != CUFFT_SUCCESS) {
    exit(EXIT_FAILURE);
    return;
}
//do some stuff here...
//...
return;
Where sym_ux and sym_uy are allocated as:
cudaMalloc((void**)&sym_ux, N*(N/2+1)*sizeof(cufftDoubleComplex));
cudaMalloc((void**)&sym_uy, N*(N/2+1)*sizeof(cufftDoubleComplex));
The initialization of the relevant cufft parts looks like
if (cufftPlan2d(&cu_plan_c2r, N, N, CUFFT_Z2D) != CUFFT_SUCCESS) {
    exit(EXIT_FAILURE);
    return -1;
}
if (cufftPlan2d(&cu_plan_r2c, N, N, CUFFT_D2Z) != CUFFT_SUCCESS) {
    exit(EXIT_FAILURE);
    return -1;
}
if (cufftSetCompatibilityMode(cu_plan_c2r, CUFFT_COMPATIBILITY_FFTW_ALL) != CUFFT_SUCCESS) {
    exit(EXIT_FAILURE);
    return -1;
}
if (cufftSetCompatibilityMode(cu_plan_r2c, CUFFT_COMPATIBILITY_FFTW_ALL) != CUFFT_SUCCESS) {
    exit(EXIT_FAILURE);
    return -1;
}
So I use full FFTW compatibility mode and call every function with the FFTW calling pattern.
When I run both versions, I receive almost equal results for ux and uy (sym_ux and sym_uy). But at periodic positions in the arrays, cuFFT seems to ignore these elements, whereas FFTW sets the real part of these elements to zero and computes the imaginary parts (the arrays are too large to show here). The stride at which this occurs is N/2+1, so I believe I haven't completely understood the FFT padding conventions of cuFFT versus FFTW.
I can rule out any earlier discrepancies between these arrays before the cuFFT executions are called, so no other arrays in the code above are relevant here.
My question is: am I being too optimistic in using almost 100% of the FFTW calling style? Do I have to handle my arrays before the FFTs? The cuFFT documentation says I'd have to resize the data input and output arrays. But how can I do this when I'm running in-place transformations? I really wouldn't like to stray too far from the original code, and I don't want to add extra copy instructions for each FFT call, because memory is limited and the arrays should remain and be processed on the GPU as long as possible.
I'm thankful for every hint, critical statement or idea!
My configuration:
Compiler: gcc 4.6 (C99 Standard)
MPI-Package: mvapich2-1.5.1p1 (shouldn't play a role because of reduced single processing debug mode)
CUDA-Version: 4.2
GPU: CUDA-arch-compute_20 (NVIDIA GeForce GTX 570)
FFTW 3.3
Once when I had to work with CUFFT, the only solution I could get to work was to use "cu_plan_c2c" exclusively - there are easy transformations between real and complex arrays:
- fill the imaginary part with 0 to emulate cu_plan_r2c
- use atan2 (not atan) on the complex result to emulate cu_plan_c2r
Sorry for not pointing you to a better solution, but this is how I ended up solving the problem. I hope you don't get into any hard trouble with low memory on the CPU side...
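To make the first of those two tricks concrete, here is a hedged sketch; widen_real and the surrounding names are illustrative and not from the original code. It fills the imaginary part with zero so that a Z2Z plan can stand in for the D2Z one:
#include <cufft.h>

// Widen a real array to cufftDoubleComplex with zero imaginary part, so that a
// CUFFT_Z2Z plan can be used in place of a CUFFT_D2Z one.
__global__ void widen_real(const double *in, cufftDoubleComplex *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i].x = in[i]; // real part
        out[i].y = 0.0;   // imaginary part
    }
}

// ...afterwards: cufftExecZ2Z(plan_c2c, out, out, CUFFT_FORWARD);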

CUDA block synchronization differences between GTS 250 and Fermi devices

So I've been working on a program in which I'm creating a hash table in global memory. The code is completely functional (albeit slower) on a GTS 250, which is a Compute 1.1 device. However, on a Compute 2.0 device (C2050 or C2070) the hash table is corrupt (data is incorrect and pointers are sometimes wrong).
Basically the code works fine when only one block is utilized (both devices). However, when 2 or more blocks are used, it works only on the GTS250 and not on any Fermi devices.
I understand that the warp scheduling and memory architecture between the two platforms are different, and I am taking that into account when developing the code. From my understanding, using __threadfence() should make sure any global writes are committed and visible to other blocks; however, judging from the corrupt hash table, it appears that they are not.
I've also posted the problem on the NVIDIA CUDA developer forum and it can be found here.
Relevant code below:
__device__ void lock(int *mutex) {
    while (atomicCAS(mutex, 0, 1) != 0);
}

__device__ void unlock(int *mutex) {
    atomicExch(mutex, 0);
}

__device__ void add_to_global_hash_table(unsigned int key, unsigned int count, unsigned int sum, unsigned int sumSquared, Table table, int *globalHashLocks, int *globalFreeLock, int *globalFirstFree)
{
    // Find entry if it exists
    unsigned int hashValue = hash(key, table.count);

    lock(&globalHashLocks[hashValue]);

    int bucketHead = table.entries[hashValue];
    int currentLocation = bucketHead;
    bool found = false;
    Entry currentEntry;

    while (currentLocation != -1 && !found) {
        currentEntry = table.pool[currentLocation];
        if (currentEntry.data.x == key) {
            found = true;
        } else {
            currentLocation = currentEntry.next;
        }
    }

    if (currentLocation == -1) {
        // If entry does not exist, create entry
        lock(globalFreeLock);
        int newLocation = (*globalFirstFree)++;
        __threadfence();
        unlock(globalFreeLock);

        Entry newEntry;
        newEntry.data.x = key;
        newEntry.data.y = count;
        newEntry.data.z = sum;
        newEntry.data.w = sumSquared;
        newEntry.next = bucketHead;

        // Add entry to table
        table.pool[newLocation] = newEntry;
        table.entries[hashValue] = newLocation;
    } else {
        currentEntry.data.y += count;
        currentEntry.data.z += sum;
        currentEntry.data.w += sumSquared;
        table.pool[currentLocation] = currentEntry;
    }
    __threadfence();
    unlock(&globalHashLocks[hashValue]);
}
As pointed out by LSChien in this post, the issue is with L1 cache coherency. While __threadfence() guarantees that shared and global memory writes become visible to other threads, because it is not atomic, thread x in block 1 may still read a stale cached value until thread y in block 0 has executed up to the threadfence instruction. LSChien instead suggested a hack in his post: using atomicCAS() to force the thread to read from global memory rather than from a cached value. The proper way to do this is to declare the memory as volatile, which requires that every write to that memory be made visible to all other threads in the grid immediately.
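For illustration, a minimal sketch of that volatile-plus-__threadfence pattern for publishing a value from one block to another; this is not the original hash-table code, and it assumes a two-block launch so both blocks are resident at the same time:
#include <cstdio>

__device__ volatile int flag = 0;   // volatile: reads are not served from a stale L1 line
__device__ volatile int data = 0;

__global__ void producer_consumer()
{
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        data = 42;            // write the payload
        __threadfence();      // order the payload write before the flag write
        flag = 1;             // publish
    } else if (blockIdx.x == 1 && threadIdx.x == 0) {
        while (flag == 0) { } // spins until the flag write becomes visible
        printf("data = %d\n", data);
    }
}
Without the volatile qualifier, the consumer block on Fermi may keep re-reading a cached value and either spin forever or read stale data, which is the same class of failure as the corrupt hash table described above.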
__threadfence() guarantees that the calling thread's prior writes to global memory are visible to other threads on the device before it returns. That is not the same as "the write operation on global memory is complete"! Think of the caching on each multiprocessor.