RISC-V fuzzing emulation - qemu

I am new to this but I need to emulate RISC-V using qemu. As a start for my fuzzing project, how can I do give qemu an instruction set and get the changes in the registries as an output.

I probably understand your question. Because I don't have a riscv-related environment here, I can only provide a solution.
For example, in riscv, we design a function to get the values of all registers, relying on qemu's plugin module (such as qemu_plugin_register_vcpu_insn_exec_cb()).
plugin_test.c
#include <inttypes.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>
#include <glib.h>
#include <qemu-plugin.h>
QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;
#define CPU_SIZE 32
static int cpu_num;
static int cpu_value[CPU_SIZE]={0};
static void vcpu_insn_exec_before(unsigned int cpu_index, void *)
{
for (size_t i = 0; i < cpu_num; i++)
{
/* code */
for (size_t j = 0; j < CPU_SIZE; i++)
{
if(cpu_value[j] != get_cpu_register(i,j)) {
// The value of cpu has changed
...
} else {
// The value of cpu has not changed
...
}
}
}
}
static void vcpu_tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb *tb)
{
size_t n = qemu_plugin_tb_n_insns(tb);
size_t i;
for (i = 0; i < n; i++) {
struct qemu_plugin_insn *insn = qemu_plugin_tb_get_insn(tb, i);
qemu_plugin_register_vcpu_insn_exec_cb(
insn, vcpu_insn_exec_before, QEMU_PLUGIN_CB_NO_REGS,void *);
}
}
static void plugin_exit(qemu_plugin_id_t id, void *p)
{
}
QEMU_PLUGIN_EXPORT int qemu_plugin_install(qemu_plugin_id_t id,
const qemu_info_t *info,
int argc, char **argv)
{
if(info->system_emulation) {
cpu_num = info->system.smp_vcpus;
} else {
cpu_num = 1;
}
qemu_plugin_register_vcpu_tb_trans_cb(id, vcpu_tb_trans);
qemu_plugin_register_atexit_cb(id, plugin_exit, NULL);
return 0;
}
api-ext.c
void *qemu_get_cpu(int index);
static uint32_t get_cpu_register(unsigned int cpu_index, unsigned int reg) {
uint8_t* cpu = qemu_get_cpu(cpu_index);
return *(uint32_t*)(cpu + 33488 + 5424 + reg * 4);
}
It should be noted that the content in api-ext.c is obtained from others. This is the function used to obtain the value of arm cpu. You need to check the source code or documentation for riscv.

Related

Using host class member pointing to device memory in device code

I want to have an instance of a Container class allocating some device and host memory on initialization. I want to use the allocated memory in device code, without passing the actual pointer (for API reasons).
How do I create a global __device__ pointer to the member pointing to the device memory? I am happy to use thrust if that helps.
Here is a small example:
#include <iostream>
struct Container {
int *h_int = (int*)malloc(4*sizeof(int));
int *d_int;
Container() {
h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
cudaMalloc(&d_int, 4*sizeof(int));
memcpyHostToDevice();
}
void memcpyHostToDevice() {
cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
}
void memcpyDeviceToHost() {
cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
}
};
Container stuff;
__device__ auto d_int = &stuff.d_int; // How do I get that right?
__global__ void edit() { // To keep the API simple I do not want to pass the pointer
auto i = blockIdx.x*blockDim.x + threadIdx.x;
d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}
int main(int argc, char const *argv[]) {
edit<<<4, 1>>>();
stuff.memcpyDeviceToHost();
std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
return 0;
}
There are two problems here:
You can't statically inititalize a __device__ variable in the way you are trying to (and the value you are trying to apply isn't correct either). The CUDA runtime API contains a function for initialising global scope device symbols. Use that instead.
Your global scope declaration of stuff shouldn't work either for a number of subtle reasons discussed here (it is technically undefined behaviour). Declare it at main scope instead.
Putting these two things together should lead your to do something like this instead:
__device__ int* d_int;
// ...
int main(int argc, char const *argv[]) {
Container stuff;
cudaMemcpyToSymbol(d_int, &stuff.dint, sizeof(int*));
edit<<<4, 1>>>();
// ...
Here is a fully worked example:
$ cat t1199.cu
#include <iostream>
struct Container {
int *h_int = (int*)malloc(4*sizeof(int));
int *d_int;
Container() {
h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
cudaMalloc(&d_int, 4*sizeof(int));
memcpyHostToDevice();
}
void memcpyHostToDevice() {
cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
}
void memcpyDeviceToHost() {
cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
}
};
//Container stuff;
__device__ int *d_int; // = &stuff.d_int; // How do I get that right?
__global__ void edit() { // To keep the API simple I do not want to pass the pointer
auto i = blockIdx.x*blockDim.x + threadIdx.x;
d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}
int main(int argc, char const *argv[]) {
Container stuff;
cudaMemcpyToSymbol(d_int, &stuff.d_int, sizeof(int *));
edit<<<4, 1>>>();
stuff.memcpyDeviceToHost();
std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
return 0;
}
$ nvcc -std=c++11 -o t1199 t1199.cu
$ cuda-memcheck ./t1199
========= CUDA-MEMCHECK
1337
========= ERROR SUMMARY: 0 errors
$

Error memcpy in device code

I write a code that get first _var positions of a vector of possibilities (i.e., matrix _size*_var with _var=3 and _size=27) and calling this function in my kernel (32 threads, ie, each has an object) but I do not get any return value of the function neither the NULL pointer.
The program exit without error but the printf lines in the kernel is not executed or displayed (even compiled with sm_20 or higher) as if the program stopped before.
dataIntern.h:
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define _MIN -1
#define _MAX 1
#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif
template <class a_type>
class dataIntern{
private:
a_type *possibilities;
int _assign;
int _size;
int _var;
int _maxsize;
public:
CUDA_CALLABLE_MEMBER dataIntern(){
}
CUDA_CALLABLE_MEMBER dataIntern(int var){
_var = var;
_size = (int)pow(3.0, (double)_var);
_maxsize = _size * _var;
_assign = 1;
possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
if(!possibilities){
exit(1);
}
createTable();
}
CUDA_CALLABLE_MEMBER void createTable(){
int i, j, k, limit, pos;
a_type value;
if(_assign == 1){
for(i=0; i<_var; i++){
#ifdef __CUDA_ARCH__
limit = (int)pow(3.0, _var-i-1);
#else
limit = (int)pow(3.0, (double)_var-i-1);
#endif
value = (a_type)_MIN;
k = 0;
for(j=0; j<_size; j++){
pos = _var*j+i;
if(k >= limit){
value++;
if(value > _MAX){
value = (a_type)_MIN;
}
k = 0;
}
possibilities[pos] = value;
k++;
}
}
}
}
CUDA_CALLABLE_MEMBER void print(){
int i;
printf("Printing.\n");
if(_assign == 1){
for(i=0; i<_size*_var; i++){
printf("%d ", possibilities[i]);
if(i%_var == _var-1){
printf("\n");
}
}
}
else{
printf("Not assigned.\n");
}
}
CUDA_CALLABLE_MEMBER void retify(int posChanged, a_type valueRetified){
int i, pos, count, initpos, attrib;
a_type *newnode;
a_type *newlist = NULL, *morelist = NULL;
pos = posChanged;
initpos = 0;
count = 0;
if(_assign == 1){
attrib = 0;
newnode = (a_type*)malloc(_var*sizeof(a_type));
for(i=0; i<_size; i++){
if(possibilities[pos] == valueRetified){
memcpy(newnode, &possibilities[i*_var], _var*sizeof(a_type));
count++;
if(newlist!=NULL){
morelist = (a_type*)malloc(count*_var*sizeof(a_type));
memcpy(morelist, newlist, (count-1)*_var*sizeof(a_type));
}
newlist = (a_type*)malloc(count*_var*sizeof(a_type));
memcpy(newlist, morelist, (count-1)*_var*sizeof(a_type));
memcpy(&newlist[initpos], newnode, _var*sizeof(a_type));
initpos+=_var;
attrib = 1;
}
pos+=_var;
}
if(attrib == 1){
_size = count;
possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
if(possibilities == NULL){
printf("Allocation fail in newlist retify.\n");
exit(1);
}
memcpy(possibilities, newlist, _size*_var*sizeof(a_type));
}
else{
_assign = 0;
}
}
}
CUDA_CALLABLE_MEMBER a_type* unstack(){
a_type* solution = NULL, *backup = NULL;
if(_assign == 1){
if(_size>0){
backup = (a_type*)malloc(_var*_size*sizeof(a_type));
if(backup == NULL){
printf("Erro to alloc backup pointer on unstack function in data intern\n");
return NULL;
}
solution = (a_type*)malloc(_var*sizeof(a_type));
if(solution == NULL){
printf("Erro to alloc solution pointer on unstack function in data intern\n");
return NULL;
}
memcpy(backup, possibilities, _size*_var*sizeof(a_type));
memcpy(solution, possibilities, _var*sizeof(a_type));
free(possibilities);
_size--;
possibilities = (a_type*)malloc(_size*_var*sizeof(a_type));
if(possibilities == NULL){
printf("Error to realloc possibilities pointer in data intern\n");
return NULL;
}
memcpy(possibilities, &backup[_var], _size*_var*sizeof(a_type));
free(backup);
return solution;
}
}
return NULL;
}
CUDA_CALLABLE_MEMBER int get_size(){
return _size;
}
CUDA_CALLABLE_MEMBER ~dataIntern(){
_assign = 0;
if(possibilities)
free(possibilities);
}
};
deviceCode.h:
#ifndef DEVICECODE_H
#define DEVICECODE_H
void CallingInMain();
__global__ void kernel();
#endif
deviceCode.cu:
#include "deviceCode.h"
#include "dataIntern.h"
#include <iostream>
#include <stdio.h>
//I declared like this to my kernel:
__global__ void kernel(){
__shared__ dataIntern<int> data[32];
int *vetor;
vetor = NULL;
data[threadIdx.x] = dataIntern<int>(3);
//_var == 3 in the class above
vetor = (int*)malloc(sizeof(int)*3);
vetor = data[threadIdx.x].unstack();
while(vetor!=NULL){
//never past here
printf("%d %d %d %d\n", threadIdx.x, vetor[0], vetor[1], vetor[2]);
vetor = data[threadIdx.x].unstack();
}
//neither here in if or else
if(vetor)
printf("Not null\n");
else
printf("Null final\n");
free(vetor);
}
void CallingInMain(){
kernel<<<1, 32>>>();
cudaDeviceSynchronize();
}
main.cu:
#include <iostream>
#include <stdio.h>
#ifndef deviceCode_H
#include "deviceCode.h"
#endif
int main(int argc, char* argv[]){
CallingInMain();
return 0;
}
Some colleagues pointed out to me that your code seems to have an error in it.
Consider this line in your kernel:
data[threadIdx.x] = dataIntern<int>(3);
This line instantiates a temporary dataIntern<int> object, runs the constructor with a value of 3 on it, and then does a copy from that object to the storage in data[threadIdx.x]. Note that the constructor performs a malloc operation:
CUDA_CALLABLE_MEMBER dataIntern(int var){
...
possibilities = (a_type*)malloc(_maxsize*sizeof(a_type));
But since the original object is temporary, the C++ standard allows the object to be deleted at the termination of the statement, i.e. at the semicolon here:
data[threadIdx.x] = dataIntern<int>(3);
^
after the copy-construction process is complete. But the deletion of the object triggers the destructor, which does a free operation on possibilities:
CUDA_CALLABLE_MEMBER ~dataIntern(){
_assign = 0;
if(possibilities)
free(possibilities);
}
Therefore usage of the pointer so allocated subsequent to this line of code:
data[threadIdx.x] = dataIntern<int>(3);
such as in unstack here:
vetor = data[threadIdx.x].unstack();
will be invalid.
This is a violation of C++ programming rules, and the error is not specific to CUDA.

why can't I return local variable?

I don't know why c++'s rule says that I can't return local variable?
this simple "max" function totally has no problem;
int max(int a,int b)
{
int maxnum;
if(a>b)maxnum= a;
else maxnum= b;
return maxnum;
}
int main( int argc, char** argv )
{
cout<<max(10,30);
system("PAUSE");
return 0;
}
but, when I want to return local variable as before in opencv,it turns out wrong data
why can't I return local variable here?
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <iostream>
#include <sstream>
using namespace cv;
using namespace std;
Mat GetIntensity(Mat *pic,int y,int x)
{
float Array[6];
for(int i=0;i<6;i++)
{
Array[i]=pic[i].at<uchar>(y,x);
}
Mat Intensity(6,1,CV_32FC1,&Array);
cout<<Intensity;
return Intensity;
}
int main( int argc, char** argv )
{
stringstream ss;
string pic_name;
Mat pic[6];
for(int i=0;i<6;i++)
{
ss.clear();
ss.str(std::string());
ss<<"pic"<<i<<".bmp"<<endl;
ss>>pic_name;
pic[i] = imread(pic_name, CV_LOAD_IMAGE_GRAYSCALE);
if(! pic[i].data )
{// Check for invalid input
cout << "Could not open or find the image" << endl ;
system("pause");
return -1;
}
//namedWindow( pic_name, CV_WINDOW_AUTOSIZE );
// Create a window for display.
//imshow(pic_name, pic[i] );
}
cout<<=GetIntensity(pic,60,60);
system("PAUSE");
//waitKey(0);
return 0;
}
your construct here uses a local array to hold the values. this will get invalid, once your Mat leaves the scope:
Mat GetIntensity(Mat *pic,int y,int x)
{
float Array[6]; //problem
for(int i=0;i<6;i++)
{
Array[i]=pic[i].at<uchar>(y,x);
}
Mat Intensity(6,1,CV_32FC1,&Array); // problem
cout<<Intensity;
return Intensity;
}
better create a Mat with it's own data:
Mat GetIntensity(Mat *pic,int y,int x)
{
Mat Intensity(6,1,CV_32FC1); // no problem
for(int i=0;i<6;i++)
{
Intensity.at<float>(i) = pic[i].at<uchar>(y,x);
}
cout<<Intensity;
return Intensity;
}

Simpson's method to integrate real valued functions with CUDA

I'm trying to code integration by Simpson's method in CUDA.
This is the formula for Simpson's rule
where x_k = a + k*h.
Here's my code
__device__ void initThreadBounds(int *n_start, int *n_end, int n,
int totalBlocks, int blockWidth)
{
int threadId = blockWidth * blockIdx.x + threadIdx.x;
int nextThreadId = threadId + 1;
int threads = blockWidth * totalBlocks;
*n_start = (threadId * n)/ threads;
*n_end = (nextThreadId * n)/ threads;
}
__device__ float reg_func (float x)
{
return x;
}
typedef float (*p_func) (float);
__device__ p_func integrale_f = reg_func;
__device__ void integralSimpsonMethod(int totalBlocks, int totalThreads,
double a, double b, int n, float p_function(float), float* result)
{
*result = 0;
float h = (b - a)/n;
//*result = p_function(a)+p_function(a + h * n);
//parallel
int idx_start;
int idx_end;
initThreadBounds(&idx_start, &idx_end, n-1, totalBlocks, totalThreads);
//parallel_ends
for (int i = idx_start; i < idx_end; i+=2) {
*result += ( p_function(a + h*(i-1)) +
4 * p_function(a + h*(i)) +
p_function(a + h*(i+1)) ) * h/3;
}
}
__global__ void integralSimpson(int totalBlocks, int totalThreads, float* result)
{
float res = 0;
integralSimpsonMethod(totalBlocks, totalThreads, 0, 10, 1000, integrale_f, &res);
result[(blockIdx.x*totalThreads + threadIdx.x)] = res;
//printf ("Simpson method\n");
}
__host__ void inttest()
{
const int blocksNum = 32;
const int threadNum = 32;
float *device_resultf;
float host_resultf[threadNum*blocksNum]={0};
cudaMalloc((void**) &device_resultf, sizeof(float)*threadNum*blocksNum);
integralSimpson<<<blocksNum, threadNum>>>(blocksNum, threadNum, device_resultf);
cudaThreadSynchronize();
cudaMemcpy(host_resultf, device_resultf, sizeof(float) *threadNum*blocksNum,
cudaMemcpyDeviceToHost);
float sum = 0;
for (int i = 0; i != blocksNum*threadNum; ++i) {
sum += host_resultf[i];
// printf ("result in %i cell = %f \n", i, host_resultf[i]);
}
printf ("sum = %f \n", sum);
cudaFree(device_resultf);
}
int main(int argc, char* argv[])
{
inttest();
int i;
scanf ("%d",&i);
}
The problem is: it works wrong when n is lower than 100000. For an integral from 0 to 10, the result is ~99, but when n = 100000 or larger it works fine and the result is ~50.
What's wrong, guys?
The basic problem here is that you don't understand your own algorithm.
Your integralSimpsonMethod() function is designed such that each thread is sampling at least 3 quadrature points per sub-interval in the integral domain. Therefore, if you choose n so that it is less than four times the number of threads in the kernel call, it is inevitable that each sub interval will overlap and the resulting integral will be incorrect. You need to make sure that the code checks and scales the thread count or n so that they don't produce overlap when the integral is computed.
If you are doing this for anything other than self-edification, then I recommend you look up the composite version of Simpson's rule. This is much better suited to parallel implementation and will be considerably more performant if implemented correctly.
I would propose an approach to Simpson's integration by using CUDA Thrust. You basically need five steps:
Generate the Simpson's quadrature weights;
Generate the function sampling points;
Generate the function values;
Calculate the elementwise product between the quadrature weights and the function values;
Sum the above products.
Step #1 requires creating an array with elements repeated many times, namely, 1 4 2 4 2 4 ... 1 for the Simpson's case. This can be accomplished by borrowing Robert Crovella's approach in cuda thrust library repeat vector multiple times.
Step #2 can be accomplished by using couting_iterators and borrowing talonmies approach in Purpose and usage of counting_iterators in CUDA Thrust library.
Step #3 is an application of thrust::transform.
Steps #4 and #5 can be accomplished together by thrust::inner_product.
This approach can be exploited also for use when other quadrature integration rules are of interest.
Here is the code
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <thrust/copy.h>
#include <ostream>
#define STRIDE 2
#define N 100
#define pi_f 3.14159265358979f // Greek pi in single precision
struct sin_functor
{
__host__ __device__
float operator()(float x) const
{
return sin(2.f*pi_f*x);
}
};
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
int main(void)
{
// --- Generate the integration coefficients
thrust::host_vector<float> h_coefficients(STRIDE);
h_coefficients[0] = 4.f;
h_coefficients[1] = 2.f;
thrust::device_vector<float> d_coefficients(N);
typedef thrust::device_vector<float>::iterator Iterator;
strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-2, STRIDE);
strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);
thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);
d_coefficients[0] = 1.f;
d_coefficients[N-1] = 1.f;
// print the generated d_coefficients
std::cout << "d_coefficients: ";
thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " ")); std::cout << std::endl;
// --- Generate sampling points
float a = 0.f;
float b = .5f;
float Dx = (b-a)/(float)(N-1);
thrust::device_vector<float> d_x(N);
thrust::transform(thrust::make_counting_iterator(a/Dx),
thrust::make_counting_iterator((b+1.f)/Dx),
thrust::make_constant_iterator(Dx),
d_x.begin(),
thrust::multiplies<float>());
// --- Calculate function values
thrust::device_vector<float> d_y(N);
thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());
// --- Calculate integral
float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);
printf("The integral is = %f\n", integral);
getchar();
return 0;
}

thrust::device_reference can't be used with printf?

I am using the thrust partition function to partition array into even and odd numbers. However, when i try to display the device vector, it shows random values. Please let me know where is the error. I think i have done everything correct.
#include<stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include<thrust/partition.h>
struct is_even
{
//const int toCom;
//is_even(int val):toCom(val){}
__device__
bool operator()(const int &x)
{
return x%2;
}
};
void main(){
thrust::host_vector<int> H(6);
for(int i =0 ; i<H.size();i++){
H[i] = i+1;
}
thrust::device_vector<int> D = H;
thrust::partition(D.begin(),D.end(),is_even());
for(int i =0 ;i< D.size();i++){
printf("%d,",D[i]);
}
getchar();
}
You can't send a thrust::device_reference (i.e., the result of D[i]) through printf's ellipsis because it is not a POD type. See the documentation. Your code will produce a compiler warning to this effect.
Cast to int first:
for(int i = 0; i < D.size(); ++i)
{
printf("%d,", (int) D[i]);
}