On a mips platform, I am trying to get Unwind work. Currently if I issue print_trace manually stack trace is correctly shown as below:
backtrace_helper 0x4b6958
backtrace_helper 0x4b6ab4
backtrace_helper 0x2ac2f628
Obtained 3 stack frames.
./v(print_trace+0x38) [0x4b6958]
./v(main+0x90) [0x4b6ab4]
/lib/libc.so.0(__uClibc_main+0x24c) [0x2ac2f628]
But when a SIGSEGV occurs, stack trace does not show correct function call sequence.
backtrace_helper 0x4b7a74
backtrace_helper 0x2ab9b84c
Obtained 2 stack frames.
./v(getLineIDByPhyIdx+0x3d8) [0x4b7a74]
/lib/libpthread.so.0(__new_sem_post+0x2c8) [0x2ab9b84c]
I am compiling with -g -fexceptions -rdynamic. Also I have seen How to generate a stacktrace when my gcc C++ app crashes in which 2nd answer mentiones about wrong address but when I set as he does but it only changes 2nd frame and rest is the same, code snippet is below:
caller_address = (void *) uc->uc_mcontext.gregs[30]; // Frame pointer (from wikipedia here)
fprintf(stderr, "signal %d (%s), address is %p from %p\n",
sig_num, strsignal(sig_num), info->si_addr,
(void *)caller_address);
size = backtrace(array, 50);
/* overwrite sigaction with caller's address */
array[1] = caller_address;
messages = backtrace_symbols(array, size);
Code:
int main(int argc, char *argv[]) {
registerSignalHandler(signalHandler);
print_trace();
{
// Seg Fault
int *p = NULL;
*p = 54;
}
}
void print_trace(void) {
void *array[10];
size_t size;
char **strings;
size_t i;
/* Get the address at the time the signal was raised from the EIP (x86) */
size = backtrace(array, 10);
strings = backtrace_symbols(array, size);
printf("Obtained %zd stack frames.\n", size);
for (i = 0; i < size; i++)
printf("%s\n", strings[i]);
free(strings);
}
static _Unwind_Reason_Code
backtrace_helper (struct _Unwind_Context *ctx, void *a)
{
struct trace_arg *arg = a;
assert (unwind_getip != NULL);
/* We are first called with address in the __backtrace function. Skip it. */
if (arg->cnt != -1) {
arg->array[arg->cnt] = (void *) unwind_getip (ctx);
printf("backtrace_helper %p \n", arg->array[arg->cnt]);
}
if (++arg->cnt == arg->size)
return _URC_END_OF_STACK;
return _URC_NO_REASON;
}
/*
* Perform stack unwinding by using the _Unwind_Backtrace.
*
* User application that wants to use backtrace needs to be
* compiled with -fexceptions option and -rdynamic to get full
* symbols printed.
*/
int backtrace (void **array, int size)
{
struct trace_arg arg = { .array = array, .size = size, .cnt = -1 };
if (unwind_backtrace == NULL)
backtrace_init();
if (size >= 1)
unwind_backtrace (backtrace_helper, &arg);
return arg.cnt != -1 ? arg.cnt : 0;
}
void signalHandler( int sig, siginfo_t* siginfo, void* notused)
{
/* Print out the signal info */
signalInfo(sig, siginfo);
switch (sig) {
case SIGSEGV:
{
print_trace();
abort();
}
}
}
Frame pointer is practically never used on MIPS and obtaining backtrace without digging into symbols requires some heuristics.
Typical approach is to analyze code preceding current instruction pointer and try finding function prologue that adjusts SP. Using that info one can figure out location of preceding frame, etc.
See these slides for some of the gory details:
http://elinux.org/images/0/07/Intricacies_of_a_MIPS_Stack_Backtrace_Implementation.pdf
Related
So, I'm writing a DQL Neural network. When I run the code through the debugger, it throws this exception
Exception thrown at 0x00D05DFB in Deep Q-learning Neural
Network.exe: 0xC0000005: Access violation writing location 0x00000003.
Here is the relevant code:
The neuron structure:
typedef struct neuron_t {
// not entirely relevant to this problem, but it's nicer to have the full picture
float activation;
float* outputWeights;
float bias;
float z;
float dactv;
float* dw;
float dbias;
float dz;
}Neuron;
The layer structure:
typedef struct layer_t {
int numberOfNeurons;
Neuron* neu;
}Layer;
the main function:
Layer* testTarget = NULL;
Layer* test = NULL;
int numberOfLayers = 3;
int* neuronInlayer[] = { 2,3,4 };
test = createPredictionArchitecture(test, testTarget, numberOfLayers, neuronInlayer); /* I plan to turn this into merely
createPredictionArchitecture(test, testTarget, numberOfLayers, neuronInlayer) once this problem
is fixed.*/
The relevant bit of the createArchitecture function:
int createArchitecture(Layer* predictionNetwork, Layer* targetNetwork, int numberOfLayers, int* neuronInEachLayer) {
predictionNetwork = (Layer*)malloc(numberOfLayers * sizeof(Layer)); if (predictionNetwork == NULL) { fprintf(stderr, "Failed to allocate memory to 'predictionNetwork' in line %d\n ", __LINE__); exit(1); }
else {
printf("Memory successfully allocated to 'predictionNetwork'\n");
}
targetNetwork = (Layer*)malloc(numberOfLayers * sizeof(Layer)); if (targetNetwork == NULL) { fprintf(stderr, "Failed to allocate memory to 'predictionNetwork' in line %d\n ", __LINE__); exit(1); }
else {
printf("Memory successfully allocated to 'targetNetwork'\n");
}
Layer** targetNW = &targetNetwork;
...
}
The exception occurs when I do this:
*targetNW[i] = createLayer(neuronInEachLayer[i]);
Why does this happen, and how should I fix it?
And also, do say if you need to see more of the code.
Well, I figured out the problem; I wasn't assigning any memory to targetNW.
I want to experiment with the programs that write programs in C code, and i want to use construction like following:
int main() {
char* srcCode="int f(int x) { return x+42; }";
int (*compiledFun)(int) = compile(srcCode);
printf("result=%d", (*compiledFun)(123));
return 0;
}
Desired output should be printed "result=165".
My question is about compile() function. I may try to put srcCode in a file, then invoke external compiler, like gcc, then try to read produced binary, probably fix some addresses, and so to fill the compiledFun memory. But I feel like that would be a very inefficient stub. Is there any way to compile a program from within a program, directly from memory to memory? Maybe some library or a subset that can be ripped off gcc sources, responsible for producting binary code from source text?
That may be important addition, all source code that should be compiled is a function that takes arguments and returns. It will not call any external libraries and function like printf, but only do some calculations and return.
Use libtcc an in-memory C compiler from TinyC.
A complete example from here https://github.com/TinyCC/tinycc/blob/mob/tests/libtcc_test.c
/*
* Simple Test program for libtcc
*
* libtcc can be useful to use tcc as a "backend" for a code generator.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "libtcc.h"
/* this function is called by the generated code */
int add(int a, int b)
{
return a + b;
}
/* this strinc is referenced by the generated code */
const char hello[] = "Hello World!";
char my_program[] =
"#include <tcclib.h>\n" /* include the "Simple libc header for TCC" */
"extern int add(int a, int b);\n"
"#ifdef _WIN32\n" /* dynamically linked data needs 'dllimport' */
" __attribute__((dllimport))\n"
"#endif\n"
"extern const char hello[];\n"
"int fib(int n)\n"
"{\n"
" if (n <= 2)\n"
" return 1;\n"
" else\n"
" return fib(n-1) + fib(n-2);\n"
"}\n"
"\n"
"int foo(int n)\n"
"{\n"
" printf(\"%s\\n\", hello);\n"
" printf(\"fib(%d) = %d\\n\", n, fib(n));\n"
" printf(\"add(%d, %d) = %d\\n\", n, 2 * n, add(n, 2 * n));\n"
" return 0;\n"
"}\n";
int main(int argc, char **argv)
{
TCCState *s;
int i;
int (*func)(int);
s = tcc_new();
if (!s) {
fprintf(stderr, "Could not create tcc state\n");
exit(1);
}
/* if tcclib.h and libtcc1.a are not installed, where can we find them */
for (i = 1; i < argc; ++i) {
char *a = argv[i];
if (a[0] == '-') {
if (a[1] == 'B')
tcc_set_lib_path(s, a+2);
else if (a[1] == 'I')
tcc_add_include_path(s, a+2);
else if (a[1] == 'L')
tcc_add_library_path(s, a+2);
}
}
/* MUST BE CALLED before any compilation */
tcc_set_output_type(s, TCC_OUTPUT_MEMORY);
if (tcc_compile_string(s, my_program) == -1)
return 1;
/* as a test, we add symbols that the compiled program can use.
You may also open a dll with tcc_add_dll() and use symbols from that */
tcc_add_symbol(s, "add", add);
tcc_add_symbol(s, "hello", hello);
/* relocate the code */
if (tcc_relocate(s, TCC_RELOCATE_AUTO) < 0)
return 1;
/* get entry symbol */
func = tcc_get_symbol(s, "foo");
if (!func)
return 1;
/* run the code */
func(32);
/* delete the state */
tcc_delete(s);
return 0;
}
I am implementing a circular global memory to enable all threads read/write data to the same buffer simultaneously. It is a very simple producer/consumer algorithm in cpu. But i found something wrong in my cuda code.
The circular buffer was defined as follows:
#define BLOCK_NUM 1024
#define THREAD_NUM 64
#define BUFFER_SIZE BLOCK_NUM*THREAD_NUM*10
struct Stack {
bool bDirty[BUFFER_SIZE];
unsigned int index;
unsigned int iStackSize;
}
The read device is implemented as
__device__ void read(Stack *pStack) {
unsigned int index = atomicDec(&pStack->index, BUFFER_SIZE-1);
if(- -index >= BUFFER_SIZE)
index = BUFFER_SIZE - 1;
// check
if(pStack->bDirty[index] == false) {
printf(“no data\n”);
return;
}
//set read flag
pStack->bDirty[index] = false;
atomicSub(&pStack->iStackSize, 1);
}
The write device function is:
__device__ void write(Stack *pStack) {
unsigned int index = atomicInc(&pStack->index, BUFFER_SIZE - 1);
//check
if(pStack->bDirty[index] == true) {
printf(“why dirty\n”);
return;
}
pStack->bDirty[index] = true;
atomicAdd(&pStack->iStackSize, 1);
}
In order to test the read/write function in a more robust way, I write the following kernels:
__global__ void kernelWrite(Stack *pStack) {
if(threadIdx.x != 0) //make write less than thread number for testing purpose
write(pStack);
}
__global__ void kernelRead(Stack *pStack) {
read(pStack);
__syncthreads();
if(threadIdx.x % 3 != 0) // make write less than read
write(pStack);
__syncthreads();
}
In the main function, I used a dead loop to test if the read/write is atomic.
int main() {
Stack *pHostStack = (Stack*)malloc(sizeof(Stack));
Stack *pStack;
cudaMalloc(&pStack, sizeof(Stack));
cudaMemset(pStack, 0, sizeof(Stack));
while(true) { //dead loop
kernelWrite<<<BLOCK_NUM, THREAD_NUM>>>(pStack);
cudaDeviceSynchonize();
cudaMemcpy(pHostStack, pStack, sizeof(Stack), cudaMemcpyDeviceToHost);
while(pHost->iStackSize >= BLOCK_NUM*THREAD_NUM) {
kernelRead<<<BLOCK_NUM, THREAD_NUM>>>(pStack);
cudaDeviceSynchonize();
cudaMemcpy(pHostStack, pStack, sizeof(Stack), cudaMemcpyDeviceToHost);
}
return 0;
}
When I execute the above code, I got error msg “why dirty” and “no data”. What is wrong to the read/write logic?
By the way, I do not map the thread ID to the linear buffer address because in my application maybe only 10% threads write to the buffer, it is unpredictable/random.
The key problem is that the atomic operation is not real atomic because of reading and writing to the same buffer. The weird thing is that when the total thread number is less then 4096, no error message will be shown.
I really do not understand why the output for the below code is not a and b.
#include<cutil.h>
#include<iostream>
__global__ void p(unsigned char **a){
unsigned char temp[2];
temp[0] = 'a';
temp[1] = 'b';
a[0] = temp;
}
void main(){
unsigned char **a ;
cudaMalloc((void**)&a, sizeof(unsigned char*));
p<<<1,1>>>(a);
unsigned char **c;
unsigned char b[2];
cudaMemcpy(c, a, sizeof(unsigned char *), cudaMemcpyDeviceToHost);
cudaMemcpy(b, c[0], 2*sizeof(unsigned char), cudaMemcpyDeviceToHost);
for( int i=0 ; i < 2; i++){
printf("%c\n", b[i]);
}
getchar();
}
what is wrong with my logic?
Let's leave out CUDA for now. Let's just make a function that writes data to a user-provided array. The user passes the array via a pointer:
void fill_me_up(int * dst)
{
// We sure hope that `dst` points to a large enough area of memory!
dst[0] = 28;
dst[1] = 75;
}
Now, what you're doing with the local variable doesn't make sense, because you want to use the address of a local variable, which becomes invalid after you leave the function scope. The next best thing you could do is memcpy(), or some equivalent C++ algorithm:
void fill_me_up_again(int * dst)
{
int temp[] = { 28, 75 };
memcpy((void *)dst, (const void *)temp, sizeof(temp));
}
OK, now on to calling that function: We first must provide the target memory, and then pass a pointer:
int main()
{
int my_memory[2]; // here's our memory -- automatic local storage
fill_me_up(my_memory); // OK, array decays to pointer-to-beginning
fill_me_up(&my_memory[0]); // A bit more explicit
int * your_memory = malloc(sizeof(int) * 2); // more memory, this time dynamic
fill_me_up_again(your_memory);
/* ... */
free(your_memory);
}
(In C++ you would probably have uses new int[2] and delete your_memory instead, but by using C malloc() the connection to CUDA hopefully becomes clear.)
When you're moving fill_me_up to the CUDA device, you have to give it a device pointer rather than a host pointer, so you have to set that one up first and afterwards copy the results back out, but that's about the only change.
I'm trying to set my simulation params in constant memory but without luck (CUDA.NET).
cudaMemcpyToSymbol function returns cudaErrorInvalidSymbol. The first parameter in cudaMemcpyToSymbol is string... Is it symbol name? actualy I don't understand how it could be resolved. Any help appreciated.
//init, load .cubin
float[] arr = new float[1];
arr[0] = 0.0f;
int size = Marshal.SizeOf(arr[0]) * arr.Length;
IntPtr ptr = Marshal.AllocHGlobal(size);
Marshal.Copy(arr, 0, ptr, arr.Length);
var error = CUDARuntime.cudaMemcpyToSymbol("param", ptr, 4, 0, cudaMemcpyKind.cudaMemcpyHostToDevice);
my .cu file contain
__constant__ float param;
Working solution
cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "name.cubin"));
simParams = cuda.GetModuleGlobal("params");
float[] parameters = new float[N]{...}
cuda.CopyHostToDevice<float>(simParams, parameters);
Unfortunately the __ constant __ must be in the same file scope as the memcpy to the symbol, and in your case your __ constant __ is in a separate .cu file.
The simple way around this is to provide a wrapper function in your .cu file, for example:
__constant__ float param;
// Host function to set the constant
void setParam(float value)
{
cudaMemcpyToSymbol("param", ptr, 4, 0, cudaMemcpyHostToDevice);
}
// etc.
__global__ void ...
If this question is actual you can use cuModuleGetGlobal and next cudaMemcpy like this:
private bool setValueToSymbol(CUmodule module, string symbol, int value)
{
CUdeviceptr devPtr = new CUdeviceptr();
uint lenBytes = 0;
CUResult result = CUDADriver.cuModuleGetGlobal(ref devPtr, ref lenBytes, module, symbol);
if (result == CUResult.Success)
{
int[] src = new int[] { value };
cudaError error = CUDARuntime.cudaMemcpy(devPtr, src, lenBytes, cudaMemcpyKind.cudaMemcpyHostToDevice);
if (error == cudaError.cudaSuccess)
return true;
else
return false;
}
else
{
return false;
}
}
where CUmodule module = cuda.LoadModule("MyCode.cubin");
This code works with NVIDIA GPU Computing SDK 3.1 and CUDA.NET 3.0.
constant memory has implicit local scope linkage.
make sure declaration is in the same file where you use it. it sounds like you have two files.
may also have to declare param to array (or maybe not)