How do I change only the input signal source of an existing Arduino FFT sketch? - fft

I have a single question:
1. I want to run an FFT on an unstable (fluctuating) input voltage using an Arduino.
How do I adapt the existing Arduino FFT example sketch to do this?
I'm an Arduino beginner, so please give me a detailed answer.
/*
fft_adc_serial.pde
guest openmusiclabs.com 7.7.14
example sketch for testing the fft library.
it takes in data on ADC0 (Analog0) and processes it
with the fft. the data is sent out over the serial
port at 115.2 kbaud.
*/
#define LOG_OUT 1 // use the log output function
#define FFT_N 256 // set to 256 point fft
#include <FFT.h> // include the library
void setup() {
  Serial.begin(115200); // use the serial port
  TIMSK0 = 0; // turn off timer0 for lower jitter
  ADCSRA = 0xe5; // set the adc to free running mode
  ADMUX = 0x40; // use adc0
  DIDR0 = 0x01; // turn off the digital input for adc0
}
void loop() {
  while (1) { // reduces jitter
    cli(); // UDRE interrupt slows this way down on arduino1.0
    for (int i = 0; i < FFT_N * 2; i += 2) { // save FFT_N samples
      while (!(ADCSRA & 0x10)); // wait for adc to be ready
      ADCSRA = 0xf5; // restart adc
      byte m = ADCL; // fetch adc data
      byte j = ADCH;
      int k = (j << 8) | m; // form into an int
      k -= 0x0200; // form into a signed int
      k <<= 6; // form into a 16b signed int
      fft_input[i] = k; // put real data into even bins
      fft_input[i + 1] = 0; // set odd bins (imaginary part) to 0
    }
    fft_window(); // window the data for better frequency response
    fft_reorder(); // reorder the data before doing the fft
    fft_run(); // process the data in the fft
    fft_mag_log(); // take the log-magnitude output of the fft
    sei();
    Serial.println("start"); // send a header
    for (byte i = 0; i < FFT_N / 2; i++) {
      Serial.println(fft_log_out[i]); // send out the magnitude data
    }
  }
}
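To FFT a different input source, the only parts of this sketch you need to change are the ADC channel selection and, if necessary, the analog front end. On an ATmega328-based board (e.g. the Uno), the low four bits of ADMUX select the channel and 0x40 keeps AVcc as the reference. A minimal sketch of the change (hypothetical values for reading A1 instead of A0):
ADMUX = 0x40 | 0x01; // REFS0 = AVcc reference, MUX3..0 = channel 1 (A1)
DIDR0 = 0x02;        // disable the digital input buffer on ADC1
The sampling loop, windowing, FFT and serial output all stay the same. One caveat for an "unstable" voltage: the ADC only accepts 0 V to AVcc, so if your signal swings outside that range (or goes negative), bias it to mid-rail and scale it with a voltage divider before feeding it to the analog pin.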

Related

How can I get the value of pin A0 from the second sketch into the JSON array in the first sketch?

Can anyone help me figure out how to piece these two pieces of code together so I get the result I need? My eyes are crossing from looking at this. I know this is probably a breeze for everyone other than myself, but I am not a programmer and this is just for one small personal project.
So far, after hours and hours of reading and watching every video I could find relating to Arduino, PubNub and sensors, I have sensor readings publishing to PubNub. I created a Freeboard account for visualization and that's all working. The problem is, the data being published is wrong.
Basically, I want to read a battery voltage and publish it to PubNub. I can get the Arduino (Uno R3) to read the voltage, and I can adjust the values in the code to match the actual voltage. The problem I run into is taking that bit of code that works and stuffing it into the JSON array that gets published to PubNub.
If anyone would be willing to help me and maybe explain a little (or not, I'm okay if I just get it working), I would SO appreciate the time, help and effort.
Thanks!
//Each sketch works independently. I need to merge them to get the correct reading published.
//VoltagePubNub.ino
(This is the one that publishes, which is what I want. I just want the published value to be the value of the second sketch.)
#include <SPI.h>
#include <Ethernet.h>
#include <PubNub.h>
#include <aJSON.h>
// Some Ethernet shields have a MAC address printed on a sticker on the shield;
// fill in that address here, or choose your own at random:
const static byte mac[] = { 0xDE, 0xAD, 0xBE, 0xEF, 0xFE, 0xED };
// Memory saving tip: remove myI and dnsI from your sketch if you
// are content to rely on DHCP autoconfiguration.
IPAddress myI(192, 168, 2, 114);
IPAddress dnsI(8, 8, 8, 8);
const static char pubkey[] = "publish_key";
const static char subkey[] = "subscribe_key";
const static char channel[] = "channel_name";
char uuid[] = "UUID";
#define NUM_CHANNELS 1 // How many analog channels do you want to read?
const static uint8_t analog_pins[] = {A0}; // which pins are you reading?
void setup()
{
  Serial.begin(9600);
  Serial.println("Serial set up");
  Ethernet.begin((byte*) mac, myI, dnsI);
  Serial.println("Ethernet set up");
  delay(1000);
  PubNub.begin(pubkey, subkey);
  Serial.println("PubNub set up");
  delay(5000);
}
void loop()
{
  Ethernet.maintain();
  EthernetClient *client;
  // create JSON objects
  aJsonObject *msg, *analogReadings;
  msg = aJson.createObject();
  aJson.addItemToObject(msg, "analogReadings", analogReadings = aJson.createObject());
  // get latest sensor values then add to JSON message
  for (int i = 0; i < NUM_CHANNELS; i++) {
    String analogChannel = String(analog_pins[i]);
    char charBuf[analogChannel.length()+1];
    analogChannel.toCharArray(charBuf, analogChannel.length()+1);
    int analogValues = analogRead(analog_pins[i]);
    aJson.addNumberToObject(analogReadings, charBuf, analogValues);
  }
  // convert JSON object into char array, then delete JSON object
  char *json_String = aJson.print(msg);
  aJson.deleteItem(msg);
  // publish JSON formatted char array to PubNub
  Serial.print("publishing a message: ");
  Serial.println(json_String);
  client = PubNub.publish(channel, json_String);
  if (!client) {
    Serial.println("publishing error");
  } else {
    client->stop(); // only touch the client if the publish succeeded
  }
  free(json_String); // free the printed JSON string in both cases
  delay(5000);
}
//VoltageSensor.ino
(This is the one with the correct value, but no publish feature.)
int analogInput = A0;
float vout = 0.0;
float vin = 0.0;
float R1 = 31000.0; // voltage divider R1 (ohms)
float R2 = 8700.0;  // voltage divider R2 (ohms)
int value = 0;
int volt = 0;
void setup(){
  pinMode(analogInput, INPUT);
  Serial.begin(9600);
  Serial.print("DC VOLTMETER");
  Serial.println("");
}
void loop(){
  // read the value at the analog input
  value = analogRead(analogInput);
  vout = (value * 4.092) / 1024.0;  // ADC counts to volts at the divider tap
  vin = vout / (R2/(R1+R2));        // undo the divider to get the input voltage
  Serial.print("INPUT V= ");
  Serial.println(vin,2);
  delay(2000);
}
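For reference, the conversion in that sketch is just the voltage divider equation: vin = vout / (R2 / (R1 + R2)). A minimal helper capturing it (hypothetical function name; assumes the same 10-bit ADC and the measured 4.092 V full scale used above):
float adcToInputVolts(int raw, float r1, float r2) {
  float vout = (raw * 4.092) / 1024.0; // ADC counts to volts at the divider tap
  return vout / (r2 / (r1 + r2));      // scale back up to the divider's input voltage
}
For example, with R1 = 31000 and R2 = 8700, a raw reading of 512 gives vout ≈ 2.046 V and vin ≈ 9.34 V.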
It may not be the most glamorous or the proper way of doing it, but I got this to do what I need. I edited the first sketch with the following code:
// create JSON objects
aJsonObject *msg, *analogReadings;
msg = aJson.createObject();
aJson.addItemToObject(msg, "analogReadings", analogReadings = aJson.createObject());
// get latest sensor values then add to JSON message
for (int i = 0; i < NUM_CHANNELS; i++) {
  float vout = 0.0;
  float vin = 0.0;
  float R1 = 33060.0; // voltage divider R1 (ohms)
  float R2 = 7600.0;  // voltage divider R2 (ohms)
  int value = 0;
  String analogChannel = String(analog_pins[i]);
  value = analogRead(analog_pins[i]);
  vout = (value * 4.092) / 1024.0;
  vin = vout / (R2/(R1+R2));
  char charBuf[analogChannel.length()+1];
  analogChannel.toCharArray(charBuf, analogChannel.length()+1);
  aJson.addNumberToObject(analogReadings, charBuf, vin); // publish the computed voltage
}
// convert JSON object into char array, then delete JSON object
char *json_String = aJson.print(msg);
aJson.deleteItem(msg);
Now the value is published to PubNub and graphed on Freeboard.io.

RootBeer silently fails for large arrays?

I have a simple application that (for now) simulates error correction in a large array.
This bit generates the data and adds 16 bytes of Reed-Solomon parity to each block of 255 bytes.
ReedSolomonEncoder encoder = new ReedSolomonEncoder(QR_CODE_FIELD_256);
int[][] data = new int[params.getNumBlocks()][255];
int[][] original = new int[params.getNumBlocks()][];
int value = 0;
for (int i = 0; i < params.getNumBlocks(); i++) {
    int[] block = data[i];
    for (int j = 0; j < 239; j++) {
        value = (value + 1) % 256;
        block[j] = value;
    }
    encoder.encode(block, 16);
    original[i] = Arrays.copyOf(block, block.length);
    // Corrupt a byte
    block[50] += 1;
}
This is my kernel:
public class RsKernel implements Kernel {
    private final int[] block;

    public RsKernel(int[] block) {
        this.block = block;
    }

    @Override
    public void gpuMethod() {
        block[50] -= 1;
    }
}
It merely reverts the corrupted byte in each block by hand; it doesn't do actual Reed-Solomon error correction.
I run the kernels with the following code:
ArrayList<Kernel> kernels = new ArrayList<>(params.getNumBlocks());
for (int[] block : data) {
    kernels.add(new RsKernel(block));
}
new Rootbeer().run(kernels);
And I verify decoding with JUnit's assertArrayEquals:
Assert.assertArrayEquals(original, data);
The curious bit is that if I run this code with up to 8192 blocks (kernels), a suspiciously convenient number, the data is reported to have been decoded correctly; for 8193 blocks and above, it is not:
Exception in thread "main" arrays first differed at element [8192][50]; expected:<51> but was:<52>
at org.junit.Assert.internalArrayEquals(Assert.java:437)
at org.junit.Assert.internalArrayEquals(Assert.java:428)
at org.junit.Assert.assertArrayEquals(Assert.java:167)
at org.junit.Assert.assertArrayEquals(Assert.java:184)
at com.amphinicy.blink.rootbeer.RootBeerDemo.main(Jasmin)
What could cause this behaviour?
Here is the output of java -jar rootbeer-1.1.14.jar -printdeviceinfo:
device count: 1
device: GeForce GT 525M
compute_capability: 2.1
total_global_memory: 1073414144 bytes
num_multiprocessors: 2
max_threads_per_multiprocessor: 1536
clock_rate: 1200000 Hz
Looking at the code, I'm thinking it may be because of the following:
// Corrupt a byte
block[50] += 1;
could be adding one to 255, giving 256, which would not be a valid byte value. Corrupting the byte might work better with something like this:
block[50] ^= 0x40;
which flips bit 6 instead of adding, so the corrupted value always stays within the valid 0-255 range.
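To see the range problem concretely (plain C++ arithmetic, mirroring the Java ints above; a standalone illustration, not part of the original code):
#include <cstdio>
int main() {
    int symbol = 255;                           // largest valid GF(2^8) symbol
    printf("255 + 1    = %d\n", symbol + 1);    // 256: no longer fits in a byte
    printf("255 ^ 0x40 = %d\n", symbol ^ 0x40); // 191: still within 0..255
    return 0;
}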

Does calling a CUDA kernel multiple times affect execution speed?

I am trying to measure the performance difference on a GPU between allocating memory using malloc in a kernel function and using pre-allocated storage from cudaMalloc on the host. To do this, I have two kernel functions, one that uses malloc and one that uses a pre-allocated array, and I time the execution of each function repeatedly.
The problem is that the first execution of each kernel function takes between 400 - 2500 microseconds, but all subsequent runs take about 15 - 30 microseconds.
Is this behavior expected, or am I witnessing some sort of carryover effect from previous runs? If this is carryover, what can I do to prevent it?
I have tried putting in a kernel function that zeros out all memory on the GPU between each timed test run to eliminate that carryover, but nothing changed. I have also tried reversing the order in which I run the tests, and that has no effect on relative or absolute execution times.
#include <cuda_runtime.h>

const int TEST_SIZE = 1000;

struct node {
    node* next;
    int data;
};

// forward declarations (definitions follow below); Timer is the poster's
// own utility class and its definition is not shown
int staticTest();
int dynamicTest();
void memClear();

int main() {
    int numTests = 5;
    for (int i = 0; i < numTests; ++i) {
        memClear();
        staticTest();
        memClear();
        dynamicTest();
    }
    return 0;
}
__global__ void staticMalloc(int* sum) {
    // create a local (per-thread) array of nodes
    node head[TEST_SIZE];
    // initialize nodes
    for (int j = 0; j < TEST_SIZE; j++) {
        // assign values
        head[j].next = NULL;
        head[j].data = j;
    }
    // verify creation by adding up values
    int total = 0;
    for (int j = 0; j < TEST_SIZE; j++) {
        total += head[j].data;
    }
    sum[0] = total;
}
/**
 * This is a test that will time execution of static allocation
 */
int staticTest() {
    int expectedValue = 0;
    for (int i = 0; i < TEST_SIZE; ++i) {
        expectedValue += i;
    }
    // host output vector
    int* h_sum = new int[1];
    h_sum[0] = -1;
    // device output vector
    int* d_sum;
    // vector size
    size_t bytes = sizeof(int);
    // allocate memory on device
    cudaMalloc(&d_sum, bytes);
    // only use 1 CUDA thread
    dim3 blocksize(1, 1, 1), gridsize(1, 1, 1);
    Timer runTimer;
    int runTime = 0;
    // check static allocation time
    runTime = 0;
    runTimer.start();
    staticMalloc<<<gridsize, blocksize>>>(d_sum);
    runTime += runTimer.lap();
    h_sum[0] = 0;
    cudaMemcpy(h_sum, d_sum, bytes, cudaMemcpyDeviceToHost);
    cudaFree(d_sum);
    delete[] h_sum; // h_sum was allocated with new[]
    return 0;
}
__global__ void dynamicMalloc(int* sum) {
    // start a linked list
    node* headPtr = (node*) malloc(sizeof(node));
    headPtr->data = 0;
    headPtr->next = NULL;
    node* curPtr = headPtr;
    // add nodes to test device-side malloc
    for (int j = 1; j < TEST_SIZE; j++) {
        // allocate the node & assign values
        node* nodePtr = (node*) malloc(sizeof(node));
        nodePtr->data = j;
        nodePtr->next = NULL;
        // add it to the linked list
        curPtr->next = nodePtr;
        curPtr = nodePtr;
    }
    // verify creation by adding up values
    curPtr = headPtr;
    int total = 0;
    while (curPtr != NULL) {
        // add and increment current value
        total += curPtr->data;
        curPtr = curPtr->next;
        // clean up memory
        free(headPtr);
        headPtr = curPtr;
    }
    sum[0] = total;
}
/**
 * Host function that prepares data array and passes it to the CUDA kernel.
 */
int dynamicTest() {
    // host output vector
    int* h_sum = new int[1];
    h_sum[0] = -1;
    // device output vector
    int* d_sum;
    // vector size
    size_t bytes = sizeof(int);
    // allocate memory on device
    cudaMalloc(&d_sum, bytes);
    // only use 1 CUDA thread
    dim3 blocksize(1, 1, 1), gridsize(1, 1, 1);
    Timer runTimer;
    int runTime = 0;
    // check dynamic allocation time
    runTime = 0;
    runTimer.start();
    dynamicMalloc<<<gridsize, blocksize>>>(d_sum);
    runTime += runTimer.lap();
    h_sum[0] = 0;
    cudaMemcpy(h_sum, d_sum, bytes, cudaMemcpyDeviceToHost);
    cudaFree(d_sum);
    delete[] h_sum; // h_sum was allocated with new[]
    return 0;
}
__global__ void clearMemory(char *zeros) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    zeros[i] = 0;
}

void memClear() {
    char *zeros[1024]; // device pointers
    for (int i = 0; i < 1024; ++i) {
        cudaMalloc((void**) &(zeros[i]), 4 * 1024 * 1024);
        // 4096 blocks of 1024 threads cover the 4 MB buffer
        // (a block may not exceed 1024 threads)
        clearMemory<<<4 * 1024, 1024>>>(zeros[i]);
    }
    for (int i = 0; i < 1024; ++i) {
        cudaFree(zeros[i]);
    }
}
The first execution of a kernel takes more time because you have to load a lot of stuff onto the GPU (the kernel code, libraries, etc.). To prove it, you can measure how long it takes to launch an empty kernel, and you will see that it takes some time. Try something like:
time -> start
launch empty kernel
time -> end
firstTiming = end - start
time -> start
launch empty kernel
time -> end
secondTiming = end - start
You will see that secondTiming is significantly smaller than firstTiming.
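In concrete CUDA code that pseudocode might look like the following (a minimal sketch; emptyKernel is a hypothetical no-op kernel, and host clocks are used only to make the effect visible):
#include <cstdio>
#include <ctime>
#include <cuda_runtime.h>
__global__ void emptyKernel() {}
int main() {
    for (int i = 0; i < 2; ++i) {
        clock_t start = clock();
        emptyKernel<<<1, 1>>>();
        cudaDeviceSynchronize(); // wait until the launch has actually completed
        double ms = 1000.0 * (clock() - start) / CLOCKS_PER_SEC;
        printf("launch %d: %f ms\n", i, ms); // the first launch includes CUDA init
    }
    return 0;
}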
The first CUDA (kernel) call initializes the CUDA system transparently. You can avoid this by calling an empty kernel first. Note that the same thing is required in e.g. OpenCL, but there you have to do all of that init-stuff manually; CUDA does it for you in the background.
Then there are some problems with your timing: CUDA kernel calls are asynchronous. So (assuming your Timer class is a host timer like time()) you are currently measuring the kernel launch time (and, for the first call, the init time of CUDA), not the kernel execution time.
At the very least you HAVE to do a cudaDeviceSynchronize() before starting AND before stopping the timer.
You are better off using CUDA events, which can measure exactly the kernel execution time and only that; using host timers you still include the launch overhead. See https://devblogs.nvidia.com/parallelforall/how-implement-performance-metrics-cuda-cc/
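A minimal sketch of event-based timing (assuming the d_sum buffer and staticMalloc kernel from the question; the warm-up launch absorbs the one-time initialization cost):
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
staticMalloc<<<1, 1>>>(d_sum);   // warm-up: pays the one-time CUDA init cost
cudaDeviceSynchronize();
cudaEventRecord(start);
staticMalloc<<<1, 1>>>(d_sum);   // the launch we actually want to time
cudaEventRecord(stop);
cudaEventSynchronize(stop);      // block until the kernel and the stop event finish
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop); // elapsed GPU time in milliseconds
printf("kernel took %f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);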

MSP430 and RFM22

I need help!
I have a TI MSP-EXP430G2 LaunchPad and an RFM22B, and I need them to communicate with each other, but I have no idea how. After a long time I came up with the code below to send data to the RFM22. I have an oscilloscope connected to the ANT pin of the RFM22, and I can see only noise and no output!
Can anyone tell me what I'm doing wrong (be as blunt as you like)? Maybe someone has example code or a project that could help.
#include <msp430.h>
/*
 * main.c
 *
 //                MSP430G2xx3
 //             -----------------
 //         /|\|              XIN|-
 //          | |                 |
 //          --|RST          XOUT|-
 //            |                 |
 //            |             P1.2|-> Data Out (UCA0SIMO)
 //            |                 |
 //      LED <-|P1.0         P1.3|-> nSel
 //            |                 |
 //            |             P1.4|-> Serial Clock Out (UCA0CLK)
 *
 */
//unsigned int address;
//unsigned char data;
void init(void);
void initRFM(void);
void write(int address, char data);
void txRFM(void);
int main(void) {
    WDTCTL = WDTPW | WDTHOLD; // Stop watchdog timer
    int t;
    for (t = 0; t < 150; t++) { // 150 ms now
        __delay_cycles(1000); // 1000 usec
    }
    init();
    initRFM();
    while (1) {
        txRFM();
    }
    return 0;
}
void txRFM(void) {
    unsigned char i;
    write(0x07, 0x01); // To ready mode
    __delay_cycles(50);
    write(0x08, 0x03); // FIFO reset
    write(0x08, 0x00); // Clear FIFO
    write(0x34, 64); // preamble = 64 nibble
    write(0x3E, 17); // packet length = 17 bytes
    for (i = 0; i < 17; i++)
    {
        write(0x7F, 0xAA); // send payload to the FIFO
    }
    write(0x05, 0x04); // enable packet sent interrupt
    write(0x07, 9); // Start TX
}
void write(int address, char data) {
    P1OUT &= ~BIT3; // assert nSEL: start write
    address |= 0x80; // set MSB to mark this as a write access
    while (!(IFG2 & UCA0TXIFG)); // wait until the TX buffer is free
    UCA0TXBUF = address;
    while (!(IFG2 & UCA0TXIFG)); // wait until the address byte has moved to the shift register
    UCA0TXBUF = data;
    while (UCA0STAT & UCBUSY); // wait until the last byte has fully shifted out
    P1OUT |= BIT3; // deassert nSEL: end write
}
void init(void) {
    P1DIR |= BIT3; // P1.3 nSEL for writing to RFM22
    P1OUT |= BIT3; // no write
    P1SEL |= BIT2 + BIT4; // P1.4 clock out, P1.2 data out (UCA0SIMO)
    P1SEL2 |= BIT2 + BIT4;
    UCA0CTL0 |= UCCKPL + UCMSB + UCMST + UCSYNC; // UCCKPL: inactive high, UCMSB: MSB first, UCMST: master mode, UCSYNC: synchronous mode
    UCA0CTL1 |= UCSSEL_2; // SMCLK
    UCA0BR0 |= 0x02; // SMCLK / 2
    UCA0BR1 = 0;
    UCA0MCTL = 0; // No modulation
    UCA0CTL1 &= ~UCSWRST; // **Initialize USCI state machine**
    IE2 |= UCA0RXIE; // Enable USCI0 RX interrupt
}
void initRFM(void) {
    //write(0x03, 0x00); // Disable all interrupts
    write(0x07, 0x01); // Set READY mode
    write(0x09, 0x7F); // Cap = 12.5pF
    write(0x0A, 0x05); // Clk output is 2MHz
    write(0x0B, 0xF4); // GPIO0 is for RX data output
    write(0x0C, 0xEF); // GPIO1 is TX/RX data CLK output
    write(0x0D, 0x00); // GPIO2 for MCLK output
    write(0x0E, 0x00); // GPIO port use default value
    write(0x0F, 0x70); // No ADC used
    write(0x10, 0x00); // No ADC used
    write(0x12, 0x00); // No temp sensor used
    write(0x13, 0x00); // No temp sensor used
    write(0x70, 0x20); // No Manchester coding, no data whitening, data rate < 30 kbps
    write(0x1C, 0x1D); // IF filter bandwidth
    write(0x1D, 0x40); // AFC loop
    //write(0x1E, 0x0A); // AFC timing
    write(0x20, 0xA1); // clock recovery
    write(0x21, 0x20); // clock recovery
    write(0x22, 0x4E); // clock recovery
    write(0x23, 0xA5); // clock recovery
    write(0x24, 0x00); // clock recovery timing
    write(0x25, 0x0A); // clock recovery timing
    //write(0x2A, 0x18);
    write(0x2C, 0x00);
    write(0x2D, 0x00);
    write(0x2E, 0x00);
    write(0x6E, 0x27); // TX data rate 1
    write(0x6F, 0x52); // TX data rate 0
    write(0x30, 0x8C); // Data access control
    write(0x32, 0xFF); // Header control
    write(0x33, 0x42); // Header 3, 2, 1, 0 used for header length, fixed packet length, sync word length 3, 2
    write(0x34, 64); // 64 nibble = 32 byte preamble
    write(0x35, 0x20); // need to detect a 20-bit preamble
    write(0x36, 0x2D); // synchronization word
    write(0x37, 0xD4);
    write(0x38, 0x00);
    write(0x39, 0x00);
    write(0x3A, 's'); // set TX header 3
    write(0x3B, 'o'); // set TX header 2
    write(0x3C, 'n'); // set TX header 1
    write(0x3D, 'g'); // set TX header 0
    write(0x3E, 17); // set packet length to 17 bytes
    write(0x3F, 's'); // set RX header
    write(0x40, 'o');
    write(0x41, 'n');
    write(0x42, 'g');
    write(0x43, 0xFF); // check all bits
    write(0x44, 0xFF); // check all bits
    write(0x45, 0xFF); // check all bits
    write(0x46, 0xFF); // check all bits
    write(0x56, 0x01);
    write(0x6D, 0x07); // TX power to max
    write(0x79, 0x00); // no frequency hopping
    write(0x7A, 0x00); // no frequency hopping
    write(0x71, 0x22); // GFSK, fd[8]=0, no invert for TX/RX data, FIFO mode, txclk-->gpio
    write(0x72, 0x48); // Frequency deviation setting to 45K=72*625
    write(0x73, 0x00); // No frequency offset
    write(0x74, 0x00); // No frequency offset
    write(0x75, 0x53); // frequency set to 434 MHz
    write(0x76, 0x64); // frequency set to 434 MHz
    write(0x77, 0x00); // frequency set to 434 MHz
    write(0x5A, 0x7F);
    write(0x59, 0x40);
    write(0x58, 0x80);
    write(0x6A, 0x0B);
    write(0x68, 0x04);
    write(0x1F, 0x03);
}
I haven't worked with your specific radio, but I have worked extensively with TI CCxxxx radios connected with various TI dev kits (launchpad included).
I would begin by ensuring that your hardwareInit() routine sets up the SPI peripheral correctly. When I was developing with TI radios, I would do the following:
// Setup CSn line.
P2DIR |= BIT7;
P2OUT |= BIT7;
P2SEL &= ~BIT7;
P2SEL2 &= ~BIT7;
// Setup the USCIB0 peripheral for SPI operation.
UCB0CTL1 |= UCSWRST;
UCB0CTL0 |= (UCMODE_0 | UCCKPH | UCMSB | UCMST | UCSYNC);
UCB0CTL1 |= UCSSEL_2;
UCB0BR1 = 0;
UCB0BR0 = 2;
// Setup SCLK, MISO, and MOSI lines.
P1SEL |= BIT5 | BIT6 | BIT7;
P1SEL2 |= BIT5 | BIT6 | BIT7;
UCB0CTL1 &= ~UCSWRST;
Then I would test a write() function to ensure that I was writing using the peripheral correctly:
void write(unsigned char address, const unsigned char *buffer, unsigned char count)
{
    register volatile unsigned char i; // Buffer iterator

    // Change MISO pin to SPI.
    P1SEL |= BIT6;
    P1SEL2 |= BIT6;
    P2OUT &= ~BIT7; // assert CSn

    // Look for CHIP_RDYn from radio.
    while (P1IN & BIT6);

    // Write the address/command byte.
    UCB0TXBUF = address;

    // Write data byte(s).
    for (i = 0; i < count; i++)
    {
        while (!(IFG2 & UCB0TXIFG));
        UCB0TXBUF = *(buffer + i);
    }

    // Wait for operation to complete.
    while (UCB0STAT & UCBUSY);
    P2OUT |= BIT7; // deassert CSn

    // Change MISO pin to general purpose output (LED use if available).
    P1SEL &= ~BIT6;
    P1SEL2 &= ~BIT6;
}
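A companion read routine under the same assumptions (USCI B0 in SPI mode, CSn on P2.7, CHIP_RDYn readable on the MISO line) might look like this sketch; the function name and structure are illustrative rather than from a specific library:
void read(unsigned char address, unsigned char *buffer, unsigned char count)
{
    unsigned char i; // buffer iterator
    volatile unsigned char dummy;

    P2OUT &= ~BIT7; // assert CSn
    while (P1IN & BIT6); // wait for CHIP_RDYn (MISO low)

    UCB0TXBUF = address; // address/command byte for a read access
    while (!(IFG2 & UCB0RXIFG)); // a byte is clocked in while the address shifts out
    dummy = UCB0RXBUF; // discard it

    for (i = 0; i < count; i++)
    {
        UCB0TXBUF = 0x00; // clock out a dummy byte to receive one back
        while (!(IFG2 & UCB0RXIFG)); // wait until the received byte is ready
        buffer[i] = UCB0RXBUF; // store the byte the radio returned
    }

    P2OUT |= BIT7; // deassert CSn
}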

CUDA kernels not launching before CudaDeviceSynchronize

I am having some trouble with concurrent CUDA. Take a look at the attached image. The kernel is launched at the marked point, at 0.395 seconds. Then there is some green CpuWork. Finally, there is a call to cudaDeviceSynchronize. The kernel that is launched before CpuWork doesn't start until the synchronize call. Ideally, it should run in parallel with the CPU work.
void KdTreeGpu::traceRaysOnGpuAsync(int firstRayIndex, int numRays, int rank, int buffer)
{
    int per_block = 128;
    int num_blocks = numRays / per_block + (numRays % per_block == 0 ? 0 : 1);
    Ray* rays = &this->deviceRayPtr[firstRayIndex];
    int* outputHitPanelIds = &this->deviceHitPanelIdPtr[firstRayIndex];
    kdTreeTraversal<<<num_blocks, per_block, 0>>>(sceneBoundingBox, rays, deviceNodesPtr, deviceTrianglesListPtr,
                                                  firstRayIndex, numRays, rank, rootNodeIndex,
                                                  deviceTHitPtr, outputHitPanelIds, deviceReflectionPtr);
    CUDA_VALIDATE(cudaMemcpyAsync(resultHitDistances[buffer], deviceTHitPtr, numRays * sizeof(double), cudaMemcpyDeviceToHost));
    CUDA_VALIDATE(cudaMemcpyAsync(resultHitPanelIds[buffer], outputHitPanelIds, numRays * sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_VALIDATE(cudaMemcpyAsync(resultReflections[buffer], deviceReflectionPtr, numRays * sizeof(Vector3), cudaMemcpyDeviceToHost));
}
The memcopies are asynchronous. The result buffers are allocated like this:
unsigned int flag = cudaHostAllocPortable;
CUDA_VALIDATE(cudaHostAlloc(&resultHitPanelIds[0], MAX_RAYS_PER_ITERATION*sizeof(int), flag));
CUDA_VALIDATE(cudaHostAlloc(&resultHitPanelIds[1], MAX_RAYS_PER_ITERATION*sizeof(int), flag));
I am hoping for a solution to this. I have tried many things, including not running in the default stream. When I added cudaHostAlloc I noticed that the async calls returned control to the CPU, but that doesn't help when the kernel still does not launch until the cudaDeviceSynchronize call later.
resultHitDistances[2] contains two allocated memory areas, so that while buffer 0 is read by the CPU, the GPU can put the next result into buffer 1.
Thanks!
Edit: This is the code that calls traceRaysAsync.
int numIterations = ceil(float(this->numPrimaryRays) / MAX_RAYS_PER_ITERATION);
int numRaysPrevious = min(MAX_RAYS_PER_ITERATION, this->numPrimaryRays);
nvtxRangePushA("traceRaysOnGpuAsync First");
traceRaysOnGpuAsync(0, numRaysPrevious, rank, 0);
nvtxRangePop();

for (int iteration = 0; iteration < numIterations; iteration++)
{
    int rayFrom = (iteration + 1) * MAX_RAYS_PER_ITERATION;
    int rayTo = min((iteration + 2) * MAX_RAYS_PER_ITERATION, this->numPrimaryRays) - 1;
    int numRaysIteration = rayTo - rayFrom + 1;

    // Wait for results to finish and get them
    waitForGpu();

    // Trace the next iteration asynchronously. This will have data prepared for next iteration
    if (numRaysIteration > 0)
    {
        int nextBuffer = (iteration + 1) % 2;
        nvtxRangePushA("traceRaysOnGpuAsync Interior");
        traceRaysOnGpuAsync(rayFrom, numRaysIteration, rank, nextBuffer);
        nvtxRangePop();
    }

    nvtxRangePushA("CpuWork");
    // Store results for current iteration
    int rayOffset = iteration * MAX_RAYS_PER_ITERATION;
    int buffer = iteration % 2;
    for (int i = 0; i < numRaysPrevious; i++)
    {
        if (this->activeRays[rayOffset + i] && resultHitPanelIds[buffer][i] >= 0)
        {
            this->activeRays[rayOffset + i] = false;
            const TrianglePanelPair & t = this->getTriangle(resultHitPanelIds[buffer][i]);
            double hitT = resultHitDistances[buffer][i];
            Vector3 reflectedDirection = resultReflections[buffer][i];
            Result res = Result(rays[rayOffset + i], hitT, t.panel);
            results[rank].push_back(res);
            t.panel->incrementIntensity(1.0);
            if (t.panel->getParent().absorbtion < 1)
            {
                numberOfRaysGenerated++;
                Ray reflected(res.endPoint() + 0.00001 * reflectedDirection, reflectedDirection);
                this->newRays[rayOffset + i] = reflected;
                this->activeRays[rayOffset + i] = true;
                numNewRays++;
            }
        }
    }
    numRaysPrevious = numRaysIteration;
    nvtxRangePop();
}
This is the expected behavior on Windows with the WDDM driver model, where the driver mitigates kernel launch overhead by batching kernel launches. Try inserting cudaStreamQuery(0) straight after the kernel invocation to trigger early launching of the kernel before the batch is full.
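In this code that means one extra line right after the launch in traceRaysOnGpuAsync (a minimal sketch reusing the call from the question):
kdTreeTraversal<<<num_blocks, per_block, 0>>>(sceneBoundingBox, rays, deviceNodesPtr, deviceTrianglesListPtr,
                                              firstRayIndex, numRays, rank, rootNodeIndex,
                                              deviceTHitPtr, outputHitPanelIds, deviceReflectionPtr);
cudaStreamQuery(0); // nudges the WDDM driver into submitting the pending launch to the GPU immediately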