I, a complete beginner, was doing a project with ESP8266 where I interfaced with a relay and two sensors. The sensors are working fine but the relay is not getting actuated by the Blynk button when I press it. Below is the code where I got proper output for the sensors but not the relay.
I get all the values of sensors in the Blynk app but not the relay actuation where I connected it to a motor and in the D2 pin. Thanks in advance.:)
#define BLYNK_PRINT Serial
#define BLYNK_TEMPLATE_ID "my_template"
#define BLYNK_DEVICE_NAME "my_device name"
#define BLYNK_AUTH_TOKEN "Auth token"
#include <SPI.h>
#include <ESP8266WiFi.h>
#include <BlynkSimpleEsp8266.h>
#include <DHT.h>
#define BLYNK_PRINT Serial
#include <OneWire.h>
#include <DallasTemperature.h>
#define ONE_WIRE_BUS D2
OneWire oneWire(ONE_WIRE_BUS);
DallasTemperature sensors(&oneWire);
char auth[] = "Authtoken";
char ssid[] = "my_ssid";
char pass[] = "my_pass";
#define sensorPin D3
int sensorState = 0;
int lastState = 0;
#define DHTPIN 2
#define DHTTYPE DHT11
DHT dht(DHTPIN, DHTTYPE);
BlynkTimer timer;
void sendSensor()
{
float h = dht.readHumidity();
float t = dht.readTemperature();
Blynk.virtualWrite(V5, h); //hum
Blynk.virtualWrite(V6, t); //temp
}
void setup()
{
pinMode(D2,OUTPUT); //these two lines are the one which use for actuating the relay
digitalWrite(D2, HIGH);//
Blynk.begin(auth, ssid, pass);
pinMode(sensorPin, INPUT);
dht.begin();
timer.setInterval(1000L, sendSensor);
Blynk.begin(auth, ssid, pass);
sensors.begin();
}
int sensor = 0;
void sendTemps()
{
sensor = analogRead(A0);
sensors.requestTemperatures();
float temp = sensors.getTempCByIndex(0);
Blynk.virtualWrite(V1, temp);
Blynk.virtualWrite(V2, sensor);
delay(1000);
}
void loop()
{
Blynk.run();
timer.run();
sendTemps();
sensorState = digitalRead(sensorPin);
if (sensorState == 1 && lastState == 0) {
lastState = 1;
delay(1000);
}
else if (sensorState == 1 && lastState == 1) {
delay(1000);
}
else {
lastState = 0;
delay(1000);
}
delay(100);
}
I am new to this but I need to emulate RISC-V using qemu. As a start for my fuzzing project, how can I do give qemu an instruction set and get the changes in the registries as an output.
I probably understand your question. Because I don't have a riscv-related environment here, I can only provide a solution.
For example, in riscv, we design a function to get the values of all registers, relying on qemu's plugin module (such as qemu_plugin_register_vcpu_insn_exec_cb()).
plugin_test.c
#include <inttypes.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>
#include <glib.h>
#include <qemu-plugin.h>
QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;
#define CPU_SIZE 32
static int cpu_num;
static int cpu_value[CPU_SIZE]={0};
static void vcpu_insn_exec_before(unsigned int cpu_index, void *)
{
for (size_t i = 0; i < cpu_num; i++)
{
/* code */
for (size_t j = 0; j < CPU_SIZE; i++)
{
if(cpu_value[j] != get_cpu_register(i,j)) {
// The value of cpu has changed
...
} else {
// The value of cpu has not changed
...
}
}
}
}
static void vcpu_tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb *tb)
{
size_t n = qemu_plugin_tb_n_insns(tb);
size_t i;
for (i = 0; i < n; i++) {
struct qemu_plugin_insn *insn = qemu_plugin_tb_get_insn(tb, i);
qemu_plugin_register_vcpu_insn_exec_cb(
insn, vcpu_insn_exec_before, QEMU_PLUGIN_CB_NO_REGS,void *);
}
}
static void plugin_exit(qemu_plugin_id_t id, void *p)
{
}
QEMU_PLUGIN_EXPORT int qemu_plugin_install(qemu_plugin_id_t id,
const qemu_info_t *info,
int argc, char **argv)
{
if(info->system_emulation) {
cpu_num = info->system.smp_vcpus;
} else {
cpu_num = 1;
}
qemu_plugin_register_vcpu_tb_trans_cb(id, vcpu_tb_trans);
qemu_plugin_register_atexit_cb(id, plugin_exit, NULL);
return 0;
}
api-ext.c
void *qemu_get_cpu(int index);
static uint32_t get_cpu_register(unsigned int cpu_index, unsigned int reg) {
uint8_t* cpu = qemu_get_cpu(cpu_index);
return *(uint32_t*)(cpu + 33488 + 5424 + reg * 4);
}
It should be noted that the content in api-ext.c is obtained from others. This is the function used to obtain the value of arm cpu. You need to check the source code or documentation for riscv.
I want to calculate the GPU load. How get gpu load percent in cuda?
enter image description here
http://eliang.blogspot.com.by/2011/05/getting-nvidia-gpu-usage-in-c.html?m=1
//
// Getting Nvidia GPU Usage
//
// Reference: Open Hardware Monitor (http://code.google.com/p/open-hardware-monitor)
//
#include <windows.h>
#include <iostream>
// magic numbers, do not change them
#define NVAPI_MAX_PHYSICAL_GPUS 64
#define NVAPI_MAX_USAGES_PER_GPU 34
// function pointer types
typedef int *(*NvAPI_QueryInterface_t)(unsigned int offset);
typedef int (*NvAPI_Initialize_t)();
typedef int (*NvAPI_EnumPhysicalGPUs_t)(int **handles, int *count);
typedef int (*NvAPI_GPU_GetUsages_t)(int *handle, unsigned int *usages);
int main()
{
HMODULE hmod = LoadLibraryA("nvapi.dll");
if (hmod == NULL)
{
std::cerr << "Couldn't find nvapi.dll" << std::endl;
return 1;
}
// nvapi.dll internal function pointers
NvAPI_QueryInterface_t NvAPI_QueryInterface = NULL;
NvAPI_Initialize_t NvAPI_Initialize = NULL;
NvAPI_EnumPhysicalGPUs_t NvAPI_EnumPhysicalGPUs = NULL;
NvAPI_GPU_GetUsages_t NvAPI_GPU_GetUsages = NULL;
// nvapi_QueryInterface is a function used to retrieve other internal functions in nvapi.dll
NvAPI_QueryInterface = (NvAPI_QueryInterface_t) GetProcAddress(hmod, "nvapi_QueryInterface");
// some useful internal functions that aren't exported by nvapi.dll
NvAPI_Initialize = (NvAPI_Initialize_t) (*NvAPI_QueryInterface)(0x0150E828);
NvAPI_EnumPhysicalGPUs = (NvAPI_EnumPhysicalGPUs_t) (*NvAPI_QueryInterface)(0xE5AC921F);
NvAPI_GPU_GetUsages = (NvAPI_GPU_GetUsages_t) (*NvAPI_QueryInterface)(0x189A1FDF);
if (NvAPI_Initialize == NULL || NvAPI_EnumPhysicalGPUs == NULL ||
NvAPI_EnumPhysicalGPUs == NULL || NvAPI_GPU_GetUsages == NULL)
{
std::cerr << "Couldn't get functions in nvapi.dll" << std::endl;
return 2;
}
// initialize NvAPI library, call it once before calling any other NvAPI functions
(*NvAPI_Initialize)();
int gpuCount = 0;
int *gpuHandles[NVAPI_MAX_PHYSICAL_GPUS] = { NULL };
unsigned int gpuUsages[NVAPI_MAX_USAGES_PER_GPU] = { 0 };
// gpuUsages[0] must be this value, otherwise NvAPI_GPU_GetUsages won't work
gpuUsages[0] = (NVAPI_MAX_USAGES_PER_GPU * 4) | 0x10000;
(*NvAPI_EnumPhysicalGPUs)(gpuHandles, &gpuCount);
// print GPU usage every second
for (int i = 0; i < 100; i++)
{
(*NvAPI_GPU_GetUsages)(gpuHandles[0], gpuUsages);
int usage = gpuUsages[3];
std::cout << "GPU Usage: " << usage << std::endl;
Sleep(1000);
}
return 0;
}
I really tried to implement a function in C to multiply to row-major matrix in cublas. I don't know where I mistaking.
In the function below A, B and C are pointers to an row matrix correctly
allocated.
I'd like to keep the option of translate a matrix before perform the product.
The function below is not working.
void matrixMul(cublasHandle_t handle,float *A,float *B,float *C, int m,int n,int k,int transA,int transB){
cublasStatus_t stat ; // CUBLAS functions status
float alfa = 1;
float beta = 0;
int
ma = transA ? n:m,
na = transA ? m:n,
nb = transB ? k:n,
mb = transB ? n:k;
if(na!=mb){
puts("Something wrong");
}
//(mb,nb)(ma,na) = (mb,na)
stat= cublasSgemm_v2(handle, (cublasOperation_t) transB, (cublasOperation_t)transA,
nb,ma,mb,&alfa,
B,k,
A,n,&beta,
C,m);
switch (stat) {
case CUBLAS_STATUS_SUCCESS:
puts("Sucess");
break;
default:
printf(">>>>ERRO %d<<<<\n",stat);
break;
}
}
The entire source code
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA Samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// CUDA and CUBLAS functions
#include <helper_functions.h>
void getFromDevice(float *h_A,float *d_A,int size){
//printf("Copy input data from the host memory to the CUDA device\n");
cudaError_t err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
//A = (m,n)
//B = (n,k)
//C = (m,k)
void matrixMul(cublasHandle_t handle,float *A,float *B,float *C, int m,int n,int k,int transA,int transB){
cublasStatus_t stat ; // CUBLAS functions status
float alfa = 1;
float beta = 0;
int
ma = transA ? n:m,
na = transA ? m:n,
nb = transB ? k:n,
mb = transB ? n:k;
if(na!=mb){
puts("Something wrong");
}
//(mb,nb)(ma,na) = (mb,na)
stat= cublasSgemm_v2(handle, (cublasOperation_t) transB, (cublasOperation_t)transA,
nb,ma,mb,&alfa,
B,k,
A,n,&beta,
C,m);
switch (stat) {
case CUBLAS_STATUS_SUCCESS:
puts("Sucess");
break;
default:
printf(">>>>ERRO %d<<<<\n",stat);
break;
}
}
float *mallocfDevice(int size){
float *d_C = NULL;
cudaError_t err = cudaMalloc((void **)&d_C, size * sizeof(float));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}else{
size_t freeM, total;
cudaMemGetInfo ( &freeM, &total);
printf("MEM:%.3f\n",freeM,total,100 - ((double)freeM/total)*100 );
}
return d_C;
}
void printHostMatrix(int nl, int nc, float *h_s){
for(int j = 0; j < nl ; j++) {
for(int i = 0; i < (nc) ; i++){
int idx = j*nc + i;
printf("%.2f ", h_s[idx]);
}
printf("\n");
}
}
void printfDeviceMatrix(float *d_s,int m, int p){
float *h_s =(float*) malloc(sizeof(float)*m*p);
getFromDevice(h_s,d_s,sizeof(float)*m*p);
printHostMatrix(m,p,h_s);
free(h_s);
}
void sendTofDevice(float *h_A,float *d_A,int size){
//printf("Copy input data from the host memory to the CUDA device\n");
cudaError_t err = cudaMemcpy(d_A, h_A, size*sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
int main(int argc,char **argv){
int ma = 2,
na = 3,
mb = 3,
nb = 2;
float A[] = { 1,2,3,
4,5,6};
float B[] = {7, 8,
9,10,
11,12};
float *C = new float[ma*nb];
float *d_a = mallocfDevice(ma*mb),
*d_b = mallocfDevice(mb*nb),
*d_c = mallocfDevice(ma*nb);
sendTofDevice(A,d_a,ma*na);
sendTofDevice(B,d_b,mb*nb);
cublasHandle_t handle ; // CUBLAS context
cublasCreate (&handle );
puts("A");
printfDeviceMatrix(d_a,ma,na);
puts("B");
printfDeviceMatrix(d_b,mb,nb);
matrixMul(handle, d_a,d_b,d_c,
ma,na,nb,0,0);
puts("AB=C");
printfDeviceMatrix(d_c,ma,nb);
}
CUBLAS assumes that the matrix in the device is stored in column major:
"
where α and β are scalars, and A , B and C are matrices stored in column-major format with dimensions op ( A ) m × k , op ( B ) k × n and C m × n , respectively. Also, for matrix A
Read more at: http://docs.nvidia.com/cuda/cublas/index.html#ixzz3mSDJTWrM "
That means the matrix needs to be treated as differently on the device than on the host.
I'm trying to figure out the best way to do the following using Thrust: vector A has a million floats, they have some particular order. I want to move to vector B every element x in A for which x>7.0 such that the order of elements is maintain in both vectors A and B. Importantly, only a tiny fraction of elements need be moved. Efficiency is more important for my code than elegance.
My idea was to use thrust::copy_if from A to B and then thrust::remove_if on A. But I don't know the exact number of elements to be copy, and since apparently the memory for B must be allocated in advance, another counting operation is necessary. An inelegant way to skip the counting operation is to pre-allocate "enough" memory for vector B.
Using thrust::remove_copy_if has much the same problems: you need to allocate memory for B in advance, and also it doesn't actually remove anything from A so another thrust::remove_if is required anyway.
Another idea I had was to use thrust::stable_sort with some custom-made comparison functor, to push all elements I want out to the end of A, and then somehow figure out how many there are and thrust::copy them to B. This also looks pretty inelegant...
You're on the right track with thrust::copy_if. Just allocate two more buffers of the same size as the first one. Then copy_if > 7.0f to the first one and copy_if <= 7.0f to the second one. Allocating buffers of the same size as the original buffer is fine as long as you know there's room, and 1 million floats only takes up 4MB.
Edit:
I did a performance comparison of the copy_if and stable_partition approaches. On my card, a GTX660, stable_partition took around 150% as long as copy_if for "split" values of 0.1f, 0.5f and 0.9f. I added tests to ensure that both methods are stable (maintain the order of the values).
#include <cuda.h>
#include <curand.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/partition.h>
#include <iostream>
#include <cassert>
#define CHECK_CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
return EXIT_FAILURE;}} while(0)
#define CHECK_CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
return EXIT_FAILURE;}} while(0)
#define SPLIT 0.1f
struct is_low
{
__host__ __device__ bool operator()(const float x)
{
return x <= SPLIT;
}
};
struct is_high
{
__host__ __device__ bool operator()(const float x)
{
return x > SPLIT;
}
};
class EventTimer {
public:
EventTimer() : mStarted(false), mStopped(false) {
cudaEventCreate(&mStart);
cudaEventCreate(&mStop);
}
~EventTimer() {
cudaEventDestroy(mStart);
cudaEventDestroy(mStop);
}
void start(cudaStream_t s = 0) {
cudaEventRecord(mStart, s);
mStarted = true;
mStopped = false;
}
void stop(cudaStream_t s = 0) {
assert(mStarted);
cudaEventRecord(mStop, s);
mStarted = false;
mStopped = true;
}
float elapsed() {
assert(mStopped);
if (!mStopped) return 0;
cudaEventSynchronize(mStop);
float elapsed = 0;
cudaEventElapsedTime(&elapsed, mStart, mStop);
return elapsed;
}
private:
bool mStarted, mStopped;
cudaEvent_t mStart, mStop;
};
int main(int argc, char *argv[])
{
const size_t n = 1024 * 1024 * 50;
// Create prng
curandGenerator_t gen;
CHECK_CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
// Set seed
CHECK_CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
// Generate n floats on device
thrust::device_vector<float> vec_rnd_d(n);
float* ptr_rnd_d = thrust::raw_pointer_cast(vec_rnd_d.data());
CHECK_CURAND_CALL(curandGenerateUniform(gen, ptr_rnd_d, n));
thrust::device_vector<float> vec_low_d(n);
thrust::device_vector<float> vec_high_d(n);
for (int i = 0; i < 5; ++i) {
EventTimer timer;
timer.start();
thrust::device_vector<float>::iterator iter_end;
iter_end = thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_low_d.begin(), is_low());
thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_high_d.begin(), is_high());
timer.stop();
std::cout << "copy_if: " << timer.elapsed() << "ms" << std::endl;
// check result
thrust::host_vector<float> vec_rnd_h = vec_rnd_d;
thrust::host_vector<float> vec_low_h = vec_low_d;
thrust::host_vector<float> vec_high_h = vec_high_d;
thrust::host_vector<float>::iterator low_iter_h = vec_low_h.begin();
thrust::host_vector<float>::iterator high_iter_h = vec_high_h.begin();
for (thrust::host_vector<float>::iterator rnd_iter_h = vec_rnd_h.begin();
rnd_iter_h != vec_rnd_h.end(); ++rnd_iter_h) {
if (*rnd_iter_h <= SPLIT) {
assert(*low_iter_h == *rnd_iter_h);
++low_iter_h;
}
else {
assert(*high_iter_h == *rnd_iter_h);
++high_iter_h;
}
}
}
for (int i = 0; i < 5; ++i) {
thrust::device_vector<float> vec_rnd_copy = vec_rnd_d;
EventTimer timer;
timer.start();
thrust::device_vector<float>::iterator iter_split =
thrust::stable_partition(vec_rnd_copy.begin(), vec_rnd_copy.end(), is_low());
timer.stop();
size_t n_low = iter_split - vec_rnd_copy.begin();
std::cout << "stable_partition: " << timer.elapsed() << "ms" << std::endl;
// check result
thrust::host_vector<float> vec_rnd_h = vec_rnd_d;
thrust::host_vector<float> vec_partitioned_h = vec_rnd_copy;
thrust::host_vector<float>::iterator low_iter_h = vec_partitioned_h.begin();
thrust::host_vector<float>::iterator high_iter_h = vec_partitioned_h.begin() + n_low;
for (thrust::host_vector<float>::iterator rnd_iter_h = vec_rnd_h.begin();
rnd_iter_h != vec_rnd_h.end(); ++rnd_iter_h) {
if (*rnd_iter_h <= SPLIT) {
assert(*low_iter_h == *rnd_iter_h);
++low_iter_h;
}
else {
assert(*high_iter_h == *rnd_iter_h);
++high_iter_h;
}
}
}
CHECK_CURAND_CALL(curandDestroyGenerator(gen));
return EXIT_SUCCESS;
}
Output:
C:\rd\projects\cpp\test_cuda\Release>test_cuda.exe
copy_if: 40.2919ms
copy_if: 38.0157ms
copy_if: 38.5036ms
copy_if: 37.6751ms
copy_if: 38.1054ms
stable_partition: 59.5473ms
stable_partition: 61.4016ms
stable_partition: 59.1854ms
stable_partition: 61.3195ms
stable_partition: 59.1205ms
To answer my own question, I finally found thrust::stable_partition, which is more efficient and elegant than all "copy_if"-alternatives. It just moves all elements that fail to satisfy a predicate to the end of the array and returns the start of the second sequence. Pointer arithmetic gives the size of B, but in fact it's not necessary anymore:
thrust::device_vector<float>::iterator iter = thrust::stable_partition(A.begin(), A.end(), pred)
thrust::device_vector<float> B(iter, A.end())
A.erase(iter, A.end());