I have two questions because I'm new with C language (for Linux) and I don't know if these piece of codes can be correctly:
void sigchld_h (int signum);
int main(int argc, char *argv[]){
...;
signal(SIGCHLD,sigchld_h);
...;
}
void sigchld_h (int signum){
pid_t pid;
int status;
while ( (pid = waitpid(-1,&status,WNOHANG)) > 0)
printf("Child %d terminated\n",pid);
}
Capture "ctrl-C" and close socket and kill child processes:
void termina(int sig);
int main(int argc, char *argv[]){
...;
signal(SIGINT,termina);
...;
}
void termina(int sig){
Close(s);
Close(socket_connesso);
return;
}
Thank you very much to all people that will analyze my code. Thanks
Related
I am trying to count the number of times curand_uniform() returns 1.0. However i cant seem to get the following code to work for me:
#include <stdio.h>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
using namespace std;
__global__
void counts(int length, int *sum, curandStatePhilox4_32_10_t* state) {
int tempsum = int(0);
int i = blockIdx.x * blockDim.x + threadIdx.x;
curandStatePhilox4_32_10_t localState = state[i];
for(; i < length; i += blockDim.x * gridDim.x) {
double thisnum = curand_uniform( &localState );
if ( thisnum == 1.0 ){
tempsum += 1;
}
}
atomicAdd(sum, tempsum);
}
__global__
void curand_setup(curandStatePhilox4_32_10_t *state, long seed) {
int id = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, id, 0, &state[id]);
}
int main(int argc, char *argv[]) {
const int N = 1e5;
int* count_h = 0;
int* count_d;
cudaMalloc(&count_d, sizeof(int) );
cudaMemcpy(count_d, count_h, sizeof(int), cudaMemcpyHostToDevice);
int threads_per_block = 64;
int Nblocks = 32*6;
thrust::device_vector<curandStatePhilox4_32_10_t> d_state(Nblocks*threads_per_block);
curand_setup<<<Nblocks, threads_per_block>>>(d_state.data().get(), time(0));
counts<<<Nblocks, threads_per_block>>>(N, count_d, d_state.data().get());
cudaMemcpy(count_h, count_d, sizeof(int), cudaMemcpyDeviceToHost);
cout << count_h << endl;
cudaFree(count_d);
free(count_h);
}
I am getting the terminal error (on
linux):
terminate called after throwing an instance of 'thrust::system::system_error'
what(): parallel_for failed: cudaErrorInvalidValue: invalid argument
Aborted (core dumped)
And i am compiling like this:
nvcc -Xcompiler "-fopenmp" -o test uniform_one_hit_count.cu
I don't understand this error message.
This line:
thrust::device_vector<curandStatePhilox4_32_10_t> d_state(Nblocks*threads_per_block);
is initializing a new vector on the device. When thrust does that, it calls the constructor for the object in use, in this case curandStatePhilox4_32_10, a struct whose definition is in /usr/local/cuda/include/curand_philox4x32_x.h (on linux, anyway). Unfortunately that struct definition doesn't provide any constructors decorated with __device__, and this is causing trouble for thrust.
A simple workaround would be to assemble the vector on the host and copy it to the device:
thrust::host_vector<curandStatePhilox4_32_10_t> h_state(Nblocks*threads_per_block);
thrust::device_vector<curandStatePhilox4_32_10_t> d_state = h_state;
Alternatively, just use cudaMalloc to allocate space:
curandStatePhilox4_32_10_t *d_state;
cudaMalloc(&d_state, (Nblocks*threads_per_block)*sizeof(d_state[0]));
You have at least one other problem as well. This is not actually providing a proper allocation of storage for what the pointer should be pointing to:
int* count_h = 0;
after that, you should do something like:
count_h = (int *)malloc(sizeof(int));
memset(count_h, 0, sizeof(int));
and on your print-out line, you most likely want to do this:
cout << count_h[0] << endl;
The other way to address the count_h issue would be to start with:
int count_h = 0;
and this would necessitate a different set of changes to your code (to the cudaMemcpy operations).
The CUDA C Programming Guide says
__syncthreads() is allowed in conditional code but only if the conditional evaluates identically across the entire thread block, otherwise the code execution is likely to hang or produce unintended side effects.
I tried to make the kernel hang by the following code:
#include <stdio.h>
__global__ void test(int warpSize)
{
int i = threadIdx.x;
if (i < warpSize) {
__syncthreads();
}
else {
__syncthreads();
}
}
int main(int argc,char **argv)
{
int device;
cudaDeviceProp prop;
cudaGetDevice(&device);
cudaGetDeviceProperties(&prop, device);
test<<<1, 2 * prop.warpSize>>>(prop.warpSize);
printf("done");
return 0;
}
But the program exited normally.
To my understanding, there are two barriers in the kernel. The barrier within the if-block will wait for the completion of warp#1, and the barrier within the else-block will wait for the completion of warp#0. Did I misunderstand __syncthreads()? Or __syncthreads() in conditional code always be run even if it's inside an 'inactive' execution path?
According to the comments, the code should be more complicated so that the compiler won't optimize the kernel away. Besides, the CPU thread will not be blocked by some hung kernel if there is no synchronization.
Modified code:
#include <stdio.h>
__global__ void test(int warpSize, int *d_dummy)
{
int i = threadIdx.x;
__shared__ int tmp;
tmp = 0;
__syncthreads();
if (i < warpSize) {
tmp += 1;
__syncthreads();
tmp += 2;
}
else {
tmp -= 3;
__syncthreads();
tmp -= 4;
}
__syncthreads();
d_dummy[0] = tmp;
}
int main(int argc,char **argv)
{
int device;
cudaDeviceProp prop;
cudaGetDevice(&device);
cudaGetDeviceProperties(&prop, device);
int h_dummy[1], *d_dummy;
cudaMalloc(&d_dummy, 1 * sizeof(int));
test<<<1, 2 * prop.warpSize>>>(prop.warpSize, d_dummy);
cudaMemcpy(h_dummy, d_dummy, 1 * sizeof(int), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
printf("done %d", h_dummy[0]);
return 0;
}
However, the behavior of __syncthreads() is undefined when warps within a block are not on the same execution path. So we cannot expect the program to hang.
I want to have an instance of a Container class allocating some device and host memory on initialization. I want to use the allocated memory in device code, without passing the actual pointer (for API reasons).
How do I create a global __device__ pointer to the member pointing to the device memory? I am happy to use thrust if that helps.
Here is a small example:
#include <iostream>
struct Container {
int *h_int = (int*)malloc(4*sizeof(int));
int *d_int;
Container() {
h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
cudaMalloc(&d_int, 4*sizeof(int));
memcpyHostToDevice();
}
void memcpyHostToDevice() {
cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
}
void memcpyDeviceToHost() {
cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
}
};
Container stuff;
__device__ auto d_int = &stuff.d_int; // How do I get that right?
__global__ void edit() { // To keep the API simple I do not want to pass the pointer
auto i = blockIdx.x*blockDim.x + threadIdx.x;
d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}
int main(int argc, char const *argv[]) {
edit<<<4, 1>>>();
stuff.memcpyDeviceToHost();
std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
return 0;
}
There are two problems here:
You can't statically inititalize a __device__ variable in the way you are trying to (and the value you are trying to apply isn't correct either). The CUDA runtime API contains a function for initialising global scope device symbols. Use that instead.
Your global scope declaration of stuff shouldn't work either for a number of subtle reasons discussed here (it is technically undefined behaviour). Declare it at main scope instead.
Putting these two things together should lead your to do something like this instead:
__device__ int* d_int;
// ...
int main(int argc, char const *argv[]) {
Container stuff;
cudaMemcpyToSymbol(d_int, &stuff.dint, sizeof(int*));
edit<<<4, 1>>>();
// ...
Here is a fully worked example:
$ cat t1199.cu
#include <iostream>
struct Container {
int *h_int = (int*)malloc(4*sizeof(int));
int *d_int;
Container() {
h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
cudaMalloc(&d_int, 4*sizeof(int));
memcpyHostToDevice();
}
void memcpyHostToDevice() {
cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
}
void memcpyDeviceToHost() {
cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
}
};
//Container stuff;
__device__ int *d_int; // = &stuff.d_int; // How do I get that right?
__global__ void edit() { // To keep the API simple I do not want to pass the pointer
auto i = blockIdx.x*blockDim.x + threadIdx.x;
d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}
int main(int argc, char const *argv[]) {
Container stuff;
cudaMemcpyToSymbol(d_int, &stuff.d_int, sizeof(int *));
edit<<<4, 1>>>();
stuff.memcpyDeviceToHost();
std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
return 0;
}
$ nvcc -std=c++11 -o t1199 t1199.cu
$ cuda-memcheck ./t1199
========= CUDA-MEMCHECK
1337
========= ERROR SUMMARY: 0 errors
$
I'm trying to test functions returning void and void* using googletest. I'm just a beginner and I have just used EXPECT till now to test the code.
Please let me know how to write test cases for void and void * functions.
A sample code would help a lot. :)
Thanks
Here's an example with your Add function, and also with a throwing Divide function:
#include <stdexcept>
#include "gtest/gtest.h"
int global_sum(0), global_quotient(0);
void Add(int a, int b) {
global_sum = a + b;
}
void Divide(int numerator, int divisor) {
if (divisor == 0)
throw std::logic_error("Can't divide by 0.");
global_quotient = numerator / divisor;
}
TEST(Calculator, Add) {
EXPECT_EQ(0, global_sum);
Add(1, 2);
EXPECT_EQ(3, global_sum);
Add(-1, 1);
EXPECT_EQ(0, global_sum);
}
TEST(Calculator, Divide) {
EXPECT_EQ(0, global_quotient);
EXPECT_NO_THROW(Divide(2, 1));
EXPECT_EQ(2, global_quotient);
EXPECT_THROW(Divide(1, 0), std::logic_error);
EXPECT_EQ(2, global_quotient);
EXPECT_NO_THROW(Divide(1, 2));
EXPECT_EQ(0, global_quotient);
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
I want to run some function using mpi from main but I don't know how it should be. It looks like:
#define MAXSIZE 100
int main (int argc, char **argv) {
int i;
float matrixA[MAXSIZE][MAXSIZE], matrixB[MAXSIZE][MAXSIZE], matrixC[MAXSIZE][MAXSIZE];
for(i=0;i<10;i++){
multiply(matrixA, matrixB, matrixC);
}
}
void multiply(float matrixA[MAXSIZE][MAXSIZE], float matrixB[MAXSIZE][MAXSIZE], float matrixC[MAXSIZE][MAXSIZE]) {
int rank; //process rank
int size; //number of processes
MPI_Init(&argc, &argv); //initialize MPI operations
MPI_Comm_rank(MPI_COMM_WORLD, &rank); //get the rank
MPI_Comm_size(MPI_COMM_WORLD, &size); //get number of processes
...someoperation...
MPI_Finalize();
}
I know how to run basic MPI without using other functions but I need this construction.
In an application instance, MPI can be initialized at most once. So the code structure you provided will not work.
the correct structure for your program is as follows:
#define MAXSIZE 100
int main (int argc, char **argv) {
int i;
float matrixA[MAXSIZE][MAXSIZE], matrixB[MAXSIZE][MAXSIZE], matrixC[MAXSIZE][MAXSIZE];
int rank; //process rank
int size; //number of processes
MPI_Init(&argc, &argv); //initialize MPI operations
MPI_Comm_rank(MPI_COMM_WORLD, &rank); //get the rank
MPI_Comm_size(MPI_COMM_WORLD, &size); //get number of processes
for(i=0;i<10;i++){
multiply(matrixA, matrixB, matrixC);
}
MPI_Finalize();
}
void multiply(float matrixA[MAXSIZE][MAXSIZE], float matrixB[MAXSIZE][MAXSIZE], float matrixC[MAXSIZE][MAXSIZE]) {
...someoperation...
}
This might be helpful to you.In your program, there is a for loop
for(i=0;i<10;i++)
{
multiply(matrixA, matrixB, matrixC);
}
I feel that you are trying to execute multiplication 10 times.You can give each multiplication to one process.So you can use the command, mpirun -np 10 executable.