How to use a function with input - function

I have a book inbound but I really want to do this.
Can you explainhow I need to go about to use the value I input to go through the function? And then print it out.
#include <iostream>
using namespace std;
int addition(int a, int b)
{
int sum;
sum = a + b;
return sum;
}
int main() {
int a;
int b;
cin >> a;
cin >> b;
cout << addition(a,b) << endl;
system("pause");
cin.get();
return 0;
}

Related

CUDA 2nd order recursion with thrust inclusive_scan

I'm trying to understand how to parallelise a recursive calculation. Serially, the calculation takes the form:
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
For the i-1 index there's a solution here to a previous question of mine: CUDA force instruction execution order
I want to modify this to use the i-2 and I can't understand how to apply the same process to a 2nd order calculation. It should be possible using the thrust::inclusive_scan function, but I can't work out how. Does anyone know the solution?
Picking up where we left off in the previous question/answer, we shift our attention to equation 1.11 in the referenced paper by Blelloch. We observe that your problem formulation:
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
seems to match that in equation 1.11 if we set m=2 and in that case we can also observe that for your formulation, all ai,1 are zero (and, as previously, all ai,2 are k).
As per equation 1.12 in that paper, our state variable si now becomes a two-tuple:
si = |xi xi-1|
Taking note of these things, we observe the "correctness" of equation 1.13:
si = |xi-1 xi-2| . |0 1, k 0| + |bi 0|
rewriting:
si,1 = xi = k*xi-2 + bi
si,2 = xi-1 = xi-1
(In my view, the other answer leaves you at this point. That realization, i.e. result.data[0] = right + k * left.data[1]; is sufficient for a serial scan but not for a parallel scan. It's also evident that the functor/scan op there is not associative.)
We now need to come up with a binary operator bop that is an extension of the definition in (1.7) to this case. Referring to the previous definition in equation 1.7, we extend that based on the treatment in 1.13 as follows:
Ci = |Ai , Bi|
where:
Ai = |0 1, k 0|
and:
Bi = |bi 0|
We then have:
Ci bop Cj = | Ai . Aj , Bi . Aj + Bj |
This then becomes the formula for our functor/scan operator. We will need to carry 6 scalar "state" quantities throughout: 2 for the B vector and 4 for the A matrix.
What follows then is a realization of the above:
$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
}
struct scan_op // as per blelloch (1.7)
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 &t1, const T2 &t2){
T1 ret;
thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
return ret;
}
};
typedef float mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){
mt *b1 = new mt[ds]; // b as in blelloch (1.5)
mt *cr = new mt[ds]; // cpu result
for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
cr[0] = b1[0];
cr[1] = b1[1];
cpufunction(cr, b1, ds, k);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
thrust::device_vector<mt> db(b1, b1+ds);
auto b0 = thrust::constant_iterator<mt>(0);
auto a0 = thrust::constant_iterator<mt>(0);
auto a1 = thrust::constant_iterator<mt>(1);
auto a2 = thrust::constant_iterator<mt>(k);
auto a3 = thrust::constant_iterator<mt>(0);
thrust::device_vector<mt> dx1(ds);
thrust::device_vector<mt> dx0(ds);
thrust::device_vector<mt> dy0(ds);
thrust::device_vector<mt> dy1(ds);
thrust::device_vector<mt> dy2(ds);
thrust::device_vector<mt> dy3(ds);
auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(db.begin(), b0, a0, a1, a2, a3));
auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
thrust::host_vector<mt> hx1 = dx1;
thrust::copy_n(hx1.begin(), snip, std::ostream_iterator<mt>(std::cout, ","));
thrust::copy_n(hx1.begin()+ds-snip, snip, std::ostream_iterator<mt>(std::cout, ","));
std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.218,601.275,576.315,607.993,582.947,614.621,589.516,621.699,595.644,628.843,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.621,589.516,621.7,595.644,628.843,
========= ERROR SUMMARY: 0 errors
$
Yes, there are some results above that differ in the 6th digit. I attribute this to the limitations of float resolution when taking into account the very different order of operations between the serial and parallel method. If you change the typedef to double, the results will appear to match exactly.
Since you've asked about it here's an equivalent realization where it is demonstrated using device data previously allocated using cudaMalloc:
$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
}
struct scan_op // as per blelloch (1.7)
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 &t1, const T2 &t2){
T1 ret;
thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
return ret;
}
};
typedef double mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){
mt *b1 = new mt[ds]; // b as in blelloch (1.5)
mt *cr = new mt[ds]; // cpu result
for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
cr[0] = b1[0];
cr[1] = b1[1];
cpufunction(cr, b1, ds, k);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
mt *db;
cudaMalloc(&db, ds*sizeof(db[0]));
cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
auto b0 = thrust::constant_iterator<mt>(0);
auto a0 = thrust::constant_iterator<mt>(0);
auto a1 = thrust::constant_iterator<mt>(1);
auto a2 = thrust::constant_iterator<mt>(k);
auto a3 = thrust::constant_iterator<mt>(0);
thrust::device_vector<mt> dx1(ds);
thrust::device_vector<mt> dx0(ds);
thrust::device_vector<mt> dy0(ds);
thrust::device_vector<mt> dy1(ds);
thrust::device_vector<mt> dy2(ds);
thrust::device_vector<mt> dy3(ds);
auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
cudaMemcpy(cr, thrust::raw_pointer_cast(dx1.data()), ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
========= ERROR SUMMARY: 0 errors
There should be no significant performance difference between these two approaches. (However I happened to switch the typedef to double for this example, so that makes a difference.) Using cudaMalloc as an alternative to the device_vector for the various state vectors (dx0, dx1, dy0, dy1 ...) may be slightly faster, because device_vector first does a cudaMalloc style allocation, then launches a kernel to zero out the allocation. This zero-ing step is unnecessary for the state vectors. The pattern given here should demonstrate how you could do that, if you are interested.
Here's a version that eliminates use of thrust::device_vector and thrust::host_vector altogether:
#include <iostream>
#include <thrust/device_ptr.h>
#include <thrust/scan.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
for (int i = 2; i<size; i++)
{
result[i] = oldArray[i] + k * result[i-2];
}
}
struct scan_op // as per blelloch (1.7)
{
template <typename T1, typename T2>
__host__ __device__
T1 operator()(const T1 &t1, const T2 &t2){
T1 ret;
thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
return ret;
}
};
typedef float mt;
const size_t ds = 32768*4;
const mt k = 1.001;
const int snip = 10;
int main(){
mt *b1 = new mt[ds]; // b as in blelloch (1.5)
mt *cr = new mt[ds]; // result
for (int i = 0; i < ds; i++) { b1[i] = (rand()/(float)RAND_MAX)-0.5;}
cr[0] = b1[0];
cr[1] = b1[1];
cpufunction(cr, b1, ds, k);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
mt *db, *dstate;
cudaMalloc(&db, ds*sizeof(db[0]));
cudaMalloc(&dstate, 6*ds*sizeof(dstate[0]));
cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
auto b0 = thrust::constant_iterator<mt>(0);
auto a0 = thrust::constant_iterator<mt>(0);
auto a1 = thrust::constant_iterator<mt>(1);
auto a2 = thrust::constant_iterator<mt>(k);
auto a3 = thrust::constant_iterator<mt>(0);
thrust::device_ptr<mt> dx1 = thrust::device_pointer_cast(dstate);
thrust::device_ptr<mt> dx0 = thrust::device_pointer_cast(dstate+ds);
thrust::device_ptr<mt> dy0 = thrust::device_pointer_cast(dstate+2*ds);
thrust::device_ptr<mt> dy1 = thrust::device_pointer_cast(dstate+3*ds);
thrust::device_ptr<mt> dy2 = thrust::device_pointer_cast(dstate+4*ds);
thrust::device_ptr<mt> dy3 = thrust::device_pointer_cast(dstate+5*ds);
auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1, dx0, dy0, dy1, dy2, dy3));
thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
cudaMemcpy(cr, dstate, ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
std::cout << std::endl;
}
Here is some cpu code which shows a possible implementation of the formular derived from https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf to express higher-order recurrences as a scan operation.
The key idea is that each element of the scan result is not a scalar, but a vector which contains the n previous scalar results. This way, all the required previous results are available in the scan operator to compute the next result.
#include <iostream>
#include <algorithm>
#include <numeric>
#include <array>
void calculate1(std::vector<int> vec, int k){
std::vector<int> result(vec.size(), 0);
for(int i = 2; i < vec.size(); i++){
result[i] = vec[i] + k * result[i-2];
}
std::cerr << "calculate1 result: ";
for(auto x : result){
std::cerr << x << ", ";
}
std::cerr << "\n";
}
struct S{
//data[0] stores result of last iteration
//data[1] stores result of second last iteration
std::array<int, 2> data;
};
std::ostream& operator<<(std::ostream& os, S s){
os << "(" << s.data[0] << "," << s.data[1] << ")";
}
void calculate2(std::vector<int> vec, int k){
S initvalue{{0,0}};
std::vector<S> result(vec.size(), initvalue);
std::exclusive_scan(
vec.begin() + 2,
vec.end(),
result.begin(),
initvalue,
[k](S left, int right){
S result;
/*A = (
0 1
k 0
)
Compute result = left * A + (right 0)
*/
result.data[0] = right + k * left.data[1];
result.data[1] = left.data[0];
return result;
}
);
std::cerr << "calculate2 result: ";
for(auto x : result){
std::cerr << x << ", ";
}
std::cerr << "\n";
}
int main(){
const int k = 5;
const std::vector<int> vec1{1,3,5,7,9,11,3,6,7,1,2,4};
calculate1(vec1, k);
calculate2(vec1, k);
}
https://godbolt.org/z/cszzn8Ec8
Output:
calculate1 result: 0, 0, 5, 7, 34, 46, 173, 236, 872, 1181, 4362, 5909,
calculate2 result: (0,0), (5,0), (7,5), (34,7), (46,34), (173,46), (236,173), (872,236), (1181,872), (4362,1181), (0,0), (0,0),
There is still an of-by-one error somewhere, but one can get the idea behind it.
I previously said that this approach can be used for a parallel scan in CUDA. This is not correct. For a parallel scan, the scan operator must have an additional property, which is associativity, i.e. (a OP b) OP c == a OP (b OP c). This is not the case in this approach.
Robert Crovella's answer shows how to derive an associative scan operator which can be used for a parallel scan.

How to create a function that outputs the largest number using conditions in c++

/*Description: Write a function called getMax that takes three parameters of type int, and returns the biggest of the
three parameters which is of type int.
*/
#include <iostream>
using namespace std;
// Declare the function getMax and put in three variables.
int getMax(int number, int number2, int number3){
if( number >= number2 && number >= number3){
cout << number;
if(number2 >= number && number2 >= number3)
cout << number2 ;
}
else {
cout << number3;
}
return number, number2, number3;
}
// we now use the function to check for largest values below:
int main(){
cout << getMax(-13, -22, -3) << endl; //Prints -3
cout << getMax(9, 8, 9) << endl; //prints 9
cout << getMax(-5, 4, -7) << endl; //prints 4
cout << getMax(15, 15, 15) << endl; //prints 15
return 0;
}
You can solve it this way:
int getMax(int number, int number2, int number3){
return number > number2
? (number > number3 ? number : number3)
: (number2 > number3 ? number2 : number3);
}
Without ternary operators you can write:
int getMax(int number, int number2, int number3){
if (number > number2) {
if (number > number3) {
return number;
}
return number3;
}
if (number2 > number3) {
return number2;
}
return number3;
}

Should I declare a double array with the GPU block number on the inner or outer dimension?

Should I declare a double array with the GPU block number on the inner or outer dimension?
E.g., should I do
int payload[LEN][BLOCKS];
or
int payload[BLOCKS][LEN];
where LEN is a very large number.
I plan to have each block traverse the double array, holding the block dimension constant and iterating over the LEN dimension.
Assuming you're going to access the data in a block-oriented manner, you want to do the latter. This is presumably because when you load the first element of the "len" dimension, you've already paid the cost for missing in the cache for the subsequent 7ish elements. In the first option, there's probably sharing of cache lines between GPU blocks, but the sharing is relatively limited and not as low level.
Indeed, the below code reports that the second option requires 0.481 seconds to execute, and the first option requires 0.979 seconds. Arranging the data with the block on the outer dimension is about twice as performant.
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <string>
#include <chrono>
#include <iostream>
#define BLOCKS 80
#define LEN (1 << 20)
void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err) {
if (err == cudaSuccess)
return;
std::cerr << statement<<" returned " << cudaGetErrorString(err) << "("<<err<< ") at "<<file<<":"<<line << std::endl;
exit (1);
}
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
struct Data1 {
int payload[LEN][BLOCKS];
};
struct Data2 {
int payload[BLOCKS][LEN];
};
__global__ void f1(Data1 * data1) {
int sum = 0;
for (int i = 0; i < LEN; ++i) {
sum += data1->payload[i][blockIdx.x];
}
printf("block %i has f1 sum %i\n", blockIdx.x, sum);
}
__global__ void f2(Data2 * data2) {
int sum = 0;
for (int i = 0; i < LEN; ++i) {
sum += data2->payload[blockIdx.x][i];
}
printf("block %i has f2 sum %i\n", blockIdx.x, sum);
}
int main() {
Data1 * data1 = (Data1 *) malloc(sizeof(Data1));
Data2 * data2 = (Data2 *) malloc(sizeof(Data2));;
for (int i = 0; i < LEN; ++i) {
for (int b = 0; b < BLOCKS; ++b) {
data1->payload[i][b] = i * b;
data2->payload[b][i] = i * b;
}
}
Data1 * data1_on_gpu;
CUDA_CHECK_RETURN(cudaMalloc(&data1_on_gpu, sizeof(Data1)));
Data2 * data2_on_gpu;
cudaMalloc(&data2_on_gpu, sizeof(Data2));
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
cudaMemcpy(data1_on_gpu, data1, sizeof(Data1), cudaMemcpyHostToDevice);
cudaMemcpy(data2_on_gpu, data2, sizeof(Data1), cudaMemcpyHostToDevice);
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
std::chrono::time_point<std::chrono::system_clock> t1 = std::chrono::system_clock::now();
f1<<<80,1>>>(data1_on_gpu);
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
std::chrono::time_point<std::chrono::system_clock> t2 = std::chrono::system_clock::now();
f2<<<80,1>>>(data2_on_gpu);
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
std::chrono::time_point<std::chrono::system_clock> t3 = std::chrono::system_clock::now();
std::chrono::duration<double> duration_1_to_2 = t2 - t1;
std::chrono::duration<double> duration_2_to_3 = t3 - t2;
duration_1_to_2.count();
printf("timer for 1st took %.3lf\n", duration_1_to_2.count());
printf("timer for 2nd took %.3lf\n", duration_2_to_3.count());
}

float2 cufftcomplex to fftw_complex

I want to using FFTW library on my code. I have cast float2 data type to fftw_complex. But I get:
Segmentation fault
This is my code.
test.cu
typedef float2 cplx;
int DoFFT_Operation( cplx* DatafftOneSlice, float* out, int *dim)
{
cout << "DO CPU FFT RSS Operation" << endl;
int xdim = dim[0];
int ydim = dim[1];
cout << "XDIM " << std::to_string(xdim) << " YDIM " << std::to_string(ydim) << endl;
// int slicedim = dim[2];
int bitdim = 1;
// int sizeOneSlice = xdim*ydim*bitdim;
int sizeOneImage = xdim*ydim;
//FFTW PLAN
fftw_plan pfftw;
pfftw = fftw_plan_dft_1d(sizeOneImage, reinterpret_cast<fftw_complex*>(DatafftOneSlice), reinterpret_cast<fftw_complex*>(DatafftOneSlice), FFTW_BACKWARD, FFTW_ESTIMATE);
fftw_execute(pfftw);
// fft_it(DatafftOneSlice, sizeOneImage);
// cplx* input, float* out, int N, int x, int y, int bit
DoSomething(DatafftOneSlice, out, sizeOneImage, xdim, ydim, bitdim);
fftw_destroy_plan(pfftw);
fftw_cleanup();
return 0;
}
How to casting float2 (CUDA) to fftw_complex?
I have solved my problem,
I am just change fftw_ to fftwf_, because fftw_ has double data type and fftwf_ has float data type.
test.cu
typedef float2 cplx;
int DoFFT_Operation( cplx* DatafftOneSlice, float* out, int *dim)
{
cout << "DO CPU FFT RSS Operation" << endl;
int xdim = dim[0];
int ydim = dim[1];
cout << "XDIM " << std::to_string(xdim) << " YDIM " << std::to_string(ydim) << endl;
// int slicedim = dim[2];
int bitdim = 1;
// int sizeOneSlice = xdim*ydim*bitdim;
int sizeOneImage = xdim*ydim;
//FFTW PLAN
fftwf_plan pfftw;
pfftw = fftwf_plan_dft_1d(sizeOneImage, reinterpret_cast<fftwf_complex*>(DatafftOneSlice), reinterpret_cast<fftwf_complex*>(DatafftOneSlice), FFTW_BACKWARD, FFTW_ESTIMATE);
fftwf_execute(pfftw);
// fft_it(DatafftOneSlice, sizeOneImage);
// cplx* input, float* out, int N, int x, int y, int bit
DoSomething(DatafftOneSlice, out, sizeOneImage, xdim, ydim, bitdim);
fftwf_destroy_plan(pfftw);
fftwf_cleanup();
return 0;
}

Char Function, number to letter grades

I am relatively new to c++ programming and I am struggling with my code. The objective of this code is to take scores input by the user and calculate the mean, the standard deviation and converting it to a letter grade using the calculations under char gradeFunction. When i try to debug this program using visual studios 2013, i am having a couple problems with the the gradefunction. Again i am new to programming so troubleshooting errors is very hard for me and I would appreciate any help or advice! The program looks like this so far.
#include <iostream>
#include <iomanip>
#include <cmath>
#include <string.h>
#include <string>
using namespace std;
void printArray(int Array[], int count);
double average(double scoreTotal, int count);
double stddev(int Array[], int count, double mean);
char gradeFunction(int scores, double stddev, double mean);
int main()
{
int scores[8];
int count;
double scoreTotal = 0;
int standarddev[8];
double mean;
cout << "Enter scores seperated by blanks:" " ";
for (count = 0; count <= 7; count++)
{
cin >> scores[count];
scoreTotal += scores[count];
mean = scoreTotal / 8;
}
cout << endl;
cout << "Grade Scores by Student" << endl;
cout << "Score" "\t" "Grade" << endl;
cout << "----------------------------------" << endl;
printArray(scores, 8);
cout << gradeFunction(scores, stddev, mean);
cout << endl;
cout << "The mean is" " "<< fixed << setprecision(1) << average(scoreTotal, count) << endl;
cout << "The standard deviation is" " " << stddev(scores, count, mean) << endl;
cout << endl;
system("pause");
return 0;
}
void printArray(int Array[], int count)
{
for (int x = 0; x < count; x++)
{
cout << fixed << setprecision(1) << Array[x] << endl;
}
}
char gradeFunction(int scores, double stddev, double mean)
{
char F, D, C, B, A;
if (scores <= (mean - (1.5 * stddev)))
return 'F';
else if (scores <= (mean - (.5 * stddev)))
return 'D';
else if (scores <= (mean + (.5 * stddev)))
return 'C';
else if (scores <= (mean + (1.5 * stddev)))
return 'B';
else return 'A';
}
double average(double scoreTotal, int count)
{
return scoreTotal / count;
}
double stddev(int Array[], int count , double mean)
{
double stddev;
double sum2 = 0;
for (int i = 0; i < count; i++)
{
sum2 += pow((Array[i] - mean), 2);
}
stddev = sqrt(sum2 / (count - 1));
return stddev;
}
The error messages this leaves me with are...
3 IntelliSense: argument of type "double (*)(int *Array, int count, double mean)" is incompatible with parameter of type "double"
Error 1 error C2664: 'char gradeFunction(int [],double,double)' : cannot convert argument 2 from 'double (__cdecl *)(int [],int,double)' to 'double'