for_each bind vector of vector resize - function

This is my first question. I gave up and will use a hand rolled functor for this, but I am curious as to how it is supposed to be done. The contrived example below is intended to resize all of the vectors in a vector to be of size 9, by filling them with nulls. The indicated line causes MinGW GCC 4.5.0 to spew a lot of template errors. I've tried several different permutations, but only posted the code that I consider to be "closest to correct" below. How should it be written? Note, I want to retain the two-argument version of resize.
#include <vector>
using std::vector;
#include <algorithm>
using std::for_each;
#include <tr1/functional>
using std::tr1::bind;
using std::tr1::placeholders::_1;
int main() {
vector<vector<void *> > stacked_vector(20);
for_each(stacked_vector.begin(),stacked_vector.end(),
bind(&std::vector<void *>::resize,_1,9,0/*NULL*/)); // voluminous error output
return 0;
}
Thank you very much for your input.

It's hard to say without seeing the error output (and frankly, even with it). However, try passing the NULL as a void* type: static_cast<void*>(0). Otherwise the object returned by bind tries to give an int value as the second parameter to resize.
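For what it's worth, a sketch of the question's loop with just that one change (same headers and stacked_vector as above; not verified against that exact MinGW GCC 4.5.0 setup):
for_each(stacked_vector.begin(), stacked_vector.end(),
    bind(&std::vector<void *>::resize, _1, 9, static_cast<void *>(0))); // pass a real void*, not the int 0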

Try this.
#include <functional>
#include <algorithm>
#include <iostream>
#include <vector>
int main()
{
typedef std::vector<int> vec_int;
typedef std::vector<vec_int> vec_vec_int;
// Do this to make the _1 work
using namespace std::placeholders;
static const int FIRST_DIM = 5;
static const int SECOND_DIM = 10;
static const int DEFAULT_VALUE = 66;
vec_vec_int v(FIRST_DIM);
std::for_each(v.begin(), v.end(),
std::bind(&vec_int::resize, _1, SECOND_DIM, DEFAULT_VALUE));
std::cout << v[4][9];
return (0);
}
If you do not want to specify the default value, you do not need to.
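One caveat, assuming a standard library where resize is provided as two overloads (as in strict C++11) rather than one function with a default argument: &vec_int::resize is then ambiguous, and the member-function pointer passed to std::bind has to be disambiguated with a cast. A sketch of both forms, reusing the names from the example above:
// with an explicit fill value: select the two-parameter overload
std::for_each(v.begin(), v.end(),
    std::bind(static_cast<void (vec_int::*)(vec_int::size_type, const int &)>(&vec_int::resize),
              _1, SECOND_DIM, DEFAULT_VALUE));
// without a fill value: select the one-parameter overload
std::for_each(v.begin(), v.end(),
    std::bind(static_cast<void (vec_int::*)(vec_int::size_type)>(&vec_int::resize),
              _1, SECOND_DIM));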

Related

Need help optimizing thrust cuda code with nested iterator transform_reduce operations

I am working on code I would like to execute efficiently on a GPU. Most of the code has been easy to vectorize and prepare for parallel execution. There are several nice examples on Stack Overflow that have helped me with the standard nested iterators. I have one section I have not been able to successfully condense into an efficient thrust construct. I have taken that section of my code and made a minimum reproducible example. Any advice or hint on how to structure this code would be appreciated.
Thanks
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>
#include <ctime>
#include <thrust/reduce.h>
#include <thrust/device_vector.h>
typedef thrust::device_vector<double> tDoubleVecDevice;
typedef tDoubleVecDevice::iterator tDoubleVecDeviceIter;
struct functorB{
template <typename T>
__host__ __device__
double operator()(const T &my_tuple){ // do some math
return ( fmod((thrust::get<0>(my_tuple) * thrust::get<1>(my_tuple)),1.0) );
}
};
struct functorC {
template <typename T>
__host__ __device__
double operator()(const T &my_tuple){ // do some math
double distance = fabs( fmod((thrust::get<0>(my_tuple) - thrust::get<1>(my_tuple)),1.0));
return((fmin( distance, 1.0 - distance)) / (5.0));
}
};
int main(void)
{
tDoubleVecDevice resF(36);
tDoubleVecDevice freqI(36);
tDoubleVecDevice trialTs(128);
std::srand(std::time(nullptr));
for(tDoubleVecDeviceIter tIter = trialTs.begin();tIter < trialTs.end(); tIter++ ){
(*tIter) = rand() % 10 + 1.5; // make some random numbers
}
for(tDoubleVecDeviceIter rIter = resF.begin(), fIter = freqI.begin(); fIter < freqI.end(); rIter++, fIter++){ // compare fIter against freqI.end(), not resF.end()
(*fIter) = rand() % 10 + 1.5; // make some random numbers
(*rIter) = rand() % 10 + 1.5; // make some random numbers
}
tDoubleVecDevice trialRs(36);
tDoubleVecDevice errorVect(128);
for( tDoubleVecDeviceIter itTrial = trialTs.begin(), itError = errorVect.begin(); itTrial != trialTs.end(); itTrial++,itError++){
thrust::transform( (thrust::make_zip_iterator(thrust::make_tuple(thrust::make_constant_iterator<double>(*itTrial), freqI.begin()))),
(thrust::make_zip_iterator(thrust::make_tuple(thrust::make_constant_iterator<double>(*itTrial)+36, freqI.end()))),
trialRs.begin() ,functorB());
(*itError) =thrust::transform_reduce(
thrust::make_zip_iterator(thrust::make_tuple(trialRs.begin(),resF.begin())),
thrust::make_zip_iterator(thrust::make_tuple(trialRs.end(),resF.end())),
functorC(),(double) 0,thrust::plus<double>()
);
}
// finds the index of the minimum element;
int minElementIndex = thrust::min_element(errorVect.begin(),errorVect.end()) - errorVect.begin();
double result = trialTs[minElementIndex];
std::cout << "result = " << result;
return 0;
}
It looks like you need to expand your trialTs, trialRs, errorVect, freqI and resF vectors to 4608 elements (128 trials of 36 values each). This will allow you to vectorize the loops. Derive a class from thrust::iterator_adaptor to make a cyclic iterator that expands freqI and resF into repeated sequences of the data in those vectors.
After you run your functors, use a reduce-by-key pass to produce one error result for each 36-element trial.
Give that a try and if you get stuck I will provide some additional code.
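To make the two building blocks concrete, here is a minimal sketch. It is not the asker's full computation: the per-element math is replaced by a plain multiplication, and instead of deriving a cyclic iterator from thrust::iterator_adaptor it fakes one with a permutation_iterator driven by a counting_iterator and a modulo functor; reduce_by_key then collapses each 36-element trial into one error value. All vector contents are placeholders.
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
struct mod_n { // flat index 0..4607 -> 0..35, so a 36-element vector repeats once per trial
    int n;
    __host__ __device__ mod_n(int n_) : n(n_) {}
    __host__ __device__ int operator()(int i) const { return i % n; }
};
struct div_n { // flat index -> trial number 0..127, used as the reduction key
    int n;
    __host__ __device__ div_n(int n_) : n(n_) {}
    __host__ __device__ int operator()(int i) const { return i / n; }
};
int main(void)
{
    const int trials = 128, len = 36;
    thrust::device_vector<double> freqI(len, 1.0);          // placeholder data
    thrust::device_vector<double> flat(trials * len, 0.5);  // flattened per-trial intermediate values
    thrust::device_vector<double> errorVect(trials);
    // multiply every flat element by the cyclically repeated freqI value (stand-in for functorB)
    thrust::transform(flat.begin(), flat.end(),
        thrust::make_permutation_iterator(freqI.begin(),
            thrust::make_transform_iterator(thrust::make_counting_iterator(0), mod_n(len))),
        flat.begin(), thrust::multiplies<double>());
    // keys run 0,0,...,0,1,1,...,127: one key per trial, so reduce_by_key emits 128 sums
    thrust::reduce_by_key(
        thrust::make_transform_iterator(thrust::make_counting_iterator(0), div_n(len)),
        thrust::make_transform_iterator(thrust::make_counting_iterator(trials * len), div_n(len)),
        flat.begin(),
        thrust::make_discard_iterator(),  // the keys themselves are not needed
        errorVect.begin());               // one accumulated error per trial
    return 0;
}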

Problems in CUDA function cudaMemcpyToSymbol()

I'm copying data to a specific CUDA symbol. My CUDA version is 10.1 and the GPU is a Tesla K80. I compiled the code in VS2017, generating code for compute_35 and sm_35. When I wrote my code like this,
<.h>
#include <cuda_runtime.h>
__device__ __constant__ float scoreRatio;
<.cpp>
const float ScoreRatio;
cudaErr=cudaMemcpyToSymbol(&scoreRatio,&ScoreRatio,sizeof(ScoreRatio));
printf("%d: %s.\n",cudaErr,cudaGetErorString(cudaErr));
it compiled fine, but I got cudaErrorInvalidSymbol when I ran the program:
13: Invalid device symbol
If I modified my code like this,
<.h>
#include <cuda_runtime.h>
__device__ __constant__ float scoreRatio;
<.cpp>
const float ScoreRatio;
cudaErr=cudaMemcpyToSymbol(scoreRatio,&ScoreRatio,sizeof(ScoreRatio));
then the compilation fails with an incompatible-parameter-type error, because the first argument is a float while the function expects a const void*. Here is the function declaration I found in cuda_runtime_api.h:
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice));
Could anyone please give some advice? Much appreciated.
This:
<.h>
#include <cuda_runtime.h>
__device__ __constant__ float scoreRatio;
<.cpp>
const float ScoreRatio;
cudaErr=cudaMemcpyToSymbol(&scoreRatio,&ScoreRatio,sizeof(ScoreRatio));
printf("%d: %s.\n",cudaErr,cudaGetErorString(cudaErr));
is illegal/wrong in two ways. You must use nvcc to compile the code using a device code aware trajectory, and the first argument of the cudaMemcpyToSymbol call is incorrect. If you simply rename your .cpp source file to have a .cu file extension and change the contents so that it looks like this:
<.cu>
#include <.h>
....
const float ScoreRatio;
cudaErr=cudaMemcpyToSymbol(scoreRatio, &ScoreRatio, sizeof(ScoreRatio));
printf("%d: %s.\n", cudaErr, cudaGetErorString(cudaErr));
it will both compile and run correctly. See here for an explanation of why it is necessary to change the first argument of the cudaMemcpyToSymbol call.
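For reference, a self-contained version of the corrected code (a sketch: the value assigned to ScoreRatio is made up, and everything lives in a single .cu file compiled with nvcc):
// scoreratio.cu -- build with: nvcc scoreratio.cu
#include <cstdio>
#include <cuda_runtime.h>
__device__ __constant__ float scoreRatio;   // the device symbol, as in the question
int main()
{
    const float ScoreRatio = 0.5f;          // illustrative host value
    cudaError_t cudaErr = cudaMemcpyToSymbol(scoreRatio, &ScoreRatio, sizeof(ScoreRatio));
    printf("%d: %s.\n", (int)cudaErr, cudaGetErrorString(cudaErr));
    return 0;
}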

Don't know how to create a function file cpp on visual studio (c++)

I am new to the C++ language and have a problem with how to create a separate file for a function so I can use it. (I know I can create it within a class; I just need to create a .cpp file and a header file for the function so that I can call it from the main function.)
This is my code, which does not compile. I think it is because I did not define the function in its own .cpp file, where it would be compiled into a separate object file and then linked together with main (a guess, based on my limited knowledge).
#include "stdafx.h"
#include <iostream>
#include <string>
#include <conio.h>
using namespace std;
int PassByValue(int a);
int main()
{
int num1 = 1;
int *pNum = new int;
*pNum = 5;
int PassByValue(int a){
string op = "The name of this fuction is PassByValue";
cout << op << endl;
return 0;
}
return 0;
}
How can I use the function without getting compiler errors (C1010 and C2601)?
Can anyone tell me what's wrong with the code and how to fix it?
With my regards
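For reference, a minimal sketch of the usual split (file names are illustrative): the declaration goes in a header, the definition goes in its own .cpp added to the project, and main only includes the header. Error C2601 comes from defining PassByValue inside main, and C1010 means a .cpp in a precompiled-header project does not start with #include "stdafx.h".
// PassByValue.h
#pragma once
int PassByValue(int a);

// PassByValue.cpp
#include "stdafx.h"       // must be the first include in every .cpp when precompiled headers are enabled
#include "PassByValue.h"
#include <iostream>
#include <string>
using namespace std;
int PassByValue(int a)
{
    string op = "The name of this function is PassByValue";
    cout << op << endl;
    return 0;
}

// main source file
#include "stdafx.h"
#include "PassByValue.h"
int main()
{
    PassByValue(1);       // call the function here; it must not be defined inside main
    return 0;
}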

Debugging CUFFTW interface plan creation

I am beginning to port an existing FFTW3 application to make use of the CUDA FFTW library (CUFFTW). The initial stage is to simply replace the fftw3.h header with the cufftw.h header and link the cufft libraries instead of the fftw3 libraries.
That is simple enough, and the code compiles with nvcc. However, when I execute the code, the application is unable to create a plan using fftw_plan_guru_dft (it just returns 0 instead of a valid plan).
Since there are no errors reported, I am at a loss as to how I might debug this issue. cuda-gdb and gdb do not provide any further insight. They simply report
Error: Internal error reported by CUDA debugger API (error=7). The application cannot be further debugged.
UPDATE: So here is the minimum working example. As mentioned in my comment to Talonmies, this code is autogenerated by a scientific differential equation solver. So please excuse the function names etc.
#define real Re
#define imag Im
#include <complex>
#undef real
#undef imag
#include <cufftw.h>
#include <stdio.h>
int main(void) {
int _transform_sizes_index = 1, _loop_sizes_index = 0;
fftw_iodim _transform_sizes[1], _loop_sizes[2];
_transform_sizes[0].n = 128;
_transform_sizes[0].is = 0;
_transform_sizes[0].os = 0;
fftw_complex _data_in[128] = {0.};
static fftw_plan _fftw_forward_plan = NULL;
_fftw_forward_plan = fftw_plan_guru_dft(
_transform_sizes_index, _transform_sizes,
_loop_sizes_index, _loop_sizes,
reinterpret_cast<fftw_complex*>(_data_in),
reinterpret_cast<fftw_complex*>(_data_in),
FFTW_FORWARD, FFTW_PATIENT);
if (!_fftw_forward_plan)
printf("Error: Unable to create forward plan\n");
return 0;
}
Unless anyone else knows what I am doing wrong, it looks like this particular functionality of fftw3 may not be supported by cufftw.
As talonmies pointed out, fftw_plan_guru_dft has only partial support in the cufftw library. The above example will run if you instead make use of the basic-level fftw_plan_dft. More concretely:
#define real Re
#define imag Im
#include <complex>
#undef real
#undef imag
#include <cufftw.h>
#include <stdio.h>
int main(void) {
int _transform_sizes_index = 1, _loop_sizes_index = 0;
int _transform_sizes[1] = {128};
fftw_complex _data_in[128] = {0.};
static fftw_plan _fftw_forward_plan = NULL;
_fftw_forward_plan = fftw_plan_dft(
_transform_sizes_index, _transform_sizes,
reinterpret_cast<fftw_complex*>(_data_in),
reinterpret_cast<fftw_complex*>(_data_in),
FFTW_FORWARD, FFTW_PATIENT);
if (!_fftw_forward_plan)
printf("Error: Unable to create forward plan\n");
return 0;
}

how to use the array in cudaPitchedPtr type data

I have encountered a problem when trying to use the array inside data of type cudaPitchedPtr.
I pass the data from the main function to the __global__ function and print the value. I set the value to 12 with cudaMemset3D, yet the result printed is 0.0000. Attached is my code. I would really appreciate it if someone could help me.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuPrintf.cu"
#include "stdio.h"
__global__ void printtest(double devptr[])
{
printf("%f\n",devptr[1]);
}
int main()
{
int width=191, height=192, depth=192;
cudaExtent extent= make_cudaExtent( width*sizeof(double),height,depth);
cudaPitchedPtr Ex;
cudaMalloc3D(&Ex,extent);
cudaMemset3D(Ex,12 ,extent);
printtest<<<1,1>>>( (double*) Ex.ptr);
}
The problem is that cudaMemset3D is used to set every byte in a range to a value. Note in the description:
value: Value to set for each byte of specified memory
So you are setting every byte in your allocated region to 12 (decimal). Then you're taking 8 of those bytes in a row and attempting to interpret them as a double-precision floating point value. You're going to get results that aren't what you expect.
If you want to see something sensible, then after your cudaMalloc3D, instead of the cudaMemset3D, insert this code:
double myval = 1.3579f; //or whatever value you want to see
double *hostdata;
hostdata = (double *)malloc(width*sizeof(double)* height*depth);
if (hostdata == 0) {printf("malloc fail"); return 1;}
hostdata[1] = myval;
cudaMemcpy3DParms p = {0};
p.srcPtr = make_cudaPitchedPtr(hostdata, width*sizeof(double), width, height);
p.dstPtr = Ex;
p.extent = extent;
p.srcPos = make_cudaPos(0,0,0);
p.dstPos = make_cudaPos(0,0,0);
p.kind=cudaMemcpyHostToDevice;
cudaMemcpy3D(&p);
I'd also recommend using CUDA error checking after every API call and kernel launch in your code.
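For example, one common checking pattern (a sketch of one common approach, not the only way to do it; the usage lines refer to the code above):
#define CUDA_CHECK(call)                                             \
    do {                                                             \
        cudaError_t err = (call);                                    \
        if (err != cudaSuccess) {                                    \
            printf("CUDA error %s at %s:%d\n",                       \
                   cudaGetErrorString(err), __FILE__, __LINE__);     \
            return 1;                                                \
        }                                                            \
    } while (0)
// usage inside main:
// CUDA_CHECK(cudaMalloc3D(&Ex, extent));
// CUDA_CHECK(cudaMemcpy3D(&p));
// printtest<<<1,1>>>((double*)Ex.ptr);
// CUDA_CHECK(cudaGetLastError());        // catches launch/configuration errors
// CUDA_CHECK(cudaDeviceSynchronize());   // catches errors raised while the kernel runs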
You may also be interested in this question/answer.