Thrust: selectively copy based on another vector - CUDA

I'd like to use a set of Thrust operations to selectively copy the elements of one vector A into a new vector C, based on a predicate applied to the elements of a second vector B.
Here's an example case: I want to copy an element (in order) from A to C when the corresponding element in B is 1, and skip it when that element is 0. I want |C| < |A| if there are 0s in B. We can pre-determine the size of C by a reduction on B. E.g.:
A = [2, 3, 6, 0, 11]
B = [1, 0, 1, 1, 0]
C = [2, 6, 0]
Any help is greatly appreciated.

This algorithm is known as stream compaction. It is implemented in thrust::copy_if.
The following example is taken from the Thrust documentation.
#include <thrust/copy.h>
...
struct is_even
{
    __host__ __device__
    bool operator()(const int x)
    {
        return (x % 2) == 0;
    }
};
...
const int N = 6;
int data[N]    = { 0, 1, 2, 3, 4, 5};
int stencil[N] = {-2, 0, -1, 0, 1, 2};
int result[4];
thrust::copy_if(data, data + N, stencil, result, is_even());
// data remains    = { 0, 1, 2, 3, 4, 5};
// stencil remains = {-2, 0, -1, 0, 1, 2};
// result is now     { 0, 1, 3, 5}

Although Abator has already given the right function to use, let me try a complete example.
//~~~START:Wed, 06-Oct-2021, 21:41:22 IST
//~~~Author:Rajesh Pandian M | mrprajesh.co.in
//~~CompiledWith: nvcc a.cu -std=c++14 --expt-extended-lambda
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/for_each.h>
#include <stdio.h>

int main(void) {
    const int N = 5;
    int A[] = {2, 3, 6, 0, 11}; // Data
    int B[] = {1, 0, 1, 1, 0 }; // Stencil
    thrust::device_vector<int> dA(A, A + N);
    thrust::device_vector<int> dB(B, B + N);
    // Allocate memory based on the number of 1s in the stencil
    thrust::device_vector<int> dC(thrust::count(B, B + N, 1));
    // Condition on the stencil elements: copy when a 1 is seen, skip otherwise
    thrust::copy_if( dA.begin()
                    ,dA.end()
                    ,dB.begin()
                    ,dC.begin()
                    ,[] __host__ __device__ (const int& x){
                        return 1 == x;
                    });
    // Print the result from the device
    thrust::for_each(dC.begin(), dC.end(),
        [] __host__ __device__ (const int& x){ printf("%d ", x); });
    return 0;
}
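If you'd rather print (or further process) the compacted result on the host instead of via the device-side printf, a copy back works too. A minimal sketch, reusing dC from above:
thrust::host_vector<int> hC = dC;            // device -> host transfer
for (size_t i = 0; i < hC.size(); ++i)
    printf("%d ", hC[i]);                    // prints: 2 6 0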

Related

CUDA cuDNN: cudnnGetConvolutionForwardWorkspaceSize fails with bad parameter

I am currently trying to implement a very basic 2D convolution using CUDA cuDNN between an "image" of size 3x3 and a kernel of size 2x2, resulting in a 2x2 output.
This is my code:
// Create a cuDNN handle:
cudnnHandle_t handle;
cudnnCreate(&handle);
// Create your tensor descriptors:
cudnnTensorDescriptor_t cudnnIdesc;
cudnnFilterDescriptor_t cudnnFdesc;
cudnnTensorDescriptor_t cudnnOdesc;
cudnnConvolutionDescriptor_t cudnnConvDesc;
cudnnCreateTensorDescriptor( &cudnnIdesc );
cudnnCreateFilterDescriptor( &cudnnFdesc );
cudnnCreateTensorDescriptor( &cudnnOdesc );
cudnnCreateConvolutionDescriptor( &cudnnConvDesc );
// Set tensor dimensions as multiples of eight (only the input tensor is shown here):
// W, H, D, C, N
const int dimI[] = { I_M, I_N, 1, 1 };
// Wstride, Hstride, Dstride, Cstride, Nstride
const int strideI[] = { 1, 1, 1, 1 };
checkCUDAError( "SetImgDescriptor failed", cudnnSetTensorNdDescriptor(cudnnIdesc, CUDNN_DATA_HALF, 4, dimI, strideI) );
const int dimF[] = { K_M, K_N, 1, 1 };
checkCUDAError( "SetFilterDescriptor failed", cudnnSetFilterNdDescriptor(cudnnFdesc, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, 4, dimF) );
const int dimO[] = { I_M - K_M + 1, I_N - K_N + 1, 1, 1 };
const int strideO[] = { 1, 1, 1, 1 };
checkCUDAError( "SetOutDescriptor failed", cudnnSetTensorNdDescriptor(cudnnOdesc, CUDNN_DATA_HALF, 4, dimO, strideO) );
checkCUDAError( "SetConvDescriptor failed", cudnnSetConvolution2dDescriptor(cudnnConvDesc, 0, 0, 1, 1, 1, 1, CUDNN_CONVOLUTION, CUDNN_DATA_HALF) );
// Set the math type to allow cuDNN to use Tensor Cores:
checkCUDAError( "SetConvMathType failed", cudnnSetConvolutionMathType(cudnnConvDesc, CUDNN_TENSOR_OP_MATH) );
// Choose a supported algorithm:
int algoCount = 0;
cudnnConvolutionFwdAlgoPerf_t algoPerf;
checkCUDAError( "GetConvForwardAlgo failed", cudnnFindConvolutionForwardAlgorithm(handle, cudnnIdesc, cudnnFdesc, cudnnConvDesc, cudnnOdesc, 1, &algoCount, &algoPerf) );
// Allocate your workspace:
void *workSpace;
size_t workSpaceSize = 0;
checkCUDAError( "WorkspaceSize failed", cudnnGetConvolutionForwardWorkspaceSize(handle, cudnnIdesc, cudnnFdesc, cudnnConvDesc, cudnnOdesc, algoPerf.algo, &workSpaceSize) );
if (workSpaceSize > 0) {
    cudaMalloc(&workSpace, workSpaceSize);
}
However, cudnnGetConvolutionForwardWorkspaceSize fails with CUDNN_STATUS_BAD_PARAM.
According to https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetConvolutionForwardWorkspaceSize
this can only be due to one of the following reasons:
CUDNN_STATUS_BAD_PARAM:
At least one of the following conditions are met:
(1) One of the parameters handle, xDesc, wDesc, convDesc, yDesc is NULL.
(2) The tensor yDesc or wDesc are not of the same dimension as xDesc.
(3) The tensor xDesc, yDesc or wDesc are not of the same data type.
(4) The numbers of feature maps of the tensor xDesc and wDesc differ.
(5) The tensor xDesc has a dimension smaller than 3.
I don't see how any of them are true.
(1) is obviously not the case. Because yDesc, wDesc and xDesc all have 4 dimensions, (2) is also not the case.
Every tensor has the data type CUDNN_DATA_HALF, which is why (3) is also not true.
I don't know exactly what (4) refers to, but I think the number of feature maps for image and kernel is 1 in my case.
And (5) is also not true.
Any idea why the function fails nevertheless?
I solved the error by switching to the 4D descriptor setters and giving the dimensions in NCHW order:
checkCUDAError( "SetImgDescriptor failed", cudnnSetTensor4dDescriptor(cudnnIdesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, 1, 1, I_M, I_N) );
checkCUDAError( "SetFilterDescriptor failed", cudnnSetFilter4dDescriptor(cudnnFdesc, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, 1, 1, K_M, K_N) );
checkCUDAError( "SetOutDescriptor failed", cudnnSetTensor4dDescriptor(cudnnOdesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, 1, 1, I_M - K_M + 1, I_N - K_N + 1) );

function IBuilder::buildEngineWithConfig() returns null

I am using TensorRT to build a small model, as below:
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include <fstream>
#include <map>
#include <chrono>
#include <iostream>
#include "include/Utils.h"
#include <memory>
#include <vector>
#include <cassert>
#include "src/InferDeleter.cpp"
using namespace std;
using namespace nvinfer1;
class MyLogger : public ILogger {
    void log(Severity severity, const char *msg) override {
        if (severity != Severity::kINFO) {
            cout << msg << endl;
        }
    }
} gLogger;
int main() {
    // Load weights
    map<string, Weights> mWeightMap = Utils::getInstance().loadWeights("Weights/mnistapi.wts");
    // A few configuration parameters
    const char *INPUT_BLOB_NAME = "input";
    const char *OUTPUT_BLOB_NAME = "output";
    DataType dataType = nvinfer1::DataType::kFLOAT;
    int INPUT_H = 28, INPUT_W = 28;
    int batchSize = 1;
    // Define the network
    IBuilder *builder = createInferBuilder(gLogger);
    INetworkDefinition *network = builder->createNetworkV2(0U);
    // Create input tensor of shape { 1, 28, 28 } (implicit batch dimension)
    ITensor *data = network->addInput(
        INPUT_BLOB_NAME, DataType::kFLOAT, Dims3{1, INPUT_H, INPUT_W});
    // Create scale layer with default power/shift and specified scale parameter.
    const float scaleParam = 0.0125f;
    const Weights power{DataType::kFLOAT, nullptr, 0};
    const Weights shift{DataType::kFLOAT, nullptr, 0};
    const Weights scale{DataType::kFLOAT, &scaleParam, 1};
    IScaleLayer *scale_1 = network->addScale(*data, ScaleMode::kUNIFORM, shift, scale, power);
    // Add convolution layer with 20 outputs and a 5x5 filter.
    IConvolutionLayer *conv1 = network->addConvolutionNd(
        *scale_1->getOutput(0), 20, Dims{2, {5, 5}, {}}, mWeightMap["conv1filter"], mWeightMap["conv1bias"]);
    conv1->setStride(DimsHW{1, 1});
    // Add max pooling layer with stride of 2x2 and kernel size of 2x2.
    IPoolingLayer *pool1 = network->addPoolingNd(*conv1->getOutput(0), PoolingType::kMAX, Dims{2, {2, 2}, {}});
    pool1->setStride(DimsHW{2, 2});
    // Add second convolution layer with 50 outputs and a 5x5 filter.
    IConvolutionLayer *conv2 = network->addConvolutionNd(
        *pool1->getOutput(0), 50, Dims{2, {5, 5}, {}}, mWeightMap["conv2filter"], mWeightMap["conv2bias"]);
    conv2->setStride(DimsHW{1, 1});
    // Add second max pooling layer with stride of 2x2 and kernel size of 2x2.
    IPoolingLayer *pool2 = network->addPoolingNd(*conv2->getOutput(0), PoolingType::kMAX, Dims{2, {2, 2}, {}});
    pool2->setStride(DimsHW{2, 2});
    // Add fully connected layer with 500 outputs.
    IFullyConnectedLayer *ip1
        = network->addFullyConnected(*pool2->getOutput(0), 500, mWeightMap["ip1filter"], mWeightMap["ip1bias"]);
    // Add activation layer using the ReLU algorithm.
    IActivationLayer *relu1 = network->addActivation(*ip1->getOutput(0), ActivationType::kRELU);
    // Add second fully connected layer with 10 outputs.
    IFullyConnectedLayer *ip2 = network->addFullyConnected(
        *relu1->getOutput(0), 10, mWeightMap["ip2filter"], mWeightMap["ip2bias"]);
    // Add softmax layer to determine the probability.
    ISoftMaxLayer *prob = network->addSoftMax(*ip2->getOutput(0));
    prob->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*prob->getOutput(0));
    // Build the engine
    IBuilderConfig *builderConfig = builder->createBuilderConfig();
    builder->setMaxBatchSize(batchSize);
    builderConfig->setMaxWorkspaceSize(1 << 24);
    // engine is null here
    ICudaEngine *engine = builder->buildEngineWithConfig(*network, *builderConfig);
    // ... later uses of engine.
    return 0;
}
However, the function builder->buildEngineWithConfig(*network, *builderConfig) returns nullptr. I tried changing maxWorkspaceSize to other values, but it still does not work. I also visited this post, but nothing helped. Can anyone point out the cause of the problem? Thank you!
After a few days of mulling over this problem, I found that if the layers in the model do not match the weights passed in, no error appears, but you cannot create a TensorRT engine for later tasks. Therefore, the best thing to do in this situation is to carefully check the network, layer by layer, against the .wts file.
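To make that layer-by-layer check less painful, one option is a small sanity check over the weight map before building. The sketch below is my own addition (checkWeights is a hypothetical helper, not a TensorRT API); the expected element counts are derived from the layer definitions above:
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include "NvInfer.h"

// Hypothetical helper: verify a weight blob exists and holds the element
// count the layer definition implies (nvinfer1::Weights carries a count field).
static bool checkWeights(const std::map<std::string, nvinfer1::Weights> &w,
                         const std::string &name, int64_t expectedCount) {
    auto it = w.find(name);
    if (it == w.end()) {
        std::cout << "missing weight blob: " << name << std::endl;
        return false;
    }
    if (it->second.count != expectedCount) {
        std::cout << name << ": expected " << expectedCount
                  << " values, got " << it->second.count << std::endl;
        return false;
    }
    return true;
}

// Usage, matching the network above:
//   checkWeights(mWeightMap, "conv1filter", 20 * 1 * 5 * 5);
//   checkWeights(mWeightMap, "conv1bias",   20);
//   checkWeights(mWeightMap, "conv2filter", 50 * 20 * 5 * 5);
//   checkWeights(mWeightMap, "conv2bias",   50);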

internal error when trying to perform matrix transpose using cusparseCsr2cscEx2() function of cuSPARSE

I need to perform a transpose of a matrix (CSR) using cuSPARSE, but I get an "internal error". I wrote my code referring to How to transpose a sparse matrix in cuSparse? and https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2. To be clear, I am trying to perform the transpose by converting the matrix from CSR format to CSC format.
I am running on an Nvidia GeForce GTX 1080 with CUDA 11.1.0, on Windows 10.
The following is my code. You can download the folder from https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/sparse2dense and replace sparse2dense_example.c with my code, then configure and build with CMake; that way you should be able to reproduce my problem.
#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h>         // cusparseCsr2cscEx2
#include <stdio.h>            // printf
#include <stdlib.h>           // EXIT_FAILURE

#define CHECK_CUDA(func)                                                   \
{                                                                          \
    cudaError_t status = (func);                                           \
    if (status != cudaSuccess) {                                           \
        printf("CUDA API failed at line %d with error: %s (%d)\n",         \
               __LINE__, cudaGetErrorString(status), status);              \
        return EXIT_FAILURE;                                               \
    }                                                                      \
}

#define CHECK_CUSPARSE(func)                                               \
{                                                                          \
    cusparseStatus_t status = (func);                                      \
    if (status != CUSPARSE_STATUS_SUCCESS) {                               \
        printf("CUSPARSE API failed at line %d with error: %s (%d)\n",     \
               __LINE__, cusparseGetErrorString(status), status);          \
        return EXIT_FAILURE;                                               \
    }                                                                      \
}

int main(void) {
    // cuSPARSE APIs
    cusparseHandle_t handle = NULL;
    cusparseStatus_t status = (cusparseCreate(&handle));
    if (status != CUSPARSE_STATUS_SUCCESS) {
        printf("CUSPARSE API failed at line %d with error: %s (%d)\n",
               __LINE__, cusparseGetErrorString(status), status);
    }
    // Initialize matrix A
    // this matrix is the same as https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/sparse2dense/sparse2dense_example.c
    int num_rows = 5;
    int num_cols = 4;
    int nnz = 11;
    int h_csr_offsets[] = { 0, 3, 4, 7, 9, 11 };
    int h_csr_columns[] = { 0, 2, 3, 1, 0, 2, 3, 1, 3, 1, 2 };
    float h_csr_values[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f,
                             7.0f, 8.0f, 9.0f, 10.0f, 11.0f };
    // Device memory management
    int *d_csr_offsets, *d_csr_columns;
    float *d_csr_values;
    CHECK_CUDA(cudaMalloc((void**)&d_csr_offsets, (num_rows + 1) * sizeof(int)))
    CHECK_CUDA(cudaMalloc((void**)&d_csr_columns, nnz * sizeof(int)))
    CHECK_CUDA(cudaMalloc((void**)&d_csr_values, nnz * sizeof(float)))
    CHECK_CUDA(cudaMemcpy(d_csr_offsets, h_csr_offsets, (num_rows + 1) * sizeof(int), cudaMemcpyHostToDevice))
    CHECK_CUDA(cudaMemcpy(d_csr_columns, h_csr_columns, nnz * sizeof(int), cudaMemcpyHostToDevice))
    CHECK_CUDA(cudaMemcpy(d_csr_values, h_csr_values, nnz * sizeof(float), cudaMemcpyHostToDevice))
    // Memory allocation for the transpose of A
    int *d_csr_offsets_AT, *d_csr_columns_AT;
    float *d_csr_values_AT;
    // First allocate memory for AT
    CHECK_CUDA(cudaMalloc((void**)&d_csr_offsets_AT, (num_cols + 1) * sizeof(int)))
    CHECK_CUDA(cudaMalloc((void**)&d_csr_columns_AT, nnz * sizeof(int)))
    CHECK_CUDA(cudaMalloc((void**)&d_csr_values_AT, nnz * sizeof(float)))
    size_t buffer_temp_size;
    cusparseCsr2cscEx2_bufferSize(
        handle, num_rows, num_cols, nnz, h_csr_values, h_csr_offsets, h_csr_columns,
        d_csr_values_AT, d_csr_offsets_AT, d_csr_columns_AT, CUDA_R_32F, CUSPARSE_ACTION_NUMERIC,
        CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, &buffer_temp_size);
    void *buffer_temp = NULL;
    printf("buffer_temp_size is %zd\n", buffer_temp_size);
    CHECK_CUDA(cudaMalloc(&buffer_temp, buffer_temp_size))
    CHECK_CUSPARSE(cusparseCsr2cscEx2(handle, num_rows, num_cols, nnz, h_csr_values, h_csr_offsets, h_csr_columns,
        d_csr_values_AT, d_csr_offsets_AT, d_csr_columns_AT, CUDA_R_32F, CUSPARSE_ACTION_NUMERIC,
        CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, buffer_temp))
}
The error is due to the fact that you are passing pointers to host data to a routine that expects to work on device data:
cusparseCsr2cscEx2_bufferSize(
handle, num_rows, num_cols, nnz, h_csr_values, h_csr_offsets, h_csr_columns,
^ ^ ^
and
CHECK_CUSPARSE(cusparseCsr2cscEx2(handle, num_rows, num_cols, nnz, h_csr_values, h_csr_offsets, h_csr_columns,
^ ^ ^
When I change those instances to your allocated device data:
d_csr_values, d_csr_offsets, d_csr_columns
the "internal error" that you are asking about goes away, according to my testing.

CUDA Thrust - max vec3

When I want to perform a reduction on an array of float, I usually do the following:
float res = *thrust::max_element(thrust::device,
thrust::device_ptr<float>(dDensities),
thrust::device_ptr<float>(dDensities+numParticles)
);
However, what I would like to do now is pretty much the same thing on an array of vec3 (the glm library type):
float res = *thrust::max_element(thrust::device,
thrust::device_ptr<glm::vec3>(dDensities),
thrust::device_ptr<glm::vec3>(dDensities+numParticles)
);
As you can see, this makes no sense, because the '<' operator is not defined on glm::vec3. But I would like to get the maximum vec3 based on its length:
len = sqrtf(v.x*v.x + v.y*v.y + v.z*v.z);
Is that possible?
Yes, it's possible. You may want to read the thrust quickstart guide if you're not already familiar with it.
If you review the thrust extrema documentation, you'll note that thrust::max_element comes in several different varieties (as do most thrust algorithms). One of these accepts a binary comparison functor. We can define a comparison functor which will do what you want.
Here's a trivial worked example:
$ cat t134.cu
#include <thrust/extrema.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <glm/glm.hpp>
#include <iostream>

struct comp
{
    template <typename T>
    __host__ __device__
    bool operator()(T &t1, T &t2){
        // compare squared lengths; sqrtf is monotonic, so it is not needed for ordering
        return ((t1.x*t1.x+t1.y*t1.y+t1.z*t1.z) < (t2.x*t2.x+t2.y*t2.y+t2.z*t2.z));
    }
};

int main(){
    const int numParticles = 3;
    glm::vec3 d[numParticles];
    d[0].x = 0; d[0].y = 0; d[0].z = 0;
    d[1].x = 2; d[1].y = 2; d[1].z = 2;
    d[2].x = 1; d[2].y = 1; d[2].z = 1;
    glm::vec3 *dDensities;
    cudaMalloc(&dDensities, numParticles*sizeof(glm::vec3));
    cudaMemcpy(dDensities, d, numParticles*sizeof(glm::vec3), cudaMemcpyHostToDevice);
    glm::vec3 res = *thrust::max_element(thrust::device,
                                         thrust::device_ptr<glm::vec3>(dDensities),
                                         thrust::device_ptr<glm::vec3>(dDensities+numParticles),
                                         comp());
    std::cout << "max element x: " << res.x << " y: " << res.y << " z: " << res.z << std::endl;
}
$ nvcc -arch=sm_61 -o t134 t134.cu
$ ./t134
max element x: 2 y: 2 z: 2
$
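If what you actually need is the maximum length itself (a float, as in your original float version) rather than the winning vector, thrust::transform_reduce is an alternative. A sketch under that assumption, reusing the dDensities/numParticles names from the example above:
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <glm/glm.hpp>
#include <cmath>

struct len
{
    __host__ __device__
    float operator()(const glm::vec3 &v) const {
        return sqrtf(v.x*v.x + v.y*v.y + v.z*v.z);
    }
};

// Usage:
// float maxLen = thrust::transform_reduce(thrust::device,
//                    thrust::device_ptr<glm::vec3>(dDensities),
//                    thrust::device_ptr<glm::vec3>(dDensities + numParticles),
//                    len(), 0.0f, thrust::maximum<float>());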

odeint streaming observer and related questions

I have a system of 4 coupled equations to solve and a parameter Gamma[i] to iterate over. Since I am quite new to C++, my code is very rudimentary. If it looks sophisticated and elegant in certain parts, it is only because I have adapted code from the author of odeint. :)
This question is related to (http://stackoverflow.com/questions/12060111/using-odeint-function-definition/12066958#comment16253600_12066958) but not exactly the same. Please do not delete this. :(
Questions have been inserted between the lines of code.
#include <iostream>
#include <iterator>
#include <algorithm>
#include <boost/numeric/odeint.hpp>
#include <cmath>
#include <vector>
#include <fstream>
#include <iomanip>

using namespace std;
using namespace boost::numeric::odeint;

class NLI_class {
private:
    double gamma;
public:
    NLI_class (double r) : gamma(r) {}
    void operator()( vector<double> &u , vector<double> &du , double z ) {
        du[0] = u[0]*u[1]*cos(u[3]);                     //u1
        du[1] = -u[0]*u[0]*cos(u[3]);                    //u2
        du[2] = gamma * (2/(u[0]*u[0]) - 1/(u[1]*u[1])); //theta
        du[3] = gamma * (1.0/(u[0]*u[0]));               //phi1
        du[4] = gamma * (1.0/(u[1]*u[1]));               //phi2
    }
};
Question #1:
In my original program, I had something like this to pipe the output to a csv file:
inline void save(vector<double>& v, string filename)
{
    ofstream output(filename);
    for(size_t i=0; i<v.size(); ++i){
        output << setprecision(64) << v[i] << endl;
    }
}
How do I adapt streaming_observer to do what my save() does? Basically, I want to generate a .csv file for each iteration i. At this point, I am doing it the ugly way, i.e., compiling everything, opening a Windows command prompt, and then piping the exe output to a text file. This generates one big file with all iterations thrown in there.
This becomes very painful to analyze for a large number of iterations.
struct streaming_observer {
    std::ostream &m_out;
    streaming_observer( std::ostream &out ) : m_out( out ) {}
    void operator()( const vector<double> &x , double t ) const
    {
        m_out << t;
        for( size_t i=0 ; i < x.size() ; ++i )
            m_out << "\t" << x[i];
        m_out << "\n";
    }
};
int main(){
    vector<double> x( 5 );
    vector<double> Gamma;
    vector<double> delta;
    const double pi=acos(-1.0);
    short delta_n=5;
    const double delta_step=(2*pi)/delta_n;
    const double dz = 0.01;
    const double zeta = 3.0;
    const double theta_initial=0.0;
    const double u20=tanh(zeta);
    const double u10=sqrt(1.0-(u20*u20));
    double d=0.0;
    double G=0.0;
    for(int i=0;i<=delta_n;i++){
        // When i=0, d=0.0 and G=0.0 are pushed into the vectors.
        delta.push_back(d);
        Gamma.push_back(G);
        // Compute delta and Gamma
        d=d+delta_step;
        G=-u10*u10*u20*sin(theta_initial+d);
    }
    save(delta,"delta.csv");
    save(Gamma,"Gamma.csv");
Question #2:
The results I get here do not agree with those I get using a simple explicit Euler method. Hence, I would like to see the RK4 coefficients (preferably dump them to a file) or the intermediate steps. How can I get this information?
    // Numeric integration
    for (unsigned i = 0; i < Gamma.size(); ++i) {
        x[0] = u10;
        x[1] = u20;
        x[2] = 0.0;
        x[3] = 0.0;
        x[4] = 0.0;
        NLI_class nli_obj(Gamma[i]);
        integrate_const( runge_kutta4< vector<double> >(), nli_obj, x, 0.0, 3.0, dz, streaming_observer( std::cout ) );
    }
}
Thank you for all those who helped!
Edit:
Is there some way to get a running error estimate? Note that u[0]*u[0]+u[1]*u[1]=1 at all times.
Question #1:
I do not understand exactly what kind of output you need, but if you want to write the result after each iteration you can implement an output observer like this:
typedef vector<double> state_type; // assumed typedefs, matching the state used above
typedef double time_type;

struct output_observer
{
    string filename_;
    size_t count_;
    output_observer( const string &filename ) : filename_( filename ) , count_( 0 ) { }
    void operator()( const state_type &x , time_type t )
    {
        char fn[512] = "";
        sprintf( fn , "%s_%04zu.csv" , filename_.c_str() , count_ );
        ofstream fout( fn );
        for( size_t i=0 ; i<x.size() ; ++i ) fout << x[i] << "\n";
        ++count_;
    }
};
You can apply this observer simply by
integrate_const( runge_kutta4< vector<double> >() , nli_obj , x ,
                 0.0 , 3.0 , dz , output_observer( "filename" ) );
Is this the desired functionality?
Question #2:
It is not possible to see the intermediate steps of runge_kutta4. The coefficients are the standard ones for the classical Runge-Kutta method: http://en.wikipedia.org/wiki/Runge%E2%80%93Kutta_methods
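For reference, the classical fourth-order scheme that runge_kutta4 implements is
k1 = f(t, x)
k2 = f(t + dt/2, x + (dt/2) k1)
k3 = f(t + dt/2, x + (dt/2) k2)
k4 = f(t + dt,   x + dt k3)
x(t + dt) = x + (dt/6) (k1 + 2 k2 + 2 k3 + k4)
so any intermediate values you want to inspect would have to be recomputed from these formulas yourself.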
Question #3:
odeint has several error steppers, which estimate the error made during one step. You can use, for example, the Runge-Kutta Cash-Karp algorithm:
runge_kutta_cash_karp54< state_type > rk;
state_type xerr( x.size() );
rk.do_step( nli_obj , x , t , dt , xerr );
which makes ONE step of size dt at time t, estimates the error, and writes the error estimate into xerr.
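Regarding the edit: since u[0]*u[0]+u[1]*u[1]=1 should hold at all times, a cheap running error indicator is to have an observer track the drift of that invariant. A small sketch of my own (not an odeint facility):
struct invariant_observer {
    std::ostream &m_out;
    invariant_observer( std::ostream &out ) : m_out( out ) {}
    void operator()( const vector<double> &u , double z ) const {
        // deviation from the conserved quantity; should stay close to 0
        m_out << z << "\t" << ( u[0]*u[0] + u[1]*u[1] - 1.0 ) << "\n";
    }
};
Passing it as the observer argument of integrate_const prints the drift at every step.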