Bezier blending function

Am I calculating the Bezier blend wrong? Any help would be appreciated.
Thank you very much.
double bezierBlend(int i, double u, int m) {
    double blend = 1;
    blend = factorial(m) * pow(u, i) * pow(1 - u, (m - i)) / (factorial(i) * factorial(m - i));
    return blend;
}

Here's a sample to compute the Bezier blend function, following directly from the formulation:
double choose( long n, long k )
{
    long j;
    double a;

    a = 1;
    for (j = k + 1; j <= n; j++)
        a *= j;
    for (j = 1; j <= n - k; j++)
        a /= j;
    return a;
}

double bezierBlend( int i, double t, int n )
{
    return choose( n, i ) * pow(1 - t, n - i) * pow( t, i );
}
For most applications though, computing the powers and the binomial coefficients each time is absurdly inefficient. In typical applications, the degree of the curve is constant (e.g., 2 for quadratic or 3 for cubic), and you can compute the function much more efficiently by pre-expanding the formula. Here's an example for cubic curves:
double BezCoef(int i, double t)
{
    double tmp = 1 - t;
    switch (i)
    {
    case 0: return tmp*tmp*tmp;
    case 1: return 3*tmp*tmp*t;
    case 2: return 3*tmp*t*t;
    case 3: return t*t*t;
    }
    return 0; // not reached
}
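For context, here is a minimal (untested) sketch of how such a blend function is typically used to evaluate a point on a cubic Bezier curve; the Point type and control-point array are only for illustration:

struct Point { double x, y; };   // illustrative 2D point type

// Evaluate a cubic Bezier curve at parameter t in [0,1] from its
// four control points: P(t) = sum over i of BezCoef(i, t) * ctrl[i].
Point evalCubicBezier(const Point ctrl[4], double t)
{
    Point p = {0.0, 0.0};
    for (int i = 0; i <= 3; ++i) {
        double b = BezCoef(i, t);    // blending weight of control point i
        p.x += b * ctrl[i].x;
        p.y += b * ctrl[i].y;
    }
    return p;
}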

Related

CUDA Implementation of Partitioned Subgroup

Is there a more efficient way to implement the "Partitioned Subgroup" functions of Vulkan/OpenGL, one that does not have to loop over all elements in the subgroup? My current implementation just loops from 0 to WARP_SIZE.
References:
(slide 37+38) https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9909-nvidia-vulkan-features-update.pdf
https://github.com/KhronosGroup/GLSL/blob/master/extensions/nv/GL_NV_shader_subgroup_partitioned.txt
Simple Implementation:
__device__ uint32_t subgroupPartitionNV(ivec2 p)
{
    uint32_t result = 0;
    for (int i = 0; i < 32; ++i)
    {
        int x = __shfl_sync(0xFFFFFFFF, p(0), i);
        int y = __shfl_sync(0xFFFFFFFF, p(1), i);
        uint32_t b = __ballot_sync(0xFFFFFFFF, p(0) == x && p(1) == y);
        if (i == (threadIdx.x & 31)) result = b;   // compare against this thread's lane index
    }
    return result;
}
__device__ float subgroupPartitionedAddNV(float value, uint32_t ballot)
{
    float result = 0;
    for (unsigned int i = 0; i < 32; ++i)
    {
        float other_value = __shfl_sync(0xFFFFFFFF, value, i);
        if ((1U << i) & ballot) result += other_value;
    }
    return result;
}
Thanks to the hint of Abator I came up with a more efficient solution. It's a little ugly because labeled_partition is only implemented for int but works quite well.
template <int GROUP_SIZE = 32>
__device__ cooperative_groups::coalesced_group subgroupPartitionNV(ivec2 p)
{
    using namespace cooperative_groups;
    thread_block block = this_thread_block();
    thread_block_tile<GROUP_SIZE> tile32 = tiled_partition<GROUP_SIZE>(block);
    coalesced_group g1 = labeled_partition(tile32, p(0));
    coalesced_group g2 = labeled_partition(tile32, p(1));
    details::_coalesced_group_data_access acc;
    return acc.construct_from_mask<coalesced_group>(acc.get_mask(g1) & acc.get_mask(g2));
}

template <typename T, int GROUP_SIZE = 32>
__device__ T subgroupPartitionedAddNV(T value, cooperative_groups::coalesced_group group)
{
    int s = group.size();
    int r = group.thread_rank();
    for (int offset = GROUP_SIZE / 2; offset > 0; offset /= 2)
    {
        auto v = group.template shfl_down(value, offset);
        if (r + offset < s) value += v;
    }
    return value;
}
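For illustration, here is a rough, untested sketch of how the two device functions above might be combined in a kernel. It assumes the same ivec2 key type as in the question and a launch whose thread count exactly covers the input (tiled_partition must be reached by every thread of the block):

__global__ void perKeySums(const ivec2* keys, const float* values, float* out)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Group the threads of this warp by equal key pairs.
    cooperative_groups::coalesced_group g = subgroupPartitionNV(keys[idx]);

    // Reduce the values within each group.
    float sum = subgroupPartitionedAddNV(values[idx], g);

    // With the shfl_down reduction above, the complete sum ends up in the
    // lowest-ranked thread of each partition, so only that thread stores it.
    if (g.thread_rank() == 0) out[idx] = sum;
}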

Implementing Dijkstra's algorithm with C++ STL

I have implemented Dijkstra's algorithm as follows:
#include <iostream>
#include <bits/stdc++.h>
#include<cstdio>
#define ll long long int
#define mod 1000000007
#define pi 3.141592653589793
#define f first
#define s second
#define pb push_back
#define pf push_front
#define pob pop_back
#define pof pop_front
#define vfor(e, a) for (vector<ll> :: iterator e = a.begin(); e != a.end(); e++)
#define vfind(a, e) find(a.begin(), a.end(), e)
#define forr(i, n) for (ll i = 0; i < n; i++)
#define rfor(i, n) for (ll i = n - 1; i >= 0; i--)
#define fors(i, b, e, steps) for(ll i = b; i < e; i += steps)
#define rfors(i, e, b, steps) for(ll i = e; i > b; i -= steps)
#define mp make_pair
using namespace std;
void up(pair<ll, ll> a[], ll n, ll i, ll indArray[]) {
    ll ind = (i - 1) / 2;
    while (ind >= 0 && a[ind].s > a[i].s) {
        swap(a[ind], a[i]);
        indArray[a[ind].f] = ind;
        indArray[a[i].f] = i;
        i = ind;
        ind = (i - 1) / 2;
    }
}

void down(pair<ll, ll> a[], ll n, ll i, ll indArray[]) {
    ll left = 2 * i + 1;
    ll right = 2 * i + 2;
    ll m = a[i].s;
    ll ind = i;
    if (left < n && a[left].s < m) {
        ind = left;
        m = a[left].s;
    }
    if (right < n && a[right].s < m) {
        ind = right;
    }
    if (ind != i) {
        swap(a[i], a[ind]);
        indArray[a[i].f] = i;
        indArray[a[ind].f] = ind;
    }
}
int main() {
    ios_base::sync_with_stdio(false);
    cin.tie(NULL);
    cout.tie(NULL);
    // cout << setprecision(10);
    ll n, m;
    cin >> n >> m;
    vector<pair<ll, ll>> a[n];
    forr(i, m) {
        ll u, v, w;
        cin >> u >> v >> w;
        a[u].pb(mp(v, w));
        a[v].pb(mp(u, w));
    }
    ll parent[n];
    parent[0] = -1;
    pair<ll, ll> dist[n];
    forr(i, n) {
        dist[i] = mp(i, INT_MAX);
    }
    dist[0].s = 0;
    ll ind[n];
    iota(ind, ind + n, 0);
    ll ans[n];
    ans[0] = 0;
    bool visited[n];
    fill(visited, visited + n, false);
    ll size = n;
    forr(i, n) {
        ll u = dist[0].f;
        visited[u] = true;
        ll d1 = dist[0].s;
        ans[u] = dist[0].s;
        swap(dist[0], dist[size - 1]);
        size--;
        down(dist, size, 0, ind);
        for (auto e : a[u]) {
            if (visited[e.f]) {
                continue;
            }
            ll v = e.f;
            ll j = ind[v];
            if (dist[j].s > d1 + e.s) {
                dist[j].s = d1 + e.s;
                up(dist, size, j, ind);
                parent[v] = u;
            }
        }
    }
    stack<ll> st;
    forr(i, n) {
        ll j = i;
        while (j != -1) {
            st.push(j);
            j = parent[j];
        }
        while (!st.empty()) {
            cout << st.top() << "->";
            st.pop();
        }
        cout << " Path length is " << ans[i];
        cout << '\n';
    }
}
This implementation is correct and gives correct output.
As can be seen, every time I select the node with the lowest key value (distance from the source) and then update the keys of all nodes adjacent to the selected node. After updating the keys of the adjacent nodes I call the 'up' function to maintain the min-heap property. But a priority queue is already available in the C++ STL. How can I use it to avoid the functions up and down?
The thing is, I need to be able to find the index of the node-key pair in the min heap whose key needs to be updated. In this code I have used a separate ind array which is updated every time the min heap is updated.
But how can I make use of the C++ STL?
As you implied, std::priority_queue does not support efficient random access. For this case I would suggest std::set. It is not actually a heap but a balanced binary search tree; however, it behaves the way you want. find, insert and erase are all O(log n), so you can update a value in the same time by erasing and re-inserting it, and accessing the minimum is O(1).
You may refer to this reference implementation, which does exactly what I described. With your adjacency list, the time complexity is O(E log V), where E is the number of edges and V is the number of vertices.
And please note that:
- With the default comparator, std::set::begin() returns the minimum element if the set is non-empty.
- In this code, the distance is the first member of the pair and the index is the second, so the set elements are sorted by distance in ascending order.
(I did not look into the implementation of up and down in your code in detail.)
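A minimal, untested sketch of that std::set approach (my own, assuming a 0-indexed adjacency list adj where adj[u] holds (neighbour, weight) pairs):

#include <vector>
#include <set>
#include <limits>
#include <utility>

// Dijkstra with std::set as the "updatable heap"; returns distances from src.
std::vector<long long> dijkstra(const std::vector<std::vector<std::pair<int, long long>>>& adj, int src)
{
    const long long INF = std::numeric_limits<long long>::max();
    std::vector<long long> dist(adj.size(), INF);
    std::set<std::pair<long long, int>> s;          // (distance, vertex), ordered by distance

    dist[src] = 0;
    s.insert({0, src});
    while (!s.empty()) {
        auto [d, u] = *s.begin();                   // current minimum
        s.erase(s.begin());
        for (auto [v, w] : adj[u]) {
            if (d + w < dist[v]) {
                s.erase({dist[v], v});              // "decrease-key" = erase + re-insert
                dist[v] = d + w;
                s.insert({dist[v], v});
            }
        }
    }
    return dist;
}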

CUDA loop unrolling

I have a problem with loop unrolling in CUDA.
In normal serial code:
// serial basic:
for (int i = 0; i < n; i++) {
    c[i] = a[i] + b[i];
}
// serial loop unroll (step by 4 so each iteration handles four elements):
for (int i = 0; i < n; i += 4) {
    c[i]   = a[i]   + b[i];
    c[i+1] = a[i+1] + b[i+1];
    c[i+2] = a[i+2] + b[i+2];
    c[i+3] = a[i+3] + b[i+3];
}
So I think the CUDA loop unrolling looks like this:
int i = 2 * (threadIdx.x + blockIdx.x * blockDim.x);
a[i+0] = b[i+0] + c[i+0];
a[i+1] = b[i+1] + c[i+1];
But I can't understand the unrolling example in the CUDA Handbook.
This is a normal GlobalWrite kernel:
__global__ void GlobalWrites( T *out, T value, size_t N )
{
    for (size_t i = blockIdx.x*blockDim.x + threadIdx.x;
         i < N;
         i += blockDim.x*gridDim.x) {
        out[i] = value;
    }
}
unrolling kernel:
template<class T, const int n>
__global__ void Global_write(T* out, T value, size_t N)
{
    size_t i;
    for (i = n*blockDim.x*blockIdx.x + threadIdx.x;
         i < N - n*blockDim.x*blockIdx.x;
         i += n*gridDim.x*blockDim.x) {
        for (int j = 0; j < n; j++) {
            size_t index = i + j*blockDim.x;
            out[index] = value;
        }
    }
    for (int j = 0; j < n; j++) {
        size_t index = i + j*blockDim.x;
        if (index < N) out[index] = value;
    }
}
I know this kernel uses fewer blocks, but can someone explain why it works better (n=4 gives about a 10% speedup)?
If it wasn't obvious, because n is a template parameter, it is constant at compile time. This means that the compiler is free to optimize the constant trip count loop away by unrolling. It is, therefore, instructive to remove the template magic and unroll the loop by hand for the n=4 case you mentioned:
template<class T>
__global__ void Global_write(T* out, T value, size_t N)
{
    size_t i;
    for (i = 4*blockDim.x*blockIdx.x + threadIdx.x;
         i < N - 4*blockDim.x*blockIdx.x;
         i += 4*gridDim.x*blockDim.x) {
        out[i + 0 * blockDim.x] = value;
        out[i + 1 * blockDim.x] = value;
        out[i + 2 * blockDim.x] = value;
        out[i + 3 * blockDim.x] = value;
    }
    if (i + 0*blockDim.x < N) out[i + 0*blockDim.x] = value;
    if (i + 1*blockDim.x < N) out[i + 1*blockDim.x] = value;
    if (i + 2*blockDim.x < N) out[i + 2*blockDim.x] = value;
    if (i + 3*blockDim.x < N) out[i + 3*blockDim.x] = value;
}
The unrolled inner loop yields four completely independent writes which are coalesced. It is this instruction-level parallelism which gives the code higher instruction throughput and improved performance. I highly recommend Vasily Volkov's Unrolling Parallel Loops from the GTC conference of a few years ago, if you haven't already seen it. His presentation lays out the theoretical background for why this type of loop unrolling is an optimisation in CUDA.
In the templated kernel, const int n is known at compile time, allowing the compiler to actually unroll the for(int j = 0; j < n; i++) loop removing the conditional checks on that loop. If the loop size is not known at compile time, the compiler cannot unroll the loop. Simple as that.
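As a side note (not from the Handbook), when the trip count is a compile-time constant you can also ask nvcc to unroll a loop explicitly with #pragma unroll. A minimal, untested sketch, assuming the grid is launched so that n*blockDim.x*gridDim.x covers N:

template<class T, int n>
__global__ void global_write_unroll_pragma(T* out, T value, size_t N)
{
    size_t i = n*blockDim.x*blockIdx.x + threadIdx.x;
    #pragma unroll                        // n is a compile-time constant, so the loop fully unrolls
    for (int j = 0; j < n; ++j) {
        size_t index = i + j*blockDim.x;
        if (index < N) out[index] = value;   // bounds check kept here for simplicity
    }
}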

Upper Triangular matrix

I want to create an upper triangular matrix with CUDA.
In the upper triangular matrix, the elements located below the diagonal are zeros. This function should assign the given value to the other elements.
But the code below assigns all values as 0. Why?
__global__ void initUpperTrinagleGPU(int *devMatrix, int numR, int numC, int value) {
    int x = blockDim.x*blockIdx.x + threadIdx.x;
    int y = blockDim.y*blockIdx.y + threadIdx.y;
    int offset = y * numC + x;
    if (numC <= numR) {
        devMatrix[offset] = 0;
    }
    else
        devMatrix[offset] = value;
}
The condition if(numC <= numR) is wrong: it only compares the matrix dimensions, so it is true for every element whenever there are at most as many columns as rows, and it never looks at the element's position.
This might work, but it's just out of my head, not tested:
if (y > x) {                    // below the diagonal: row index greater than column index
    devMatrix[offset] = 0;
}
else {
    devMatrix[offset] = value;  // on or above the diagonal
}
Note that you should wrap this in a bounds check like:
if(y < numR && x < numC) { ...
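Putting the pieces together, an untested sketch of the full kernel (keeping the question's row-major layout, with y as the row index and x as the column index) could look like this:

__global__ void initUpperTrinagleGPU(int *devMatrix, int numR, int numC, int value) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;   // column index
    int y = blockDim.y * blockIdx.y + threadIdx.y;   // row index

    if (y < numR && x < numC) {                      // ignore out-of-range threads
        int offset = y * numC + x;
        if (y > x)
            devMatrix[offset] = 0;                   // below the diagonal
        else
            devMatrix[offset] = value;               // on or above the diagonal
    }
}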

How to simplify this loop?

Consider an array a[i], i = 0, 1, ..., g, where g could be any given number, and a[0] = 1.
for a[1] = a[0]+1 to 1 do
  for a[2] = a[1]+1 to 3 do
    for a[3] = a[2]+1 to 5 do
      ...
        for a[g] = a[g-1]+1 to 2g-1 do
          # print a[1], a[2], ..., a[g] #
The problem is that every time we change the value of g, we need to modify the code (the loops above). This is not good code.
Recursion is one way to solve this (although I would love to see an iterative solution).
!!! Warning, untested code below !!!
template<typename A, unsigned int Size>
void recurse(A (&arr)[Size], int level, int g)
{
    if (level > g)
    {
        // I am at the bottom level, do stuff here
        return;
    }
    for (arr[level] = arr[level-1] + 1; arr[level] <= 2 * level - 1; arr[level]++)
    {
        recurse(arr, level + 1, g);
    }
}
Then call it with recurse(arr, 1, g); (having set arr[0] = 1 first).
Imagine you are representing numbers with an array of digits. For example, 682 would be [6,8,2].
If you wanted to count from 0 to 999 you could write:
for (int n[0] = 0; n[0] <= 9; ++n[0])
for (int n[1] = 0; n[1] <= 9; ++n[1])
for (int n[2] = 0; n[2] <= 9; ++n[2])
// Do something with three digit number n here
But when you want to count to 9999 you need an extra for loop.
Instead, you use the procedure for adding 1 to a number: increment the final digit, if it overflows move to the preceding digit and so on. Your loop is complete when the first digit overflows. This handles numbers with any number of digits.
You need an analogous procedure to "add 1" to your loop variables.
Increment the final "digit", that is a[g]. If it overflows (i.e. exceeds 2g-1) then move on to the next most-significant "digit" (a[g-1]) and repeat. A slight complication compared to doing this with numbers is that having gone back through the array as values overflow, you then need to go forward to reset the overflowed digits to their new base values (which depend on the values to the left).
The following C# code implements both methods and prints the arrays to the console.
static void Print(int[] a, int n, ref int count)
{
    ++count;
    Console.Write("{0} ", count);
    for (int i = 0; i <= n; ++i)
    {
        Console.Write("{0} ", a[i]);
    }
    Console.WriteLine();
}

private static void InitialiseRight(int[] a, int startIndex, int g)
{
    for (int i = startIndex; i <= g; ++i)
        a[i] = a[i - 1] + 1;
}

static void Main(string[] args)
{
    const int g = 5;

    // Old method
    int count = 0;
    int[] a = new int[g + 1];
    a[0] = 1;
    for (a[1] = a[0] + 1; a[1] <= 2; ++a[1])
        for (a[2] = a[1] + 1; a[2] <= 3; ++a[2])
            for (a[3] = a[2] + 1; a[3] <= 5; ++a[3])
                for (a[4] = a[3] + 1; a[4] <= 7; ++a[4])
                    for (a[5] = a[4] + 1; a[5] <= 9; ++a[5])
                        Print(a, g, ref count);
    Console.WriteLine();
    count = 0;

    // New method
    // Initialise array
    a[0] = 1;
    InitialiseRight(a, 1, g);
    int index = g;
    // Loop until all "digits" have overflowed
    while (index != 0)
    {
        // Do processing here
        Print(a, g, ref count);

        // "Add one" to array
        index = g;
        bool carry = true;
        while ((index > 0) && carry)
        {
            carry = false;
            ++a[index];
            if (a[index] > 2 * index - 1)
            {
                --index;
                carry = true;
            }
        }

        // Re-initialise digits that overflowed.
        if (index != g)
            InitialiseRight(a, index + 1, g);
    }
}
I'd say you don't want nested loops in the first place. Instead, you just want to call a suitable function, taking the current nesting level, the maximum nesting level (i.e. g), the start of the loop, and whatever it needs as context for the computation as arguments:
void process(int level, int g, int start, T& context) {
    if (level != g) {
        for (int a(start + 1), end(2 * level - 1); a < end; ++a) {
            process(level + 1, g, a, context);
        }
    }
    else {
        // computation goes here
    }
}
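To make the sketch concrete, here is one possible (untested) reading of it, since T and the computation are left open above: the context is simply the array of chosen values, and each level records its loop variable before recursing.

#include <iostream>
#include <vector>

// The "context" is the array of chosen values a[0..g]; each level stores its
// value in a[level] before descending, and the bottom level prints a[1..g].
void process(int level, int g, int start, std::vector<int>& a) {
    if (level <= g) {
        for (int v = start + 1; v <= 2 * level - 1; ++v) {   // bounds taken from the question
            a[level] = v;
            process(level + 1, g, v, a);
        }
    } else {
        for (int i = 1; i <= g; ++i)
            std::cout << a[i] << ' ';
        std::cout << '\n';
    }
}

// Usage: std::vector<int> a(g + 1); a[0] = 1; process(1, g, a[0], a);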