Frama-C loop invariant proof

I just tried to prove a sort function in Frama-C. However, when I try to prove the outer loop:
loop invariant 0 <= i < l;
loop invariant 0 < i < l ==> \forall int a,b; 0 <= b <= l-i-1 <= a < l ==>
    t[a] >= t[b];
there are always orange bullets (unproven proof obligations). I have looked at many examples and cannot find the reason. Can someone help me? Thanks!
The following is my source code:
/*@ predicate Swap{L1,L2}(int *a, integer l, integer i, integer j) =
      \at(a[i],L1) == \at(a[j],L2) &&
      \at(a[j],L1) == \at(a[i],L2) &&
      \forall integer k; k != i && k != j
        ==> \at(a[k],L1) == \at(a[k],L2);
*/

/*@ predicate Sorted{L}(int *a, integer l, integer h) =
      \forall integer i,j; l <= i <= j < h ==> a[i] <= a[j];
*/

/*@ requires \valid(t + (0..l-1));
    requires 0 <= i < l;
    requires 0 <= j < l;
    assigns t[i], t[j];
    ensures Swap{Old,Here}(t, l, i, j);
*/
void swap(int *t, int l, int i, int j) {
    int tmp;
    tmp = t[i];
    t[i] = t[j];
    t[j] = tmp;
    return;
}

/*@ requires l > 0;
    requires \valid(t + (0..l-1));
    ensures (\forall integer a; 0 <= a < l
             ==> (\exists integer b; 0 <= b < l
                  ==> \at(t[b],Old) == \at(t[a],Here)));
    ensures Sorted{Here}(t, 0, l-1);
*/
void sort(int *t, int l) {
    int i;
    int j;
    i = j = 0;
    /*@ loop invariant 0 <= i < l;
        loop invariant 0 < i < l ==> \forall int a,b; 0 <= b <= l-i-1 <= a < l ==>
            t[a] >= t[b];
    */
    for (i = 0; i < l; i++) {
        /*@ loop invariant 0 <= j < l;
            loop invariant 0 < j < l ==> \forall int a; 0 <= a <= j ==> t[a] <= t[j];
        */
        for (j = 0; j < l-1; j++) {
            if (t[j] > t[j+1]) {
                swap(t, l, j, j+1);
            }
        }
    }
}
and I use
frama-c-gui -wp sort.c
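For what it's worth, one concrete problem with the annotations as written (my own observation, added for context): a loop invariant must still hold after the final i++, when i == l, so loop invariant 0 <= i < l can never be proved preserved and needs to be 0 <= i <= l. In addition, WP usually needs a loop assigns clause for each loop before it can preserve any invariant about t. A minimal sketch of the shape of the outer annotation (the Sorted bound is illustrative, not the asker's exact property):
/*@ loop invariant 0 <= i <= l;
    loop invariant Sorted{Here}(t, l-i, l);
    loop assigns i, j, t[0 .. l-1];
    loop variant l - i;
*/
for (i = 0; i < l; i++) { /* ... */ }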

Related

Cuda Implementation of Partitioned Subgroup

Is there a more efficient way to implement the "Partitioned Subgroup" functions of Vulkan/OpenGL, one that does not have to loop over all elements in the subgroup? My current implementation just uses a loop from 0 to WARP_SIZE.
References:
(slide 37+38) https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9909-nvidia-vulkan-features-update.pdf
https://github.com/KhronosGroup/GLSL/blob/master/extensions/nv/GL_NV_shader_subgroup_partitioned.txt
Simple implementation:
__device__ uint32_t subgroupPartitionNV(ivec2 p)
{
    uint32_t result = 0;
    for (int i = 0; i < 32; ++i)
    {
        int x = __shfl_sync(0xFFFFFFFF, p(0), i);
        int y = __shfl_sync(0xFFFFFFFF, p(1), i);
        uint32_t b = __ballot_sync(0xFFFFFFFF, p(0) == x && p(1) == y);
        if (i == (threadIdx.x & 31)) result = b;   // compare against the lane id; & binds looser than ==
    }
    return result;
}

__device__ float subgroupPartitionedAddNV(float value, uint32_t ballot)
{
    float result = 0;   // float sum over all threads whose ballot bit is set
    for (unsigned int i = 0; i < 32; ++i)
    {
        float other_value = __shfl_sync(0xFFFFFFFF, value, i);
        if ((1U << i) & ballot) result += other_value;
    }
    return result;
}
Thanks to Abator's hint, I came up with a more efficient solution. It's a little ugly because labeled_partition is only implemented for int, but it works quite well.
template <int GROUP_SIZE = 32>
__device__ cooperative_groups::coalesced_group subgroupPartitionNV(ivec2 p)
{
    using namespace cooperative_groups;
    thread_block block = this_thread_block();
    thread_block_tile<GROUP_SIZE> tile32 = tiled_partition<GROUP_SIZE>(block);
    coalesced_group g1 = labeled_partition(tile32, p(0));
    coalesced_group g2 = labeled_partition(tile32, p(1));
    details::_coalesced_group_data_access acc;
    return acc.construct_from_mask<coalesced_group>(acc.get_mask(g1) & acc.get_mask(g2));
}

template <typename T, int GROUP_SIZE = 32>
__device__ T subgroupPartitionedAddNV(T value, cooperative_groups::coalesced_group group)
{
    int s = group.size();
    int r = group.thread_rank();
    for (int offset = GROUP_SIZE / 2; offset > 0; offset /= 2)
    {
        auto v = group.template shfl_down(value, offset);
        if (r + offset < s) value += v;
    }
    return value;
}
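A possible usage sketch for the two helpers above (my own illustration; example_kernel, keys, values, and sums are hypothetical names, and ivec2 is assumed to be an Eigen-style vector indexed with operator()):
// Each thread partitions its warp by the (x, y) key of its element,
// then sums the values of all threads that share the same key.
__global__ void example_kernel(const ivec2* keys, const float* values, float* sums)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    cooperative_groups::coalesced_group g = subgroupPartitionNV(keys[tid]);
    sums[tid] = subgroupPartitionedAddNV(values[tid], g);
}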

Recursion of a function fails

I am working on a Sudoku solver that uses backtracking. For reasons unknown to me, the recursion fails: even when the program reaches the line with the recursive call, the function won't call itself. The program just continues as if nothing were there.
#include <bits/stdc++.h>
using namespace std;

ifstream in("data.in");
ofstream out("data.out");
int sudoku[10][10];
int f[10];
vector< pair<int, int> > v;

bool continuare(int pas){
    int x = v[pas].first;
    int y = v[pas].second;
    for(int i = x; i <= 9; i++)
        f[ sudoku[i][y] ]++;
    for(int i = x - 1; i >= 1; i--)
        f[ sudoku[i][y] ]++;
    for(int j = x + 1; j <= 9; j++)
        f[ sudoku[x][j] ]++;
    for(int j = x - 1; j >= 1; j--)
        f[ sudoku[x][j] ]++;
    for( int i = x - 3 + x%3, c1 = 0; c1 < 3; c1++, i++ )
        for( int j = y - 3 + y%3, c2 = 0; c2 < 3; c2++, j++ )
            f[ sudoku[i][j] ]++;
    for(int i = 1; i <= 9; i++){
        if( f[i] > 3 )
            return false;
        f[i] = 0;
    }
    return true;
}

void afisare(){
    for(int i = 1; i <= 9; i++){
        for(int j = 1; j <= 9; j++)
            out<<sudoku[i][j]<<" ";
        out<<"\n";
    }
}

void backtracking( int pas ){
    if( pas > v.size() )
        afisare();
    else
        for(int i = 1; i <= 9; i++){
            sudoku[ v[pas].first ][ v[pas].second ] = i;
            if( continuare(pas) )
                backtracking( pas + 1 );
        }
}

int main()
{
    for(int i = 1; i <= 9; i++)
        for(int j = 1; j <= 9; j++){
            in>>sudoku[i][j];
            if(sudoku[i][j] == 0)
                v.push_back( make_pair(i, j) );
        }
    backtracking(1);
    return 0;
}
As you may have noticed, the problem occurs when backtracking() calls itself: as I said, nothing happens there.
Copied from a comment, which seems to have solved the question:
Compile with the -g flag and run your executable under gdb. I did just that and saw that it segfaults at f[ sudoku[i][j] ]++; in the continuare function.
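To illustrate the likely cause (my reading of the crash site, not part of the original comment): for a 1-indexed board, x - 3 + x%3 evaluates to -1 when x == 1, so the 3x3-box loop reads sudoku[-1][j]. That out-of-bounds access is undefined behaviour, which is also why the recursive call can appear to do nothing. A sketch of a box loop that stays in bounds:
// x - (x-1)%3 is the top-left row of the 3x3 box on a 1-indexed board
// (1..9) and is never negative; the same formula applies to the column.
for (int i = x - (x-1)%3, c1 = 0; c1 < 3; c1++, i++)
    for (int j = y - (y-1)%3, c2 = 0; c2 < 3; c2++, j++)
        f[ sudoku[i][j] ]++;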

Reverse engineering history pattern length in branch predictor

I'm trying to find the length of the history pattern used by the branch predictor of my computer's processor. I generate variable-length arrays of random bits and branch on the value of each bit. I then plot the run times of different executions of the function and look for a knee in the graph, but I don't see any such point. What am I doing wrong? Any ideas?
Here is my code:
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <string>
#include <vector>
using namespace std;

vector<int> randomArr(int n)
{
    vector<int> arr(n);
    for (int i = 0; i < n; i++){
        arr[i] = rand() % 2;
    }
    return arr;
}

int branchy(vector<int> & arr){
    int a = 0;
    int b = 0;
    for (int i = 0; i < arr.size(); i++) {
        if (arr[i] == 0)
            a++;
        else
            b++;
    }
    return a^b;
}

int main() {
    long int iterations = 100000;
    clock_t start_s;
    clock_t stop_s;
    ofstream runtimesFile;
    runtimesFile.open("runtimesFile.txt");
    for (int j = 0; j < iterations; j++){
        vector<int> arr = randomArr(j);
        start_s = clock();
        branchy(arr);
        stop_s = clock();
        runtimesFile << to_string(stop_s - start_s) << "\n";
    }
    runtimesFile.close();
    return 0;
}
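As a point of comparison, here is a sketch of an alternative experiment (my own addition, not part of the original question): uniformly random bits stay unpredictable no matter how long the predictor's history is, so no knee can appear; a branch pattern that repeats with period k, however, becomes predictable once the history covers about k outcomes, so timing it for growing k should expose the history length. All names below are illustrative.
#include <cstddef>
#include <ctime>
#include <iostream>
#include <vector>

// Time a branch whose outcome repeats with period k: exactly one taken
// branch every k iterations. Once k exceeds the predictor's usable
// history, mispredictions (and run time) should jump.
int branchyPeriodic(const std::vector<int>& pattern, long reps) {
    int a = 0, b = 0;
    for (long r = 0; r < reps; ++r)
        for (std::size_t i = 0; i < pattern.size(); ++i)
            if (pattern[i]) ++a; else ++b;
    return a ^ b;
}

int main() {
    for (int k = 1; k <= 64; ++k) {
        std::vector<int> pattern(k, 0);
        pattern[k - 1] = 1;                  // period-k pattern: 0,...,0,1
        const long reps = 1000000 / k;       // keep total work roughly constant
        clock_t t0 = clock();
        volatile int sink = branchyPeriodic(pattern, reps);
        clock_t t1 = clock();
        (void)sink;
        std::cout << k << "\t" << (t1 - t0) << "\n";
    }
    return 0;
}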

Using shared memory in CUDA gives memory write error

I have a kernel that works fine:
__global__ static void CalcSTLDistance_Kernel(Integer ComputeParticleNumber)
{
    const Integer TID = CudaGetTargetID();
    const Integer ID = TID;
    if(ID >= ComputeParticleNumber)
    {
        return;
    }
    CDistance NearestDistance;
    Integer NearestID = -1;
    NearestDistance.Magnitude = 1e8;
    NearestDistance.Direction = make_Scalar3(0,0,0);
    if(c_daOutputParticleID[ID] < -1)
    {
        c_daSTLDistance[ID] = NearestDistance;
        c_daSTLID[ID] = NearestID;
        return;
    }
    Scalar3 TargetPosition = c_daParticlePosition[ID];
    Integer TriangleID;
    Integer CIDX, CIDY, CIDZ;
    Integer CID = GetCellID(&CONSTANT_BOUNDINGBOX,&TargetPosition,CIDX, CIDY, CIDZ);
    Integer Range = 1;
    if(CID >= 0 && CID < c_CellNum)
    {
        for(Integer k = -Range; k <= Range; ++k)
        {
            for(Integer j = -Range; j <= Range; ++j)
            {
                for(Integer i = -Range; i <= Range; ++i)
                {
                    Integer MCID = GetCellID(&CONSTANT_BOUNDINGBOX,CIDX + i, CIDY + j, CIDZ + k);
                    if(MCID < 0 || MCID >= c_CellNum)
                    {
                        continue;
                    }
                    unsigned int TriangleNum = c_daCell[MCID].m_TriangleNum;
                    for(unsigned int l = 0; l < TriangleNum; ++l)
                    {
                        TriangleID = c_daCell[MCID].m_TriangleID[l];
                        if( TriangleID >= 0 && TriangleID < c_TriangleNum && TriangleID != NearestID)// No need to calculate again for the same triangle
                        {
                            CDistance Distance;
                            Distance.Magnitude = CalcDistance(&c_daTriangles[TriangleID], &TargetPosition, &Distance.Direction);
                            if(Distance.Magnitude < NearestDistance.Magnitude)
                            {
                                NearestDistance = Distance;
                                NearestID = TriangleID;
                            }
                        }
                    }
                }
            }
        }
    }
    c_daSTLDistance[ID] = NearestDistance;
    c_daSTLID[ID] = NearestID;
}
Here c_daParticlePosition is a float3 array in constant memory. I wanted to use shared memory instead, so I created a float3 shared array and tried to copy the constant data into it. However, the kernel now fails with an unknown error, and cuda-memcheck reports the message shown below. The kernel is launched with 255 threads per block and 2 blocks.
Shared memory version:
__global__ static void CalcSTLDistance_Kernel(Integer ComputeParticleNumber)
{
    //const Integer TID = CudaGetTargetID();
    const Integer ID = CudaGetTargetID();
    extern __shared__ float3 s[];
    /*if(ID >= ComputeParticleNumber)
    {
        return;
    }*/
    s[ID] = c_daParticlePosition[ID];
    __syncthreads();
    CDistance NearestDistance;
    Integer NearestID = -1;
    NearestDistance.Magnitude = 1e8;
    NearestDistance.Direction.x = 0;
    NearestDistance.Direction.y = 0;
    NearestDistance.Direction.z = 0;//make_Scalar3(0,0,0);
    //if(c_daOutputParticleID[ID] < -1)
    //{
    //    c_daSTLDistance[ID] = NearestDistance;
    //    c_daSTLID[ID] = NearestID;
    //    return;
    //}
    //Scalar3 TargetPosition = c_daParticlePosition[ID];
    Integer TriangleID;
    Integer CIDX, CIDY, CIDZ;
    Integer CID = GetCellID(&CONSTANT_BOUNDINGBOX,&s[ID],CIDX, CIDY, CIDZ);
    if(CID >= 0 && CID < c_CellNum)
    {
        //Integer Range = 1;
        for(Integer k = -1; k <= 1; ++k)
        {
            for(Integer j = -1; j <= 1; ++j)
            {
                for(Integer i = -1; i <= 1; ++i)
                {
                    Integer MCID = GetCellID(&CONSTANT_BOUNDINGBOX,CIDX + i, CIDY + j, CIDZ + k);
                    if(MCID < 0 || MCID >= c_CellNum)
                    {
                        continue;
                    }
                    unsigned int TriangleNum = c_daCell[MCID].m_TriangleNum;
                    for(unsigned int l = 0; l < TriangleNum; ++l)
                    {
                        TriangleID = c_daCell[MCID].m_TriangleID[l];
                        /*if(c_daTrianglesParameters[c_daTriangles[TriangleID].ModelIDNumber].isDrag)
                        {
                            continue;
                        }*/
                        if( TriangleID >= 0 && TriangleID < c_TriangleNum && TriangleID != NearestID)// No need to calculate again for the same triangle
                        {
                            CDistance Distance;
                            Distance.Magnitude = CalcDistance(&c_daTriangles[TriangleID], &s[ID], &Distance.Direction);
                            if(Distance.Magnitude < NearestDistance.Magnitude)
                            {
                                NearestDistance = Distance;
                                NearestID = TriangleID;
                            }
                        }
                    }
                }
            }
        }
    }
    c_daSTLDistance[ID] = NearestDistance;
    c_daSTLID[ID] = NearestID;
}
Error:
Invalid __shared__ write of size 4
========= at 0x00000128 in CalcSTLDistance_Kernel(int)
========= by thread (159,0,0) in block (0,0,0)
========= Address 0x0000077c is out of bounds
You may find useful info on how to work with shared memory in this article. Focus especially on the static shared memory and dynamic shared memory sections.
Based on the above, you should find that you are simply writing out of bounds of your array s, exactly as the error message says. To fix the issue you can do one of two things (a sketch of the second option follows this list):
either specify the size of the shared memory array s at compile time, if you know it in advance, such as __shared__ float3 s[123456];
or use a dynamically sized s array, which is basically what you are doing at the moment, but ALSO specify the third kernel launch parameter: CalcSTLDistance_Kernel<<<gridSize, blockSize, sharedMemorySizeInBytes>>>. If you will be using an array of 123456 float3s, then use int sharedMemorySizeInBytes = 123456 * sizeof(float3).
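As a self-contained sketch of the dynamic variant (illustrative names, not the asker's kernel): the shared array exists once per block, so it must be indexed with threadIdx.x rather than a global thread ID, and its byte size goes in the third launch parameter.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void CopyToShared(const float3* pos, float3* out, int n)
{
    extern __shared__ float3 s[];
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) return;
    s[threadIdx.x] = pos[gid];   // one slot per thread in the block: in bounds
    __syncthreads();
    out[gid] = s[threadIdx.x];
}

int main()
{
    const int blockSize = 255, gridSize = 2, n = blockSize * gridSize;
    float3 *d_pos, *d_out;
    cudaMalloc(&d_pos, n * sizeof(float3));
    cudaMalloc(&d_out, n * sizeof(float3));
    cudaMemset(d_pos, 0, n * sizeof(float3));
    size_t sharedBytes = blockSize * sizeof(float3);   // third launch parameter
    CopyToShared<<<gridSize, blockSize, sharedBytes>>>(d_pos, d_out, n);
    printf("%s\n", cudaGetErrorString(cudaDeviceSynchronize()));
    cudaFree(d_pos);
    cudaFree(d_out);
    return 0;
}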

CUDA Broken Float Math on Kepler/Fermi Arch

I have a program that does a lot of single-precision math. It produces correct results if I compile for the 1.0 architecture, but it is broken for the 2.x and 3.x architectures. What would cause this?
Included below:
Very long code sample.
Compile command and good output.
Compile command and bad output.
If I run the same routine on the CPU using gcc, I get results that match the 1.0 architecture.
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

/*
 * svdcomp - SVD decomposition routine.
 * Takes an mxn matrix a and decomposes it into udv, where u,v are
 * left and right orthogonal transformation matrices, and d is a
 * diagonal matrix of singular values.
 *
 * This routine is adapted from svdecomp.c in XLISP-STAT 2.1 which is
 * code from Numerical Recipes adapted by Luke Tierney and David Betz.
 * Originally from: "Numerical Recipes in C: The Art of Scientific Computing",
 * Press, Flannery, Teukolosky, Vetterling. 1992.
 *
 * Input to dsvd is as follows:
 *   a = mxn matrix to be decomposed, gets overwritten with u
 *   m = row dimension of a
 *   n = column dimension of a
 *   w = returns the vector of singular values of a
 *   v = returns the right orthogonal transformation matrix
 */

#define SIGN(a, b) ((b) >= 0.0f ? fabsf(a) : -fabsf(a))
#define MIN(x,y) ( (x) < (y) ? (x) : (y) )
#define MAX(x,y) ((x)>(y)?(x):(y))

#define PERR(call) \
    if (call) {\
        fprintf(stderr, "%s:%d Error [%s] on "#call"\n", __FILE__, __LINE__,\
                cudaGetErrorString(cudaGetLastError()));\
        exit(1);\
    }

#define ERRCHECK \
    if (cudaPeekAtLastError()) { \
        fprintf(stderr, "%s:%d Error [%s]\n", __FILE__, __LINE__,\
                cudaGetErrorString(cudaGetLastError()));\
        exit(1);\
    }
__device__ int
svd(float *a, int m, int n, float *w, float *v, int skip_u)
{
    int flag, i, its, j, jj, k, l, nm;
    float c, f, h, s, x, y, z;
    float anorm = 0.0f, g = 0.0f, scale = 0.0f;
    float rv1[3];

    /* Householder reduction to bidiagonal form */
    for (i = 0; i < n; i++)
    {
        /* left-hand reduction */
        l = i + 1;
        rv1[i] = scale * g;
        g = s = scale = 0.0f;
        if (i < m)
        {
            for (k = i; k < m; k++)
                scale += fabsf(a[k*n+i]);
            if (scale)
            {
                for (k = i; k < m; k++)
                {
                    a[k*n+i] /= scale;
                    s += powf(a[k*n+i], 2);
                }
                f = a[i*n+i];
                g = -SIGN(sqrtf(s), f);
                h = f * g - s;
                a[i*n+i] = f - g;
                if (i != n - 1)
                {
                    for (j = l; j < n; j++)
                    {
                        for (s = 0.0f, k = i; k < m; k++)
                            s += a[k*n+i] * a[k*n+j];
                        f = s / h;
                        for (k = i; k < m; k++)
                            a[k*n+j] += f * a[k*n+i];
                    }
                }
                for (k = i; k < m; k++)
                    a[k*n+i] *= scale;
            }
        }
        w[i] = scale * g;

        /* right-hand reduction */
        g = s = scale = 0.0f;
        if (i < m && i != n - 1)
        {
            for (k = l; k < n; k++)
                scale += fabsf(a[i*n+k]);
            if (scale)
            {
                for (k = l; k < n; k++)
                {
                    a[i*n+k] /= scale;
                    s += powf(a[i*n+k], 2);
                }
                f = a[i*n+l];
                g = -SIGN(sqrtf(s), f);
                h = f * g - s;
                a[i*n+l] = f - g;
                for (k = l; k < n; k++)
                    rv1[k] = a[i*n+k] / h;
                if (i != m - 1)
                {
                    for (j = l; j < m; j++)
                    {
                        for (s = 0.0f, k = l; k < n; k++)
                            s += a[j*n+k] * a[i*n+k];
                        for (k = l; k < n; k++)
                            a[j*n+k] += s * rv1[k];
                    }
                }
                for (k = l; k < n; k++)
                    a[i*n+k] *= scale;
            }
        }
        anorm = MAX(anorm, fabsf(w[i]) + fabsf(rv1[i]));
    }
    /* accumulate the right-hand transformation */
    for (i = n - 1; i >= 0; i--)
    {
        if (i < n - 1)
        {
            if (g)
            {
                for (j = l; j < n; j++)
                    v[j*n+i] = (a[i*n+j] / a[i*n+l]) / g;
                /* float division to avoid underflow */
                for (j = l; j < n; j++)
                {
                    for (s = 0.0f, k = l; k < n; k++)
                        s += a[i*n+k] * v[k*n+j];
                    for (k = l; k < n; k++)
                        v[k*n+j] += s * v[k*n+i];
                }
            }
            for (j = l; j < n; j++)
                v[i*n+j] = v[j*n+i] = 0.0f;
        }
        v[i*n+i] = 1.0f;
        g = rv1[i];
        l = i;
    }

    /* accumulate the left-hand transformation */
    if (!skip_u) {
        for (i = n - 1; i >= 0; i--)
        {
            l = i + 1;
            g = w[i];
            if (i < n - 1)
                for (j = l; j < n; j++)
                    a[i*n+j] = 0.0f;
            if (g)
            {
                g = 1.0f / g;
                if (i != n - 1)
                {
                    for (j = l; j < n; j++)
                    {
                        for (s = 0.0f, k = l; k < m; k++)
                            s += a[k*n+i] * a[k*n+j];
                        f = (s / a[i*n+i]) * g;
                        for (k = i; k < m; k++)
                            a[k*n+j] += f * a[k*n+i];
                    }
                }
                for (j = i; j < m; j++)
                    a[j*n+i] = a[j*n+i]*g;
            }
            else
            {
                for (j = i; j < m; j++)
                    a[j*n+i] = 0.0f;
            }
            ++a[i*n+i];
        }
    }
    /* diagonalize the bidiagonal form */
    for (k = n - 1; k >= 0; k--)
    {   /* loop over singular values */
        for (its = 0; its < 30; its++)
        {   /* loop over allowed iterations */
            flag = 1;
            for (l = k; l >= 0; l--)
            {   /* test for splitting */
                nm = l - 1;
                if (fabsf(rv1[l]) + anorm == anorm)
                {
                    flag = 0;
                    break;
                }
                if (fabsf(w[nm]) + anorm == anorm)
                    break;
            }
            if (flag)
            {
                c = 0.0f;
                s = 1.0f;
                for (i = l; i <= k; i++)
                {
                    f = s * rv1[i];
                    if (fabsf(f) + anorm != anorm)
                    {
                        g = w[i];
                        h = hypotf(f, g);
                        w[i] = h;
                        h = 1.0f / h;
                        c = g * h;
                        s = (- f * h);
                        if (!skip_u) {
                            for (j = 0; j < m; j++)
                            {
                                y = a[j*n+nm];
                                z = a[j*n+i];
                                a[j*n+nm] = y * c + z * s;
                                a[j*n+i] = z * c - y * s;
                            }
                        }
                    }
                }
            }
            z = w[k];
            if (l == k)
            {   /* convergence */
                if (z < 0.0f)
                {   /* make singular value nonnegative */
                    w[k] = -z;
                    for (j = 0; j < n; j++)
                        v[j*n+k] = -v[j*n+k];
                }
                break;
            }
            if (its >= 30) {
            }
            /* shift from bottom 2 x 2 minor */
            x = w[l];
            nm = k - 1;
            y = w[nm];
            g = rv1[nm];
            h = rv1[k];
            f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0f * h * y);
            g = hypotf(f, 1.0f);
            f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x;
            /* next QR transformation */
            c = s = 1.0f;
            for (j = l; j <= nm; j++)
            {
                i = j + 1;
                g = rv1[i];
                y = w[i];
                h = s * g;
                g = c * g;
                z = hypotf(f, h);
                rv1[j] = z;
                c = f / z;
                s = h / z;
                f = x * c + g * s;
                g = g * c - x * s;
                h = y * s;
                y = y * c;
                for (jj = 0; jj < n; jj++)
                {
                    x = v[jj*n+j];
                    z = v[jj*n+i];
                    v[jj*n+j] = x * c + z * s;
                    v[jj*n+i] = z * c - x * s;
                }
                z = hypotf(f, h);
                w[j] = z;
                if (z)
                {
                    z = 1.0f / z;
                    c = f * z;
                    s = h * z;
                }
                f = (c * g) + (s * y);
                x = (c * y) - (s * g);
                if (!skip_u) {
                    for (jj = 0; jj < m; jj++)
                    {
                        y = a[jj*n+j];
                        z = a[jj*n+i];
                        a[jj*n+j] = y * c + z * s;
                        a[jj*n+i] = z * c - y * s;
                    }
                }
            }
            rv1[l] = 0.0f;
            rv1[k] = f;
            w[k] = x;
        }
    }
    return(0);
}
__global__ void
svd_kernel(float *v)
{
    float a[9], w[3];
    a[0] = 8.0f;
    a[1] = 3.0f;
    a[2] = 7.0f;
    a[3] = 7.0f;
    a[4] = 9.0f;
    a[5] = 1.0f;
    a[6] = 3.0f;
    a[7] = 7.0f;
    a[8] = 2.0f;
    svd(a, 3, 3, w, v, 1);
}

int main()
{
    int i, j;
    float *v_d, v[9];
    PERR(cudaMalloc(&v_d, 9*sizeof(float)));
    svd_kernel<<<1,1>>>(v_d);
    cudaDeviceSynchronize();
    ERRCHECK;
    PERR(cudaMemcpy(v, v_d, 9*sizeof(float), cudaMemcpyDeviceToHost));
    for (i = 0; i < 3; i++) {
        for (j = 0; j < 3; j++) {
            printf("%6.3f\t", v[i*3+j]);
        }
        printf("\n");
    }
    return 0;
}
Correct Results:
$ nvcc -arch=sm_10 -o svd svd.cu
$ ./svd
-0.657 -0.685 0.314
-0.668 0.337 -0.664
-0.349 0.646 0.679
Broken Results:
$ nvcc -arch=sm_20 -o svd svd.cu
$ ./svd
-0.661 -0.660 0.356
-0.642 0.253 -0.724
0.019 0.460 0.888
It seems that CUDA 6 fixes the issue.
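Independent of that, one experiment worth trying (my suggestion, not from the thread) is to disable FMA contraction, which the compiler applies by default on sm_2x and later and which changes single-precision rounding relative to both sm_10 and plain CPU code:
$ nvcc -arch=sm_20 -fmad=false -o svd svd.cu
If the output then matches the CPU results, the difference comes from fused multiply-add contraction rather than from a logic bug.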