Using shared memory in CUDA gives memory write error - CUDA

I have a kernel that works fine as follows:
__global__ static void CalcSTLDistance_Kernel(Integer ComputeParticleNumber)
{
    const Integer TID = CudaGetTargetID();
    const Integer ID = TID;
    if(ID >= ComputeParticleNumber)
    {
        return;
    }
    CDistance NearestDistance;
    Integer NearestID = -1;
    NearestDistance.Magnitude = 1e8;
    NearestDistance.Direction = make_Scalar3(0,0,0);
    if(c_daOutputParticleID[ID] < -1)
    {
        c_daSTLDistance[ID] = NearestDistance;
        c_daSTLID[ID] = NearestID;
        return;
    }
    Scalar3 TargetPosition = c_daParticlePosition[ID];
    Integer TriangleID;
    Integer CIDX, CIDY, CIDZ;
    Integer CID = GetCellID(&CONSTANT_BOUNDINGBOX, &TargetPosition, CIDX, CIDY, CIDZ);
    Integer Range = 1;
    if(CID >= 0 && CID < c_CellNum)
    {
        for(Integer k = -Range; k <= Range; ++k)
        {
            for(Integer j = -Range; j <= Range; ++j)
            {
                for(Integer i = -Range; i <= Range; ++i)
                {
                    Integer MCID = GetCellID(&CONSTANT_BOUNDINGBOX, CIDX + i, CIDY + j, CIDZ + k);
                    if(MCID < 0 || MCID >= c_CellNum)
                    {
                        continue;
                    }
                    unsigned int TriangleNum = c_daCell[MCID].m_TriangleNum;
                    for(unsigned int l = 0; l < TriangleNum; ++l)
                    {
                        TriangleID = c_daCell[MCID].m_TriangleID[l];
                        if(TriangleID >= 0 && TriangleID < c_TriangleNum && TriangleID != NearestID) // No need to calculate again for the same triangle
                        {
                            CDistance Distance;
                            Distance.Magnitude = CalcDistance(&c_daTriangles[TriangleID], &TargetPosition, &Distance.Direction);
                            if(Distance.Magnitude < NearestDistance.Magnitude)
                            {
                                NearestDistance = Distance;
                                NearestID = TriangleID;
                            }
                        }
                    }
                }
            }
        }
    }
    c_daSTLDistance[ID] = NearestDistance;
    c_daSTLID[ID] = NearestID;
}
Here c_daParticlePosition is a float3 array in constant memory. I want to use shared memory, so I tried to create a float3 shared memory array and copy the constant data into it; however, the kernel now fails with an unknown error, and cuda-memcheck reports the message below.
Here the thread count is 255 with 2 blocks.
Shared memory code:
__global__ static void CalcSTLDistance_Kernel(Integer ComputeParticleNumber)
{
    //const Integer TID = CudaGetTargetID();
    const Integer ID = CudaGetTargetID();
    extern __shared__ float3 s[];
    /*if(ID >= ComputeParticleNumber)
    {
        return;
    }*/
    s[ID] = c_daParticlePosition[ID];
    __syncthreads();
    CDistance NearestDistance;
    Integer NearestID = -1;
    NearestDistance.Magnitude = 1e8;
    NearestDistance.Direction.x = 0;
    NearestDistance.Direction.y = 0;
    NearestDistance.Direction.z = 0; //make_Scalar3(0,0,0);
    //if(c_daOutputParticleID[ID] < -1)
    //{
    //    c_daSTLDistance[ID] = NearestDistance;
    //    c_daSTLID[ID] = NearestID;
    //    return;
    //}
    //Scalar3 TargetPosition = c_daParticlePosition[ID];
    Integer TriangleID;
    Integer CIDX, CIDY, CIDZ;
    Integer CID = GetCellID(&CONSTANT_BOUNDINGBOX, &s[ID], CIDX, CIDY, CIDZ);
    if(CID >= 0 && CID < c_CellNum)
    {
        //Integer Range = 1;
        for(Integer k = -1; k <= 1; ++k)
        {
            for(Integer j = -1; j <= 1; ++j)
            {
                for(Integer i = -1; i <= 1; ++i)
                {
                    Integer MCID = GetCellID(&CONSTANT_BOUNDINGBOX, CIDX + i, CIDY + j, CIDZ + k);
                    if(MCID < 0 || MCID >= c_CellNum)
                    {
                        continue;
                    }
                    unsigned int TriangleNum = c_daCell[MCID].m_TriangleNum;
                    for(unsigned int l = 0; l < TriangleNum; ++l)
                    {
                        TriangleID = c_daCell[MCID].m_TriangleID[l];
                        /*if(c_daTrianglesParameters[c_daTriangles[TriangleID].ModelIDNumber].isDrag)
                        {
                            continue;
                        }*/
                        if(TriangleID >= 0 && TriangleID < c_TriangleNum && TriangleID != NearestID) // No need to calculate again for the same triangle
                        {
                            CDistance Distance;
                            Distance.Magnitude = CalcDistance(&c_daTriangles[TriangleID], &s[ID], &Distance.Direction);
                            if(Distance.Magnitude < NearestDistance.Magnitude)
                            {
                                NearestDistance = Distance;
                                NearestID = TriangleID;
                            }
                        }
                    }
                }
            }
        }
    }
    c_daSTLDistance[ID] = NearestDistance;
    c_daSTLID[ID] = NearestID;
}
Error:
Invalid __shared__ write of size 4
========= at 0x00000128 in CalcSTLDistance_Kernel(int)
========= by thread (159,0,0) in block (0,0,0)
========= Address 0x0000077c is out of bounds

You may find useful info on how to work with shared memory in this article; focus especially on the static shared memory and dynamic shared memory sections.
Based on that article you should see that you are simply writing out of bounds of your array s, exactly as the error message says. To fix the issue you can:
either specify the size of the shared memory array s at compile time, if you know it in advance, such as __shared__ float3 s[123456];
or keep the dynamically sized s array, which is basically what you are doing at the moment, but ALSO pass the third kernel launch parameter: CalcSTLDistance_Kernel<<<gridSize, blockSize, sharedMemorySizeInBytes>>>. If you are using an array of 123456 float3s, then use int sharedMemorySizeInBytes = 123456 * sizeof(float3).
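For illustration, a minimal launch sketch of the second option, assuming a 1D launch with one float3 slot per thread of a block (gridSize and blockSize are placeholders; inside the kernel the shared array should then be indexed per block, e.g. with threadIdx.x, rather than with the global particle ID):
// Host-side launch sketch: the third <<< >>> parameter is the dynamic shared
// memory size per block, so size it for one float3 per thread in the block.
int blockSize = 256;                                          // threads per block (placeholder)
int gridSize  = (ComputeParticleNumber + blockSize - 1) / blockSize;
size_t sharedMemorySizeInBytes = blockSize * sizeof(float3);  // one slot per thread in the block
CalcSTLDistance_Kernel<<<gridSize, blockSize, sharedMemorySizeInBytes>>>(ComputeParticleNumber);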

Related

Cuda Implementation of Partitioned Subgroup

Is there a more efficient way to implement the "Partitioned Subgroup" functions of Vulkan/OpenGL, which do not have to loop over all elements in the subgroup? My current implementation just uses a loop from 0 to WARP_SIZE.
References:
(slide 37+38) https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9909-nvidia-vulkan-features-update.pdf
https://github.com/KhronosGroup/GLSL/blob/master/extensions/nv/GL_NV_shader_subgroup_partitioned.txt
Simple Implementation:
__device__ uint32_t subgroupPartitionNV(ivec2 p)
{
    uint32_t result = 0;
    for (int i = 0; i < 32; ++i)
    {
        // Broadcast lane i's key to the whole warp, then ballot which lanes share it.
        int x = __shfl_sync(0xFFFFFFFF, p(0), i);
        int y = __shfl_sync(0xFFFFFFFF, p(1), i);
        uint32_t b = __ballot_sync(0xFFFFFFFF, p(0) == x && p(1) == y);
        if (i == (threadIdx.x & 31)) result = b;   // keep the ballot that belongs to this lane
    }
    return result;
}
__device__ float subgroupPartitionedAddNV(float value, uint32_t ballot)
{
    float result = 0;
    for (unsigned int i = 0; i < 32; ++i)
    {
        float other_value = __shfl_sync(0xFFFFFFFF, value, i);
        if ((1U << i) & ballot) result += other_value;   // accumulate only lanes in this partition
    }
    return result;
}
Thanks to Abator's hint I came up with a more efficient solution. It's a little ugly because labeled_partition is only implemented for int, but it works quite well.
template <int GROUP_SIZE = 32>
__device__ cooperative_groups::coalesced_group subgroupPartitionNV(ivec2 p)
{
    using namespace cooperative_groups;
    thread_block block = this_thread_block();
    thread_block_tile<GROUP_SIZE> tile32 = tiled_partition<GROUP_SIZE>(block);
    coalesced_group g1 = labeled_partition(tile32, p(0));
    coalesced_group g2 = labeled_partition(tile32, p(1));
    details::_coalesced_group_data_access acc;
    return acc.construct_from_mask<coalesced_group>(acc.get_mask(g1) & acc.get_mask(g2));
}

template <typename T, int GROUP_SIZE = 32>
__device__ T subgroupPartitionedAddNV(T value, cooperative_groups::coalesced_group group)
{
    int s = group.size();
    int r = group.thread_rank();
    for (int offset = GROUP_SIZE / 2; offset > 0; offset /= 2)
    {
        auto v = group.template shfl_down(value, offset);
        if (r + offset < s) value += v;
    }
    return value;
}
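A minimal usage sketch, assuming the definitions above and <cooperative_groups.h> are in scope (the kernel and the keys, vals, out names are illustrative, not from the original post):
// Illustrative kernel: partition the warp by a 2D key, then sum the values
// belonging to the same partition.
__global__ void partitionedSumKernel(const ivec2* keys, const float* vals, float* out)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    cooperative_groups::coalesced_group part = subgroupPartitionNV<32>(keys[tid]);
    out[tid] = subgroupPartitionedAddNV(vals[tid], part);
}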

Why is my output printing in wrong order in selection sort?

Here's my code:
#include <stdio.h>

int main(void)
{
    int a[6] = {6,1,3,4,5,2};
    int size = 6;
    for(int i = 0; i < size - 1; i++)
    {
        int smallest = i;
        for(int j = i + 1; j < size; j++)
        {
            if(a[j] < a[smallest])
            {
                smallest = j;
            }
            if(smallest != i)
            {
                int z = a[smallest];
                a[smallest] = a[j];
                a[j] = z;
            }
            else
            {
                a[i] = a[smallest];
            }
        }
    }
    for(int i = 0; i < size; i++)
    {
        printf("%d, ", a[i]);
    }
    printf("\n");
    return 0;
}
So I have 3 problems.
1) The output prints in descending order. I want to print it as 1, 2, 3, 4, 5, 6 but the actual output is 6, 5, 4, 3, 2, 1. Why?
2) When I changed the printf statement to printf("%d, ", a[size - i]); it gave the output 32767, 1, 2, 3, 4. Why?
3) When I changed the condition of the last for statement above the printf statement to for(int i = 0; i < size; i++) it gave the output 0, 1, 2, 3, 4, 5. Why?
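For reference, a textbook selection sort swaps only once per pass of the outer loop, after the inner scan has located the smallest remaining element, rather than swapping inside the inner loop as above; a minimal sketch:
#include <stdio.h>

int main(void)
{
    int a[6] = {6,1,3,4,5,2};
    int size = 6;
    for(int i = 0; i < size - 1; i++)
    {
        int smallest = i;
        for(int j = i + 1; j < size; j++)   /* scan for the smallest remaining element */
        {
            if(a[j] < a[smallest])
                smallest = j;
        }
        if(smallest != i)                   /* swap once, after the scan is finished */
        {
            int z = a[smallest];
            a[smallest] = a[i];
            a[i] = z;
        }
    }
    for(int i = 0; i < size; i++)
        printf("%d, ", a[i]);
    printf("\n");
    return 0;
}
With the array above this prints 1, 2, 3, 4, 5, 6.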

Recursion of a function fails

I am working on a sudoku solver using backtracking. For reasons unknown to me, my code cannot seem to recurse. I mean that the function, even when the program reaches the line where I wrote the recursive call, won't call itself; the program just continues as if nothing were there.
#include <bits/stdc++.h>
using namespace std;

ifstream in("data.in");
ofstream out("data.out");

int sudoku[10][10];
int f[10];
vector< pair<int, int> > v;

bool continuare(int pas){
    int x = v[pas].first;
    int y = v[pas].second;
    for(int i = x; i <= 9; i++)
        f[ sudoku[i][y] ]++;
    for(int i = x - 1; i >= 1; i--)
        f[ sudoku[i][y] ]++;
    for(int j = x + 1; j <= 9; j++)
        f[ sudoku[x][j] ]++;
    for(int j = x - 1; j >= 1; j--)
        f[ sudoku[x][j] ]++;
    for( int i = x - 3 + x%3, c1 = 0; c1 < 3; c1++, i++ )
        for( int j = y - 3 + y%3, c2 = 0; c2 < 3; c2++, j++ )
            f[ sudoku[i][j] ]++;
    for(int i = 1; i <= 9; i++){
        if( f[i] > 3 )
            return false;
        f[i] = 0;
    }
    return true;
}

void afisare(){
    for(int i = 1; i <= 9; i++){
        for(int j = 1; j <= 9; j++)
            out<<sudoku[i][j]<<" ";
        out<<"\n";
    }
}

void backtracking( int pas ){
    if( pas > v.size() )
        afisare();
    else
        for(int i = 1; i <= 9; i++){
            sudoku[ v[pas].first ][ v[pas].second ] = i;
            if( continuare(pas) )
                backtracking( pas + 1 );
        }
}

int main()
{
    for(int i = 1; i <= 9; i++)
        for(int j = 1; j <= 9; j++){
            in>>sudoku[i][j];
            if(sudoku[i][j] == 0)
                v.push_back( make_pair(i, j) );
        }
    backtracking(1);
    return 0;
}
As you may have noticed, the problem is when backtracking() calls itself: as I said, nothing happens there.
Copied from a comment, which seems to have solved your question:
Compile with the -g flag and run your executable under gdb. I just did that and saw that it segfaults at f[ sudoku[i][j] ]++; in the continuare function.

Reverse engineering history pattern length in branch predictor

I'm trying to find the length of the history pattern in the branch predictor of my computer's processor. I generate variable-length arrays of bits and branch on the value of each bit. I then plot the run time of the different executions of the function and search for the knee in the graph, but I don't see any such point. What am I doing wrong? Any ideas?
Here is my code:
vector<int> randomArr(int n)
{
    vector<int> arr(n);
    for (int i = 0; i < n; i++){
        arr[i] = rand() % 2;
    }
    return arr;
}

int branchy(vector<int> & arr){
    int a = 0;
    int b = 0;
    for (int i = 0; i < arr.size(); i++) {
        if (arr[i] == 0)
            a++;
        else
            b++;
    }
    return a^b;
}

int main() {
    long int iterations = 100000;
    int start_s;
    int stop_s;
    ofstream runtimesFile;
    runtimesFile.open("runtimesFile.txt");
    for (int j = 0; j < iterations; j++){
        vector<int> arr = randomArr(j);
        start_s = clock();
        branchy(arr);
        stop_s = clock();
        runtimesFile << to_string(stop_s - start_s) << "\n";
    }
    runtimesFile.close();
    return 0;
}

Creating identity matrix with CUDA

Hi, I am trying to create an identity matrix with CUDA, but the output is just zeros:
__global__ void initIdentityGPU(int *devMatrix, int numR, int numC) {
    int x = blockIdx.x;
    int y = blockIdx.y;
    int offset = x * y;
    for (int i = 0; i < x; i++) {
        for (int j = 0; j < numR; j++) {
            if (i == j)
                devMatrix[offset] = 1;
            else
                devMatrix[offset] = 0;
        }
    }
}
Why does it only put 0s?
The simplest way to do it is:
__global__ void initIdentityGPU(int **devMatrix, int numR, int numC) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    if (y < numR && x < numC) {
        if (x == y)
            devMatrix[y][x] = 1;
        else
            devMatrix[y][x] = 0;
    }
}
and you launch it as:
dim3 blockDim(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 gridDim((numC + BLOCK_DIM_X - 1) / BLOCK_DIM_X, (numR + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y);
initIdentityGPU<<<gridDim, blockDim>>>(matrix, numR, numC);
It simply runs as many threads as there are matrix cells; each thread obtains the coordinates of its cell and assigns 1 if the cell lies on the diagonal of the matrix, or 0 otherwise. Note the code is untested.
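If the matrix is instead stored as a single flat int * buffer, as in the question's kernel signature, the same thread-per-cell idea can be sketched with row-major indexing (this variant is equally untested and assumes row-major storage):
__global__ void initIdentityGPU(int *devMatrix, int numR, int numC) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;   // column index
    int y = blockDim.y * blockIdx.y + threadIdx.y;   // row index
    if (y < numR && x < numC) {
        devMatrix[y * numC + x] = (x == y) ? 1 : 0;  // 1 on the diagonal, 0 elsewhere
    }
}
It is launched with the same gridDim and blockDim as above.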