Print Out Cryptographic Caesar Cipher Output Array in Cuda C Program - cuda

Can anyone help lead me in the right direction for getting the right output?
The global method what_is_my_id00() is not giving the correct output when the data gets copied back from device to host in outStr_block[].
I am also unsure if I am doing the comparison correctly in cuda with the thread and block id. Please give me any tips if you have any.
#include <stdio.h>
#define ARRAY_SIZE 832
#define ARRAY_SIZE_IN_BYTES (sizeof(unsigned int) * (ARRAY_SIZE))
unsigned char inpStr_block[] = "abcdefghijklmnopqrstuvwxyz";
unsigned char alpha_thread[] = "iamnotableinthestateofcalifornia";
unsigned char outStr_block[32];
unsigned int increment = 0;
__global__
void what_is_my_id00(unsigned char * input, unsigned char * alpha, unsigned char * out, int incr)
{
if(input[threadIdx.x] == alpha[blockIdx.x]){
out[incr] = alpha[(threadIdx.x + 3) % 25];
printf("Alphabet ID: %2u\n", out[incr]);
incr++;
}
}
void main_sub0()
{
int sizeChar = ARRAY_SIZE * sizeof(char);
unsigned char *dev_alp_char;
unsigned char *dev_inp_char;
unsigned char *dev_out_char;
cudaMalloc((void**)&dev_alp_char, sizeChar);
cudaMalloc((void**)&dev_inp_char, sizeChar);
cudaMalloc((void**)&dev_out_char, sizeChar);
cudaMemcpy( inpStr_block, dev_inp_char, ARRAY_SIZE_IN_BYTES, cudaMemcpyHostToDevice );
cudaMemcpy( alpha_thread, dev_alp_char, ARRAY_SIZE_IN_BYTES, cudaMemcpyHostToDevice );
cudaMemcpy( outStr_block, dev_out_char, ARRAY_SIZE_IN_BYTES, cudaMemcpyHostToDevice );
const unsigned int num_blocks = ARRAY_SIZE/26;
const unsigned int num_threads = ARRAY_SIZE/num_blocks;
/* Execute our kernel */
what_is_my_id00<<<num_blocks, num_threads>>>(dev_inp_char, dev_alp_char, dev_out_char, increment);
// total blocks, threads per block
/* Free the arrays on the GPU as now we're done with them */
cudaMemcpy(outStr_block, dev_out_char, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost );
cudaFree(dev_alp_char);
cudaFree(dev_inp_char);
cudaFree(dev_out_char);
/* Iterate through the arrays and print */
for( int i = 0; i < 32; i++)
{
printf("Alphabet ID: %2u\n", outStr_block[i]);
}
}
int main()
{
main_sub0();
return EXIT_SUCCESS;
}
OUTPUT
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Alphabet ID: 32
Update Code
#include <stdio.h>
#define ARRAY_SIZE 832
#define ARRAY_SIZE_IN_BYTES (sizeof(unsigned char) * (ARRAY_SIZE))
char host_alpha_thread[] = "abcdefghijklmnopqrstuvwxyz";
char host_inp_block[] = "iamnotableinthestateofcalifornia";
char out_block[32];
__global__
void what_is_my_id00(char * input, char * alpha, char * out)
{
if(input[threadIdx.x] == alpha[blockIdx.x]){
out[blockIdx.x] = 'c';
}
}
void main_sub0()
{
int sizeChar = ARRAY_SIZE * sizeof(char);
char *dev_alp_char;
char *dev_inp_char;
char *dev_out_char;
cudaMalloc((void**)&dev_alp_char, sizeChar);
cudaMalloc((void**)&dev_inp_char, sizeChar);
cudaMalloc((void**)&dev_out_char, sizeChar);
cudaMemcpy( dev_alp_char, host_alpha_thread, sizeof(char) * 26, cudaMemcpyHostToDevice );
cudaMemcpy( dev_inp_char, host_inp_block, sizeof(char) * 32, cudaMemcpyHostToDevice );
cudaMemcpy( dev_out_char, out_block, sizeof(int) * 32, cudaMemcpyHostToDevice );
int total_blocks = 32;
int tpb = 26;//threads per block
/* Execute our kernel */
what_is_my_id00<<<total_blocks, tpb>>>(dev_inp_char, dev_alp_char, dev_out_char);
// total blocks, threads per block
/* Free the arrays on the GPU as now we're done with them */
cudaMemcpy(out_block, dev_out_char, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost );
cudaFree(dev_alp_char);
cudaFree(dev_inp_char);
cudaFree(dev_out_char);
/* Iterate through the arrays and print */
for( int i = 0; i < 32; i++)
{
printf("Alphabet ID: %2c\n", out_block[i]);
}
}
int main()
{
main_sub0();
return EXIT_SUCCESS;
}
Updated Output
Alphabet ID: c
Alphabet ID: c
Alphabet ID: c
Alphabet ID:
Alphabet ID: c
Alphabet ID: c
Alphabet ID:
Alphabet ID: c
Alphabet ID: c
Alphabet ID:
Alphabet ID:
Alphabet ID: c
Alphabet ID: c
Alphabet ID: c
Alphabet ID: c
Alphabet ID:
Alphabet ID:
Alphabet ID:
Alphabet ID: c
Alphabet ID: c
Alphabet ID:
Alphabet ID:
Alphabet ID:
Alphabet ID:
Alphabet ID:
Alphabet ID:
Alphabet ID:
Alphabet ID:
Alphabet ID:
Alphabet ID:
Alphabet ID:
Alphabet ID:

Related

Generate Cartesian Product using more than two lists on GPU

I would like to know how to generate a Cartesian product of more than two lists using CUDA.
How do I make this code work with three or more lists?
It works with two lists but not with three, I tried /, % without success.
Its basic.
#include <thrust/device_vector.h>
#include <thrust/pair.h>
#include <thrust/copy.h>
#include <iterator>
__global__ void cartesian_product(const int *a, size_t a_size,
const int *b, size_t b_size,
const int *c, size_t c_size)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < a_size * b_size * c_size)
{
unsigned int a_idx = idx / a_size;
unsigned int b_idx = idx % a_size;
// ?
unsigned int c_idx = idx % a_size;
printf("a[a_idx] and b[b_idx] and c[c_idx] are: %d %d %d\n\n",a[a_idx], b[b_idx], c[c_idx]);
//1 3 5 , 1 3 6 , 1 4 5 , 1 4 6 , 2 3 5 , 2 3 6 , 2 4 5 , 2 4 6
//0 0 0 , 0 0 1 , 0 1 0 , 0 1 1 , 1 0 0 , 1 0 1 , 1 1 0 , 1 1 1
}
}
int main()
{
// host_vector is stored in host memory while device_vector livesin GPU device memory.
// a has storage for 2 integers
thrust::device_vector<int> a(2);
// initialize individual elements
a[0] = 1;
a[1] = 2;
// b has storage for 2 integers
thrust::device_vector<int> b(2);
// initialize individual elements
b[0] = 3;
b[1] = 4;
// d has storage for 2 integers
thrust::device_vector<int> c(2);
// initialize individual elements
c[0] = 5;
c[1] = 6;
unsigned int block_size = 256;
unsigned int num_blocks = (8 + (block_size - 1)) / block_size;
// raw_pointer_cast creates a "raw" pointer from a pointer-like type, simply returning the wrapped pointer, should it exist.
cartesian_product<<<num_blocks, block_size>>>(thrust::raw_pointer_cast(a.data()), a.size(),
thrust::raw_pointer_cast(b.data()), b.size(),
thrust::raw_pointer_cast(c.data()), c.size());
return 0;
}
How do I get the right c_idx in the kernel and subsequent arrays if I want more than three lists?
It seems to me that you want "lexic indexing":
idx == (a_idx * b_size + b_idx) * c_size + c_idx
So you get your indices like this:
c_idx = idx % c_size;
b_idx = (idx / c_size) % b_size;
a_idx = (idx / c_size) / b_size;
This is easily generalizable to more dimensions. E.g. in four dimensions you have
idx == ((a_idx * b_size + b_idx) * c_size + c_idx) * d_size + d_idx
Then:
d_idx = idx % d_size;
c_idx = (idx / d_size) % c_size;
b_idx = ((idx / d_size) / c_size) % b_size;
a_idx = ((idx / d_size) / c_size) / b_size;
In C/C++ programming one likes to use this to calculate indices into an one-dimensional dynamic array representing a multidimensional dataset. In CUDA you normally don't need it as much, as CUDA gives you up to three-dimensional threadIdx/blockIdx/etc.. So for the Cartesian product of three arrays, you won't need this technique, but could just use the intrinsic CUDA features. Even in more than three the most performant solution will get two indices from two of the three dimensions of the kernel and use lexic indexing on the third one:
__global__ void cartesian_product_5d(const int *a, size_t a_size,
const int *b, size_t b_size,
const int *c, size_t c_size,
const int *d, size_t d_size,
const int *e, size_t e_size)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int d_idx = blockIdx.y * blockDim.y + threadIdx.y;
int e_idx = blockIdx.z * blockDim.z + threadIdx.z;
/* idx == (c_idx * b_size + b_idx) * a_size + a_idx */
int a_idx = idx % a_size;
int b_idx = (idx / a_size) % b_size;
int c_idx = (idx / a_size) / b_size;
/* ... */
}
int main()
{
/* ... */
dim3 threadsPerBlock(8, 8, 8);
dim3 numBlocks((a_size + b_size + c_size + threadsPerBlock.x - 1) /
threadsPerBlock.x,
(d_size + threadsPerBlock.y - 1) / threadsPerBlock.y,
(e_size + threadsPerBlock.z - 1) / threadsPerBlock.z);
cartesian_product_5d<<<numBlocks, threadsPerBlock>>>(/* ... */);
/* ... */
}

Thrust adapting thrust::remove_if so predicate is checking for existence in range [duplicate]

I'm using CUDA and THRUST to perform paired set operations. I would like to retain duplicates, however. For example:
int keys[6] = {1, 1, 1, 3, 4, 5, 5};
int vals[6] = {1, 2, 3, 4, 5, 6, 7};
int comp[2] = {1, 5};
thrust::set_intersection_by_key(keys, keys + 6, comp, comp + 2, vals, rk, rv);
Desired result
rk[1, 1, 1, 5, 5]
rv[1, 2, 3, 6, 7]
Actual Result
rk[1, 5]
rv[5, 7]
I want all of the vals where the corresponding key is contained in comp.
Is there any way to achieve this using thrust, or do I have to write my own kernel or thrust function?
I'm using this function: set_intersection_by_key.
Quoting from the thrust documentation:
The generalization is that if an element appears m times in [keys_first1, keys_last1) and n times in [keys_first2, keys_last2) (where m may be zero), then it appears min(m,n) times in the keys output range
Since comp does only contain each key once, n=1 and therefore min(m,1) = 1.
In order to get "all of the vals where the corresponding key is contained in comp", you can use the approach of my answer to a similar problem.
Similarly, the example code does the following steps:
Get the largest element of d_comp. This assumes that d_comp is already sorted.
Create vector d_map of size largest_element+1. Copy 1 to all positions of the entries of d_comp in d_map.
Copy all entries from d_vals for which there is a 1 entry in d_map into d_result.
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>
#include <thrust/copy.h>
#include <thrust/scatter.h>
#include <iostream>
#define PRINTER(name) print(#name, (name))
void print(const char* name, const thrust::device_vector<int>& v)
{
std::cout << name << ":\t";
thrust::copy(v.begin(), v.end(), std::ostream_iterator<int>(std::cout, "\t"));
std::cout << std::endl;
}
int main()
{
int keys[] = {1, 1, 1, 3, 4, 5, 5};
int vals[] = {1, 2, 3, 4, 5, 6, 7};
int comp[] = {1, 5};
const int size_data = sizeof(keys)/sizeof(keys[0]);
const int size_comp = sizeof(comp)/sizeof(comp[0]);
// copy data to GPU
thrust::device_vector<int> d_keys (keys, keys+size_data);
thrust::device_vector<int> d_vals (vals, vals+size_data);
thrust::device_vector<int> d_comp (comp, comp+size_comp);
PRINTER(d_keys);
PRINTER(d_vals);
PRINTER(d_comp);
int largest_element = d_comp.back();
thrust::device_vector<int> d_map(largest_element+1);
thrust::constant_iterator<int> one(1);
thrust::scatter(one, one+size_comp, d_comp.begin(), d_map.begin());
PRINTER(d_map);
thrust::device_vector<int> d_result(size_data);
using namespace thrust::placeholders;
int final_size = thrust::copy_if(d_vals.begin(),
d_vals.end(),
thrust::make_permutation_iterator(d_map.begin(), d_keys.begin()),
d_result.begin(),
_1
) - d_result.begin();
d_result.resize(final_size);
PRINTER(d_result);
return 0;
}
output:
d_keys: 1 1 1 3 4 5 5
d_vals: 1 2 3 4 5 6 7
d_comp: 1 5
d_map: 0 1 0 0 0 1
d_result: 1 2 3 6 7

Retain Duplicates with Set Intersection in CUDA

I'm using CUDA and THRUST to perform paired set operations. I would like to retain duplicates, however. For example:
int keys[6] = {1, 1, 1, 3, 4, 5, 5};
int vals[6] = {1, 2, 3, 4, 5, 6, 7};
int comp[2] = {1, 5};
thrust::set_intersection_by_key(keys, keys + 6, comp, comp + 2, vals, rk, rv);
Desired result
rk[1, 1, 1, 5, 5]
rv[1, 2, 3, 6, 7]
Actual Result
rk[1, 5]
rv[5, 7]
I want all of the vals where the corresponding key is contained in comp.
Is there any way to achieve this using thrust, or do I have to write my own kernel or thrust function?
I'm using this function: set_intersection_by_key.
Quoting from the thrust documentation:
The generalization is that if an element appears m times in [keys_first1, keys_last1) and n times in [keys_first2, keys_last2) (where m may be zero), then it appears min(m,n) times in the keys output range
Since comp does only contain each key once, n=1 and therefore min(m,1) = 1.
In order to get "all of the vals where the corresponding key is contained in comp", you can use the approach of my answer to a similar problem.
Similarly, the example code does the following steps:
Get the largest element of d_comp. This assumes that d_comp is already sorted.
Create vector d_map of size largest_element+1. Copy 1 to all positions of the entries of d_comp in d_map.
Copy all entries from d_vals for which there is a 1 entry in d_map into d_result.
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>
#include <thrust/copy.h>
#include <thrust/scatter.h>
#include <iostream>
#define PRINTER(name) print(#name, (name))
void print(const char* name, const thrust::device_vector<int>& v)
{
std::cout << name << ":\t";
thrust::copy(v.begin(), v.end(), std::ostream_iterator<int>(std::cout, "\t"));
std::cout << std::endl;
}
int main()
{
int keys[] = {1, 1, 1, 3, 4, 5, 5};
int vals[] = {1, 2, 3, 4, 5, 6, 7};
int comp[] = {1, 5};
const int size_data = sizeof(keys)/sizeof(keys[0]);
const int size_comp = sizeof(comp)/sizeof(comp[0]);
// copy data to GPU
thrust::device_vector<int> d_keys (keys, keys+size_data);
thrust::device_vector<int> d_vals (vals, vals+size_data);
thrust::device_vector<int> d_comp (comp, comp+size_comp);
PRINTER(d_keys);
PRINTER(d_vals);
PRINTER(d_comp);
int largest_element = d_comp.back();
thrust::device_vector<int> d_map(largest_element+1);
thrust::constant_iterator<int> one(1);
thrust::scatter(one, one+size_comp, d_comp.begin(), d_map.begin());
PRINTER(d_map);
thrust::device_vector<int> d_result(size_data);
using namespace thrust::placeholders;
int final_size = thrust::copy_if(d_vals.begin(),
d_vals.end(),
thrust::make_permutation_iterator(d_map.begin(), d_keys.begin()),
d_result.begin(),
_1
) - d_result.begin();
d_result.resize(final_size);
PRINTER(d_result);
return 0;
}
output:
d_keys: 1 1 1 3 4 5 5
d_vals: 1 2 3 4 5 6 7
d_comp: 1 5
d_map: 0 1 0 0 0 1
d_result: 1 2 3 6 7

using constant memory prints address instead of value in cuda

I am trying to use the constant memory in the code with constant memory assigned value from kernel not using cudacopytosymbol.
#include <iostream>
using namespace std;
#define N 10
//__constant__ int constBuf_d[N];
__constant__ int *constBuf;
__global__ void foo( int *results )
{
int tdx = threadIdx.x;
int idx = blockIdx.x * blockDim.x + tdx;
if( idx < N )
{
constBuf[idx]=1;
results[idx] = constBuf[idx];
}
}
// main routine that executes on the host
int main(int argc, char* argv[])
{
int *results_h = new int[N];
int *results_d;
cudaMalloc((void **)&results_d, N*sizeof(int));
foo <<< 1, 10 >>> ( results_d );
cudaMemcpy(results_h, results_d, N*sizeof(int), cudaMemcpyDeviceToHost);
for( int i=0; i < N; ++i )
printf("%i ", results_h[i] );
delete(results_h);
}
output shows
6231808 6226116 0 0 0 0 0 0 0 0
I want the program to print the value assigned to constant memory through the kenel in the code.
Constant memory is, as the name implies, constant/read-only with respect to device code. What you are trying to do is illegal and can't be made to work.
To set values in constant memory, you currently have two choices:
set the value from host code via the cudaMemcpyToSymbol API call (or its equivalents)
use static initialisation at compile time
In the latter case something like this would work:
__constant__ int constBuf[N] = { 16, 2, 77, 40, 12, 3, 5, 3, 6, 6 };
__global__ void foo( int *results )
{
int tdx = threadIdx.x;
int idx = blockIdx.x * blockDim.x + tdx;
if( tdx < N )
{
results[idx] = constBuf[tdx]; // Note changes here!
}
}

solve Ax=b using CUDA, works for matrix size less than 128x128

I wrote a code that uses GPU to do the forward solve (get U matrix from A = LU where here the diagonal entries of U are not set to unity). My code works fine for matrices of dimension less than 128x128 which are a factor of the block size which is 16. I am puzzled why larger matrices give me the wrong result. Something goes wrong in the last row of blocks.
------------ gaussian.h --------------------
// Thread block size
#define BLOCK_SIZE 16 //for forward solve A = LU
#define BLOCK_SIZE2 32 //for finding the pivot factor
// A[i,j] = A[i,j] - A[k,j] / A[k,k] * A[i,k]
// pivot factor = A[k,j] / A[k,k]
////////////////////////////////////////////////////////////////////////////////////////
------------ gaussian.cu --------------------
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <cuda.h>
#include "gaussian_kernel.cu"
#define OUTPUT
void runTest(int argc, char** argv);
double gettime() {
struct timeval t;
gettimeofday(&t,NULL);
return t.tv_sec+t.tv_usec*1e-6;
}
int main(int argc, char** argv)
{
runTest(argc, argv);
}
void runTest(int argc, char** argv)
{
int dim;
if (argc == 2)
{
dim = atoi(argv[1]);
}
else{
printf("Wrong Usage\n");
exit(1);
}
// Note: A is a square matrix of size NxN where N % BLOCK_SIZE = 0
// Every row of A has a pivot column
// It is known that all the arithmetic operations involved for
// Gaussian Elimination are of type int (not float)
// allocate host memory for matrix A augmented with vector b (Ax=b)
unsigned int size_A = dim * (dim + BLOCK_SIZE);
unsigned int mem_size_A = sizeof(int) * size_A;
int* h_A = (int*) malloc(mem_size_A);
// initialize host memory, generate a test case such as below
// augmented matrix A with padding to make the size evenly divisible by BLOCK_SIZE
// ----A----- | b --padding--
// 1 1 1 1 .. | 0 * * * ..
// 1 2 2 2 .. | 1 * * * ..
// 1 2 3 3 .. | 2 * * * ..
// 1 2 3 4 .. | 3 * * * ..
// .......... | ...
// * means uninitialized entry
// A is size NxN
// Augmented matrix with padding is size Nx(N+BLOCK_SIZE)
int dimRow = dim;
int dimCol = dim + BLOCK_SIZE;
for(int i = 0; i < dim; i++)
{
h_A[(i + 1) * dimCol - BLOCK_SIZE] = i; // b vector stored in (N+1)th column
// of augmented matrix
for (int j = 0; j < dimRow - i; j++)
{
h_A[j + i + i * dimCol] = i + 1; // A[i][j] entry
h_A[j * dimCol + i + i * dimCol] = i + 1; // A[j][i] entry
}
}
//display the test case [ A | b ]
for ( int m = 0 ; m < dimRow; m++)
{
for ( int n = 0 ; n <= dimRow; n++)
//for ( int n = 0 ; n < dimCol; n++)
{
printf("%d ", h_A[m * dimCol + n]);
}
printf("\n");
}
// allocate device memory for the augmented matrix A
int* d_A;
cudaMalloc(&d_A, mem_size_A);
// start timer
double timer1 = gettime();
// copy host memory to device
cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
// setup execution parameters for obtaining the pivot factor
int gridP = (dimRow / BLOCK_SIZE2) + ( (dimRow % BLOCK_SIZE2) == 0 ? 0 : 1 );
// add 1 if dimRow/BLOCK_SIZE2 is not evenly divisible
// setup execution parameters for the forward solve (Gaussian Elimination)
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(dimCol / threads.x, dimRow / threads.y);
// execute the kernel
for ( int i = 0 ; i < dim; i++)
{
Gaussian_Pivot_CUDA<<< gridP, BLOCK_SIZE2 >>>(d_A, dimCol, i);
Gaussian_Elim_CUDA<<< grid, threads >>>(d_A, dimCol, i);
}
// copy result from device to host
cudaMemcpy(h_A, d_A, mem_size_A, cudaMemcpyDeviceToHost);
// stop timer
double timer2 = gettime();
printf("GPU time = %lf\n",(timer2-timer1)*1000);
// Back substitution
int X[dimRow];
for (int row = dimRow - 1; row >= 0; row--)
{
X[row] = h_A[(row + 1) * dimCol - BLOCK_SIZE]; // b[row] entry
for (int col = dimRow - 1; col > row; col--)
{
X[row] -= h_A[row * dimCol + col] * X[col];
}
X[row] /= h_A[row * dimCol + row];
printf("X[%d] = %d\t",row,X[row]);
}
printf("\n");
#ifdef OUTPUT
// result of Gaussian Elimination
// ----A----- | b --padding--
// 1 1 1 1 .. | 0 * * * ..
// 0 1 1 1 .. | 1 * * * ..
// 0 0 1 1 .. | 1 * * * ..
// 0 0 0 1 .. | 1 * * * ..
// .......... | ...
// * means garbage entry
for ( int m = 0 ; m < dimRow; m++)
{
for ( int n = 0 ; n <= dimRow; n++)
{
printf("%d ", h_A[m * dimCol + n]);
}
printf("\n");
}
#endif
free(h_A);
cudaFree(d_A);
}
///////////////////////////////////////////////////////////////////////////////
------------ gaussian_kernel.cu --------------------
#include "gaussian.h"
__global__
void Gaussian_Pivot_CUDA(int* A, int widthA, int Pcol)
{
// find the pivot factor
// A[i,j] = A[i,j] - A[k,j] / A[k,k] * A[i,k]
// pivot factor = A[k,j] / A[k,k]
int bx = blockIdx.x;
int tx = threadIdx.x;
int index_row = BLOCK_SIZE2 * bx + tx;
int index_rowA = index_row * widthA;
__shared__ int A_RowRow;
if (tx == 0)
{
A_RowRow = A[Pcol * widthA + Pcol]; // get A[k,k] where k is the pivot column
// for this problem pivot column = pivot row
}
__syncthreads();
A[index_rowA - 1] = A[index_rowA + Pcol] / A_RowRow;
// INVALID WRITE HERE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// store pivot factor at A[dimCol - 1][row]
}
__global__
void Gaussian_Elim_CUDA(int* A, int widthA, int Pcol)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
int index_col = BLOCK_SIZE * bx + tx;
int index_row = BLOCK_SIZE * by + ty;
int index = widthA * index_row + index_col;
// d_A[index] "=" d_A[index_row][index_col]
// store pivot factor for rows that we are working on in this block
__shared__ int pivotFactor[BLOCK_SIZE];
if ( ty == 0 ) // use ty instead of tx for coalesced accesses
{
pivotFactor[tx] = A[(index_row + tx) * widthA - 1];
// INVALID WRITE HERE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
}
__syncthreads();
// implement the gaussian elimination for the current row
if ( index_row > Pcol )
{
A[index] -= A[Pcol * widthA + index_col] * pivotFactor[ty];
// A[i,j] = A[i,j] - A[k,j] / A[k,k] * A[i,k]
// pivot factor = A[k,j] / A[k,k]
}
}
cuda-memcheck told me that I was reading and writing stuff out of bounds.
My indexing was off. In Gaussian_Pivot_CUDA, the last line should be A[widthA*(index_row+1)-1)]=...
In Gaussian_Elim_CUDA, when writing pivotFactor I have to change ty to tx and read A[widthA*(index_row+1)-1)].
Thank you #Eugene !!!
If sizeof(int)==2, then This array index will overflow when dim is 128:
h_A[j * dimCol + i + i * dimCol] = i + 1; // A[j][i] entry
(127*144)+127+(127*144) = 36703 > 32767.