why is my CS50 filter edges code not working with check50? - function

My cs50 filter edges function is not working, it compiles ok but when i run check50 the first test (edges correctly filters middle pixel) us correct while the others are incorrect just by the last value, like this:
:( edges correctly filters pixel on edge
expected "213 228 255\n", not "213 228 140\n"
However, when I print the gx and gy for the red, green and blue alone, and the value of the squareroot, none of the values for the colors match.
now, this is my code for edges
void edges(int height, int width, RGBTRIPLE image[height][width])
{
int sr = 0;
int sb = 0;
int sg = 0;
int yr = 0;
int yb = 0;
int yg = 0;
struct RGBTRIPle
{
int rgbtRed;
int rgbtGreen;
int rgbtBlue;
};
struct RGBTRIPle copia[height][width];
struct RGBTRIPLe
{
int rgbtRed;
int rgbtGreen;
int rgbtBlue;
};
struct RGBTRIPLe copia2[height][width];
//Implementing Gx
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
sr = 0;
sb = 0;
sg = 0;
for (int m = i - 1; m <= i + 1; m++)
{
for (int c = j - 1; c <= j + 1; c++)
{
if (m >= 0 && m < height && c >= 0 && c < width)
{
if (c == j - 1)
{
if (m == i - 1 || m == i + 1)
{
sr += -1 * image[m][c].rgbtRed;
sb += -1 * image[m][c].rgbtBlue;
sg += -1 * image[m][c].rgbtGreen;
}
else
{
sr += -2 * image[m][c].rgbtRed;
sb += -2 * image[m][c].rgbtBlue;
sg += -2 * image[m][c].rgbtGreen;
}
}
if (c == j + 1)
{
if (m == i - 1 || m == i + 1)
{
sr += image[m][c].rgbtRed;
sb += image[m][c].rgbtBlue;
sg += image[m][c].rgbtGreen;
}
else
{
sr += 2 * image[m][c].rgbtRed;
sb += 2 * image[m][c].rgbtBlue;
sg += 2 * image[m][c].rgbtGreen;
}
}
else //c = j
{
sr += 0 * image[m][c].rgbtRed;
sb += 0 * image[m][c].rgbtBlue;
sg += 0 * image[m][c].rgbtGreen;
}
}
}
}
copia[i][j].rgbtRed = sr;
copia[i][j].rgbtGreen = sg;
copia[i][j].rgbtBlue = sb;
}
}
//Implementing Gy
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
yr = 0;
yb = 0;
yg = 0;
for (int m = i - 1; m <= i + 1; m++)
{
for (int c = j - 1; c <= j + 1; c++)
{
if (m >= 0 && m < height && c >= 0 && c < width)
{
if (m == i - 1)
{
if (c == j - 1 || c == j + 1)
{
yr += -1 * image[m][c].rgbtRed;
yb += -1 * image[m][c].rgbtBlue;
yg += -1 * image[m][c].rgbtGreen;
}
else
{
yr += -2 * image[m][c].rgbtRed;
yb += -2 * image[m][c].rgbtBlue;
yg += -2 * image[m][c].rgbtGreen;
}
}
if (m == i + 1)
{
if (c == j + 1 || c == j - 1)
{
yr += image[m][c].rgbtRed;
yb += image[m][c].rgbtBlue;
yg += image[m][c].rgbtGreen;
}
else
{
yr += 2 * image[m][c].rgbtRed;
yb += 2 * image[m][c].rgbtBlue;
yg += 2 * image[m][c].rgbtGreen;
}
}
else //c = j
{
yr += 0 * image[m][c].rgbtRed;
yb += 0 * image[m][c].rgbtBlue;
yg += 0 * image[m][c].rgbtGreen;
}
}
}
}
copia2[i][j].rgbtRed = yr;
copia2[i][j].rgbtGreen = yg;
copia2[i][j].rgbtBlue = yb;
}
}
//Implementing math operation to calculate resulting color
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
int r = 0;
int g = 0;
int b = 0;
image[i][j].rgbtRed = (int) round(sqrt((copia[i][j].rgbtRed * copia[i][j].rgbtRed) + (copia2[i][j].rgbtRed *
copia2[i][j].rgbtRed)));
image[i][j].rgbtGreen = (int) round(sqrt((copia[i][j].rgbtGreen * copia[i][j].rgbtGreen) + (copia2[i][j].rgbtGreen *
copia2[i][j].rgbtGreen)));
image[i][j].rgbtBlue = (int) round(sqrt((copia[i][j].rgbtBlue * copia[i][j].rgbtBlue) + (copia2[i][j].rgbtBlue *
copia2[i][j].rgbtBlue)));
r = image[i][j].rgbtRed;
g = image[i][j].rgbtGreen;
b = image[i][j].rgbtBlue;
if (image[i][j].rgbtRed > 255)
{
image[i][j].rgbtRed = 255;
}
if (image[i][j].rgbtGreen > 255)
{
image[i][j].rgbtGreen = 255;
}
if (image[i][j].rgbtBlue > 255)
{
image[i][j].rgbtBlue = 255;
}
}
}
return;
}

The problem you describe arises when you store round(sqrt((copia[i][j].rgbtRed * copia[i][j].rgbtRed) + (copia2[i][j].rgbtRed *copia2[i][j].rgbtRed))); into the variable image[i][j].rgbtRed(or any other variant thereof). This is because when calculating sqrt(gx^2 + gy^2) you are getting a number above 255. For example, you may get the integer value 395 after rounding. To store that value in to image[i][j].rgbtRed, C will store the value of 395 % 255, or 140, because the image cannot store values greater than 255, by definition.
This means that your if statements are useless, because the respective color values will never be greater than 255:
if (image[i][j].rgbtRed > 255)
{
image[i][j].rgbtRed = 255;
}
if (image[i][j].rgbtGreen > 255)
{
image[i][j].rgbtGreen = 255;
}
if (image[i][j].rgbtBlue > 255)
{
image[i][j].rgbtBlue = 255;
}
To solve this problem you have to cap the value before storing them into the image. A simple implementation of this would be by making a function called cap that returns 255 if an input is above 255.:
int cap(int rgb)
{
if (rgb > 255)
{
return 255;
}
else
{
return rgb;
}
}
You can then use this function in the following manner, which will solve your problem completely:
image[i][j].rgbtRed = cap(round(sqrt((copia[i][j].rgbtRed * copia[i][j].rgbtRed) + (copia2[i][j].rgbtRed * copia2[i][j].rgbtRed))));
image[i][j].rgbtGreen = cap(round(sqrt((copia[i][j].rgbtGreen * copia[i][j].rgbtGreen) + (copia2[i][j].rgbtGreen * copia2[i][j].rgbtGreen))));
image[i][j].rgbtBlue = cap(round(sqrt((copia[i][j].rgbtBlue * copia[i][j].rgbtBlue) + (copia2[i][j].rgbtBlue * copia2[i][j].rgbtBlue))));
This will also shorten your code, make it look cleaner, and avoid unnecessary repetition.

Related

Fuzzy matching of an OCR output in text file

I have a question regarding partial match of two strings.
I have a string and I need to validate it. To be more specific, I have an output from OCR reading and it contains some mistakes, of course. I need to check if the string is really there but as it can be written incorrectly I need only 70% match.
Is it possible to do that in UiPath? The string is in notepad (.txt) so any idead would be helpful.
Try passing OCR output/words_detected against a base word.(double fuzzyness is 0-1)
list<string> Search(string word, list<string> wordList, double fuzzyness) {
list<string> foundWords;
for (string s : wordList) {
int levenshteinDistance = LevenshteinDistance(word, s);
int length = max(word.length(), s.length());
double score = 1.0 - (double)levenshteinDistance / length;
if (score > fuzzyness) foundWords.push_back(s);
}
if (foundWords.size() > 1) {
for (double d = fuzzyness; ; d++) {
foundWords = Search(word, wordList, d);
if (foundWords.size() == 1) break;
}
}
return foundWords;}
int LevenshteinDistance(string src, string dest) {
std::vector<vector<int>> d;
d.resize((int)src.size() + 1, std::vector<int>((int)dest.size() + 1, 0));
int i, j, cost;
std::vector<char> str1(src.begin(), src.end());
std::vector<char> str2(dest.begin(), dest.end());
for (i = 0; i <= str1.size(); i++) d[i][0] = i;
for (j = 0; j <= str2.size(); j++) d[0][j] = j;
for (i = 1; i <= str1.size(); i++) {
for (j = 1; j <= str2.size(); j++) {
if (str1[i - 1] == str2[j - 1]) cost = 0;
else cost = 1;
d[i][j] = min(d[i - 1][j] + 1, min(d[i][j - 1] + 1, d[i - 1][j - 1] + cost));
if ((i > 1) && (j > 1) && (str1[i - 1] == str2[j - 2]) && (str1[i - 2] == str2[j - 1])) d[i][j] = min(d[i][j], d[i - 2][j - 2] + cost);
}
}
return d[str1.size()][str2.size()];}

Maximum value of batchsize allowed for cublasDgetrfBatched() from CUBLAS Library

Is there any maximum batchsize limitation for cublasDgetrfBatched() from CUBLAS library? I am doing a benchmark problem for comparing timings between CPU and GPU. For a batchsize of 1000 i am getting GPU timing greater than CPU timing. But, for a batchsize of 100, i am getting some speedup over CPU.
I have posted below the code that i used for benchmarking.
1. main.cpp
/*main.cpp goes below*/
#include<stdio.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "mathlib_blas.h"
int main(){
double**mat;
double**mat_scratch1;
int *ipvt;
double *fVec;
double *fVecSave;
double *fVec_scratch;
double *A;
double *B;
double **devPtrA;
double **devPtrB;
double **devPtrA_dev;
double **devPtrB_dev;
double *d_x;
double *x;
int *d_pivot_array ;
int *d_info_array;
int *h_info_array;
int batchsize;
int neqn;
cublasHandle_t handle;
cublasStatus_t status;
cudaError_t error;
clock_t start, end, start1, end1;
double rcond;
batchsize = 32;
neqn = 172;
mat = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double));
mat_scratch1 = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double));
ipvt = (int*) calloc((size_t) neqn, sizeof(int));
fVec = (double*) calloc((size_t) neqn, sizeof(double));
fVecSave = (double*) calloc((size_t) neqn, sizeof(double));
fVec_scratch = (double*) calloc((size_t) neqn, sizeof(double));
A = (double*)malloc( neqn*neqn*sizeof(A[0]));
B = (double*)malloc( neqn*neqn*sizeof(B[0]));
devPtrA = (double**)malloc(batchsize*sizeof(*devPtrA));
devPtrB = (double**)malloc(batchsize*sizeof(*devPtrB));
for(int b_count =0; b_count<batchsize; b_count++){
cudaMalloc((void **)&devPtrA[b_count], neqn*neqn * sizeof(devPtrA[0][0]));
cudaMalloc((void **)&devPtrB[b_count], batchsize*neqn * sizeof(devPtrB[0][0]));
}
cudaMalloc((void **)&devPtrA_dev, batchsize*sizeof(*devPtrA));
cudaMalloc((void **)&devPtrB_dev, batchsize*sizeof(*devPtrB));
cudaMemcpy(devPtrA_dev, devPtrA, batchsize*sizeof(*devPtrA), cudaMemcpyHostToDevice);
cudaMemcpy(devPtrB_dev, devPtrB, batchsize*sizeof(*devPtrB), cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_x, neqn*sizeof(double));
x =(double *)malloc(neqn*sizeof(double));
cudaMalloc((void **)&d_pivot_array, batchsize*neqn*sizeof(int));
cudaMalloc((void **)&d_info_array, batchsize*sizeof(int));
h_info_array =(int*)malloc(batchsize*sizeof(int));
cublasCreate(&handle);
srand(time(NULL));
/* Fill in the CPU and GPU Matrix */
for (int iRow = 0; iRow < neqn; iRow++) {
double sumCol = 0;
for (int iColumn = 0; iColumn < neqn; iColumn++) {
for(int b_count =0; b_count<batchsize; b_count++){
A[neqn*iColumn + iRow] = rand()%10 ;
mat[iRow][iColumn] = A[neqn*iColumn + iRow];
}
sumCol +=A[neqn*iColumn + iRow];
}
fVec[iRow] = sumCol;
fVecSave[iRow] = sumCol;
}
/*CPU_CODE GOES HERE */
start = clock();
for(int b_count =0; b_count<batchsize; b_count++){
for (int iRow = 0; iRow < neqn; iRow++) {
for (int iColumn = 0; iColumn < neqn; iColumn++) {
mat_scratch1[iColumn][iRow]= mat[iColumn][iRow];
}
}
dgeco_blas(mat_scratch1, neqn, ipvt, &rcond, fVecSave);
}
for (int iRow = 0; iRow < neqn; iRow++) {
for (int iColumn = 0; iColumn < neqn; iColumn++) {
mat[iColumn][iRow]= mat_scratch1[iColumn][iRow];
}
}
for(int b_count =0; b_count<batchsize; b_count++){
for(int i = 0; i < neqn; i++) fVec_scratch[i] = fVec[i];
dgesl_blas(mat, neqn, ipvt , fVec_scratch, 0);
}
end = clock();
float seconds = (float)(end - start) / CLOCKS_PER_SEC;
printf("Time in seconds(CPU) : %lf \n", seconds);
/*CPU_CODE ENDS HERE */
start1 = clock();
for(int b_count =0; b_count<batchsize; b_count++){
status = cublasSetMatrix(neqn, neqn, sizeof(A[0]), A, neqn, devPtrA[b_count], neqn);
}
status = cublasDgetrfBatched(handle, neqn, ( double**)devPtrA_dev,neqn,d_pivot_array,d_info_array,batchsize);
if (status != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error in dgetrf %i\n",status);
cudaMemcpy(h_info_array, d_info_array, batchsize*sizeof(int), cudaMemcpyDeviceToHost);
for(int b_count =0; b_count<batchsize; b_count++){
cudaMemcpy(devPtrB[b_count], fVec, neqn*sizeof(double),cudaMemcpyHostToDevice); /* for testing purpose only */
}
status = cublasDgetrsBatched(handle, CUBLAS_OP_N, neqn, batchsize, (const double**)devPtrA_dev,
neqn, d_pivot_array,devPtrB_dev, neqn, h_info_array, batchsize);
for(int b_count =0; b_count<batchsize; b_count++){
cudaMemcpy( fVec,devPtrB[b_count], neqn*sizeof(double),cudaMemcpyDeviceToHost); /* for testing purpose only */
}
end1 = clock();
float seconds1 = (float)(end1 - start1) / CLOCKS_PER_SEC;
printf("Time in seconds(GPU) : %lf \n", seconds1);
printf("Speedup(CPU/GPU) : %lf \n", seconds/seconds1);
system("pause");
/* End of the main portion of the code */
free(mat);
free(mat_scratch1);
free(ipvt);
free(fVec);
free(fVecSave);
free(fVec_scratch);
free(A);
free(B);
cudaFree(devPtrA[0]);
cudaFree(devPtrB[0]);
cudaFree(devPtrA_dev);
cudaFree(devPtrB_dev);
free(devPtrA);
free(devPtrB);
cudaFree(d_x);
free(x);
cudaFree(d_pivot_array);
cudaFree(d_info_array);
free(h_info_array);
cublasDestroy_v2(handle);
}
2. mathlib_blas.h
#include <stdio.h>
#include <math.h>
#define maxm(a,b) (((a) > (b)) ? (a) : (b))
#define minm(a,b) (((a) < (b)) ? (a) : (b))
#define signum(a,b) (((b) < (0)) ? (-a) : (a))
void **ArrayAlloc2d( const int size1, const int size2, const size_t sizeType);
void dgefa_blas(double **a,int n, int ipvt[],int *info);
void dgesl_blas(double **a,int n,int ipvt[],double b[],int job);
void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z);
void **ArrayAlloc2d( const int size1, const int size2, const size_t sizeType )
{
void** array = nullptr;
array = (void**)calloc(size1, sizeof(void*));
if (array != nullptr) {
if (size2 > 0) {
void* data = calloc(size1*size2, sizeType);
if (data != nullptr) {
char* addr = (char*)data;
for (int index1 = 0; index1 < size1; index1++) {
array[index1] = (void*)addr;
addr += sizeType*size2; /* char is always 1 byte */
}
} else {
free(array);
free(data);
array = nullptr;
}
}
} else {
}
return array;
}
void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z)
{
double anorm,ek,s,sm,t,vecdot,vecsum,wk,wkm,ynorm;
int i,info,j,k,kb,kp1,l;
/* Compute 1-norm of a */
anorm = 0.0;
for (j = 0; j < n; j++) {
vecsum = 0.0;
for (i = 0;i < n; i++)
vecsum += fabs(a[i][j]);
anorm = maxm(anorm,vecsum);
}
/* Factor. */
dgefa_blas(a,n,ipvt,&info);
/* rcond = 1/(norm(a) * (estimate of norm(inverse(a)))).
* estimate = norm(z)/norm(y), where a*z=y and trans(a)*y=e.
* trans(a) is the transpose of a. The components of e are
* chosen to cause maximum local growth in the elements of
* w, where trans(u)*w=e. The vectors are frequently rescaled
* to avoid overflow.
*/
ek = 1.0;
for (j = 0; j < n; j++)
z[j] = 0.0;
for (k = 0; k < n; k++) {
if (z[k] != 0.0)
ek = signum(ek,-z[k]);
if (fabs(ek-z[k]) > fabs(a[k][k])) {
s = fabs(a[k][k])/fabs(ek-z[k]);
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ek *= s;
}
wk = ek - z[k];
wkm = -ek - z[k];
s = fabs(wk);
sm = fabs(wkm);
if (a[k][k] != 0.0) {
wk /= a[k][k];
wkm /= a[k][k];
}
else {
wk = 1.0;
wkm = 1.0;
}
kp1 = k + 1;
if (kp1 < n) {
for (j = kp1; j < n; j++) {
sm += (fabs(z[j] + wkm * a[k][j]));
z[j] += (wk * a[k][j]);
s += fabs(z[j]);
}
if (s < sm) {
t = wkm -wk;
wk = wkm;
for (j = kp1; j < n; j++)
z[j] += (t * a[k][j]);
}
}
z[k] = wk;
}
/* dasum(n,s,z,1) */
vecsum = 0.0;
for (i = 0;i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z) */
for (i = 0; i < n; i++)
z[i] *= s;
/* Solve trans(l)*y= w
*/
for (kb = 0; kb < n; kb++) {
k = n - kb - 1;
if (k < (n-1)) {
/* sdot(n-k,a(k+1,k),1,z(k+1),1) */
vecdot = 0.0;
for (i = k+1;i < n; i++)
vecdot += (a[i][k] * z[i]);
z[k] += vecdot;
}
if (fabs(z[k]) > 1.0) {
s = 1.0/fabs(z[k]);
/* dscal(n,s,z) */
for (i = 0; i < n; i++)
z[i] *= s;
}
l = ipvt[k];
t = z[l];
z[l] = z[k];
z[k] = t;
} /* endfor kb */
/* dasum(n,z,1) */
vecsum = 0.0;
for (i = 0; i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm = 1.0;
/*
* Solve l * v = y
*/
for (k = 0; k < n; k++) {
l = ipvt[k];
t = z[l];
z[l] = z[k];
z[k] = t;
if (k < (n-1)) {
/* daxpy(n-k,t,a[k+1][k],1,z[k+1],1) */
for (i = k+1;i < n; i++)
z[i] += (t * a[i][k]);
}
if (fabs(z[k]) > 1.0) {
s = 1.0/fabs(z[k]);
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
}
}
/* dasum(n,z,1) */
vecsum = 0.0;
for (i = 0; i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
/* Solve u * z = v */
for (kb = 0; kb < n; kb++) {
k = n - kb - 1;
if (fabs(z[k]) > fabs(a[k][k])) {
s = fabs(a[k][k])/fabs(z[k]);
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
}
if (a[k][k] != 0.0)
z[k] /= a[k][k];
if (a[k][k] == 0.0)
z[k] = 1.0;
t = -z[k];
/* daxpy(k-1,t,a[1][k],1,z[1],1) */
for (i = 0; i < k; i++)
z[i] += (t * a[i][k]);
}
/* Make znorm = 1.0 */
/* dasum(n,z,1) */
vecsum = 0.0;
for (i = 0; i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
if (anorm != 0.0) *rcond = ynorm/anorm;
if (anorm == 0.0) *rcond = 0.0;
}
void dgefa_blas(double **a,int n, int ipvt[],int *info)
{
double dmax,t;
int i,j,k,kp1,l,nm1;
*info = 0;
nm1 = n - 1;
if (n > 0) {
for (k = 0; k < nm1; k++) {
kp1 = k + 1;
/* Find l = pivot index. */
dmax = fabs(a[k][k]);
l = k;
for (i = k+1; i < n; i++) {
if (fabs(a[i][k]) <= dmax) continue;
l = i;
}
ipvt[k] = l;
/* Zero pivot implies this column already triangularized. */
if (a[l][k] == 0.0) {
*info = k;
continue;
}
/* Interchange if necessary. */
if (l != k) {
t = a[l][k];
a[l][k] = a[k][k];
a[k][k] = t;
}
/* Compute multipliers. */
if (a[k][k] == 0.0) printf("\n!ERROR. Singular matrix.\n");
t = -1.0/a[k][k];
for (i = k+1; i < n; i++)
a[i][k] *= t;
/* Row elimination with column indexing. */
for (j = kp1; j < n; j++) {
t = a[l][j];
if (l != k) {
a[l][j] = a[k][j];
a[k][j] = t;
}
for (i = k+1; i < n; i++ )
a[i][j] += (t * a[i][k]);
}
}
}
ipvt[n-1] = n-1;
if (a[n-1][n-1] == 0.0) *info = n-1;
}
void dgesl_blas(double **a,int n,int ipvt[],double b[],int job)
{
double t;
int i,k,kb,l,nm1;
nm1 = n - 1;
if (job == 0) {
/* job = 0, solve a * x = b.
* First solve l * y = b.
*/
if (n > 0) {
for (k = 0; k < nm1; k++) {
l = ipvt[k];
t = b[l];
if (l != k) {
b[l] = b[k];
b[k] = t;
}
/* saxpy(n-k,t,a(k+1,k),1,b(k+1),1); */
for (i=k+1;i < n;i++)
b[i] += (t * a[i][k]);
}
}
/* Now solve u * x = y. */
for (kb = 0; kb < n; kb++) {
k = n - kb-1;
b[k] /= a[k][k];
t = -b[k];
/* saxpy(k-1,t,a(1,k),1,b(1),1); */
for (i = 0; i < k ; i++)
b[i] += (t * a[i][k]);
}
return;
}
/* job != 0, solve trans(a) * x = b.
* First solve trans(u) * x = y.
*/
for (k = 0; k < n; k++) {
/* t = ddot(k-1,a(1,k),1,b(1),1); */
t = 0;
for (i = 0; i < k; i++)
t += (a[i][k] * b[i]);
b[k] = (b[k] - t)/a[k][k];
}
/* Now solve trans(l) * x = y. */
if (n > 0) {
for (kb = 0; kb < nm1; kb++) {
k = n - 2 - kb;
/* b[k] = b[k] + ddot(n-k,a(k+1,k),1,b(k+1),1); */
t = 0;
for (i = k+1;i < n; i++)
t += (a[i][k] * b[i]);
b[k] += t;
l = ipvt[k];
if (l != k) {
t = b[l];
b[l] = b[k];
b[k] = t;
}
}
}
}
There should not be any behavioral differences between a batch size of 100 and a batch size of 1000. (Certainly there would be a performance difference - the batch size of 1000 should probably take longer.)
There are no published limits to the batch size, other than implicit memory limits. In fact, unless the GPU is returning incorrect results, there is no reason to think that you've run into any hard limit anyway.
( If you wanted to explore some behavioral or performance issue, this question is not properly written to address that. )

CUDA Broken Float Math on Kepler/Fermi Arch

I have a program that does a lot of single precision math. It produces correct results if I specify 1.0 architecture but is broken for 2.X and 3.X architectures. What would cause this?
Included below:
Very long code sample.
Compile command and good output.
Compile command and bad output.
If I run the same routing in the CPU using gcc, I get results that match the 1.0 architecture.
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
/*
* svdcomp - SVD decomposition routine.
* Takes an mxn matrix a and decomposes it into udv, where u,v are
* left and right orthogonal transformation matrices, and d is a
* diagonal matrix of singular values.
*
* This routine is adapted from svdecomp.c in XLISP-STAT 2.1 which is
* code from Numerical Recipes adapted by Luke Tierney and David Betz.
* Originally from: "Numerical Recipes in C: The Art of Scientific Computing",
* Press, Flannery, Teukolosky, Vetterling. 1992.
*
* Input to dsvd is as follows:
* a = mxn matrix to be decomposed, gets overwritten with u
* m = row dimension of a
* n = column dimension of a
* w = returns the vector of singular values of a
* v = returns the right orthogonal transformation matrix
*/
#define SIGN(a, b) ((b) >= 0.0f ? fabsf(a) : -fabsf(a))
#define MIN(x,y) ( (x) < (y) ? (x) : (y) )
#define MAX(x,y) ((x)>(y)?(x):(y))
#define PERR(call) \
if (call) {\
fprintf(stderr, "%s:%d Error [%s] on "#call"\n", __FILE__, __LINE__,\
cudaGetErrorString(cudaGetLastError()));\
exit(1);\
}
#define ERRCHECK \
if (cudaPeekAtLastError()) { \
fprintf(stderr, "%s:%d Error [%s]\n", __FILE__, __LINE__,\
cudaGetErrorString(cudaGetLastError()));\
exit(1);\
}
__device__ int
svd(float *a, int m, int n, float *w, float *v, int skip_u)
{
int flag, i, its, j, jj, k, l, nm;
float c, f, h, s, x, y, z;
float anorm = 0.0f, g = 0.0f, scale = 0.0f;
float rv1[3];
/* Householder reduction to bidiagonal form */
for (i = 0; i < n; i++)
{
/* left-hand reduction */
l = i + 1;
rv1[i] = scale * g;
g = s = scale = 0.0f;
if (i < m)
{
for (k = i; k < m; k++)
scale += fabsf(a[k*n+i]);
if (scale)
{
for (k = i; k < m; k++)
{
a[k*n+i] /= scale;
s += powf(a[k*n+i], 2);
}
f = a[i*n+i];
g = -SIGN(sqrtf(s), f);
h = f * g - s;
a[i*n+i] = f - g;
if (i != n - 1)
{
for (j = l; j < n; j++)
{
for (s = 0.0f, k = i; k < m; k++)
s += a[k*n+i] * a[k*n+j];
f = s / h;
for (k = i; k < m; k++)
a[k*n+j] += f * a[k*n+i];
}
}
for (k = i; k < m; k++)
a[k*n+i] *= scale;
}
}
w[i] = scale * g;
/* right-hand reduction */
g = s = scale = 0.0f;
if (i < m && i != n - 1)
{
for (k = l; k < n; k++)
scale += fabsf(a[i*n+k]);
if (scale)
{
for (k = l; k < n; k++)
{
a[i*n+k] /= scale;
s += powf(a[i*n+k], 2);
}
f = a[i*n+l];
g = -SIGN(sqrtf(s), f);
h = f * g - s;
a[i*n+l] = f - g;
for (k = l; k < n; k++)
rv1[k] = a[i*n+k] / h;
if (i != m - 1)
{
for (j = l; j < m; j++)
{
for (s = 0.0f, k = l; k < n; k++)
s += a[j*n+k] * a[i*n+k];
for (k = l; k < n; k++)
a[j*n+k] += s * rv1[k];
}
}
for (k = l; k < n; k++)
a[i*n+k] *= scale;
}
}
anorm = MAX(anorm, fabsf(w[i]) + fabsf(rv1[i]));
}
/* accumulate the right-hand transformation */
for (i = n - 1; i >= 0; i--)
{
if (i < n - 1)
{
if (g)
{
for (j = l; j < n; j++)
v[j*n+i] = (a[i*n+j] / a[i*n+l]) / g;
/* float division to avoid underflow */
for (j = l; j < n; j++)
{
for (s = 0.0f, k = l; k < n; k++)
s += a[i*n+k] * v[k*n+j];
for (k = l; k < n; k++)
v[k*n+j] += s * v[k*n+i];
}
}
for (j = l; j < n; j++)
v[i*n+j] = v[j*n+i] = 0.0f;
}
v[i*n+i] = 1.0f;
g = rv1[i];
l = i;
}
/* accumulate the left-hand transformation */
if (!skip_u) {
for (i = n - 1; i >= 0; i--)
{
l = i + 1;
g = w[i];
if (i < n - 1)
for (j = l; j < n; j++)
a[i*n+j] = 0.0f;
if (g)
{
g = 1.0f / g;
if (i != n - 1)
{
for (j = l; j < n; j++)
{
for (s = 0.0f, k = l; k < m; k++)
s += a[k*n+i] * a[k*n+j];
f = (s / a[i*n+i]) * g;
for (k = i; k < m; k++)
a[k*n+j] += f * a[k*n+i];
}
}
for (j = i; j < m; j++)
a[j*n+i] = a[j*n+i]*g;
}
else
{
for (j = i; j < m; j++)
a[j*n+i] = 0.0f;
}
++a[i*n+i];
}
}
/* diagonalize the bidiagonal form */
for (k = n - 1; k >= 0; k--)
{ /* loop over singular values */
for (its = 0; its < 30; its++)
{ /* loop over allowed iterations */
flag = 1;
for (l = k; l >= 0; l--)
{ /* test for splitting */
nm = l - 1;
if (fabsf(rv1[l]) + anorm == anorm)
{
flag = 0;
break;
}
if (fabsf(w[nm]) + anorm == anorm)
break;
}
if (flag)
{
c = 0.0f;
s = 1.0f;
for (i = l; i <= k; i++)
{
f = s * rv1[i];
if (fabsf(f) + anorm != anorm)
{
g = w[i];
h = hypotf(f, g);
w[i] = h;
h = 1.0f / h;
c = g * h;
s = (- f * h);
if (!skip_u) {
for (j = 0; j < m; j++)
{
y = a[j*n+nm];
z = a[j*n+i];
a[j*n+nm] = y * c + z * s;
a[j*n+i] = z * c - y * s;
}
}
}
}
}
z = w[k];
if (l == k)
{ /* convergence */
if (z < 0.0f)
{ /* make singular value nonnegative */
w[k] = -z;
for (j = 0; j < n; j++)
v[j*n+k] = -v[j*n+k];
}
break;
}
if (its >= 30) {
}
/* shift from bottom 2 x 2 minor */
x = w[l];
nm = k - 1;
y = w[nm];
g = rv1[nm];
h = rv1[k];
f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0f * h * y);
g = hypotf(f, 1.0f);
f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x;
/* next QR transformation */
c = s = 1.0f;
for (j = l; j <= nm; j++)
{
i = j + 1;
g = rv1[i];
y = w[i];
h = s * g;
g = c * g;
z = hypotf(f, h);
rv1[j] = z;
c = f / z;
s = h / z;
f = x * c + g * s;
g = g * c - x * s;
h = y * s;
y = y * c;
for (jj = 0; jj < n; jj++)
{
x = v[jj*n+j];
z = v[jj*n+i];
v[jj*n+j] = x * c + z * s;
v[jj*n+i] = z * c - x * s;
}
z = hypotf(f, h);
w[j] = z;
if (z)
{
z = 1.0f / z;
c = f * z;
s = h * z;
}
f = (c * g) + (s * y);
x = (c * y) - (s * g);
if (!skip_u) {
for (jj = 0; jj < m; jj++)
{
y = a[jj*n+j];
z = a[jj*n+i];
a[jj*n+j] = y * c + z * s;
a[jj*n+i] = z * c - y * s;
}
}
}
rv1[l] = 0.0f;
rv1[k] = f;
w[k] = x;
}
}
return(0);
}
__global__ void
svd_kernel(float *v)
{
float a[9], w[3];
a[0] = 8.0f;
a[1] = 3.0f;
a[2] = 7.0f;
a[3] = 7.0f;
a[4] = 9.0f;
a[5] = 1.0f;
a[6] = 3.0f;
a[7] = 7.0f;
a[8] = 2.0f;
svd(a, 3, 3, w, v, 1);
}
int main()
{
int i, j;
float *v_d, v[9];
PERR(cudaMalloc(&v_d, 9*sizeof(float)));
svd_kernel<<<1,1>>>(v_d);
cudaDeviceSynchronize();
ERRCHECK;
PERR(cudaMemcpy(v, v_d, 9*sizeof(float), cudaMemcpyDeviceToHost));
for (i = 0; i < 3; i++) {
for (j = 0; j < 3; j++) {
printf("%6.3f\t", v[i*3+j]);
}
printf("\n");
}
return 0;
}
Correct Results:
$ nvcc -arch=sm_10 -o svd svd.cu
$ ./svd
-0.657 -0.685 0.314
-0.668 0.337 -0.664
-0.349 0.646 0.679
Broken Results:
$ nvcc -arch=sm_20 -o svd svd.cu
$ ./svd
-0.661 -0.660 0.356
-0.642 0.253 -0.724
0.019 0.460 0.888
It seems that CUDA 6 fixes the issue.

how can i print 11 numbers in 2 div,in each div i need 8 names(4 td's in each row)second div from 9,how can i do this?

<%
int apps = 11;
int noOfDiv = apps % 3, k, m;
for (int i = 1; i <= 2; i++) {
out.println("<div>");
out.println("<table>");
for (int j = 1; j <= 2; j++) {
out.println("<tr>");
for (k = 1; k <= 4; k++) {
out.println("<td>");
out.println("" + k + "");
out.println("</td>");
}
out.println("</tr>");
}
out.println("</table>");
out.println("</div>");
}
%>
for this i'm getting output as
1234
1234
in div1
1234
1234
in div2 ,
but i need
1234
5678
in div1 and
9 10 11
in div2 if i have total 11 numbers
You can do that by using following code;
<%
int apps = 11;
int noOfDiv = apps % 3, k, m;
for (int i = 1; i <= 2; i++) {
out.println("<div>");
out.println("<table>");
for (int j = 1; j <= 2; j++) {
out.println("<tr>");
int temp = (j-1)*4 +1;
for (k = temp; k <= temp+3; k++) {
out.println("<td>");
out.println("" + k + "");
out.println("</td>");
}
out.println("</tr>");
}
out.println("</table>");
out.println("</div>");
}
out.println("<div><table><tr><td>" + (apps - 2) + "</td><td>" + (apps - 1) + "</td><td>" + apps + "</td></tr></table></div>");
%>
It is because you prints k for k = 1 to 4, and to correct use an extra variable say capital K = 1 before any loop, then replace:
for (k = 1; k <= 4; k++) // print 1 to 4
as:
int noOfDiv = apps % 3, k, m, K = 1; // Added K = 1
// rest of your codes ...
for (k = K; k <= K + 3; k++){
// code you already have to print small `k`
K += 4;
}

I am having problems with my FFT in c#

Does Microsoft have a good FFT that I can use ?
so I made my owe FFT and it work from some case but now all...
like if I get it
f(t) =10*sin(2*pi *3000*t) + 20*sin(1000* 2* PI* t)
it will work but if I add
+ 5*sin(2*pi*100*T) is start acting fun?
now in Matlab it works good but not in my close, also my fft only seem to return the right numbers in the Image not so much in the real...
here is my code:
enter code here
public struct polar1
{
public double real;
public double img;
};
private float Fs;
private int N;
private polar1 [] F;
private int R;
public DSPclass(float[] DSP1,int f1)
{
N = DSP1.Length;
R = DSP1.Length;
F = new polar1[N];
Fs = (float)f1;
}
public void FFT1(float[] DSP1)
{
polar1[] x = new polar1[DSP1.Length];
for (int v = 0; v < N; v++)
{
x[v].real = DSP1[v];
x[v].img = 0;
}
F = FFT(x);
int temp;
}
public polar1[] FFT(polar1[] x)
{
int N2 = x.Length;
polar1[] X = new polar1[N2];
if (N2 == 1)
{
return x;
}
polar1[] odd = new polar1[N2 / 2];
polar1[] even = new polar1[N2 / 2];
polar1[] Y_Odd = new polar1[N2 / 2];
polar1[] Y_Even = new polar1[N2 / 2];
for (int t = 0; t < N2 / 2; t++)
{
even[t].img = x[t * 2].img;
even[t].real = x[t * 2].real;
odd[t].img = x[(t * 2) + 1].img;
odd[t].real = x[(t * 2) + 1].real;
}
Y_Even = FFT(even);
Y_Odd = FFT(odd);
polar1 temp4;
for (int k = 0; k < (N2 / 2); k++)
{
temp4 = Complex1(k, N2);
X[k].real = Y_Even[k].real + (Y_Odd[k].real * temp4.real);
X[k + (N2 / 2)].real = Y_Even[k].real - (Y_Odd[k].real * temp4.real);
X[k].img = Y_Even[k].img + (Y_Odd[k].real * temp4.img);
X[k + (N2 / 2)].img = Y_Even[k].img - (Y_Odd[k].real * temp4.img);
}
return X;
}
public double magnitude( polar1 temp)
{
double tempD;
tempD = Math.Sqrt ( (temp.img * temp.img) + (temp.real * temp.real));
return tempD;
}
public polar1 Complex2(int K, int N, int F3)
{
polar1 temp;
double temp1;
temp1 = (2D * K *F3) / N;
if (temp1 % 2 == 0 || temp1 == 0)
{
temp.real = 1D;
temp.img = 0D;
}
else if ((temp1 - 1) % 2 == 0)
{
temp.real = -1D;
temp.img = 0D;
}
else if ((temp1 / .5D) - 1 % 2 == 0)
{
if ((temp1 - .5D) % 2 == 0)
{
temp.real = 0D;
temp.img = -1D;
}
else
{
temp.real = 0D;
temp.img = 1D;
}
}
else
{
temp.real = Math.Cos(temp1 * Math.PI);
temp.img = -1D * Math.Sin(temp1 * Math.PI);
}
return temp;
}
public polar1 Complex1(int K, int N3)
{
polar1 temp;
double temp1;
temp1 = (2D * Math.PI *K) / N3;
temp.real = Math.Cos(temp1);
temp.img = Math.Sin(temp1);
return temp;
}
public int Apm(double[] img, double[] real)
{
for (int i = 0; i < R; i++)
{
img[i] = F[i].img;
real[i] = F[i].real;
}
return R;
}
public int frequencies(float [] freq, float [] Ctemp)
{
bool flag = false;
bool flagD = false;
float tempOld = 0;
float tempNew =0;
int tempc = 0;
int counter = 0;
for (int i = 0; i < R; i++)
{
if (((i / N) * Fs) >= (Fs / 2))
{
return counter;
}
if ((int)F[i].img != 0 )
{
flag = true;
tempOld = (float)(Math.Abs(F[i].img));
}
else
{
if (flagD == true)
{
freq[counter] = ((float)tempc / (float)N) * Fs;
Ctemp[counter] = tempNew; //magnitude(F[tempc]);
counter++;
flagD = false;
}
flag = false;
tempOld = 0;
tempNew = 0;
}
if(flag == true)
{
if (tempOld > tempNew)
{
tempNew = tempOld;
tempc = i;
flagD = true;
}
}
}
return counter;
}
}