I have a question regarding partial match of two strings.
I have a string and I need to validate it. To be more specific, I have an output from OCR reading and it contains some mistakes, of course. I need to check if the string is really there but as it can be written incorrectly I need only 70% match.
Is it possible to do that in UiPath? The string is in a Notepad (.txt) file, so any ideas would be helpful.
Try matching each word detected by the OCR against a base word with the function below (the `fuzzyness` threshold is a double between 0 and 1).
/* Edit-distance routine used for scoring; it is defined further down in this file. */
int LevenshteinDistance(std::string src, std::string dest);

/*
 * Fuzzy search of `word` in `wordList`.
 *
 * Each candidate is scored as 1 - distance / max(length), so 1.0 means an
 * exact match and 0.0 means nothing in common. Candidates whose score is
 * strictly greater than `fuzzyness` (expected in [0, 1]) are matches.
 *
 * Returns:
 *   - an empty list when nothing matches,
 *   - the single match when exactly one candidate matches,
 *   - only the best-scoring match(es) when several candidates match
 *     (more than one element is returned only for exact score ties).
 *
 * The previous implementation tried to narrow multiple matches by calling
 * itself with the same arguments (unbounded recursion) and by raising the
 * threshold in steps of 1.0, which can never leave exactly one match.
 */
std::list<std::string> Search(std::string word, std::list<std::string> wordList, double fuzzyness) {
    std::list<std::string> foundWords;  // every candidate above the threshold
    std::list<std::string> bestWords;   // candidates sharing the highest score
    double bestScore = fuzzyness;

    for (const std::string &candidate : wordList) {
        const std::size_t length = std::max(word.length(), candidate.length());
        // Two empty strings are identical; avoid the 0/0 division.
        double score = 1.0;
        if (length > 0)
            score = 1.0 - (double)LevenshteinDistance(word, candidate) / (double)length;

        if (!(score > fuzzyness))
            continue;

        foundWords.push_back(candidate);
        if (score > bestScore) {
            bestScore = score;
            bestWords.clear();
        }
        if (score == bestScore)
            bestWords.push_back(candidate);
    }

    if (foundWords.size() > 1)
        return bestWords;
    return foundWords;
}
/*
 * Edit distance between `src` and `dest` counting insertions, deletions,
 * substitutions and swaps of two adjacent characters.
 *
 * table[r][c] holds the distance between the first r characters of `src`
 * and the first c characters of `dest`; the answer is the bottom-right cell.
 */
int LevenshteinDistance(string src, string dest) {
    const int rows = (int)src.size();
    const int cols = (int)dest.size();
    std::vector<std::vector<int>> table(rows + 1, std::vector<int>(cols + 1, 0));

    // Turning a prefix into the empty string costs one deletion per character.
    for (int r = 0; r <= rows; ++r) table[r][0] = r;
    for (int c = 0; c <= cols; ++c) table[0][c] = c;

    for (int r = 1; r <= rows; ++r) {
        for (int c = 1; c <= cols; ++c) {
            const int substitution = (src[r - 1] == dest[c - 1]) ? 0 : 1;
            int best = std::min(table[r - 1][c] + 1,
                                std::min(table[r][c - 1] + 1,
                                         table[r - 1][c - 1] + substitution));
            // The last two characters of both prefixes are each other's mirror image.
            const bool swapped = r > 1 && c > 1 &&
                                 src[r - 1] == dest[c - 2] &&
                                 src[r - 2] == dest[c - 1];
            if (swapped)
                best = std::min(best, table[r - 2][c - 2] + substitution);
            table[r][c] = best;
        }
    }
    return table[rows][cols];
}
Is there any maximum batch size limitation for cublasDgetrfBatched() from the cuBLAS library? I am running a benchmark problem to compare timings between the CPU and the GPU. For a batch size of 1000 the GPU time is greater than the CPU time, but for a batch size of 100 I get some speedup over the CPU.
Below is the code that I used for benchmarking.
1. main.cpp
/*main.cpp goes below*/
#include<stdio.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "mathlib_blas.h"
int main(){
double**mat;
double**mat_scratch1;
int *ipvt;
double *fVec;
double *fVecSave;
double *fVec_scratch;
double *A;
double *B;
double **devPtrA;
double **devPtrB;
double **devPtrA_dev;
double **devPtrB_dev;
double *d_x;
double *x;
int *d_pivot_array ;
int *d_info_array;
int *h_info_array;
int batchsize;
int neqn;
cublasHandle_t handle;
cublasStatus_t status;
cudaError_t error;
clock_t start, end, start1, end1;
double rcond;
batchsize = 32;
neqn = 172;
mat = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double));
mat_scratch1 = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double));
ipvt = (int*) calloc((size_t) neqn, sizeof(int));
fVec = (double*) calloc((size_t) neqn, sizeof(double));
fVecSave = (double*) calloc((size_t) neqn, sizeof(double));
fVec_scratch = (double*) calloc((size_t) neqn, sizeof(double));
A = (double*)malloc( neqn*neqn*sizeof(A[0]));
B = (double*)malloc( neqn*neqn*sizeof(B[0]));
devPtrA = (double**)malloc(batchsize*sizeof(*devPtrA));
devPtrB = (double**)malloc(batchsize*sizeof(*devPtrB));
for(int b_count =0; b_count<batchsize; b_count++){
cudaMalloc((void **)&devPtrA[b_count], neqn*neqn * sizeof(devPtrA[0][0]));
cudaMalloc((void **)&devPtrB[b_count], batchsize*neqn * sizeof(devPtrB[0][0]));
}
cudaMalloc((void **)&devPtrA_dev, batchsize*sizeof(*devPtrA));
cudaMalloc((void **)&devPtrB_dev, batchsize*sizeof(*devPtrB));
cudaMemcpy(devPtrA_dev, devPtrA, batchsize*sizeof(*devPtrA), cudaMemcpyHostToDevice);
cudaMemcpy(devPtrB_dev, devPtrB, batchsize*sizeof(*devPtrB), cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_x, neqn*sizeof(double));
x =(double *)malloc(neqn*sizeof(double));
cudaMalloc((void **)&d_pivot_array, batchsize*neqn*sizeof(int));
cudaMalloc((void **)&d_info_array, batchsize*sizeof(int));
h_info_array =(int*)malloc(batchsize*sizeof(int));
cublasCreate(&handle);
srand(time(NULL));
/* Fill in the CPU and GPU Matrix */
for (int iRow = 0; iRow < neqn; iRow++) {
double sumCol = 0;
for (int iColumn = 0; iColumn < neqn; iColumn++) {
for(int b_count =0; b_count<batchsize; b_count++){
A[neqn*iColumn + iRow] = rand()%10 ;
mat[iRow][iColumn] = A[neqn*iColumn + iRow];
}
sumCol +=A[neqn*iColumn + iRow];
}
fVec[iRow] = sumCol;
fVecSave[iRow] = sumCol;
}
/*CPU_CODE GOES HERE */
start = clock();
for(int b_count =0; b_count<batchsize; b_count++){
for (int iRow = 0; iRow < neqn; iRow++) {
for (int iColumn = 0; iColumn < neqn; iColumn++) {
mat_scratch1[iColumn][iRow]= mat[iColumn][iRow];
}
}
dgeco_blas(mat_scratch1, neqn, ipvt, &rcond, fVecSave);
}
for (int iRow = 0; iRow < neqn; iRow++) {
for (int iColumn = 0; iColumn < neqn; iColumn++) {
mat[iColumn][iRow]= mat_scratch1[iColumn][iRow];
}
}
for(int b_count =0; b_count<batchsize; b_count++){
for(int i = 0; i < neqn; i++) fVec_scratch[i] = fVec[i];
dgesl_blas(mat, neqn, ipvt , fVec_scratch, 0);
}
end = clock();
float seconds = (float)(end - start) / CLOCKS_PER_SEC;
printf("Time in seconds(CPU) : %lf \n", seconds);
/*CPU_CODE ENDS HERE */
start1 = clock();
for(int b_count =0; b_count<batchsize; b_count++){
status = cublasSetMatrix(neqn, neqn, sizeof(A[0]), A, neqn, devPtrA[b_count], neqn);
}
status = cublasDgetrfBatched(handle, neqn, ( double**)devPtrA_dev,neqn,d_pivot_array,d_info_array,batchsize);
if (status != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error in dgetrf %i\n",status);
cudaMemcpy(h_info_array, d_info_array, batchsize*sizeof(int), cudaMemcpyDeviceToHost);
for(int b_count =0; b_count<batchsize; b_count++){
cudaMemcpy(devPtrB[b_count], fVec, neqn*sizeof(double),cudaMemcpyHostToDevice); /* for testing purpose only */
}
status = cublasDgetrsBatched(handle, CUBLAS_OP_N, neqn, batchsize, (const double**)devPtrA_dev,
neqn, d_pivot_array,devPtrB_dev, neqn, h_info_array, batchsize);
for(int b_count =0; b_count<batchsize; b_count++){
cudaMemcpy( fVec,devPtrB[b_count], neqn*sizeof(double),cudaMemcpyDeviceToHost); /* for testing purpose only */
}
end1 = clock();
float seconds1 = (float)(end1 - start1) / CLOCKS_PER_SEC;
printf("Time in seconds(GPU) : %lf \n", seconds1);
printf("Speedup(CPU/GPU) : %lf \n", seconds/seconds1);
system("pause");
/* End of the main portion of the code */
free(mat);
free(mat_scratch1);
free(ipvt);
free(fVec);
free(fVecSave);
free(fVec_scratch);
free(A);
free(B);
cudaFree(devPtrA[0]);
cudaFree(devPtrB[0]);
cudaFree(devPtrA_dev);
cudaFree(devPtrB_dev);
free(devPtrA);
free(devPtrB);
cudaFree(d_x);
free(x);
cudaFree(d_pivot_array);
cudaFree(d_info_array);
free(h_info_array);
cublasDestroy_v2(handle);
}
2. mathlib_blas.h
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
/* Larger of a and b. Arguments may be evaluated twice: pass no side effects. */
#define maxm(a,b) (((a) > (b)) ? (a) : (b))
/* Smaller of a and b. Arguments may be evaluated twice: pass no side effects. */
#define minm(a,b) (((a) < (b)) ? (a) : (b))
/* Sign transfer (Fortran SIGN): the magnitude of a with the sign of b.
 * The result is +|a| for b >= 0 and -|a| for b < 0. The previous form
 * returned a unchanged for b >= 0, so a negative a stayed negative, and it
 * negated an unparenthesised argument. */
#define signum(a,b) (((b) < (0)) ? (-fabs(a)) : (fabs(a)))
/* Allocates a zero-filled size1 x size2 table of sizeType-byte elements and
 * returns an array of size1 row pointers into it (nullptr on failure). */
void **ArrayAlloc2d( const int size1, const int size2, const size_t sizeType);
/* Factors the n x n matrix a in place; pivot rows are written to ipvt and a
 * status index to *info. */
void dgefa_blas(double **a,int n, int ipvt[],int *info);
/* Solves a * x = b (job == 0) or trans(a) * x = b (job != 0) using the
 * factored a and ipvt; the solution overwrites b. */
void dgesl_blas(double **a,int n,int ipvt[],double b[],int job);
/* Factors a through dgefa_blas and stores a reciprocal condition estimate in
 * *rcond; z is an n-element work vector that gets overwritten. */
void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z);
/*
 * Allocates a zero-initialised 2D table with size1 rows and size2 columns of
 * sizeType-byte elements.
 *
 * The elements live in one contiguous block; the returned array holds size1
 * pointers, one to the start of each row, so element (i, j) is reached as
 * ((T**)result)[i][j]. Release it with free(result[0]); free(result);.
 *
 * Returns nullptr when size1 <= 0, when the element count does not fit in a
 * size_t, or when an allocation fails. When size2 <= 0 only the (all-null)
 * row-pointer array is allocated and returned.
 */
void **ArrayAlloc2d( const int size1, const int size2, const size_t sizeType )
{
    if (size1 <= 0)
        return nullptr;

    const size_t rows = (size_t)size1;
    void **array = (void**)calloc(rows, sizeof(void*));
    if (array == nullptr || size2 <= 0)
        return array;

    /* Do the element count in size_t: size1 * size2 may not fit in an int. */
    const size_t cols = (size_t)size2;
    if (rows > ((size_t)-1) / cols) {
        free(array);
        return nullptr;
    }

    /* calloc itself rejects a byte count (count * sizeType) that overflows. */
    char *addr = (char*)calloc(rows * cols, sizeType);
    if (addr == nullptr) {
        free(array);
        return nullptr;
    }

    const size_t rowBytes = sizeType * cols;
    for (size_t row = 0; row < rows; row++) {
        array[row] = (void*)addr;
        addr += rowBytes;
    }
    return array;
}
/*
 * Factors the n x n matrix a in place (through dgefa_blas) and estimates its
 * reciprocal condition number.
 *
 *   a     : matrix indexed a[row][column]; overwritten by the factorisation
 *   n     : order of the matrix
 *   ipvt  : receives the pivot indices produced by dgefa_blas
 *   rcond : receives ynorm / anorm, or 0.0 when the 1-norm of a is zero
 *   z     : work vector with n entries; its previous contents are discarded
 *
 * NOTE(review): the info value returned by dgefa_blas is never inspected
 * here, and the 1.0/vecsum rescalings are not guarded against vecsum == 0.
 */
void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z)
{
double anorm,ek,s,sm,t,vecdot,vecsum,wk,wkm,ynorm;
int i,info,j,k,kb,kp1,l;
/* Compute 1-norm of a */
/* (largest column sum of absolute values, taken before a is factored) */
anorm = 0.0;
for (j = 0; j < n; j++) {
vecsum = 0.0;
for (i = 0;i < n; i++)
vecsum += fabs(a[i][j]);
anorm = maxm(anorm,vecsum);
}
/* Factor. */
dgefa_blas(a,n,ipvt,&info);
/* rcond = 1/(norm(a) * (estimate of norm(inverse(a)))).
* estimate = norm(z)/norm(y), where a*z=y and trans(a)*y=e.
* trans(a) is the transpose of a. The components of e are
* chosen to cause maximum local growth in the elements of
* w, where trans(u)*w=e. The vectors are frequently rescaled
* to avoid overflow.
*/
ek = 1.0;
for (j = 0; j < n; j++)
z[j] = 0.0;
for (k = 0; k < n; k++) {
if (z[k] != 0.0)
ek = signum(ek,-z[k]);
/* shrink z and ek together when the next division by a[k][k] would grow them */
if (fabs(ek-z[k]) > fabs(a[k][k])) {
s = fabs(a[k][k])/fabs(ek-z[k]);
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ek *= s;
}
/* wk and wkm are the two candidates for component k, built from +ek and -ek */
wk = ek - z[k];
wkm = -ek - z[k];
s = fabs(wk);
sm = fabs(wkm);
if (a[k][k] != 0.0) {
wk /= a[k][k];
wkm /= a[k][k];
}
else {
wk = 1.0;
wkm = 1.0;
}
kp1 = k + 1;
if (kp1 < n) {
for (j = kp1; j < n; j++) {
sm += (fabs(z[j] + wkm * a[k][j]));
z[j] += (wk * a[k][j]);
s += fabs(z[j]);
}
/* the -ek candidate gives the larger sum: switch to it and correct z */
if (s < sm) {
t = wkm -wk;
wk = wkm;
for (j = kp1; j < n; j++)
z[j] += (t * a[k][j]);
}
}
z[k] = wk;
}
/* dasum(n,s,z,1) */
vecsum = 0.0;
for (i = 0;i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z) */
for (i = 0; i < n; i++)
z[i] *= s;
/* Solve trans(l)*y= w
*/
for (kb = 0; kb < n; kb++) {
k = n - kb - 1;
if (k < (n-1)) {
/* sdot(n-k,a(k+1,k),1,z(k+1),1) */
vecdot = 0.0;
for (i = k+1;i < n; i++)
vecdot += (a[i][k] * z[i]);
z[k] += vecdot;
}
if (fabs(z[k]) > 1.0) {
s = 1.0/fabs(z[k]);
/* dscal(n,s,z) */
for (i = 0; i < n; i++)
z[i] *= s;
}
/* apply the row interchange recorded for step k */
l = ipvt[k];
t = z[l];
z[l] = z[k];
z[k] = t;
} /* endfor kb */
/* dasum(n,z,1) */
vecsum = 0.0;
for (i = 0; i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z) */
for (i = 0; i < n; i++)
z[i] *= s;
/* from here on ynorm accumulates every scale factor applied to z */
ynorm = 1.0;
/*
* Solve l * v = y
*/
for (k = 0; k < n; k++) {
l = ipvt[k];
t = z[l];
z[l] = z[k];
z[k] = t;
if (k < (n-1)) {
/* daxpy(n-k,t,a[k+1][k],1,z[k+1],1) */
for (i = k+1;i < n; i++)
z[i] += (t * a[i][k]);
}
if (fabs(z[k]) > 1.0) {
s = 1.0/fabs(z[k]);
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
}
}
/* dasum(n,z,1) */
vecsum = 0.0;
for (i = 0; i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
/* Solve u * z = v */
for (kb = 0; kb < n; kb++) {
k = n - kb - 1;
if (fabs(z[k]) > fabs(a[k][k])) {
s = fabs(a[k][k])/fabs(z[k]);
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
}
/* a zero diagonal entry cannot be divided by; the component is set to 1 instead */
if (a[k][k] != 0.0)
z[k] /= a[k][k];
if (a[k][k] == 0.0)
z[k] = 1.0;
t = -z[k];
/* daxpy(k-1,t,a[1][k],1,z[1],1) */
for (i = 0; i < k; i++)
z[i] += (t * a[i][k]);
}
/* Make znorm = 1.0 */
/* dasum(n,z,1) */
vecsum = 0.0;
for (i = 0; i < n; i++)
vecsum += fabs(z[i]);
s = 1.0/vecsum;
/* dscal(n,s,z,1) */
for (i = 0; i < n; i++)
z[i] *= s;
ynorm *= s;
if (anorm != 0.0) *rcond = ynorm/anorm;
if (anorm == 0.0) *rcond = 0.0;
}
/*
 * LU factorisation with partial pivoting of the n x n matrix a
 * (indexed a[row][column]), performed in place.
 *
 *   a    : on return the upper triangle holds U and the strict lower triangle
 *          holds the negated multipliers used during elimination
 *   n    : order of the matrix; nothing is touched when n <= 0
 *   ipvt : ipvt[k] receives the row chosen as pivot in step k
 *   info : set to 0 initially, then to the index k of a column whose pivot
 *          is exactly zero (the last such column wins).
 *          NOTE(review): a zero pivot in column 0 therefore also yields 0,
 *          which cannot be told apart from success.
 *
 * Fixes: the pivot search now tracks the running maximum, so the entry of
 * largest magnitude is chosen (before, the last entry larger than the
 * diagonal won); n <= 0 no longer writes ipvt[n-1]; the unreachable
 * "Singular matrix" message (the pivot is known to be non-zero at that
 * point) was dropped.
 */
void dgefa_blas(double **a,int n, int ipvt[],int *info)
{
    double dmax, t;
    int i, j, k, l;

    *info = 0;
    if (n <= 0)
        return;

    const int last = n - 1;
    for (k = 0; k < last; k++) {
        /* Find l = pivot index: largest magnitude in column k, rows k..n-1. */
        dmax = fabs(a[k][k]);
        l = k;
        for (i = k + 1; i < n; i++) {
            const double magnitude = fabs(a[i][k]);
            if (magnitude > dmax) {
                dmax = magnitude;
                l = i;
            }
        }
        ipvt[k] = l;

        /* Zero pivot implies this column already triangularized. */
        if (a[l][k] == 0.0) {
            *info = k;
            continue;
        }

        /* Interchange if necessary. */
        if (l != k) {
            t = a[l][k];
            a[l][k] = a[k][k];
            a[k][k] = t;
        }

        /* Compute multipliers (stored negated). */
        t = -1.0 / a[k][k];
        for (i = k + 1; i < n; i++)
            a[i][k] *= t;

        /* Row elimination with column indexing. */
        for (j = k + 1; j < n; j++) {
            t = a[l][j];
            if (l != k) {
                a[l][j] = a[k][j];
                a[k][j] = t;
            }
            for (i = k + 1; i < n; i++)
                a[i][j] += (t * a[i][k]);
        }
    }
    ipvt[last] = last;
    if (a[last][last] == 0.0)
        *info = last;
}
/*
 * Solves a linear system with a matrix that was factored beforehand
 * (a holds the factors, ipvt the pivot rows). The solution replaces b.
 *
 *   job == 0 : solve a * x = b
 *   job != 0 : solve trans(a) * x = b
 */
void dgesl_blas(double **a,int n,int ipvt[],double b[],int job)
{
    const int last = n - 1;
    double acc;
    int row, col, piv;

    if (job != 0) {
        /* Transposed system, first stage: trans(u) * y = b. */
        for (col = 0; col < n; col++) {
            acc = 0;
            for (row = 0; row < col; row++)
                acc += (a[row][col] * b[row]);
            b[col] = (b[col] - acc)/a[col][col];
        }
        /* Second stage: trans(l) * x = y, walking the columns backwards
         * from n-2 and undoing the row interchanges on the way. */
        for (col = n - 2; col >= 0; col--) {
            acc = 0;
            for (row = col + 1; row < n; row++)
                acc += (a[row][col] * b[row]);
            b[col] += acc;
            piv = ipvt[col];
            if (piv != col) {
                acc = b[piv];
                b[piv] = b[col];
                b[col] = acc;
            }
        }
        return;
    }

    /* Plain system, first stage: l * y = b (apply interchanges, then the
     * stored multipliers). */
    for (col = 0; col < last; col++) {
        piv = ipvt[col];
        acc = b[piv];
        if (piv != col) {
            b[piv] = b[col];
            b[col] = acc;
        }
        for (row = col + 1; row < n; row++)
            b[row] += (acc * a[row][col]);
    }

    /* Second stage: u * x = y by back substitution. */
    for (col = last; col >= 0; col--) {
        b[col] /= a[col][col];
        acc = -b[col];
        for (row = 0; row < col; row++)
            b[row] += (acc * a[row][col]);
    }
}
There should not be any behavioral differences between a batch size of 100 and a batch size of 1000. (Certainly there would be a performance difference - the batch size of 1000 should probably take longer.)
There are no published limits to the batch size, other than implicit memory limits. In fact, unless the GPU is returning incorrect results, there is no reason to think that you've run into any hard limit anyway.
( If you wanted to explore some behavioral or performance issue, this question is not properly written to address that. )
I have a program that does a lot of single precision math. It produces correct results if I specify 1.0 architecture but is broken for 2.X and 3.X architectures. What would cause this?
Included below:
Very long code sample.
Compile command and good output.
Compile command and bad output.
If I run the same routine on the CPU using gcc, I get results that match the 1.0 architecture.
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
/*
* svdcomp - SVD decomposition routine.
* Takes an mxn matrix a and decomposes it into udv, where u,v are
* left and right orthogonal transformation matrices, and d is a
* diagonal matrix of singular values.
*
* This routine is adapted from svdecomp.c in XLISP-STAT 2.1 which is
* code from Numerical Recipes adapted by Luke Tierney and David Betz.
* Originally from: "Numerical Recipes in C: The Art of Scientific Computing",
* Press, Flannery, Teukolosky, Vetterling. 1992.
*
* Input to dsvd is as follows:
* a = mxn matrix to be decomposed, gets overwritten with u
* m = row dimension of a
* n = column dimension of a
* w = returns the vector of singular values of a
* v = returns the right orthogonal transformation matrix
*/
/* Magnitude of a with the sign of b: +|a| when b >= 0.0f holds, -|a| otherwise. */
#define SIGN(a, b) (!((b) >= 0.0f) ? -fabsf(a) : fabsf(a))
/* Smaller of x and y; arguments may be evaluated twice. */
#define MIN(x,y) ( (y) > (x) ? (x) : (y) )
/* Larger of x and y; arguments may be evaluated twice. */
#define MAX(x,y) ((y)<(x)?(x):(y))
/*
 * PERR(call): evaluates a CUDA runtime call once; when it does not return
 * cudaSuccess, prints file, line, the error text for the returned status and
 * the call itself, then exits with status 1.
 * Wrapped in do { } while (0) so it behaves as one statement after if/else.
 */
#define PERR(call) \
  do {\
    cudaError_t perr_status_ = (call);\
    if (perr_status_ != cudaSuccess) {\
      fprintf(stderr, "%s:%d Error [%s] on "#call"\n", __FILE__, __LINE__,\
          cudaGetErrorString(perr_status_));\
      exit(1);\
    }\
  } while (0)
/*
 * ERRCHECK: reads (and clears) the last CUDA runtime error; when one is
 * pending, prints file, line and the error text, then exits with status 1.
 */
#define ERRCHECK \
  do { \
    cudaError_t errcheck_status_ = cudaGetLastError(); \
    if (errcheck_status_ != cudaSuccess) { \
      fprintf(stderr, "%s:%d Error [%s]\n", __FILE__, __LINE__,\
          cudaGetErrorString(errcheck_status_));\
      exit(1);\
    } \
  } while (0)
/*
 * Singular value decomposition of the m x n matrix a, executed by the
 * calling thread only (no cooperation between threads).
 *
 *   a      : m x n input, row-major (element (r, c) is a[r*n + c]); it is
 *            overwritten. With skip_u == 0 the left transformation is
 *            accumulated into it; with skip_u != 0 that work is skipped and
 *            a only holds intermediate data on return.
 *   m, n   : row and column count. n must not exceed 3, the capacity of the
 *            local work vector rv1.
 *   w      : receives the n singular values
 *   v      : receives the n x n right transformation, row-major
 *   skip_u : non-zero to skip accumulating the left transformation
 *
 * Returns 0 normally and -1 (without touching any argument) when n is larger
 * than the work vector. Before this guard such a call wrote past rv1.
 * NOTE(review): reaching the 30-iteration limit is not reported.
 */
__device__ int
svd(float *a, int m, int n, float *w, float *v, int skip_u)
{
    const int kMaxCols = 3;
    int flag, i, its, j, jj, k, l, nm;
    float c, f, h, s, x, y, z;
    float anorm = 0.0f, g = 0.0f, scale = 0.0f;
    float rv1[kMaxCols];

    /* rv1 is indexed with every column index below n */
    if (n > kMaxCols)
        return -1;

    /* Householder reduction to bidiagonal form */
    for (i = 0; i < n; i++)
    {
        /* left-hand reduction */
        l = i + 1;
        rv1[i] = scale * g;
        g = s = scale = 0.0f;
        if (i < m)
        {
            for (k = i; k < m; k++)
                scale += fabsf(a[k*n+i]);
            if (scale)
            {
                for (k = i; k < m; k++)
                {
                    a[k*n+i] /= scale;
                    s += powf(a[k*n+i], 2);
                }
                f = a[i*n+i];
                g = -SIGN(sqrtf(s), f);
                h = f * g - s;
                a[i*n+i] = f - g;
                if (i != n - 1)
                {
                    for (j = l; j < n; j++)
                    {
                        for (s = 0.0f, k = i; k < m; k++)
                            s += a[k*n+i] * a[k*n+j];
                        f = s / h;
                        for (k = i; k < m; k++)
                            a[k*n+j] += f * a[k*n+i];
                    }
                }
                for (k = i; k < m; k++)
                    a[k*n+i] *= scale;
            }
        }
        w[i] = scale * g;

        /* right-hand reduction */
        g = s = scale = 0.0f;
        if (i < m && i != n - 1)
        {
            for (k = l; k < n; k++)
                scale += fabsf(a[i*n+k]);
            if (scale)
            {
                for (k = l; k < n; k++)
                {
                    a[i*n+k] /= scale;
                    s += powf(a[i*n+k], 2);
                }
                f = a[i*n+l];
                g = -SIGN(sqrtf(s), f);
                h = f * g - s;
                a[i*n+l] = f - g;
                for (k = l; k < n; k++)
                    rv1[k] = a[i*n+k] / h;
                if (i != m - 1)
                {
                    for (j = l; j < m; j++)
                    {
                        for (s = 0.0f, k = l; k < n; k++)
                            s += a[j*n+k] * a[i*n+k];
                        for (k = l; k < n; k++)
                            a[j*n+k] += s * rv1[k];
                    }
                }
                for (k = l; k < n; k++)
                    a[i*n+k] *= scale;
            }
        }
        anorm = MAX(anorm, fabsf(w[i]) + fabsf(rv1[i]));
    }

    /* accumulate the right-hand transformation */
    for (i = n - 1; i >= 0; i--)
    {
        if (i < n - 1)
        {
            if (g)
            {
                for (j = l; j < n; j++)
                    v[j*n+i] = (a[i*n+j] / a[i*n+l]) / g;
                /* float division to avoid underflow */
                for (j = l; j < n; j++)
                {
                    for (s = 0.0f, k = l; k < n; k++)
                        s += a[i*n+k] * v[k*n+j];
                    for (k = l; k < n; k++)
                        v[k*n+j] += s * v[k*n+i];
                }
            }
            for (j = l; j < n; j++)
                v[i*n+j] = v[j*n+i] = 0.0f;
        }
        v[i*n+i] = 1.0f;
        g = rv1[i];
        l = i;
    }

    /* accumulate the left-hand transformation */
    if (!skip_u) {
        for (i = n - 1; i >= 0; i--)
        {
            l = i + 1;
            g = w[i];
            if (i < n - 1)
                for (j = l; j < n; j++)
                    a[i*n+j] = 0.0f;
            if (g)
            {
                g = 1.0f / g;
                if (i != n - 1)
                {
                    for (j = l; j < n; j++)
                    {
                        for (s = 0.0f, k = l; k < m; k++)
                            s += a[k*n+i] * a[k*n+j];
                        f = (s / a[i*n+i]) * g;
                        for (k = i; k < m; k++)
                            a[k*n+j] += f * a[k*n+i];
                    }
                }
                for (j = i; j < m; j++)
                    a[j*n+i] = a[j*n+i]*g;
            }
            else
            {
                for (j = i; j < m; j++)
                    a[j*n+i] = 0.0f;
            }
            ++a[i*n+i];
        }
    }

    /* diagonalize the bidiagonal form */
    for (k = n - 1; k >= 0; k--)
    {                             /* loop over singular values */
        for (its = 0; its < 30; its++)
        {                         /* loop over allowed iterations */
            flag = 1;
            for (l = k; l >= 0; l--)
            {                     /* test for splitting */
                nm = l - 1;
                if (fabsf(rv1[l]) + anorm == anorm)
                {
                    flag = 0;
                    break;
                }
                if (fabsf(w[nm]) + anorm == anorm)
                    break;
            }
            if (flag)
            {
                c = 0.0f;
                s = 1.0f;
                for (i = l; i <= k; i++)
                {
                    f = s * rv1[i];
                    if (fabsf(f) + anorm != anorm)
                    {
                        g = w[i];
                        h = hypotf(f, g);
                        w[i] = h;
                        h = 1.0f / h;
                        c = g * h;
                        s = (- f * h);
                        if (!skip_u) {
                            for (j = 0; j < m; j++)
                            {
                                y = a[j*n+nm];
                                z = a[j*n+i];
                                a[j*n+nm] = y * c + z * s;
                                a[j*n+i] = z * c - y * s;
                            }
                        }
                    }
                }
            }
            z = w[k];
            if (l == k)
            {                     /* convergence */
                if (z < 0.0f)
                {                 /* make singular value nonnegative */
                    w[k] = -z;
                    for (j = 0; j < n; j++)
                        v[j*n+k] = -v[j*n+k];
                }
                break;
            }

            /* shift from bottom 2 x 2 minor */
            x = w[l];
            nm = k - 1;
            y = w[nm];
            g = rv1[nm];
            h = rv1[k];
            f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0f * h * y);
            g = hypotf(f, 1.0f);
            f = ((x - z) * (x + z) + h * ((y / (f + SIGN(g, f))) - h)) / x;

            /* next QR transformation */
            c = s = 1.0f;
            for (j = l; j <= nm; j++)
            {
                i = j + 1;
                g = rv1[i];
                y = w[i];
                h = s * g;
                g = c * g;
                z = hypotf(f, h);
                rv1[j] = z;
                c = f / z;
                s = h / z;
                f = x * c + g * s;
                g = g * c - x * s;
                h = y * s;
                y = y * c;
                for (jj = 0; jj < n; jj++)
                {
                    x = v[jj*n+j];
                    z = v[jj*n+i];
                    v[jj*n+j] = x * c + z * s;
                    v[jj*n+i] = z * c - x * s;
                }
                z = hypotf(f, h);
                w[j] = z;
                if (z)
                {
                    z = 1.0f / z;
                    c = f * z;
                    s = h * z;
                }
                f = (c * g) + (s * y);
                x = (c * y) - (s * g);
                if (!skip_u) {
                    for (jj = 0; jj < m; jj++)
                    {
                        y = a[jj*n+j];
                        z = a[jj*n+i];
                        a[jj*n+j] = y * c + z * s;
                        a[jj*n+i] = z * c - y * s;
                    }
                }
            }
            rv1[l] = 0.0f;
            rv1[k] = f;
            w[k] = x;
        }
    }
    return(0);
}
/*
 * Test kernel: decomposes one fixed 3 x 3 matrix with svd() and writes the
 * 3 x 3 right transformation into v (9 floats). The left transformation is
 * skipped and the singular values stay in a local array.
 * Every thread that runs this kernel does the same work on the same v; the
 * host code in this file launches it as <<<1,1>>>.
 */
__global__ void
svd_kernel(float *v)
{
    /* input matrix, row by row */
    float a[9] = {
        8.0f, 3.0f, 7.0f,
        7.0f, 9.0f, 1.0f,
        3.0f, 7.0f, 2.0f
    };
    float w[3];

    svd(a, 3, 3, w, v, 1);
}
/*
 * Runs svd_kernel once on the device, copies the resulting 3 x 3 matrix back
 * and prints it row by row. Every CUDA call is checked with PERR, which
 * exits with status 1 on failure.
 */
int main()
{
    int i, j;
    float *v_d = NULL, v[9];

    PERR(cudaMalloc(&v_d, 9*sizeof(float)));

    svd_kernel<<<1,1>>>(v_d);
    /* launch errors show up in the last-error state, execution errors at the sync */
    PERR(cudaGetLastError());
    PERR(cudaDeviceSynchronize());

    PERR(cudaMemcpy(v, v_d, 9*sizeof(float), cudaMemcpyDeviceToHost));
    /* the device buffer was never released before */
    PERR(cudaFree(v_d));

    for (i = 0; i < 3; i++) {
        for (j = 0; j < 3; j++) {
            printf("%6.3f\t", v[i*3+j]);
        }
        printf("\n");
    }
    return 0;
}
Correct Results:
$ nvcc -arch=sm_10 -o svd svd.cu
$ ./svd
-0.657 -0.685 0.314
-0.668 0.337 -0.664
-0.349 0.646 0.679
Broken Results:
$ nvcc -arch=sm_20 -o svd svd.cu
$ ./svd
-0.661 -0.660 0.356
-0.642 0.253 -0.724
0.019 0.460 0.888
It seems that CUDA 6 fixes the issue.
<%
    // Emits two <div> blocks; each wraps a table of two rows, and every row
    // holds four cells numbered 1 to 4 (the numbering restarts in each row).
    int apps = 11;
    int noOfDiv = apps % 3, k, m;
    for (int div = 0; div < 2; div++) {
        out.println("<div>");
        out.println("<table>");
        for (int row = 0; row < 2; row++) {
            out.println("<tr>");
            k = 1;
            while (k <= 4) {
                out.println("<td>");
                out.println(String.valueOf(k));
                out.println("</td>");
                k++;
            }
            out.println("</tr>");
        }
        out.println("</table>");
        out.println("</div>");
    }
%>
for this i'm getting output as
1234
1234
in div1
1234
1234
in div2 ,
but i need
1234
5678
in div1 and
9 10 11
in div2 if i have total 11 numbers
You can do that by using following code;
<%
    // Prints the numbers 1..apps as table cells: four cells per row, two rows
    // per <div>. Numbering continues across rows and divs, and the last row
    // or div simply ends when the numbers run out. For apps = 11 this gives
    // 1-4 and 5-8 in the first div and 9-11 in the second.
    // (Previously both divs showed 1-8 and a third div was appended.)
    int apps = 11;
    int noOfDiv = apps % 3, k, m;
    final int cellsPerRow = 4;
    final int rowsPerDiv = 2;
    k = 1; // next number to print
    while (k <= apps) {
        out.println("<div>");
        out.println("<table>");
        for (int row = 0; row < rowsPerDiv && k <= apps; row++) {
            out.println("<tr>");
            for (int cell = 0; cell < cellsPerRow && k <= apps; cell++, k++) {
                out.println("<td>");
                out.println("" + k + "");
                out.println("</td>");
            }
            out.println("</tr>");
        }
        out.println("</table>");
        out.println("</div>");
    }
%>
This happens because you print k for k = 1 to 4 in every row. To correct it, declare an extra variable, say capital K = 1, before any loop, and then replace:
for (k = 1; k <= 4; k++) // print 1 to 4
as:
int noOfDiv = apps % 3, k, m, K = 1; // Added K = 1
// rest of your codes ...
for (k = K; k <= K + 3; k++) {
    // code you already have to print small `k`
}
// Advance the row start only after the whole row is printed. Changing K
// inside the loop would move the loop bound on every pass and never end.
K += 4;