Refactor non-blocking nodejs do..while loop - mysql

I'm writing an api in node.js. The first webservice endpoint - /create - creates a new db entry with a randomised 6-character hash, much like a bit.ly hash.
Having done something similar in PHP, I've written a do..while loop which generates a random string and checks my mysql db (using node-mysql) to make sure it's free. I've also got a counter in there, so I can fail after x iterations if need be.
// Question code: tries up to 10 random 6-character hashes, looking for one
// that is not yet present in the `trips` table.
// NOTE: the db.query() callback runs asynchronously, after the loop has
// already moved on, so `success` is still false each time the `while`
// condition is evaluated and the loop always runs all 10 iterations.
var i = 0;
var alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'];
var hash = null;
var success = false;
do {
// generate a random hash by shuffling the alphabet,
// joining it and getting 6 chars
// (sort() shuffles `alphabet` in place, so the array itself is reordered on every pass)
hash = alphabet.sort(function(){
return 0.5 - Math.random();
}).join('').substr(0,6);
console.log(i + ': checking hash ' + hash);
// see if it exists in the db
// NOTE(review): the hash is concatenated into the SQL without quotes, and
// `err` is never inspected before `results.length` is read.
db.query("SELECT hash FROM trips WHERE hash = " + hash, function(err, results){
if(results.length == 0) {
// the hash is free to use :)
success = true;
} else {
// the hash is already taken :(
success = false;
}
});
// increment the counter
i++;
} while(success === false && i < 10);
I currently only have one hash in my db (abcdef), but the loop is getting to ten and failing because it thinks each new hash is already present.
I'm pretty sure this is because of the non-blocking nature of node.js. This is obviously A Good Thing, but in my case I need the loop to block until the query has returned.
I'm pretty sure I could hack this by doing something like:
var q = db.query(...);
But I know that's throwing away a major feature of node.js.
Is there a code pattern for this sort of need?

I'm pretty sure this is because of the non-blocking nature of node.js.
Yes.
This is obviously A Good Thing, but in my case I need the loop to block until the query has returned.
No, you most certainly don't want to do that.
Embrace the asynchronous approach. Work with call-backs:
/**
 * Generates a random 6-character hash and asynchronously checks that it is
 * not already present in the `trips` table, retrying on a collision.
 *
 * Relies on `alphabet` (array of characters) and `db` (node-mysql style
 * connection) being defined in the enclosing scope.
 *
 * @param {function(string)} onSuccess - called with the first free hash found.
 * @param {function(Error=)} onError - called with no argument when the retry
 *   limit is reached, or with the driver error when the query itself fails.
 * @param {number} retryCount - maximum number of hashes to try.
 */
function generateHash(onSuccess, onError, retryCount) {
    // generate a random hash by shuffling the alphabet,
    // joining it and getting 6 chars (sort() reorders `alphabet` in place)
    var hash = alphabet.sort(function () {
        return 0.5 - Math.random();
    }).join('').substr(0, 6);

    // see if it exists in the db; the `?` placeholder lets the driver
    // quote/escape the value instead of splicing it into the SQL text
    db.query(
        "SELECT hash FROM trips WHERE hash = ?",
        [hash],
        function (err, results) {
            if (err) {
                // a failed query leaves `results` undefined, so report it
                // instead of crashing on `results.length`
                onError(err);
                return;
            }
            if (results.length === 0) {
                // the hash is free to use :)
                onSuccess(hash);
            } else if (retryCount > 1) {
                // the hash is already taken :( -- try another one
                generateHash(onSuccess, onError, retryCount - 1);
            } else {
                onError();
            }
        }
    );
}
// Example usage: log the new hash on success, or a message once all
// 6 attempts (the retryCount argument) have collided.
generateHash(
function(hash) { console.log('Success! New hash created: ' + hash); },
function() { console.log('Error! retry limit reached'); },
6
);

// Number of collisions seen so far. Module-level, so it is shared by (and
// never reset between) separate generateHash() calls.
var i = 0;

/**
 * Picks a random 6-character hash and checks asynchronously that it is not
 * already stored in `trips`, retrying until 10 collisions have occurred.
 *
 * Relies on `alphabet` and `db` being defined in the enclosing scope.
 *
 * @param {function(*, string=)} callback - Node-style callback:
 *   `(null, hash)` on success, `('error')` once the collision limit is hit,
 *   or `(err)` with the driver error when the query fails.
 */
function generateHash(callback) {
    // generate a random hash by shuffling the alphabet,
    // joining it and getting 6 chars
    // (declared with `var` so it no longer leaks as an implicit global)
    var hash = alphabet.sort(function () {
        return 0.5 - Math.random();
    }).join('').substr(0, 6);
    console.log(i + ': checking hash ' + hash);

    // see if it exists in the db; the placeholder makes the driver quote the
    // value (unquoted, `WHERE hash = abcdef` is read as a column reference)
    db.query("SELECT hash FROM trips WHERE hash = ?", [hash], function (err, results) {
        if (err) {
            // `results` is undefined on failure; surface the error instead
            callback(err);
            return;
        }
        if (results.length === 0) {
            // the hash is free to use :)
            callback(null, hash);
            return;
        }
        // increment the counter
        i++;
        if (i < 10) {
            generateHash(callback); // another attempt
        } else {
            callback('error'); // return result
        }
    });
}

Related

Collision probability using this custom id generation code (Node.js)

Am I running an unnecessary risk of creating an id that is not unique? I'm trying to generate a unique, random id of alphanumeric characters. This ID will be used in the primary key for the database record.
// Seed: 16 random bytes rendered as base64, with the current epoch-millisecond
// timestamp appended as decimal digits.
const idSeed: string =
crypto.randomBytes(16).toString('base64') +
'' +
Date.now();
// The seed string is base64-encoded a second time, then every '/', '+' and
// '=' character is stripped from the result.
const orderId: string = Buffer.from(idSeed)
.toString('base64')
.replace(/[\/\+\=]/g, '');
First off, I recommend that you get rid of the .replace(/[\/\+\=]/g, '') as that is losing randomness and, in fact, mapping some unique orderIds that differ only in those characters to be the same.
My recommendation would be to use a base58 encoder base-x that will directly encode to what you want. This encoder library lets you pass in the exact character set you want to use for encoding and it just uses that.
Here's my suggested code you can insert:
const base58Encode = require('base-x')('123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz').encode;
And, then where you create the orderID, change to this:
const idSeed = crypto.randomBytes(16)
const orderId = base58Encode(idSeed);
I don't know about the probability of a dup (you'd need a crypto/statistician for that), but I ran 10,000,000 orderId values without a dup and I repeated that 10 times and still didn't get a dup. Obviously, that doesn't mean it can't happen, but I'm doing this rapid fire too where Date.now() might not even be much different. I couldn't run it more than 10,000,000 times because I run out of memory trying to store all the prior orderId values in a Set object to check for dups. You could increase memory for nodejs and run it with even higher values or put it in a shell script and run it over and over and over again.
Here's my dup checker program if you want to run it yourself over and over:
const crypto = require('crypto');
/**
 * Formats a number (or numeric string) with "," thousands separators.
 * Only the text before the first "." is grouped; the text between the first
 * and second "." (if any) is appended unchanged.
 *
 * @param {*} str - value to format; converted to a string first.
 * @returns {string} the grouped representation, e.g. "1,234,567.89".
 */
function addCommas(str) {
    const [whole, fraction] = ("" + str).split(".");

    // collect groups of up to three characters, walking from the right
    const groups = [];
    for (let end = whole.length; end > 0; end -= 3) {
        groups.unshift(whole.slice(Math.max(0, end - 3), end));
    }

    const grouped = groups.join(",");
    // put decimal part back
    return fraction === undefined ? grouped : grouped + "." + fraction;
}
// Brute-force duplicate check: generate `numToTry` ids, remembering every one
// in a Set, and stop at the first id that has been seen before.
let set = new Set();
const numToTry = 10_000_000;
// print a progress line every `debugMultiple` ids
const debugMultiple = 100_000;
for (let i = 0; i < numToTry; i++) {
if (i !== 0 && i % debugMultiple === 0) {
console.log(`Attempt #${addCommas(i)}`);
}
// same recipe as the question: base64 of 16 random bytes + timestamp,
// base64-encoded again, with '/', '+' and '=' removed
const idSeed = crypto.randomBytes(16).toString('base64') + '' + Date.now();
const orderId = Buffer.from(idSeed).toString('base64').replace(/[\/\+\=]/g, '');
//console.log(orderId);
if (set.has(orderId)) {
console.log(`Found conflict after ${addCommas(i)} attempts`);
console.log(`Conflicting orderId = ${orderId}`);
// exit with a non-zero status on the first duplicate
process.exit(1);
}
set.add(orderId);
}
console.log(`No dups found after ${addCommas(numToTry)} attempts`);
Before spending a lot of time on this, I would investigate your database to see if it will generate a unique key for you that could work as the orderId. This is a common database problem.
Here's a newer version that I was able to run up to 1,000,000,000 ids through. Still no conflicts. Because there's no way I could have a giant Set object of 1,000,000,000 ids in memory, I brainstormed about a number of ways to do it. I thought about using a redis server and storing the ids in there since it can use a lot more memory. But, then I came up with a disk-based solution that can scale as high as you want. Here's the basic idea:
One of your orderId values looks like this:
zz6h6q6oRELJXmh4By4NUw1587006335064
When I generate a new orderId, if I can separate it out into a disk-based "bucket" that contains only ids with the same beginning characters, then I can split all the ids among many different files.
The idea is that if each id that starts with the same two characters is stored in the same file, then no other id in any other file could possibly match the ids in that file.
You can then do your work in two passes. The first pass generates 1,000,000,000 ids and as they are generated, they are written out to an appropriate bucket file based on the characters the id starts with.
After all the ids are generated and written to their appropriate bucket files, the second pass is to iterate through each of the bucket files one at a time, load all the ids into a Set object and see if any conflict. If none match, clear that Set and go onto the next file. This lets you do the memory constrained part (dealing with a Set object) in pieces to use less memory for big numbers of ids.
So, then the question is how to divide the ids up into bucket files? Since each byte in the base64 id value represents up to 64 possible values, if you use just the first two characters of the id to determine the bucket, you will get up to 64*64=4096 buckets. For some reason (which must have to do with how crypto.randomBytes(16) works), I only found ~3800 buckets actually occurred in the actual orderId values.
But, if you split 1,000,000,000 values into 3800 buckets, you get about 263,000 ids per bucket. We already showed that we could easily process 15,000,000 ids in memory before, so this should be more than enough buckets to be able to process each bucket in memory one at a time. In fact, if I were patient enough, we could probably go to 10,000,000,000 with buckets based on just the first two characters.
If you wanted more buckets, they could be based on the first three characters, though then you start getting too many files for a single directory and you have to start splitting files among directories which can be done, but complicates things.
So, I need to create a bucket filename that's based on the first two characters of the id. The ids are case sensitive (base64 uses upper and lower case to represent different values). My Windows file system is case insensitive so I can't just directly use the first two letters as the filename. So, I created a simple algorithm, that takes a two character mixed case prefix and makes it into a four character lowercase name. It maps a lowercase "a" to "a_" and a non-lowercase character like "B" to "bb". So, a lowercase value is followed by a _ and an uppercase value is followed by a second copy of itself. So, you'd have id mappings like this:
"ab" => "a_b_"
"AB" => "aabb"
"aB" => "a_bb"
"Ab" => "aab_"
Non-alpha characters (like numbers) just map to a doubling of themselves just like any non-lowercase characters. So, with this, I can get an id value, grab the first two characters, see what filename it belongs to and append it to that file.
For performance reasons, I created a Bucket class which maintains a cache of ids waiting to be written in memory. When the cache inside a particular bucket gets to a certain length (which I now have set to 3000), I append them all out to the file at once and clear the bucket cache. When I'm done generating all the ids, I iterate through all the buckets and flush out any remaining ids. With this kind of write caching, the generation of ids is mostly CPU bound, not disk bound. Disk utilization runs around 30%. One core of the CPU is pegged during id generation. This could probably be sped up with some WorkerThreads.
So, once all the ids are written to bucket files and nothing is in memory at all, it's time to read through each of the bucket files one at a time, load all their ids into a Set and see if there are any conflicts. Each bucket file is a line separated list of ids that all start with same prefix like this:
zzoexm2FE8DIrHnXpp8qw1587003338798
zzuP6LpusKIMeYrfl0WJnQ1587003338885
zz1itmTqA3yaFNo1KFUhg1587003338897
zz3TEFeqH965OTFCrFTjJQ1587003338904
zz8XQKvq11fCqn9kB4O2A1587003338904
zzaKMTFPct5ls7WW3YmcQ1587003338927
zzyX3htzIqi4zOq4Cxdg1587003338928
zzoHu6vIHMEgNMVY46Qw1587003338962
So, I just read a given bucket file, line by line, check each id against a Set for that bucket file. If it's already in the set, there's a conflict. Output that conflict and abort. If it's not the Set, add it to the Set and continue with the rest of the ids in that bucket file. Since this bucket file contains all the ids that start with the same two characters, no other id in any other bucket file can conflict with these so you can just compare all these ids vs each other.
The reading of the bucket files is heavily disk bound. When running 1,000,000,000 ids into the 3844 bucket files, each bucket file is about 5MB which is 22GB of data. Each file has to be read and parsed into lines and then each id added to the Set.
I tried a couple different mechanisms for reading the files line by line and found them quite slow. I started with the readLine interface which lets you iterate through line by line via a readStream. It was sloooow. Then, I just read the whole file into memory with fs.readFile() into a giant string and then called .split("\n") on it to break it into lines. This was actually better than readLine, but still slow. I theorized that there were just too many copies of the data which meant the garbage collector was having to work a lot.
So, finally I wrote my own version of readFile that reads the entire file into a reusable Buffer and splits it into lines by parsing the binary buffer directly. This saved at least a couple copies of the data along the way and saved a lot of GC work. It wasn't fast, but it was faster. Reusing the buffer also saved me a lot of separate 5MB allocations.
The first pass (generating the ids) is CPU bound. I've theorized I could speed that up quite a bit by starting up a number of Worker Threads (probably like 6 since I have an 8-core CPU) and letting them crunch on generating the ids. I would dole out 1/6 of the quantity to each Worker Thread and when they accumulated 1000 or so, they'd message those 1000 back to the main thread which would insert them in the right buckets. But, before I adventure into using WorkerThreads, I need to do some benchmarking to see how much of the total time of the first pass is in the crypto.randomBytes() function vs. elsewhere to make sure it would be worth it.
The second pass is totally disk bound, but the actual disk throughput is horrible (like 60MB/s). Either my disk really sucks, nodejs isn't very good at this type of file I/O or there's just a lot of overhead in handling 3800 large files (read directory entry, seek to disk for first sector, read as many sequential sectors as you can, seek again, etc...). I could try it on my fastest SSD, but I don't really want to go writing 20GB to my SSD everytime I play with this.
I played with increasing the UV_THREADPOOL_SIZE thinking that maybe nodejs was queuing too many reads/writes. But, performance actually got worse when I increased the thread pool size. I guess its default of 4 is more than enough to keep one disk controller plenty busy. Anything more than that and you're just asking the disk head to jump around between different files when it would be more efficient to read all of one file, then go to the next file and so on.
While the second pass is mostly disk bound, there's still about 30% of the time spent in non-disk related stuff (based on some high-res timers I inserted). So, if it didn't cause too much harm with disk contention, it's possible you could spread the processing of the different bucket files out among a group of WorkerThreads. You would at least get parallelism on the CPU part of that process. You would likely get more disk contention though so I'm not sure if it would help.
Lastly, bucket files could be split among drives and, even ideally among separate SATA controllers. I have plenty of drives and a couple SATA controllers to try that, but then it gets pretty specific to my system.
Here's the code for the bucket system.
// unique-test.js
const crypto = require('crypto');
const readline = require('readline');
const fs = require('fs');
const fsp = fs.promises;
const path = require('path');
const {fastReadFileLines} = require('./fast-read-file.js');
/**
 * Promise-based timer.
 *
 * @param {number} t - delay in milliseconds.
 * @param {*} [v] - value the returned promise resolves with.
 * @returns {Promise<*>} resolves with `v` once the timer fires.
 */
function delay(t, v) {
    const schedule = (resolve) => {
        setTimeout(() => resolve(v), t);
    };
    return new Promise(schedule);
}
/**
 * Formats a number (or numeric string) with "," thousands separators.
 *
 * Only the integer part (the text before the first ".") is grouped; the
 * text between the first and second "." is appended unchanged. A leading
 * "-" is kept outside the grouping so that negative values come out as
 * "-123" / "-1,234" rather than "-,123".
 *
 * @param {*} str - value to format; converted to a string first.
 * @returns {string} the grouped representation, e.g. "1,234,567.89".
 */
function addCommas(str) {
    const text = str + "";
    // peel off the sign so it is not counted as a digit position
    const sign = text.startsWith("-") ? "-" : "";
    const parts = text.slice(sign.length).split(".");
    const main = parts[0];
    const len = main.length;

    let output = "";
    for (let i = len - 1; i >= 0; --i) {
        output = main.charAt(i) + output;
        // a separator goes in front of every third digit, except at the very start
        if ((len - i) % 3 === 0 && i > 0) {
            output = "," + output;
        }
    }
    // put decimal part back
    if (parts.length > 1) {
        output += "." + parts[1];
    }
    return sign + output;
}
/**
 * Builds the bucket filename for an id from its first two characters.
 *
 * Ids are case sensitive but the filename must not depend on case, so each
 * character is expanded to two: a lowercase a-z letter is followed by "_",
 * any other character is doubled. The result is then lowercased
 * ("ab" -> "a_b_", "AB" -> "aabb", "aB" -> "a_bb").
 *
 * @param {string} str - the id.
 * @returns {string} the case-neutral bucket key.
 */
function makeBucketKey(str) {
    const expand = (ch) => {
        const isLowerAlpha = ch >= 'a' && ch <= 'z';
        return ch + (isLowerAlpha ? "_" : ch);
    };
    return Array.from(str.slice(0, 2), expand).join("").toLowerCase();
}
// this value times the number of total buckets has to fit in memory
const bucketCacheMax = 3000;

/**
 * Write-behind cache for one bucket file: ids are collected in memory and
 * appended to `filename` in batches.
 */
class Bucket {
    /**
     * @param {string} filename - file the cached ids are appended to.
     * @param {boolean} [writeToDisk=true] - when false, flush() only discards
     *   the cached ids and nothing is written.
     */
    constructor(filename, writeToDisk = true) {
        this.items = [];
        this.filename = filename;
        this.cnt = 0;
        this.writeToDisk = writeToDisk;
        this.flushPending = false;
        // We dither the bucketCacheMax so that buckets aren't all trying to write at the same time
        // After they write once (and are thus spread out in time), then they will reset to full cache size
        // The dither is +/-[10, bucketCacheMax + 9], so the first threshold can be
        // anywhere from slightly below zero (flush on the first add) to about 2x bucketCacheMax.
        let dither = Math.floor(Math.random() * bucketCacheMax) + 10;
        if (Math.random() > 0.5) {
            dither = -dither;
        }
        this.bucketCacheMax = bucketCacheMax + dither;
    }

    /**
     * Adds an item to the cache, flushing to disk when the cache is full.
     * @param {string} item
     * @returns {Promise<void>} resolves once any triggered flush has finished.
     */
    async add(item) {
        ++this.cnt;
        this.items.push(item);
        if (this.items.length > this.bucketCacheMax) {
            // the dithered cache size is only used on the first write
            // to spread out the writes. After that, we want a full cache size
            this.bucketCacheMax = bucketCacheMax;
            await this.flush();
        }
    }

    /**
     * Writes any cached items to disk, one id per line, and empties the cache.
     * A failed append is retried (11 attempts in total) with a growing delay.
     *
     * @returns {Promise<void>}
     * @throws {Error} if a flush is already in progress (the cached items are
     *   left untouched in that case), or the last append error once the
     *   retries are exhausted.
     */
    async flush() {
        if (!this.writeToDisk || this.items.length === 0) {
            this.items.length = 0;
            return;
        }
        // check BEFORE emptying the cache so a rejected call does not drop data
        if (this.flushPending) {
            throw new Error("Can't call flush() when flush is already in progress");
        }
        const data = this.items.join("\n") + "\n";
        this.items.length = 0;

        // we write to disk with retry because we once got EBUSY (perhaps from a backup program)
        const retryMax = 10;
        const retryDelay = 200;
        const retryBackoff = 200;

        this.flushPending = true;
        try {
            for (let retryCntr = 0; ; retryCntr++) {
                try {
                    await fsp.appendFile(this.filename, data);
                    return;
                } catch (err) {
                    if (retryCntr >= retryMax) {
                        throw err;
                    }
                    console.log("flushNow error, retrying...", err);
                    await delay(retryDelay + (retryCntr * retryBackoff));
                }
            }
        } finally {
            this.flushPending = false;
        }
    }

    /**
     * Removes the bucket file.
     * @returns {Promise<void>}
     */
    delete() {
        return fsp.unlink(this.filename);
    }

    /** Total number of items ever added (not the number currently cached). */
    get size() {
        return this.cnt;
    }
}
/**
 * Set of Bucket objects, keyed by bucket id and created on demand inside
 * one directory.
 */
class BucketCollection {
    /**
     * @param {string} dir - directory the bucket files live in.
     * @param {boolean} [writeToDisk=true] - passed to every Bucket created by
     *   this collection.
     */
    constructor(dir, writeToDisk = true) {
        // map key is bucketID, value is bucket object for that key
        this.buckets = new Map();
        this.dir = dir;
        // remembered here so add() does not depend on a same-named global
        this.writeToDisk = writeToDisk;
    }

    /**
     * Adds `data` to the bucket for `key`, creating the bucket on first use.
     * @param {string} key - bucket id; also used as the file name.
     * @param {string} data - item to store.
     * @returns {Promise<void>} the promise returned by Bucket#add.
     */
    add(key, data) {
        let bucket = this.buckets.get(key);
        if (!bucket) {
            const filename = path.join(this.dir, key);
            bucket = new Bucket(filename, this.writeToDisk);
            this.buckets.set(key, bucket);
        }
        return bucket.add(data);
    }

    /** Flushes every bucket, one after another. */
    async flush() {
        // this could perhaps be sped up by doing 4 at a time instead of serially
        for (const bucket of this.buckets.values()) {
            await bucket.flush();
        }
    }

    /** Deletes all the files associated with the buckets, one after another. */
    async delete() {
        for (const bucket of this.buckets.values()) {
            await bucket.delete();
        }
    }

    /** Number of buckets created so far. */
    get size() {
        return this.buckets.size;
    }

    /**
     * @returns {number} the largest `size` reported by any bucket (0 when empty).
     */
    getMaxBucketSize() {
        let max = 0;
        for (const bucket of this.buckets.values()) {
            max = Math.max(max, bucket.size);
        }
        return max;
    }
}
// program options (defaults; overridden by the command-line arguments below)
let numToTry = 100_000;
let writeToDisk = true;
let cleanupBucketFiles = true;
let skipAnalyze = false;
let analyzeOnly = false;
// Recognized arguments (matched case-insensitively):
// -nodisk       don't write to disk
// -nocleanup    don't erase bucket files when done
// -skipanalyze  generate the bucket files but skip the analysis pass
// -analyzeonly  analyze files in bucket directory only
// Any other argument must consist only of digits and commas and sets
// numToTry; anything else prints an error and exits with status 1.
if (process.argv.length > 2) {
let args = process.argv.slice(2);
for (let arg of args) {
arg = arg.toLowerCase();
switch(arg) {
case "-nodisk":
writeToDisk = false;
break;
case "-nocleanup":
cleanupBucketFiles = false;
break;
case "-skipanalyze":
skipAnalyze = true;
break;
case "-analyzeonly":
analyzeOnly = true;
break;
default:
if (/[^\d,]/.test(arg)) {
console.log(`Unknown argument ${arg}`);
process.exit(1);
} else {
// strip the thousands separators before parsing, e.g. "1,000,000"
numToTry = parseInt(arg.replace(/,/g, ""), 10);
}
}
}
}
// bucket files live in a "buckets" directory next to this script
let bucketDir = path.join(__dirname, "buckets");
let collection = new BucketCollection(bucketDir, writeToDisk);
console.log(`Running ${addCommas(numToTry)} random ids`);
// makeRandoms() prints a progress line every `debugMultiple` ids
const debugMultiple = 100_000;
/**
 * Second pass: reads every bucket file and reports any id that occurs more
 * than once within the same file.
 *
 * When `analyzeOnly` is set, every file found in `bucketDir` is checked;
 * otherwise the files of the buckets in `collection` are checked.
 * Conflicts are logged; processing continues after a conflict.
 *
 * @returns {Promise<void>}
 */
async function analyze() {
    let cntr = 0;
    const cntrProgress = 10;
    const cntrProgressN = 10n;
    // One read buffer shared by all files: fastReadFileLines() hands back the
    // (possibly enlarged) buffer, which is passed in again for the next file.
    let buffer = null;
    // read times (hrtime nanoseconds / 1000) of the files since the last progress line
    const times = [];

    async function processFile(file) {
        if (cntr !== 0 && cntr % cntrProgress === 0) {
            let sum = 0n;
            for (const t of times) {
                sum += t;
            }
            console.log(`Checking bucket #${cntr}, Average readFileTime = ${sum / cntrProgressN}`);
            times.length = 0;
        }
        ++cntr;

        const seen = new Set();
        const startT = process.hrtime.bigint();
        const result = await fastReadFileLines(file, buffer);
        // keep reusing buffer which may have been made larger since last time
        buffer = result.buffer;
        const afterReadFileT = process.hrtime.bigint();

        for (const lineData of result.lines) {
            const line = lineData.trim();
            if (!line) {
                continue;
            }
            if (seen.has(line)) {
                // report the offending id itself, not the whole file contents
                console.log(`Found conflict on ${line}`);
            } else {
                seen.add(line);
            }
        }

        const divisor = 1000n;
        times.push((afterReadFileT - startT) / divisor);
    }

    if (analyzeOnly) {
        const files = await fsp.readdir(bucketDir);
        for (const file of files) {
            await processFile(path.join(bucketDir, file));
        }
    } else {
        for (const bucket of collection.buckets.values()) {
            await processFile(bucket.filename);
        }
    }
}
/**
 * Program entry point.
 *
 * With `analyzeOnly` set, only runs analyze(). Otherwise generates
 * `numToTry` ids, distributes them over the bucket collection, flushes the
 * buckets, then (unless disabled by the options) analyzes and deletes the
 * bucket files.
 *
 * @returns {Promise<*>} the result of analyze() in analyze-only mode,
 *   otherwise undefined.
 */
async function makeRandoms() {
    const startedAt = Date.now();
    if (analyzeOnly) {
        return analyze();
    }

    let generated = 0;
    while (generated < numToTry) {
        const isProgressPoint = generated !== 0 && generated % debugMultiple === 0;
        if (isProgressPoint) {
            console.log(`Attempt #${addCommas(generated)}`);
        }
        // id = base64 of 16 random bytes followed by the current timestamp,
        // with the '/', '+' and '=' characters removed
        const seed = `${crypto.randomBytes(16).toString('base64')}${Date.now()}`;
        const id = seed.replace(/[\/\+\=]/g, '');
        await collection.add(makeBucketKey(id), id);
        generated += 1;
    }

    console.log(`Total buckets: ${collection.size}, Max bucket size: ${collection.getMaxBucketSize()}`);
    await collection.flush();

    const delta = Date.now() - startedAt;
    const perThousand = (delta / numToTry) * 1000;
    console.log(`Run time for creating buckets: ${addCommas(delta)}ms, ${addCommas(perThousand)}ms per thousand`);

    if (!skipAnalyze) {
        console.log("Analyzing buckets...")
        await analyze();
    }
    if (cleanupBucketFiles) {
        console.log("Cleaning up buckets...")
        await collection.delete();
    }
}
// Start the run. Any failure (for example a flush that exhausted its retries)
// is reported and reflected in the exit status instead of surfacing as an
// unhandled promise rejection.
makeRandoms().catch(err => {
    console.error(err);
    process.exitCode = 1;
});
And, here's a dependent file (goes in the same directory) for my faster readfile function:
// fast-read-file.js
const fsp = require('fs').promises;
/**
 * Reads a whole file into a Buffer, reusing the caller's buffer when it is
 * large enough.
 *
 * @param {string} filename - file to read.
 * @param {Buffer|null} [buffer=null] - buffer to reuse; a new one of exactly
 *   the file size is allocated when this is missing or too small.
 * @returns {Promise<{buffer: Buffer, bytesRead: number}>} the buffer that
 *   holds the data (bytes past `bytesRead` are zeroed) and the file length.
 * @throws {Error} "bytesRead not full file size" when the single read call
 *   returns fewer bytes than the file size reported by stat().
 */
async function fastReadFile(filename, buffer = null) {
    const handle = await fsp.open(filename, "r");
    try {
        const { size } = await handle.stat();

        const needsNewBuffer = !buffer || buffer.length < size;
        const target = needsNewBuffer ? Buffer.allocUnsafe(size) : buffer;

        // clear any extra part of the buffer so there's no data leakage
        // from a previous file via the shared buffer
        if (target.length > size) {
            target.fill(0, size);
        }

        const { bytesRead } = await handle.read(target, 0, size, 0);
        if (bytesRead !== size) {
            // no data leaking out
            target.fill(0);
            throw new Error("bytesRead not full file size")
        }
        return { buffer: target, bytesRead };
    } finally {
        // close is started but not awaited; a close failure is only logged
        handle.close().catch((err) => {
            console.log(err);
        });
    }
}
/**
 * Reads a file via fastReadFile() and splits it into lines by scanning the
 * raw bytes for LF, dropping a CR that directly precedes the LF.
 *
 * @param {string} filename - file to read.
 * @param {Buffer|null} [buf=null] - buffer to reuse for the read.
 * @returns {Promise<{buffer: Buffer, lines: string[]}>} the buffer that was
 *   used (for reuse by the caller) and the decoded lines. A final line
 *   without a trailing LF is included.
 */
async function fastReadFileLines(filename, buf = null) {
    const LF = 10;
    const CR = 13;
    const fileData = await fastReadFile(filename, buf);
    const { buffer, bytesRead } = fileData;

    const lines = [];
    let lineStart = 0;
    // the buffer may be larger than the actual file data
    // so we have to limit our extraction of data to only what was in the actual file
    while (lineStart < bytesRead) {
        const lfPos = buffer.indexOf(LF, lineStart);
        if (lfPos === -1) {
            break;
        }
        // look for CR before LF
        const lineEnd = buffer[lfPos - 1] === CR ? lfPos - 1 : lfPos;
        lines.push(buffer.toString('utf8', lineStart, lineEnd));
        lineStart = lfPos + 1;
    }

    // check for data at end of file that doesn't end in LF
    if (lineStart < bytesRead) {
        lines.push(buffer.toString('utf8', lineStart, bytesRead));
    }
    return { buffer, lines };
}
module.exports = {fastReadFile, fastReadFileLines};
// if called directly from command line, run this test function
// A file of ids named "zzzz" must exist in this directory
// It prints the first three and the last three lines that were read.
if (require.main === module) {
// 10MB buffer pre-filled with repeating "abc\n", so stale buffer content
// would show up as extra "abc" lines if it were not cleared before the read
let buffer = Buffer.alloc(1024 * 1024 * 10, "abc\n", "utf8");
fastReadFileLines("zzzz", buffer).then(result => {
let lines = result.lines;
console.log(lines[0]);
console.log(lines[1]);
console.log(lines[2]);
console.log("...");
console.log(lines[lines.length - 3]);
console.log(lines[lines.length - 2]);
console.log(lines[lines.length - 1]);
}).catch(err => {
console.log(err);
});
}
You first create a sub-directory named "buckets" under where you are running this. Then, you run this from the command line:
node unique-test.js 1,000,000,000
There are some supported command lines options (mostly used during debugging):
-nodisk Don't write to disk
-nocleanup Don't cleanup generated disk files when done
-skipAnalyze Just generate bucket files, don't analyze them
-analyzeOnly Use previously generated bucket files and analyze them
The number you pass on the command line is how many ids to generate. If you pass nothing, it defaults to 100,000. For readability, it handles commas.
That's a really superb answer by @jfriend00, I'd just like to add that you can calculate the result analytically, or rather an approximation. I believe using both approaches can be the best route to go.
This is an example of the Birthday Problem.
The TLDR on this is that the approximate probability of collision can be determined using the formula:
1 − exp(−n²/(2x))
Where x is the number of possible values and n is the number of generated values, as long as n is small compared to x (It will be!)
Now, you have approximately 16 bytes of entropy in the generated ids this gives 2^128 or 3.4 x 10^38 possible ids. Since two characters are being dropped (+/), the number of possible values is more like (62^21) = 4.37 x 10^37.
As @jfriend00 has pointed out, the addition of the date means you'd have to generate the number of ids in the table below every millisecond to have the corresponding probability of collision.
This table should give an approximation of the collision probabilities.
|----------------------------|----------------------------|
| Number of Ids | Collision Probability |
|----------------------------|----------------------------|
| 10^6 (1 million) | 1.14 × 10^-26 |
|----------------------------|----------------------------|
| 10^9 (1 billion) | 1.14 × 10^-20 |
|----------------------------|----------------------------|
| 10^12 (1 trillion) | 1.14 × 10^-14 |
|----------------------------|----------------------------|
| 10^15 (1 quadrillion) | 1.14 × 10^-8 |
|----------------------------|----------------------------|
I've used the very handy Wolfram Alpha to calculate these results.

firebase update by batches does not work with large dataset

I want to populate a feed to almost one million of users upon a content posted by a user with high number of followers using GCP cloud functions.
In order to do this, I am designing to split the firebase update of the feed into numbers of small batches. That's because I think if I dont split the update, I might face the following issues:
i) keeping one million of users feed in memory will exceed the allocated maximum 2GB memory.
ii) update one million of entries at one go will not work (How long it takes to update one million entries?)
However, the batch update only works for me when each batch inserts only around 100 entries per update invocation. When I tried with 1000 per batch, only the 1st batch was inserted. I wonder if this is due to:
i) time-out ? however I dont see this error in the log.
ii) The array variable , userFeeds{} , keeping the batch is destroyed when the function is out of scope ?
Below is my code:
var admin = require('firebase-admin');
var spark = require('./spark');
var user = require('./user');
var Promise = require('promise');
var sparkRecord;
// Entry point: loads the spark for `sparkID`, stores it (with its id) in the
// module-level `sparkRecord`, and starts populating the followers' feeds in
// batches of 100.
// NOTE(review): nothing is returned and the promise chain has no rejection
// handler, so the caller cannot tell when (or whether) the fan-out finished
// -- confirm what the hosting runtime expects.
exports.newSpark = function (sparkID) {
var getSparkPromise = spark.getSpark(sparkID);
Promise.all([getSparkPromise]).then(function(result) {
var userSpark = result[0];
// shared with populateFeedsToFollowers() and myCallback() through module scope
sparkRecord = userSpark;
sparkRecord.sparkID = sparkID;
// the batch update only works if the entries per batch is aroud 100 instead of 1000
populateFeedsToFollowers(sparkRecord.uidFrom, 100, null, myCallback);
});
};
// Fetches one batch of followers of `uid` (starting at `startKey`) and writes
// a feed entry for the current `sparkRecord` under each follower.
// When more than `fetchSize` keys come back, `callBack(null, key)` is invoked
// with the key that overflowed the batch so the caller can continue from it.
var populateFeedsToFollowers = function(uid, fetchSize, startKey, callBack){
var fetchCount = 0;
//retrieving only follower list by batch
// NOTE(review): the limit and start key are set on the shared `user` module
// rather than passed to the call -- presumably read by getFollowersByBatch();
// verify against the user module.
user.setFetchLimit(fetchSize);
user.setStartKey(startKey);
//I use this array variable to keep the entries by batch
// (a plain object keyed by database path; it is created once per call and
// never emptied inside the loop below)
var userFeeds = {};
user.getFollowersByBatch(uid).then(function(users){
if(users == null){
// no followers returned: signal "no next key" (myCallback only reads
// the first two arguments)
callBack(null, null, null);
return;
}
//looping thru the followers by batch size
Object.keys(users).forEach(function(userKey) {
fetchCount += 1;
if(fetchCount > fetchSize){
// updating users feed by batch
// NOTE(review): the value returned by update() is discarded, so the
// write is not waited for and a failure would go unnoticed -- confirm.
admin.database().ref().update(userFeeds);
// the overflowing key is handed on as the next start key; it is not
// written in this pass
callBack(null, userKey);
fetchCount = 0;
// this `return` only ends the current forEach callback; iteration
// continues with the next key, adding to the same `userFeeds`
return;
}else{
userFeeds['/userFeed/' + userKey + '/' + sparkRecord.sparkID] = {
phase:sparkRecord.phase,
postTimeIntervalSince1970:sparkRecord.postTimeIntervalSince1970
}
}
});//Object.keys(users).forEach
// write whatever was collected since the counter was last reset
// (userFeeds still holds any entries already sent above)
if(fetchCount > 0){
admin.database().ref().update(userFeeds);
}
});//user.getFollowersByBatch
};
// Continuation passed to populateFeedsToFollowers(): when a next start key is
// reported, requests the following batch (batch size hard-coded to 100, the
// same value newSpark() starts with); a null/undefined key ends the chain.
var myCallback = function(err, nextKey) {
if (err) throw err; // Check for the error and throw if it exists.
if(nextKey != null){ //if having remaining followers, keep populating
populateFeedsToFollowers(sparkRecord.uidFrom, 100, nextKey, myCallback);
}
};

How to pass parameters to mysql query callback in nodejs

I'm trying to figure out the correct way of passing custom data to a query call to be made available in the callback.
I'm using MySQL library in nodejs (all latest versions).
I have a call to connection.query(sql, function(err, result) {...});
I couldn't find a way to 1) pass custom data/parameter to the call so that 2) it can be made available when the callback is invoked.
So what is the proper way of doing so?
I have the following (pseudo-code):
...
for (ix in SomeJSONArray) {
sql = "SELECT (1) FROM someTable WHERE someColumn = " + SomeJSONArray[ix].id;
connection.query(sql, function (err, result) {
...
var y = SomeJSONArray[ix].id;
};
}
From the code above, I need to be able to pass the current value of "ix" used in the query to the callback itself.
How do I do that?
If you are using node-mysql, do it like the docs say:
// Each `?` in the SQL is replaced, in order, by the corresponding entry of
// the array; passing values this way has the library escape them.
connection.query(
'SELECT * FROM table WHERE id=? LIMIT ?, 5',[ user_id, start ],
function (err, results) {
}
);
The docs also have code for proper escaping of strings, but using the array in the query call automatically does the escaping for you.
https://github.com/felixge/node-mysql
To answer the initial question with a complete answer/example to illustrate, wrap the callback with an anonymous function which immediately creates a scope containing a "snapshot" if you will of the data passed in.
var ix=1;
connection.query('SELECT 1',
// Immediately-invoked wrapper: it is called right away with the current
// value of the outer `ix`, and the function it returns is what
// connection.query() receives as its callback.
(function(ix){
// this inner `ix` is the wrapper's own parameter, so later changes to the
// outer variable do not affect what the callback prints
return function(err, rows, fields) {
console.log("ix="+ix);
console.log(rows);
};
})(ix));
For those new to this concept as I was 20 minutes ago, the last })(ix)); is the outer var ix=1 value which is passed into (function(ix){. This could be renamed (function(abc){ if you changed the console.log("ix="+abc);
fwiw (Thanks Chris for the link which filled in the blanks to arrive at a solution)
While it is OK to pass variables or objects to a mysql query callback function using the tactic described earlier -- wrapping the callback function in an anonymous function -- I think it is largely unnecessary, and I'll explain why with an example:
// This actually works as expected!
/**
 * Runs `sql` and, when the query completes, logs the first row together with
 * the outer variable `x`, the argument `y` and the local `y1`.
 *
 * @param {string} sql - statement handed to connection.query().
 * @param {*} y - value to echo from inside the callback.
 */
function run_query (sql, y) {
  // Local copy of the argument, so the log also shows a variable declared
  // inside this call (previously y1 was set to 1 and never read, and the
  // "y1=" line printed y instead).
  var y1 = y;
  connection.query (sql, function (error, rows, fields) {
    if (error)
    {
      console.log ("error = " + error);
      return;
    }
    var r = rows[0];
    console.log ("r = " + r[1]);
    // x is read from the enclosing scope at the time the callback runs.
    console.log ("x = " + x);
    console.log ("y = " + y);
    console.log ("y1= " + y1);
    console.log ("");
  });
}
// Driver: calls run_query twice, incrementing x after each call, and logs x
// at every step so the ordering relative to the callbacks can be observed.
var x = 5;
console.log ("step 1: x = " + x);
// The current value of x is passed as the second argument.
run_query ("SELECT 1", x);
x = x + 1;
console.log ("step 2: x = " + x);
run_query ("SELECT 1", x);
x = x + 1;
console.log ("step 3: x = " + x);
Produces the following output:
step 1: x = 5
step 2: x = 6
step 3: x = 7
r = 1
x = 7
y = 5
y1= 5
r = 1
x = 7
y = 6
y1= 6
The fear is that the second call to run_query() will overwrite the variable y and/or y1 before the first call to run_query() has a chance to invoke its callback function. However, the variables in each instance of the called run_query() function are actually isolated from each other, saving the day.
The MySQL con.query function is overloaded. Inside the callback you can use any variable that is in scope where the callback is defined, such as a global variable or a parameter of the enclosing function. For example:
Example 1: it takes sql string and callback function
// Example 1: query(sql, callback).
var adr = 'Mountain 21';
// The closing quote was missing here, which made the whole snippet a syntax error.
var sql = 'SELECT * FROM customers';
con.query(sql, function (err, result) {
  if (err) throw err;
  // adr is not passed to query(); the callback reads it from the enclosing scope.
  console.log(adr);
});
Example 2: it takes sql string, parameter and callback function
// Example 2: query(sql, values, callback) - adr is supplied as the value for "?".
var adr = 'Mountain 21';
var sql = 'SELECT * FROM customers WHERE address = ?';
con.query(sql, [adr], function onCustomers(err, result) {
  if (err) {
    throw err;
  }
  // adr is also still readable here through the enclosing scope.
  console.log(adr);
});

How can I implement this async feature in Node.js

I have a code to do some calculation.
How can I write this code in an async way?
When querying the database, it seems we cannot get the results synchronously.
So how to implement this kind of feature?
/**
 * Sums calc(entry.formula) over every enumerable key of `obj`.
 *
 * @returns {*} the accumulated sum, starting from 0.
 */
function main () {
  var total = 0;
  var name;
  for (name in obj) {
    total = total + calc(obj[name].formula);
  }
  return total;
}
// Question code: intended to return the value of one formula.
// For formulas of type 'SQL' the value is only available inside the query
// callback; the "result = ?" line below is the asker's placeholder, not valid
// JavaScript.
function calc (formula) {
var result = 0;
if (formula.type === 'SQL') {
var someSql = "select value from x = y"; // this SQL related to the formula;
client.query(someSql, function (err, rows) {
// The queried value is first visible here, inside the callback.
console.log(rows[0].value);
// *How can I get the value here?*
});
result = ? // *How can I return this value to the main function?*
}
else
// Every other formula type is delegated to formulaCalc.
result = formulaCalc(formula); // some other asyn code
return result;
}
It's not possible to return the result of an asynchronous function; a return inside the callback only returns from the callback's own function scope.
The following is not possible either: the result is returned before the callback has run, so it will always be unchanged (null):
// Anti-pattern fragment: result is assigned only inside the callback, while
// "return result" sits directly after the query() call; if the callback runs
// later, the value returned is the one result had before the query finished.
client.query(someSql, function (err, rows) {
result = rows[0].value;
});
return result;
Put a callback in the calc() function as second parameter and call that function in the client.query callback with the result
/**
 * Asks calc for the value of `formula` and logs what it reports back.
 * The result arrives through the callback, not as a return value.
 */
function main() {
  var showResult = function (rows) {
    console.log(rows); // this is the result
  };
  calc(formula, showResult);
}
/**
 * Runs `query` on `client` and hands the resulting rows to `callback`.
 *
 * @param {*} formula - accepted but not read by this function.
 * @param {function} callback - called with the rows as its only argument.
 */
function calc(formula, callback) {
  // NOTE(review): err is not forwarded, so the caller cannot tell a failed
  // query from one that produced no rows.
  function forwardRows(err, rows) {
    callback(rows);
  }
  client.query(query, forwardRows);
}
Now if you want the main to return that result, you also have to put a callback parameter in the main and call that function like before.
I advise you to check out async; it's a great library that saves you from having to deal with this kind of hassle.
Here is a very crude way of implementing a loop to perform a calculation (emulating an asynchronous database call) by using events.
As Brmm alluded, once you go async you have to go async all the way. The code below is just a sample for you to get an idea of what the process in theory should look like. There are several libraries that make handling the sync process for asynch calls much cleaner that you would want to look into as well:
var events = require('events');
// Single emitter used by every step of the example.
var eventEmitter = new events.EventEmitter();
// Running sum of the prices aggregated so far.
var total = 0;
// Number of prices aggregated so far.
var count = 0;
// Keys to price; starts empty.
var keys = [];
// Loop through the items
/**
 * Emits one 'getPriceFromDb' event per key. Each payload carries the key and
 * the total number of keys.
 *
 * Declared as a function, with `key` as a local: the original assigned both
 * names without declaring them, which creates globals in sloppy mode and
 * throws a ReferenceError in strict mode / ES modules.
 *
 * @param {Array} keys - keys to request prices for.
 */
function calculatePrice(keys) {
  for (var i = 0; i < keys.length; i++) {
    var key = keys[i];
    eventEmitter.emit('getPriceFromDb', {key: key, count: keys.length});
  }
}
// Get the price for a single item (from a DB or whatever)
/**
 * Logs the key being fetched and, after a 500 ms timer that stands in for an
 * async DB call, emits 'aggregatePrice' with the computed price.
 *
 * `price` is now a local. The original assigned it without a declaration,
 * leaking a global that aggregatePrice then read back.
 *
 * @param {{key: number, count: number}} data - key to price and total key count.
 */
function getPriceFromDb(data) {
  console.log('fetching price for item: ' + data.key);
  // mimic an async db call
  setTimeout(function() {
    var price = data.key * 10;
    eventEmitter.emit('aggregatePrice', {key: data.key, price: price, count: data.count});
  }, 500);
}
// Aggregate the price and figure out if we are done
/**
 * Adds one price to the running `total`, bumps `count`, and emits 'done'
 * once `count` reaches `data.count`.
 *
 * @param {{key: number, price: number, count: number}} data - one priced item.
 */
function aggregatePrice(data) {
  count++;
  total += data.price;
  // Log the price carried by this event rather than a shared global.
  console.log('price $' + data.price + ' total so far $' + total);
  var areWeDone = (count === data.count);
  if (areWeDone) {
    eventEmitter.emit('done', {key: data.key, total: total});
  }
}
// We are done.
/**
 * Logs the final total.
 *
 * Declared as a function: the original assigned the name without declaring
 * it, which creates a global in sloppy mode and throws in strict mode.
 *
 * @param {{total: number}} data - payload of the 'done' event.
 */
function displayTotal(data) {
  console.log('total $ ' + data.total);
}
// Wire up the events: each handler is registered under the event name that
// the other steps emit.
eventEmitter.on('getPriceFromDb', getPriceFromDb);
eventEmitter.on('aggregatePrice', aggregatePrice);
eventEmitter.on('done', displayTotal);
// Kick off the calculate process over an array of keys (after the handlers
// above have been registered).
keys = [1, 2, 3]
calculatePrice(keys);

Node.JS String encoding issues

I'm writing a sort of interface between a TCP chat server and an SQL server. I'm working on the part where a user submits a value and is assigned a name pulled from the row in this DB that has this value. When querying the DB with a value sent from a telnet shell, I can't get any results from the DB, but I can when I perform the same query in adminer/MySQL # BASH etc...
My thoughts are that it has come down to an encoding issue. Being rather noobish to node, I don't really know what to do. I do have a pretty good experience with JavaScript, just not node.
Code
/**
 * Looks up the username whose `lic` column equals `key` and stores it on
 * `client.name`; ends the client's stream when no row matches or the query
 * fails.
 *
 * @param {object} client - receives `name` on success; `client.stream.end()`
 *   is called on failure. A null client is only logged.
 * @param {*} key - licence key as sent by the client; coerced to a string.
 */
function setCliNameOrKick(client, key){
  // Strip whitespace (including CR/LF), backspace and backslash characters.
  // The original pattern had no [...] around these, so it only matched that
  // exact character sequence and a trailing "\r\n" stayed in the key.
  var cleanKey = String(key).replace(/[\s\b\\]/g, "");
  // The key comes from the network, so it is escaped rather than pasted
  // between quotes.
  // NOTE(review): assumes cli is a node-mysql connection, which provides
  // escape() - confirm against where cli is created.
  var q = "SELECT username FROM webusers WHERE lic = " + cli.escape(cleanKey) + "; --";
  console.log(cleanKey);
  query(q);
  cli.query(q, function cb(e, r, f){
    if(client == null){
      console.log("Was Passed A Null Client!");
      return;
    }
    if(e){
      // Without rows the client cannot be identified; treat it like "no match".
      console.log(e);
      client.stream.end();
      return;
    }
    console.log(r);
    if(r.length >= 1){
      // r is an array of rows; the username lives on the first row.
      client.name = r[0].username;
    }else{
      client.stream.end();
    }
  });
}
That comes from the DB query tool
It takes input from a string sent by the client on connect, alongside an object representing the client
// Handles every chunk received on the client's stream.
stream.addListener("data", function(chunk){
  // Clients that already have a name: forward anything at least two
  // characters long to the chat handler.
  if(client.name != null){
    var message = String(chunk);
    if(message.length >= 2){
      srv.procChat(client, message);
    }
    return;
  }
  // Clients without a name: drop line feeds and accumulate the text in
  // cNameBuff, then hand it over once it is longer than one character.
  var lineFeeds = new RegExp("[\n]+", "g");
  cNameBuff += String(chunk).replace(lineFeeds, "");
  if(cNameBuff.length > 1){ //Min Length
    db.set(client, cNameBuff);
    onAuth(client);
  }
});