firebase update by batches does not work with large dataset - function

I want to populate a feed to almost one million of users upon a content posted by a user with high number of followers using GCP cloud functions.
In order to do this, I am designing to split the firebase update of the feed into numbers of small batches. That's because I think if I dont split the update, I might face the following issues:
i) keeping one million of users feed in memory will exceed the allocated maximum 2GB memory.
ii) update one million of entries at one go will not work (How long it takes to update one million entries?)
However, the batch update only works for me when each batch inserts only around 100 entries per update invocation. When I tried with 1000 per batch, only the 1st batch was inserted. I wonder if this is due to:
i) time-out ? however I dont see this error in the log.
ii) The array variable , userFeeds{} , keeping the batch is destroyed when the function is out of scope ?
Below is my code:
var admin = require('firebase-admin');
var spark = require('./spark');
var user = require('./user');
var Promise = require('promise');
var sparkRecord;
exports.newSpark = function (sparkID) {
var getSparkPromise = spark.getSpark(sparkID);
Promise.all([getSparkPromise]).then(function(result) {
var userSpark = result[0];
sparkRecord = userSpark;
sparkRecord.sparkID = sparkID;
// the batch update only works if the entries per batch is aroud 100 instead of 1000
populateFeedsToFollowers(sparkRecord.uidFrom, 100, null, myCallback);
});
};
/**
 * Writes one batch of feed entries for the followers of `uid`.
 *
 * Followers are fetched through the `user` module (fetch limit and start key
 * are set on it first). Each follower gets an entry under
 * `/userFeed/<followerKey>/<sparkID>` built from the module-level
 * `sparkRecord`. When more than `fetchSize` followers are seen, the batch is
 * written and `callBack(null, followerKey)` is invoked with the key that
 * overflowed the batch so the caller can continue from it.
 *
 * The callback is only invoked after the database write has completed, and
 * write/fetch failures are reported as `callBack(err)` instead of being
 * silently dropped.
 *
 * NOTE(review): assumes `user.getFollowersByBatch` resolves with at most
 * `fetchSize + 1` followers — confirm against the `user` module.
 *
 * @param {string} uid - user whose followers receive the feed entry
 * @param {number} fetchSize - number of followers written per batch
 * @param {?string} startKey - follower key to start from, or null for the first batch
 * @param {function(?Error, ?string=)} callBack - called with the next start key,
 *        with nulls when there are no followers, or with an error
 */
var populateFeedsToFollowers = function(uid, fetchSize, startKey, callBack){
    var fetchCount = 0;
    // Multi-location update payload: path -> feed entry.
    var userFeeds = {};

    // Two-argument then() is used on purpose: an exception thrown by
    // callBack itself must not be re-reported to callBack as a write error.
    var reportFailure = function(err) {
        callBack(err);
    };

    user.setFetchLimit(fetchSize);
    user.setStartKey(startKey);

    user.getFollowersByBatch(uid).then(function(users){
        if (users == null) {
            callBack(null, null, null);
            return;
        }
        Object.keys(users).forEach(function(userKey) {
            fetchCount += 1;
            if (fetchCount > fetchSize) {
                // Batch is full: write it, and only then ask for the next batch.
                admin.database().ref().update(userFeeds).then(function() {
                    callBack(null, userKey);
                }, reportFailure);
                fetchCount = 0;
                return;
            }
            userFeeds['/userFeed/' + userKey + '/' + sparkRecord.sparkID] = {
                phase: sparkRecord.phase,
                postTimeIntervalSince1970: sparkRecord.postTimeIntervalSince1970
            };
        });
        // Final, partially filled batch.
        if (fetchCount > 0) {
            admin.database().ref().update(userFeeds).then(null, reportFailure);
        }
    }, reportFailure);
};
/**
 * Continuation used by populateFeedsToFollowers.
 * Rethrows any error; otherwise, when a next start key is supplied, requests
 * the following batch of 100 followers.
 *
 * @param {?Error} err - error from the previous batch, if any
 * @param {?string} nextKey - follower key to resume from; null/undefined when done
 */
var myCallback = function(err, nextKey) {
    if (err) {
        throw err;
    }
    if (nextKey == null) {
        // No followers left.
        return;
    }
    populateFeedsToFollowers(sparkRecord.uidFrom, 100, nextKey, myCallback);
};

Related

How to fix "Service Documents failed while accessing document" while inserting a lot of data?

This is a follow-up question derived from How to solve error when adding big number of tables
With the code below, I get the following message when inserting 500 tables, but it works fine for 200, for example.
Exception: Service Documents failed while accessing document with id
The error happens on line 22, inside the if block: body = DocumentApp.getActiveDocument().getBody();
You also have the table template id to try, but here is an image
Image Table Template
/**
 * Opens the template document by its hard-coded ID and returns a copy of
 * the body child at index 1.
 *
 * @return {Object} copy of the template body's child element at index 1
 */
function RequirementTemplate_Copy() {
  var templateBody = DocumentApp
    .openById("1oJt02MfOIQPFptdWCwDpj5j-zFdO_Wrq-I48mUq9I-w")
    .getBody();
  var templateElement = templateBody.getChild(1);
  return templateElement.copy();
}
/**
 * Appends 500 copies of the requirement table template to the active
 * document, replacing "#Title#" in each copy with its 1-based index.
 * After every 100 tables the document is saved and closed and the body is
 * fetched again.
 *
 * All working variables are local; the original leaked `reqTableItem` and
 * `table` as implicit globals.
 */
function insertSpecification_withSection(){
  var TABLE_COUNT = 500;
  var SAVE_INTERVAL = 100;

  // Table template copied from another document.
  var reqTableItem = RequirementTemplate_Copy();
  var body = DocumentApp.getActiveDocument().getBody();

  for (var i = 1; i <= TABLE_COUNT; i++){
    var table = reqTableItem.copy().replaceText("#Title#", String(i));
    body.appendTable(table);
    if ((i % SAVE_INTERVAL) === 0) {
      // Flush pending changes, then re-acquire the body of the active document.
      DocumentApp.getActiveDocument().saveAndClose();
      body = DocumentApp.getActiveDocument().getBody();
    }
  }
}
It looks that the error message isn't related to the number of tables to be inserted because it occurs before adding the tables.
Just wait a bit and try again. If the problem persists, try your code using a different account; if the code runs on the second account, it's very possible that your first account exceeded a limit... there are some limits to prevent abuse that aren't published and that might change without any announcement.
Using the fix suggested for the code from my answer to the previous question and changing the number for iteration limit to 1000 and 2000 works fine
The following screenshot shows the result for 1000
Here is the code used for the tests
/**
 * Timing test: appends 2000 copies of the requirement table template to the
 * active document and logs the elapsed time in milliseconds.
 *
 * Unlike the question's version, the document is not saved and closed
 * during the loop. All working variables are local; the original leaked
 * them as implicit globals.
 */
function insertSpecification_withSection(){
  var TABLE_COUNT = 2000;
  var startTime = new Date();
  console.log("Starting Function... ");

  // Table template copied from another document.
  var reqTableItem = RequirementTemplate_Copy();
  var body = DocumentApp.getActiveDocument().getBody();

  for (var i = 0; i < TABLE_COUNT; i++){
    body.appendTable(reqTableItem.copy());
  }

  var timeDiff = new Date() - startTime;
  console.log("Ending Function..."+ timeDiff + " ms");
}
/**
 * Opens the template document whose ID is stored in the 'ReqTableID'
 * document property and returns a copy of the body child at index 1.
 *
 * If the document cannot be opened, the user is alerted and the original
 * error is rethrown. The earlier code fell through and failed with an
 * unrelated TypeError on the undefined document.
 *
 * @return {Object} copy of the template body's child element at index 1
 * @throws the error raised by DocumentApp.openById when the template cannot be opened
 */
function RequirementTemplate_Copy() {
  var ReqTableID = PropertiesService.getDocumentProperties().getProperty('ReqTableID');
  var templatedoc;
  try {
    templatedoc = DocumentApp.openById(ReqTableID);
  } catch (error) {
    DocumentApp.getUi().alert("Could not find the document. Confirm it was not deleted and that anyone have read access with the link.");
    throw error;
  }
  // Children are read from the document body, matching the other
  // RequirementTemplate_Copy implementation in this thread.
  return templatedoc.getBody().getChild(1).copy();
}
/**
 * Stores the template document ID in the 'ReqTableID' document property.
 */
function setReqTableID(){
  var documentProperties = PropertiesService.getDocumentProperties();
  documentProperties.setProperty('ReqTableID', '1NS9nOb3qEBrqkcAQ3H83OhTJ4fxeySOQx7yM4vKSFu0');
}

Collision probability using this custom id generation code (Node.js)

Am I running an unnecessary risk of creating an id that is not unique? I'm trying to generate a unique, random id of alphanumeric characters. This ID will be used in the primary key for the database record.
// Seed: 16 random bytes rendered as base64, followed by the current
// millisecond timestamp from Date.now().
const idSeed: string =
crypto.randomBytes(16).toString('base64') +
'' +
Date.now();
// The seed string is base64-encoded a second time, then every '/', '+' and
// '=' character is removed so that only letters and digits remain.
const orderId: string = Buffer.from(idSeed)
.toString('base64')
.replace(/[\/\+\=]/g, '');
First off, I recommend that you get rid of the .replace(/[\/\+\=]/g, '') as that is losing randomness and, in fact, mapping some unique orderIds that differ only in those characters to be the same.
My recommendation would be to use a base58 encoder base-x that will directly encode to what you want. This encoder library lets you pass in the exact character set you want to use for encoding and it just uses that.
Here's my suggested code you can insert:
const base58Encode = require('base-x')('123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz').encode;
And, then where you create the orderID, change to this:
// 16 random bytes, passed as a Buffer straight to the base58 encoder
// (no timestamp suffix and no character stripping in this variant).
const idSeed = crypto.randomBytes(16)
const orderId = base58Encode(idSeed);
I don't know about the probability of a dup (you'd need a crypto/statistician for that), but I ran 10,000,000 orderId values without a dup and I repeated that 10 times and still didn't get a dup. Obviously, that doesn't mean it can't happen, but I'm doing this rapid fire too where Date.now() might not even be much different. I couldn't run it more than 10,000,000 times because I run out of memory trying to store all the prior orderId values in a Set object to check for dups. You could increase memory for nodejs and run it with even higher values or put it in a shell script and run it over and over and over again.
Here's my dup checker program if you want to run it yourself over and over:
const crypto = require('crypto');
/**
 * Formats a number (or numeric string) with commas between groups of three
 * characters in the integer part, e.g. 1234567.5 -> "1,234,567.5".
 *
 * A leading minus sign is kept out of the grouping, so -123 yields "-123";
 * the previous implementation produced "-,123".
 *
 * @param {number|string} str - value to format; converted with `str + ""`
 * @returns {string} the value with thousands separators; the text after the
 *          first "." is appended unchanged
 */
function addCommas(str) {
    const parts = (str + "").split(".");
    const main = parts[0];
    const sign = main.startsWith("-") ? "-" : "";
    const digits = sign ? main.slice(1) : main;

    // Collect three-character groups from the right-hand end.
    const groups = [];
    for (let end = digits.length; end > 0; end -= 3) {
        groups.unshift(digits.slice(Math.max(0, end - 3), end));
    }

    let output = sign + groups.join(",");
    // put decimal part back
    if (parts.length > 1) {
        output += "." + parts[1];
    }
    return output;
}
// Brute-force duplicate check: generate numToTry ids with the question's
// algorithm and keep every one of them in an in-memory Set.
let set = new Set();
const numToTry = 10_000_000;
// Progress is logged once every debugMultiple attempts.
const debugMultiple = 100_000;
for (let i = 0; i < numToTry; i++) {
if (i !== 0 && i % debugMultiple === 0) {
console.log(`Attempt #${addCommas(i)}`);
}
// Same id construction as in the question: base64 of 16 random bytes plus
// Date.now(), base64-encoded again, with '/', '+' and '=' removed.
const idSeed = crypto.randomBytes(16).toString('base64') + '' + Date.now();
const orderId = Buffer.from(idSeed).toString('base64').replace(/[\/\+\=]/g, '');
//console.log(orderId);
// An id that is already in the set is a collision: report it and exit
// with status 1.
if (set.has(orderId)) {
console.log(`Found conflict after ${addCommas(i)} attempts`);
console.log(`Conflicting orderId = ${orderId}`);
process.exit(1);
}
set.add(orderId);
}
console.log(`No dups found after ${addCommas(numToTry)} attempts`);
Before spending a lot of time on this, I would investigate your database to see if it will generate a unique key for you that could work as the orderId. This is a common database problem.
Here's a newer version that I was able to run up to 1,000,000,000 ids through. Still no conflicts. Because there's no way I could have a giant Set object of 1,000,000,000 ids in memory, I brainstormed about a number of ways to do it. I thought about using a redis server and storing the ids in there since it can use a lot more memory. But, then I came up with a disk-based solution that can scale as high as you want. Here's the basic idea:
One of your orderId values looks like this:
zz6h6q6oRELJXmh4By4NUw1587006335064
When I generate a new orderId, if I can separate it out into a disk-based "bucket" that contains only ids with the same beginning characters, then I can split all the ids among many different files.
The idea is that if each id that starts with the same two characters is stored in the same file, then no other id in any other file could possibly match the ids in that file.
You can then do your work in two passes. The first pass generates 1,000,000,000 ids and as they are generated, they are written out to an appropriate bucket file based on the characters the id starts with.
After all the ids are generated and written to their appropriate bucket files, the second pass is to iterate through each of the bucket files one at a time, load all the ids into a Set object and see if any conflict. If none match, clear that Set and go onto the next file. This lets you do the memory constrained part (dealing with a Set object) in pieces to use less memory for big numbers of ids.
So, then the question is how to divide the ids up into bucket files? Since each byte in the base64 id value represents up to 64 possible values, if you use just the first two characters of the id to determine the bucket, you will get up to 64*64=4096 buckets. For some reason (which must have to do with how crypto.randomBytes(16) works), I only found ~3800 buckets actually occurred in the actual orderId values.
But, if you split 1,000,000,000 values into 3800 buckets, you get about 263,000 ids per bucket. We already showed that we could easily process 15,000,000 ids in memory before, so this should be more than enough buckets to be able to process each bucket in memory one at a time. In fact, if I were patient enough, we could probably go to 10,000,000,000 with buckets based on just the first two characters.
If you wanted more buckets, they could be based on the first three characters, though then you start getting too many files for a single directory and you have to start splitting files among directories which can be done, but complicates things.
So, I need to create a bucket filename that's based on the first two characters of the id. The ids are case sensitive (base64 uses upper and lower case to represent different values). My Windows file system is case insensitive so I can't just directly use the first two letters as the filename. So, I created a simple algorithm that takes a two character mixed case prefix and makes it into a four character lowercase name. It maps a lowercase "a" to "a_" and a non-lowercase character like "B" to "bb". So, a lowercase value is followed by a _ and an uppercase value is followed by a second copy of itself (and the result is lowercased). So, you'd have id mappings like this:
"ab" => "a_b_"
"AB" => "aabb"
"aB" => "a_bb"
"Ab" => "aab_"
Non-alpha characters (like numbers) just map to a doubling of themselves just like any non-lowercase characters. So, with this, I can get an id value, grab the first two characters, see what filename it belongs to and append it to that file.
For performance reasons, I created a Bucket class which maintains a cache of ids waiting to be written in memory. When the cache inside a particular bucket gets to a certain length (which I now have set to 3000), I append them all out to the file at once and clear the bucket cache. When I'm done generating all the ids, I iterate through all the buckets and flush out any remaining ids. With this kind of write caching, the generation of ids is mostly CPU bound, not disk bound. Disk utilization runs around 30%. One core of the CPU is pegged during id generation. This could probably be sped up with some WorkerThreads.
So, once all the ids are written to bucket files and nothing is in memory at all, it's time to read through each of the bucket files one at a time, load all their ids into a Set and see if there are any conflicts. Each bucket file is a line separated list of ids that all start with same prefix like this:
zzoexm2FE8DIrHnXpp8qw1587003338798
zzuP6LpusKIMeYrfl0WJnQ1587003338885
zz1itmTqA3yaFNo1KFUhg1587003338897
zz3TEFeqH965OTFCrFTjJQ1587003338904
zz8XQKvq11fCqn9kB4O2A1587003338904
zzaKMTFPct5ls7WW3YmcQ1587003338927
zzyX3htzIqi4zOq4Cxdg1587003338928
zzoHu6vIHMEgNMVY46Qw1587003338962
So, I just read a given bucket file, line by line, check each id against a Set for that bucket file. If it's already in the set, there's a conflict. Output that conflict and abort. If it's not the Set, add it to the Set and continue with the rest of the ids in that bucket file. Since this bucket file contains all the ids that start with the same two characters, no other id in any other bucket file can conflict with these so you can just compare all these ids vs each other.
The reading of the bucket files is heavily disk bound. When running 1,000,000,000 ids into the 3844 bucket files, each bucket file is about 5MB which is 22GB of data. Each file has to be read and parsed into lines and then each id added to the Set.
I tried a couple different mechanisms for reading the files line by line and found them quite slow. I started with the readLine interface which lets you iterate through line by line via a readStream. It was sloooow. Then, I just read the whole file into memory with fs.readFile() into a giant string and then called .split("\n") on it to break it into lines. This was actually better than readLine, but still slow. I theorized that there were just too many copies of the data which meant the garbage collector was having to work at lot.
So, finally I wrote my own version of readFile that reads the entire file into a reusable Buffer and splits it into lines by parsing the binary buffer directly. This saved at least a couple copies of the data along the way and saved a lot of GC work. It wasn't fast, but it was faster. Reusing the buffer also saved me a lot of separate 5MB allocations.
The first pass (generating the ids) is CPU bound. I've theorized I could speed that up quite a bit by starting up a number of Worker Threads (probably like 6 since I have an 8-core CPU) and letting them crunch on generating the ids. I would dole out 1/6 of the quantity to each Worker Thread and when they accumulated 1000 or so, they'd message those 1000 back to the main thread which would insert them in the right buckets. But, before I adventure into using WorkerThreads, I need to do some benchmarking to see how much of the total time of the first pass is in the crypto.randomBytes() function vs. elsewhere to make sure it would be worth it.
The second pass is totally disk bound, but the actual disk throughput is horrible (like 60MB/s). Either my disk really sucks, nodejs isn't very good at this type of file I/O or there's just a lot of overhead in handling 3800 large files (read directory entry, seek to disk for first sector, read as many sequential sectors as you can, seek again, etc...). I could try it on my fastest SSD, but I don't really want to go writing 20GB to my SSD every time I play with this.
I played with increasing the UV_THREADPOOL_SIZE thinking that maybe nodejs was queuing too many reads/writes. But, performance actually got worse when I increased the thread pool size. I guess its default of 4 is more than enough to keep one disk controller plenty busy. Anything more than that and you're just asking the disk head to jump around between different files when it would be more efficient to read all of one file, then go to the next file and so on.
While the second pass is mostly disk bound, there's still about 30% of the time spent in non-disk related stuff (based on some high-res timers I inserted). So, if it didn't cause too much harm with disk contention, it's possible you could spread the processing of the different bucket files out among a group of WorkerThreads. You would at least get parallelism on the CPU part of that process. You would likely get more disk contention though so I'm not sure if it would help.
Lastly, bucket files could be split among drives and, even ideally among separate SATA controllers. I have plenty of drives and a couple SATA controllers to try that, but then it gets pretty specific to my system.
Here's the code for the bucket system.
// unique-test.js
const crypto = require('crypto');
const readline = require('readline');
const fs = require('fs');
const fsp = fs.promises;
const path = require('path');
const {fastReadFileLines} = require('./fast-read-file.js');
/**
 * Returns a promise that resolves with `v` after `t` milliseconds.
 *
 * @param {number} t - delay in milliseconds
 * @param {*} [v] - value to resolve with
 * @returns {Promise<*>}
 */
function delay(t, v) {
    return new Promise(function (resolve) {
        setTimeout(function () {
            resolve(v);
        }, t);
    });
}
/**
 * Formats a number (or numeric string) with commas between groups of three
 * characters in the integer part, e.g. 1234567.5 -> "1,234,567.5".
 *
 * A leading minus sign is kept out of the grouping, so -123 yields "-123";
 * the previous implementation produced "-,123".
 *
 * @param {number|string} str - value to format; converted with `str + ""`
 * @returns {string} the value with thousands separators; the text after the
 *          first "." is appended unchanged
 */
function addCommas(str) {
    const parts = (str + "").split(".");
    const main = parts[0];
    const sign = main.startsWith("-") ? "-" : "";
    const digits = sign ? main.slice(1) : main;

    // Collect three-character groups from the right-hand end.
    const groups = [];
    for (let end = digits.length; end > 0; end -= 3) {
        groups.unshift(digits.slice(Math.max(0, end - 3), end));
    }

    let output = sign + groups.join(",");
    // put decimal part back
    if (parts.length > 1) {
        output += "." + parts[1];
    }
    return output;
}
// make a unique filename using first several letters of
// the string. Strings are case sensitive, bucket filenames
// cannot be so it has to be case neutralized while retaining
// uniqueness
function makeBucketKey(str) {
let piece = str.substr(0,2);
let filename = [];
// double up each character, but
for (let ch of piece) {
filename.push(ch);
if (ch >= 'a' && ch <= 'z') {
filename.push("_")
} else {
filename.push(ch);
}
}
return filename.join("").toLowerCase();
}
// this value times the number of total buckets has to fit in memory
const bucketCacheMax = 3000;

/**
 * One bucket of ids: an in-memory cache that is appended to a file once the
 * cache grows past a threshold.
 */
class Bucket {
    /**
     * @param {string} filename - file the cached items are appended to
     * @param {boolean} [writeToDisk=true] - when false, flushing just discards the cache
     */
    constructor(filename, writeToDisk = true) {
        this.items = [];
        this.filename = filename;
        this.cnt = 0;
        this.writeToDisk = writeToDisk;
        this.flushPending = false;
        // We dither the bucketCacheMax so that buckets aren't all trying to write at the same time
        // After they write once (and are thus spread out in time), then they will reset to full cache size
        // Note: the dither can exceed bucketCacheMax in magnitude, in which
        // case the first add() flushes immediately.
        let dither = Math.floor(Math.random() * bucketCacheMax) + 10;
        if (Math.random() > 0.5) {
            dither = -dither;
        }
        this.bucketCacheMax = bucketCacheMax + dither;
    }

    /**
     * Adds an item to the cache and flushes to disk when the cache is over
     * its threshold.
     *
     * @param {string} item
     * @returns {Promise<void>}
     */
    async add(item) {
        ++this.cnt;
        this.items.push(item);
        if (this.items.length > this.bucketCacheMax) {
            // the dithered cache size is only used on the first write
            // to spread out the writes. After that, we want a full cache size
            this.bucketCacheMax = bucketCacheMax;
            await this.flush();
        }
    }

    /**
     * Writes any cached items to disk (one per line) and empties the cache.
     * A failed append is retried with a linearly growing delay; once the
     * retries are used up, the last error is thrown.
     *
     * @returns {Promise<void>}
     * @throws {Error} if a flush is already in progress (the cache is left
     *         untouched in that case), or the last write error after all retries
     */
    async flush() {
        if (!this.writeToDisk || !this.items.length) {
            this.items.length = 0;
            return;
        }
        // Check before touching the cache so a rejected call loses no data.
        if (this.flushPending) {
            throw new Error("Can't call flush() when flush is already in progress");
        }
        const data = this.items.join("\n") + "\n";
        this.items.length = 0;

        // we write to disk with retry because we once got EBUSY (perhaps from a backup program)
        const retryMax = 10;
        const retryDelay = 200;
        const retryBackoff = 200;
        let retryCntr = 0;
        let lastErr;
        while (retryCntr <= retryMax) {
            this.flushPending = true;
            try {
                await fsp.appendFile(this.filename, data);
                return;
            } catch (err) {
                lastErr = err;
            } finally {
                this.flushPending = false;
            }
            console.log("flushNow error, retrying...", lastErr);
            await delay(retryDelay + (retryCntr++ * retryBackoff));
        }
        throw lastErr;
    }

    /**
     * Deletes this bucket's file.
     * @returns {Promise<void>}
     */
    delete() {
        return fsp.unlink(this.filename);
    }

    /** Total number of items ever added to this bucket. */
    get size() {
        return this.cnt;
    }
}
/**
 * A set of Bucket objects keyed by bucket ID, all stored as files in one
 * directory.
 */
class BucketCollection {
    /**
     * @param {string} dir - directory that holds the bucket files
     * @param {boolean} [writeToDisk=true] - passed on to every Bucket created
     *        by this collection (previously ignored in favour of a
     *        module-level variable of the same name)
     */
    constructor(dir, writeToDisk = true) {
        // map key is bucketID, value is bucket object for that key
        this.buckets = new Map();
        this.dir = dir;
        this.writeToDisk = writeToDisk;
    }

    /**
     * Adds `data` to the bucket for `key`, creating the bucket on first use.
     *
     * @param {string} key - bucket ID, also used as the file name inside `dir`
     * @param {string} data - item to store
     * @returns {Promise<void>} the promise returned by Bucket#add
     */
    add(key, data) {
        let bucket = this.buckets.get(key);
        if (!bucket) {
            const filename = path.join(this.dir, key);
            bucket = new Bucket(filename, this.writeToDisk);
            this.buckets.set(key, bucket);
        }
        return bucket.add(data);
    }

    /** Flushes every bucket, one after another. */
    async flush() {
        // this could perhaps be sped up by doing 4 at a time instead of serially
        for (const bucket of this.buckets.values()) {
            await bucket.flush();
        }
    }

    /** Deletes all the files associated with the buckets, one after another. */
    async delete() {
        for (const bucket of this.buckets.values()) {
            await bucket.delete();
        }
    }

    /** Number of buckets created so far. */
    get size() {
        return this.buckets.size;
    }

    /**
     * @returns {number} the largest `size` among all buckets, or 0 when there are none
     */
    getMaxBucketSize() {
        let max = 0;
        for (const bucket of this.buckets.values()) {
            max = Math.max(max, bucket.size);
        }
        return max;
    }
}
// program options (defaults; overridden by the command line below)
let numToTry = 100_000;
let writeToDisk = true;
let cleanupBucketFiles = true;
let skipAnalyze = false;
let analyzeOnly = false;
// Command line arguments (matched case-insensitively):
// -nodisk       don't write to disk
// -nocleanup    don't erase bucket files when done
// -skipanalyze  set skipAnalyze (makeRandoms then skips the analyze step)
// -analyzeonly  analyze files in bucket directory only
// Any other argument made up only of digits and commas is the number of
// ids to generate; anything else is reported and the process exits with 1.
if (process.argv.length > 2) {
let args = process.argv.slice(2);
for (let arg of args) {
arg = arg.toLowerCase();
switch(arg) {
case "-nodisk":
writeToDisk = false;
break;
case "-nocleanup":
cleanupBucketFiles = false;
break;
case "-skipanalyze":
skipAnalyze = true;
break;
case "-analyzeonly":
analyzeOnly = true;
break;
default:
if (/[^\d,]/.test(arg)) {
console.log(`Unknown argument ${arg}`);
process.exit(1);
} else {
// commas are stripped so "1,000,000" is accepted
numToTry = parseInt(arg.replace(/,/g, ""), 10);
}
}
}
}
// Bucket files live in a "buckets" directory next to this script.
let bucketDir = path.join(__dirname, "buckets");
let collection = new BucketCollection(bucketDir, writeToDisk);
console.log(`Running ${addCommas(numToTry)} random ids`);
// Progress is logged once every debugMultiple generated ids.
const debugMultiple = 100_000;
/**
 * Second pass: reads every bucket file and checks its ids for duplicates.
 *
 * With `analyzeOnly` set, every file found in `bucketDir` is processed;
 * otherwise the files of the buckets in `collection` are processed. Files
 * are handled one at a time and each gets its own Set. A duplicate is
 * logged as "Found conflict on <id>"; processing then continues.
 *
 * The read buffer returned by fastReadFileLines is passed to the next
 * call, so one buffer is reused across files. The earlier code shadowed it
 * with a fresh `null` per file, and logged the whole line array instead of
 * the conflicting id.
 *
 * @returns {Promise<void>}
 */
async function analyze() {
    const cntrProgress = 10;
    let cntr = 0;
    // Shared across files; may be replaced by a larger buffer along the way.
    let buffer = null;
    // Read times (hrtime nanoseconds / 1000) since the last progress line.
    const times = [];

    async function processFile(file) {
        if (cntr !== 0 && cntr % cntrProgress === 0) {
            let sum = 0n;
            for (const t of times) {
                sum += t;
            }
            console.log(`Checking bucket #${cntr}, Average readFileTime = ${sum / BigInt(times.length)}`);
            times.length = 0;
        }
        ++cntr;

        const set = new Set();
        const startT = process.hrtime.bigint();
        const result = await fastReadFileLines(file, buffer);
        // keep reusing buffer which may have been made larger since last time
        buffer = result.buffer;
        const afterReadFileT = process.hrtime.bigint();

        for (const lineData of result.lines) {
            const line = lineData.trim();
            if (!line) {
                continue;
            }
            if (set.has(line)) {
                console.log(`Found conflict on ${line}`);
            } else {
                set.add(line);
            }
        }

        const divisor = 1000n;
        times.push((afterReadFileT - startT) / divisor);
    }

    if (analyzeOnly) {
        const files = await fsp.readdir(bucketDir);
        for (const file of files) {
            await processFile(path.join(bucketDir, file));
        }
    } else {
        for (const bucket of collection.buckets.values()) {
            await processFile(bucket.filename);
        }
    }
}
/**
 * First pass: generates `numToTry` ids and distributes them into bucket
 * files, then (unless disabled by the program options) analyzes the buckets
 * and deletes the bucket files.
 *
 * With `analyzeOnly` set, nothing is generated and only analyze() runs.
 *
 * @returns {Promise<void>}
 */
async function makeRandoms() {
    const startedAt = Date.now();
    if (analyzeOnly) {
        return analyze();
    }

    for (let attempt = 0; attempt < numToTry; attempt++) {
        const isProgressTick = attempt !== 0 && attempt % debugMultiple === 0;
        if (isProgressTick) {
            console.log(`Attempt #${addCommas(attempt)}`);
        }
        // base64 of 16 random bytes followed by the millisecond timestamp
        const randomPart = crypto.randomBytes(16).toString('base64');
        const idSeed = randomPart + '' + Date.now();
        // idSeed is already a string, so the original .toString('base64')
        // call on it was a no-op; only the character stripping has an effect.
        const orderId = idSeed.replace(/[\/\+\=]/g, '');
        await collection.add(makeBucketKey(orderId), orderId);
    }

    console.log(`Total buckets: ${collection.size}, Max bucket size: ${collection.getMaxBucketSize()}`);
    await collection.flush();

    const elapsedMs = Date.now() - startedAt;
    const msPerThousand = (elapsedMs / numToTry) * 1000;
    console.log(`Run time for creating buckets: ${addCommas(elapsedMs)}ms, ${addCommas(msPerThousand)}ms per thousand`);

    if (!skipAnalyze) {
        console.log("Analyzing buckets...")
        await analyze();
    }
    if (cleanupBucketFiles) {
        console.log("Cleaning up buckets...")
        await collection.delete();
    }
}
// Entry point. A failure anywhere in the run is reported and reflected in
// the exit status instead of surfacing as an unhandled promise rejection.
makeRandoms().catch(err => {
    console.error(err);
    process.exitCode = 1;
});
And, here's a dependent file (goes in the same directory) for my faster readfile function:
// fast-read-file.js
const fsp = require('fs').promises;
/**
 * Reads a whole file into a Buffer, reusing the caller's buffer when it is
 * large enough.
 *
 * @param {string} filename - file to read
 * @param {?Buffer} [buffer=null] - buffer to reuse; a new one of exactly the
 *        file size is allocated when this is missing or too small
 * @returns {Promise<{buffer: Buffer, bytesRead: number}>} the buffer that
 *          holds the data (possibly larger than the file) and the byte count
 * @throws {Error} "bytesRead not full file size" when the read comes up short
 */
async function fastReadFile(filename, buffer = null) {
    const handle = await fsp.open(filename, "r");
    let bytesRead;
    try {
        const { size } = await handle.stat();
        const needsNewBuffer = !buffer || buffer.length < size;
        if (needsNewBuffer) {
            buffer = Buffer.allocUnsafe(size);
        }
        // Zero the tail of an oversized buffer so data from a previous file
        // cannot leak through the shared buffer.
        if (buffer.length > size) {
            buffer.fill(0, size);
        }
        ({ bytesRead } = await handle.read(buffer, 0, size, 0));
        if (bytesRead !== size) {
            // wipe everything before failing so no data leaks out
            buffer.fill(0);
            throw new Error("bytesRead not full file size")
        }
    } finally {
        // Close in the background; a close failure is only logged.
        handle.close().catch(err => {
            console.log(err);
        });
    }
    return {buffer, bytesRead};
}
/**
 * Reads a file with fastReadFile and splits it into lines by scanning the
 * raw buffer for LF bytes; a CR directly before the LF is dropped.
 *
 * @param {string} filename - file to read
 * @param {?Buffer} [buf=null] - buffer to reuse, passed through to fastReadFile
 * @returns {Promise<{buffer: Buffer, lines: string[]}>} the buffer used for
 *          the read (pass it to the next call to reuse it) and the decoded lines
 */
async function fastReadFileLines(filename, buf = null) {
    const LF = 10;
    const CR = 13;
    const fileData = await fastReadFile(filename, buf);
    const buffer = fileData.buffer;
    const bytesRead = fileData.bytesRead;
    const lines = [];

    let lineStart = 0;
    // The buffer may be larger than the file, so scanning is limited to the
    // bytes that were actually read.
    while (lineStart < bytesRead) {
        const lfPos = buffer.indexOf(LF, lineStart);
        if (lfPos === -1) {
            break;
        }
        const endsWithCR = buffer[lfPos - 1] === CR;
        const lineEnd = endsWithCR ? lfPos - 1 : lfPos;
        lines.push(buffer.toString('utf8', lineStart, lineEnd));
        lineStart = lfPos + 1;
    }

    // trailing data that does not end in LF
    if (lineStart < bytesRead) {
        lines.push(buffer.toString('utf8', lineStart, bytesRead));
    }
    return {buffer, lines};
}
module.exports = {fastReadFile, fastReadFileLines};
// if called directly from command line, run this test function
// A file of ids named "zzzz" must exist in this directory
if (require.main === module) {
// 10MB buffer pre-filled with the repeating text "abc\n"; it is handed to
// fastReadFileLines as the buffer to reuse.
let buffer = Buffer.alloc(1024 * 1024 * 10, "abc\n", "utf8");
fastReadFileLines("zzzz", buffer).then(result => {
let lines = result.lines;
// print the first three and the last three lines of the file
console.log(lines[0]);
console.log(lines[1]);
console.log(lines[2]);
console.log("...");
console.log(lines[lines.length - 3]);
console.log(lines[lines.length - 2]);
console.log(lines[lines.length - 1]);
}).catch(err => {
console.log(err);
});
}
You first create a sub-directory named "buckets" under where you are running this. Then, you run this from the command line:
node unique-test.js 1,000,000,000
There are some supported command lines options (mostly used during debugging):
-nodisk Don't write to disk
-nocleanup Don't cleanup generated disk files when done
-skipAnalyze Just generate bucket files, don't analyze them
-analyzeOnly Use previously generated bucket files and analyze them
The number you pass on the command line is how many ids to generate. If you pass nothing, it defaults to 100,000. For readability, it handles commas.
That's a really superb answer by #jfriend, I'd just like to add that you can calculate the result analytically, or rather an approximation. I believe using both approaches can be the best route to go.
This is an example of the Birthday Problem.
The TLDR on this is that the approximate probability of collision can be determined using the formula:
1 − exp(−n²/(2x))
Where x is the number of possible values and n is the number of generated values, as long as n is small compared to x (It will be!)
Now, you have approximately 16 bytes of entropy in the generated ids this gives 2^128 or 3.4 x 10^38 possible ids. Since two characters are being dropped (+/), the number of possible values is more like (62^21) = 4.37 x 10^37.
As #jfriend00 has pointed out, the addition of the date means you'd have to generate the number of ids in the table below every millisecond to have the corresponding probability of collision.
This table should give an approximation of the collision probabilities.
|----------------------------|----------------------------|
| Number of Ids | Collision Probability |
|----------------------------|----------------------------|
| 10^6 (1 million) | 2.29 × 10^-26 |
|----------------------------|----------------------------|
| 10^9 (1 billion) | 2.29 × 10^-20 |
|----------------------------|----------------------------|
| 10^12 (1 trillion) | 2.29 × 10^-14 |
|----------------------------|----------------------------|
| 10^15 (1 quadrillion) | 2.29 × 10^-8 |
|----------------------------|----------------------------|
I've used the very handy Wolfram Alpha to calculate these results.

Node JS mySQL triple query and loss of information

In my application I have a two different tables related to each other by ID of the first one (one to many relation). It should first collect the data from the frontend-side by in JSON format which looks like this:
cancellation = {
name: someting
id: someting
rule =
[
{someting}, {something}, {something}
]
}
One table would be for cancellations and the second one for the rules. If I want to store this information in this order, I first need to insert one record for the cancellation, then make a query to find out the ID of this record in the database, and after that insert all rules using this ID as a foreign key. But since Node JS is asynchronous, before I fetch the ID of the record the program starts to execute the rest of the code and treats this variable as undefined.
/**
 * POST /databaseSend/cancellation
 *
 * Inserts the received cancellation, then builds the rule rows that
 * reference it. The rows are built inside the INSERT callback, once the
 * new row's ID is known; building them synchronously after issuing the
 * queries always produced an undefined ID.
 *
 * The ID comes from `results.insertId`, which replaces the follow-up SELECT.
 * NOTE(review): this assumes `cancellations.id` is an auto-increment
 * primary key — confirm against the schema.
 * NOTE(review): as in the original, no HTTP response is sent and the rule
 * rows are only logged, not inserted.
 */
app.post('/databaseSend/cancellation', function(req, res){
    var cancellationReceived = req.body;
    // One row for the bulk "VALUES ?" insert.
    var cancellation = [
        [
            cancellationReceived.name,
            cancellationReceived.id
        ]
    ];

    // inserting data into cancellation table
    connection.query("INSERT INTO cancellations (name, User_ID) VALUES ?", [cancellation],
        function(err, results){
            if (err) {
                console.log(err);
                return;
            }
            // ID of the row that was just inserted, used as the foreign key
            var cancellationID = results.insertId;
            var rules = (cancellationReceived.rule || []).map(function(rule){
                return [
                    rule.daysBefore,
                    rule.fee,
                    rule.type,
                    cancellationID
                ];
            });
            rules.forEach(function(rule){
                console.log(rule);
            });
        }
    );
});
How can I solve this problem? I tried to use setTimeout for pausing my code but it did not change anything.
And I use this node module for mysql - > https://github.com/mysqljs/mysql
The best way to solve this problem is RTFM.
// Sketch of the fix: everything that needs the new row's ID runs inside the
// INSERT callback.
connection.query('INSERT INTO cancellations (name, user_id) values ?', [cancellation], function(err, results) {
if (err)
return console.error(err);
// See https://github.com/mysqljs/mysql#getting-the-id-of-an-inserted-row
var cancellation_id = results.insertId;
// Generate sql for rules, join them by ; and execute as one query
// See https://github.com/mysqljs/mysql#multiple-statement-queries
// NOTE(review): `sqls` is not defined in this snippet; it stands for the
// rule statements described in the comment above and must be built here.
connection.query(sqls, function(err) {
if (err)
return console.error(err);
// Send response here
});
})

How to save two rows when two rows are sent and then delete one if just one is sent?

I am new to MySQL, but I need to implement the following functionality:
Step 1: I save
notificationChannels = ['channel1', 'channel2'];
I save each channel in a separate row, using a loop:
// Write one row per selected channel for the user stored in the session.
notificationSettings.notificationChannels.forEach(function (channelName) {
  // Every value is escaped before it is spliced into the statement text.
  var escapedUserId = connection.escape(req.session.user_id);
  var escapedInsertChannel = connection.escape(channelName);
  var escapedUpdateChannel = connection.escape(channelName);

  var upsertSql = 'INSERT INTO notification_channels (user_id, channel) VALUES(' +
    escapedUserId + ',' + escapedInsertChannel +
    ') ON DUPLICATE KEY UPDATE channel=' + escapedUpdateChannel;

  connection.query(upsertSql, function (queryError, rows) {
    if (queryError) {
      console.log(queryError);
      return;
    }
    // Flag set as soon as a statement completes without an error.
    isOk = true;
  });
});
So I've saved 2 rows.
Then I want to change the settings and save just 1 channel:
notificationChannels = ['channel2'];
So now I need to delete the row with 'channel1' and leave just 'channel2'. How do I do it?

Refactor non-blocking nodejs do..while loop

I'm writing an api in node.js. The first webservice endpoint - /create - creates a new db entry with a randomised 6-character hash, much like a bit.ly hash.
Having done something similar in PHP, I've written a do..while loop which generates a random string and checks my mysql db (using node-mysql) to make sure it's free. I've also got a counter in there, so I can fail after x iterations if need be.
// Tries up to 10 random 6-character hashes, asking the database whether each
// one already exists in `trips`.
// NOTE(review): `success` is only ever changed inside the db.query callback.
// If db.query invokes that callback asynchronously (as a non-blocking driver
// would), the callback cannot run while this synchronous loop is still
// executing, so the loop condition keeps seeing `success === false` until the
// counter reaches 10.
// Attempt counter; the loop stops once it reaches 10.
var i = 0;
// Characters the hash is built from.
var alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'];
var hash = null;
var success = false;
do {
// generate a random hash by shuffling the alphabet,
// joining it and getting 6 chars
// (sort() reorders `alphabet` itself, so the array stays shuffled afterwards)
hash = alphabet.sort(function(){
return 0.5 - Math.random();
}).join('').substr(0,6);
console.log(i + ': checking hash ' + hash);
// see if it exists in the db
// NOTE(review): the hash is appended to the SQL text without quotes, and
// `err` is not checked before `results` is read.
db.query("SELECT hash FROM trips WHERE hash = " + hash, function(err, results){
if(results.length == 0) {
// the hash is free to use :)
success = true;
} else {
// the hash is already taken :(
success = false;
}
});
// increment the counter
i++;
} while(success === false && i < 10);
I currently only have one hash in my db (abcdef), but the loop is getting to ten and failing because it thinks each new hash is already present.
I'm pretty sure this is because of the non-blocking nature of node.js. This is obviously A Good Thing, but in my case I need the loop to block until the query has returned.
I'm pretty sure I could hack this by doing something like:
var q = db.query(...);
But I know that's throwing away a major feature of node.js.
Is there a code pattern for this sort of need?
I'm pretty sure this is because of the non-blocking nature of node.js.
Yes.
This is obviously A Good Thing, but in my case I need the loop to block until the query has returned.
No, you most certainly don't want to do that.
Embrace the asynchronous approach. Work with callbacks:
/**
 * Find a 6-character hash that is not yet stored in the `trips` table.
 *
 * Six distinct characters are drawn at random from the shared `alphabet`
 * array, the database is asked whether that hash already exists, and the
 * function calls itself again with a fresh hash while attempts remain.
 *
 * @param {function(string)} onSuccess - called with the first hash that has
 *   no matching row in `trips`.
 * @param {function(Error=)} onError - called without arguments when every
 *   attempt produced a taken hash, or with the database error when the
 *   query itself fails.
 * @param {number} retryCount - total number of hashes to try before giving up.
 */
function generateHash(onSuccess, onError, retryCount) {
  // Shuffle a copy (Fisher-Yates) so the shared alphabet keeps its order.
  // Sorting with a random comparator reorders the array in place and does
  // not produce a uniform shuffle.
  var pool = alphabet.slice();
  for (var last = pool.length - 1; last > 0; last--) {
    var pick = Math.floor(Math.random() * (last + 1));
    var swapped = pool[last];
    pool[last] = pool[pick];
    pool[pick] = swapped;
  }
  var hash = pool.join('').slice(0, 6);

  // see if it exists in the db; the placeholder lets the driver quote the value
  db.query(
    'SELECT hash FROM trips WHERE hash = ?',
    [hash],
    function (err, results) {
      if (err) {
        // `results` is not usable when the query failed; report instead of throwing
        onError(err);
        return;
      }
      if (results.length === 0) {
        // the hash is free to use :)
        onSuccess(hash);
        return;
      }
      // the hash is already taken :(
      if (retryCount > 1) {
        generateHash(onSuccess, onError, retryCount - 1);
      } else {
        onError();
      }
    }
  );
}
// Example call: log the outcome either way; 6 is passed as retryCount.
generateHash(
  function onHashCreated(newHash) {
    console.log('Success! New hash created: ' + newHash);
  },
  function onRetriesExhausted() {
    console.log('Error! retry limit reached');
  },
  6
);
/**
 * Find a 6-character hash that is not yet stored in the `trips` table,
 * trying at most 10 candidates.
 *
 * @param {function(?Error, string=)} callback - error-first callback: receives
 *   `(null, hash)` for the first free hash, or an Error when the query fails
 *   or all 10 candidates were already taken.
 * @param {number} [attempt] - zero-based index of the current attempt. Used
 *   by the function when it retries; callers normally omit it.
 */
function generateHash(callback, attempt) {
  var MAX_ATTEMPTS = 10;
  // The attempt number travels with the call instead of living in a shared
  // counter, so every top-level call gets its full set of attempts.
  var attemptIndex = attempt || 0;

  // Shuffle a copy (Fisher-Yates) so the shared alphabet keeps its order,
  // then join it and take 6 chars.
  var letters = alphabet.slice();
  for (var end = letters.length - 1; end > 0; end--) {
    var chosen = Math.floor(Math.random() * (end + 1));
    var held = letters[end];
    letters[end] = letters[chosen];
    letters[chosen] = held;
  }
  var hash = letters.join('').slice(0, 6);
  console.log(attemptIndex + ': checking hash ' + hash);

  // see if it exists in the db; the placeholder lets the driver quote the value
  db.query('SELECT hash FROM trips WHERE hash = ?', [hash], function (err, results) {
    if (err) {
      // `results` is not usable when the query failed
      callback(err);
      return;
    }
    if (results.length === 0) {
      // the hash is free to use :)
      callback(null, hash);
      return;
    }
    if (attemptIndex + 1 < MAX_ATTEMPTS) {
      generateHash(callback, attemptIndex + 1); // another attempt
    } else {
      callback(new Error('No free hash found after ' + MAX_ATTEMPTS + ' attempts'));
    }
  });
}