Importing huge dataset from JSON to Cloud Firestore

I'm trying to import a large JSON file (177k records) into Cloud Firestore. First, I found the code below:
Uploading Code
var admin = require("firebase-admin");
var serviceAccount = require("./service_key.json");

admin.initializeApp({
  credential: admin.credential.cert(serviceAccount),
  databaseURL: "my service key"
});

const firestore = admin.firestore();
const path = require("path");
const fs = require("fs");
const directoryPath = path.join(__dirname, "files");

fs.readdir(directoryPath, function(err, files) {
  if (err) {
    return console.log("Unable to scan directory: " + err);
  }
  files.forEach(function(file) {
    var lastDotIndex = file.lastIndexOf(".");
    var menu = require("./files/" + file);
    menu.forEach(function(obj) {
      firestore
        .collection('academicians2')
        .add({ 'department': obj['department'], 'designation': obj['designation'], 'field': obj['field'], 'name': obj['name'], 'university': obj['university'], 'reviewList': [], 'rating': 0 })
        .then(function(docRef) {
          console.log("Document written");
        })
        .catch(function(error) {
          console.error("Error adding document: ", error);
        });
    });
  });
});
but after uploading 10-15k records it started giving errors (a memory error, I guess). So I decided to schedule a Cloud Function to run every 1.2 seconds and batch-write the JSON to Firestore, but I really have no idea how to take 499 rows from my JSON on each run.
Scheduled Cloud Function
/* eslint-disable */
const functions = require("firebase-functions");
const admin = require('firebase-admin');
const { user } = require("firebase-functions/lib/providers/auth");

admin.initializeApp();
const firestore = admin.firestore();
const userRef = admin.firestore().collection('academicians2');

exports.scheduledFunction = functions.pubsub.schedule('every 1.2 seconds').onRun((context) => {
  // do I need to create a for loop for the batch, or how can I approach this problem?
});

I would do something like this (a minimal sketch follows after the notes below):
Make the scheduled function get 500 records at a time with a "start after" clause.
Perform a batch write to the db (batch writes are limited to 500 as you may know)
If successful, copy the last record (or a reference to the last record: ex: record's ID) of those 500 records into a document in your db. It can be a document called "upload_tracker" with a field called "last_uploaded".
On subsequent operations: the function queries that last_uploaded record from your db, then performs another operation starting AFTER that last record.
Notes:
- The scheduled function can write multiple batches before terminating if you want to finish quickly.
- In your Google Cloud Console / Cloud Functions, you may want to extend the function's timeout value to 9 minutes if you know it's going to run for a long time.
- The document IDs should reflect your record IDs, if you have them, to make sure there are no duplicates.
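To make the batching concrete, here is a minimal sketch of that approach under a few assumptions: the 177k records are bundled with the function as files/records.json, progress is tracked as a numeric offset in a hypothetical upload_tracker/state document (a variation on storing the last record's ID), and the schedule is relaxed to every 5 minutes with one chunk per run (per the first note, a run could also loop over several chunks):

const functions = require("firebase-functions");
const admin = require("firebase-admin");
admin.initializeApp();

const db = admin.firestore();
const records = require("./files/records.json"); // assumption: the JSON is one big array
const CHUNK = 499; // 499 data writes + 1 tracker write = the 500-write batch limit

exports.scheduledImport = functions
  .runWith({ timeoutSeconds: 540 }) // extend towards the 9-minute maximum
  .pubsub.schedule("every 5 minutes")
  .onRun(async () => {
    const trackerRef = db.collection("upload_tracker").doc("state"); // hypothetical tracker doc
    const snap = await trackerRef.get();
    const start = snap.exists ? snap.data().last_index : 0;
    if (start >= records.length) return null; // everything has been uploaded

    const batch = db.batch();
    records.slice(start, start + CHUNK).forEach((obj) => {
      batch.set(db.collection("academicians2").doc(), {
        department: obj.department,
        designation: obj.designation,
        field: obj.field,
        name: obj.name,
        university: obj.university,
        reviewList: [],
        rating: 0,
      });
    });
    // Advancing the cursor in the same batch means a failed commit never skips records.
    batch.set(trackerRef, { last_index: start + CHUNK });
    await batch.commit();
    return null;
  });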

Related

Node.js trying to use async/await to get data from a mysql select before using the variable

I need to fetch some data with a MySQL query and use the result to build up an email message with Node.
I put the code inside a function, but the call to the query still appears to be async, as the result is never given back before the end of the function, and the returned variable is always empty.
I tried a different approach with async/await, but the execution still seems asynchronous.
In the following code I only get up to step 3 in the console log before the function returns; step 4 is reached no matter what I try to do, but only after the function call.
async function querydb (utente){
  console.log("sono in querydb");
  var messageHTMLAllegati="";
  var risultatoquery;
  console.log("step 1");
  var connection = mysql.createConnection({
    host     : process.env.IP_address,
    user     : process.env.nome_utente,
    password : process.env.password,
    port     : process.env.port,
    database : process.env.DB,
  });
  console.log("step 2");
  const query = util.promisify(connection.query).bind(connection);
  (async () => {
    try {
      console.log("step 3");
      var result = await query('SELECT Link FROM Link_Foto where ID_Utente="' + utente + '"');
      var i = result.length;
      console.log("step 4");
      var j;
      for (j = 0; j < i; j++) {
        messageHTMLAllegati += 'Immagine ' + (j+1) + '<BR>';
        console.log("print the link found in the DB and added to the text to be printed" + result[j].Link);
      }
    } finally {
      connection.end();
    }
  })()
  return messageHTMLAllegati;
}
I expect the final variable "messageHTMLAllegati" to contain some text plus the query fields needed, but it always comes back empty. In the log I can see that the variable does get filled, but only after the function has returned, so the text used to put the email together is missing the DB section.
async/await only works when the thing you await is a promise. Functions like 'query' in mysql use a callback to deliver the result, so if you want to use them with async/await you should wrap them in another function and resolve the result from the callback as a promise, like this:
function query_promise(q_string) {
  return new Promise((resolve, reject) => {
    // connection.query is the callback-style mysql call being wrapped here
    connection.query(q_string, (err, result) => {
      if (err) return reject(err);
      resolve(result);
    });
  });
}
then in your code:
var result = await query_promise('SELECT Link FROM Link_Foto where ID_Utente="' + utente + '"');
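Beyond wrapping the query, the other thing keeping messageHTMLAllegati empty is the inner (async () => { ... })() block: querydb returns before that IIFE finishes. A minimal sketch of the reshaped function, keeping the question's variable names (and switching to a parameterized query), could be:

async function querydb(utente) {
  var messageHTMLAllegati = "";
  var connection = mysql.createConnection({
    host     : process.env.IP_address,
    user     : process.env.nome_utente,
    password : process.env.password,
    port     : process.env.port,
    database : process.env.DB,
  });
  const query = util.promisify(connection.query).bind(connection);
  try {
    // Await directly in the function body instead of inside a nested IIFE
    const result = await query('SELECT Link FROM Link_Foto WHERE ID_Utente = ?', [utente]);
    for (var j = 0; j < result.length; j++) {
      messageHTMLAllegati += 'Immagine ' + (j + 1) + '<BR>';
    }
  } finally {
    connection.end();
  }
  return messageHTMLAllegati; // callers must await querydb(...) to get the filled string
}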

Real-time database view on HTML page with Socket.io

I have a Raspberry Pi that is constantly pushing data to a MySQL database via PHP. I am trying to create a website where I can see the contents of this database in real time.
I've been following this tutorial: http://markshust.com/2013/11/07/creating-nodejs-server-client-socket-io-mysql which shows an example of using socket.io for this purpose. This is working fine with 2 clients: when I add a new note it updates in both browsers. The problem is that when I manually add a record to the database from the MySQL CLI, it does not update. I'm guessing this is because there is no emit happening. How can I implement this?
Server.js:
var mysql = require('mysql')
// Let’s make node/socketio listen on port 3000
var io = require('socket.io').listen(3000)
// Define our db creds
var db = mysql.createConnection({
  host: 'localhost',
  user: 'root',
  password: 'root',
  database: 'node'
})
// Log any errors connected to the db
db.connect(function(err){
  if (err) console.log(err)
})
// Define/initialize our global vars
var notes = []
var isInitNotes = false
var socketCount = 0
console.log("connected");
io.sockets.on('connection', function(socket){
  // Socket has connected, increase socket count
  socketCount++
  // Let all sockets know how many are connected
  io.sockets.emit('users connected', socketCount)
  socket.on('disconnect', function() {
    // Decrease the socket count on a disconnect, emit
    socketCount--
    io.sockets.emit('users connected', socketCount)
  })
  socket.on('new note', function(data){
    // New note added, push to all sockets and insert into db
    notes.push(data)
    io.sockets.emit('new note', data)
    // Use node's db injection format to filter incoming data
    db.query('INSERT INTO notes (note) VALUES (?)', data.note)
  })
  // Check to see if initial query/notes are set
  if (!isInitNotes) {
    // Initial app start, run db query
    db.query('SELECT * FROM notes')
      .on('result', function(data){
        // Push results onto the notes array
        notes.push(data)
      })
      .on('end', function(){
        // Only emit notes after query has been completed
        socket.emit('initial notes', notes)
      })
    isInitNotes = true
  } else {
    // Initial notes already exist, send out
    socket.emit('initial notes', notes)
  }
})
Index.html:
<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.10.2/jquery.min.js"></script>
<script src="http://localhost:3000/socket.io/socket.io.js"></script>
<script>
  $(document).ready(function(){
    // Connect to our node/websockets server
    var socket = io.connect('http://localhost:3000');
    // Initial set of notes, loop through and add to list
    socket.on('initial notes', function(data){
      var html = ''
      for (var i = 0; i < data.length; i++){
        // We store html as a var then add to DOM after for efficiency
        html += '<li>' + data[i].note + '</li>'
      }
      $('#notes').html(html)
    })
    // New note emitted, add it to our list of current notes
    socket.on('new note', function(data){
      $('#notes').append('<li>' + data.note + '</li>')
    })
    // New socket connected, display new count on page
    socket.on('users connected', function(data){
      $('#usersConnected').html('Users connected: ' + data)
    })
    // Add a new (random) note, emit to server to let others know
    $('#newNote').click(function(){
      var newNote = 'This is a random ' + (Math.floor(Math.random() * 100) + 1) + ' note'
      socket.emit('new note', {note: newNote})
    })
  })
</script>
<ul id="notes"></ul>
<div id="usersConnected"></div>
<div id="newNote">Create a new note</div>
This is similar to a previous question, where it appears there was no simple way to do this with MySQL.
If you are at an early enough stage of development that you are not tied to MySQL, then I will point out that you can solve this problem with PostgreSQL, which:
- can be pushed to from PHP via the PDO library (see docs),
- runs on the Raspberry Pi,
- can detect updates pushed from anywhere, including the command line, via pg_notify in a trigger (see docs),
- lets updates be subscribed to from NodeJS via the pg package.
On a technical level this will work, but databases in general are not efficient as messaging systems (watch out for the Database-as-IPC anti-pattern). The PHP client could also emit its own notification when things happen, via a message queue, UDP socket, or something else.
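If you do go that route, a minimal sketch of the NodeJS side with the pg package might look like the following (the connection settings and the channel name note_changes are assumptions; a trigger in PostgreSQL would call pg_notify('note_changes', ...) on insert):

var pg = require('pg')
var io = require('socket.io').listen(3000)

var pgClient = new pg.Client({
  host: 'localhost',
  user: 'node',
  password: 'node',
  database: 'node'
})

pgClient.connect(function(err){
  if (err) return console.log(err)
  // Subscribe to the channel the trigger notifies on
  pgClient.query('LISTEN note_changes')
})

// Fired for every pg_notify on that channel, no matter who wrote the row
pgClient.on('notification', function(msg){
  io.sockets.emit('new note', JSON.parse(msg.payload))
})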

Async Node JS query connection

I am trying to create an Excel workbook using excelbuilder and populate it with data from a MySQL database. This is my get method:
app.get('/excel', function(req, res) {
  var locationSpread = ['location1.xlsx', 'location2.xlsx',
    'location3.xlsx'];
  locationSpread.forEach(function(filename) {
    fs.unlink("./Spreadsheets/" + filename, (err) => {
      if (err) {
        console.log('Spreadsheet ' + filename + ' not found');
      } else {
        console.log('Spreadsheet ' + filename + ' successfully found');
      }
    });
    var query = connection.query('SELECT * from ' + filename.slice(0, -5), function(err, rows) {
      var workbook = excelbuilder.createWorkbook('./Spreadsheets/', filename);
      var sheet = workbook.createSheet(filename.slice(0, -5), 3, rows.length);
      sheet.set(1, 1, 'First Name');
      sheet.set(2, 1, 'Last Name');
      sheet.set(3, 1, 'Company');
      // *** TROUBLE CODE START HERE ***
      for (var j = 2, z = 0; z < rows.length; j++, z++) {
        sheet.set(1, j, rows[z].firstName);
        sheet.set(2, j, rows[z].lastName);
        sheet.set(3, j, rows[z].company);
      }
      // *** TROUBLE CODE END HERE ***
      workbook.save(function(err) {
        console.log('workbook saved ' + (err ? 'failed' : 'ok'));
      });
    });
  });
});
The first thing I do is delete the files if they exist in the Spreadsheets folder. Then I query the database based on the location. I create a new workbook and sheet, then write to it. The for loop breaks the program, and the error is:
TypeError: Cannot read property '1' of undefined
I believe it to be something along the lines of an async problem.
One possible fault I see here is that you're using the async version of unlink. Potentially, if the file has not been deleted by the time you try to create the workbook, an error may occur.
You can try switching to unlinkSync and verifying that the workbook object is not null before creating a sheet (and as an additional precaution, that the sheet object is not null before setting values).
Beyond that, it's impossible to assist further without your error stack.
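A minimal sketch of that suggestion, keeping the question's file names (the guards are just defensive):

locationSpread.forEach(function(filename) {
  // Synchronous delete: the file is gone before the query and workbook work starts
  try {
    fs.unlinkSync("./Spreadsheets/" + filename);
    console.log('Spreadsheet ' + filename + ' successfully found');
  } catch (err) {
    console.log('Spreadsheet ' + filename + ' not found');
  }
  // ... then run the query and create the workbook/sheet as before,
  // checking that workbook and sheet are not null before calling set()
});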
So the error was caused by the initial setup. Basically the sheet created by createSheet has an initial number of columns and rows, and if you try to write a value that goes outside the bounds of what you initially created, it throws an error. Hence why my for loop kept crashing.
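In other words, the sheet was created with room for rows.length rows, but the header occupies row 1 and the data runs from row 2 to row rows.length + 1, so the last write lands out of bounds. A sketch of the fix, keeping the question's names, is to allocate one extra row:

// rows.length + 1 rows: row 1 for the headers, rows 2..rows.length+1 for the data
var sheet = workbook.createSheet(filename.slice(0, -5), 3, rows.length + 1);

sheet.set(1, 1, 'First Name');
sheet.set(2, 1, 'Last Name');
sheet.set(3, 1, 'Company');

for (var j = 2, z = 0; z < rows.length; j++, z++) {
  sheet.set(1, j, rows[z].firstName);
  sheet.set(2, j, rows[z].lastName);
  sheet.set(3, j, rows[z].company);
}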

Yahoo-Finance Query Speed

I'm currently working on a project that involves querying yahoo-finance for many different ticker symbols. The bottleneck is acquiring the data from yahoo, so I was wondering if there is a way I might go about speeding this up.
If I used multiple machines to query and then aggregated the data, would that help? I only have one physical machine; how might I go about doing that?
Thanks!
EDIT: Currently, I'm using Node.js, yahoo-finance, and Q.deferred to ask yahoo for historical data. Then, once all the promises are fulfilled (for each ticker), I'm doing a Q.all() to persist the data.
var data = [];
tickers = ["goog", "aapl", ...];
...
Q.all(_.map(tickers, function(symbol) {
  return getYahooPromise(symbol);
}))
.done( function() { persistData(data) });
getYahooPromise retrieves data for the ticker symbol and pushes it into the data array. Once all promises are resolved, the data is persisted in a MySQL database.
SECOND EDIT:
More code:
var sequentialCalls = [];
for ( var i = 0; i < tickers.length / chunkSize; i++ ) {
  sequentialCalls.push( persistYahooChunk );
}
sequentialCalls.push( function(callback) {
  connection.end();
  callback();
});
async.series( sequentialCalls )

exports.persistYahooChunk = function(callback) {
  console.log("Starting yahoo query");
  var currentTickers = tickers.slice(currentTickerIndex, currentTickerIndex + chunkSize);
  return yahooFinance.historical( {
    symbols: currentTickers,
    from: "2015-01-28",
    to: "2015-02-05"
  }).then( function(result) {
    console.log("Query " + currentTickerIndex + "/" + tickers.length + "completed");
    currentTickerIndex += chunkSize;
    // add valid data
    var toPersist = _.map(result, function(quotes, symbol) {
      return [symbol, quotes.length != 0 ];
    });
    var query = "INSERT INTO `ticker` (`symbol`, `valid`) VALUES ?";
    connection.query(query, [toPersist], function(err, result) {
      if (err) {
        console.log(err);
      }
      //console.log(result);
      callback();
    });
  });
}
The bottleneck is that you are doing one query per ticker.
Depending on the data you need to pull, if you could do a single query that includes all your tickers it would be much faster.
Here is an example if you need to get all current prices for a list of tickers, with a single query :
http://finance.yahoo.com/webservice/v1/symbols/A,B,C,D,E/quote?format=json
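A minimal sketch of fetching that in Node (assuming the endpoint above still answers and returns JSON; the exact response shape may differ):

var http = require('http');

var tickers = ["goog", "aapl", "msft"];
var url = 'http://finance.yahoo.com/webservice/v1/symbols/' +
  tickers.join(',') + '/quote?format=json';

// One round trip returns a quote for every ticker instead of one request each
http.get(url, function(res) {
  var body = '';
  res.on('data', function(chunk) { body += chunk; });
  res.on('end', function() {
    var parsed = JSON.parse(body);
    console.log(parsed);
  });
}).on('error', console.error);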

IndexedDB can make database unreachable (getting blocked), how unblock?

UPDATE
I discovered the issue is that it's blocked. Despite the database always being created and upgraded by the same extension, it does not get closed. So now I'm getting the "onblocked" function called.
How do I "unblock" currently blocked databases? And how do I prevent this in the future? This is an app, so no tabs are using it. And since I can't open those databases to even delete them (this also gets blocked), how do I close them?
(For anyone wondering, to avoid this issue from the start, you HAVE to do the following:)
mydb.onversionchange = function(event) {
  mydb.close();
};
Original Post
IndexedDB dies and becomes unopenable if I (accidentally) try to open and upgrade with the wrong version. As far as I can tell, there's no way to ask indexedDB for the latest version of a DB. So if I try to run the following code twice, it destroys the database and it becomes unopenable:
And it never throws an error or calls onerror. It just sits silently
var db = null;
// Note: no version is passed in, so the second time I do this it seems to cause an error
var req = indexedDB.open( "test" );
req.onsuccess = function(event) { console.log( "suc: " + event.target.result.version ); db = event.target.result; };
req.onerror = function(event) { console.log( "err: " + event ); };
req.onupgradeneeded = function(event) { console.log( "upg: " + event.target.result.version ); };
// We're polling in an interval since we're waiting for the callback
var intv = setInterval(
  function()
  {
    if ( db === null ) return;
    clearInterval( intv );
    var req2 = indexedDB.open( "test", db.version + 1 );
    req2.onsuccess = function(event) { console.log( "suc: " + event.target.result.version ); };
    req2.onerror = function(event) { console.log( "err: " + event ); };
    req2.onupgradeneeded = function(event) { console.log( "upg: " + event.target.result.version ); };
  },
  50
);
All of that code is in my chrome.runtime.onInstalled.addListener. So when I update my app, it calls it again. If I call indexedDB.open( "test" ) without passing in the new version and then again run the setInterval function, it causes everything to become unusable and I'm never able to open "test" again. This would be solved if I could query indexedDB for the version of a database prior to attempting to open it. Does that exist?
Maybe this helps?
function getVersion(callback) {
  var r = indexedDB.open('asdf');
  r.onblocked = r.onerror = console.error;
  r.onsuccess = function(event) {
    event.target.result.close();
    callback(event.target.result.version);
  };
}

getVersion(function(version) {
  console.log('The version is: %s', version);
});
Ok, based on the convo, this little util function might set you on the path:
var DATABASE_NAME_CONSTANT = 'whatever';

// Basic indexedDB connection helper
// #param callback the action to perform with the open connection
// #param version the version of the database to open or upgrade to
// #param upgradeNeeded the callback if the db should be upgraded
function connect(callback, version, upgradeNeeded) {
  var r = indexedDB.open(DATABASE_NAME_CONSTANT, version);
  if (upgradeNeeded) r.onupgradeneeded = upgradeNeeded;
  r.onblocked = r.onerror = console.error;
  r.onsuccess = function(event) {
    console.log('Connected to %s version %s',
      DATABASE_NAME_CONSTANT, version);
    callback(event.target.result);
  };
}

// Now let us say you needed to connect
// and need to have the version be upgraded
// and need to send in custom upgrades based on some ajax call
function fetch() {
  var xhr = new XMLHttpRequest();
  // ... setup the request and what not
  xhr.onload = function(event) {
    // if response is 200 etc
    // store the json in some variable
    var responseJSON = ...;
    console.log('Fetched the json file successfully');
    // Let's suppose you send in version and upgradeNeeded
    // as properties of your fetched JSON object
    var targetVersion = responseJSON.idb.targetVersion;
    var upgradeNeeded = responseJSON.idb.upgradeNeeded;
    // Now connect and do whatever
    connect(function(db) {
      // Do stuff with the locally scoped db variable
      // For example, grab a prop from the fetched object
      // (put has to happen inside a transaction)
      db.transaction('asdf', 'readwrite')
        .objectStore('asdf')
        .put(responseJSON.recordToInsert);
      // If you feel the need, but should not, close the db
      db.close();
      console.log('Finished doing idb stuff');
    }, targetVersion, upgradeNeeded);
  };
}
I think it is best to always provide the version number. If you don't, how are you going to manage upgrades of the db structure? There is a good chance you will end up in a situation where the same db version on different clients has a different database structure, and I don't think that is what you want. So I would suggest keeping the version number in a variable.
Also, when working with IndexedDB you have to provide an upgrade path from all previous versions to the current one. Meaning: version 4 has a certain structure, but you have to be able to reach that same structure from scratch as well as from versions 1, 2 and 3.
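A minimal sketch of such an upgrade path (the store and index names are placeholders): keying the migrations off event.oldVersion and letting the switch fall through means a client on any older version runs every migration it is missing and ends up with the same final structure:

var DB_VERSION = 4; // kept in one variable, as suggested

var req = indexedDB.open(DATABASE_NAME_CONSTANT, DB_VERSION);
req.onupgradeneeded = function(event) {
  var db = event.target.result;
  switch (event.oldVersion) { // intentional fall-through: run every missing migration
    case 0: // brand new database
      db.createObjectStore('notes', { keyPath: 'id' });
    case 1:
      db.createObjectStore('attachments', { keyPath: 'id' });
    case 2:
      event.target.transaction.objectStore('notes').createIndex('by_date', 'createdAt');
    case 3:
      db.createObjectStore('settings');
  }
};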