Yahoo-Finance Query Speed - mysql

I'm currently working on a project that involves querying yahoo-finance for many different ticker symbols. The bottleneck is acquiring the data from yahoo, so I was wondering if there is a way I might go about speeding this up.
If I used multiple machines to query and then aggregated the data, would that help? I only have one physical machine; how might I go about doing that?
Thanks!
EDIT: Currently, I'm using Node.js, the yahoo-finance module, and Q deferreds to ask Yahoo for historical data. Then, once all the promises are fulfilled (one per ticker), I'm doing a Q.all() to persist the data.
var data = [];
tickers = ["goog", "aapl", ...];
...
Q.all(_.map(tickers, function(symbol) {
    return getYahooPromise(symbol);
}))
.done(function() { persistData(data); });
getYahooPromise retrieves data for the ticker symbol and pushes it into the data array. Once all promises are resolved, the data is persisted in a MySQL database.
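getYahooPromise itself isn't shown in the question; here is a minimal hypothetical sketch of what it might look like, assuming the yahoo-finance module's callback-style historical() call (the date range is copied from the second edit below):
function getYahooPromise(symbol) {
    var deferred = Q.defer();
    // hypothetical reconstruction: fetch one symbol's history, stash it, resolve
    yahooFinance.historical({
        symbol: symbol,
        from: "2015-01-28",
        to: "2015-02-05"
    }, function(err, quotes) {
        if (err) return deferred.reject(err);
        data.push({ symbol: symbol, quotes: quotes });
        deferred.resolve();
    });
    return deferred.promise;
}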
SECOND EDIT:
More code:
var sequentialCalls = [];
for (var i = 0; i < tickers.length / chunkSize; i++) {
    sequentialCalls.push(persistYahooChunk);
}
sequentialCalls.push(function(callback) {
    connection.end();
    callback();
});
async.series(sequentialCalls);

exports.persistYahooChunk = function(callback) {
    console.log("Starting yahoo query");
    var currentTickers = tickers.slice(currentTickerIndex, currentTickerIndex + chunkSize);
    return yahooFinance.historical({
        symbols: currentTickers,
        from: "2015-01-28",
        to: "2015-02-05"
    }).then(function(result) {
        console.log("Query " + currentTickerIndex + "/" + tickers.length + " completed");
        currentTickerIndex += chunkSize;
        // add valid data
        var toPersist = _.map(result, function(quotes, symbol) {
            return [symbol, quotes.length != 0];
        });
        var query = "INSERT INTO `ticker` (`symbol`, `valid`) VALUES ?";
        connection.query(query, [toPersist], function(err, result) {
            if (err) {
                console.log(err);
            }
            //console.log(result);
            callback();
        });
    });
}

The bottleneck is that you are doing one query per ticker.
Depending on the data you need to pull, if you can do a single query that includes all your tickers, it will be much faster.
Here is an example if you need to get the current prices for a list of tickers with a single query:
http://finance.yahoo.com/webservice/v1/symbols/A,B,C,D,E/quote?format=json
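A minimal sketch of fetching that endpoint from Node, assuming the endpoint is still available; the symbol list and the response shape (list.resources) are assumptions based on this webservice's JSON format:
var http = require('http');

// one request for many tickers instead of one request per ticker
var symbols = ['GOOG', 'AAPL', 'MSFT'].join(',');
var url = 'http://finance.yahoo.com/webservice/v1/symbols/' + symbols + '/quote?format=json';

http.get(url, function(res) {
    var body = '';
    res.on('data', function(chunk) { body += chunk; });
    res.on('end', function() {
        var parsed = JSON.parse(body);
        // every requested symbol's quote arrives in one payload
        console.log(parsed.list.resources.length + ' quotes in one round trip');
    });
}).on('error', function(err) {
    console.log(err);
});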

Related

multiple res.send/json - can't set headers after they are sent

I want to know if it's possible to call res.send/res.json multiple times in the same method. I have a big problem with this: I want to develop a function in a Web Worker that makes a PUT request each minute, but on the second request I get "can't set headers after they are sent." I know it's not possible, but I want to know if there is some way to make it work.
exports.update = function (req, res) {
    var feed = req.feed;
    feed.title = req.body.title;
    feed.apifeed = req.body.apifeed;
    feed.apikey = req.body.apikey;
    feed.active = req.body.active;
    if (feed.apifeed && feed.apikey && feed.active) {
        var t = Threads.create();
        t.eval(setInterval(function() {
            async.parallel([
                function(callback) {
                    var url = 'https://api.xively.com/v2/feeds/' + feed.apifeed + '.json?key=' + feed.apikey;
                    sensordata.getSensorData(url, function(data) {
                        callback(null, data);
                    });
                }
            ], function(err, data) {
                feed.content[0].value = data[0][0].value;
                feed.content[0].date = new Date();
                feed.save(function (err) {
                    if (err) {
                        return res.status(400).send({
                            message: errorHandler.getErrorMessage(err)
                        });
                    } else {
                        res.json(feed);
                    }
                });
            });
        }, 5000));
    }
};
Thanks for your support; I await your responses. Greetings!
You cannot respond to a request more than once; that wouldn't make sense.
If you need to stream data, then you will need to use long polling, WebSockets, Server-sent Events, etc.
Also, assuming Threads.create() does what I think it may be doing, spawning an OS thread just to do what you're currently doing is a waste of resources. Nothing in that block of code is CPU-bound.
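For instance, a minimal Server-sent Events sketch in Express (the route, interval, and payload are illustrative, not the asker's code): one long-lived response carries many messages, which is what the asker actually needs.
var express = require('express');
var app = express();

app.get('/feed/stream', function(req, res) {
    // SSE headers: keep the response open and stream events over it
    res.set({
        'Content-Type': 'text/event-stream',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive'
    });
    var timer = setInterval(function() {
        // each tick writes another event to the same response
        res.write('data: ' + JSON.stringify({ date: new Date() }) + '\n\n');
    }, 60000);
    req.on('close', function() { clearInterval(timer); });
});

app.listen(3000);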

Can't insert element from array in mysql database with nodejs

I have a problem saving some data from an array into a MySQL database with Node.js.
This is my code:
for (var i = 0; i < data.data.length; i++) {
    var imageObject = data.data[i];
    var url = imageObject.images.standard_resolution.url;
    var id = imageObject.id;
    var sql = 'SELECT COUNT(*) AS imageIDCount FROM images WHERE id = ?';
    connection.query(sql, [id], function(err, rows, fields) {
        console.log(rows[0].imageIDCount);
        if (err) throw err;
        if (rows[0].imageIDCount == 0) {
            console.log(id + " doesn't exist"); // ### the ID at this point is always the last from that array
            //insertImage(id, url);
        } else {
            // console.log("ID exists");
        }
    });
}
This code runs when I get a response from a REST API via the request framework.
So my problem is that when I get the result from the count query and there is no element with the specific id, I always get the same id. I think that's because I use the same variable id there, but how can I fix it? I hope somebody can help me.
In this case, you're a victim of Node's asynchronous event loop. You're executing a synchronous for-loop and defining id:
for (var i = 0; i < data.data.length; i++) {
    var id = imageObject.id;
}
This would work in ordinary synchronous JavaScript, but because the database module you're using runs asynchronously, that entire loop will have completed before your first DB query completes, effectively clobbering the value of id.
You'll need to re-write your function to behave asynchronously instead, or use something like node-async to help.
Here's a quick example of how that might look. Note that I didn't write your insertImage function for you; you'll need to rewrite that to support a callback as well.
async.each(data.data, function(imageObject, callback) {
    var sql = 'SELECT COUNT(*) AS imageIDCount FROM images WHERE id = ?';
    connection.query(sql, [imageObject.id], function(err, rows, fields) {
        if (err) return callback(err);
        if (rows[0].imageIDCount == 0) {
            console.log(imageObject.id + " doesn't exist");
            insertImage(imageObject.id, imageObject.images.standard_resolution.url, function(err) {
                callback(err); // fires the callback to async
            });
        } else {
            console.log(imageObject.id + " already exists");
            callback(); // maybe you want an error here too?
        }
    });
}, function(err) {
    // all of your db queries are completed
});
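Since the answer leaves insertImage to the asker, here is a minimal sketch of a callback-style version; the table and column names are assumptions:
function insertImage(id, url, callback) {
    var sql = 'INSERT INTO images (id, url) VALUES (?, ?)';
    connection.query(sql, [id, url], function(err, result) {
        // hand the outcome back so async.each can count this item as done
        callback(err);
    });
}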

Using Q to return secondary query in node with express and mysql

New to Node. As I cycle through a roster of students, I need to check whether a teacher has requested them for tutoring.
I realized I can't just do this:
var checkRequest = function(id) {
    var value = '';
    roster.query('SELECT * FROM teacher_request WHERE student_id = ' + id, function(err, row) {
        value = row.length;
    });
    return value;
};
After a bit of digging around, promises looked like a great solution, but if I simply return the deferred.promise from the checkRequest function, all I get is an object that says [deferred promise], from which I can't access the actual data (or I have not figured out how yet).
If I follow along with their API and use .then (as illustrated in the getRow function), I am back to the same problem I had before.
function checkRequest(id) {
    console.log(id);
    var deferred = Q.defer();
    connection.query('SELECT * FROM teacher_request WHERE student_id = ' + id, function(err, row) {
        deferred.resolve(row.length);
    });
    return deferred.promise;
}

var getRow = function(id) {
    checkRequest(id).then(function(val) {
        console.log(val); // works great
        return val; // back to the same problem
    });
};
The roster needs to be able to be pulled from an external API which is why I am not bundling the request check with the original roster query.
Thanks in advance
From the code you posted, I assume you have not really understood the concept of promises. They allow you to queue up callbacks that get executed when the asynchronous operation has finished (by succeeding or failing).
So instead of somehow getting the results back into your synchronous workflow, you should convert that workflow to be asynchronous as well. A small example for your current problem:
// your students' ids in here
var studentsArray = [1, 2, 5, 6, 9];
for (var i = 0; i < studentsArray.length; i++) {
    checkRequest(studentsArray[i])
        .then(function(count) {
            console.log(count); // number of requests for this student
            // any other code related to a specific student in here
        });
}
or another option, if you need all students' data at the same time:
// your students' ids in here
var studentsArray = [1, 2, 5, 6, 9];
// collect all promises
var reqs = [];
for (var i = 0; i < studentsArray.length; i++) {
    reqs.push(checkRequest(studentsArray[i]));
}
Q.all(reqs)
    .then(function(results) {
        // code in here
        // `results` is an array with one resolved value per student
    });
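One more sketch, applying the same idea to the asker's getRow: return the promise itself and let callers chain on it, rather than trying to return the value (the id 42 is illustrative):
// return the promise; callers chain .then() instead of expecting a value
var getRow = function(id) {
    return checkRequest(id).then(function(val) {
        console.log(val);
        return val; // becomes the resolved value of the returned promise
    });
};

getRow(42).then(function(count) {
    // use count here
});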

Creating synchronous queries with node-mysql

I'm trying to ensure that one MySQL query leads to another and is not completed until all of its child queries are completed. For example, I start with one SELECT, stream rows, and execute subsequent queries from each row result. This is doable with callbacks, but I end up running out of memory, so I'd like to slow the process down and run it in batches; due to the async nature of the dispatch, though, I can't keep things in phase and end the connection after all the rows have been processed.
Here's an example:
var query = conn.query('select id from table1 limit 10');
query.on('result', function(row) {
    console.log('query1', row);
    var query2 = conn.query('select id from books where id = ? ', [row.id]);
    query2.on('result', function(row2) {
        console.log('query2', row2);
        var query3 = conn.query('insert into test (id) values (?)', [row2.id]);
        query3.on('result', function(row3) {
            console.log(row3);
        });
    });
});
query.on('end', function() {
    conn.end();
});
The above fails because there are still rows to process in query3 after the initial query has ended.
Any thoughts? The actual code is even more complicated, because I have to process XML from the subsequent queries and fire off even more inserts as I loop through the batch.
Thanks!
I would suggest this solution with the async module:
var async = require("async");

// connection instance
var conn;

// here goes task serving logic
// if any async function should be finished before drain callback, push them into q
var solvers = {
    query: function(q, task, row) {
        console.log('query1', row);
        q.push({
            solver: "query2",
            req: "select id from books where id = ?",
            reqArgs: [row.id]
        });
    },
    query2: function(q, task, row) {
        console.log('query2', row);
        q.push({
            solver: "query3",
            req: "insert into test (id) values (?)",
            reqArgs: [row.id]
        });
    },
    query3: function(q, task, row) {
        console.log(row);
    }
};

// here is a queue of tasks
var q = async.queue(function(task, cb) {
    var query = conn.query(task.req, task.reqArgs);
    query.on("end", cb);
    query.on("result", function(row) {
        solvers[task.solver](q, task, row);
    });
}, 2); // limit of parallel queries

// when every request has reached "end"
q.drain = function() {
    conn.end();
    // continue from here
};

// initial task
q.push({
    solver: "query",
    req: "select id from table1 limit 10",
    reqArgs: []
});
But still, I'm not sure that making requests ID by ID is a good solution.
Maybe I'm just not aware of the full problem.
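If the per-ID lookups really are just the queries shown above, a single set-based statement might replace the whole queue. A sketch, assuming the same table1/books/test tables from the question:
// one INSERT ... SELECT instead of N round trips
conn.query(
    'INSERT INTO test (id) ' +
    'SELECT b.id FROM (SELECT id FROM table1 LIMIT 10) t ' +
    'JOIN books b ON b.id = t.id',
    function(err, result) {
        if (err) throw err;
        console.log('inserted ' + result.affectedRows + ' rows');
        conn.end();
    }
);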
#glukki, thanks for the great answer and the reference to async. I went with a permutation of your code and two async queues, which do a 'chomp and chew' using a single connection plus a pool of connections to turn a 100K-row select into 1.2M row inserts. It worked amazingly well and took less than 10 minutes. Here's the full implementation, minus the module and connection setup. I hope this helps someone else too. Thanks again!
function populateMesh(row, callback) {
    xmlParser.parseString('<root>' + row.mesh_heading_list + '</root>', function(err, result) {
        var q2 = async.queue(function (task, cb) {
            pool.getConnection(function(err, cnx) {
                cnx.query('INSERT INTO abstract_mesh (mesh_id, abstract_id, major_topic) SELECT mesh_descriptor.id, ?, ? FROM mesh_descriptor WHERE mesh_descriptor.name = ?', [task.id, task.majorTopic, task.descriptorName], function(err, result) {
                    if (err) { throw err; }
                    cnx.release();
                    cb();
                });
            });
        }, 50);
        q2.drain = function() {
            //console.log('all mesh processed');
            callback();
        };
        if (!(result.root instanceof Object)) {
            //console.log('its not obj!', row.id);
            q2.push({ id: row.id, majorTopic: 'N', descriptorName: 'Null' }, function (err) {});
        }
        for (var i in result.root.MeshHeading) {
            // console.log('in loop', result.root.MeshHeading[i].DescriptorName);
            if (typeof result.root.MeshHeading[i].DescriptorName === 'undefined') {
                q2.push({ id: row.id, majorTopic: 'N', descriptorName: 'Emergency' }, function(err) {});
            }
            for (var j in result.root.MeshHeading[i].DescriptorName) {
                var descriptorName = result.root.MeshHeading[i].DescriptorName[j]._;
                var majorTopic = result.root.MeshHeading[i].DescriptorName[j].$.MajorTopicYN;
                q2.push({ id: row.id, majorTopic: majorTopic, descriptorName: descriptorName }, function (err) {});
            }
        }
    });
}

// here goes task serving logic
// if any async function should be finished before drain callback, push them into q
var q = async.queue(function (row, callback) {
    console.log('got id: ' + row.id);
    populateMesh(row, function() {
        callback();
    });
}, 10);

q.drain = function() {
    console.log('all items have been processed');
    conn.end(function(err) {
        console.log('connection ended');
    });
    pool.end(function(err) {
        console.log('pool closed');
    });
};

var truncate = conn.query('truncate abstract_mesh');
var select = conn.query('SELECT id, mesh_heading_list FROM pubtbl');
select.on('result', function(result) {
    // console.log(result);
    q.push(result, function (err) {
        //console.log('finished processing row');
    });
});
In my opinion the best solution is to make the code synchronous in a very easy way.
You could use the "synchronize" package.
Just
npm install synchronize
Then var sync = require('synchronize');
Put the logic that should be synchronous into a fiber by using
sync.fiber(function() {
    // put your logic here
});
An example for two mysql queries:
var express = require('express');
var bodyParser = require('body-parser');
var mysql = require('mysql');
var sync = require('synchronize');

var db = mysql.createConnection({
    host     : 'localhost',
    user     : 'user',
    password : 'password',
    database : 'database'
});

db.connect(function(err) {
    if (err) {
        console.error('error connecting: ' + err.stack);
        return;
    }
});

function saveSomething() {
    var post = { title: 'test' }; // example row for the main table
    // no callback here; the result is in "query"
    var query = sync.await(db.query('INSERT INTO mainTable SET ?', post, sync.defer()));
    var newId = query.insertId;
    post = { foreignKey: newId };
    // this query can be async, because it doesn't matter in this case
    db.query('INSERT INTO subTable SET ?', post, function(err, result) {
        if (err) throw err;
    });
}
When "saveSomething()" is called, it inserts a row in a main table and receives the last inserted id. After that the code below will be executed. No need for nesting promises or stuff like that.
This is what I did:
db.query(
    "select name from USER where name = ?",
    ["test"],
    (err, result) => {
        if (err) {
            console.log("Error : ", err);
        } else if (result.length <= 0) {
            res.json("Not Found");
        } else {
            console.log("name found, executing update query!");
            updateAgeIfUserFound("test", 25); // calling the function with the 2nd query (25 is an example age)
        }
    }
);

// Update age only if name is present
function updateAgeIfUserFound(name, age) {
    if (name) {
        db.query(
            "update USER set age = ? where name = ?",
            [age, name],
            (err, result) => {
                if (err) throw err;
                console.log("Age Updated");
                res.json("Age Updated");
            }
        );
    }
}
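As a side note, on newer Node versions (8 and up) the same check-then-update sequencing can be written with util.promisify and async/await; a sketch of that different technique, reusing the queries above:
const util = require('util');
// promisify mysql's callback-style query; bind so `this` stays the connection
const query = util.promisify(db.query).bind(db);

async function updateAge(name, age) {
    const rows = await query('select name from USER where name = ?', [name]);
    if (rows.length > 0) {
        // the second query only runs after the first has resolved
        await query('update USER set age = ? where name = ?', [age, name]);
        console.log('Age Updated');
    }
}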

NodeJS MySQL Dump

I've attempted to write a basic cron script to run and 'dump' a mysql database. For some reason, when it 'successfully saves the file', it does create the file, but it is empty. If instead of saving the file, I perform a console.log, it prints an empty string. Any thoughts on what I may be doing wrong?
Thanks in advance.
var mysql_backup = function() {
    this.backup = '';
    this.mysql = require('mysql');
    this.init = function() {
        this.connection = this.mysql.createConnection({
            user: 'root',
            password: 'root',
            database: 'test'
        });
    };
    this.query = function(sql, callback) {
        this.connection.query(sql, function (error, results, fields) {
            if (error) {
                throw error;
            }
            if (results.length > 0) {
                callback(results);
            }
        });
    };
    this.get_tables = function(callback) {
        var me = this;
        me.query('SHOW TABLES', function(tables) {
            for (var table in tables) {
                me.query(
                    'SHOW CREATE TABLE ' + tables[table].Tables_in_test,
                    function(r) {
                        for (var t in r) {
                            me.backup += "DROP TABLE " + r[t].Table + "\n\n";
                            me.backup += r[t]["Create Table"] + "\n\n";
                        }
                    }
                );
            }
            me.save_backup();
        });
    };
    this.save_backup = function() {
        var fs = require('fs');
        fs.writeFile("./backup_test.txt", this.backup, function(err) {
            if (err) {
                console.log(err);
            } else {
                console.log("The file was saved!");
            }
        });
    };
};

var db = new mysql_backup;
db.init();
db.get_tables();
db.connection.destroy();
The code as written didn't even get as far as saving a file for me. There seem to be a few issues; not sure if this is the actual code or if some things got lost in the copy-paste. However, based on what you've got:
A big one is that you never connect to the database in your code with connection.connect().
The code you want to run once connected should be inside the connection.connect() callback, e.g.:
connection.connect(function (err, empty) {
    if (err)
        throw new Error('Panic');
    // if no error, we are off to the races...
});
However, even if you quickly refactor your code to wrap your last lines inside that connection callback, you'll still have problems, because you are destroying the connection before the various SQL calls are made, so you will want to move that code into some sort of final callback.
Even after you do that, you'll still have an empty file, because you're calling save_backup from your 'SHOW TABLES' callback rather than after you have actually populated the backup property via the inner callback where you get the CREATE TABLE statements.
This is a minimal rewrite of your code which will do what you intend. An important thing to note is the "counter" which manages when to write the file and close the connection. I would make other changes if it were mine, including:
Using 'self' instead of 'me'
Using a numeric for loop rather than the for (... in ...) syntax
Having my own callbacks follow the Node convention of (err, stuff)
A more substantial change is that I would rewrite this to use promises, as doing so can spare you some grief with the confusion inherent in deeply nested callbacks. I personally like the Q library, but there are several options here.
Hope this helped.
var mysql_backup = function() {
    this.backup = '';
    this.mysql = require('mysql');
    this.init = function() {
        this.connection = this.mysql.createConnection({
            user     : 'root',
            password : 'root',
            database : 'test'
        });
    };
    this.query = function(sql, callback) {
        this.connection.query(sql, function (error, results, fields) {
            if (error) {
                throw error;
            }
            if (results.length > 0) {
                callback(results);
            }
        });
    };
    this.get_tables = function(callback) {
        var counter = 0;
        var me = this;
        this.query('SHOW TABLES', function(tables) {
            for (table in tables) {
                counter++;
                me.query(
                    'SHOW CREATE TABLE ' + tables[table].Tables_in_mvc,
                    function(r) {
                        for (t in r) {
                            me.backup += "DROP TABLE " + r[t].Table + "\n\n";
                            me.backup += r[t]["Create Table"] + "\n\n";
                        }
                        counter--;
                        if (counter === 0) {
                            me.save_backup();
                            me.connection.destroy();
                        }
                    }
                );
            }
        });
    };
    this.save_backup = function() {
        var fs = require('fs');
        fs.writeFile("./backup_test.txt", this.backup, function(err) {
            if (err) {
                console.log(err);
            } else {
                console.log("The file was saved!");
            }
        });
    };
};

var db = new mysql_backup;
db.init();
db.connection.connect(function (err) {
    if (err) console.log(err);
    db.get_tables(function(x) {});
});
Update: If you are curious, here is a heavily-commented implementation using promises. Note that without the comments explaining the Q promise library functions, it is somewhat shorter than the original version and also offers more comprehensive error handling.
var MysqlBackup = function(connectionInfo, filename) {
    var Q = require('q');
    var self = this;
    this.backup = '';
    // my personal preference is to simply require() inline if I am only
    // going to use something a single time. I am certain some will find
    // this a terrible practice
    this.connection = require('mysql').createConnection(connectionInfo);

    function getTables() {
        // return a promise from invoking the node-style 'query' method
        // of self.connection with parameter 'SHOW TABLES'.
        return Q.ninvoke(self.connection, 'query', 'SHOW TABLES');
    }

    function doTableEntries(theResults) {
        // note that because promises only pass a single parameter around,
        // if the 'denodeify-ed' callback has more than two parameters (the
        // first being the err param), the parameters will be stuffed into
        // an array. In this case, the content of the 'fields' param of the
        // mysql callback is in theResults[1]
        var tables = theResults[0];
        // create an array of promises resulting from another Q.ninvoke()
        // query call, chained to .then(). Note that then() expects a function,
        // so recordEntry() in fact builds and returns a new one-off function
        // for actually recording the entry (see recordEntry() impl. below)
        var tableDefinitionGetters = [];
        for (var i = 0; i < tables.length; i++) {
            // I noticed in your original code that your Tables_in_[] did not
            // match your connection details ('mvc' vs 'test'), but the below
            // should work and is a more generalized solution
            var tableName = tables[i]['Tables_in_' + connectionInfo.database];
            tableDefinitionGetters.push(Q.ninvoke(self.connection, 'query', 'SHOW CREATE TABLE ' + tableName)
                .then(recordEntry(tableName)));
        }
        // now that you have an array of promises, you can use Q.allSettled
        // to return a promise which will be settled (resolved or rejected)
        // when all of the promises in the array are settled. Q.all is similar,
        // but its promise will be rejected (immediately) if any promise in the
        // array is rejected. I tend to use allSettled() in most cases.
        return Q.allSettled(tableDefinitionGetters);
    }

    function recordEntry(tableName) {
        return function(createTableQryResult) {
            self.backup += "DROP TABLE " + tableName + "\n\n";
            self.backup += createTableQryResult[0][0]["Create Table"] + "\n\n";
        };
    }

    function saveFile() {
        // Q.denodeify returns a promise-enabled version of a node-style function;
        // the below is probably excessively terse with its immediate invocation
        return (Q.denodeify(require('fs').writeFile))(filename, self.backup);
    }

    // with the above all done, now you can actually make the magic happen,
    // starting with the promise-returning Q.ninvoke to connect to the DB.
    // note that the successive .then()s will be executed if and only if the
    // preceding item resolves successfully, .catch() will get executed in
    // the event of any upstream error, and finally() will get executed
    // no matter what.
    Q.ninvoke(this.connection, 'connect')
        .then(getTables)
        .then(doTableEntries)
        .then(saveFile)
        .then(function() { console.log('Success'); })
        .catch(function(err) { console.log('Something went awry', err); })
        .finally(function() { self.connection.destroy(); });
};

var myConnection = {
    host     : '127.0.0.1',
    user     : 'root',
    password : 'root',
    database : 'test'
};

// I have left this as a constructor-based calling approach, but the
// constructor just does it all, so I simply ignore the return value
new MysqlBackup(myConnection, './backup_test.txt');