I am trying to insert a couple of million records (with approximately 6 fields/columns each) by receiving requests from clients with 10,000 records per bulk insert attempt (using sequelize.js and bulkCreate()).
This obviously was a bad idea, so I tried looking into node-pg-copy-streams.
However, I do not want to change the client side, where a JSON array is sent like this:
# python
data = [
{
"column a":"a values",
"column b":"b values",
},
...
# 10,000 items
...
]
request.post(data=json.dumps(data), url=url)
On the server side in Node.js, how would I stream the received req.body in the following skeleton?
.post(function(req, res){
// old sequelize code
/* table5.bulkCreate(
req.body, {raw:true}
).then(function(){
return table5.findAll();
}).then(function(result){
res.json(result.count);
});*/
// new pg-copy-streams code
pg.connect(function(err, client, done) {
var stream = client.query(copyFrom('COPY my_table FROM STDIN'));
// My question is here, how would I stream or pipe the request body ?
// ?.on('error', done);
// ?.pipe(stream).on('finish', done).on('error', done);
});
});
Here's how I solved my problem.
First, a function to convert the req.body dicts to TSV (not part of the initial problem):
/**
 * Converts an array of dictionaries and a set of keys to a Tab Separated Value blob of text
 * @param {Array} dicts
 * @param {Array} keys
 * @return {String} concatenated Tab Separated Values
 */
function convertDictsToTSV(dicts, keys){
// ...
}
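The conversion body is elided above; a minimal sketch, assuming plain string or number values with no embedded tabs or newlines, could be:
// Minimal sketch only: one TSV row per dict, joining the values for the given
// keys with tabs. Real input may need escaping of tabs/newlines and NULL
// handling (COPY expects \N for NULL by default).
function convertDictsToTSV(dicts, keys) {
    return dicts.map(function(dict) {
        return keys.map(function(key) {
            return dict[key];
        }).join('\t');
    }).join('\n') + '\n';
}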
Second, the rest of my original .post function:
.post(function(req, res){
// ...
/* requires 'stream' as
* var stream = require('stream');
* var copyFrom = require('pg-copy-streams').from;
*/
var read_stream_string = new stream.Readable();
read_stream_string._read = function noop() {}; // no-op _read; data is pushed manually below
var keys = [...]; // set of dictionary keys to extract from req.body
read_stream_string.push(convertDictsToTSV(req.body, keys));
read_stream_string.push(null);
pg.connect(connectionString, function(err, client, done) {
// ...
// error handling
// ...
var copy_string = 'COPY tablename (' + keys.join(',') + ') FROM STDIN';
var pg_copy_stream = client.query( copyFrom( copy_string ) );
read_stream_string.pipe(pg_copy_stream).on('finish', function(finished){
// handle finished and done appropriately
}).on('error', function(errored){
// handle errored and done appropriately
});
});
pg.end(); // note: this closes the pool; consider calling it only after the copy has finished
});
Technically, there is no streaming here, not in terms of how NodeJS streaming works.
You are sending a chunk of 10,000 records each time and expect your server-side to insert those and return an OK to the client to send another 10,000 records. That's throttling/paging data in, not streaming.
Once your server has received the next 10,000 records, insert them (usually as a transaction), and then respond with OK back to the client so it can send the next 10,000 records.
Writing transactions with node-postgres isn't an easy task, as it is too low-level for that.
Below is an example of how to do that with the help of pg-promise:
function insertRecords(records) {
return db.tx(t=> {
var inserts = [];
records.forEach(r=> {
var query = t.none("INSERT INTO table(fieldA, ...) VALUES(${propA}, ...)", r);
inserts.push(query);
});
return t.batch(inserts);
});
}
Then inside your HTTP handler, you would write:
function myPostHandler(req, res) {
// var records = get records from the request;
insertRecords(records)
.then(data=> {
// set response as success;
})
.catch(error=> {
// set response as error;
});
}
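For completeness, db above is assumed to be an initialized pg-promise database object, along these lines (the connection string is a placeholder):
// Assumed pg-promise setup; replace the connection string with your own.
const pgp = require('pg-promise')();
const db = pgp('postgres://user:password@localhost:5432/mydb');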
I am writing a Firefox extension using the webRequest API. I am in a situation where, when I receive the response, I want to look back and find the request associated with this response, to retrieve a custom request header. My current code is like this:
browser.webRequest.onBeforeSendHeaders.addListener(
addCustomHeader, // here I add custom_header:value
{urls: ["<all_urls>"]},
["blocking", "requestHeaders"]
);
...
browser.webRequest.onHeadersReceived.addListener(
process_response, // here I want to get back to the request and retrieve the custom header value
{urls: ["<all_urls>"]},
["blocking", "responseHeaders"]
);
The value of custom_header is set as a global variable, which changes with each request. And when I receive the response of, say, request_1, I want to retrieve the header value from request_1 in the process_response() function. However, if I use the value directly, I find it may already have been changed by subsequent requests, say request_2 or request_3.
I noticed the response has a requestId property, and I guess I can use it to find the corresponding request. However, I am not able to find any documentation or example that tells me how. I'd appreciate any hint!
Use a global map variable:
const reqMap = (() => {
const data = new Map();
const MAX_AGE = 10 * 60e3; // 10 minutes
const cleanup = () => {
const cutOff = performance.now() - MAX_AGE;
data.forEach(({ time }, id) => time < cutOff && data.delete(id));
};
return {
set(id, value) {
cleanup();
data.set(id, {value, time: performance.now()});
},
pop(id) {
const {value} = data.get(id) || {};
data.delete(id);
return value;
},
};
})();
function onBeforeSendHeaders(details) {
reqMap.set(details.requestId, {any: 'data'});
}
function onHeadersReceived(details) {
const data = reqMap.pop(details.requestId);
if (data) {
// ............
}
}
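These two functions are then registered the same way as the listeners in the question, for example:
// Registration, mirroring the question's listeners but pointing at the
// handlers above that store and retrieve data via reqMap.
browser.webRequest.onBeforeSendHeaders.addListener(
    onBeforeSendHeaders,
    {urls: ["<all_urls>"]},
    ["blocking", "requestHeaders"]
);
browser.webRequest.onHeadersReceived.addListener(
    onHeadersReceived,
    {urls: ["<all_urls>"]},
    ["blocking", "responseHeaders"]
);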
I am writing a backend API in Node.js and need functionality for users to upload files with data and then call stored procedures to insert the data into MySQL. I'm thinking of using fast-csv as the parser; however, I am struggling with how to set up the call to the stored procedure inside the CSV stream. The idea is something like this:
var fs = require("fs");
var csv = require("fast-csv");
var stream1 = fs.createReadStream("files/testCsvFile.csv");
csv
.fromStream(stream1, { headers: true })
.on("data", function(data) {
//CALL TO SP with params from "data"//
numlines++;
})
.on("end", function() {
console.log("done");
});
In other parts of the application I have set up routes as follows:
auth.post("/verified", async (req, res) => {
var user = req.session.passwordless;
if (user) {
const rawCredentials = await admin.raw(getUserRoleCredentials(user));
const { user_end, role } = await normalizeCredentials(rawCredentials);
const user_data = { user_end, role };
res.send(user_data);
} else {
res.sendStatus(401);
}
});
...that is, routes are written in the async/await way, with queries (all of them stored procedure calls) defined as Promises. I would like to follow the same pattern in the upload / parse CSV / call SP for every line function.
This is doing the job for me. Can you please describe how to achieve that with your framework? I believe it can be done somehow, I just need to configure it correctly:
//use fast-csv to stream data from a file
csv
.fromPath(form.FileName, { headers: true })
.on("data", async data => {
const query = await queryBuilder({
schema,
routine,
parameters,
request
}); //here we prepare query for calling the SP with parameters from data
winston.info(query + JSON.stringify(data));
const rawResponse = await session.raw(query); //here the query gets executed
fileRows.push(data); // push each row - for testing only
})
.on("end", function() {
console.log(fileRows);
fs.unlinkSync(form.FileName); // remove temp file
//process "fileRows" and respond
res.end(JSON.stringify(fileRows)) // - for testing
});
As mentioned in the comment, I made my scramjet framework to handle such a use case with ease... Please correct me if I understood it wrong, but I understand you want to call the two await lines for every CSV row.
If so, your code would look like this (updated to match your comment/answer):
var fs = require("fs");
var csv = require("fast-csv");
var stream1 = fs.createReadStream("files/testCsvFile.csv");
var {DataStream} = require("scramjet");
DataStream
// the following line will convert any stream to scramjet.DataStream
.from(csv.fromStream(stream1, { headers: true }))
// the next line controls how many simultaneous operations are made
// I assumed 16, but if you're fine with 40 or you want 1 - go for it.
.setOptions({maxParallel: 16})
// the next line will call your async function and wait until it's completed
// and control the back-pressure of the stream
.do(async (data) => {
const query = await queryBuilder({
schema,
routine,
parameters,
request
}); //here we prepare query for calling the SP with parameters from data
winston.info(query + JSON.stringify(data));
const rawResponse = await session.raw(query); //here the query gets executed
return data; // push each row - for testing only
})
// next line will run the stream until end and return a promise
.toArray()
.then(fileRows => {
console.log(fileRows);
fs.unlinkSync(form.FileName); // remove temp file
//process "fileRows" and respond
res.end(JSON.stringify(fileRows)); // - for testing
})
.catch(e => {
res.writeHead(500); // some error handling
res.end(e.message);
})
;
// you may want to put an await statement before this, or call .then() to check
// for errors, which I assume is your use case.
To answer your comment question: if you were to use an async function in the on("data") handler, you would need to create an array of promises and await Promise.all of that array on stream end, because the stream calls the handler synchronously and does not wait for it, so an async function in an event handler alone won't do it.
In scramjet this happens under the hood, so you can use the function.
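For reference, the plain Promise.all approach described above (without scramjet), reusing the queryBuilder/session/stream1 names from your code, could be sketched like this; note that nothing limits concurrency here, every query starts as soon as its row arrives:
// Sketch only: collect one promise per row, then wait for all of them on "end".
var pending = [];
csv
    .fromStream(stream1, { headers: true })
    .on("data", data => {
        pending.push(
            queryBuilder({ schema, routine, parameters, request })
                .then(query => session.raw(query))
                .then(() => data)
        );
    })
    .on("end", () => {
        Promise.all(pending)
            .then(fileRows => res.end(JSON.stringify(fileRows)))
            .catch(e => { res.writeHead(500); res.end(e.message); });
    });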
Basically, I am setting up a web server via Node.js and Express (I am a beginner at this) to retrieve data by reading a JSON file.
For example, this is my data.json file:
[{
"color": "black",
"category": "hue",
"type": "primary"
},
{
"color": "red",
"category": "hue",
"type": "primary"
}
]
I am trying to retrieve all of the colors by implementing this code for it to display on localhost:
router.get('/colors', function (req, res) {
fs.readFile(__dirname + '/data.json', 'utf8', function (err, data) {
data = JSON.parse(data);
res.json(data); //this displays all of the contents of data.json
})
});
router.get('/colors:name', function (req, res) {
fs.readFile(__dirname + '/data.json', 'utf8', function (err, data) {
data = JSON.parse(data);
for (var i = 0; i < data.length; i++) {
res.json(data[i][1]); //trying to display the values of color
}
})
});
How do I go about doing this?
What you are trying to do is actually pretty simple once you break it into smaller problems. Here is one way to break it down:
Load your JSON data into memory for use by your API.
Define an API route which extracts only the colours from your JSON data and sends them to the client as a JSON.
var data = [];
try {
data = JSON.parse(fs.readFileSync('/path/to/json'));
} catch (e) {
// Handle JSON parse error or file not exists error etc
data = [{
"color": "black",
"category": "hue",
"type": "primary"
},
{
"color": "red",
"category": "hue",
"type": "primary"
}
]
}
router.get('/colors', function (req, res, next) {
var colors = data.map(function (item) {
return item.color
}); // This will look like: ["black","red"]
res.json(colors); // Send your array as a JSON array to the client calling this API
})
Some improvements in this method:
The file is read only once synchronously when the application is started and the data is cached in memory for future use.
Using Array.prototype.map to extract an array of colors from the data.
Note:
You can structure the array of colors however you like and send it down as a JSON in that structure.
Examples:
var colors = data.map(function(item){return {color:item.color};}); // [{"color":"black"},{"color":"red"}]
var colors = {colors: data.map(function(item){return item.color;})} // { "colors" : ["black" ,"red"] }
Some gotchas in your code:
You are using res.json in a for loop, which is incorrect, as the response should only be sent once. Ideally, you would build the JS object in the structure you need by iterating over your data, and then send the completed object once with res.json (which, I'm guessing, internally JSON.stringifies the object and sends it as the response after setting the correct headers).
Reading files is an expensive operation. If you can afford to read the file once and cache that data in memory, it will be more efficient (provided your data is not prohibitively large, in which case using files to store it might be inefficient to begin with).
In Express, you can do it this way:
router.get('/colors/:name', (req, res) => {
const key = req.params.name
const content = fs.readFileSync(__dirname + '/data.json', 'utf8')
const data = JSON.parse(content)
const values = data.reduce((values, value) => {
values.push(value[key])
return values
}, [])
// values => ['black', 'red']
res.send(values)
});
and then curl http://localhost/colors/color,
and you will get ['black', 'red'].
What you're looking to do is:
res.json(data[i]['color']);
If you don't really want to use the keys in the JSON, you may want to use the Object.values function.
...
data = JSON.parse(data)
var values = []
for (var i = 0; i < data.length; i++) {
values.push(Object.values(data[i])[0]) // 0 - color, 1 - category, 2 - type
}
res.json(values) // ["black","red"]
...
You should never use fs.readFileSync in production. Any sync function will block the event loop until its execution is complete, hence delaying everything that comes afterwards (use with caution if deemed necessary). A few days back I had the worst experience myself and learned that the hard way.
In Express you can define a route with a param or query and use that to filter the contents inside the fs.readFile callback function.
/**
 * get color by name
 *
 * @param {String} name name of the color
 * @return {Array} array of the color data matching param
 */
router.get('/colors/:name', (req, res) => {
const color = req.params.name
const filename = __dirname + '/data.json';
fs.readFile(filename, 'utf8', (err, data) => {
if(err){
return res.send([]); // handle any error returned by readFile function here
}
try{
data = JSON.parse(data); // parse the JSON string to array
let filtered = []; // initialise empty array
if(data.length > 0){ // we got an ARRAY of objects, right? make your check here for the array or else any map, filter, reduce, forEach function will break the app
filtered = data.filter((obj) => {
return obj.color === color; // return the object if the condition is true
});
}
return res.send(filtered); // send the response
}
catch(e){
return res.send([]); // handle any error returned from JSON.parse function here
}
});
});
To summarise, use the asynchronous fs.readFile function so that the event loop is not clogged up. Inside the callback, parse the content and then return the response. The return is really important, or else you might end up getting Error: Can't set headers after they are sent.
DISCLAIMER: The code above is untested but should work. It is just to demonstrate the idea.
I don't think you can access JSON values without a key. You can use a for...in loop, for example for (var name in object) {}; look into for...in, it may help you.
I have a unique situation here which I am having trouble solving in an elegant fashion.
A user passes up an array of signals which they want to export data for. This array can contain anywhere from 1 to any number of signals, so first I go fetch the table names (each signal stores its data in a separate table) based on the signals passed, and store those in an object.
The next step is to iterate over that object (which contains the table names I need to query), execute the query for each table, and store the results in an object which will be passed to the next link in the promise chain. I haven't seen any examples online of good ways to handle this, but I know it's a fairly unique scenario.
My code prior to attempting to add support for arrays of signals was simply the following:
exports.getRawDataForExport = function(data) {
return new Promise(function(resolve, reject) {
var getTableName = function() {
return knex('monitored_parameter')
.where('device_id', data.device_id)
.andWhere('internal_name', data.param)
.first()
.then(function(row) {
if(row) {
var resp = {"table" : 'monitored_parameter_data_' + row.id, "param" : row.display_name};
return resp;
}
});
}
var getData = function(runningResult) {
return knexHistory(runningResult.table)
.select('data_value as value', 'unit', 'created')
.then(function(rows) {
runningResult.data = rows;
return runningResult;
});
}
var createFile = function(runningResult) {
var fields = ['value', 'unit', 'created'],
csvFileName = filePathExport + runningResult.param + '_export.csv',
zipFileName = filePathExport + runningResult.param + '_export.gz';
var csv = json2csv({data : runningResult.data, fields : fields, doubleQuotes : ''});
fs.writeFileSync(csvFileName, csv);
// create streams for gZipping
var input = fs.createReadStream(csvFileName);
var output = fs.createWriteStream(zipFileName);
// gZip
input.pipe(gzip).pipe(output);
return zipFileName;
}
getTableName()
.then(getData)
.then(createFile)
.then(function(zipFile) {
resolve(zipFile);
});
});
}
Obviously that works fine for a single table, and I have already updated the getTableName() and createFile() methods to handle arrays of data, so this question only pertains to the getData() method.
Cheers!
This kind of problem is far from unique and, approached the right way, is very simply solved.
Don't rewrite any of the three internal functions.
Just purge the explicit promise construction antipattern from .getRawDataForExport() such that it returns a naturally occurring promise and propagates asynchronous errors to the caller.
return getTableName()
.then(getData)
.then(createFile);
Now, .getRawDataForExport() is the basic building-block for your multiple "gets".
Then comes a design choice: parallel versus sequential operations. Both are very well documented.
Parallel:
exports.getMultiple = function(arrayOfSignals) {
return Promise.all(arrayOfSignals.map(getRawDataForExport));
};
Sequential:
exports.getMultiple = function(arrayOfSignals) {
return arrayOfSignals.reduce(function(promise, signal) {
return promise.then(function() {
return getRawDataForExport(signal);
});
}, Promise.resolve());
};
In the first instance, for best potential performance, try parallel.
If the server chokes, or is likely ever to choke, on parallel operations, choose sequential.
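Usage would then look roughly like this; the signal objects shown are hypothetical, shaped after the device_id/param fields that getTableName() expects:
// Hypothetical usage sketch of the parallel variant. Promise.all resolves
// with the zip file names in the same order as the input signals.
exports.getMultiple([
    { device_id: 1, param: 'temperature' },
    { device_id: 1, param: 'humidity' }
])
.then(function(zipFiles) {
    console.log(zipFiles); // e.g. two *_export.gz paths
})
.catch(function(err) {
    // database or file-system errors propagate here
    console.error(err);
});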
I have a snippet of Express code.
Below, what I am trying to do is pass the table name into keyName by extracting it from the request.
But I am facing a deadlock.
I wanted to know whether I am following the proper protocol for the JSON response.
[Part-of-Express-Code]
app.get('/RestaurantDesc/:Key',function(request,response,next){
var keyName=request.query.Key;
var name_of_restaurants, RestaurantTimings;
async.series( [
// Get the first table contents
function ( callback ) {
connection.query('SELECT * FROM ',keyName, function(err, rows, fields)
{
console.log('Connection result error '+err);
name_of_restaurants = rows;
callback();
});
},
// Get the second table contents
function ( callback ) {
connection.query('SELECT * FROM RestaurantTimings', function(err, rows, fields)
{
console.log('Connection result error '+err);
RestaurantTimings = rows;
callback();
});
}
// Send the response
], function ( error, results ) {
response.json({
'restaurants' : name_of_restaurants,
'RestaurantTimings' : RestaurantTimings
});
} );
} );
I am getting the output as Cannot GET /RestaurantDesc/
Any ideas?
Your route is a path that you can access through a GET request.
For example, you should be able to access it through
http://example.com/RestaurantDesc/anyKeyHere
and in your code you have
var keyName = request.query.Key
req.query contains query string variables; see http://expressjs.com/api.html#req.query
So your keyName variable won't contain anyKeyHere.
req.params.Key will contain the value anyKeyHere,
but you will need to pass it in the URL path.
If you need to pass the key data in the query string, you can do this:
app.get('/RestaurantDesc',function(request,response,next){
var keyName=request.query.Key;
});
and pass the key like this in your URL:
http://example.com/RestaurantDesc/?Key=restaurantKeyHere
Try going through the guide on the Express site to understand routing and how it works.
If you are getting "Cannot GET /RestaurantDesc/", it is because you have not set up this route; try /RestaurantDesc/something. request.query is used for search terms, i.e. things that come after a question mark in a URL. Use request.params.Key instead.
Also, as a best practice, you should lowercase resource names and use the shorter req/res instead of request/response, as in the sketch below.
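A hypothetical cleaned-up route along those lines (the two queries themselves stay as in the question) might look like:
// Sketch only: lowercase resource name, req/res shorthands, req.params for the path parameter.
app.get('/restaurantdesc/:key', function (req, res, next) {
    var keyName = req.params.key;
    // run the two queries via async.series as before, then:
    // res.json({ restaurants: name_of_restaurants, restaurantTimings: RestaurantTimings });
});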