I have written a small script in Node.js to scrape a web page and get some links. The scraping part is done with Cheerio. My code is here (simplified for space):
// Scrapes the company list page and collects every company's name and link
// into `mutuals.companies`.
var request = require('request');
var cheerio = require('cheerio');
var base_url = 'http://www.naftemporiki.gr/finance/';
// Result document: a timestamp plus the list of scraped companies.
var mutuals = {};
mutuals.date = new Date();
mutuals.companies = [];
// Shared scratch variable; reassigned to a fresh object for every link found.
var company = {};
// The function passed to request() is a callback: it runs only once the HTTP
// response has arrived, i.e. after the rest of this script has already run.
request(base_url + 'mtfCompanies', function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
// One entry per anchor matched by the selector.
$('.blueRow.texttd.name a').each(function (i, element) {
var a = $(this);
company = {};
company.name = a.text();
company.link = a.attr('href');
mutuals.companies.push(company);
});
}
//console.log(mutuals); // 1st place
});
// This statement executes right after request() has been *started*, before its
// callback has pushed anything, so `companies` is still empty here.
console.log(mutuals); // 2nd place
Here comes the fun part: When I try to output the JSON document from the "1st place", inside the 'request' block, it comes out nice and true. An example is here:
{ date: Wed Nov 26 2014 10:35:09 GMT+0200 (EET),
companies:
[ { name: ' J.P. MORGAN ASSET MANAGEMENT',
link: 'mtfCompany?id=J.P.+MORGAN+ASSET+MANAGEMENT' },
{ name: ' BNP PARIBAS INVESTMENT PARTNERS',
link: 'mtfCompany?id=BNP+PARIBAS+INVESTMENT+PARTNERS' },
{ name: ' PICTET', link: 'mtfCompany?id=PICTET' },
{ name: ' ALLIANZ ΑΕΔΑΚ',
link: 'mtfCompany?id=ALLIANZ+%ce%91%ce%95%ce%94%ce%91%ce%9a' },
{ name: ' ALLIANZ ΑΕΔΑΚ (ΑΝΤΙΠΡ.)',
link: 'mtfCompany?id=ALLIANZ+%ce%91%ce%95%ce%94%ce%91%ce%9a+(%ce%91%ce%9d%ce%a4%ce%99%ce%a0%ce%a1.)' },
{ name: ' ALLIANZ ΕΛΛΑΣ Α.Ε.',
link: 'mtfCompany?id=ALLIANZ+%ce%95%ce%9b%ce%9b%ce%91%ce%a3+%ce%91.%ce%95.' }]}
When I try to output the JSON document from the "2nd place", outside of ANY block and at the end of execution, this is what I get:
{ date: Wed Nov 26 2014 10:35:09 GMT+0200 (EET), companies: [] }
It looks like the 'companies' array inside the JSON document gets emptied. I have a suspicion that the 'mutuals.companies = [];' line gets executed again for some reason.
Can anyone help with this?
UPDATE 1:
Changed my code as suggested to use 'async.series...'. This is the updated version:
var request = require('request');
var async = require('async');
var cheerio = require('cheerio');

var base_url = 'http://www.naftemporiki.gr/finance/';

// Result document: a timestamp plus the list of scraped companies.
var mutuals = { date: new Date(), companies: [] };
// Shared scratch variable; holds the most recently scraped company.
var company = {};

async.series([
  // Task 1: start the download and collect name/link for every company anchor.
  function (callback) {
    var url = base_url + 'mtfCompanies';

    request(url, function (error, response, html) {
      if (error || response.statusCode != 200) {
        return;
      }

      var $ = cheerio.load(html);
      $('.blueRow.texttd.name a').each(function () {
        var anchor = $(this);
        company = { name: anchor.text(), link: anchor.attr('href') };
        mutuals.companies.push(company);
      });
    });

    // NOTE: this runs as soon as request() has been started, not when its
    // callback has finished, so the series moves on to task 2 immediately.
    callback(null, 'one');
  },

  // Task 2: print whatever has been collected so far.
  function (callback) {
    console.log(mutuals);
    callback(null, 'two');
  }
]);
Still does not work. Still the JSON outputted is this:
{ date: Wed Nov 26 2014 10:35:09 GMT+0200 (EET), companies: [] }
Your "2nd place" is printing the variable before the request finishes.
Your "1st place" works because it's located inside the callback of the request. The request is made, the data is pulled, the callback then gets called and is successfully printed.
This is the way asynchronous code works. Nothing blocks. So when you issue your request, node stores the callback function so that it can execute code with the results of the request.
Update 1:
The issue with your update is mostly the same. In your first function in the series, the callback gets called before request has finished. If you move callback into the function passed to request then it gets called after request finishes.
function(callback) {
request(base_url + 'mtfCompanies', function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
$('.blueRow.texttd.name a').each(function (i, element) {
var a = $(this);
company = {};
company.name = a.text();
company.link = a.attr('href');
mutuals.companies.push(company);
});
callback(null, 'one');
}
});
},
Suggestion 1
Developing in node.js with callbacks can leave you with a deep nested structure. Don't let your if statements make nesting worse. Use early returns instead of deeper nesting. Example:
function(callback) {
request(base_url + 'mtfCompanies', function (error, response, html) {
if(error) return callback(error);
if(response.statusCode !== 200) return callback('status code not 200');
var $ = cheerio.load(html);
$('.blueRow.texttd.name a').each(function (i, element) {
var a = $(this);
company = {};
company.name = a.text();
company.link = a.attr('href');
mutuals.companies.push(company);
});
callback(null, 'one');
});
},
Suggestion 2
When using async it can help to simplify things by using named functions. Example:
var request = require('request'),
    async = require('async'),
    cheerio = require('cheerio');

var base_url = 'http://www.naftemporiki.gr/finance/';

// Result document: a timestamp plus the list of scraped companies.
var mutuals = {};
mutuals.date = new Date();
mutuals.companies = [];
var company = {};

/**
 * Downloads the company list page and appends one {name, link} object per
 * company anchor to `mutuals.companies`.
 *
 * @param {function(?Error, string=)} callback - called with an Error when the
 *   request fails or the status is not 200, otherwise with (null, 'one').
 */
function getPage(callback) {
  request(base_url + 'mtfCompanies', function (error, response, html) {
    if (error) return callback(error);
    // Pass an Error object rather than a bare string.
    if (response.statusCode !== 200) return callback(new Error('status code not 200'));

    var $ = cheerio.load(html);
    $('.blueRow.texttd.name a').each(function (i, element) {
      var a = $(this);
      company = {};
      company.name = a.text();
      company.link = a.attr('href');
      mutuals.companies.push(company);
    });
    callback(null, 'one');
  });
}

/**
 * Prints the collected result document.
 *
 * @param {function(?Error, string=)} callback - called with (null, 'two').
 */
function printMutuals(callback) {
  console.log(mutuals);
  callback(null, 'two');
}

// The final callback receives the first error reported by any task; without it
// a failed download would end the series silently.
async.series([
  getPage,
  printMutuals
], function (err) {
  if (err) {
    console.error(err);
  }
});
Related
So I am learning about the html-to-react package, and for the most part I understand it. However, there is a piece of code I just cannot get my head around. The code is:
// Parses an HTML string into React elements, replacing the children of the
// node marked data-test="foo" with a single <h1>, and checks the rendered
// markup.
var assert = require('assert');
var React = require('react');
// ReactDOMServer and assert are used at the bottom of the script, so they have
// to be required explicitly; otherwise the script stops with a ReferenceError.
var ReactDOMServer = require('react-dom/server');
var HtmlToReact = require('html-to-react');
var HtmlToReactParser = HtmlToReact.Parser;

var htmlToReactParser = new HtmlToReactParser();
var htmlInput = '<div><div data-test="foo"><p>Text</p><p>Text</p></div></div>';
var htmlExpected = '<div><div data-test="foo"><h1>Heading</h1></div></div>';

// Accept every node.
var isValidNode = function () {
  return true;
};

var processNodeDefinitions = new HtmlToReact.ProcessNodeDefinitions(React);

// Order matters. Instructions are processed in
// the order they're defined
var processingInstructions = [
  {
    // This is REQUIRED, it tells the parser
    // that we want to insert our React
    // component as a child
    replaceChildren: true,
    // Applies only to nodes that have attributes and whose data-test
    // attribute is exactly 'foo'.
    shouldProcessNode: function (node) {
      return node.attribs && node.attribs['data-test'] === 'foo';
    },
    processNode: function (node, children, index) {
      return React.createElement('h1', {key: index}, 'Heading');
    }
  },
  {
    // Anything else
    shouldProcessNode: function (node) {
      return true;
    },
    processNode: processNodeDefinitions.processDefaultNode
  }
];

var reactComponent = htmlToReactParser.parseWithInstructions(
  htmlInput, isValidNode, processingInstructions);
var reactHtml = ReactDOMServer.renderToStaticMarkup(
  reactComponent);
assert.equal(reactHtml, htmlExpected);
The code i don't understand is the:
// Predicate deciding whether this processing instruction applies to `node`.
// It yields a truthy value only when `node.attribs` exists AND its 'data-test'
// entry is exactly 'foo'. The `node.attribs &&` part short-circuits first, so a
// node without an `attribs` object gives a falsy result instead of throwing on
// the property lookup. In the sample input this matches <div data-test="foo">.
shouldProcessNode: function (node) {
return node.attribs && node.attribs['data-test'] === 'foo';
},
Any help would be very appreciated. Thanks
We are trying to download a 7 GB file from a CDN using JSZip.js. Chrome seems to close the connection every time the download reaches 3.5 GB, which is after roughly 15 minutes. Is there a way to increase the timeout to, say, 1 hour?
// Click handler: downloads every file listed in `result`, packs them into one
// zip archive in the browser and hands the archive to FileSaver's saveAs().
$("#downloadJSZip").on('click', function () {
  var result = [{ "cdn": "url....", "filename": "7.84 gb.zip", "size": 4194304, "path": "7.84 gb" }];

  // Fall back to the Promise implementation bundled with JSZip.
  var Promise = window.Promise;
  if (!Promise) {
    Promise = JSZip.external.Promise;
  }

  /**
   * Downloads `url` as binary content.
   * @param {string} url
   * @returns {Promise} resolves with the data, rejects with the download error.
   */
  function urlToPromise(url) {
    return new Promise(function (resolve, reject) {
      JSZipUtils.getBinaryContent(url, function (err, data) {
        if (err) {
          reject(err);
        } else {
          resolve(data);
        }
      });
    });
  }

  // Names already added to the archive, used to detect duplicates.
  var fileNameArray = [];

  /**
   * Builds a unique variant of `fileName` by inserting "--<j>" before the
   * extension, increasing `j` until the name is not in `fileNameArray`.
   * @param {string} fileName - original (colliding) name.
   * @param {number} j - first suffix number to try.
   * @returns {string} a name not yet present in `fileNameArray`.
   */
  function changeFileName(fileName, j) {
    var i = fileName.lastIndexOf('.');
    if (i === -1) {
      // No extension: append the suffix instead of cutting off the last
      // character (slice(0, -1) would drop it).
      i = fileName.length;
    }
    var newfilename = fileName.slice(0, i) + "--" + j + fileName.slice(i);
    if (fileNameArray.indexOf(newfilename) != -1) {
      // The result of the retry must be returned; otherwise the caller gets
      // the name that was just found to collide.
      return changeFileName(fileName, j + 1);
    }
    return newfilename;
  }

  var zip = new JSZip();

  // find every checked item
  result.forEach(function (file) {
    var filename = file.filename;
    if (fileNameArray.indexOf(filename) != -1) {
      filename = changeFileName(filename, 1);
    }
    fileNameArray.push(filename);
    var url = file.cdn;
    var folder = (file.path);
    zip.folder(folder).file(filename, urlToPromise(url), {binary: true});
  });

  // when everything has been downloaded, we can trigger the dl
  zip.generateAsync({type: "blob"}, function updateCallback(metadata) {
    var msg = "progression : " + metadata.percent.toFixed(2) + " %";
    if (metadata.currentFile) {
      msg += ", current file = " + metadata.currentFile;
    }
    console.log(msg);
    console.log(metadata.percent | 0);
  })
  .then(function callback(blob) {
    // see FileSaver.js
    saveAs(blob, $scope.folderName + ".zip");
  }, function (e) {
    // Surface download/zip failures instead of swallowing them.
    console.error("Failed to build the zip archive:", e);
  });
});
Is this chrome browser configuration?
I created a test server that sends chunks of stringified JSON. When I connect to the server it sends invalid JSON and for the life of me I can't figure out why. The output adds an extra double quotation mark.
Server code:
const net = require('net'),
server = net.createServer(function(connection) {
console.log('subscriber connected.');
// send first chunk immediately
connection.write('{"type":"changed","file":"targ"');
let timer = setTimeout(function() {
connection.write('et.txt","timestamp":1358175758495}' + '\n');
connection.end();
}, 1000);
connection.on('end', function() {
clearTimeout(timer);
console.log('subscriber disconnected');
});
});
server.listen(5432, function() {
console.log('test server listening for subs...')
});
ldj.js
'use strict';
const
events = require('events'),
util = require('util'),
// client constructor
LDJClient = function(stream) {
events.EventEmitter.call(this);
let self = this;
let buffer = '';
stream.on('data', function(data) {
buffer += data;
console.log(buffer)
let boundary = buffer.indexOf('\n');
while(boundary !== -1) {
let input = buffer.substr(0, boundary);
buffer = buffer.substr(boundary + 1);
//self.emit('message', JSON.parse(input));
boundary = buffer.indexOf('\n');
}
});
};
util.inherits(LDJClient, events.EventEmitter);
// expose module methods
exports.LDJClient = LDJClient;
exports.connect = function(stream) {
return new LDJClient(stream);
};
Output:
{"type":"changed","file":"targ"
{"type":"changed","file":"targ"et.txt","timestamp":1358175758495}
That extra " should not be in "target.txt" value. Any ideas?
TIA
Rather than splitting the string manually, start from the whole string, split it into chunks, and then send them:
// Sends one JSON message to `connection` in 10-character chunks, one chunk per
// second, and closes the connection after the last chunk.
var data = '{"type":"changed","file":"target.txt","timestamp":1358175758495}';
var chunks = data.match(/.{1,10}/g); // 10 symbol chunks
var timers = [];

// forEach gives every timer its own `chunk` and `i`. With a `var` loop all
// timers would share the final values, so the last chunk would be written
// repeatedly and the "last chunk" test would never succeed.
chunks.forEach(function (chunk, i) {
  var isLast = i === chunks.length - 1;
  timers.push(setTimeout(function () {
    if (connection) {
      // Only the final chunk carries the '\n' terminator; a newline after
      // every chunk would split the message into invalid JSON fragments.
      connection.write(isLast ? chunk + '\n' : chunk);
      if (isLast) {
        connection.end();
      }
    }
  }, i * 1000));
});

connection.on('end', function () {
  // Do not write to a connection that has already ended.
  timers.forEach(clearTimeout);
  console.log('subscriber disconnected');
});
Trying to get API data.
I have a problem creating valid JSON after modifying the data.
The data should look like this: [{"1"},{"2"},{"3"}, ... ,{201},{202},{203}, ...]
but at the moment it looks like this: [{"1"},{"2"},{"3"}, ...],[{"201"},{"202"},{"203"}, ...]
Where is my mistake?
var Promise = require("bluebird");
var request = require('bluebird').promisifyAll(require('request'));
var fs = Promise.promisifyAll(require('fs'));
// Ladder names that get interpolated into the request URL by setUrls();
// only `hardcore` is used by this script.
var ladders = {"hardcore":"hardcore", "standard":"standard"};
/**
 * Downloads every URL (at most 10 at a time), converts each response's
 * `entries` to a pretty-printed JSON string and writes the collected strings
 * to file.json.
 *
 * NOTE: each URL contributes its own stringified array, so what reaches the
 * file is a list of separate arrays rather than one combined array.
 *
 * @param {string[]} urls - ladder page URLs to download.
 */
function getJSONsync(urls) {
  var ladder = [];

  // Turns one response into the JSON text of its entries.
  function toJson(res, body) {
    if (res.statusCode != 200) {
      throw new Error('Unsuccessful attempt. Code: '+ res.statusCode);
    }
    var entries = JSON.parse(body).entries;
    var combined = ladder.concat(entries);
    return JSON.stringify(combined, "", 4);
  }

  // Fetches one page; failures are logged and yield undefined.
  function fetchPage(url) {
    return request.getAsync(url).spread(toJson).catch(console.error);
  }

  var pages = Promise.map(urls, fetchPage, { concurrency: 10 });

  pages.then(function (arr) {
    fs.writeFileAsync('file.json', arr);
  });
}
/**
 * Builds the list of ladder page URLs, one per page of `limit` entries.
 *
 * @param {string} ladderName - ladder name placed in the URL path.
 * @param {number} offset - offset of the first page.
 * @param {number} limit - page size; also the distance between two offsets,
 *   so pages neither overlap nor leave gaps. Falls back to a step of 200 when
 *   it is not a positive number, which keeps the loop finite.
 * @param {number} [maxOffset=15000] - offsets at or above this are not requested.
 * @returns {string[]} the page URLs in ascending offset order.
 */
function setUrls(ladderName, offset, limit, maxOffset) {
  var step = Number(limit) > 0 ? Number(limit) : 200;
  var end = maxOffset === undefined ? 15000 : maxOffset;
  var arr = [];
  while (offset < end) {
    arr.push('http://api.pathofexile.com/ladders/' + ladderName + '?offset=' + offset + '&limit=' + limit);
    offset = offset + step;
  }
  return arr;
}
// Entry point: build the hardcore ladder URLs (first offset 0, limit=200) and download them.
getJSONsync(setUrls(ladders.hardcore, 0, 200));
Thx for help.
Sorry for my Eng.
Finally:
var Promise = require("bluebird");
var request = require('bluebird').promisifyAll(require('request'));
var fs = Promise.promisifyAll(require('fs'));
// Ladder names that get interpolated into the request URL by setUrls();
// only `hardcore` is used by this script.
var ladders = {"hardcore":"hardcore","standard":"standard"};
/**
 * Downloads every URL (at most 10 at a time), merges the `entries` arrays of
 * all responses into one array and writes it to file.json as pretty-printed
 * JSON.
 *
 * A failed request is logged and contributes no entries.
 *
 * @param {string[]} urls - ladder page URLs to download.
 * @returns {Promise} settles once file.json has been written, so callers can
 *   wait for completion or handle a write error.
 */
function getJSONsync(urls) {
  return Promise.map(urls, function (url) {
    return request
      .getAsync(url)
      .spread(function (res, body) {
        if (res.statusCode != 200) {
          throw new Error('Unsuccessful attempt. Code: '+ res.statusCode);
        }
        return JSON.parse(body).entries;
      })
      .catch(function (err) {
        console.error(err);
        // Contribute nothing for this page; returning undefined would end up
        // as an `undefined` element in the merged array.
        return [];
      });
  }, { concurrency: 10 })
  // The initial value makes the merge well-defined for an empty URL list.
  .reduce(function (a, b) { return a.concat(b); }, [])
  .then(function (arr) {
    var written = fs.writeFileAsync('file.json', JSON.stringify(arr, null, 4));
    console.log(arr.length);
    // Return the write promise so a failed write is not lost.
    return written;
  });
}
/**
 * Builds the list of ladder page URLs, one per page of `limit` entries.
 *
 * @param {string} ladder - ladder name placed in the URL path.
 * @param {number} offset - offset of the first page.
 * @param {number} limit - page size; also the distance between two offsets,
 *   so pages neither overlap nor leave gaps. Falls back to a step of 200 when
 *   it is not a positive number, which keeps the loop finite.
 * @param {number} [maxOffset=15000] - offsets at or above this are not requested.
 * @returns {string[]} the page URLs in ascending offset order.
 */
function setUrls(ladder, offset, limit, maxOffset) {
  var step = Number(limit) > 0 ? Number(limit) : 200;
  var end = maxOffset === undefined ? 15000 : maxOffset;
  var arr = [];
  while (offset < end) {
    arr.push('http://api.pathofexile.com/ladders/' + ladder + '?offset=' + offset + '&limit=' + limit);
    offset = offset + step;
  }
  return arr;
}
// Entry point: build the hardcore ladder URLs (first offset 0, limit=200) and download them.
getJSONsync(setUrls(ladders.hardcore, 0, 200));
Promise.map resolves to an array with one result per URL, and each of your results is itself an array (the output of ladder.concat), so you end up with an array of arrays: [[{"1"}, {"2"}, ...], [{"201"}, {"202"}, ...]]
You should just remove concat:
return JSON.stringify(JSON.parse(body).entries, "", 4);
But if you want to keep using the ladder variable, you can call ladder.push(JSON.stringify(JSON.parse(body).entries, "", 4)) and then use ladder instead of the arr value passed to then.
Okay, so here is a part of my casperjs script below which works fine
// When the `ac1` selector exists: open the send form, fill it in, submit it,
// then save a screenshot and the resulting HTML. Otherwise stop the script.
if(casper.exists(ac1)){
var uel = "https://example.ws/send.html?f=1099817";
this.thenOpen(uel, function() {
// Wait 10 s after opening the page before touching the form.
casper.wait(10000, function() {
casper.then(function() {
// The function passed to evaluate() fills in the form fields via `document`.
this.evaluate(function() {
var amount = 0.29
// 1.9% of the amount (presumably a fee - confirm), rounded to 6 decimals.
var result = amount * 0.019
var result2 = result.toFixed(6);
// toFixed() returns a string; the subtraction converts it back to a number.
var fresult = amount - result2;
// Amount left after the deduction, formatted with 3 decimals.
var needed = fresult.toFixed(3);
document.getElementById('account').value = 'ydfg028';
document.getElementsByName('data')[0].value = needed;
});
this.click("input#sbt.button[type='submit']");
// Wait another 10 s after submitting, then record the result page.
casper.wait(10000, function() {
casper.then(function() {
this.capture("filenadfgmedsfg.jpg");
var el2 = this.getHTML();
fs.write('results23.html', el2, 'w');
});
});
});
});
});
} else {
this.exit();
}
The problem is that I have more than 14 of the following statements:
if(casper.exists()){
So what I am trying to do, is use the casperjs steps as a function. This is what I have tried below, but it just does nothing and casperjs ends when it reaches the function. Here's what I am trying
This is the casperjs function I have made
/**
 * Opens the send form for `location`, fills in `user` and the amount left
 * after deducting 1.9% of `amount`, submits the form, then saves a screenshot
 * and the resulting HTML.
 *
 * @param {number} amount - gross amount before the 1.9% deduction.
 * @param {string} user - value written into the 'account' field.
 * @param {string} location - value appended to the URL as the `f` parameter.
 */
function casperstep(amount, user, location) {
var uel = "https://example.ws/send.html?f=" + location;
// NOTE(review): casperstep is invoked as a plain function (casperstep(...)),
// so `this` here is not the casper instance; `casper.thenOpen` is probably
// what is intended - confirm.
this.thenOpen(uel, function() {
casper.wait(10000, function() {
casper.then(function() {
// NOTE(review): the function passed to evaluate() appears to run inside the
// page (it uses `document`); if so, `amount` and `user` from the enclosing
// scope are not visible there and would have to be passed as extra
// arguments to evaluate() - verify against the CasperJS documentation.
this.evaluate(function() {
var result = amount * 0.019
var result2 = result.toFixed(6);
// toFixed() returns a string; the subtraction converts it back to a number.
var fresult = amount - result2;
var needed = fresult.toFixed(3);
document.getElementById('account').value = user;
document.getElementsByName('data')[0].value = needed;
});
this.click("input#sbt.button[type='submit']");
// Wait 10 s after submitting, then record the result page.
casper.wait(10000, function() {
casper.then(function() {
this.capture("filenadfgmedsfg.jpg");
var el2 = this.getHTML();
fs.write('results23.html', el2, 'w');
});
});
});
});
});
}
Then when I try the following
// Plain function call: inside casperstep, `this` is therefore not bound to the
// casper instance.
if(casper.exists(ac1)){
casperstep(0.29, "username", "3245324");
}
it just does not work at all. The casper steps just do not fire. How can I fix this in theory? It should have worked.
What I have been trying with your answers...
My function
/**
 * Downloads the image referenced by the `src` attribute of the element matched
 * by `selector`.
 *
 * The target file name is the part of the src after its first '?' plus
 * ".jpg"; when the src has no query part, "captcha.jpg" is used.
 *
 * @param {string} selector - selector of the image element.
 * @returns {string} the file name the image was downloaded to.
 * @throws {Error} when the element's markup contains no src attribute.
 */
casper.captchaget = function (selector) {
  var markup = this.getHTML(selector, true);
  var srcMatch = markup && markup.match(/src="(.*?)"/);
  if (!srcMatch) {
    // Fail with a message that names the selector instead of an opaque
    // "cannot read property of null" TypeError.
    throw new Error('captchaget: no src attribute found for selector "' + selector + '"');
  }

  var Loc = srcMatch[1];
  var Ilocation = 'https://perfectmoney.is' + Loc;

  // Without a query part split() yields a single element and the file would
  // be named "undefined.jpg"; use a fixed base name in that case.
  var query = Loc.split('?')[1];
  var string = (query || 'captcha') + ".jpg";

  this.download(Ilocation, string);
  return string;
}
and how I am trying to use it
// Schedule a step that calls the custom captchaget method on the casper
// instance (`this` inside the step callback).
casper.then(function(){
this.captchaget('img#cpt_img');//this.casperstep(0.29, "username", "3245324");
});
I tried the above to test out using casper extension.
What you want is to add your own method to the casper object instance; see http://casperjs.readthedocs.org/en/latest/extending.html
so :
casper.casperstep = function (amount, user, location) {
{your instructions....}
}
Then call it :
// Start casper, then schedule a step that calls the custom casperstep method
// when the `ac1` selector exists.
casper.start();
casper.then(function(){
if(casper.exists(ac1)){
casper.casperstep(0.29, "username", "3245324");//this.casperstep(0.29, "username", "3245324");
}
})
// NOTE(review): `test` is not defined in this snippet; presumably the script
// is run under the CasperJS test runner - confirm.
.run(function() {
test.done();
});
Old-monkey patching :)
To see other ways to do it : Custom casperjs modules