Parsing HTML/XML with XPath in node.js

I'm trying to write an XPath statement to fetch the contents of each row in a table, but only when the 2nd column of each row is not set to "TBA".
The page I am working off is this page. I am new to using XPath.
I've come up with the following statement, which I've managed to test successfully (or so it appears, anyway) with an online XPath tester, but I have been unable to figure out how to apply it in node.js:
//*[@id="body_column_left"]/div[4]/table/tbody/tr[not(contains(./td[2], 'TBA'))]
Below is my attempt. I've tried variations, but I can't get it to even validate as a valid XPath statement, and as a result I've been lost in not-very-helpful stack traces:
var fs = require('fs');
var xpath = require('xpath');
var parse5 = require('parse5');
var xmlser = require('xmlserializer');
var dom = require('xmldom').DOMParser;
var request = require('request');
var getHTML = function (url, callback) {
    request(url, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            return callback(body); // return the HTML
        }
    });
};
getHTML("http://au.cybergamer.com/pc/csgo/ladder/scheduled/", function (html) {
var parser = new parse5.Parser();
var document = parser.parse(html.toString());
var xhtml = xmlser.serializeToString(document);
var doc = new dom().parseFromString(xhtml);
var select = xpath.useNamespaces({"x": "http://www.w3.org/1999/xhtml"});
var nodes = select("//x:*[#id=\"body_column_left\"]/div[4]/table/tbody/tr/[not(contains(./td[2], 'TBA'))]", doc);
console.log(nodes);
});
Any help would be appreciated!
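
For anyone who lands here with the same stack: my best guess at the fix (untested against the live page) is the namespace. xmlserializer emits XHTML, so every element ends up in the http://www.w3.org/1999/xhtml namespace, and each step of the path then needs the x: prefix, not just the first one:
// Sketch, not a verified fix: prefix every element in the path with "x:".
var nodes = select(
    "//x:*[@id='body_column_left']/x:div[4]/x:table/x:tbody/x:tr" +
    "[not(contains(./x:td[2], 'TBA'))]",
    doc
);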

I ended up solving this issue using cheerio instead of xpath.
See below:
// rows and matches are arrays declared earlier in my script;
// match is my own constructor for holding a parsed match.
var $ = cheerio.load(html);
$('.s_grad br').replaceWith("\n");
$('.s_grad thead').remove();
$('.s_grad tr').each(function(i, elem) {
    rows[i] = $(this).text();
    rows[i] = rows[i].replace(/^\s*[\r\n]/gm, ""); // remove empty newlines
    matches.push(new match($(this).find('a').attr('href').substring(7).slice(0, -1))); // create matches
});

How about using xpath-html? I loved its simplicity.
const xpath = require("xpath-html");
const nodes = xpath
.fromPageSource(html)
.findElements("//img[starts-with(@src, 'https://cloud.shopback.com')]");
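
If you only need a single node, I believe the same package also exposes findElement and a getText() helper on the result, though treat the method names below as from memory of its README rather than verified:
// From memory of the xpath-html README; method names unverified.
const node = xpath.fromPageSource(html).findElement("//span[text()='Hello']");
console.log(node.getText());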

Related

Error handling - retry urlfetch on error until success

I've looked at all the relevant questions here (such as this), but still cannot make sense of this VERY simple task.
So, I'm trying to verify numbers using the NumVerify API. We're still on the free license on APILAYER, so we get the following error from time to time:
Request failed for https://apilayer.net returned code 500
I'd like to add a loop so that the script will try again until a proper response is received.
Here is a snippet based on several answers here:
function numverifylookup(mobilephone) {
    console.log("input number: ", mobilephone);
    var lookupUrl = "https://apilayer.net/api/validate?access_key=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX&number=" + mobilephone + "&country_code=IL";
    try {
        var response = UrlFetchApp.fetch(lookupUrl);
        if (response) { // check for truthy value
            var json = response.getContentText();
        } else {
            Utilities.sleep(2000);
            continue; // if "get" returned a falsy value then continue
        }
    } catch(e) {
        continue; // if error, continue looping
    }
    var data = JSON.parse(response);
Sadly, still not working due to the following error:
Continue must be inside loop. (line 10)
Any thoughts?
I think it's actually better to solve this using muteHttpExceptions, but I couldn't quite make it work.
Thanks!
I think I got this to work as below:
function numverify(mobilephone) {
    console.log("input number: ", mobilephone);
    var lookupUrl = "https://apilayer.net/api/validate?access_key=XXXXXXXXXXXX&number=" + mobilephone + "&country_code=IL";
    var i = 0;
    var trycount = 1;
    var errorcodes = "";
    while (i != 1) {
        var response = UrlFetchApp.fetch(lookupUrl, {muteHttpExceptions: true});
        var responsecode = response.getResponseCode();
        errorcodes = errorcodes + "," + responsecode;
        if (responsecode == 200) { // check for a successful response
            var json = response.getContentText();
            i = 1;
        } else {
            trycount = trycount + 1;
            Utilities.sleep(2000);
        }
    }
    var data = JSON.parse(response);
    var valid = data.valid;
    var localnum = data.local_format;
    var linetype = data.line_type;
    console.log(data, " ", valid, " ", localnum, " ", linetype, " number of tries= ", trycount, " responsecodes= ", errorcodes);
    var answer = [valid, localnum, linetype];
    return answer;
}
I'll circle back in case it still doesn't work.
Thanks for helping!
You cannot use continue to achieve what you want; instead, you can call the function again:
function numverifylookup(mobilephone) {
    console.log("input number: ", mobilephone);
    var lookupUrl = "https://apilayer.net/api/validate?access_key=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX&number=" + mobilephone + "&country_code=IL";
    try {
        var response = UrlFetchApp.fetch(lookupUrl);
        if (response) { // check for truthy value
            var json = response.getContentText();
        } else {
            Utilities.sleep(2000);
            numverifylookup(mobilephone);
        }
    } catch (e) {
        Utilities.sleep(2000);
        numverifylookup(mobilephone); // if error, rerun the function
    }
    var data = JSON.parse(response);
}
As you can see in the documentation, the continue statement can only be used inside loops, e.g. a for loop.
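
A loop-based variant in the same spirit, using the muteHttpExceptions idea from the question so non-200 responses don't throw; the retry cap is my own addition so a permanently failing URL can't spin forever:
function numverifylookup(mobilephone) {
    var lookupUrl = "https://apilayer.net/api/validate?access_key=XXXX&number=" + mobilephone + "&country_code=IL";
    for (var attempt = 1; attempt <= 5; attempt++) { // cap of 5 tries is arbitrary
        var response = UrlFetchApp.fetch(lookupUrl, {muteHttpExceptions: true});
        if (response.getResponseCode() == 200) {
            return JSON.parse(response.getContentText());
        }
        Utilities.sleep(2000); // wait before retrying
    }
    throw new Error("NumVerify lookup failed after 5 attempts");
}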

How to catch execution error in XMLService getAttribute method?

I'm using this script in a Google Sheets spreadsheet to parse a table in a web page and to store results:
var doc = XmlService.parse(result);
var html = doc.getRootElement();
var resulttable = getElementsByClassName(html, 'resulttable')[0];
var descendants = html.getDescendants();
descendants.push(html);
for (var i in descendants) {
    var elt = descendants[i].asElement(); // <== it crashes
    if (elt != null) {
        var test = elt.getAttributes();
        var test_bis = elt.getAttribute('http-equiv'); // <== it does not crash: 'http-equiv' exists
        var classes = elt.getAttribute('class'); // <== it crashes: 'class' does not exist
    }
}
As shown, I get some errors (reported simply as "server errors") on the marked lines of this code. I also added try...catch blocks, but they don't catch the errors: the script terminates abruptly.
How can I catch the errors so that the script can keep running despite some of these XML errors?
My expectation was to get undefined elements when the asElement or getAttribute methods fail.
To fetch the page I was using this approach:
var url = "https://albopretorio.comune.gravina.ba.it/fo/?ente=GravinaInPuglia";
var today = Utilities.formatDate(new Date(), "GMT+1", "dd/MM/yyyy");
var payload =
{
"tipoSubmit":"ricerca",
"enti":"GravinaInPuglia",
"uo":"",
"tipoatto":"",
"anno":"",
"numda":"",
"numa":"",
"annoatto":"",
"numatto":"",
"datada":"",
"dataa":"",
"pubblicatoda":today,
"pubblicatoa":"",
"presenteal":"",
"chiave":"",
"provenienza":"",
};
var options =
{
"method" : "POST",
"payload" : payload,
"followRedirects" : true,
"muteHttpExceptions": true
};
var result = UrlFetchApp.fetch(url, options);
Issue:
You don't know which ContentType each descendant is, so you don't know which method you should use to cast the node.
Solution:
For each descendant, check for the corresponding ContentType with the method getType(). You can use the returned value (the ContentType) and a switch statement to use one method or another.
Code snippet:
for (var i in descendants) {
    var contentType = descendants[i].getType();
    var elt;
    switch (contentType.toString()) {
        case 'TEXT':
            elt = descendants[i].asText();
            break;
        case 'ELEMENT':
            elt = descendants[i].asElement();
            break;
        // Add other possible ContentTypes, if necessary
    }
}
Update: Issue with getAttribute:
In order to avoid the script trying to retrieve an attribute that does not exist, you can retrieve an array of the attribute names for this element and then check whether your attribute is included in that array:
var attributes = elt.getAttributes().map(attribute => attribute.getName());
if (attributes.includes('class')) {
    var classes = elt.getAttribute('class');
}
Reference:
Enum ContentType
Interface Content: getType()
switch
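
Putting both checks together, the original loop could look something like this sketch (untested against the page in question; note that getAttribute returns an Attribute object, so getValue() is needed to get the string):
var descendants = html.getDescendants();
descendants.push(html);
for (var i in descendants) {
    // Only cast nodes that really are elements.
    if (descendants[i].getType().toString() !== 'ELEMENT') continue;
    var elt = descendants[i].asElement();
    // Only read attributes that actually exist on this element.
    var names = elt.getAttributes().map(function(a) { return a.getName(); });
    if (names.indexOf('class') !== -1) {
        var classes = elt.getAttribute('class').getValue();
    }
}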

Node.js + Cheerio Scraping : How to post in a contact form?

I am pretty new to web scraping techniques, though I already have solid knowledge of PHP / HTML / CSS.
After reading a few tutorials and a lot of tries, I finally managed to scrape my first results as a test.
I use Cheerio + Node.js, and here is the code of my test:
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
request('http://www.passion-de-vin.com/contact/', function (error, response, html) {
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(html);
        var parsedResults = [];
        $('.form-headline').filter(function(i, element) {
            var a = $(this).children();
            var titre = a.first().text();
            var release2 = titre.replace(/(\r?\n)/g, '');
            var release = release2.replace(/\s\s/g, '');
            // Our parsed meta data object
            var metadata = {
                titre,
            };
            // Push meta-data into parsedResults array
            parsedResults.push(metadata);
            fs.writeFile('output.json', JSON.stringify(parsedResults, null, 4), function(err) {
                console.log('File successfully written! - Check your project directory for the output.json file');
            });
        });
        // Log our finished parse results in the terminal
        console.log(parsedResults);
    }
});
I have the results logged in my JSON file.
Now I would like to know and understand how I can transmit information to that form, post a result, and see or get the result of the post.
So far, all I have read has been unclear to me.
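With the same request module used above, submitting a form generally means POSTing the form's fields to its action URL. A minimal, untested sketch; the field names below are hypothetical, so read the real ones from the name attributes in the form's HTML:
var request = require('request');

// Hypothetical field names; inspect the form's HTML for the real ones.
request.post({
    url: 'http://www.passion-de-vin.com/contact/',
    form: {
        name: 'Jane Doe',
        email: 'jane@example.com',
        message: 'Hello!'
    }
}, function (error, response, body) {
    if (!error) {
        // The server's reply to the submission; parse it with cheerio if needed.
        console.log(response.statusCode, body);
    }
});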

Node .JS Crawler to JSON output is empty

So I am trying my hand at Node.js. I want to build a simple crawler which scans a page and then returns all links back in a json file. However, when I run the script it returns 0 links.
Here is my code in its entirety:
var request = require('request');
var cheerio = require('cheerio');
var fs = require("fs");

var url = 'https://stackoverflow.com/questions';

//Create the blank array to fill:
var obj = {
    table: []
};
var i = 0;

request(url, function(err, resp, body){
    $ = cheerio.load(body);
    links = $('a'); //jquery get all hyperlinks
    $(links).each(function(i, link){
        var actualLink = $(link).attr('href');
        obj.table.push({id: i, url: actualLink}); //add some data
        i++;
    });
});
var json = JSON.stringify(obj);
console.log(json);
The output in the terminal looks like this:
$ !!
node nodetest.js
{"table":[]}
Can anyone see why this is blank? Bonus points for writing the final json to a file :)
You must use obj inside the success callback of the request; that's where it gets populated:
request(url, function(err, resp, body) {
    $ = cheerio.load(body);
    links = $('a'); //jquery get all hyperlinks
    $(links).each(function(i, link) {
        var actualLink = $(link).attr('href');
        obj.table.push({id: i, url: actualLink}); //add some data
    });
    // Only here you can be sure that the "obj" variable is properly
    // populated because that's where the HTTP request completes
    var json = JSON.stringify(obj);
    console.log(json);
});
In your code you have placed the console.log outside the request callback, which runs asynchronously, and thus the obj variable is not yet populated when the log executes.
Also notice that you don't need the outer i variable: it is passed to the each callback automatically, so you don't need to declare or increment it explicitly.
As far as writing the result to a file is concerned, you could use the fs.writeFile function:
fs.writeFile("/tmp/test", json, function(err) {
if(!err) {
console.log("File successfully saved");
}
});

How to target the first id in row

I am trying to do web scraping and I would like to display the data in JSON format.
My task is to extract each post from the website and display its relevant data in JSON format. My issue is that I cannot seem to target the row (tr) and then target each id. I can input the id in my code, but I would like the program to search for the id and console.log the data of each id in the row.
Example: I want to get the title of the first post by id.
I hope I am making sense.
The website I am trying to extract data from is Hacker News (https://news.ycombinator.com/).
My code:
var express = require('express');
var path = require('path');
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');

var app = express();
var port = 8080;
var url = "https://news.ycombinator.com/";

request(url, function(err, resp, body){
    var $ = cheerio.load(body);
    var title = $('tr');
    var uri;
    var author;
    var points;
    var comments;
    var rank;
    var posts = {
        postTitle: title,
        postUri: uri,
        postAuthor: author,
        postPoints: points,
        postComments: comments,
        postRank: rank
    };
    console.log(posts);
});

app.listen(port);
console.log('server is listening on ' + port);
The trick with Hacker News is that three tr elements make up one post row. That's why each element of rows gets three subsequent tr elements. Inside rows.map, each item is one row, and you can access its attributes row-wise.
let cheerio = require('cheerio');
let request = require('request');

const url = "https://news.ycombinator.com/";

request(url, function(err, resp, body){
    let $ = cheerio.load(body);
    const tr = $('.itemlist > tr');
    let rows = Array((tr.length - 2) / 3); // the last two are the "More" button
    for (var i = 0; i < (tr.length - 2) / 3; ++i){
        rows[i] = tr.slice(3 * i, 3 * (i + 1));
    }
    const res = rows.map(function(item, index) {
        return {
            postTitle: $(item).find('.storylink').text(),
            postUri: $(item).find('.storylink').attr('href'),
            postComments: $(item).find('a+ a').text(),
        };
    });
    console.log(res);
});
Which gives you:
[ { postTitle: 'CockroachDB beta-20161013',
postUri: 'https://jepsen.io/analyses/cockroachdb-beta-20161013',
postComments: '10 comments' },
{ postTitle: 'Attacking the Windows Nvidia Driver',
postUri: 'https://googleprojectzero.blogspot.com/2017/02/attacking-windows-nvidia-driver.html',
postComments: '7 comments' },
{ postTitle: 'DuckDuckGo Donates $300K to Raise the Standard of Trust Online',
postUri: 'https://spreadprivacy.com/2017-donations-d6e4e4230b88#.kazx95v27',
postComments: '25 comments' },
... ]
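
If you also want the author, points, and rank from the question, the same row-wise access should work. A sketch based on the class names Hacker News used at the time (.hnuser, .score, .rank); they may have changed since, so verify against the current markup:
const res = rows.map(function(item) {
    return {
        postTitle: $(item).find('.storylink').text(),
        postUri: $(item).find('.storylink').attr('href'),
        postAuthor: $(item).find('.hnuser').text(),  // e.g. "pg"
        postPoints: $(item).find('.score').text(),   // e.g. "42 points"
        postRank: $(item).find('.rank').text(),      // e.g. "1."
        postComments: $(item).find('a+ a').text(),
    };
});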