I am pretty new to web scraping, though I already have solid knowledge of PHP / HTML / CSS.
After reading a few tutorials and a lot of trial and error, I finally managed to scrape my first results as a test.
I use Cheerio + Node.js, and here is the code of my test:
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');

request('http://www.passion-de-vin.com/contact/', function (error, response, html) {
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(html);
        var parsedResults = [];

        $('.form-headline').filter(function (i, element) {
            var a = $(this).children();
            var titre = a.first().text();

            // Strip newlines and collapse double spaces
            var release2 = titre.replace(/(\r?\n)/g, '');
            var release = release2.replace(/\s\s/g, '');
            titre = release;

            // Our parsed metadata object
            var metadata = {
                titre,
            };

            // Push metadata into the parsedResults array
            parsedResults.push(metadata);

            fs.writeFile('output.json', JSON.stringify(parsedResults, null, 4), function (err) {
                console.log('File successfully written! - Check your project directory for the output.json file');
            });
        });

        // Log our finished parse results in the terminal
        console.log(parsedResults);
    }
});
I have the result logged in my JSON file.
Now I would like to know and understand how I can send information to that form, submit it, and see or retrieve the result of the post.
So far, everything I have read about this has been unclear to me.
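For what it's worth, the request module can submit a form with its form option (URL-encoded POST body). A minimal sketch, assuming the form posts back to the same contact URL; the field names nom, email and message are placeholders that would have to be replaced with the actual name attributes (and any hidden/CSRF fields) found in the form's HTML:
var request = require('request');
var cheerio = require('cheerio');

request.post({
    url: 'http://www.passion-de-vin.com/contact/', // assumption: the form posts to its own URL
    form: {
        nom: 'Jean Dupont',        // hypothetical field names -- check the form's <input name="..."> attributes
        email: 'jean@example.com',
        message: 'Bonjour !'
    }
}, function (error, response, body) {
    if (error) return console.error(error);
    console.log('Status:', response.statusCode);
    // Load the response page to inspect the result of the submission
    var $ = cheerio.load(body);
    console.log($('body').text().trim().slice(0, 200));
});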
So I am trying my hand at Node.js. I want to build a simple crawler which scans a page and then returns all the links in a JSON file. However, when I run the script, it returns 0 links.
Here is my code in its entirety:
var request = require('request');
var cheerio = require('cheerio');
var fs = require("fs");

var url = 'https://stackoverflow.com/questions';

//Create the blank array to fill:
var obj = {
    table: []
};

var i = 0;

request(url, function(err, resp, body){
    $ = cheerio.load(body);
    links = $('a'); //jquery get all hyperlinks
    $(links).each(function(i, link){
        var actualLink = $(link).attr('href');
        obj.table.push({id: i, url: actualLink}); //add some data
        i++;
    });
});

var json = JSON.stringify(obj);
console.log(json);
The output in the terminal is:
$ !!
node nodetest.js
{"table":[]}
Can anyone see why this is blank? Bonus points for writing the final json to a file :)
You must use obj inside the request's success callback; that's where it gets populated:
request(url, function(err, resp, body) {
    $ = cheerio.load(body);
    links = $('a'); //jquery get all hyperlinks
    $(links).each(function(i, link) {
        var actualLink = $(link).attr('href');
        obj.table.push({id: i, url: actualLink}); //add some data
    });
    // Only here can you be sure that the "obj" variable is properly
    // populated, because this is where the HTTP request completes
    var json = JSON.stringify(obj);
    console.log(json);
});
In your code, you have placed the console.log outside the request callback. The request is asynchronous, so at that point the obj variable has not been populated yet.
Also notice that you don't need the outer i variable: it is passed to the each callback automatically, so there is no need to declare or increment it yourself.
As far as writing the result to a file is concerned, you could use the fs.writeFile function:
fs.writeFile("/tmp/test", json, function(err) {
if(!err) {
console.log("File successfully saved");
}
});
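Putting it together, a minimal sketch of the whole script with both the logging and the file write inside the request callback (output.json is just an example path):
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');

var url = 'https://stackoverflow.com/questions';
var obj = { table: [] };

request(url, function(err, resp, body) {
    if (err) return console.error(err);
    var $ = cheerio.load(body);
    $('a').each(function(i, link) {
        obj.table.push({id: i, url: $(link).attr('href')}); // add some data
    });
    var json = JSON.stringify(obj, null, 2);
    console.log(json);
    fs.writeFile('output.json', json, function(err) {
        if (!err) {
            console.log("File successfully saved");
        }
    });
});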
I am trying to do web scraping and I would like to display the data in JSON format.
My task is to extract each post from the website and display its relevant data in JSON format. My issue is that I cannot seem to target each row and then target each id. I can hard-code the id in my code, but I would like the program to search for the id and console.log the data of each id in the row.
Example: I want to get the title for the first post by id.
I hope I am making sense.
The website I am trying to extract data from: https://news.ycombinator.com/
My code:
var express = require('express');
var path = require('path');
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');

var app = express();
var port = 8080;
var url = "https://news.ycombinator.com/";

request(url, function(err, resp, body){
    var $ = cheerio.load(body);
    var title = $('tr');
    var uri;
    var author;
    var points;
    var comments;
    var rank;
    var posts = {
        postTitle: title,
        postUri: uri,
        postAuthor: author,
        postPoints: points,
        postComments: comments,
        postRank: rank
    };
    console.log(posts);
});

app.listen(port);
console.log('server is listening on ' + port);
The trick with Hacker News is that three tr elements make up one post row. That's why each element of rows gets three consecutive tr elements. Inside rows.map, each item is one post, and you can access its attributes row by row.
let cheerio = require('cheerio');
let request = require('request');

const url = "https://news.ycombinator.com/";

request(url, function(err, resp, body){
    let $ = cheerio.load(body);
    const tr = $('.itemlist > tr');
    let rows = Array((tr.length - 2) / 3); //the last two are the More button
    for (var i = 0; i < (tr.length - 2) / 3; ++i){
        rows[i] = tr.slice(3 * i, 3 * (i + 1));
    }
    const res = rows.map(function(item, index) {
        return {
            postTitle: $(item).find('.storylink').text(),
            postUri: $(item).find('.storylink').attr('href'),
            postComments: $(item).find('a+ a').text(),
        };
    });
    console.log(res);
});
Which gives you:
[ { postTitle: 'CockroachDB beta-20161013',
postUri: 'https://jepsen.io/analyses/cockroachdb-beta-20161013',
postComments: '10 comments' },
{ postTitle: 'Attacking the Windows Nvidia Driver',
postUri: 'https://googleprojectzero.blogspot.com/2017/02/attacking-windows-nvidia-driver.html',
postComments: '7 comments' },
{ postTitle: 'DuckDuckGo Donates $300K to Raise the Standard of Trust Online',
postUri: 'https://spreadprivacy.com/2017-donations-d6e4e4230b88#.kazx95v27',
postComments: '25 comments' },
... ]
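Since the question already sets up express, here is a hedged sketch of how the same scraping logic could be exposed as a JSON endpoint (the /posts route name is just an example):
var express = require('express');
var request = require('request');
var cheerio = require('cheerio');

var app = express();
var port = 8080;
var url = "https://news.ycombinator.com/";

app.get('/posts', function(req, res) {
    request(url, function(err, resp, body) {
        if (err) return res.status(500).send(err.message);
        var $ = cheerio.load(body);
        var tr = $('.itemlist > tr');
        var posts = [];
        // every post spans three consecutive tr elements; the last two rows are the "More" button
        for (var i = 0; i < (tr.length - 2) / 3; ++i) {
            var item = tr.slice(3 * i, 3 * (i + 1));
            posts.push({
                postTitle: $(item).find('.storylink').text(),
                postUri: $(item).find('.storylink').attr('href'),
                postComments: $(item).find('a+ a').text()
            });
        }
        res.json(posts); // express serializes the array as JSON
    });
});

app.listen(port);
console.log('server is listening on ' + port);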
I'm trying to write an XPath statement to fetch the contents of each row in a table, but only when the 2nd column of each row is not set to "TBA".
The page I am working from is this page. I am new to using XPath.
I've come up with the following statement, which I've managed to test successfully (or appears successful anyway) with an online XPath tester, but have been unable to figure out how to apply it in node.js:
//*[@id="body_column_left"]/div[4]/table/tbody/tr/[not(contains(./td[2], 'TBA'))]
This is my attempt below. I've tried variations, but I can't get it to even validate as valid XPath, and as a result I've been lost in not-very-helpful stack traces:
var fs = require('fs');
var xpath = require('xpath');
var parse5 = require('parse5');
var xmlser = require('xmlserializer');
var dom = require('xmldom').DOMParser;
var request = require('request');

var getHTML = function (url, callback) {
    request(url, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            return callback(body); // return the HTML
        }
    });
};

getHTML("http://au.cybergamer.com/pc/csgo/ladder/scheduled/", function (html) {
    var parser = new parse5.Parser();
    var document = parser.parse(html.toString());
    var xhtml = xmlser.serializeToString(document);
    var doc = new dom().parseFromString(xhtml);
    var select = xpath.useNamespaces({"x": "http://www.w3.org/1999/xhtml"});
    var nodes = select("//x:*[@id=\"body_column_left\"]/div[4]/table/tbody/tr/[not(contains(./td[2], 'TBA'))]", doc);
    console.log(nodes);
});
Any help would be appreciated!
I ended up solving this issue using cheerio instead of XPath:
See below:
var $ = cheerio.load(html);
$('.s_grad br').replaceWith("\n");
$('.s_grad thead').remove();
$('.s_grad tr').each(function(i, elem) {
    rows[i] = $(this).text();
    rows[i] = rows[i].replace(/^\s*[\r\n]/gm, ""); // remove empty newlines
    matches.push(new match($(this).find('a').attr('href').substring(7).slice(0, -1))); // create matches
});
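For the original requirement of skipping rows whose second column is "TBA", a cheerio filter along these lines should do it; the .s_grad selector is taken from the snippet above, the rest is an untested sketch:
var cheerio = require('cheerio');

function scheduledRows(html) {
    var $ = cheerio.load(html);
    var rows = [];
    $('.s_grad tr').each(function(i, elem) {
        var secondCell = $(this).find('td').eq(1).text().trim();
        if (secondCell !== 'TBA') { // keep only rows whose 2nd column is not "TBA"
            rows.push($(this).text().trim());
        }
    });
    return rows;
}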
How about using xpath-html? I loved its simplicity.
const xpath = require("xpath-html");

const nodes = xpath
    .fromPageSource(html)
    .findElements("//img[starts-with(@src, 'https://cloud.shopback.com')]");
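As a side note on the original attempt: the stray slash before the predicate is what makes the expression invalid, and because the document is re-serialized into the XHTML namespace, every element step needs the x: prefix. An untested sketch, reusing select and doc from the question's code:
// predicate attached directly to tr (no slash in between), all steps prefixed
// with the XHTML namespace alias; whether div[4] still matches the page is untested
var nodes = select(
    "//x:*[@id='body_column_left']/x:div[4]/x:table/x:tbody/x:tr[not(contains(x:td[2], 'TBA'))]",
    doc
);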
OK, so for fun I decided to scrape all the users from my college who are signed up on the Moodle website.
This is the program I made with Node.js and cheerio that scrapes the site, but I cannot seem to get the text inside the H2 tag.
This is the website I am scraping from: http://moodle.ramapo.edu/user/profile.php?id=2101
All I need to do is change the ID number to loop through every student.
var request = require('request'),
    cheerio = require('cheerio');

urls = [];

//For just a single page; eventually will loop through each page.
request('http://moodle.ramapo.edu/user/profile.php?id=2101', function(err, resp, body){
    if (!err && resp.statusCode == 200) {
        var $ = cheerio.load(body);
        $('h2.main', '#yui_3_9_1_2_1410303448188_167').each(function(){
            //Not sure how to retrieve just the text name of the person
        });
        console.log(urls);
    }
});
How do I just select the text inside the H2 tag so that I can log all of them to my console?
That's not the way I'd go about it. Below is a code snippet that should help you out; all you'll need to do is wrap it in a loop and iterate through the URLs you want to scrape. I'd also suggest you check out this tutorial: Scraping the Web With Node.js.
var express = require('express');
var request = require('request');
var cheerio = require('cheerio');

var app = express();

app.get('/scrape', function(req, res){
    url = 'http://moodle.ramapo.edu/user/profile.php?id=2101';
    request(url, function(error, response, html){
        if (!error){
            var $ = cheerio.load(html);
            var name;
            $('.main').filter(function(){
                var data = $(this);
                name = data.text();
                console.log("name = " + name);
            });
        }
        res.send('Check your console!');
    });
});

app.listen('8081');
exports = module.exports = app;
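If all you need is the name text itself, cheerio can read it straight off the h2 without the filter; a minimal sketch inside the same request callback:
// inside the request callback, after cheerio.load(html)
var name = $('h2.main').first().text().trim();
console.log(name);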
I had a wild idea that I could build a blog website for an unsophisticated friend, backed by Google Drive documents. I was able to create a content service that compiles a list of documents. However, I can't see a way to convert a document to HTML. I know that Google can render documents in a web page, so I wondered if it was possible to get a rendered version for use in my content service.
Is this possible?
You can try this code:
function getGoogleDocumentAsHTML(){
    var id = DocumentApp.getActiveDocument().getId();
    var forDriveScope = DriveApp.getStorageUsed(); //needed to get the Drive scope requested
    var url = "https://docs.google.com/feeds/download/documents/export/Export?id=" + id + "&exportFormat=html";
    var param = {
        method: "get",
        headers: {"Authorization": "Bearer " + ScriptApp.getOAuthToken()},
        muteHttpExceptions: true,
    };
    var html = UrlFetchApp.fetch(url, param).getContentText();
    Logger.log(html);
}
Node.js Solution
Using the Google APIs Node.js Client
Here's how you can get a Google Doc as HTML using Google Drive's Node.js client library.
// import googleapis npm package
var google = require('googleapis');

// variables
var fileId = '<google drive doc file id>',
    accessToken = '<oauth access token>';

// oauth setup
var OAuth2 = google.auth.OAuth2,
    OAuth2Client = new OAuth2();

// set oauth credentials
OAuth2Client.setCredentials({access_token: accessToken});

// google drive setup
var drive = google.drive({version: 'v3', auth: OAuth2Client});

// download file as text/html
var buffers = [];
drive.files.export(
    {
        fileId: fileId,
        mimeType: 'text/html'
    }
)
    .on('error', function(err) {
        // handle error
    })
    .on('data', function(data) {
        buffers.push(data); // data is a buffer
    })
    .on('end', function() {
        var buffer = Buffer.concat(buffers),
            googleDocAsHtml = buffer.toString();
        console.log(googleDocAsHtml);
    });
Take a look at the Google Drive V3 download docs for more languages and options.
Google Docs currently has a built-in way to do this.
Just download the document as a zipped web page (.html) and you get a zip archive with the HTML and images (if any were inserted).
I know this is not a code-based solution, but it works :)
There is no direct method in GAS to get an HTML version of a doc, and this is quite an old enhancement request, but the workaround originally described by Henrique Abreu works pretty well; I use it all the time...
The only annoying thing is the authorization process, which needs to be triggered from the script editor and makes it awkward to use in a shared application (with "script-unable" users), but this only happens once ;).
There is also a library created by Romain Vialard that makes things (a bit) easier... and adds a few other interesting functions.
Here is a little snippet for the new version of Google OAuth, following the idea posted by Henrique:
function exportAsHTML(){
    var forDriveScope = DriveApp.getStorageUsed(); //needed to get the Drive scope requested
    var docID = DocumentApp.getActiveDocument().getId();
    var url = "https://docs.google.com/feeds/download/documents/export/Export?id=" + docID + "&exportFormat=html";
    var param = {
        method: "get",
        headers: {"Authorization": "Bearer " + ScriptApp.getOAuthToken()},
        muteHttpExceptions: true,
    };
    var html = UrlFetchApp.fetch(url, param).getContentText();
    return html;
}
and then use the usual MailApp:
function mailer(){
    var docbody = exportAsHTML();
    MailApp.sendEmail({
        to: "email@mail.com",
        subject: "document emailer",
        htmlBody: docbody
    });
}
Hope the new workaround helps
JD
You may use the solution here
/**
 * Converts a file to HTML. The Advanced Drive service must be enabled to use
 * this function.
 */
function convertToHtml(fileId) {
    var file = Drive.Files.get(fileId);
    var htmlExportLink = file.exportLinks['text/html'];
    if (!htmlExportLink) {
        throw 'File cannot be converted to HTML.';
    }
    var oAuthToken = ScriptApp.getOAuthToken();
    var response = UrlFetchApp.fetch(htmlExportLink, {
        headers: {
            'Authorization': 'Bearer ' + oAuthToken
        },
        muteHttpExceptions: true
    });
    if (response.getResponseCode() != 200) {
        throw 'Error converting to HTML: ' + response.getContentText();
    }
    return response.getContentText();
}
Pass the id of the Google Doc as fileId; to enable the Advanced Drive service, follow the instructions here.
I've had this problem as well. The HTML that the Document HTML Export spits out is really ugly, so this was my solution:
/**
 * Takes in a Google Doc ID, gets that doc in HTML format, cleans up the markup, and returns the resulting HTML string.
 *
 * @param {string} id the id of the google doc
 * @param {boolean} [useCaching] enable or disable caching. default true.
 * @return {string} the doc's body in html format
 */
function getContent(id, useCaching) {
    if (!id) {
        throw "Please call this API with a valid Google Doc ID";
    }
    if (useCaching == null) {
        useCaching = true;
    }
    if (typeof useCaching != "boolean") {
        throw "If you're going to specify useCaching, it must be boolean.";
    }
    var cache = CacheService.getScriptCache();
    var cached = cache.get(id); // see if we have a cached version of our parsed html
    if (cached && useCaching) {
        var html = cached;
        Logger.log("Pulling doc html from cache...");
    } else {
        Logger.log("Grabbing and parsing fresh html from the doc...");
        try {
            var doc = DriveApp.getFileById(id);
        } catch (err) {
            throw "Please call this API with a valid Google Doc ID. " + err.message;
        }
        var docName = doc.getName();
        var forDriveScope = DriveApp.getStorageUsed(); // needed to get the Drive scope requested in ScriptApp.getOAuthToken();
        var url = "https://docs.google.com/feeds/download/documents/export/Export?id=" + id + "&exportFormat=html";
        var param = {
            method: "get",
            headers: {"Authorization": "Bearer " + ScriptApp.getOAuthToken()},
            muteHttpExceptions: true,
        };
        var html = UrlFetchApp.fetch(url, param).getContentText();
        // nuke the whole head section, including the stylesheet and meta tag
        html = html.replace(/<head>.*<\/head>/, '');
        // remove almost all html attributes
        html = html.replace(/ (id|class|style|start|colspan|rowspan)="[^"]*"/g, '');
        // remove all of the spans, as well as the outer html and body
        html = html.replace(/<(span|\/span|body|\/body|html|\/html)>/g, '');
        // clearly the superior way of denoting line breaks
        html = html.replace(/<br>/g, '<br />');
        cache.put(id, html, 900); // cache doc contents for 15 minutes, in case we get a lot of requests
    }
    Logger.log(html);
    return html;
}
https://gist.github.com/leoherzog/cc229d14a89e6327336177bb07ac2980
Perhaps this would work for you...
function doGet() {
    var blob = DriveApp.getFileById('myFileId').getAsHTML();
    return HtmlService.createHtmlOutput(blob);
}