Node .JS Crawler to JSON output is empty - json

So I am trying my hand at Node.js. I want to build a simple crawler which scans a page and then returns all links back in a json file. However, when I run the script it returns 0 links.
Here is my code in its entirety:
var request = require('request');
var cheerio = require('cheerio');
var fs = require("fs");
var url = 'https://stackoverflow.com/questions';
//Create the blank array to fill:
var obj = {
table: []
};
var i = 0;
request(url, function(err, resp, body){
$ = cheerio.load(body);
links = $('a'); //jquery get all hyperlinks
$(links).each(function(i, link){
var actualLink = $(link).attr('href');
obj.table.push({id: i, url:actualLink}); //add some data
i++;
});
});
var json = JSON.stringify(obj);
console.log(json);
The output in the terminal is so:
$ !!
node nodetest.js
{"table":[]}
Can anyone see why this is blank? Bonus points for writing the final json to a file :)

You must use obj inside the success callback of the request, that's where it gets populated:
request(url, function(err, resp, body) {
$ = cheerio.load(body);
links = $('a'); //jquery get all hyperlinks
$(links).each(function(i, link) {
var actualLink = $(link).attr('href');
obj.table.push({id: i, url:actualLink}); //add some data
});
// Only here you can be sure that the "obj" variable is properly
// populated because that's where the HTTP request completes
var json = JSON.stringify(obj);
console.log(json);
});
In your code you have placed the console.log outside the request success which is asynchronous and thus the obj variable is not yet populated.
Also notice that you don't need the i variable. It will be passed to the each callback automatically, you don't need to be explicitly declaring or incrementing it.
As far as writing the result to a file is concerned, you could use the fs.writeFile function:
fs.writeFile("/tmp/test", json, function(err) {
if(!err) {
console.log("File successfully saved");
}
});

Related

Copy data from download URL using Google Script

I'm new to App scripts and need help with copying the data to spreadsheet from URL.
However, URL is not a website but link which after clicking with directly download csv file into the computer. Also, its not ending with .csv as I have seen in other examples here.
URL basically coming to my inbox at a specific time. I'm trying to use Fetch URL but its not working at all.
Sample URL -
https://docs.google.com/spreadsheets/d/1oPUPPUmy7psliSznUItT0DnHvilXwZHzyrmdyHpHi18/export?format=csv
function ABC () {
const searchQuery = 'XYZ';
const threads = GmailApp.search(searchQuery, 0,1);
const urls = [];
threads.forEach(thread => {
const messages = thread.getMessages();
messages.forEach(message => {
const body = message.getBody();
var re = /\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'"".,<>?«»“”‘’]))/i;
const match = body.match(re);
if (match) { urls.push(match[1]); }
});
}) ;
Logger.log(urls);
url = urls.toString().replace("[","").replace("]","") ;
Logger.log(url);
function getData() {
var attValue = '';
// making a call to the target website
var response = UrlFetchApp.fetch(url);
//logging response from target website - In Script Editor > View > Logs
Logger.log(response.getContentText());
//parsing the response data from website
//https://developers.google.com/apps-script/reference/url-fetch/http-response
var rawData = response.getContentText();
var spreadsheet = SpreadsheetApp.getActiveSpreadsheet();
var sheet = SpreadsheetApp.setActiveSheet(spreadsheet.getSheets()[1]);
var cell = sheet.getRange(1, 1);
cell.setValue(rawData);
}
};
Kindly help so that I can copy the data directly into spreadsheet or store the file in Google Drive with filename as combination of text and date.
Thanks
SUGGESTION
You can try the tweaked script below.
In my understanding, here is your goal:
Get your email messages that contain URLs (CSV file) via "XYZ" search terms.
Process the URL using URLFetchApp service
Place the CSV data into your second sheet tab.
Note: If there's anything else missing or something may have been misunderstood, feel free to let me know.
Tweaked Script
function ABC() {
/**TWEAKED: Created a function call method called "getData" */
const url = {
getData: function () {
const searchQuery = 'XYZ';
const threads = GmailApp.search(searchQuery, 0, 1);
const urls = [];
threads.forEach(thread => {
const messages = thread.getMessages();
messages.forEach(message => {
const body = message.getBody();
var re = /\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'"".,<>?«»“”‘’]))/i;
const match = body.match(re);
if (match) { urls.push(match[1]); }
});
});
Logger.log(urls);
/**TWEAKED: Instead of using the redundant replace method,
* used "regex" inside a single replace method to replace
* all [ and ] characters */
var geturl = urls.toString().replace(/\[|]/gm, "");
console.log(geturl)
return geturl;
}
}
var attValue = '';
/**TWEAKED: Call the "url" variable's "getData" function that will return the URL */
var response = UrlFetchApp.fetch(url.getData.call());
//logging response from target website - In Script Editor > View > Logs
Logger.log(response.getContentText());
//parsing the response data from website
//https://developers.google.com/apps-script/reference/url-fetch/http-response
var rawData = response.getContentText();
var spreadsheet = SpreadsheetApp.getActiveSpreadsheet();
var sheet = SpreadsheetApp.setActiveSheet(spreadsheet.getSheets()[1]);
var cell = sheet.getRange(1, 1);
cell.setValue(rawData);
};
Demonstration
After running the ABC() function on the Apps Script editor, the second sheet tab gets populated with the CSV data:
The Apps Script execution log view
References:
JavaScript Function call()

Data Studio Connector getData() not running

I can't seem to get the getData() function to run on this connector I'm building. Data studio displays my Schema properly, however when I go to 'explore' the data, an error is thrown. Looking in the project executions, the 'getData' function never runs at all.
Data Studio has encountered a system error.
Sorry, we encountered an error and were unable to complete your request.
There's no debug errors shown, and I'm not sure how to continue debugging this.
Here is my code...
var cc = DataStudioApp.createCommunityConnector();
function isAdminUser(){
return true
}
function responseToRows(requestedFields, response){
return response.map(function(item) {
var row = [];
requestedFields.asArray().forEach(function(field){
var id = field.getId()
row.push(item[id])
});
console.log(row);
return { values: row };
});
}
function getAuthType() {
var response = { type: 'NONE' };
return response;
}
function getConfig(){
var json = UrlFetchApp.fetch("<api-url>");
var data = JSON.parse(json);
var config = cc.getConfig();
var tables = data.TableNames
var configElement = config
.newSelectSingle()
.setId('tables')
.setName("Choose your data source")
.setHelpText('Choose your data source');
for(i=0;i<tables.length;i++){
configElement
.addOption(config.newOptionBuilder().setLabel(tables[i]).setValue(tables[i]))
}
return config.build();
}
function getSchema(request){
var fields = cc.getFields();
var types = cc.FieldType;
var table = request.configParams.tables;
var data = UrlFetchApp.fetch("<api-url>"+"?name="+table);
var itemArray = JSON.parse(data);
var singleRow = itemArray["Items"][0];
var keys = Object.keys(singleRow)
for(i=0;i<keys.length;i++){
var nestedKeys = Object.keys(singleRow[keys[i]])
var propName = keys[i];
var dataType = nestedKeys[0]
if(dataType == "S"){
fields.newDimension()
.setId(propName)
.setName(propName)
.setType(types.TEXT)
}else if (dataType == "N"){
fields.newMetric()
.setId(propName)
.setName(propName)
.setType(types.NUMBER)
}
}
console.log(fields.build());
console.log('get schema')
return { schema: fields.build() };
}
function getData(request){
var fields = cc.getFields();
console.log(fields);
console.log('getdata running');
// TODO: Create Schema for requested field
var table = request.configParams.tables;
var requestedFieldIds = request.fields.map(function(field) {
return field.name
});
var requestedFields = fields.forIds(requestedFieldIds);
// TODO: Fetch and Parse data from API
var response = UrlFetchApp.fetch("<api-url>"+"?name="+table);
var parsedResponse = JSON.parse(response)
// TODO: Transform parsed data and filter for requested fields
var rows = responseToRows(requestedFields, parsedResponse)
return {
schema: requestedFields.build(),
rows: rows
}
}
To see debug traces, you could simply log it with console.log() and take a look at your logs in the Google Apps Scripts dashboard :
https://script.google.com/home/executions
I don't know if this is related to your problem, but in my case I was trying to use URL Parameters and getData(request) wouldn't run no matter what values I input - it ended up being that I had to create a production deployment and Publish > Deploy from Manifest and then create an actual published version (not just FROM HEAD).

Node.js + Cheerio Scraping : How to post in a contact form?

I am pretty new to web scraping techniques though I already have solid knowledge in terms of PHP / HTML / CSS.
After reading a few tutorials and a lot of tries, I finally managed to scrape my first results as a test.
I use Cheerio + Node.js, and here was the code of my test:
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
request('http://www.passion-de-vin.com/contact/', function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
var parsedResults = [];
$('.form-headline').filter(function(i, element) {
var a = $(this).children();
var titre = a.first().text();
release2 = titre.replace(/(\r?\n)/g, '');
release = release2.replace(/\s\s/g, '');
titre = titre;
// Our parsed meta data object
var metadata = {
titre,
};
// Push meta-data into parsedResults array
parsedResults.push(metadata);
fs.writeFile('output.json', JSON.stringify(parsedResults, null, 4), function(err){
console.log('File successfully written! - Check your project directory for the output.json file');})
});
// Log our finished parse results in the terminal
console.log(parsedResults);
}
});
I have the result log in my JSON file.
Now I would like to know and understand how I can transmit information to that form, post a result and see or get the result of the post.
So far, all I have read has been unclear to me

Parsing HTML/XML with XPath in node.js

I'm trying to write an XPath statement to fetch the contents of each row in a table, but only when the 2nd column of each row is not set to "TBA".
The page I am working off this page. I am new to using XPath.
I've come up with the following statement, which I've managed to test successfully (or appears successful anyway) with an online XPath tester, but have been unable to figure out how to apply it in node.js:
//*[#id="body_column_left"]/div[4]/table/tbody/tr/[not(contains(./td[2], 'TBA'))]
This is my attempt below, I've tried variations but I can't get it to even validate as a valid XPath statement and as a result I've been lost in not very helpful stack traces:
var fs = require('fs');
var xpath = require('xpath');
var parse5 = require('parse5');
var xmlser = require('xmlserializer');
var dom = require('xmldom').DOMParser;
var request = require('request');
var getHTML = function (url, callback) {
request(url, function (error, response, body) {
if (!error && response.statusCode == 200) {
return callback(body) // return the HTML
}
})
}
getHTML("http://au.cybergamer.com/pc/csgo/ladder/scheduled/", function (html) {
var parser = new parse5.Parser();
var document = parser.parse(html.toString());
var xhtml = xmlser.serializeToString(document);
var doc = new dom().parseFromString(xhtml);
var select = xpath.useNamespaces({"x": "http://www.w3.org/1999/xhtml"});
var nodes = select("//x:*[#id=\"body_column_left\"]/div[4]/table/tbody/tr/[not(contains(./td[2], 'TBA'))]", doc);
console.log(nodes);
});
Any help would be appreciated!
I ended up solving this issue using cheerioinstead of xpath:
See below:
var $ = cheerio.load(html);
$('.s_grad br').replaceWith("\n");
$('.s_grad thead').remove();
$('.s_grad tr').each(function(i, elem) {
rows[i] = $(this).text();
rows[i] = rows[i].replace(/^\s*[\r\n]/gm, ""); // remove empty newlines
matches.push(new match($(this).find('a').attr('href').substring(7).slice(0, -1))) // create matches
});
How about using this xpath-html, I loved its simplicity.
const xpath = require("xpath-html");
const nodes = xpath
.fromPageSource(html)
.findElements("//img[starts-with(#src, 'https://cloud.shopback.com')]");

Scraping Node.js: Getting text from H2 header

Ok so for fun I decided to scrape all the users who go to my college who are signed up on the website moodle.
This is the program I made with Node.js and cheerio that scrapes the site, but I can not seem to get the text that is inside the H2 tag.
This is the website I am scraping from, http://moodle.ramapo.edu/user/profile.php?id=2101
All I need to do is just change the ID number and it loops through every student.
var request = require('request'),
cheerio = require('cheerio');
urls = [];
//For just single page, eventually will loop through each page.
request('http://moodle.ramapo.edu/user/profile.php?id=2101', function(err, resp, body){
if (!err && resp.statusCode == 200) {
var $ = cheerio.load(body);
$('h2.main', '#yui_3_9_1_2_1410303448188_167').each(function(){
//Not sure how to retrieve just the text name of person
});
console.log(urls);
};
});
How do I just select the text inside the H2 tag so that I can log all of them to my console?
That's not the way I'd go about it. Below is a code snippet that should help you out, all you'll need to do is wrap it in a loop and iterate through the urls you want to scrape. I'd also suggest you check out this tutorial Scraping the Web With Node.js
var express = require('express');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
app.get('/scrape', function(req, res){
url = 'http://moodle.ramapo.edu/user/profile.php?id=2101';
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
var name;
$('.main').filter(function(){
var data = $(this);
name = data.text();
console.log("name = " + name);
})
}
res.send('Check your console!')
})
})
app.listen('8081')
exports = module.exports = app;