How do I recursively call multiple URLs using Puppeteer and Headless Chrome? - puppeteer

I am trying to write a program that scans multiple URLs at the same time (parallelization). I have extracted the sitemap and stored it as an array in a variable as shown below, but I am unable to open the URLs with Puppeteer. I am getting the error below:
originalMessage: 'Cannot navigate to invalid URL'
My code is below. Can someone please help me out?
const sitemapper = require('@mastixmc/sitemapper');
const SitemapXMLParser = require('sitemap-xml-parser');

const url = 'https://edition.cnn.com/sitemaps/sitemap-section.xml';

/* If a sitemapindex (a link to an xml or gz file) is written in the sitemap, that URL will be accessed.
   You can optionally specify the number of concurrent accesses and the number of milliseconds
   to delay before resuming processing. */
const options = {
    delay: 3000,
    limit: 50000
};

const sitemapXMLParser = new SitemapXMLParser(url, options);

sitemapXMLParser.fetch().then(result => {
    var locs = result.map(value => value.loc)
    var locsFiltered = locs.toString().replace("[",'<br>');
    const urls = locsFiltered
    console.log(locsFiltered)

    const puppeteer = require("puppeteer");

    async function scrapeProduct(url) {
        const urls = locsFiltered
        const browser = await puppeteer.launch({
            headless: false
        });
        for (i = 0; i < urls.length; i++) {
            const page = await browser.newPage();
            const url = urls[i];
            const promise = page.waitForNavigation({
                waitUntil: "networkidle2"
            });
            await page.goto(`${url}`);
        }
    };

    scrapeProduct();
});

You see "invalid URL" because you converted the array to a URL string with the wrong method. These lines are the problem:
// var locsFiltered = locs.toString().replace("[",'<br>') // This is wrong
// const urls = locsFiltered // So the value is invalid
// console.log(locsFiltered)
const urls = locs.map(value => value[0]) // This is better
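For context, each element of locs here is itself a single-element array (which is why the fix indexes value[0]), so toString() flattens everything into one comma-separated string that contains no "[" to replace. A hypothetical illustration of the two shapes (the URLs are placeholders):

// Hypothetical shape of the parsed locs, for illustration only
const locs = [['https://example.com/a.xml'], ['https://example.com/b.xml']]

locs.toString()             // 'https://example.com/a.xml,https://example.com/b.xml' -> one invalid "URL"
locs.map(value => value[0]) // ['https://example.com/a.xml', 'https://example.com/b.xml'] -> navigable URLs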
So to scrape the CNN sites, I've added puppeteer-cluster for speed:
const { Cluster } = require('puppeteer-cluster')
const sitemapper = require('@mastixmc/sitemapper')
const SitemapXMLParser = require('sitemap-xml-parser')

const url = 'https://edition.cnn.com/sitemaps/sitemap-section.xml'

async function scrapeProduct(locs) {
    const urls = locs.map(value => value[0])
    const cluster = await Cluster.launch({
        concurrency: Cluster.CONCURRENCY_CONTEXT,
        maxConcurrency: 2, // You can set this to any number you like
        puppeteerOptions: {
            headless: false,
            devtools: false,
            args: [],
        }
    })

    await cluster.task(async ({ page, data: url }) => {
        await page.goto(url, { timeout: 0, waitUntil: 'networkidle2' })
        const screen = await page.screenshot()
        // Store screenshot, do something else
    })

    for (let i = 0; i < urls.length; i++) {
        console.log(urls[i])
        await cluster.queue(urls[i])
    }

    await cluster.idle()
    await cluster.close()
}
/******
If a sitemapindex (a link to an xml or gz file) is written in the sitemap, that URL will be accessed.
You can optionally specify the number of concurrent accesses and the number of milliseconds
to delay before resuming processing.
*******/
const options = {
    delay: 3000,
    limit: 50000
}
const sitemapXMLParser = new SitemapXMLParser(url, options)

sitemapXMLParser.fetch().then(async result => {
    var locs = result.map(value => value.loc)
    await scrapeProduct(locs)
})
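A note on the settings: Cluster.CONCURRENCY_CONTEXT runs each queued URL in its own incognito context inside a single browser instance, so pages stay isolated from one another; raising maxConcurrency trades memory for throughput.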

Related

Puppeteer - webScraper

I'm having trouble replacing hyperlinks with regular text on a website I scraped with Puppeteer.
I'm able to get the HTML file, but when it renders, the stylesheet doesn't work and the hyperlinks lead to errors.
Can anybody help me with this, please?
const puppeteer = require('puppeteer');
const fs = require('fs')

const scrapeWikipedia = async () => {
    const browser = await puppeteer.launch({
        headless: false
        // defaultViewport: null
    });
    const page = await browser.newPage();

    const wikiUrl = 'https://en.wikipedia.org/wiki/Groundhog_Day_(film)'
    await page.goto(wikiUrl, { waitUntil: 'networkidle2' });

    // replace hyperlinks
    await page.evaluate(_ => {
        document.querySelectorAll('a[href^="javascript"]')
            .forEach(a => {
                a.href = '#'
            })
    });

    // get the HTML of the page and write it to a file
    const html = await page.content();
    fs.writeFileSync("./outputHTML/index.html", html);

    await browser.close();
}

scrapeWikipedia()
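One likely cause of the broken stylesheet and dead links is that the saved HTML keeps the page's relative URLs, which stop resolving once the file is opened from disk. A minimal sketch, assuming that is the problem here: inject a <base> element pointing at the live site in the same page.evaluate() call, before reading page.content().

await page.evaluate(_ => {
    // Make relative stylesheet/link URLs resolve against the live site
    const base = document.createElement('base');
    base.href = location.origin;
    document.head.prepend(base);

    // Neutralize javascript: links, as in the original script
    document.querySelectorAll('a[href^="javascript"]')
        .forEach(a => {
            a.href = '#'
        });
});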

Add Revit Levels and 2D Minimap to your 3D for local viewer

In our project we use a local viewer: we do not receive the data from a request, we have it locally. When the data comes from our local files, it is impossible to insert it into the panel, and the call doc.downloadAecModelData() returns null.
See more: https://forge.autodesk.com/blog/add-revit-levels-and-2d-minimap-your-3d
Autodesk.Viewing.Initializer(options, async () => {
    const config3d = {
        useConsolidation: true,
        useADP: false,
        extensions: [
            "Autodesk.Viewing.MarkupsCore",
            "MarkupExtension",
            "Autodesk.AEC.LevelsExtension",
            "Autodesk.AEC.Minimap3DExtension"
        ]
    };
    const init_div = document.getElementById("init_div");
    const svf_path = "/storage/" + decodeURIComponent(props.search.split("&&")[1]);
    const p = "/storage/" + props.search.split("&&")[1];
    const viewer = new Autodesk.Viewing.GuiViewer3D(init_div, config3d);

    Autodesk.Viewing.endpoint.getItemApi = (endpoint, derivativeUrn, api) => {
        return (
            "/storage/5bf49c73e2ac26b2756b642ae1b29037/aa1bb3d5baaa229e0c15e674adb7aceec1a0fb061e9c0288d9979f801fb6460d.svf/Resource" +
            decodeURIComponent(p.split("Resource")[1])
        );
    };

    const paths = svf_path.split("/");
    const [dest, svf_dir] = [paths[2], paths[3]];

    const url = `http://localhost:3000/api/viewer/dest/${dest}/svf/${svf_dir}/manifest`;
    const response = await fetch(url);
    const manifest = await response.json();

    const aec_url = `/api/viewer/dest/${dest}/svf/${svf_dir}/aec`;
    const response_aec = await fetch(aec_url);
    const aec = JSON.parse(await response_aec.json());
    // console.log(aec);

    const viewerDocument = new Autodesk.Viewing.Document(manifest);
    const a = await viewerDocument.downloadAecModelData();
    console.log(a);
    console.log(manifest);

    const viewable = viewerDocument.getRoot().getDefaultGeometry();
    viewable.data.aec_model_data = aec;
    // console.log(viewable);

    await Autodesk.Viewing.Initializer(options, () => {
        // const aec = viewable.parent.children[1].data; //.urn//.split("Resource")[1];
        // for resize and use viewer outer useEffect
        props.set_loc_viewer(viewer);
        viewer.start();
        viewer.loadDocumentNode(viewerDocument, viewable, options);
    });
});
As far as I know, you can only use doc.downloadAecModelData() after the document has loaded, like this:
onDocumentLoadSuccess(doc: Autodesk.Viewing.Document) { doc.downloadAecModelData(); }
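A minimal sketch of that flow using the standard Autodesk.Viewing.Document.load() callbacks; documentId is a placeholder URN:

// documentId is a placeholder for illustration
Autodesk.Viewing.Document.load(documentId, onDocumentLoadSuccess, onDocumentLoadFailure);

function onDocumentLoadSuccess(doc) {
    // AEC model data (levels, grids) only becomes available once the document has loaded
    doc.downloadAecModelData().then(aec => console.log(aec));
    const viewable = doc.getRoot().getDefaultGeometry();
    viewer.loadDocumentNode(doc, viewable);
}

function onDocumentLoadFailure(errorCode, errorMsg) {
    console.error('Document load failed:', errorCode, errorMsg);
}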

Creating a new tab in Puppeteer inside a loop causes a navigation timeout

Recently I have been learning Puppeteer from the docs and trying to scrape some information.
First approach
First I collect a list of URLs from the main page. Then I create a new tab and visit those URLs iteratively to collect some data. I suspect that when I enter the loop, the new tab doesn't work as I expect: it freezes without yielding any data, and eventually I get the error TimeoutError: Navigation timeout of 30000 ms exceeded. Is there a better approach?
(async () => {
    const browser = await puppeteer.launch({ headless: true });
    const mainpage = await browser.newPage();
    console.log('goto main page'.green);
    await mainpage.goto(mainURL);

    console.log('collecting some url'.green);
    const URLS = await mainpage.evaluate(() =>
        Array.from(
            document.querySelectorAll('.result-actions a'),
            (element) => element.href
        )
    );
    if (typeof URLS[0] === 'string') console.log('OK'.green);
    console.log('collecting finished'.green);

    const newTab = await browser.newPage();
    console.log('create new tab'.green);
    var data = [];
    for (let i = 0, n = URLS.length; i < n; i++) {
        //console.log(URLS[i]);
        // use this new tab to collect some data then close this tab
        // continue this process
        await newTab.waitForNavigation();
        await newTab.goto(URLS[i]);
        await newTab.waitForSelector('.profile-phone-column span a');
        console.log('Go each url using new tab'.green);
        // collecting data
        data.push(collected_data);
        // close this tab
        await collectNamePage.close();
        console.log(data);
    }
    await mainpage.close();
    await browser.close();
    console.log('closing browser'.green);
})();
Second approach
This time I want to skip the part where I collect the data using a new tab. Instead I collect my URLs using page.$$(), try iterating over them with for...of, and collect my data using elementHandle.$(selector), but this approach also failed.
I am getting frustrated. Am I doing it the wrong way, or did I misunderstand their documentation?
In your script, you do not need newTab.waitForNavigation() at all. Usually this is used when the navigation is caused by some event. When you just use .goto(), the page load is awaited automatically.
Even if you do need waitForNavigation(), you usually do not await it before the navigation is triggered, otherwise you just get the timeout. You await it together with the navigation trigger:
await Promise.all([element.click(), page.waitForNavigation()]);
So try just deleting await newTab.waitForNavigation();.
Also, do not close the new tab inside the loop; close it after the loop.
Edited script:
const puppeteer = require('puppeteer');

const mainURL = 'https://www.psychologytoday.com/us/therapists/illinois/';

(async () => {
    const browser = await puppeteer.launch({ headless: false });
    const mainpage = await browser.newPage();
    console.log('goto main page');
    await mainpage.goto(mainURL);

    console.log('collecting urls');
    const URLS = await mainpage.evaluate(() =>
        Array.from(
            document.querySelectorAll('.result-actions a'),
            (element) => element.href
        )
    );
    if (typeof URLS[0] === 'string') console.log('OK');
    console.log('collection finished');

    const collectNamePage = await browser.newPage();
    console.log('create new tab');
    var data = [];
    for (let i = 0, totalUrls = URLS.length; i < totalUrls; i++) {
        console.log(URLS[i]);
        await collectNamePage.goto(URLS[i]);
        await collectNamePage.waitForSelector('.profile-phone-column span a');
        console.log('create new tab and go there');

        // collecting data
        const [name, phone] = await collectNamePage.evaluate(
            () => [
                document.querySelector('.profile-middle .name-title-column h1').innerText,
                document.querySelector('.profile-phone-column span a').innerText
            ]
        );
        data.push({ name, phone });
    }
    console.log(data);

    await collectNamePage.close();
    await mainpage.close();
    await browser.close();
    console.log('closing browser');
})();

Can we fetch HTTP return code of previous call?

I've searched the docs and the web, but can't find how to get the HTTP status code of a request.
Does anyone know?
Example :
const puppeteer = require('puppeteer');
const fs = require('fs');

const debug = true;
var base_url = 'https://stackoverflow.com/';

(async () => {
    const browser = await puppeteer.launch({
        headless: true,
    });
    const page = await browser.newPage();
    await page.goto('https://stackoverflow.com');
    // how to get the HTTP code of the last call?
    await browser.close();
})();
There's response.status(), but I don't know how to fetch only the last request's status rather than every response with:
page.on('response', response => {
    console.log("response code: ", response.status());
});
OK, got it, thanks @Take_Care:
response.status()
const puppeteer = require('puppeteer');
const fs = require('fs');

const debug = true;
var base_url = 'https://stackoverflow.com/';

(async () => {
    const browser = await puppeteer.launch({
        headless: true,
    });
    const page = await browser.newPage();
    const ret = await page.goto('https://stackoverflow.com');
    console.log(ret.status());
    await browser.close();
})();
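If you do want the event-based approach instead, here is a sketch that logs only the main document's status rather than every subresource (request.isNavigationRequest(), request.frame() and page.mainFrame() are standard Puppeteer APIs):

page.on('response', response => {
    const request = response.request();
    // Only the top-level navigation, not images/scripts/XHR
    if (request.isNavigationRequest() && request.frame() === page.mainFrame()) {
        console.log('main document status:', response.status());
    }
});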

Jawbone API Paginated Results with 'page_token'

The Jawbone API returns paginated results of 10 JSON objects per result set. How does one obtain the rest of the paginated results?
The API documentation for the sleeps method indicates the existence of a page_token argument in the next object of the result set. My output below is missing this. Furthermore, the FAQ indicates this page_token takes an INT (presumably an epoch timestamp):
2nd: "page_token" parameter: if the request contains the "page_token" parameter, the API will return all the workouts, in
reverse order, (capped by "limit" or default of 10) that were
completed before that page_token. The page_token is a timestamp, and
there's a special case, when the request comes with page_token=0 which
is interpreted as passing page_token = CURRENT_TIMESTAMP, ie, give all
the workouts (with a limit)
I am able to authenticate with the API and return a set of 10 results (the first paginated page)... but no page_token.
...snip json...
"links": {
    "next": "/nudge/api/v.1.0/users/jMdCUPXZ-InYXo1kcdOkvA/sleeps?start_time=1424699101&updated_after=0&limit=10&end_time=1438723789"
},
"size": 10
Have I misunderstood the documentation? Could it be that the documentation is out of date (wrong)? Or, more likely, am I completely misunderstanding this and writing horrible JS for my node.js app...
Can someone set me straight and show me how to retrieve ALL results, not just the first page?
var express = require('express');
var app = express();
var port = process.env.PORT || 5000;
var passport = require('passport');
var config = require('./config.json');
var ejs = require('ejs');
var https = require('https');
var fs = require('fs');
var bodyParser = require('body-parser');
var jbStrategy = require('passport-oauth').OAuth2Strategy;
var jsonfile = require('jsonfile');
var util = require('util');
var path = require('path');

/* Calculate date range */
var $today = new Date();
var $start = new Date($today); $start.setDate($today.getDate() - 180);
var $end = new Date($today);
var $startDate = Math.floor(($start).getTime() / 1000);
var $endDate = Math.floor(($end).getTime() / 1000);

app.use(express.logger('dev')); // log every request to the console
app.use(bodyParser.json());     // read cookies (needed for auth)
app.use(express.static(__dirname + '/public'));
app.set('view engine', 'ejs');
app.set('views', __dirname + '/views');
app.use(passport.initialize());

/* Default Authentication Path */
app.get('/',
    passport.authorize('jawbone', {
        scope: config.jawboneAuth.scope,
        failureRedirect: '/'
    })
);

/* OAuth callback from Jawbone */
app.get('/done', passport.authorize('jawbone', {
    scope: config.jawboneAuth.scope,
    failureRedirect: '/'
}), function(req, res) {
    var result = JSON.parse(body);
    console.log(result);
    res.redirect('/sleeps');
});

app.get('/sleeps', function(req, res) {
    var options = {
        access_token:  config.jawboneAuth.accessToken,
        refresh_token: config.jawboneAuth.refreshToken,
        client_id:     config.jawboneAuth.clientID,
        client_secret: config.jawboneAuth.clientSecret
    };
    if (!config.jawboneAuth.accessToken) {
        // if there's no accessToken, go get one
        res.redirect('/');
    } else {
        var up = require('jawbone-up')(options);
        var page_token = [];
        do {
            up.sleeps.get({
                page_token: page_token,
                start_time: $startDate,
                end_time: $endDate
            }, function(err, body) {
                if (err) {
                    console.log('Error receiving Jawbone UP data');
                    res.send(err);
                } else {
                    try {
                        var result = JSON.parse(body);
                        var next_page_path = result.data.links.next;
                        //var next_page_token = next_page_path.split(path.sep);
                        //var page_token = next_page_token[5];
                        //page_token = result.data.links.next
                        console.log(result.data);
                        res.json(result);
                    } // end try
                    catch (err) {
                        console.log(err);
                        res.render('userdata', {
                            requestTime: 0,
                            jawboneData: 'Unknown result'
                        });
                    } // end catch(err)
                } // end else
            } // end callback fn
            ); // end up.sleeps.get()
        } // end do
        while (page_token[0] > 1);
    } // end if
}); // end sleeps route

// Set up the passport jawbone authorization strategy
passport.use('jawbone', new jbStrategy({
    clientID:          config.jawboneAuth.clientID,
    clientSecret:      config.jawboneAuth.clientSecret,
    authorizationURL:  config.jawboneAuth.authorizationURL,
    tokenURL:          config.jawboneAuth.tokenURL,
    callbackURL:       config.jawboneAuth.callbackURL,
    scope:             config.jawboneAuth.scope,
    passReqToCallback: true
}, function(req, accessToken, refreshToken, profile, done) {
    // establish a pseudo user session.
    var user = {};
    // If there's no preexisting accessToken,
    // write one to the config file.
    if (!config.jawboneAuth.accessToken) {
        config.jawboneAuth.accessToken = accessToken;
        config.jawboneAuth.refreshToken = refreshToken;
        jsonfile.writeFile('./config.json', config, { spaces: 2 }, function(err) {
            console.error(err);
        });
    }
    done(null, user);
}));

// HTTPS
var sslOptions = {
    key:  fs.readFileSync('./.server.key'),
    cert: fs.readFileSync('./.server.crt')
};

var secureServer = https.createServer(sslOptions, app).listen(port, function() {
    console.log('Listening on ' + port);
});
It turns out there is an undocumented limit parameter that has replaced the page_token.
The Jawbone developer documentation is currently out of date, as is their FAQ (API section, question #12).
A GET request like this seems to do the trick:
https://jawbone.com/nudge/api/v.1.1/users/#me/sleeps?start_time=1388603458&end_time=1420139458&limit=1000
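A minimal sketch of the same call through the jawbone-up wrapper used above, assuming the undocumented limit parameter behaves as described (1000 is an arbitrary cap):

up.sleeps.get({
    start_time: $startDate,
    end_time: $endDate,
    limit: 1000 // undocumented; replaces page_token-based pagination
}, function(err, body) {
    if (err) return console.log('Error receiving Jawbone UP data');
    var result = JSON.parse(body);
    console.log(result.data.size + ' records returned');
});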