How to intercept Server-Sent Events messages in Puppeteer - puppeteer

I can't seem to find any documentation on this use case. I wish to intercept the SSE messages from a source I have no control over, hence I can't change it from being Server-Sent Events.

Puppeteer doesn't have a native event to wait for SSE, but you can use the Chrome DevTools Protocol to do that. The reference is here, and an example would be this:
// some code
// after you have access to "Page":
// Open a raw DevTools Protocol session for this page and subscribe to
// SSE messages at the network layer (Puppeteer has no built-in event).
const client = await page.target().createCDPSession();
await client.send('Network.enable');
await client.send('Page.enable');
client.on('Network.eventSourceMessageReceived', (event) => {
  const { requestId, timestamp, eventName, eventId, data } = event;
  console.log(requestId, timestamp, eventName, eventId, data);
});
// now you can navigate to the page you are testing:
await page.goto('<url with SSE>');
EDIT:
I've prepared a complete example that has an Express.js server playing the SSE server role and a Puppeteer client that consumes the events:
// Dependencies: Puppeteer drives the browser, Express plays the SSE server.
const puppeteer = require('puppeteer');
const express = require('express');
// Express application that will serve both the test page and the event stream.
const app = express();
// Route: serve a minimal page that subscribes to the SSE endpoint and logs
// every "MyEvent" message it receives to the in-page console.
app.get('/', (req, res) => {
  res.send(`
<!DOCTYPE html>
<html>
<body>
<script>
var sseSource = new EventSource('/event-stream');
sseSource.addEventListener('MyEvent', (e) => {
console.log('[Page] Event Type:', e.type, '| Event Data:', e.data);
});
</script>
</body>
</html>
`);
});
// BUG FIX: the markup closed </html> without ever opening <html>; the
// opening tag is now emitted right after the doctype.
// Route: stream a named SSE event once per second until the client disconnects.
app.get('/event-stream', (req, res) => {
  const sseHeaders = {
    'Content-Type': 'text/event-stream',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive'
  };
  res.writeHead(200, sseHeaders);
  res.write('\n');
  const timer = setInterval(() => {
    res.write(`event: MyEvent\n`);
    res.write(`data: Test Message received at ${Date.now()}\n\n`);
  }, 1000);
  // Stop emitting once the browser closes the EventSource connection.
  req.on('close', () => clearInterval(timer));
});
// Start the SSE server, then drive a headless browser against it, mirroring
// both the in-page console output and the CDP-level SSE events to Node.
const server = app.listen(8080, async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  page.setDefaultTimeout(0); // returns void — awaiting it was a no-op
  const cdp = await page.target().createCDPSession();
  await cdp.send('Network.enable');
  await cdp.send('Page.enable');
  // Fires once per Server-Sent Event message received by the page.
  cdp.on('Network.eventSourceMessageReceived', ({ eventName, data }) => console.log(`[Node] Event Type: ${eventName} | Event Data: ${data}\n`));
  page.on('console', (msg) => console.log(msg.text()));
  await page.goto('http://localhost:8080/');
  // BUG FIX: page.waitFor() was removed in modern Puppeteer; use a plain timer.
  await new Promise((resolve) => setTimeout(resolve, 300000)); // 5 minutes
  await page.close();
  await browser.close();
  server.close();
});

Related

How to render a webpage using puppeteer

How can I get the fully rendered html+css of a client side rendered webpage? The page contents on puppeteer returns a very poorly rendered outcome with missing css
Simplified code:
const express = require('express')
const puppeteer = require('puppeteer');
// Express app exposing a single rendering endpoint.
const app = express()
// HTTP port the rendering service listens on.
const port = 3000
// Renders `url` in headless Chromium and returns the resulting HTML markup.
// NOTE(review): 'networkidle0' already implies the other waitUntil events,
// but the full list is kept to preserve the original behavior.
async function getHtml(url) {
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox']
  });
  try {
    const page = await browser.newPage();
    await page.goto(url,
      { waitUntil: ['networkidle0', 'networkidle2', 'load', 'domcontentloaded'] });
    return await page.content();
  } finally {
    // BUG FIX: close the browser even when goto/content throws; otherwise
    // every failed request leaks a Chromium process.
    await browser.close();
  }
}
// Route: render the URL passed in ?url= and reply with its final HTML;
// on failure, log the error and send it back to the client.
app.get('/', async (request, response) => {
  try {
    const html = await getHtml(request.query.url);
    response.send(html);
  } catch (err) {
    console.error(err);
    response.send(err);
  }
});
app.listen(port);
Running this with any website — for example, https://www.tesla.com/ — gives something like
Although using the page.screenshot() method gives the desired results.
Any ideas on why this occurs? And more importantly, is there a way to get around this behaviour?

Why am I missing data when using a proxy with puppeteer?

When using a proxy for my puppeteer request to scrape a page, some data is missing such as price but the rest is there. When I remove the proxy, everything loads correctly.
// Scrapes product listings from `url` through a proxy-configured browser.
// NOTE(review): `url`, `puppeteer`, and `cheerio` must be in scope in the
// surrounding file — they are not defined here.
const getData = async () => {
  const browser = await puppeteer.launch({
    headless: true,
    args: ["--no-sandbox", "--disable-setuid-sandbox", "--ignore-certificate-errors", "--proxy-server=myproxy"],
  });
  try {
    const page = await browser.newPage();
    await page.authenticate({ username: "username", password: "password" });
    await page.goto(url, {
      timeout: 300000,
      waitUntil: "load",
    });
    const html = await page.evaluate(() => document.body.innerHTML);
    const $ = cheerio.load(html);
    const products = [];
    $("[data-automation=product-results] > div").each((index, el) => {
      // BUG FIX: `product` was never defined (ReferenceError on first row);
      // wrap the current DOM element so the cheerio selectors below work.
      const product = $(el);
      const id = product.attr("data-product-id");
      const name = product.find("[data-automation=name]").text();
      const image = product.find("[data-automation=image]").attr("src");
      const price = product.find("[data-automation=current-price]").text();
      products.push({ id, name, image, price });
    });
    console.log(products);
    return products;
  } finally {
    // BUG FIX: the browser was never closed, leaking a Chromium process.
    await browser.close();
  }
};
I have tried: waitUntil: 'domcontentloaded' (same results), waitUntil: 'networkidle0' and waitUntil: 'networkidle2' both times out (5 minutes).
I don't quite understand why I am able to get all the data without using a proxy and only get partial data using a proxy.

Puppeteer wait until Cloudflare redirect is done

I would like to log in on a site which is using Cloudflare DDoS protection like this:
The code is simple:
const puppeteer = require('puppeteer');
// Credentials module (provides C.username / C.password).
const C = require('./constants');
// CSS selectors for the login form controls.
const USERNAME_SELECTOR = 'input[name="username"]';
const PASSWORD_SELECTOR = 'input[name="password"]';
const CTA_SELECTOR = '.button';
// NOTE(review): cloudscraper is required but never used in this script.
var cloudscraper = require('cloudscraper');
// Launches a headless browser (with a 10s slowMo between actions)
// and opens a fresh page; returns both handles.
async function startBrowser() {
  const launchOptions = {
    headless: true,
    slowMo: 10000,
  };
  const browser = await puppeteer.launch(launchOptions);
  const page = await browser.newPage();
  return { browser, page };
}
// Shuts down the given browser; resolves once it has fully closed.
async function closeBrowser(browser) {
  await browser.close();
}
// Logs into `url`, waiting out the Cloudflare challenge before interacting
// with the form, and captures before/after screenshots.
async function playTest(url) {
  const {browser, page} = await startBrowser();
  await page.setViewport({width: 1366, height: 768}); // BUG FIX: was not awaited
  // BUG FIX: the option name is case-sensitive — `waituntil` was silently ignored.
  await page.goto(url, {waitUntil: 'domcontentloaded'});
  await page.screenshot({path: 'debug.png'});
  // Wait for the Cloudflare redirect to finish and the login form to render;
  // clicking immediately was hitting the interstitial challenge page.
  await page.waitForSelector(USERNAME_SELECTOR);
  await page.click(USERNAME_SELECTOR);
  await page.keyboard.type(C.username);
  await page.click(PASSWORD_SELECTOR);
  await page.keyboard.type(C.password);
  await page.click(CTA_SELECTOR);
  await page.waitForNavigation();
  await page.screenshot({path: 'ipt.png'});
  await closeBrowser(browser); // BUG FIX: the browser was never closed
}
// Entry point: run the login scenario, then terminate the process.
const main = async () => {
  await playTest("https://xy.com/login.php");
  process.exit(1);
};
main();
When I check debug.png, I see only the Cloudflare DDoS protection page. I don't really understand why; I added a 10-second slowMo to delay the execution.
You can add a simple waitForSelector to wait until the username selector appears,
// Block until the Cloudflare challenge finishes and the login form renders.
await page.waitForSelector(USERNAME_SELECTOR);
await page.click(USERNAME_SELECTOR);

Load dynamic content with puppeteer?

I'm using this code to get the page data. It works but I get the data just one time.
The problem is that this data is updated lets say every second. I want to get it without reloading the page.
This is a simple example of what I want - http://novinite.win/clock.php
Is there a way to refresh the result without reloading the web page?
const puppeteer = require('puppeteer');
// Entry point: open the URL given on the command line, logging every
// outgoing request, then print the rendered HTML.
(async () => {
  const url = process.argv[2];
  const browser = await puppeteer.launch({
    args: ['--no-sandbox']
  });
  const page = await browser.newPage();
  // BUG FIX: request.continue() throws unless interception is enabled first.
  await page.setRequestInterception(true);
  page.on('request', (request) => {
    // BUG FIX: method/url are methods, not properties — call them,
    // otherwise the function source is interpolated into the log line.
    console.log(`Intercepting: ${request.method()} ${request.url()}`);
    request.continue();
  });
  await page.goto(url, {waitUntil: 'load'});
  const html = await page.content();
  console.log(html);
  await browser.close(); // BUG FIX: awaited so the process exits cleanly
})();
You can await a Promise that logs the current textContent every 1000 ms (1 second) with setInterval and resolves after a set number of intervals (for example, 10 intervals):
'use strict';
const puppeteer = require('puppeteer');

// Open the clock page and log its #txt contents once per second for
// 10 ticks, then close the browser and let the script finish.
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('http://novinite.win/clock.php');
  await new Promise((resolve) => {
    let ticks = 0;
    const timer = setInterval(async () => {
      const text = await page.evaluate(() => document.getElementById('txt').textContent);
      console.log(text);
      ticks += 1;
      if (ticks === 10) {
        clearInterval(timer);
        await browser.close();
        resolve();
      }
    }, 1000);
  });
})();
Example Result:
19:24:15
19:24:16
19:24:17
19:24:18
19:24:19
19:24:20
19:24:21
19:24:22
19:24:23
19:24:24

Puppeteer can't catch failing request & errors

I trying to collect data from failing requests and js error.
I'm using the following site: https://nitzani1.wixsite.com/marketing-automation/3rd-page
The site has a request to https://api.fixer.io/1latest, which returns a status code of 404,
also the page contains the following JS error:
"Uncaught (in promise) Fetch did not succeed"
I've tried the code below to catch the 404 and the JS error but couldn't.
Not sure what I'm doing wrong, any idea as to how to solve it?
const puppeteer = require('puppeteer');
// Returns a promise that resolves after `ms` milliseconds.
function wait (ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
// Opens the target page and logs page crashes, uncaught in-page exceptions,
// failed requests, and responses whose bodies cannot be read.
const run = async () => {
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--start-fullscreen']
  });
  // BUG FIX: `page` and `collectResponse` were implicit globals.
  const page = await browser.newPage();
  // Emitted when the page crashes.
  page.on('error', err => {
    console.log('err: ' + err);
  });
  // Emitted for uncaught exceptions thrown inside the page.
  page.on('pageerror', pageerr => {
    console.log('pageerr: ' + pageerr);
  });
  // NOTE(review): two 'requestfailed' listeners were registered in the
  // original; both are kept so the logged output stays identical.
  page.on('requestfailed', err => console.log('requestfailed: ' + err));
  const collectResponse = [];
  page.on('requestfailed', rf => {
    console.log('rf: ' + rf);
  });
  // page.on() returns the emitter synchronously — awaiting it was a no-op.
  page.on('response', response => {
    const url = response.url();
    response.buffer().then(
      b => {
        // console.log(url+' : '+response.status())
      },
      e => {
        console.log('response err');
      }
    );
  });
  await wait(500);
  await page.setViewport({ width: 1920, height: 1080 });
  await page.goto('https://nitzani1.wixsite.com/marketing-automation/3rd-page', {
  });
};
run();
The complete worked answer is:
const puppeteer = require('puppeteer');
// Opens the target page headlessly and logs failed requests, page errors,
// and every console message, then waits long enough for late errors to land.
const run = async () => {
  const browser = await puppeteer.launch({
    headless: true
  });
  const page = await browser.newPage();
  // Catch all failed requests like 4xx..5xx status codes
  page.on('requestfailed', request => {
    console.log(`url: ${request.url()}, errText: ${request.failure().errorText}, method: ${request.method()}`)
  });
  // Catch console log errors
  page.on("pageerror", err => {
    console.log(`Page error: ${err.toString()}`);
  });
  // Catch all console messages
  page.on('console', msg => {
    console.log('Logger:', msg.type());
    console.log('Logger:', msg.text());
    console.log('Logger:', msg.location());
  });
  await page.setViewport({ width: 1920, height: 1080 });
  await page.goto('https://nitzani1.wixsite.com/marketing-automation/3rd-page', { waitUntil: 'domcontentloaded' });
  // BUG FIX: page.waitFor() was removed in modern Puppeteer; use a timer.
  await new Promise(resolve => setTimeout(resolve, 10000)); // To be sure all exceptions logged and handled
  await browser.close();
};
run();
Save in .js file and easily run it.
Puppeteer 8.0.0+ includes very little information in message.text(), so we need to get a description of the error from the JSHandle.
Please check this comment with fully descriptive console errors from JSHandle object
Check the link here https://stackoverflow.com/a/66801550/9026103