Using Puppeteer version: "9.0.0"
Unfortunately debugging in chrome dev tools does not work at all with this puppeteer version.
So I resorted to NDB.
Using NDB I can breakpoint anywhere apart from inside page.evaluate function and page.$$eval().
Running the script with ndb:
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1",
"start": "ndb node startscrape.js"
},
startscrape.js
( async () => {
const browser = await puppeteer.launch(
{
headless: false,
defaultViewport:null,
slowMo: 250,
devtools:true,
});
const page = await browser.newPage();
await page.goto('https://www.google.com');
await page.type('input', 'Here' );
await page.keyboard.press('Enter');
await page.waitForNavigation();
let x = () => {
debugger;
console.log('can I debug here?') //YES - breakpoints work upon executing x();
};
x();
let xa = await page.evaluate(() => {
console.log('Alive'); // Logging works in console but cannot breakpoint
let elements = document.getElementsByClassName('someitem');
return elements;
});
// Cannot debug inside here too
// let xa = await page.$$eval('body', (body) => {
// console.log('Alive');
// let elements = document.getElementsByClassName('serp-item');
// return elements;
// });
// breakpoint comes here
await page.goto('https://www.google.com');
//await browser.waitForTarget(() => false);
})();
It seems the function arguments of page.evaluate() and similar ones are not executed per se: their serialized (stringified) code is transferred from the Node.js context into the browser context, then a new function is recreated from that code and executed there. That is why the breakpoints in the initial function have no effect on the recreated function.
Related
I am running puppeteer in kiosk-printing mode to automate the printing of PDF to the printer. But I have to use waitForTimeout to give the printer sufficient time to, I guess, receive the print instructions. Otherwise, browser.close( ) shuts puppeteer down too quickly, and nothing prints.
How to wait for the print function to complete? I am not sure what the arbitrary waitForTimeout value should be.
const puppeteer = require('puppeteer');
(async () =>
{
const browser = await puppeteer.launch(
{
headless: false,
"args": [ "--kiosk-printing" ]
});
const page = await browser.newPage();
await page.goto(`file:///C:/Users/srich/Downloads/packing-list.pdf`);
await page.evaluate(() => { window.print(); });
await page.waitForTimeout(2000) ;
await browser.close( ) ;
})();
I tried to goto a web page after running window.print. Then wait for that navigation to complete. But the shutdown of puppeteer still came too quickly and the PDF did not print.
await page.evaluate(() => { window.print(); });
await page.goto(`https://google.com`);
await page.waitForSelector('input');
await browser.close( ) ;
in a project that requires:
Starting each session with logging in credentials + notification/OTP
work with multiple accounts asynchronously
remote debugging or monitoring of the session
5+ different operations can be requested on a open session , in any order
I want it to be a puppeteer cluster browser with one remote debugging port to monitor it
but couldn't integrate WsEndpoints
const puppeteer = require('puppeteer-extra');
const { Cluster } = require('puppeteer-cluster');
class SingleBrowserCluster {
browserInstance
options
constructor() {
if (SingleBrowserCluster._instance) {
//throw new Error("Singleton classes can't be instantiated more than once.")
}
else{
SingleBrowserCluster._instance = this;
// ... Your rest of the constructor code goes after this
console.log("pre optArgs");
const optArgs = [
'--remote-debugging-port=3002',//works if dockerised
'--remote-debugging-address=0.0.0.0',// at localhost.3002
'--window-size=1920,1080',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-gpu', '--no-zygote', //'--single-process',
];
console.log("pre options");
this.options = {
headless: true,//for dockerization
args: optArgs,
defaultViewport: null,
waitUntil: 'networkidle2'
};
console.log("Do launch now");
return this;
}
}
async screenshotMethod({ page, data: url }) {
await page.goto(url);
console.log(`%c worker X is running on ${url} `, `color:green;`);
console.log("will wait 20 second");
await page.waitForTimeout(20000)
const path = url.replace(/[^a-zA-Z]/g, '_') + '.png';
await page.screenshot({ path });
};
async launchCluster (){
try {
this.browserInstance = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_CONTEXT,
maxConcurrency: 3,
puppeteerOptions: this.options
})
console.log(this.browserInstance);
return this.browserInstance;
} catch (error) {
console.log(`%c ERRORR`,`color:red;`);
console.log(error);
}
}
}
const StartScraper = async (Url, useProxy) => new Promise((resolve, reject) => {
(async () => {
// get proxy url from environment files
const proxyUrl = process.env.PROXY_URL;
//--disable-dev-shm-usage
// By default, Docker runs a container with a /dev/shm shared memory space 64MB. This is typically too small for Chrome and will cause Chrome to crash when rendering large pages.
//his will write shared memory files into /tmp instead of /dev/shm. See crbug.com/736452 for more details.
var instanceOne1= new SingleBrowserCluster()//.launchCluster()
var browser= await instanceOne1.launchCluster();
browser.queue('https://www.google.com/', instanceOne1.screenshotMethod);
//THE PROBLEM LINE
const wsEndpoint = browser.wsEndpoint();
try {
const page = (await browser.pages())[0];
await page.goto(Url, { waitUntil: 'load' });
return resolve(wsEndpoint);
} catch (e) {
browser.close();
return reject(false);
}
})();
});
how can i have WSendpoint of any session in a puppeteer-cluster
( more info:
i will put those in a session file
to provide my next selected consequtive operation a connection point on its session
localhost/StartScraper creates WSendpoint
localhost/login==WSendpoint==>Connects to existing session do login stuff
localhost/listItems==WSendpoint==>Connects to existing session do listItems stuff
...
)
I am trying to use args in my code to use a proxy service I have. If I remove the args altogether things run fine but if I have them in there I get an error stating: Error: Unable to restart chrome. I checked multiple examples and copied the same to my code but it seems to fail. Any ideas on how to implement this correctly?
Code:
const { Cluster } = require('puppeteer-cluster');
const vanillaPuppeteer = require('puppeteer');
const { addExtra } = require('puppeteer-extra');
const Stealth = require('puppeteer-extra-plugin-stealth')
async function main() {
// Create a custom puppeteer-extra instance using `addExtra`,
// so we could create additional ones with different plugin config.
const puppeteer = addExtra(vanillaPuppeteer)
puppeteer.use(Stealth())
let proxy_server = 'proxy.soax.com:9000';
let user = 'some_user_name';
let pass = 'some_password';
// Launch cluster with puppeteer-extra
const cluster = await Cluster.launch({
puppeteer,
puppeteerOptions: {
headless: false,
args: ['--proxy-server=' + proxy_server,
'--single-process',
'--no-zygote',
'--no-sandbox'],
sameDomainDelay: 1000,
retryDelay: 3000,
workerCreationDelay: 3000},
maxConcurrency: 2,
concurrency: Cluster.CONCURRENCY_CONTEXT,
monitor: false,
skipDuplicateUrls: true
})
// Define task handler
await cluster.task(async ({ page, data: url }) => {
await page.authenticate({
username: user,
password: pass,
});
await page.goto(url)
const { hostname } = new URL(url)
console.log(`checking on ${hostname}`)
await page.screenshot({ path: `${hostname}.png`, fullPage: true })
})
// Queue any number of tasks
cluster.queue('https://whatismyipaddress.com/')
await cluster.idle()
await cluster.close()
console.log(`All done`)
}
main().catch(console.warn)
I played around a bit and discovered by removing the arg --single-process then it works fine.
i just want to use XPath to get innerText using Puppeteer. This is code
import * as puppeteer from 'puppeteer-core';
(async () => {
// Make the browser visible by default, extend the timeout, and set a default viewport size
const browser = await puppeteer.launch({
executablePath: 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
userDataDir: 'C:\\ctvbanhang\\browserData',
defaultViewport: { width: 1920, height: 1080 },
headless: false, // true = hide screen, false = show screen
timeout: 60000, // 60 seconds
});
// The browser automatically opens a page, so use that
const page = (await browser.pages())[0];
await page.goto('https://example.com/');
var XPath = "//h1/text()";// //'div.product-briefing > div > div > div > span';
// //await page.waitForSelector(selector);
await page.waitForXPath(XPath);
let result = await page.evaluate(element => {
console.log(element); //log in browser
console.log(typeof element); //log in browser
console.log(JSON.stringify(element)); //log in browser
return element;
}, (await page.$x(XPath))[0]);
console.log(result); //log in terminal
await page.waitFor(100000);
await browser.close();
})()
.then(() => {
console.log('Browser scans complete!');
})
Why the result is not the same?
this is result log in browser
and in terminal
According to the docs, various eval functions can transfer only serializable data (roughly, the data JSON can handle, with some additions). Your code returns a DOM element (Text node), which is not serializable (it has methods and circular references). Try to retrieve the data in the browser context and returns only serializable data. For example:
return element.wholeText;
Can someone explain why this code isn't working. I have a console log before I run page.evaluate() which logs what I expect, but the console log inside page.evaluate never runs.
const puppeteer = require('puppeteer');
(async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://www.example.com');
page.on('response', async response => {
const url = response.url();
if (url.includes('something')) {
console.log('this code runs');
await page.evaluate(() => {
console.log("this code doesn't run");
});
}
});
} catch (err) {
console.log(err);
}
})();
Console log doesn't work in page.evaluate()
https://github.com/GoogleChrome/puppeteer/issues/1944
Try to use this code for display console.log from evaluate
page.on('console', msg => {
for (let i = 0; i < msg.args().length; ++i)
console.log(`${i}: ${msg.args()[i]}`);
});
page.evaluate(() => console.log('hello', 5, {foo: 'bar'}));
https://pptr.dev/#?product=Puppeteer&version=v1.20.0&show=api-event-console
The code inside page.evaluate is run in the browser context, so the console.log works, but inside the Chrome console and not the Puppeteer one.
To display the logs of the Chrome context inside the Puppeteer console, you can set dumpio to true in the arguments when launching a browser using Puppeteer:
const browser = await puppeteer.launch({
dumpio: true
})
Console.log works but in the browser context. I'm guessing here that you are trying to see the log in the CLI. If you want to see the log set headless to false and then see the log in the browser console.