I came across a website that puppeteer can't handle. When taking a screenshot, either Protocol error (Runtime.callFunctionOn): Target closed or Protocol error (Emulation.setDeviceMetricsOverride): Target closed is thrown. Before taking the screenshot, I scroll the page so that all images are loaded. The page is large, so I set the '--disable-dev-shm-usage' and '--shm-size=3gb' params in the hope of preventing any memory issues. This is sample code with the url included. Any idea why the page is closed in the middle of the operation? In addition to puppeteer-cluster ("^0.23.0"), I am also using puppeteer-extra-plugin-stealth ("^2.9.0") and puppeteer-extra ("^3.2.3").
import puppeteer from 'puppeteer-extra';
import {Cluster} from 'puppeteer-cluster';
import StealthPlugin from "puppeteer-extra-plugin-stealth";
// Register the stealth plugin on the puppeteer-extra instance before the cluster is launched.
puppeteer.use(StealthPlugin());

// Command-line switches handed to the browser at launch.
// NOTE(review): '--shm-size' looks like a container-runtime option rather than a
// browser switch — confirm that it has any effect here.
const browserArgs = [
  '--disable-setuid-sandbox',
  '--no-sandbox',
  '--window-size=1920,1080',
  '--disable-dev-shm-usage',
  '--shm-size=3gb',
];

// Launch options for the headless browser the cluster drives.
const puppeteerOptions = { headless: true, args: browserArgs };

// Start a cluster that runs at most one job at a time.
const cluster = await Cluster.launch({
  puppeteer,
  concurrency: Cluster.CONCURRENCY_CONTEXT,
  maxConcurrency: 1,
  puppeteerOptions,
});
// Job handler: load the queued URL, then capture the mobile and desktop screenshots.
// Screenshot() accepts only the page, so nothing else is passed to it; the extra
// argument used previously referenced an undeclared variable, which throws a
// ReferenceError in an ES module before any screenshot is taken.
await cluster.task(async ({ page, data: url }) => {
  await page.goto(url, { waitUntil: "networkidle2" });
  await Screenshot(page);
});
/**
 * Scrolls the page downwards in fixed steps from inside the page context.
 *
 * Every 50 ms the window is scrolled by 389 px. Scrolling stops once the distance
 * scrolled reaches the document body's scroll height minus the window's inner
 * height, or after more than 100 ticks, whichever happens first.
 *
 * @param {object} page - Object exposing `evaluate(fn)`; `fn` runs where `window`
 *   and `document` are available.
 * @returns {Promise<void>} Settles when the in-page scrolling has finished.
 */
async function autoScroll(page){
  // Everything the callback needs is declared inside it, because it is handed
  // to page.evaluate() and executed in the page rather than here.
  const scrollInPage = async () => {
    const STEP_PX = 389;
    const MAX_TICKS = 100;
    const TICK_MS = 50;
    try {
      await new Promise((done) => {
        let scrolled = 0;
        let ticks = 0;
        const handle = setInterval(() => {
          ticks += 1;
          // Read the height before scrolling, as pages may grow while scrolling.
          const pageHeight = document.body.scrollHeight;
          window.scrollBy(0, STEP_PX);
          scrolled += STEP_PX;
          const reachedBottom = scrolled >= pageHeight - window.innerHeight;
          if (!reachedBottom && ticks <= MAX_TICKS) {
            return;
          }
          clearInterval(handle);
          done();
        }, TICK_MS);
      });
    } catch (err) {
      console.log("we got scrolling error:");
      console.log(err);
    }
  };

  await page.evaluate(scrollInPage);
}
/**
 * Takes a full-page screenshot of the page at a mobile and at a desktop viewport.
 *
 * After an initial 6 s pause, each viewport is applied in turn; the page is
 * auto-scrolled, scrolled back to the top, given 2 s to settle and then saved
 * to "./mobile.jpg" or "./desktop.jpg" respectively.
 * Any error is logged to the console and not rethrown.
 *
 * @param {object} page - Puppeteer page to capture.
 * @returns {Promise<void>}
 */
async function Screenshot(page) {
  const shouldSave = true;
  const captures = [
    { viewport: { width: 390, height: 844 }, path: "./mobile.jpg" },
    { viewport: { width: 1920, height: 1080 }, path: "./desktop.jpg" },
  ];

  try {
    await page.waitForTimeout(6000);
    for (const { viewport, path } of captures) {
      await page.setViewport(viewport);
      await autoScroll(page);
      // Return to the top so the capture starts from the beginning of the page.
      await page.evaluate(() => window.scrollTo(0, 0));
      await page.waitForTimeout(2000);
      if (shouldSave) {
        await page.screenshot({ path, fullPage: true });
      }
    }
  } catch (error) {
    console.log("we got screenshot error");
    console.log(error);
  }
}
// Enqueue the single URL to capture.
cluster.queue("https://www.sinsay.com/si/sl/sale/woman/view-all-clothes");
// Wait until the cluster reports that it is idle, then close it.
await cluster.idle();
await cluster.close();
stack trace:
ProtocolError: Protocol error (Runtime.callFunctionOn): Target closed.
at /path/to/puppeteer/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:230:24
at new Promise (<anonymous>)
at CDPSession.send (/path/to/puppeteer/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:226:16)
at next (/path/to/puppeteer/node_modules/puppeteer-extra-plugin-stealth/evasions/sourceurl/index.js:32:41)
at CDPSession.send (/path/to/puppeteer/node_modules/puppeteer-extra-plugin-stealth/evasions/sourceurl/index.js:65:16)
at ExecutionContext._evaluateInternal (/path/to/puppeteer/node_modules/puppeteer/lib/cjs/puppeteer/common/ExecutionContext.js:204:50)
at ExecutionContext.evaluate (/path/to/puppeteer/node_modules/puppeteer/lib/cjs/puppeteer/common/ExecutionContext.js:110:27)
at DOMWorld.evaluate (/path/to/puppeteer/node_modules/puppeteer/lib/cjs/puppeteer/common/DOMWorld.js:123:24)
at processTicksAndRejections (node:internal/process/task_queues:96:5) {
originalMessage: ''
}
Related
Hi, I've been trying to get puppeteer to take a screenshot of full pages, including all images. Unfortunately, background images are getting omitted (see comparison below)... I can't figure out how to get them.
Here's my code
/**
 * Opens `url` in a headless browser, waits for the <img> elements to finish
 * loading and saves a full-page screenshot.
 *
 * The browser is always closed, including when navigation, image loading or
 * the screenshot itself fails.
 *
 * @param url - Address of the page to capture.
 * @returns The path the screenshot was written to, as produced by
 *   generateScreenshotPath() (presumably a string — confirm against its definition).
 * @throws Rethrows any navigation/screenshot error, and rejects if an image
 *   fires its "error" event while being waited on.
 */
async function screeshotFullPage(url: string): Promise<string> {
  const browser = await puppeteer.launch({ headless: true });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "networkidle0" });
    await page.evaluate(async () => {
      const images = Array.from(document.querySelectorAll("img"));
      //https://stackoverflow.com/questions/46160929/puppeteer-wait-for-all-images-to-load-then-take-screenshot
      // Bring the bottom of the body into view before waiting on the images.
      document.body.scrollIntoView(false);
      await Promise.all(
        images.map((img) => {
          // Already-loaded images need no waiting.
          if (img.complete) return undefined;
          return new Promise((resolve, reject) => {
            img.addEventListener("load", resolve);
            img.addEventListener("error", reject);
          });
        })
      );
    });
    await sleep(5000); // resolves in 5 sec
    const path = generateScreenshotPath();
    await page.screenshot({
      path,
      fullPage: true,
    });
    // Return the file path: the declared return type is Promise<string>,
    // whereas the result of browser.close() carries no value.
    return path;
  } finally {
    await browser.close();
  }
}
// Example call: capture https://chesskid.com with the function defined above.
await screeshotFullPage("https://chesskid.com")
Hi guys, can you please point out my mistake in this code?
console.log(urls) is printing undefined.
Thanks in advance.
const puppeteer = require('puppeteer');
// Launches Chrome, opens the page, waits for the 'a.review.exclick' links and
// collects their href attributes inside the page.
// NOTE: as written, this function has no return statement of its own, so
// callers that await it receive undefined.
async function GetUrls() {
const browser = await puppeteer.launch( { headless: false,
executablePath: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe' })
const page = await browser.newPage();
await page.goto("https://some page");
await page.waitForSelector('a.review.exclick');
// The callback below is passed to page.evaluate(); `urls` receives what it returns.
let urls = await page.evaluate(() => {
let results = [];
let items = document.querySelectorAll('a.review.exclick');
items.forEach((item) => {
results.push({
url: item.getAttribute('href'),
});
});
return results;
// NOTE: unreachable — this statement follows the return above.
browser.close();
});
}
// Entry point: print whatever GetUrls() resolves to, then force the process to
// end. The exit status is 0 on success and 1 only when GetUrls() rejects, so
// the status reflects the outcome instead of always signalling failure.
(async () => {
  try {
    const URLS = await GetUrls();
    console.log(URLS);
    process.exit(0);
  } catch (error) {
    console.error(error);
    process.exit(1);
  }
})();
Here is a list:
you don't have a return statement in your GetUrls() function
you call browser.close() after a return statement (so it is never reached) AND inside the page.evaluate() method
Keep in mind that anything that is executed within the page.evaluate() will relate to the browser context. To quickly test this, add a console.log("test") before let results = []; and you will notice that nothing appears in your Node.js console, it will appear in your browser console instead.
Therefore, the browser variable is visible within the GetUrls() function but NOT visible within the page.evaluate() method.
Here is the corrected code sample:
const puppeteer = require('puppeteer');
/**
 * Launches Chrome, opens the page and collects the href of every
 * 'a.review.exclick' link.
 *
 * The browser is closed in a `finally` block, so it is also released when
 * navigation or the selector wait throws.
 *
 * @returns {Promise<Array<{url: string|null}>>} One entry per matching link.
 * @throws Rethrows any error raised while launching, navigating or evaluating.
 */
async function GetUrls() {
  const browser = await puppeteer.launch({
    headless: false,
    executablePath: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
  });
  try {
    const page = await browser.newPage();
    await page.goto("https://some page");
    await page.waitForSelector('a.review.exclick');
    // This callback runs in the page, so it may only use what the page provides.
    return await page.evaluate(() => {
      const items = document.querySelectorAll('a.review.exclick');
      return Array.from(items, (item) => ({
        url: item.getAttribute('href'),
      }));
    });
  } finally {
    await browser.close();
  }
}
// Entry point: print the collected URLs, then force the process to end.
// The exit status is 0 on success and 1 only when GetUrls() rejects, so the
// status reflects the outcome instead of always signalling failure.
(async () => {
  try {
    const URLS = await GetUrls();
    console.log(URLS);
    process.exit(0);
  } catch (error) {
    console.error(error);
    process.exit(1);
  }
})();
I am trying to make it log the data found under inspect > network > preview, but right now it logs what is shown under inspect > network > headers.
Here is what I have:
const puppeteer = require("puppeteer");
// Address that StartScraping() navigates to.
const url =
"https://www.google.com/";
/**
 * Launches a visible browser, opens a 1500x800 page, logs every response whose
 * URL contains "Text" and then navigates to `url`.
 *
 * @returns {Promise<void>} Settles once the navigation has reached the "load" event.
 */
async function StartScraping() {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.setViewport({ width: 1500, height: 800 });

  // Logs the response object itself for every matching URL.
  const logMatchingResponse = async (response) => {
    const matches = response.url().includes("Text");
    if (matches) {
      console.log(await response);
    }
  };
  page.on("response", logMatchingResponse);

  // timeout: 0 is passed through unchanged to page.goto().
  await page.goto(url, { waitUntil: "load", timeout: 0 });
}
// Start the scrape. The returned promise is neither awaited nor given a rejection handler here.
StartScraping();
It depends how you want it formatted. More information can be found here: https://github.com/puppeteer/puppeteer/blob/9ef4153f6e3548ac3fd2ac75b4570343e53e3a0a/docs/api.md#class-response
I've modified your code a bit to where I think you would want the response:
// Print the body text of every response whose URL contains "Text".
page.on("response", async (response) => {
  if (!response.url().includes("Text")) {
    return;
  }
  try {
    console.log(await response.text());
  } catch (error) {
    // Reading the body can fail (for example when a response has no body
    // available). Report it here so the rejection does not escape this
    // event handler unhandled.
    console.log(`Could not read body of ${response.url()}: ${error}`);
  }
});
I'm using puppeteer for gathering information, and sometimes one of these pieces of information comes back as []. This is an example of how I'm calling the puppeteer functions:
try {
const browser = await puppeteer.launch({
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
],
});
const page = await browser.newPage();
await page.goto(url);
await page.waitForSelector('div#results');
await page.waitForSelector('h6.bestPrice');
const result = await page.evaluate(() => {
let items = document.querySelectorAll('maintable > tbody > tr.productsName > td > div.products');
if (items.length !== 0) {
let data = [];
items.forEach(function (el) {
let object = null;
el.querySelectorAll('h5 a').forEach(function (el) {
object = { product: el.innerHTML, img: [] };
});
el.querySelectorAll('p a').forEach(function (el) {
object.description = el.innerHTML;
});
el.querySelectorAll('h6.bestPriceText').forEach(function (el) {
object.price = el.innerHTML;
});
el.querySelectorAll('div.slide a.fancybox-thumbs img').forEach(function (el) {
object.img.push(el.getAttribute('src'));
});
data.push(object);
});
return data;
}
let error = null;
document.querySelectorAll('div.noResults div.info-message h2').forEach(function (el) {
error = ({ message: el.innerHTML });
});
return error;
});
await browser.close();
return result;
} catch (error) {
return { message: `Problems loading the page! Please, try again.${error}` };
}
};
The information which sometimes comes back empty is the price. Sorry, but I can't reveal the url; I would like to know if there is any problem with the design of this code.
A friend suggested that I add a timeout, because it could be a problem with rendering:
const resultado = await page.evaluate(async () => {
const timeout = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
await timeout(5000);
I'm trying to collect data from failing requests and JS errors.
I'm using the following site: https://nitzani1.wixsite.com/marketing-automation/3rd-page
The site makes a request to https://api.fixer.io/1latest, which returns a status code of 404;
the page also contains the following JS error:
"Uncaught (in promise) Fetch did not succeed"
I've tried the code below to catch the 404 and the JS error, but couldn't.
Not sure what I'm doing wrong; any idea how to solve it?
const puppeteer = require('puppeteer');
/**
 * Returns a promise that fulfils (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>}
 */
function wait (ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Opens the test page in a visible, full-screen browser and logs crash
 * ('error'), uncaught-exception ('pageerror') and 'requestfailed' events, plus
 * a note whenever a response body cannot be read.
 *
 * `page` is now a local binding. Previously it and `collectResponse` were
 * assigned without a declaration, which creates globals in sloppy mode and
 * throws a ReferenceError in strict mode / ES modules. The unused
 * `collectResponse` array was dropped.
 * The browser is intentionally left open, as in the original.
 *
 * @returns {Promise<void>} Settles once page.goto() has completed.
 */
const run = async () => {
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--start-fullscreen']
  });
  const page = await browser.newPage();

  page.on('error', (err) => {
    console.log(`err: ${err}`);
  });
  page.on('pageerror', (pageerr) => {
    console.log(`pageerr: ${pageerr}`);
  });
  // Two 'requestfailed' listeners are kept so the logged output is unchanged.
  page.on('requestfailed', (err) => console.log(`requestfailed: ${err}`));
  // page.on() only registers a listener, so there is nothing to await.
  page.on('requestfailed', (rf) => {
    console.log(`rf: ${rf}`);
  });
  page.on('response', (response) => {
    response.buffer().then(
      () => {
        // Body was read successfully; nothing is logged for this case.
      },
      () => {
        console.log('response err');
      }
    );
  });

  await wait(500);
  await page.setViewport({ width: 1920, height: 1080 });
  await page.goto('https://nitzani1.wixsite.com/marketing-automation/3rd-page');
};
// Start the script. The returned promise is not awaited here.
run();
The complete working answer is:
const puppeteer = require('puppeteer');
/**
 * Loads the test page headlessly and logs failed requests, HTTP error
 * responses, uncaught page errors and console messages.
 *
 * HTTP error statuses such as 404 are reported from the 'response' event:
 * a request that receives any HTTP response counts as completed, so
 * 'requestfailed' alone only covers requests that never got a response.
 * The browser is closed in a `finally` block so it is released on errors too.
 *
 * @param {object} [options]
 * @param {number} [options.settleMs=10000] - How long to keep the page open
 *   after navigation so late errors can still be logged.
 * @returns {Promise<void>}
 * @throws Rethrows any launch or navigation error after closing the browser.
 */
const run = async ({ settleMs = 10000 } = {}) => {
  const browser = await puppeteer.launch({
    headless: true
  });
  try {
    const page = await browser.newPage();

    // Requests that failed without receiving a response
    page.on('requestfailed', request => {
      const failure = request.failure();
      const errText = failure ? failure.errorText : 'unknown';
      console.log(`url: ${request.url()}, errText: ${errText}, method: ${request.method()}`);
    });

    // Responses with a 4xx..5xx status code
    page.on('response', response => {
      const status = response.status();
      if (status >= 400) {
        console.log(`url: ${response.url()}, status: ${status}, method: ${response.request().method()}`);
      }
    });

    // Uncaught exceptions raised in the page
    page.on("pageerror", err => {
      console.log(`Page error: ${err.toString()}`);
    });

    // Catch all console messages
    page.on('console', msg => {
      console.log('Logger:', msg.type());
      console.log('Logger:', msg.text());
      console.log('Logger:', msg.location());
    });

    await page.setViewport({ width: 1920, height: 1080 });
    await page.goto('https://nitzani1.wixsite.com/marketing-automation/3rd-page', { waitUntil: 'domcontentloaded' });
    // Plain timer instead of the deprecated page.waitFor(): gives late
    // exceptions time to be logged before the browser goes away.
    await new Promise(resolve => setTimeout(resolve, settleMs));
  } finally {
    await browser.close();
  }
};
// Start the script. The returned promise is not awaited here.
run();
Save in .js file and easily run it.
Current versions of puppeteer (8.0.0 and up) put very little information in message.text(), so we need to get a description of the error from the JSHandle.
Please check this comment for fully descriptive console errors from the JSHandle object.
Check the link here https://stackoverflow.com/a/66801550/9026103