I'm trying to generate a PDF from local HTML, but the CSS isn't showing up.
const puppeteer = require('puppeteer');

(async () => {
  const html = '<div style="width:500px; height:500px; background:blue">test</div>';
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.setContent(html);
  await page.pdf({path: 'hn.pdf', format: 'A4'});
  await browser.close();
})();
How can I get the CSS to work?
Add printBackground: true to your page.pdf call:
await page.pdf({path: 'hn.pdf', format: 'A4', printBackground: true});
The problem isn't with the CSS. The CSS is applied, but when you print a page, backgrounds are stripped by default. Try adding color: red; to your CSS and you'll see that the CSS works.
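A related gotcha: if the page pulls in a separate print stylesheet, Chrome renders PDFs with the print media type by default, so screen-only CSS is dropped. A minimal sketch, assuming a recent Puppeteer (on versions before 1.20 the method is named page.emulateMedia):

await page.emulateMediaType('screen'); // render with screen CSS instead of print CSS
await page.pdf({path: 'hn.pdf', format: 'A4', printBackground: true});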
Related
I am trying to scrape a website, but I don't get some of the elements because they are created dynamically.
I'm using cheerio in Node.js, and my code is below.
var request = require('request');
var cheerio = require('cheerio');
var url = "http://www.bdtong.co.kr/index.php?c_category=C02";

request(url, function (err, res, html) {
  var $ = cheerio.load(html);
  $('.listMain > li').each(function () {
    console.log($(this).find('a').attr('href'));
  });
});
This code returns an empty result, because when the page first loads, the <ul id="store_list" class="listMain"> is empty; the content has not been appended yet.
How can I get these elements using Node.js? How can I scrape pages with dynamic content?
Here you go:
var phantom = require('phantom');

phantom.create(function (ph) {
  ph.createPage(function (page) {
    var url = "http://www.bdtong.co.kr/index.php?c_category=C02";
    page.open(url, function () {
      page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function () {
        page.evaluate(function () {
          $('.listMain > li').each(function () {
            console.log($(this).find('a').attr('href'));
          });
        }, function () {
          ph.exit();
        });
      });
    });
  });
});
Check out GoogleChrome/puppeteer
Headless Chrome Node API
It makes scraping pretty trivial. The following example will scrape the headline over at npmjs.com (assuming .npm-expansions remains).
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://www.npmjs.com/');

  const textContent = await page.evaluate(() => {
    return document.querySelector('.npm-expansions').textContent;
  });

  console.log(textContent); /* No Problem Mate */
  await browser.close();
})();
page.evaluate allows inspecting dynamic elements, since the function you pass runs as a script inside the page itself.
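Applied to the question above, a minimal sketch (the URL and selector come from the question; waiting on the selector assumes the list is eventually appended client-side):

const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('http://www.bdtong.co.kr/index.php?c_category=C02');

  // Wait until the dynamically appended list items actually exist.
  await page.waitForSelector('.listMain > li a');

  const hrefs = await page.evaluate(() =>
    Array.from(document.querySelectorAll('.listMain > li a'), a => a.href)
  );

  console.log(hrefs);
  await browser.close();
})();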
Use the npm module x-ray with a pluggable web driver, x-ray-phantom.
There are examples on the pages above, but here's how to do dynamic scraping:
var phantom = require('x-ray-phantom');
var Xray = require('x-ray');

var x = Xray()
  .driver(phantom());

// assert and done here come from a test harness (e.g. Mocha);
// replace them with your own handling outside of a test.
x('http://google.com', 'title')(function (err, str) {
  if (err) return done(err);
  assert.equal('Google', str);
  done();
});
Answering this as a canonical, an alternative to Puppeteer for scraping dynamic sites which is also well-supported as of 2023 is Playwright. Here's a simple example:
const playwright = require("playwright"); // ^1.28.1

let browser;
(async () => {
  browser = await playwright.chromium.launch();
  const page = await browser.newPage();
  await page.goto("https://example.com");
  const text = await page.locator('h1:text("Example")').textContent();
  console.log(text); // => Example Domain
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
The easiest and most reliable solution is to use Puppeteer, as described in https://pusher.com/tutorials/web-scraper-node, which covers both static and dynamic scraping.
The only change I made was to the timeouts: in Browser.js, TimeoutSettings.js, and Launcher.js, change 300000 to 3000000.
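Patching the library source shouldn't be necessary, though; Puppeteer exposes the same timeouts through its API. A sketch, assuming a Puppeteer version recent enough to have setDefaultTimeout:

// Raise the defaults for this page instead of editing Puppeteer's files.
page.setDefaultNavigationTimeout(3000000);
page.setDefaultTimeout(3000000); // also applies to waitForSelector etc.

// Timeouts can also be overridden per call:
await page.goto(url, { timeout: 3000000, waitUntil: 'networkidle2' });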
I'm having trouble replacing hyperlinks with regular text on a website I scraped with Puppeteer.
I'm able to get the HTML file, but when it renders, the stylesheet doesn't work and the hyperlinks lead to errors.
Can anybody help me with this, please?
const puppeteer = require('puppeteer');
const fs = require('fs');

const scrapeWikipedia = async () => {
  const browser = await puppeteer.launch({
    headless: false
    // defaultViewport: null
  });
  const page = await browser.newPage();

  const wikiUrl = 'https://en.wikipedia.org/wiki/Groundhog_Day_(film)';
  await page.goto(wikiUrl, {waitUntil: 'networkidle2'});

  // replace hyperlinks
  await page.evaluate(_ => {
    document.querySelectorAll('a[href^="javascript"]')
      .forEach(a => {
        a.href = '#';
      });
  });

  // get the HTML of the page
  const html = await page.content();
  fs.writeFileSync("./outputHTML/index.html", html);

  await browser.close();
};
scrapeWikipedia();
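For the two symptoms described, a sketch of one approach (the <base> trick and replaceWith are plain DOM APIs; nothing here is Wikipedia-specific, and whether it covers every broken resource on the page is an assumption):

await page.evaluate(() => {
  // Give the snapshot a <base> so relative stylesheet/image URLs
  // still resolve against the live site when opened locally.
  const base = document.createElement('base');
  base.href = location.origin + '/';
  document.head.prepend(base);

  // Replace each hyperlink with its plain text so nothing is clickable.
  document.querySelectorAll('a').forEach(a => {
    a.replaceWith(document.createTextNode(a.textContent));
  });
});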
I have tried multiple examples to get the Splitwise login working, but I can't.
I'm quite new to Puppeteer, but a login felt like a simple use case for understanding it.
const puppeteer = require('puppeteer');
const screenshot = 'login.png';

(async () => {
  const browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  await page.goto("https://www.splitwise.com/login", {
    waitUntil: 'networkidle2'
  });

  await page.type('#user_session_email', 'atest');
  await page.type('#user_session_password', 'test');
  await page.click('[name="commit"]');
  await page.waitForNavigation();

  await page.screenshot({path: screenshot});
  await browser.close();
  console.log('See screenshot: ' + screenshot);
})();
Unfortunately, the page has two forms with identical ids (but different classes) and these forms have inputs with identical ids as well. You just need more specific selectors:
await page.type('form.form-stacked #user_session_email', 'atest')
await page.type('form.form-stacked #user_session_password', 'test')
await page.click('form.form-stacked [name="commit"]')
This does not seem to be a Puppeteer issue.
It seems that JavaScript code on the page is actively blocking the triggered events somehow.
Are you able to set these values using regular JavaScript in the console?
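If the events are being swallowed, one workaround to try (a sketch; whether the page's scripts listen for these events is an assumption) is to set the values from inside the page and dispatch the events yourself:

await page.evaluate(() => {
  const email = document.querySelector('#user_session_email');
  email.value = 'atest';
  // Many client-side frameworks only pick up changes announced via events.
  email.dispatchEvent(new Event('input', { bubbles: true }));
  email.dispatchEvent(new Event('change', { bubbles: true }));
});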
I have a multipage form and am using Puppeteer to test it. The problem is that when I click the next button, the click registers, but the form doesn't move to its next page. I have tried waitForNavigation, but it throws a timeout error, and my attempts to address the timeout haven't succeeded.
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.setViewport({ width: 1920, height: 1080 });
  await page.goto('https://example.com/form/');
  //await page.waitForNavigation({timeout: 10000, waitUntil: "load"}); // The promise resolves after navigation has finished

  await page.click('#next_button_1'); // Clicking the link will indirectly cause a navigation
  await page.setDefaultNavigationTimeout(0);
  await page.waitForNavigation();

  await page.screenshot({path: 'example.png', fullPage: true});
  // other actions...
  await browser.close();
})();
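A likely cause of the timeout (a sketch, assuming the button triggers a real navigation rather than swapping the form's DOM in place): waitForNavigation is only started after the click resolves, so a fast navigation can finish before anyone is listening. Starting both together avoids the race:

await Promise.all([
  page.waitForNavigation({ waitUntil: 'load' }),
  page.click('#next_button_1'), // triggers the navigation
]);

// If the form only swaps DOM in place, wait for the next step's content
// instead of a navigation (the selector here is hypothetical):
// await page.waitForSelector('#next_button_2');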
In the following example, how do I wait for the popup window to finish loading?
After clicking the Google icon you get a popup window to log in to Gmail, but when I try
to interact with the second page it is undefined (as I don't know how to wait for it to fully load).
Any advice?
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  await page.goto("https://www.example.com/signin");
  await page.waitForSelector(".Icon-google");
  await page.click(".Icon-google");

  const pages = await browser.pages();
  console.log(pages[2].url());
})();
You can wait for a new target to be created:
const browser = await puppeteer.launch({headless: false});
const page = await browser.newPage();
await page.goto("https://app.testim.io/#/signin");
await page.waitForSelector(".Icon-google");

// Begin listening for the new target before the click that spawns it.
const nav = new Promise(res => browser.once('targetcreated', res));
await page.click(".Icon-google");
await nav;

const pages = await browser.pages();
console.log(pages.length); // the number of pages increases!
console.log(pages.map(page => page.url()));
P.S. First I tried page.waitForNavigation(), but it didn't work, probably because it's a popup.
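Alternatively, recent Puppeteer versions emit a popup event on the page itself, which avoids listening at the browser level: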
const [newPage] = await Promise.all([
  new Promise(resolve => page.once('popup', resolve)),
  page.click('something.that-will-open-the-popup'),
]);
await newPage.waitForSelector('.page-is-loaded');