Scrape from arrays of URLs - Puppeteer

I have a list of URLs from http://books.toscrape.com:
// Product pages to scrape; each entry holds a scheme-less URL under the "Url" key.
let objArray = [
  { "Url": "books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html" },
  { "Url": "books.toscrape.com/catalogue/tipping-the-velvet_999/index.html" },
  { "Url": "books.toscrape.com/catalogue/soumission_998/index.html" },
];
As you can see, all of the links can be scraped in the same way.
I want to scrape the title, price and stock availability from each of the links above.
I also tried to loop through all of the URLs like this:
// Question's attempt: for every entry of objArray it launches a browser, loads a page,
// parses the HTML with cheerio and writes the extracted fields to files.json.
// NOTE(review): the snippet is truncated — the try block has no catch/finally, and the
// async arrow and the for loop are never closed.
for (var i = 0; i < objArray.length; ++i) {
(async() => {
let browser;
try {
browser = await puppeteer.launch({
headless: false,
});
const page = await browser.newPage();
// NOTE(review): `url` is not defined anywhere in this snippet; objArray[i].Url is never read.
await page.goto(url);
const content = await page.content();
const $ = cheerio.load(content);
// Declared inside the loop body, so every iteration starts from an empty array.
const Product_details = []
// NOTE(review): the selector arguments below are not quoted, so they are not string selectors.
const instock = $(div[class="col-sm-6 product_main"] p[class="instockavailability"]).text();
const title = $(div[class="col-sm-6 product_main"] ).text();
const price = $(div[class="col-sm-6 product_main"] p[price_color]).text()
Product_details.push({
Stock: instock,
Title: title,
Price: price,
});
// Each iteration rewrites files.json with the single entry collected above.
fs.writeFileSync("files.json", JSON.stringify(Product_details), "utf8")
console.log(Product_details)
}
The code above is not working. I want to get the product details, such as titles and prices.

You can separate each page logic into a function and try something like this:
/**
 * Crawls the books.toscrape.com catalogue: loads the start page, follows the
 * "next" link until there is none, collects the books of every page through
 * getData() and writes the accumulated list to Details.json.
 */
(async () => {
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: false,
    });
    const page = await browser.newPage();
    const url = "http://books.toscrape.com/";
    const Product_details = [];
    await page.goto(url);
    // getData() appends to Product_details itself and returns nothing, so its
    // result must not be pushed (that added an `undefined` entry per page).
    await getData(page, Product_details);
    while (await page.$('li[class="next"] a')) {
      // Start waiting for the navigation before clicking so it cannot be missed.
      await Promise.all([
        page.waitForNavigation(),
        page.click('li[class="next"] a'),
      ]);
      await getData(page, Product_details);
    }
    fs.writeFileSync("Details.json", JSON.stringify(Product_details), "utf8");
  } catch (e) {
    console.log('Error-> ', e);
  } finally {
    // Close on success as well as on failure; `browser` is still undefined
    // when launch() itself threw.
    if (browser) {
      await browser.close();
    }
  }
})();
/**
 * Extracts the title and price of every book listed on the currently loaded page.
 *
 * @param {object} page - Puppeteer page whose URL is logged and whose HTML is parsed.
 * @param {Array<{Title: string, Price: string}>} details - Array the results are
 *   appended to (mutated in place).
 * @returns {Promise<void>} Resolves with no value; results are delivered via `details`.
 */
async function getData(page, details) {
  console.log(page.url());
  const html = await page.content();
  const $ = cheerio.load(html);
  // The class list has to be the value of the class attribute; a bare
  // [col-xs-6 col-sm-4 ...] is not a valid attribute selector.
  const statsTable = $('li[class="col-xs-6 col-sm-4 col-md-3 col-lg-3"]');
  statsTable.each(function () {
    // `this` is the current <li>, so a regular function (not an arrow) is required.
    const title = $(this).find('h3').text();
    const Price = $(this).find('p[class="price_color"]').text();
    details.push({
      Title: title,
      Price: Price,
    });
  });
}
Update: answer for the latest revision of the question:
// Product pages to scrape; the scheme is prepended when navigating.
const objArray = [
  { Url: 'books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html' },
  { Url: 'books.toscrape.com/catalogue/tipping-the-velvet_999/index.html' },
  { Url: 'books.toscrape.com/catalogue/soumission_998/index.html' },
];

/**
 * Visits every product page listed in objArray, extracts stock, title and
 * price with cheerio, logs the collected list and writes it to files.json.
 */
(async () => {
  let browser;
  try {
    // One browser and one tab serve all URLs; launching a browser per URL is
    // slow and left a window open whenever a page failed mid-loop.
    browser = await puppeteer.launch({
      headless: false,
    });
    const page = await browser.newPage();
    const Product_details = [];
    const main = 'div[class="col-sm-6 product_main"]';
    for (const { Url } of objArray) {
      await page.goto(`http://${Url}`);
      const content = await page.content();
      const $ = cheerio.load(content);
      // NOTE(review): matches the stock paragraph by its two classes (instock,
      // availability); the fused value "instockavailability" cannot match a
      // space-separated class list — confirm against the live markup.
      const instock = $(`${main} p.instock.availability`).text().trim();
      const title = $(`${main} h1`).text().trim();
      // trim must be *called*: a bare `.trim` stored the function itself, which
      // JSON.stringify then silently dropped from the output.
      const price = $(`${main} p[class="price_color"]`).text().trim();
      Product_details.push({
        Stock: instock,
        Title: title,
        Price: price,
      });
    }
    console.log(Product_details);
    fs.writeFileSync('files.json', JSON.stringify(Product_details), 'utf8');
  } catch (e) {
    console.log('Error-> ', e);
  } finally {
    // Single close point; `browser` is undefined when launch() itself failed.
    if (browser) {
      await browser.close();
    }
  }
})();

Related

Why am I missing data when using a proxy with puppeteer?

When using a proxy for my puppeteer request to scrape a page, some data is missing such as price but the rest is there. When I remove the proxy, everything loads correctly.
/**
 * Loads `url` through the authenticated proxy, extracts id, name, image and
 * price of every product tile with cheerio and logs the resulting list.
 *
 * @returns {Promise<Array<{id: string, name: string, image: string, price: string}>>}
 *   The extracted products (also written to the console).
 */
const getData = async () => {
  const browser = await puppeteer.launch({
    headless: true,
    args: ["--no-sandbox", "--disable-setuid-sandbox", "--ignore-certificate-errors", "--proxy-server=myproxy"],
  });
  try {
    const page = await browser.newPage();
    await page.authenticate({ username: "username", password: "password" });
    await page.goto(url, {
      timeout: 300000,
      waitUntil: "load",
    });
    const html = await page.evaluate(() => document.body.innerHTML);
    const $ = cheerio.load(html);
    const products = [];
    $("[data-automation=product-results] > div").each((index, el) => {
      // Wrap the raw node so cheerio's traversal helpers are available;
      // `product` was referenced below without ever being defined.
      const product = $(el);
      const id = product.attr("data-product-id");
      const name = product.find("[data-automation=name]").text();
      const image = product.find("[data-automation=image]").attr("src");
      const price = product.find("[data-automation=current-price]").text();
      products.push({ id, name, image, price });
    });
    console.log(products);
    return products;
  } finally {
    // Release the browser even when navigation or parsing throws.
    await browser.close();
  }
};
I have tried waitUntil: 'domcontentloaded' (same results); waitUntil: 'networkidle0' and waitUntil: 'networkidle2' both time out (after 5 minutes).
I don't quite understand why I am able to get all the data without using a proxy and only get partial data using a proxy.

Puppeteer function not returning array

Hi guys, can you please point out my mistake in this code?
console.log(URLS) is printing undefined.
Thanks in advance.
const puppeteer = require('puppeteer');
/**
 * Question's version: opens the page, waits for `a.review.exclick` links and
 * collects their href values inside page.evaluate().
 * NOTE(review): the function body contains no return statement of its own, so
 * awaiting GetUrls() yields undefined.
 */
async function GetUrls() {
const browser = await puppeteer.launch( { headless: false,
executablePath: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe' })
const page = await browser.newPage();
await page.goto("https://some page");
await page.waitForSelector('a.review.exclick');
// The value returned by the callback is what page.evaluate() resolves to; it is
// stored in `urls` but never handed back to the caller.
let urls = await page.evaluate(() => {
let results = [];
let items = document.querySelectorAll('a.review.exclick');
items.forEach((item) => {
results.push({
url: item.getAttribute('href'),
});
});
return results;
// NOTE(review): unreachable — it follows `return`, and it sits inside the
// page.evaluate() callback rather than in GetUrls() itself.
browser.close();
});
}
// Calls GetUrls(), logs whatever it resolved to, then ends the process with exit code 1.
(async () => {
let URLS = await GetUrls();
console.log(URLS);
process.exit(1);
})();
Here is a list of the problems:
- You don't have a return statement in your GetUrls() function.
- You close the browser after a return statement AND inside the page.evaluate() method.
Keep in mind that anything that is executed within the page.evaluate() will relate to the browser context. To quickly test this, add a console.log("test") before let results = []; and you will notice that nothing appears in your Node.js console, it will appear in your browser console instead.
Therefore, the browser variable is visible within the GetUrls() function but NOT visible within the page.evaluate() method.
Here is the corrected code sample:
const puppeteer = require('puppeteer');
/**
 * Opens the target page and collects the href of every `a.review.exclick` link.
 *
 * @returns {Promise<Array<{url: string|null}>>} One `{ url }` object per matching link.
 * @throws Propagates any Puppeteer error (navigation failure, selector timeout);
 *   the browser is closed before the error reaches the caller.
 */
async function GetUrls() {
  const browser = await puppeteer.launch({
    headless: false,
    executablePath: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
  });
  try {
    const page = await browser.newPage();
    await page.goto("https://some page");
    await page.waitForSelector('a.review.exclick');
    // Runs in the browser context: only serialisable data can be returned from here.
    const urls = await page.evaluate(() => {
      const items = document.querySelectorAll('a.review.exclick');
      return Array.from(items, (item) => ({ url: item.getAttribute('href') }));
    });
    return urls;
  } finally {
    // Close in every case so a failed goto/waitForSelector does not leak a Chrome process.
    await browser.close();
  }
}
// Entry point: prints the collected URLs and ends the process explicitly.
(async () => {
  try {
    const URLS = await GetUrls();
    console.log(URLS);
    // Exit code 0 signals success; 1 is reserved for the failure path below.
    process.exit(0);
  } catch (err) {
    console.error(err);
    process.exit(1);
  }
})();

How to build a LinkedIn bot using Puppeteer

I want to build an automatic LinkedIn connection-request sender.
Tasks to do:
1. Open linkedin.com.
2. Log in to LinkedIn using the login details.
3. Search for people by keyword.
4. Send a connection request with a note.
I am unable t
const select = require('puppeteer-select');
/** CSS selectors of the login form controls. */
const FORM = {
  USERNAME_SELECTOR: '#username',
  PASSWORD_SELECTOR: '#password',
  BUTTON_SELECTOR: '.btn__primary--large.from__button--floating',
};

/** Account used to sign in (placeholder values). */
const CREDENTIALS = {
  USERNAME: 'Username',
  PASSWORD: 'password',
};

/** Search box selector, the keyword typed into it, and a connect-button selector. */
const SEARCH = {
  SEARCH_SELECTOR: '#global-nav-search',
  KEYWORD: '',
  CONNECT: '#ember52',
};
/**
 * Turns an arbitrary string into an XPath concat() expression that evaluates
 * to that string, so text containing apostrophes can be embedded in a query.
 *
 * @param {string} str - Raw text.
 * @returns {string} An XPath expression of the form concat('...', '').
 */
const escapeXpathString = (str) => {
  // Every apostrophe ends the current '...' literal, is emitted as the
  // double-quoted literal "'", and then a new '...' literal is opened.
  const pieces = str.split("'").join(`', "'", '`);
  return "concat('" + pieces + "', '')";
};
/**
 * Clicks the first <a> element whose text contains `text`.
 *
 * @param {object} page - Puppeteer page to search.
 * @param {string} text - Text the link must contain.
 * @throws {Error} "Link not found: <text>" when no link matches.
 */
const clickByText = async (page, text) => {
  const xpath = `//a[contains(text(), ${escapeXpathString(text)})]`;
  const matches = await page.$x(xpath);
  if (matches.length === 0) {
    throw new Error(`Link not found: ${text}`);
  }
  await matches[0].click();
};
// Question's script: signs in to LinkedIn with CREDENTIALS, types SEARCH.KEYWORD into
// the search box, opens the "people" link, takes a screenshot and then tries to click "Send".
// NOTE(review): `puppeteer` is used below, but this snippet only shows a require for
// puppeteer-select — confirm it is required elsewhere.
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://www.linkedin.com/login?trk=guest_homepage-basic_nav-header-signin', {waitUntil: 'networkidle0'});
// Fill in and submit the login form.
await page.click(FORM.USERNAME_SELECTOR);
await page.keyboard.type(CREDENTIALS.USERNAME);
await page.click(FORM.PASSWORD_SELECTOR);
await page.keyboard.type(CREDENTIALS.PASSWORD);
await page.click(FORM.BUTTON_SELECTOR);
await page.waitForNavigation();
// Run the keyword search.
await page.click(SEARCH.SEARCH_SELECTOR);
await page.focus(SEARCH.SEARCH_SELECTOR);
await page.keyboard.type(SEARCH.KEYWORD);
await page.keyboard.press('Enter');
await page.waitForNavigation();
// NOTE(review): clickByText() is async and its promise is not awaited here.
clickByText(page,`people`);
await page.waitForNavigation();
await page.screenshot({path: 'verify16.png', fullPage: true});
console.log("Current page:", page.url());
//from here
// NOTE(review): this is the call the author reports as failing — verify that
// `getSend` is part of puppeteer-select's API.
const invitation = await select(page).getSend('span:contains(Send)');
await invitation.click();
//getting error
await browser.close();
})();```
I cannot click on the Connect button. I also have to add a note and repeat this for all the other connections.
const puppeteer = require('puppeteer');
const select = require('puppeteer-select');
// Selectors for the username field, password field and submit button of the login form.
const FORM = {
  USERNAME_SELECTOR: '#username',
  PASSWORD_SELECTOR: '#password',
  BUTTON_SELECTOR: '.btn__primary--large.from__button--floating',
};

// Placeholder login details typed into the form.
const CREDENTIALS = {
  USERNAME: 'user',
  PASSWORD: 'password',
};

// Search input selector, the keyword to look for, and a connect-button selector.
const SEARCH = {
  SEARCH_SELECTOR: '#global-nav-search',
  KEYWORD: 'keyword',
  CONNECT: '#ember52',
};
/**
 * Wraps `str` in an XPath concat() call so that apostrophes inside the text
 * cannot terminate the string literal early.
 *
 * @param {string} str - Text to embed in an XPath expression.
 * @returns {string} concat('...', '') expression equal to `str`.
 */
const escapeXpathString = (str) => {
  // Literal break: close '...', emit "'" as its own argument, reopen '...'.
  const quoteBreak = `', "'", '`;
  const body = str.replace(/'/g, () => quoteBreak);
  return ['concat(\'', body, '\', \'\')'].join('');
};
/**
 * Finds the links whose text contains `text` and clicks the first one.
 *
 * @param {object} page - Puppeteer page to query.
 * @param {string} text - Text to look for inside <a> elements.
 * @throws {Error} "Link not found: <text>" if the XPath query returns no element.
 */
const clickByText = async (page, text) => {
  const needle = escapeXpathString(text);
  const handles = await page.$x(`//a[contains(text(), ${needle})]`);
  const found = handles.length > 0;
  if (!found) {
    throw new Error(`Link not found: ${text}`);
  }
  const first = handles[0];
  await first.click();
};
/**
 * Signs in to LinkedIn, searches for SEARCH.KEYWORD, opens the "people"
 * results and, for every result that offers a Connect button, sends a
 * connection request with a note. A screenshot is taken per result.
 */
(async () => {
  const NOTE = 'Pardon! buddy i am just testing my bot ~ Manvendra Yadav';
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://www.linkedin.com/login?trk=guest_homepage-basic_nav-header-signin', {waitUntil: 'networkidle0'});
    await page.click(FORM.USERNAME_SELECTOR);
    await page.keyboard.type(CREDENTIALS.USERNAME);
    await page.click(FORM.PASSWORD_SELECTOR);
    await page.keyboard.type(CREDENTIALS.PASSWORD);
    // Register the navigation wait before triggering it; waiting afterwards can
    // miss a navigation that already finished and then time out.
    await Promise.all([
      page.waitForNavigation(),
      page.click(FORM.BUTTON_SELECTOR),
    ]);
    await page.click(SEARCH.SEARCH_SELECTOR);
    await page.focus(SEARCH.SEARCH_SELECTOR);
    await page.keyboard.type(SEARCH.KEYWORD);
    await Promise.all([
      page.waitForNavigation(),
      page.keyboard.press('Enter'),
    ]);
    // clickByText is async: awaiting it surfaces "Link not found" here instead of
    // leaving an unhandled rejection behind.
    await Promise.all([
      page.waitForNavigation(),
      clickByText(page, `people`),
    ]);
    await page.screenshot({path: `verifytest.png`, fullPage: true});
    const elements = await page.$$('#main > div > div > div:nth-child(2) > ul > li');
    // loop through the search results
    for (let i = 0; i < elements.length; i++) {
      // The leading "." scopes the XPath to this result. NOTE(review): in older
      // Puppeteer versions a bare "//button" is evaluated against the whole
      // document, so every iteration would pick the first Connect button on the page.
      const [button] = await elements[i].$x(".//button[contains(., 'Connect')]");
      if (!button) {
        // Nothing to connect to in this result; do not type a note into the page.
        continue;
      }
      await button.click();
      const [buttonNote] = await page.$x("//button[contains(., 'Add a note')]");
      if (buttonNote) {
        await buttonNote.click();
        await page.keyboard.type(NOTE);
      }
      await page.screenshot({path: `verify${i}.png`, fullPage: true});
      const [buttonSendNote] = await page.$x("//button[contains(., 'Send')]");
      if (buttonSendNote) {
        await buttonSendNote.click();
      }
    }
  } finally {
    // Always release the browser, including when a step above throws.
    await browser.close();
  }
})();

How to get text from an XPath in Puppeteer (Node.js)

I need to get the text from a span tag and verify whether it equals "check".
How can I achieve this in Puppeteer?
Below is the code I've written so far; could anyone please help me figure this out?
const puppeteer = require("puppeteer");
/**
 * Logs in to Flipkart, searches for "iPhone 11", opens the first result and
 * reports whether the delivery span contains "Check" or "Change".
 * NOTE(review): `username` and `password` are not defined in this snippet and
 * must be provided by the surrounding code.
 */
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    // "slowMo": 50,
    args: ["--start-fullscreen"],
    defaultViewport: null,
  });
  try {
    //Page
    const page2 = await browser.newPage();
    await page2.goto("https://www.flipkart.com");

    // XPath addresses attributes with "@" ("#class" is not valid XPath).
    // waitForXPath resolves to the matched element once it exists, which
    // replaces the fixed 2-second sleeps between steps.
    const find = (xpath) => page2.waitForXPath(xpath);

    const usernameInput = await find("//input[@class='_2zrpKA _1dBPDZ']");
    await usernameInput.type(username);

    const passwordInput = await find("//input[@type='password']");
    await passwordInput.type(password);

    const loginButton = await find("//button[@class='_2AkmmA _1LctnI _7UHT_c']");
    await loginButton.click();

    const searchInput = await find("//input[@class='LM6RPg']");
    await searchInput.type("iPhone 11");

    const searchButton = await find("//button[@class='vh79eN']");
    await searchButton.click();

    const firstResult = await find("//div[@class='col col-7-12']/div");
    await firstResult.click();

    const element = await find('//span[@class="_2aK_gu"]');
    const text = await page2.evaluate((el) => el.textContent, element);
    if (text.includes("Check")) {
      console.log("Check Present");
    }
    if (text.includes("Change")) {
      console.log("Change Present");
    }
  } finally {
    // Close the browser whether the flow succeeded or a wait timed out.
    await browser.close();
  }
})();
//get the xpath of the webelement
// Take the first element handle matching the XPath.
// NOTE(review): the predicate between [] is empty in this snippet — fill in the
// real condition before running it.
const [getXpath] = await page.$x('//div[]');
// Read that element's innerText inside the page context.
const getMsg = await page.evaluate(name => name.innerText, getXpath);
// Print the text to the Node.js console.
console.log(getMsg)
Here is the complete code for getting the text of a div, or any other HTML element, using XPath:
const puppeteer = require("puppeteer");
/**
 * Opens https://twitter.com/elonmusk, waits for the element addressed by the
 * absolute XPath below and logs its innerText.
 *
 * @returns {Promise<void>}
 * @throws Propagates Puppeteer errors (e.g. the XPath wait timing out); the
 *   browser is closed first.
 */
async function scrape () {
  // Kept in one place: the same expression used to be pasted twice.
  const NAME_XPATH = '/html/body/div[1]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/section/div/div/div[1]/div/div/article/div/div/div/div[2]/div[2]/div[1]/div/div/div[1]/div[1]/div/div[1]/a/div/div[1]/span/span';
  const browser = await puppeteer.launch({headless: false});
  try {
    const page = await browser.newPage();
    await page.goto("https://twitter.com/elonmusk", {waitUntil: "networkidle2"});
    // waitForXPath resolves to the matched element handle, so no second $x lookup is needed.
    const el = await page.waitForXPath(NAME_XPATH);
    const names = await page.evaluate(name => name.innerText, el);
    console.log(names);
  } finally {
    // Close even when the wait times out, so no browser window is left behind.
    await browser.close();
  }
}
// Report failures instead of leaving the returned promise unhandled.
scrape().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
You can get the text from the selected element like this:
await page.goto(url, {waitUntil: "networkidle2"});
// XPath selects attributes with "@"; wait until the span exists before querying it.
await page.waitForXPath('//span[@class="_2aK_gu"]');
// $x returns an array of handles; this assumes the first match is the wanted one.
let [element] = await page.$x('//span[@class="_2aK_gu"]');
// textContent is read inside the page and returned to Node.js.
let text = await page.evaluate(element => element.textContent, element);
Note that page.$x returns an array of ElementHandles, so the code here assumes the element you want is the first one. I'd suggest you choose a more specific XPath than a class, as many elements may share it.
For the condition:
// Branch on which label the extracted text contains.
if (text.includes("Check")) {
  // do this
} else if (text.includes("Change")) {
  // do that
}

How can I add HTML elements to the current page? Puppeteer/carlo

I'm trying to add HTML elements to the current page with
page.setContent
but when it reaches:
await page.setContent('<div><h1>hello world<h1></div>')
the page is refreshed and the content of ./index.html is gone.
Is there a way to make these two work in the same window at the same time?
full code:
'use strict'
const path = require('path');
const carlo = require('carlo');
const puppeteer = require('puppeteer-core');
const { getExecutablePath } = require('./utils');
/**
 * Resolves the browser executable path, logs it and launches Puppeteer with it.
 *
 * @returns {Promise<void>} Settles once launchPuppeteer() has finished.
 */
const run = async () => {
  const executablePath = await getExecutablePath({
    // useLocalChromium: true
  });
  console.log('Executable path:', executablePath);
  // Await the launch so its failures reject run() instead of being lost.
  // launchPuppeteer is declared after the run() call below; this still works
  // because it is only read here, after the first await has yielded.
  await launchPuppeteer({ executablePath });
};
run().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
/**
 * Launches a visible browser in app mode pointing at public/index.html, sizes
 * the first page and then sets its content to a small HTML fragment.
 *
 * @param {object} launchOptions - Extra puppeteer.launch() options; spread last,
 *   so its keys override `headless` and `args` given here.
 */
const launchPuppeteer = async launchOptions => {
const test = path.join(__dirname, 'public')
const final = test + '/index.html';
const browser = await puppeteer.launch({
headless: false,
args: [`--app=${final}`, '--window-size=1280,1024'],
...launchOptions
});
// Use the page the browser opened on launch rather than creating a new one.
const [page] = await browser.pages();
await page.setViewport({width: 1280, height: 1024});
// NOTE(review): per the question, this call replaces the document that was loaded
// through --app. The second <h1> in the fragment is an opening tag, not a closing one.
await page.setContent('<div><h1>hello world<h1></div>')
}
Just use this
// Append markup to the document that is already loaded, without replacing it.
await page.evaluate(() => {
  // insertAdjacentHTML parses only the new fragment; `innerHTML +=` would
  // re-serialise and re-parse the whole body, recreating every existing node
  // and dropping their event listeners and state.
  document.body.insertAdjacentHTML('beforeend', '<div>Test</div>');
});
Edit: what about this?