I am trying to get some historical stock data from here:
https://www1.nseindia.com/products/content/equities/equities/eq_security.htm
I am using puppeteer and this is what I have tried:
import puppeteer from 'puppeteer';
// Fills in the NSE security-wise form through the UI, waits for the
// "#historicalData" element and saves a screenshot of the page.
(async () => {
  // Launch a visible (non-headless) browser.
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.goto(
    'https://www1.nseindia.com/products/content/equities/equities/eq_security.htm'
  );

  // Click the symbol box, then type the ticker through the keyboard.
  await page.click('#symbol');
  await page.keyboard.type('SONACOMS');

  // Make sure the "get" control exists before clicking it.
  const getButtonSelector = '#get';
  await page.waitForSelector(getButtonSelector);
  await page.click(getButtonSelector);

  // Wait until the results element is in the DOM, then capture the page.
  await page.waitForSelector('#historicalData');
  await page.screenshot({ path: 'nse.png' });

  await browser.close();
})();
The input gets filled in correctly, but the click does not seem to
work. The code hangs forever.
To debug, I tried the following from the developer console:
// Developer-console check: write the ticker straight into the "#symbol"
// element's value, then invoke click() on the "#get" element.
document.querySelector('#symbol').value = 'SONACOMS';
document.querySelector('#get').click()
This works correctly. So I am not sure what I am missing in the puppeteer code.
The site is pretty wonky and I'm not sure what's causing the hang, but should be scrapable by bypassing the DOM and hitting the search URL directly:
const puppeteer = require("puppeteer"); // ^19.0.0
// Scrapes the NSE historical-data table by requesting the search endpoint
// from inside the page (so the request is same-origin and carries the page's
// cookies) instead of driving the form. Logs the table rows and saves a
// screenshot of the table to "nse.png".
let browser; // Declared outside the IIFE so the final cleanup can reach it.
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  const ua =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36";
  await page.setUserAgent(ua);
  await page.goto(
    "https://www1.nseindia.com/products/content/equities/equities/eq_security.htm",
    {waitUntil: "domcontentloaded"}
  );

  const symbol = "SONACOMS";
  // Build the query with URLSearchParams so the symbol is percent-encoded;
  // a symbol such as "M&M" would otherwise corrupt the query string.
  const searchUrl = new URL(
    "https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp"
  );
  searchUrl.search = new URLSearchParams({
    symbol,
    segmentLink: "3",
    symbolCount: "1",
    series: "ALL",
    dateRange: "day",
    fromDate: "",
    toDate: "",
    dataType: "PRICEVOLUMEDELIVERABLE",
  }).toString();

  // Pass the URL as an argument rather than splicing it into evaluated
  // source text, and fail loudly on an HTTP error instead of rendering an
  // error page and failing later on the missing table.
  await page.evaluate(async url => {
    const res = await fetch(url);
    if (!res.ok) {
      throw new Error(`Search request failed: ${res.status} ${res.statusText}`);
    }
    document.body.innerHTML = await res.text();
  }, searchUrl.href);

  // Collect the trimmed text of every header/data cell, one array per row.
  const data = await page.$eval("table", el =>
    [...el.querySelectorAll("tr")].map(row =>
      [...row.querySelectorAll("th, td")].map(cell =>
        cell.textContent.trim()
      )
    )
  );
  console.table(data);

  const table = await page.$("table");
  await table.screenshot({path: "nse.png"});
})()
  .catch(err => console.error(err))
  // browser is still undefined if launch() itself failed.
  .finally(() => browser?.close());
Related
This is code
// Uploads a PDF to Google Translate's document tab, starts the translation
// and clicks the download button once it becomes visible.
const launchArgs = [
  "--no-sandbox",
  "--disable-setuid-sandbox",
  "--start-maximized",
  "--ignore-certificate-errors",
];
const browser = await puppeteer.launch({
  headless: false,
  timeout: 0,
  defaultViewport: null,
  args: launchArgs,
  ignoreDefaultArgs: ["--enable-automation"],
});
const page = await browser.newPage();
await page.setUserAgent(
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
);

// Allow downloads and direct them to a fixed folder through a CDP session.
const cdpSession = await page.target().createCDPSession();
await cdpSession.send("Page.setDownloadBehavior", {
  behavior: "allow",
  downloadPath: "D:\\Download",
});

// Open the document-translation page (English -> zh-CN).
await page.goto(
  "https://translate.google.com.hk/?hl=zh-CN&sourceid=cnhp&sl=en&tl=zh-CN&op=docs",
  { waitUntil: "networkidle2" }
);

// Upload the PDF document: start waiting for the file chooser and click the
// label in the same Promise.all.
const [fileChooser] = await Promise.all([
  page.waitForFileChooser(),
  page.click("label"),
]);
await fileChooser.accept(["D:\\test.pdf"]);

// Click the translate button from inside the page.
const translateButtonSelector = "div[jsname='itaskb'] > div > button";
const translateButton = await page.waitForSelector(translateButtonSelector);
await translateButton.evaluate((el) => el.click());

// Click the download button; wait without a time limit until it is visible.
const downloadButtonSelector = "div[jsname='itaskb'] > button";
const downloadButton = await page.waitForSelector(downloadButtonSelector, {
  visible: true,
  timeout: 0,
});
await downloadButton.evaluate((el) => el.click());
The whole process is the same as when I do it manually. But the downloaded document is not translated into zh-CN; it is identical to the uploaded document, which is in en.
What happened? How do I proceed to get the translation I want?
I am trying to use puppeteer to login to the nike site but I get an error likely due to anti-bot. I've tried some things to avoid being detected but did not have any luck. Here is my code:
//const puppeteer = require('puppeteer');
const puppeteer = require("puppeteer-extra");
const pluginStealth = require("puppeteer-extra-plugin-stealth");
puppeteer.use(pluginStealth());
/**
 * Pause helper for use with async/await.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Picks a pseudo-random integer between min and max, both inclusive.
 * @param {number} min - Lower bound.
 * @param {number} max - Upper bound.
 * @returns {number} Integer in [min, max] when both bounds are integers.
 */
const randomDelay = (min, max) => {
  const span = max - min + 1;
  return Math.floor(Math.random() * span + min);
};
// Opens the Nike membership page in a locally installed Chrome and submits
// the login form, typing with randomised pauses between steps.
(async () => {
  await sleep(1000);

  const browser = await puppeteer.launch({
    executablePath: 'C:/Program Files/Google/Chrome/Application/chrome.exe',
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-web-security'],
  });
  const page = await browser.newPage();

  // Present a fixed user agent and language preference.
  await page.setUserAgent(
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
  );
  await page.setExtraHTTPHeaders({
    'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
  });

  await page.goto('https://www.nike.com/us/en_us/e/nike-plus-membership', {
    waitUntil: 'networkidle0',
  });

  // Wait for the e-mail field, pause briefly, then type the address.
  const emailSelector = '.nike-unite-text-input.emailAddress input';
  const passwordSelector = '.nike-unite-text-input.password input';
  await page.waitFor(emailSelector);
  await page.waitFor(randomDelay(300, 600));
  await page.type(emailSelector, 'xyz#gmail.com', {
    delay: randomDelay(200, 300),
  });

  // Pause again before typing the password.
  await page.waitFor(randomDelay(300, 600));
  await page.type(passwordSelector, 'XYZDEFEWD!"', {
    delay: randomDelay(200, 300),
  });

  // One last pause, then click the login button.
  const submitSelector = '.nike-unite-submit-button.loginSubmit input';
  await page.waitFor(randomDelay(200, 500));
  await page.click(submitSelector);
})();
Is there any way to identify what the website is using to detect that I am using puppeteer?
There may not be a foolproof way to avoid bot detection, but here are some things you can try:
Try proxying your IP through multiple countries.
Try adding random intervals between your network calls.
Use random user agents instead of a fixed one, and also vary the viewport size.
Has anyone had luck scraping wish.com?
I tried, but it is seriously harder than I expected.
On the login page it always gives me a timeout error.
I need help.
const puppeteer = require('puppeteer');
// Logs in to wish.com with placeholder credentials, takes a screenshot
// before and after submitting the form, then opens a search results page.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36');
    await page.setViewport({ width: 960, height: 768 });
    await page.goto('https://www.wish.com', { waitUntil: 'load', timeout: 100000 });

    // Fill in the e-mail field; timeout: 0 disables the wait timeout.
    const emailInput = '[placeholder="Email Address"][autocomplete="email"]';
    const emailWords = 'test_account#gmail.com';
    await page.waitForSelector(emailInput, { timeout: 0 });
    await page.focus(emailInput);
    await page.keyboard.type(emailWords);

    // Fill in the password field the same way.
    const passwordInput = 'input[placeholder="Password"][autocomplete="current-password"]';
    const passwordWords = 'testpassword';
    await page.waitForSelector(passwordInput, { timeout: 0 });
    await page.focus(passwordInput);
    await page.keyboard.type(passwordWords);

    await page.screenshot({ fullPage: true, path: 'wish.png' });

    // Start waiting for the navigation before pressing Enter so the
    // navigation triggered by the submit cannot be missed.
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'networkidle0', timeout: 1000000 }),
      page.keyboard.press('Enter'),
    ]);

    await page.screenshot({ fullPage: true, path: 'wish2.png' });

    // The terminating semicolon matters: the original line had none and was
    // followed by a line starting with "(", so the result of page.goto(...)
    // was called as a function and threw a TypeError.
    await page.goto('https://www.wish.com/search/nike', { waitUntil: 'load', timeout: 100000 });
  } finally {
    // Always release the browser, even when a step above throws.
    await browser.close();
  }
})();
Yes, that can happen on a slow network connection. I solved it by using this code:
await page.waitFor(10000) waits 10 seconds to give the network time to connect. You can pass any duration (in milliseconds) to the waitFor function.
Hope this helps you a lot.
Regards
I have a website login form I'm trying to log in to. I was able to get the username and password typed into the input fields. Then I wanted to wait before submitting the form, but when I call page.waitFor(), it seems to wipe out the data in the input fields. Can someone explain why, or show a workaround?
// Attempts to log in by writing the credentials straight into the inputs'
// `value` properties and clicking the submit button from page context,
// taking a full-page screenshot before and after.
// NOTE: the snippet is cut off here; the end of the `try` block and of the
// function is not shown.
async function Scraper(){
try{
const browser = await puppeteer.launch();
const page = await browser.newPage();
// NOTE(review): this call is not awaited, unlike every other page call
// below, so goto() may start before the user agent override is applied.
page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36');
await page.goto('https://onlyfans.com/');
// Wait for the e-mail input to be present before touching the form.
await page.waitFor('input[name=email]');
console.log("starting to do this");
// The values are assigned directly on the elements inside the page; no
// keyboard input is simulated.
// NOTE(review): presumably the page never observes these changes, which
// would explain the fields appearing empty later; confirm.
await page.$eval('input[name=email]', el => el.value = 'xxx#gmail.com');
await page.$eval('input[name=password]', el => el.value = 'xxx');
let selector = 'button[type="submit"]';
// Screenshot taken right after the values were set.
await page.screenshot({
path: 'yoursite.png',
fullPage: true
});
// Fixed 5000 ms pause before submitting.
await page.waitFor(5000);
// Click the submit button via the DOM click() method inside the page.
await page.evaluate((selector) => document.querySelector(selector).click(), selector);
// Screenshot taken after the click.
await page.screenshot({
path: 'yoursite4.png',
fullPage: true});
console.log("done");
Here are the differences between the two images:
Looks like there is a delay till the login button gets enabled. The following worked for me:
// Selectors of the three login-form controls.
const emailField = 'input[name=email]';
const passwordField = 'input[name=password]';
const submitButton = 'button[type="submit"]';

// Load the page and wait until the network has gone idle.
await page.goto('https://onlyfans.com/', { waitUntil: "networkidle0" });

// Make sure every control exists before interacting with the form.
await page.waitForSelector(emailField);
await page.waitForSelector(passwordField);
await page.waitForSelector(submitButton);

// Type the credentials key by key (200 ms between keystrokes), then submit.
await page.type(emailField, 'xxx#gmail.com', { delay: 200 });
await page.type(passwordField, 'xxx', { delay: 200 });
await page.click(submitButton);
Using Puppeteer, how can I get the DOMContentLoaded and Load times? It would be great if someone could explain how to access the DevTools Network object from Puppeteer.
Probably you are asking about window.performance.timing, here is a simple example how to get this data in Puppeteer:
const puppeteer = require('puppeteer');
// Loads a page and prints its raw window.performance.timing record.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://en.wikipedia.org');
    // The timing object is turned into a JSON string inside the page and
    // parsed back into a plain object on the Node side.
    const performanceTiming = JSON.parse(
      await page.evaluate(() => JSON.stringify(window.performance.timing))
    );
    console.log(performanceTiming);
  } finally {
    // Close the browser even if navigation or evaluation throws, so no
    // browser process is left running.
    await browser.close();
  }
})();
But results are quite raw and not meaningful. You should calculate the difference between each value and navigationStart, here is a full example of how to do it (code comes from this article):
const puppeteer = require('puppeteer');
/**
 * Converts selected timing entries into offsets from navigationStart.
 * @param {object} timing - Record holding `navigationStart` and the requested entries.
 * @param {...string} dataNames - Names of the entries to extract.
 * @returns {object} Map of each requested name to `timing[name] - timing.navigationStart`.
 */
const extractDataFromPerformanceTiming = (timing, ...dataNames) => {
  const { navigationStart: origin } = timing;
  const relative = {};
  for (const key of dataNames) {
    relative[key] = timing[key] - origin;
  }
  return relative;
};
/**
 * Navigates the given page to Wikipedia and reports two load milestones
 * as offsets from navigationStart.
 * @param {object} page - Puppeteer page to navigate.
 * @returns {Promise<object>} Offsets for domContentLoadedEventEnd and loadEventEnd.
 */
async function testPage(page) {
  await page.goto('https://en.wikipedia.org');

  // Serialise the timing record inside the page, then parse it in Node.
  const serializedTiming = await page.evaluate(() =>
    JSON.stringify(window.performance.timing)
  );
  const performanceTiming = JSON.parse(serializedTiming);

  const metricNames = ['domContentLoadedEventEnd', 'loadEventEnd'];
  return extractDataFromPerformanceTiming(performanceTiming, ...metricNames);
}
// Launches a browser, prints the timing offsets measured by testPage and
// shuts the browser down.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    console.log(await testPage(page));
  } finally {
    // Close the browser even if testPage throws, so no browser process is
    // left running.
    await browser.close();
  }
})();