Has anyone had luck scraping wish.com?
I tried, but it is seriously a bit harder than expected.
Starting from the login page, it always gives me a timeout error.
I need help.
const puppeteer = require('puppeteer');
/**
 * Logs in to wish.com with placeholder credentials, saves a screenshot before
 * and after submitting the login form, then opens a search results page.
 *
 * Every statement is terminated explicitly: a line that starts with `(`
 * directly after an unterminated call is parsed as a call on the previous
 * expression's result. The browser is closed in `finally` so it is released
 * even when a step times out.
 */
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36');
    await page.setViewport({ width: 960, height: 768 });
    await page.goto('https://www.wish.com', { waitUntil: 'load', timeout: 100000 });

    // Fill in the e-mail field; `timeout: 0` disables the wait timeout.
    const emailInput = '[placeholder="Email Address"][autocomplete="email"]';
    const emailWords = 'test_account#gmail.com';
    await page.waitForSelector(emailInput, { timeout: 0 });
    await page.focus(emailInput);
    await page.keyboard.type(emailWords);

    // Fill in the password field the same way.
    const passwordInput = 'input[placeholder="Password"][autocomplete="current-password"]';
    const passwordWords = 'testpassword';
    await page.waitForSelector(passwordInput, { timeout: 0 });
    await page.focus(passwordInput);
    await page.keyboard.type(passwordWords);

    await page.screenshot({ fullPage: true, path: 'wish.png' });

    // Submit with Enter and wait for the resulting navigation to settle.
    await Promise.all([
      page.keyboard.press('Enter'),
      page.waitForNavigation({ waitUntil: 'networkidle0', timeout: 1000000 }),
    ]);

    await page.screenshot({ fullPage: true, path: 'wish2.png' });
    await page.goto('https://www.wish.com/search/nike', { waitUntil: 'load', timeout: 100000 });
  } finally {
    await browser.close();
  }
})();
Yes, that can happen on a slow network connection. I solved it by using this code:
await page.waitFor(10000), which waits 10 seconds for the network to connect. You can pass any duration (in milliseconds) to the waitFor function.
I hope this helps.
Regards
Related
I am trying to get some historical stock data from here:
https://www1.nseindia.com/products/content/equities/equities/eq_security.htm
I am using puppeteer and this is what I have tried:
import puppeteer from 'puppeteer';
/**
 * Opens the NSE security-wise data page in a visible browser, types a symbol
 * into the symbol field, clicks the "get" control, waits for the results
 * element to appear, and saves a screenshot.
 */
(async function captureNseHistory() {
  const pageUrl = 'https://www1.nseindia.com/products/content/equities/equities/eq_security.htm';
  const symbolField = '#symbol';
  const submitControl = '#get';
  const resultsPanel = '#historicalData';

  const browser = await puppeteer.launch({headless: false});
  const tab = await browser.newPage();
  await tab.goto(pageUrl);

  // Click the symbol field, then send the symbol as keystrokes.
  await tab.click(symbolField);
  await tab.keyboard.type('SONACOMS');

  // Wait for the submit control to exist before clicking it.
  await tab.waitForSelector(submitControl);
  await tab.click(submitControl);

  // Capture the page once the results element is present.
  await tab.waitForSelector(resultsPanel);
  await tab.screenshot({path: 'nse.png'});
  await browser.close();
})();
The input gets filled correctly, but the click does not seem to
work. The code hangs forever.
To debug I tried following from the developer console:
// Run in the browser's developer console: assign the symbol directly to the
// #symbol element's value, then call click() on the #get element.
document.querySelector('#symbol').value = 'SONACOMS';
document.querySelector('#get').click()
This works correctly. So I am not sure what I am missing in the puppeteer code.
The site is pretty wonky and I'm not sure what's causing the hang, but should be scrapable by bypassing the DOM and hitting the search URL directly:
const puppeteer = require("puppeteer"); // ^19.0.0
// Held at module scope so the cleanup step can reach it even if launch fails.
let browser;

/**
 * Loads the NSE page, fetches the symbol search URL from inside the page,
 * swaps the response into the document body, then prints the first table's
 * cell text and saves a screenshot of that table.
 */
async function scrapeNseTable() {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();

  const userAgent =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36";
  await page.setUserAgent(userAgent);

  const landingUrl =
    "https://www1.nseindia.com/products/content/equities/equities/eq_security.htm";
  await page.goto(landingUrl, {waitUntil: "domcontentloaded"});

  const symbol = "SONACOMS";
  const searchUrl = `https://www1.nseindia.com/products/dynaContent/common/productsSymbolMapping.jsp?symbol=${symbol}&segmentLink=3&symbolCount=1&series=ALL&dateRange=day&fromDate=&toDate=&dataType=PRICEVOLUMEDELIVERABLE`;

  // Script text evaluated in the page: fetch the search URL and replace the
  // body's HTML with the response text.
  const inPageScript = [
    "",
    `fetch("${searchUrl}")`,
    ".then(res => res.text())",
    ".then(html => document.body.innerHTML = html)",
    "",
  ].join("\n");
  await page.evaluate(inPageScript);

  // Collect the trimmed text of every header/data cell, one array per row.
  const rows = await page.$eval("table", (tableEl) =>
    Array.from(tableEl.querySelectorAll("tr"), (rowEl) =>
      Array.from(rowEl.querySelectorAll("th, td"), (cellEl) =>
        cellEl.textContent.trim()
      )
    )
  );
  console.table(rows);

  const tableHandle = await page.$("table");
  await tableHandle.screenshot({path: "nse.png"});
}

// Run the scrape, log any failure, and always close the browser if one exists.
(async () => {
  try {
    await scrapeNseTable();
  } catch (err) {
    console.error(err);
  } finally {
    await browser?.close();
  }
})();
This is code
// Chromium command-line switches used for the launch.
const launchArgs = [
  "--no-sandbox",
  "--disable-setuid-sandbox",
  "--start-maximized",
  "--ignore-certificate-errors",
];
const userAgent =
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36";
const translateDocsUrl =
  "https://translate.google.com.hk/?hl=zh-CN&sourceid=cnhp&sl=en&tl=zh-CN&op=docs";
const translateButtonSelector = "div[jsname='itaskb'] > div > button";
const downloadButtonSelector = "div[jsname='itaskb'] > button";

// Launch a visible browser with no launch timeout and no fixed viewport.
const browser = await puppeteer.launch({
  headless: false,
  timeout: 0,
  defaultViewport: null,
  args: launchArgs,
  ignoreDefaultArgs: ["--enable-automation"],
});
const page = await browser.newPage();
await page.setUserAgent(userAgent);

// Allow downloads and direct them to the chosen folder.
const cdpSession = await page.target().createCDPSession();
await cdpSession.send("Page.setDownloadBehavior", {
  behavior: "allow",
  downloadPath: "D:\\Download",
});

// Open the document-translation page.
await page.goto(translateDocsUrl, { waitUntil: "networkidle2" });

// Upload the PDF document: start waiting for the file chooser, then click the
// label that opens it.
const [fileChooser] = await Promise.all([
  page.waitForFileChooser(),
  page.click("label"),
]);
await fileChooser.accept(["D:\\test.pdf"]);

// Click the translate button.
const translateButton = await page.waitForSelector(translateButtonSelector);
await translateButton.evaluate((el) => el.click());

// Click the download button once it is visible (no wait timeout).
const downloadButton = await page.waitForSelector(downloadButtonSelector, {
  visible: true,
  timeout: 0,
});
await downloadButton.evaluate((el) => el.click());
The whole process is the same as when I do it manually. But the downloaded document is not translated into zh-CN; it is identical to the uploaded document, which is in English.
What happened? How do I proceed to get the translation I want?
I am trying to use puppeteer to login to the nike site but I get an error likely due to anti-bot. I've tried some things to avoid being detected but did not have any luck. Here is my code:
//const puppeteer = require('puppeteer');
const puppeteer = require("puppeteer-extra");
const pluginStealth = require("puppeteer-extra-plugin-stealth");
puppeteer.use(pluginStealth());
/**
 * Pauses for the given duration; meant to be awaited inside async functions.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<undefined>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Picks a random whole number between two bounds, both inclusive
 * (for integer bounds).
 *
 * @param {number} min - Lower bound.
 * @param {number} max - Upper bound.
 * @returns {number} Random value in [min, max].
 */
const randomDelay = (min, max) => {
  const span = max - min + 1;
  return Math.floor(Math.random() * span + min);
};
/**
 * Opens the Nike membership page in a visible Chrome instance and submits the
 * login form, typing with randomized per-key delays and pausing for random
 * intervals between steps.
 *
 * Uses `page.waitForSelector` and the file's `sleep` helper instead of
 * `page.waitFor`, which was deprecated and later removed from Puppeteer.
 * The browser is intentionally left open after the form is submitted, as in
 * the original flow.
 */
(async () => {
  await sleep(1000);

  const browser = await puppeteer.launch({
    executablePath: 'C:/Program Files/Google/Chrome/Application/chrome.exe',
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-web-security'],
  });
  const page = await browser.newPage();
  await page.setUserAgent(
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
  );
  await page.setExtraHTTPHeaders({
    'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
  });
  await page.goto('https://www.nike.com/us/en_us/e/nike-plus-membership', {
    waitUntil: 'networkidle0',
  });

  const emailSelector = '.nike-unite-text-input.emailAddress input';
  const passwordSelector = '.nike-unite-text-input.password input';
  const submitBtn = '.nike-unite-submit-button.loginSubmit input';

  // Wait for the login form, then pause briefly before typing.
  await page.waitForSelector(emailSelector);
  await sleep(randomDelay(300, 600));

  await page.type(emailSelector, 'xyz#gmail.com', {
    delay: randomDelay(200, 300),
  });
  await sleep(randomDelay(300, 600));

  await page.type(passwordSelector, 'XYZDEFEWD!"', {
    delay: randomDelay(200, 300),
  });
  await sleep(randomDelay(200, 500));

  await page.click(submitBtn);
})();
Is there any way to identify what the website is using to detect that I am using puppeteer?
There is probably no foolproof way to avoid bot detection, but here are some things you can try:
Try proxying your IP through multiple countries.
Try adding random intervals between your network calls.
Use random user agents instead of a fixed one, and also vary the viewport size.
I'm trying to submit a login form, but all I get is a timeout after 30 seconds.
My code is rather simple and I can't find anything wrong:
const puppeteer = require('puppeteer');
// Login credentials typed into the form.
const creds = {user: "1234", password: "1234"};

/**
 * Opens the shop login page, types the credentials, clicks the submit button
 * while waiting for a navigation, then saves a full-page screenshot.
 */
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.setViewport({width: 1280, height: 800});
  await page.goto('https://shop2.idena.de/NewShop/');

  // Type each credential into its form field, in order.
  const fields = [
    ['input[name="FORM_LOGIN"]', creds.user],
    ['input[name="FORM_PASSWD"]', creds.password],
  ];
  for (const [selector, text] of fields) {
    await page.type(selector, text);
  }

  // The click is issued first; the navigation wait is created right after it.
  const clicked = page.click('button[name="FORM_TYPE"]');
  const navigated = page.waitForNavigation();
  await Promise.all([clicked, navigated]);

  await page.screenshot({path: 'example.png', fullPage: true});
  await browser.close();
})();
Any ideas what's going wrong here?
Change the order of the promises a bit. It is possible that the navigation happens so fast that waitForNavigation only starts waiting after it has already finished, so it is waiting for nothing. Or maybe your website loads very slowly after you click the login button.
// Create the navigation wait (with a 60-second timeout) before issuing the
// click, then wait for both to finish.
await Promise.all([
page.waitForNavigation({timeout: 60000}),
page.click('button[name="FORM_TYPE"]'),
]);
If I use your example with headful option, I get this dialog that prevents the page from loading:
So this addition can help (not sure if some dialog emerges with correct credentials):
const puppeteer = require('puppeteer');
// Login credentials typed into the form.
const creds = {user: "1234", password: "1234"};

/**
 * Logs in to the shop, printing and accepting any dialog the page raises
 * after the handler is attached, then saves a full-page screenshot.
 */
async function loginAndCapture() {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.setViewport({width: 1280, height: 800});
  await page.goto('https://shop2.idena.de/NewShop/');

  const {user, password} = creds;
  await page.type('input[name="FORM_LOGIN"]', user);
  await page.type('input[name="FORM_PASSWD"]', password);

  // Print each dialog's message, then accept it.
  const logAndAccept = async (dialog) => {
    console.log(dialog.message());
    await dialog.accept();
  };
  page.on('dialog', logAndAccept);

  const submitClick = page.click('button[name="FORM_TYPE"]');
  const navigation = page.waitForNavigation();
  await Promise.all([submitClick, navigation]);

  await page.screenshot({path: 'example.png', fullPage: true});
  await browser.close();
}

loginAndCapture();
EDIT: I'm updating my answer since more information has been provided in the original question.
The problem is that there's a dialog you need to confirm/dismiss:
Perhaps you didn't see it because the script was too fast. I recommend debugging puppeteer scripts with headless set to false and slowMo to some number greater than 0:
// Launch a visible (non-headless) browser with slowMo set to 200 so each step can be watched.
const browser = await puppeteer.launch({ headless: false, slowMo: 200 });
Then you need to get rid of the dialog:
// Accept every dialog the page raises.
const acceptDialog = async (dialog) => {
  await dialog.accept();
};
page.on('dialog', acceptDialog);
The whole script that now passes:
const puppeteer = require('puppeteer');
// Login credentials typed into the form.
const creds = {
  user: "1234",
  password: "1234",
};

/**
 * Logs in to the shop, accepting any dialog raised after the handler is
 * attached, then saves a full-page screenshot.
 *
 * The browser is closed in `finally` so it is not left running when a step
 * rejects (for example, when the navigation wait times out).
 */
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.setViewport({width: 1280, height: 800});
    await page.goto('https://shop2.idena.de/NewShop/');
    await page.type('input[name="FORM_LOGIN"]', creds.user);
    await page.type('input[name="FORM_PASSWD"]', creds.password);

    // Accept dialogs raised from here on.
    page.on('dialog', async (dialog) => {
      await dialog.accept();
    });

    await Promise.all([
      page.click('button[name="FORM_TYPE"]'),
      page.waitForNavigation(),
    ]);
    await page.screenshot({path: 'example.png', fullPage: true});
  } finally {
    await browser.close();
  }
})();
I have a website login form that I'm trying to log in to. I was able to get the username and password typed into the input fields. Then I wanted to wait before submitting the form, but when I call page.waitFor(), it seems to wipe out the data in the input fields. Can someone explain why, or show a workaround?
// Opens the site, sets the email and password inputs, takes a screenshot,
// waits, clicks the submit button and takes a second screenshot.
// NOTE(review): the snippet is cut off — the `try` block and the function
// body are never closed in the visible text.
async function Scraper(){
try{
const browser = await puppeteer.launch();
const page = await browser.newPage();
// NOTE(review): this call is not awaited, unlike the other page calls — confirm whether that is intended.
page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36');
await page.goto('https://onlyfans.com/');
// Wait until the email input exists before touching the form.
await page.waitFor('input[name=email]');
console.log("starting to do this");
// Assign the field values directly on the elements (no keystrokes are sent).
// NOTE(review): presumably the page does not notice values set this way, which
// may explain the fields being cleared later — verify.
await page.$eval('input[name=email]', el => el.value = 'xxx#gmail.com');
await page.$eval('input[name=password]', el => el.value = 'xxx');
let selector = 'button[type="submit"]';
// Screenshot taken right after the values are set.
await page.screenshot({
path: 'yoursite.png',
fullPage: true
});
// presumably a 5000 ms pause before submitting — TODO confirm against the Puppeteer version in use
await page.waitFor(5000);
// Click the submit button from inside the page context.
await page.evaluate((selector) => document.querySelector(selector).click(), selector);
// Screenshot taken after the click.
await page.screenshot({
path: 'yoursite4.png',
fullPage: true});
console.log("done");
Here are the differences between the two images:
It looks like there is a delay until the login button gets enabled. The following worked for me:
// Selectors for the login form controls.
const emailField = 'input[name=email]';
const passwordField = 'input[name=password]';
const submitButton = 'button[type="submit"]';

// Load the page and wait until the network has gone idle.
await page.goto('https://onlyfans.com/', {waitUntil: "networkidle0"});

// Make sure all three controls exist before interacting with the form.
for (const selector of [emailField, passwordField, submitButton]) {
  await page.waitForSelector(selector);
}

// Type the credentials with a 200 ms delay between key presses, then submit.
await page.type(emailField, 'xxx#gmail.com', {delay: 200});
await page.type(passwordField, 'xxx', {delay: 200});
await page.click(submitButton);