How to iterate elements in Puppeteer and extract their contents? - puppeteer

I use Xpath to make items selection:
let elements = await page.$x("//h4[contains(.,'Lieux')]/../..//span");
This gives me an array of N elements (good !)
How to iterate through these elements and get its content ?
I tried:
elements.forEach(async element => {
let text = await element.$eval('.', el => el.innerText); // get its own innerText
console.log(text);
});
Any idea ?

Related

accessing table cells from puppeteer

I need to be able to validate the values in specific table cells, and later, to click on a particular cell that holds a link. I get that Node and the browser (or emulator) are two different process spaces, so I can't pass references. I was hoping that puppeteer would hide this fact in a read-only manner such as return someArray; in a function run in the browser context being "magically" replicated by puppeteer on the Node side, but alas.
test("get a certain row from a certain table", async function getRow() {
await page.waitForSelector("#actionItemsView-table");
const
cellText = await page.evaluate(function getCells() {
const
row = Array.from(document.querySelectorAll("#actionItemsView-table tbody tr"));
for (let i = row.length - 1; i >= 0; --i) {
// the row we want is the one that has text of interest in cells[2]
if (row[i].cells[2] === "some text that identifies the row of interest") {
return row[i]; // we can't pass this back to Node, so this is wrong
// but some version of this what we need to do
}
}
return null; // no such row
});
console.log(cellText); // cellText is an empty array
}, testTimeout);
Lacking that, I have run through various intermediate experiments, all the way to this, seemingly simplest case, just to get something that works and then work my way back up to what I need, but this doesn't work either:
test("get the text from a single cell", async function getInnerText() {
await page.waitForSelector("#actionItemsView-table");
const
cellText = await page.evaluate(function getText() {
let
ct,
row = Array.from(document.querySelectorAll("#actionItemsView-table tbody tr"));
for (let i = row.length - 1; i >= 0; --i) {
if (row[i].cells[2] === "some text that identifies the row of interest") {
ct = row[i].cells[3].innerText; // the text of the next cell to the right
break;
}
}
return ct; // ct is not a string!
});
console.log(cellText); // cellText is undefined!
}, testTimeout);
If I do things like
document.querySelect("#actionItemsView-table").rows[2].cells[3].innerText
they work, so my selectors and javascript syntax seems to be correct.
There has to be a way to do this and it has to be way easier than I have made it -- what am I missing? Why is the above not working but something like this does work:
await page.$eval("input[name=emailAddr]", function setId(el, id) { el.value = id; return id; }, id);
Here's an easier way to find that cell:
await page.evaluate(() => {
let td = [...document.querySelectorAll("td")].find(td => td.innerText === "something")
return td?.nextElementSibling?.innerText
})
This will return the text or undefined

Scrape paragraphs in html with anchor links in Puppeteer

I am trying to scrape a text on a website with puppeteer. Now I have reached the point where I can read the p-tag between two h2-tags, only this paragraph texts also contain words with internal links. With the current code I get the plain paragraph texts in an array as ouput but actually I need to have the text with the tags in it. Is this possible with puppeteer?
My current code for paragraph scrape is:
const puppeteer = require('puppeteer');
const plaatsengids = async () => {
const browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto('https://www.plaatsengids.nl/urmond');
let paragraphs = await page.evaluate(() => {
const status = document.querySelector('h2[name="status"]');
const naam = document.querySelector('h2[name="naam"]');
return [...document.querySelectorAll('p')]
.filter(p => p.compareDocumentPosition(status) & Node.DOCUMENT_POSITION_PRECEDING &&
p.compareDocumentPosition(naam) & Node.DOCUMENT_POSITION_FOLLOWING)
.map(p => p.textContent.replace("- ",""));
});
console.log(paragraphs);
await browser.close();
return paragraphs;
};
module.exports = plaatsengids;
The page I am trying to scrape is:
https://www.plaatsengids.nl/urmond
For example, the words that contain an a-tag in the text are circled in red here:
If I understand correctly, you can try p.innerHTML.replace("- ","") instead of p.textContent.replace("- ","").

Any time a section is mentioned in the document, I want that mention to become a link to the corresponding bookmark

Goal: I have a very long document with many unique sections that each have bookmarks. Any time a section is mentioned in the document, I want that mention to become a link to the corresponding bookmark. It doesn't have to be event-driven, I intend to do it from a menu.
I have the below code written to get a list of the names of each bookmarked line so I can match it to the words in the doc. I'm trying to figure out what line of code to use to link specific text to that bookmark. I've tried to use the setLinkUrl("beginningofurl" + id[i]) code, but the ID of the bookmarks doesn't tell me if it's a header or regular text, and sometimes it is just regular text. I'm wondering if there's a better way of doing this?
var DOC = DocumentApp.getActiveDocument();
function Setlink() {
var bookmarks = DOC.getBookmarks();
var names = [];
for (var i = 0; i < bookmarks.length; i++){
names.push(bookmarks[i].getPosition().getSurroundingText().getText());
}
Logger.log(names);
}
Headings are a property of Paragraph elements. To check a Bookmark to see if it is in a paragraph of a certain Paragraph Heading, we need to get the Position, then the Element, and then check if the Element is indeed a Paragraph before we can check the Paragraph Heading.
We can put our test for if an Element is a heading in a predicate function named isElementInHeading that will return true or false when given an Element.
function isElementInHeading(element) {
if (element.getType() !== DocumentApp.ElementType.PARAGRAPH) {
return false;
}
const {ParagraphHeading} = DocumentApp;
switch (element.getHeading()) {
case ParagraphHeading.HEADING1:
case ParagraphHeading.HEADING2:
case ParagraphHeading.HEADING3:
case ParagraphHeading.HEADING4:
case ParagraphHeading.HEADING5:
case ParagraphHeading.HEADING6:
return true;
}
return false;
}
This can be used to both filter the bookmarks to include only those that mark headings, and to skip over the same headings when using setLinkUrl.
The strategy in this example is to collect both the bookmark's ID and the desired text in one go using a reducer function, then search through the document for each bit of text, check that we didn't just find the header again, and then apply the link.
I am not quite sure how you are getting the URL, but I found just copying and pasting the URL into the script as const url = "https://docs.google.com/.../edit#bookmark="; worked for me.
// for Array.prototype.reduce
function getHeadingBookmarksInfo(bookmarks, bookmark) {
const element = bookmark.getPosition().getElement();
if (isElementInHeading(element)) {
return [
...bookmarks,
{ id: bookmark.getId(), text: element.getText() }
];
}
return bookmarks;
}
function updateLinks() {
const doc = DocumentApp.getActiveDocument();
const bookmarks = doc.getBookmarks();
const headingBookmarksInfo = bookmarks.reduce(getHeadingBookmarksInfo, []);
const body = doc.getBody();
headingBookmarksInfo.forEach(function(info) {
const {id, text} = info;
let foundRef = body.findText(text);
while (foundRef !== null) {
const element = foundRef.getElement();
if (!isElementInHeading(element.getParent())) {
element.asText()
.setLinkUrl(
foundRef.getStartOffset(),
foundRef.getEndOffsetInclusive(),
url + id // assumes url is hardcoded in global scope
);
}
foundRef = body.findText(text, foundRef);
}
});
}

Get an attribute of a page element in pupeeter/apify

I could fetch the textContent of a html element in pupeeter:
var website_element = await page.$('a[itemprop="url"]');
var website= await (await website_element .getProperty('textContent')).jsonValue();
yet, sometimes the textContent is not enough, see the following html:
<a itemprop="url" href="https://www.4-b.ch/de/4b-fenster-fassaden/home/">
https://www.4-b.ch/de/4b-fenster-fassad...</a>
the result is obscure: "https://www.4-b.ch/de/4b-fenster-fassad..." with ... at the end.
So, i better get the href attribute.
But when:
var website_element = await page.$('a[itemprop="url"]');
var website = await (await website_element.getAttribute('href')).jsonValue();
The result is TypeError: website_element.getAttribute is not a function
Any suggestion?
There's an easy and fast way to do this using the page.$eval function:
var website = await page.$eval('a[itemprop="url"]', el => el.href);
What page.$eval does is that it first finds an element in the DOM using the provided selector (first argument) and then invokes the callback (second argument) with the found element as its only argument. The return value of the callback becomes the return value of page.$eval() itself.
it works:
var website_element = await page.$('a[itemprop="url"]');
var website = await (await website_element.getProperty('href')).jsonValue();

How to use .findElement(DocumentApp.ElementType.TABLE_OF_CONTENTS) to get and parse a Document's Table of Contents Element

My goal is to parse a TableOfContents element in a Google Document and write it to another one. I want to do this for every document in a folder.
Having gone to the bother of converting each document to the type generated by DocsList just so I can use this method [ which a document generated by DocumentApp does not have. Why, I don't understand, because otherwise the two 'documents' are similar when it comes to finding parts. ], I find that what I get back is a SearchResult. How is this elusive construction used? I've tried converting it into a TableOfContents element [ ele = searchResult.asTableOfContents() ], which does not error out, but nothing I do allows me parse through its child elements to recover their text works. Interestingly enough, if you get a TableOfContents element by parsing through the document's paragraphs to get it, THAT let's you parse the TOC.
Would someone speak to this question. I sure would appreciate a code snippet because I'm getting nowhere, and I have put some hours into this.
The asTableOfContents() method is only there to help the editor's autocomplete function. It has no run-time impact, and cannot be used to cast to a different type. (See ContainerElement documentation.)
To parse the table of contents, start by retrieving the element from the SearchResult. Below is an example that goes through the items in a document's table of contents to produce an array of item information.
Example Document
Parsing results
On a simple document with a few headings and a table of contents, here's what it produced:
[13-08-20 16:31:56:415 EDT]
[
{text=Heading 1.0, linkUrl=#heading=h.50tkhklducwk, indentFirstLine=18.0, indentStart=18.0},
{text=Heading 1.1, linkUrl=#heading=h.ugj69zpoikat, indentFirstLine=36.0, indentStart=36.0},
{text=Heading 1.2, linkUrl=#heading=h.xb0y0mu59rag, indentFirstLine=36.0, indentStart=36.0},
{text=Heading 2.0, linkUrl=#heading=h.gebx44eft4kq, indentFirstLine=18.0, indentStart=18.0}
]
Code
function test_parseTOC() {
var fileId = '--Doc-ID--';
Logger.log( parseTOC( fileId ) );
}
function parseTOC( docId ) {
var contents = [];
var doc = DocumentApp.openById(docId);
// Define the search parameters.
var searchElement = doc.getBody();
var searchType = DocumentApp.ElementType.TABLE_OF_CONTENTS;
// Search for TOC. Assume there's only one.
var searchResult = searchElement.findElement(searchType);
if (searchResult) {
// TOC was found
var toc = searchResult.getElement().asTableOfContents();
// Parse all entries in TOC. The TOC contains child Paragraph elements,
// and each of those has a child Text element. The attributes of both
// the Paragraph and Text combine to make the TOC item functional.
var numChildren = toc.getNumChildren();
for (var i=0; i < numChildren; i++) {
var itemInfo = {}
var tocItem = toc.getChild(i).asParagraph();
var tocItemAttrs = tocItem.getAttributes();
var tocItemText = tocItem.getChild(0).asText();
// Set itemInfo attributes for this TOC item, first from Paragraph
itemInfo.text = tocItem.getText(); // Displayed text
itemInfo.indentStart = tocItem.getIndentStart(); // TOC Indentation
itemInfo.indentFirstLine = tocItem.getIndentFirstLine();
// ... then from child Text
itemInfo.linkUrl = tocItemText.getLinkUrl(); // URL Link in document
contents.push(itemInfo);
}
}
// Return array of objects containing TOC info
return contents;
}
Bad news
The bad news is that you are limited in what you can do to a table of contents from a script. You cannot insert a TOC or add new items to an existing one.
See Issue 2502 in the issue tracker, and star it for updates.
If you can post code or explain your issue with DocsList vs DocumentApp, it could be looked at. The elements of a Google Document can only be manipulated via DocumentApp.
I modified the above code to re-create the TOC in a table only with the desired levels(i.e. h1, h2). The only caveat is that TOC must be present & updated before running this.
function findToc(body, level = 2) {
const indent = 18;
let contents = [];
const tocType = TABLE_OF_CONTENTS;
const tocContainer = body.findElement(tocType);
if (tocContainer) {
// TOC was found
const toc = tocContainer.getElement().asTableOfContents();
const totalLines = toc.getNumChildren();
for (let lineIndex = 0; lineIndex < totalLines; lineIndex++) {
const tocItem = toc.getChild(lineIndex).asParagraph();
const { INDENT_START } = tocItem.getAttributes();
const isDesiredLevel = Number(INDENT_START) <= indent * (level - 1);
if (isDesiredLevel) {
contents.push(tocItem.copy());
}
}
}
return contents;
}
function addToTable(cellText) {
body = DocumentApp.openById(docId).getBody();
const table = body.appendTable();
const tr = table.insertTableRow(0);
const td = tr.insertTableCell(0);
cellText.forEach(text => {
td.appendParagraph(text);
})
}
function parseTOC(docId) {
body = DocumentApp.openById(docId).getBody();
const contents = findToc(body);
addToTable(contents);
}