Google Apps Script; Docs; convert selected element to HTML - google-apps-script

I am just starting with Google Apps Script and following the Add-on quickstart
https://developers.google.com/apps-script/quickstart/docs
In the quickstart you can create a simple add-on to get a selection from a document and translate it with the LanguageApp service. The example gets the underlying text using this:
function getSelectedText() {
var selection = DocumentApp.getActiveDocument().getSelection();
if (selection) {
var text = [];
var elements = selection.getSelectedElements();
for (var i = 0; i < elements.length; i++) {
if (elements[i].isPartial()) {
var element = elements[i].getElement().asText();
var startIndex = elements[i].getStartOffset();
var endIndex = elements[i].getEndOffsetInclusive();
text.push(element.getText().substring(startIndex, endIndex + 1));
} else {
var element = elements[i].getElement();
// Only translate elements that can be edited as text; skip images and
// other non-text elements.
if (element.editAsText) {
var elementText = element.asText().getText();
// This check is necessary to exclude images, which return a blank
// text element.
if (elementText != '') {
text.push(elementText);
}
}
}
}
if (text.length == 0) {
throw 'Please select some text.';
}
return text;
} else {
throw 'Please select some text.';
}
}
It gets the text only: element.getText(), without any formatting.
I know the underlying object is not html, but is there a way to get the selection converted into a HTML string? For example, if the selection has a mix of formatting, like bold:
this is a sample with bold text
Then is there any method, extension, library, etc, -- like element.getHTML() -- that could return this?
this is a sample with <b>bold</b> text
instead of this?
this is a sample with bold text

There is a script GoogleDoc2HTML by Omar AL Zabir. Its purpose is to convert the entire document into HTML. Since you only want to convert rich text within the selected element, the function relevant to your task is processText from the script, shown below.
The method getTextAttributeIndices gives the starting offsets for each change of text attribute, like from normal to bold or back. If there is only one change, that's the attribute for the entire element (typically paragraph), and this is dealt with in the first part of if-statement.
The second part deals with the general case, looping over the indices and inserting HTML markup corresponding to the attributes.
The script isn't maintained, so consider it as a starting point for your own code, rather than a ready-to-use library. There are some unmerged PRs that improve the conversion process, in particular for inline links.
function processText(item, output) {
var text = item.getText();
var indices = item.getTextAttributeIndices();
if (indices.length <= 1) {
// Assuming that a whole para fully italic is a quote
if(item.isBold()) {
output.push('<b>' + text + '</b>');
}
else if(item.isItalic()) {
output.push('<blockquote>' + text + '</blockquote>');
}
else if (text.trim().indexOf('http://') == 0) {
output.push('' + text + '');
}
else {
output.push(text);
}
}
else {
for (var i=0; i < indices.length; i ++) {
var partAtts = item.getAttributes(indices[i]);
var startPos = indices[i];
var endPos = i+1 < indices.length ? indices[i+1]: text.length;
var partText = text.substring(startPos, endPos);
Logger.log(partText);
if (partAtts.ITALIC) {
output.push('<i>');
}
if (partAtts.BOLD) {
output.push('<b>');
}
if (partAtts.UNDERLINE) {
output.push('<u>');
}
// If someone has written [xxx] and made this whole text some special font, like superscript
// then treat it as a reference and make it superscript.
// Unfortunately in Google Docs, there's no way to detect superscript
if (partText.indexOf('[')==0 && partText[partText.length-1] == ']') {
output.push('<sup>' + partText + '</sup>');
}
else if (partText.trim().indexOf('http://') == 0) {
output.push('' + partText + '');
}
else {
output.push(partText);
}
if (partAtts.ITALIC) {
output.push('</i>');
}
if (partAtts.BOLD) {
output.push('</b>');
}
if (partAtts.UNDERLINE) {
output.push('</u>');
}
}
}
}

Ended up making a script to support my use-case of bold+links+italics:
function getHtmlOfElement(element) {
var text = element.editAsText();
var string = text.getText();
var indices = text.getTextAttributeIndices();
var output = [];
for (var i = 0; i < indices.length; i++) {
var offset = indices[i];
var startPos = offset;
var endPos = i+1 < indices.length ? indices[i+1]: string.length;
var partText = string.substring(startPos, endPos);
var isBold = text.isBold(offset);
var isItalic = text.isItalic(offset);
var linkUrl = text.getLinkUrl(offset);
if (isBold) {
output.push('<b>');
}
if (isItalic) {
output.push('<i>');
}
if (linkUrl) {
output.push('<a href="' + linkUrl + '">');
}
output.push(partText);
if (isBold) {
output.push('</b>');
}
if (isItalic) {
output.push('</i>');
}
if (linkUrl) {
output.push('</a>');
}
}
return output.join("");
}
You can simply call it using something like:
getHtmlOfElement(myTableCell); // returns something like "<b>Bold</b> test."

This is obviously a workaround, but you can copy/paste a Google Doc into a draft in Gmail and then that draft can be turned into HTML using
GmailApp.getDraft(draftId).getMessage().getBody().toString();
I found this thread trying to skip that step by going straight from a Doc to HTML, but I thought I'd share.

Related

Get the first hyperlink and its text value

I hope everyone is in good health health and condition.
Recently, I have been working on Google Docs hyperlinks using app scripts and learning along the way. I was trying to get all hyperlink and edit them and for that I found an amazing code from this post. I have read the code multiple times and now I have a good understanding of how it works.
My confusion
My confusion is the recursive process happening in this code, although I am familiar with the concept of Recursive functions but when I try to modify to code to get only the first hyperlink from the document, I could not understand it how could I achieve that without breaking the recursive function.
Here is the code that I am trying ;
/**
* Get an array of all LinkUrls in the document. The function is
* recursive, and if no element is provided, it will default to
* the active document's Body element.
*
* #param {Element} element The document element to operate on.
* .
* #returns {Array} Array of objects, vis
* {element,
* startOffset,
* endOffsetInclusive,
* url}
*/
function getAllLinks(element) {
var links = [];
element = element || DocumentApp.getActiveDocument().getBody();
if (element.getType() === DocumentApp.ElementType.TEXT) {
var textObj = element.editAsText();
var text = element.getText();
var inUrl = false;
for (var ch=0; ch < text.length; ch++) {
var url = textObj.getLinkUrl(ch);
if (url != null) {
if (!inUrl) {
// We are now!
inUrl = true;
var curUrl = {};
curUrl.element = element;
curUrl.url = String( url ); // grab a copy
curUrl.startOffset = ch;
}
else {
curUrl.endOffsetInclusive = ch;
}
}
else {
if (inUrl) {
// Not any more, we're not.
inUrl = false;
links.push(curUrl); // add to links
curUrl = {};
}
}
}
if (inUrl) {
// in case the link ends on the same char that the element does
links.push(curUrl);
}
}
else {
var numChildren = element.getNumChildren();
for (var i=0; i<numChildren; i++) {
links = links.concat(getAllLinks(element.getChild(i)));
}
}
return links;
}
I tried adding
if (links.length > 0){
return links;
}
but it does not stop the function as it is recursive and it return back to its previous calls and continue running.
Here is the test document along with its script that I am working on.
https://docs.google.com/document/d/1eRvnR2NCdsO94C5nqly4nRXCttNziGhwgR99jElcJ_I/edit?usp=sharing
I hope you will understand what I am trying to convey, Thanks for giving a look at my post. Stay happy :D
I believe your goal as follows.
You want to retrieve the 1st link and the text of link from the shared Document using Google Apps Script.
You want to stop the recursive loop when the 1st element is retrieved.
Modification points:
I tried adding
if (links.length > 0){
return links;
}
but it does not stop the function as it is recursive and it return back to its previous calls and continue running.
About this, unfortunately, I couldn't understand where you put the script in your script. In this case, I think that it is required to stop the loop when links has the value. And also, it is required to also retrieve the text. So, how about modifying as follows? I modified 3 parts in your script.
Modified script:
function getAllLinks(element) {
var links = [];
element = element || DocumentApp.getActiveDocument().getBody();
if (element.getType() === DocumentApp.ElementType.TEXT) {
var textObj = element.editAsText();
var text = element.getText();
var inUrl = false;
for (var ch=0; ch < text.length; ch++) {
if (links.length > 0) break; // <--- Added
var url = textObj.getLinkUrl(ch);
if (url != null) {
if (!inUrl) {
// We are now!
inUrl = true;
var curUrl = {};
curUrl.element = element;
curUrl.url = String( url ); // grab a copy
curUrl.startOffset = ch;
}
else {
curUrl.endOffsetInclusive = ch;
}
}
else {
if (inUrl) {
// Not any more, we're not.
inUrl = false;
curUrl.text = text.slice(curUrl.startOffset, curUrl.endOffsetInclusive + 1); // <--- Added
links.push(curUrl); // add to links
curUrl = {};
}
}
}
if (inUrl) {
// in case the link ends on the same char that the element does
links.push(curUrl);
}
}
else {
var numChildren = element.getNumChildren();
for (var i=0; i<numChildren; i++) {
if (links.length > 0) { // <--- Added or if (links.length > 0) break;
return links;
}
links = links.concat(getAllLinks(element.getChild(i)));
}
}
return links;
}
In this case, I think that if (links.length > 0) {return links;} can be modified to if (links.length > 0) break;.
Note:
By the way, when Google Docs API is used, both the links and the text can be also retrieved by a simple script as follows. When you use this, please enable Google Docs API at Advanced Google services.
function myFunction() {
const doc = DocumentApp.getActiveDocument();
const res = Docs.Documents.get(doc.getId()).body.content.reduce((ar, {paragraph}) => {
if (paragraph && paragraph.elements) {
paragraph.elements.forEach(({textRun}) => {
if (textRun && textRun.textStyle && textRun.textStyle.link) {
ar.push({text: textRun.content, url: textRun.textStyle.link.url});
}
});
}
return ar;
}, []);
console.log(res) // You can retrieve 1st link and test by console.log(res[0]).
}

How to select all underlined text in a paragraph

I'm trying to create a google apps script that will format certain parts of a paragraph. For example, text that is underlined will become bolded/italicized as well.
One docs add-on I have tried has a similar feature: https://imgur.com/a/5Cw6Irn (this is exactly what I'm trying to achieve)
How can I write a function that will select a certain type of text and format it?
**I managed to write a script that iterates through every single letter in a paragraph and checks if it's underlined, but it becomes extremely slow as the paragraph gets longer, so I'm looking for a faster solution.
function textUnderline() {
var selectedText = DocumentApp.getActiveDocument().getSelection();
if(selectedText) {
var elements = selectedText.getRangeElements();
for (var index = 0; index < elements.length; index++) {
var element = elements[index];
if(element.getElement().editAsText) {
var text = element.getElement().editAsText();
var textLength = text.getText().length;
//For every single character, check if it's underlined and then format it
for (var i = 0; i < textLength; i++) {
if(text.isUnderline(i)) {
text.setBold(i, i, true);
text.setBackgroundColor(i,i,'#ffff00');
} else {
text.setFontSize(i, i, 8);
}
}
}
}
}
}
Use getTextAttributeIndices:
There is no need to check each character in the selection. You can use getTextAttributeIndices() to get the indices in which the text formatting changes. This method:
Retrieves the set of text indices that correspond to the start of distinct text formatting runs.
You just need to iterate through these indices (that is, check the indices in which text formatting changes), which are a small fraction of all character indices. This will greatly increase efficiency.
Code sample:
function textUnderline() {
var selectedText = DocumentApp.getActiveDocument().getSelection();
if(selectedText) {
var elements = selectedText.getRangeElements();
for (var index = 0; index < elements.length; index++) {
var element = elements[index];
if(element.getElement().editAsText) {
var text = element.getElement().editAsText();
var textRunIndices = text.getTextAttributeIndices();
var textLength = text.getText().length;
for (let i = 0; i < textRunIndices.length; i++) {
const startOffset = textRunIndices[i];
const endOffset = i + 1 < textRunIndices.length ? textRunIndices[i + 1] - 1 : textLength - 1;
if (text.isUnderline(textRunIndices[i])) {
text.setBold(startOffset, endOffset, true);
text.setBackgroundColor(startOffset, endOffset,'#ffff00');
} else {
text.setFontSize(startOffset, endOffset, 8);
}
}
}
}
}
}
Reference:
getTextAttributeIndices()
Based on the example shown in the animated gif, it seems your procedure needs to
handle a selection
set properties if the selected region is of some format (e.g. underlined)
set properties if the selected region is NOT of some format (e.g. not underlined)
finish as fast as possible
and your example code achieves all these goals expect the last one.
The problem is that you are calling the text.set...() functions at each index position. Each call is synchronous and blocks the code until the document is updated, thus your run time grows linearly with each character in the selection.
My suggestion is to build up a collection of subranges from the selection range and then for each subrange use text.set...(subrange.start, subrange.end) to apply the formatting. Now the run time will be dependent on chunks of characters, rather than single characters. i.e., you will only update when the formatting switches back and forth from, in your example, underlined to not underlined.
Here is some example code that implements this subrange idea. I separated the specific predicate function (text.isUnderline) and specific formatting effects into their own functions so as to separate the general idea from the specific implementation.
// run this function with selection
function transformUnderlinedToBoldAndYellow() {
transformSelection("isUnderline", boldYellowOrSmall);
}
function transformSelection(stylePredicateKey, stylingFunction) {
const selectedText = DocumentApp.getActiveDocument().getSelection();
if (!selectedText) return;
const getStyledSubRanges = makeStyledSubRangeReducer(stylePredicateKey);
selectedText.getRangeElements()
.reduce(getStyledSubRanges, [])
.forEach(stylingFunction);
}
function makeStyledSubRangeReducer(stylePredicateKey) {
return function(ranges, rangeElement) {
const {text, start, end} = unwrapRangeElement(rangeElement);
if (start >= end) return ranges; // filter out empty selections
const range = {
text, start, end,
styled: [], notStyled: [] // we will extend our range with subranges
};
const getKey = (isStyled) => isStyled ? "styled" : "notStyled";
let currentKey = getKey(text[stylePredicateKey](start));
range[currentKey].unshift({start: start});
for (let index = start + 1; index <= end; ++index) {
const isStyled = text[stylePredicateKey](index);
if (getKey(isStyled) !== currentKey) { // we are switching styles
range[currentKey][0].end = index - 1; // note end of this style
currentKey = getKey(isStyled);
range[currentKey].unshift({start: index}); // start new style range
}
}
ranges.push(range);
return ranges;
}
}
// a helper function to unwrap a range selection, deals with isPartial,
// maps RangeElement => {text, start, end}
function unwrapRangeElement(rangeElement) {
const isPartial = rangeElement.isPartial();
const text = rangeElement.getElement().asText();
return {
text: text,
start: isPartial
? rangeElement.getStartOffset()
: 0,
end: isPartial
? rangeElement.getEndOffsetInclusive()
: text.getText().length - 1
};
}
// apply specific formatting to satisfy the example
function boldYellowOrSmall(range) {
const {text, start, end, styled, notStyled} = range;
styled.forEach(function setTextBoldAndYellow(range) {
text.setBold(range.start, range.end || end, true);
text.setBackgroundColor(range.start, range.end || end, '#ffff00');
});
notStyled.forEach(function setTextSmall(range) {
text.setFontSize(range.start, range.end || end, 8);
});
}

Google Apps Script. Get all links from document

Hi all) I need to get all links from google document. I found that general approach:
function getAllLinks(element) {
var links = [];
element = element || DocumentApp.getActiveDocument().getBody();
if (element.getType() === DocumentApp.ElementType.TEXT) {
var textObj = element.editAsText();
var text = element.getText();
Logger.log("text " + text);
var inUrl = false;
for (var ch=0; ch < text.length; ch++) {
var url = textObj.getLinkUrl(ch);
if (url != null) {
if (!inUrl) {
// We are now!
inUrl = true;
var curUrl = {};
curUrl.element = element;
curUrl.url = String( url ); // grab a copy
curUrl.startOffset = ch;
}
else {
curUrl.endOffsetInclusive = ch;
}
}
else {
if (inUrl) {
// Not any more, we're not.
inUrl = false;
links.push(curUrl); // add to links
curUrl = {};
}
}
}
}
else {
var numChildren = element.getNumChildren();
for (var i=0; i<numChildren; i++) {
links = links.concat(getAllLinks(element.getChild(i)));
}
}
Logger.log(links);
}
It works perfectly fine if i, for example, type url in text, but if add link via menu ("Insert" -> "Link") it doesn't work, function getLinkUrl() returns null. Documentation contains info about Link class, i thought all links represented by it, but don't understand why i can't get link inserted via menu.
I thought maybe i can use some regular expression on text of document element, but if i add link via menu item i can specify custom label for link, which may not contain url in it.
Have anyone faced this scenario? What i missed?

How do I find and select a next bold word

From the https://gist.github.com/oshliaer/d468759b3587cfb424348fa722765187 , It is possible to select a particular word from the findText, I want to implement the same for bold words only
I have a function to find bold. How do I modify the above gist?
var startFlag = x;
var flag = false;
for (var i = x; i < y; i++) {
if (text.isBold(i) && !flag) {
startFlag = i;
flag = true;
} else if (!text.isBold(i) && flag) {
flag = false;
rangeBuilder.addElement(text, startFlag, i - 1);
doc.setSelection(rangeBuilder.build());
return;
}
}
if (flag) {
rangeBuilder.addElement(text, startFlag, i - 1);
doc.setSelection(rangeBuilder.build());
return;
}
Let's assume another algorithm
/*
* #param {(DocumentApp.ElementType.LIST_ITEM | DocumentApp.ElementType.PARAGRAPH)} element
*/
function hasBold(element, start) {
var text = element.editAsText();
var length = element.asText().getText().length;
var first = -1;
var end = -1;
while (start < length) {
if (first < 0 && text.isBold(start)) {
first = start;
}
if (first > -1 && !text.isBold(start)) {
end = start - 1;
return {
s: first,
e: end
}
}
start++;
}
if (first > -1) {
return {
s: first,
e: length - 1
}
}
return false;
}
It's not clean but I've tested it and it works fine.
hasBold lets us finding bolds in the current element.
Finally, we have to loop this feature within document.getBody().
You could to get the full code here find next bold text in google document.
Also you could try it on a copy
A new idea
The Direct searcing
The best way is to use a callback while it is checked
var assay = function (re) {
var text = re.getElement()
.asText();
for (var offset = re.getStartOffset(); offset <= re.getEndOffsetInclusive(); offset++) {
if (!text.isBold(offset)) return false;
}
return true;
}
function findNextBold() {
var sp = 'E.';
Docer.setDocument(DocumentApp.getActiveDocument());
var rangeElement = Docer.findText(sp, Docer.getActiveRangeElement(), assay);
rangeElement ? Docer.selectRangeElement(rangeElement) : Docer.setCursorBegin();
}
The Approx searching
var assay = function(re) {
var text = re.getElement().asText();
var startOffset = re.getStartOffset();
var endOffset = re.getEndOffsetInclusive() + 1;
for (var offset = startOffset; offset < endOffset; offset++) {
if (!text.isBold(offset)) return false;
}
return this.test(text.getText().slice(startOffset, endOffset));
}
function findNextBold() {
var searchPattern = '[^ ]+#[^ ]+';
var testPattern = new RegExp('^[A-Z0-9._%+-]+#[A-Z0-9.-]+\.[A-Z]{2,4}$');
Docer.setDocument(DocumentApp.getActiveDocument());
var rangeElement = Docer.findText(searchPattern, Docer.getActiveRangeElement(), assay.bind(testPattern));
rangeElement ? Docer.selectRangeElement(rangeElement) : Docer.setCursorBegin();
}
Docer
Yes. it is possible to find bold text. You need to use findText(searchPattern) to search the contents of the element for the specific text pattern using regular expressions. The provided regular expression pattern is independently matched against each text block contained in the current element. Then, use isBold() to retrieve the bold setting. It is a Boolean which returns whether the text is bold or null.

GAS is it possible to replace getActiveDocument().getSelection() at once?

My User has the following selection in his Gdoc.
Now from the sidebar he wants to to replace the selection he made on the document. The GAS question is if it is possible to do that at once, something like:
var selection = DocumentApp.getActiveDocument().getSelection()
selection.replace("newtext")
Or do I have to loop through selection.getRangeElements() in order to delete them (or replace them) and than in someway place the new text in that position?
Not, that's not possible (well, if it is, it's not documented).
You have to loop through the selected elements, mainly because the selection may take part of paragraphs, forcing you to manage that. i.e. deleting just the selected part. And for completed selected elements, you can just remove them entirely (like images).
Here's an implementation on how to do this (part of the Kaylan's Translate script modified by me to properly replace images and partially selected paragraphs.
function replaceSelection(newText) {
var selection = DocumentApp.getActiveDocument().getSelection();
if (selection) {
var elements = selection.getRangeElements();
var replace = true;
for (var i = 0; i < elements.length; i++) {
if (elements[i].isPartial()) {
var element = elements[i].getElement().asText();
var startIndex = elements[i].getStartOffset();
var endIndex = elements[i].getEndOffsetInclusive();
var text = element.getText().substring(startIndex, endIndex + 1);
element.deleteText(startIndex, endIndex);
if( replace ) {
element.insertText(startIndex, newText);
replace = false;
}
} else {
var element = elements[i].getElement();
if( replace && element.editAsText ) {
element.clear().asText().setText(newText);
replace = false;
} else {
if( replace && i === elements.length -1 ) {
var parent = element.getParent();
parent[parent.insertText ? 'insertText' : 'insertParagraph'](parent.getChildIndex(element), newText);
replace = false; //not really necessary since it's the last one
}
element.removeFromParent();
}
}
}
} else
throw "Hey, select something so I can replace!";
}