Google AppScript dies with "Service Unavailable: Docs" - google-apps-script

I wrote an app script that should break down a wall of text into separate paragraphs.
/**
 * Adds a "Formatting tool" menu to the document UI. Its single entry,
 * "Make Paragraphs", runs breakIntoParagraphs.
 */
function onOpen() {
  var ui = DocumentApp.getUi();
  var menu = ui.createMenu('Formatting tool');
  menu.addItem('Make Paragraphs', 'breakIntoParagraphs').addToUi();
}
/**
 * Breaks a wall of text into separate paragraphs.
 *
 * Runs of two or more vertical-whitespace characters are first collapsed into
 * a single "°" marker. For every marker that is followed by another marker in
 * the same paragraph, the text from the first marker up to (but excluding) the
 * second one is cut out and inserted as a new paragraph in front of the
 * paragraph it came from.
 *
 * Corrections over the first draft:
 *  - RangeElement offsets are relative to the matched Text element, so the cut
 *    is made on that element instead of on body.editAsText().
 *  - deleteText() returns the Text element, not the removed characters, so the
 *    string is read before it is deleted.
 *  - editAsText is a method and has to be called before replaceText().
 */
function breakIntoParagraphs() {
  var body = DocumentApp.getActiveDocument().getBody();
  var counter = 0;
  body.replaceText("\\v\\v+", "°"); // the ° is more convenient to handle
  var rangeElement = body.findText("°");
  while (rangeElement != null) {
    var textElement = rangeElement.getElement().asText();
    var start = rangeElement.getStartOffset();
    var paragraph = textElement.getParent();
    var childIndex = body.getChildIndex(paragraph);
    var endRangeElement = body.findText("°", rangeElement);
    if (endRangeElement != null) {
      var end = endRangeElement.getStartOffset();
      var endParagraph = endRangeElement.getElement().getParent();
      var endChildIndex = body.getChildIndex(endParagraph);
      if (childIndex != endChildIndex || end <= start) {
        // The two offsets belong to different elements, so they cannot be
        // combined into one range; skip instead of deleting the wrong text.
        Logger.log("this spans paragraphs!"); // deal with this case later
      } else {
        Logger.log(paragraph.asText());
        // Capture the string first: deleteText() does not return it.
        var text = textElement.getText().substring(start, end);
        textElement.deleteText(start, end - 1); // -1, so the concluding ° remains
        Logger.log("deleted text: \"" + text + "\"");
        var newParagraph = body.insertParagraph(childIndex, text);
        newParagraph.editAsText().replaceText("°", ""); // remove markers
      }
    }
    rangeElement = body.findText("°", rangeElement);
    counter++;
    // Safety valve kept from the draft: stop after three iterations.
    if (counter > 2) {
      break;
    }
  }
}
Unfortunately, it gives me an ugly red warning "Service unavailable: Docs". In the process of writing this, I learned that this means as much as "something fishy happened, and you have to figure out what that is, yourself." That can be timeouts, complex regular expressions, infinite loops (which give timeouts, too) etc. Google's issue tracking system has several of those.
Now I tried to avoid every complex or non-standard thing, and even made sure to break the loop in case of too many repetitions, but I still get the "Service unavailable: Docs". What could be causing this, and how can I fix it?

Related

xpath in apps script?

I made a formula to extract some Wikipedia data in Google Sheets, which works fine. Here is the formula:
=regexreplace(join("",flatten(IMPORTXML(D2,".//p[preceding-sibling::h2[1][contains(., 'Geography')]]"))),"\[[^\]]+\]","")&char(10)&char(10)&iferror(regexreplace(join("",flatten(IMPORTXML(D2,".//p[preceding-sibling::h2[1][contains(., 'Education')]]"))),"\[[^\]]+\]",""))
Where D2 is a URL like https://en.wikipedia.org/wiki/Abbeville,_Alabama
This extracts some Geography and Education data from the Wikipedia page. Trouble is that importxml only runs a few times before it dies due to quota.
So I thought maybe better to use Apps Script where there are much higher limits on fetching and parsing. I could not see a good way however of using Xpath in Apps Script. Older posts on the web discuss using a deprecated service called Xml but it seems to no longer work. There is a Service called XmlService which looks like it may do the job but you can't just plug in an Xpath. It looks like a lot of sweating to get to the result. Any solutions out there where you can just plug in Xpath?
Here is an alternative solution I actually do in a case like this.
I have used XmlService but only for parsing the content, not for using Xpath. This makes use of the element tags and so far pretty consistent on my tests. Although, it might need tweaks when certain tags are in the result and you might have to include them into the exclusion condition.
Tested the code below in both links:
https://en.wikipedia.org/wiki/Abbeville,_Alabama#Geography
https://en.wikipedia.org/wiki/Montgomery,_Alabama#Education
My test shows that the formula above used did not return the proper output from the 2nd link while the code does. (Maybe because it was too long)
Code:
/**
 * Fetches a Wikipedia page and returns the plain text of its "Geography" and
 * "Education" sections.
 *
 * The HTML is scanned line by line. Capturing starts on the line that carries
 * id="Geography" or id="Education" and stops at the next <h2>. Lines with
 * tables, sub-headers or divs are skipped.
 *
 * @param {string} path URL of the page to fetch.
 * @return {string} Section text, one captured line per output line, with tags
 *     and numeric citation markers such as [12] removed.
 */
function getGeoAndEdu(path) {
  var data = UrlFetchApp.fetch(path).getContentText();
  // wikipedia is divided into sections, if output is cut, increase the number
  var regex = /.{1,100000}/g;
  var results = [];
  // flag to determine if matches should be added
  var foundFlag = false;
  var m;
  // Looping on the exec() result itself means a null match is never
  // dereferenced, even when the page ends while a section is being captured.
  while ((m = regex.exec(data)) !== null) {
    var line = m[0];
    if (foundFlag) {
      if (matchTag(line, "<h2>")) {
        // another header is found while capturing: stop appending the matches
        foundFlag = false;
      } else if (matchTag(line, "<div") || matchTag(line, "<h3") ||
                 matchTag(line, "<td") || matchTag(line, "<th")) {
        // exclude tables, sub-headers and divs containing image description
        continue;
      } else {
        results.push(line);
      }
    }
    // start capturing if either ID is found
    if (matchTag(line, "id=\"Geography\"") || matchTag(line, "id=\"Education\"")) {
      foundFlag = true;
    }
  }
  var output = results.map(function (str) {
    // clean tags for XmlService
    var stripped = str.replace(/<[^>]*>/g, '').trim();
    var decoded = XmlService.parse('<d>' + stripped + '</d>');
    // convert html entity codes (e.g. &nbsp;) to text
    return decoded.getRootElement().getText();
    // filter blank results due to cleaning and empty sections
    // separate data and remove citations before returning output
  }).filter(result => result.trim().length > 1).join("\n").replace(/\[\d+\]/g, '');
  return output;
}
// check if tag is found in string
/**
 * Checks whether a tag (or any literal fragment) occurs in a string.
 *
 * The fragment is compared literally. Characters such as "." or "(" carry no
 * regex meaning, so they can neither cause a false negative nor throw.
 *
 * @param {string} string Text to search in.
 * @param {string} tag Literal fragment to look for, e.g. "<div".
 * @return {boolean} true when the fragment is present, false otherwise.
 */
function matchTag(string, tag) {
  return string.indexOf(tag) !== -1;
}
Output:
Difference:
Formula ending output
Script ending output
Education ending in wikipedia
Note:
You still have quota when using UrlFetchApp but should be better than IMPORTXML's limit depending on the type of your account.
Reference:
Apps Script Quotas
Sorry I got very busy this week so I didn't reply. I took a look at your answer which seems to work fine, but it was quite code heavy. I wanted something I would understand so I coded my own solution. not that mine is any simpler. It's just my own code so it's easier for me to follow:
/**
 * Finds the text between two tags and strips any tags in between, leaving
 * plain text.
 *
 * Example:
 *   getTextBetweenTags(html,
 *       [['class="mw-headline"'], ['id="Geography"']],
 *       [['class="wikitable mw-collapsible mw-made-collapsible"']])
 *
 * Note: HTML entities such as &#number; are not decoded here.
 *
 * @param {string} html Markup to scan.
 * @param {Array<Array<string>>} paramatersInFirstTag Literal fragments that
 *     must all occur in the opening tag (one fragment per inner array).
 * @param {Array<Array<string>>} paramatersInLastTag Literal fragments that
 *     must all occur in the closing tag.
 * @return {string} Text between the end of the first matching tag and the end
 *     of the last matching tag with all tags removed, or 'No matches' when
 *     either tag is not found.
 */
function getTextBetweenTags(html, paramatersInFirstTag, paramatersInLastTag) {
  var previousChar = '';
  var readingTag = false;
  var newTag = '';
  var regexFirstTagParams = [];
  var regexLastTagParams = [];
  var i;
  // prepare regexes to test for parameters in opening and closing tags.
  // put regexes in arrays so each condition can be tested separately
  for (i in paramatersInFirstTag) {
    regexFirstTagParams.push(new RegExp(escapeRegex(paramatersInFirstTag[i][0])));
  }
  for (i in paramatersInLastTag) {
    regexLastTagParams.push(new RegExp(escapeRegex(paramatersInLastTag[i][0])));
  }
  // true when every prepared regex matches the tag text
  var matchesAll = function (regexes, tag) {
    return regexes.every(function (regex) {
      return regex.test(tag);
    });
  };
  var startTagIndex = null;
  var endTagIndex = null;
  var matches = 0;
  // Scan through the very last character so that a closing tag which ends
  // the input is still recognised.
  for (i = 0; i < html.length; i++) {
    var nextChar = html.charAt(i);
    if (nextChar == '<' && previousChar != '\\') {
      readingTag = true;
    }
    if (nextChar == '>' && previousChar != '\\') { // end of tag found: check whether it is the start or end tag
      readingTag = false;
      newTag += nextChar;
      if (startTagIndex == null) {
        // test for firstTag
        if (matchesAll(regexFirstTagParams, newTag)) {
          startTagIndex = i + 1;
          matches++;
        }
      } else if (matchesAll(regexLastTagParams, newTag)) {
        // test for lastTag
        endTagIndex = i + 1;
        matches++;
      }
      if (startTagIndex && endTagIndex) break;
      newTag = '';
    }
    if (readingTag) newTag += nextChar;
    previousChar = nextChar;
  }
  if (matches < 2) return 'No matches';
  return html.substring(startTagIndex, endTagIndex).replace(/<[^>]+>/g, '');
}
/**
 * Backslash-escapes every regex metacharacter in a string so that it can be
 * embedded in a RegExp as a literal.
 *
 * @param {?string} string Text to escape.
 * @return {?string} The escaped text; null and undefined are returned as-is.
 */
function escapeRegex(string) {
  // "== null" is deliberately loose: it covers both null and undefined.
  return string == null
    ? string
    : string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
}
My function requires an array of attributes for the start tag and an array of attributes for the end tag. It gets any text in between and removes any tags found in between. One issue I also noticed was that there were often HTML entities (e.g. &nbsp;), so they need to be replaced. I did that outside the scope of the function above.
The function could be easily improved to check the tag type (eg h2), but it wasn't necessary for the wikipedia case.
Here is a function where I called the above function. The html variable is just the result of UrlFetchApp.fetch('some wikipedia city url').getContentText();
/**
 * Extracts the Geography, Economy and Education sections from the HTML of a
 * Wikipedia city page.
 *
 * Each section runs from its own headline tag to the next headline tag. A
 * section is left out when it is not found or when its text mentions
 * "Wikipedia".
 *
 * @param {string} html Page markup.
 * @return {string} The kept sections, each followed by a newline.
 */
function getWikiTexts(html) {
  var sectionIds = ['Geography', 'Economy', 'Education'];
  var returnString = '';
  for (var i = 0; i < sectionIds.length; i++) {
    // getTextBetweenTags takes (html, firstTagParams, lastTagParams). Passing
    // extra tag-name arguments would shift the parameter lists, so every
    // section is requested with exactly three arguments.
    var sectionText = getTextBetweenTags(
      html,
      [['class="mw-headline"'], ['id="' + sectionIds[i]]],
      [['class="mw-headline"']]
    );
    if (sectionText != 'No matches' && !/Wikipedia/.test(sectionText)) {
      returnString += sectionText + '\n';
    }
  }
  return returnString;
}
Thanks for posting your answer.

How to prevent error throwing in Google Apps Script?

Please see the code herein under:
/**
 * Prompts for a trading pair and a depth limit, downloads the matching
 * Binance order book and hands the parsed JSON to reporter().
 *
 * Any error raised on the way (including a failed HTTP request) is passed to
 * exceptionHandler().
 */
function binanceOrderBook() {
  try {
    // The draft assigned "muteHttpExceptions = true" to an undeclared
    // variable, which created a global and had no effect on the request.
    // It is dropped here, so HTTP errors still throw and reach the catch.
    var workSpreadsheet = SpreadsheetApp.getActiveSpreadsheet();
    var mySheet = workSpreadsheet.getSheetByName('Order Books');
    if (mySheet) {
      mySheet.activate();
    } else {
      workSpreadsheet.insertSheet('Order Books', 1).activate();
    }
    var ui = SpreadsheetApp.getUi();
    var symbolResponse = ui.prompt('Pair Name', 'Please enter the pair symbol.\n\nExamples: BTCUSDT or ETHBTC:', ui.ButtonSet.OK_CANCEL);
    if (symbolResponse.getSelectedButton() == ui.Button.CANCEL) {
      return;
    }
    var mySymbol = symbolResponse.getResponseText().toUpperCase();
    var limitResponse = ui.prompt('Limit:', 'Please enter Limit (Period Quantity).\nValid limits are:5, 10, 20, 50, 100, 500, 1000. \n Default limit is 100.\n You can leave it blank and simply click OK.', ui.ButtonSet.OK_CANCEL);
    if (limitResponse.getSelectedButton() == ui.Button.CANCEL) {
      return;
    }
    var validLimits = [5, 10, 20, 50, 100, 500, 1000];
    var myLimit = Number(limitResponse.getResponseText());
    // Blank or unsupported input (including NaN) falls back to the default.
    if (validLimits.indexOf(myLimit) === -1) {
      myLimit = 100;
    }
    // The symbol is free-form user input, so it is encoded before it goes
    // into the query string.
    var url = 'https://api.binance.com/api/v3/depth?' +
        'symbol=' + encodeURIComponent(mySymbol) +
        '&limit=' + myLimit;
    var jsonOrderBookData = JSON.parse(UrlFetchApp.fetch(url).getContentText());
    reporter(jsonOrderBookData);
  } catch (e) {
    exceptionHandler(e);
  }
}
The problem I have is to run UrlFetchApp.fetch again when it encounters an error. I need to run it several times to get the result. So, I need to prevent the script from stopping when an error (code -1003) occurs, but how can I do that?
EDIT: There is a handler, window.onerror, in JavaScript which can be set to prevent the program from stopping. Is it usable in GAS? If yes, how? If not, is there a similar solution for GAS?
You could call binanceOrderBook() from within your catch statement. E.g.
...
} catch (e){
binanceOrderBook()
exceptionHandler(e)
}
Of course you probably should have some condition that exits the function if a certain error occurs, or if you know that the function needs to run no more than x number of times you could check that it has run less than x times before executing. For example,
const maxValue = 10; // whatever the max number of executions should be

/**
 * Retry wrapper sketch: re-runs itself after a failure until it has been
 * attempted maxValue times.
 *
 * @param {number=} executions Number of attempts already made; omitted on the
 *     first call.
 */
function binanceOrderBook(executions) {
  if (executions >= maxValue) return;
  try {
    // ... (the original body of binanceOrderBook goes here)
  } catch (e) {
    binanceOrderBook((executions || 0) + 1);
    exceptionHandler(e); // note that I am including this here because it's in your original example, but as it is written now, exception handler won't be called until binanceOrderBook executes without an error.
  }
}
[Edit] To answer your second question, there is no equivalent to window.onerror that I know of in GAS. However, window.onerror is a global event handler and so would affect errors thrown by any functions defined in your project. To address a concern with a single function call like this, you are better off using a try catch statement as you have.

Script that would find and mark the same words in the paragraph

I'm a fiction writer and I used to do my writing in MS Word. I've written some macros to help me edit the fiction text and one of them check the paragraph and marks (red) the duplicate (or triplicate words, etc). Example:
"I came **home**. And while at **home** I did this and that."
Word "home" is used twice and worth checking if I really can't change the sentence.
Now I mostly use google documents for writing, but I still have to do my editing in MS Word, mostly just because of this macro - I am not able to program it in the google script.
/**
 * Logs how often each word occurs in the text around the cursor, most
 * frequent first, one line per word in the form "HOME (2)".
 *
 * Words are taken from WORDS(), upper-cased, and ignored when shorter than
 * three characters. Nothing is counted when the document has no cursor.
 */
function PobarvajBesede() {
  var cursor = DocumentApp.getActiveDocument().getCursor();
  if (!cursor) {
    // getCursor() yields null when there is no cursor to read from.
    Logger.log('No cursor found in the document.');
    return;
  }
  var surroundingText = cursor.getSurroundingText().getText();
  var WordsString = WORDS(surroundingText);
  Logger.log(WordsString);

  // Shortest word that is still reported; "AND (1)" in the wanted output
  // shows that three-letter words count.
  var minLength = 3;
  var counts = {};
  WordsString.forEach(function (word) {
    if (word.length >= minLength) {
      var key = word.toUpperCase();
      counts[key] = (counts[key] || 0) + 1;
    }
  });

  // Plain JavaScript has no SORT(); order the words with Array.prototype.sort.
  var SortedWordsString = Object.keys(counts).sort(function (a, b) {
    return counts[b] - counts[a];
  });
  SortedWordsString.forEach(function (word) {
    Logger.log(word + ' (' + counts[word] + ')');
  });
}
/**
 * Splits text into lower-case words.
 *
 * Spaces are first turned into commas (by splitting and re-joining), then
 * punctuation, commas included, is replaced by spaces and runs of whitespace
 * are collapsed before the final split. Trailing punctuation therefore leaves
 * an empty string at the end of the result.
 *
 * @param {*} input Value whose toString() text is split.
 * @return {Array<string>} The words in their original order.
 */
function WORDS(input) {
  var commaJoined = input.toString().split(" ").toString();
  // Logger.log(commaJoined);
  return commaJoined
    .replace(/[.,\/#!$%\?^&\*;:{}=\-_`~()]/g, " ")
    .replace(/\s{2,}/g, " ")
    .toLowerCase()
    .split(" ");
}
If I could only get a list of words (in uppercase, longer than 3 characters), sorted by the number of their appearances in the logger, it would help me a lot:
HOME (2)
AND (1)
...
Thank you.
Flow:
Transform the string to upper case and sanitize the string of all non ascii characters
After splitting the string to word array, reduce the array to a object of word:count
Map the reduced object to a 2D array [[word,count of this word],[..],...] and sort the array by the inner array's count.
Snippet:
/**
 * Counts the words of at least three letters in a string.
 *
 * The text is upper-cased, everything except A-Z and spaces is removed, and
 * the remaining words are tallied.
 *
 * @param {string=} str Text to analyse; a sample sentence is used when it is
 *     missing or empty.
 * @return {Array<Array>} [word, count] pairs, highest count first.
 */
function wordCount(str) {
  var text = str || 'I came **home**. And while at **home** I did this and that.';
  var tokens = text
    .toUpperCase() //'I CAME **HOME**...'
    .replace(/[^A-Z ]/g, '') //'I CAME HOME...'
    .split(' '); //['I', 'CAME',..]

  var countObj = {}; //{HOME:2,DID:1}
  for (var i = 0; i < tokens.length; i++) {
    var token = tokens[i];
    if (token.length < 3) {
      continue;
    }
    if (countObj[token]) {
      countObj[token] += 1;
    } else {
      countObj[token] = 1;
    }
  }

  var pairs = []; //[['HOME',2],['CAME',1],...]
  Object.keys(countObj).forEach(function (key) {
    pairs.push([key, countObj[key]]);
  });
  pairs.sort(function (left, right) {
    return right[1] - left[1];
  });
  return pairs;
}
console.info(wordCount());
To read and practice:
Object
Array methods
This is a combination of TheMaster answer and some of my work. I need to learn more about the way he did it so I spent some learning time today. This function eliminates some problems I was having the carriage returns and it also removes items that only appear once. You should probably pick TheMasters solution as I couldn't have done it without his work.
/**
 * Shows, in a sidebar, every word that occurs more than once in the document
 * body together with its count, most frequent first.
 *
 * Line breaks are treated as spaces, only the letters A-Z are kept, and words
 * shorter than two letters are ignored.
 */
function getDuplicateWords() {
  var bodyText = DocumentApp.getActiveDocument().getBody().getText();
  var tokens = bodyText
    .toUpperCase()
    .replace(/\n/g, ' ')
    .replace(/[^A-Z ]/g, '')
    .split(' ');

  var countObj = {};
  tokens.forEach(function (token) {
    if (token.length >= 2) {
      countObj[token] = countObj[token] ? countObj[token] + 1 : 1;
    }
  });

  // Keep only the repeated words, then order them by descending count.
  var oA = Object.keys(countObj)
    .filter(function (key) {
      return countObj[key] > 1;
    })
    .map(function (key) {
      return [key, countObj[key]];
    })
    .sort(function (left, right) {
      return right[1] - left[1];
    });

  var sidebar = HtmlService.createHtmlOutput(oA.join("<br />"));
  DocumentApp.getUi().showSidebar(sidebar);
}
/**
 * Adds a "MyMenu" menu to the document UI. Its "Get Duplicates" entry runs
 * getDuplicateWords.
 */
function onOpen() {
  var ui = DocumentApp.getUi();
  var menu = ui.createMenu('MyMenu');
  menu.addItem('Get Duplicates', 'getDuplicateWords').addToUi();
}
And yes I was having problems with get the results to change in my last solution.

Google Script for Gmail not consistent

I have a filter that adds the "unprocessed" label on all incoming emails.
Then a Google Script searches every minute for any email threads that have the "unprocessed" label, processes the messages, and conditionally apply a label to the corresponding thread.
I don't know what I have done wrong, but only SOME of the processed threads get the label. And it works randomly... For example only 3 out of 6 threads got the label, or 1 out of 3.
I have to re-apply the "unprocessed" label, and just run the script again to fix them.
/**
 * Processes every thread labelled "unprocessed".
 *
 * A thread gets the "Scanner" label and is archived when any of its messages
 * has an X-Mailer header mentioning "Canon MFP". The "unprocessed" label is
 * removed from every thread that was looked at. The number of messages and
 * the elapsed time are logged at the end.
 */
function processGmail() {
  var startTime = new Date().getTime();
  var scannerLabel = GmailApp.getUserLabelByName("Scanner");
  var unprocessedLabel = GmailApp.getUserLabelByName("unprocessed");
  var countMessages = 0;
  GmailApp.search("label:unprocessed").forEach(function (emailThread) {
    emailThread.getMessages().forEach(function (message) {
      // String.match() with a fresh /g literal is stateless. A shared regex
      // driven by exec() keeps its lastIndex after an early break, so the
      // next message would be scanned from the middle and headers missed.
      var mailerHeaders = message.getRawContent().match(/X-Mailer:(.*)/g) || [];
      var isFromScanner = mailerHeaders.some(function (header) {
        return header.indexOf('Canon MFP') >= 0;
      });
      if (isFromScanner) {
        emailThread.addLabel(scannerLabel);
        emailThread.moveToArchive();
      }
      emailThread.removeLabel(unprocessedLabel);
      ++countMessages;
    });
  });
  var endTime = new Date().getTime();
  Logger.log("Processed " + countMessages + " in " + (endTime - startTime) + "ms.");
}
Turns out the bug was Javascript related.
I had forgotten that the regex.exec needs to be looped until a null is returned, only then it will start a-new for a new input.
The fix was removing break :)

Scrape site to report css selector occurrence in HTML

I want to see how much of my team's code has been integrated into a large scale site.
I believe I can achieve this (albeit roughly), by getting statistics on the number of occurrences certain CSS selectors appear across all the HTML pages. I have some unique CSS class selectors that I would like to use when scraping the site to analyze:
On how many pages the selector occurs.
On any page it does, how many times.
I've looked around but can't find any tools. Does anyone know of any, or could you suggest any ideas that may help me achieve this quickly?
Thanks in advance.
Thanks to everyone for their advice.
In the end I decided that there was no one tool that could help me gather the statistics in the way I described so I already started to build up the application I needed in Node. Although I've not used Node before I've found it quick to grasp with an intermediate knowledge of Javascript.
For anyone looking to do the same:
I've used Simplecrawler to run over the site and Cheerio to find selectors and from this I can create a simple report created in Json using FS.
I'd recommend using Google Apps Script. You can crawl the site's pages and count the CSS selector occurrences with a regex. Modify the following code to search each page for the CSS selector. The code explanation is here.
Code
/**
 * Adds a "New scrape web docs" menu to the document UI. Its "Enter Url" entry
 * runs showPrompt.
 */
function onOpen() {
  var ui = DocumentApp.getUi();
  var menu = ui.createMenu('New scrape web docs');
  menu.addItem('Enter Url', 'showPrompt').addToUi();
}
/**
 * Asks for a website URL and scrapes that page plus every inner page reachable
 * from it into the current document.
 *
 * The first page clears the document; later pages are appended. Inner links
 * are appended to the entered URL and each one is visited once.
 */
function showPrompt() {
  var ui = DocumentApp.getUi();
  var result = ui.prompt(
    'Scrape whole website into text!',
    'Please enter website url (with http(s)://):',
    ui.ButtonSet.OK_CANCEL);

  // Only continue when the user clicked "OK".
  if (result.getSelectedButton() != ui.Button.OK) {
    return;
  }
  var base_url = result.getResponseText();
  if (!isValidURL(base_url)) {
    ui.alert('Your url is not valid.');
    return;
  }

  var links = [];                  // inner links still waiting to be scraped
  var processed_urls = [base_url]; // links that were already scraped

  // Queues links that are neither waiting nor processed. A missing result
  // (scrapeAndPaste failed) is treated as "no links", so that undefined is
  // never put on the queue.
  var enqueue = function (found) {
    (found || []).forEach(function (item) {
      if (links.indexOf(item) === -1 && processed_urls.indexOf(item) === -1) {
        links.push(item);
      }
    });
  };

  // first run clears the document and gathers the initial links
  enqueue(scrapeAndPaste(base_url, 1));
  while (links.length) {
    var link = links.shift(); // take the oldest waiting link
    processed_urls.push(link);
    // second and consecutive runs do not clear the document
    enqueue(scrapeAndPaste(base_url + link, 0));
  }
}
/**
 * Fetches one page, writes its text into the current document and returns the
 * inner links found on it.
 *
 * On failure an error report is mailed to the active user, a placeholder line
 * is written to the document and an empty array is returned, so callers can
 * always treat the result as a list.
 *
 * @param {string} url Page to fetch.
 * @param {*} clear Truthy to clear the document before writing.
 * @return {Array<string>} href values that do not start with "#", "http" or
 *     "mailto:" and do not contain ".pdf".
 */
function scrapeAndPaste(url, clear) {
  try {
    var html = UrlFetchApp.fetch(url).getContentText();
    // some html pre-processing: keep only what follows </head> ...
    if (html.indexOf('</head>') !== -1) {
      html = html.split('</head>')[1];
    }
    // ... and cut everything after </body>
    if (html.indexOf('</body>') !== -1) {
      html = html.split('</body>')[0] + '</body>';
    }
    // fetch inner links
    var inner_links_arr = [];
    var linkRegExp = /href="(.*?)"/gi;
    var match;
    while ((match = linkRegExp.exec(html)) != null) {
      var href = match[1];
      if (href.indexOf('#') !== 0
          && href.indexOf('http') !== 0
          && href.indexOf('mailto:') !== 0
          && href.indexOf('.pdf') === -1) {
        inner_links_arr.push(href);
      }
    }
    var text = getTextFromHtml(html);
    outputText(url, text, clear); // output text into the current document with given url
    return inner_links_arr;
  } catch (e) {
    MailApp.sendEmail(Session.getActiveUser().getEmail(), "Scrape error report at "
      + Utilities.formatDate(new Date(), "GMT", "yyyy-MM-dd HH:mm:ss"),
      "\r\nMessage: " + e.message
      + "\r\nFile: " + e.fileName + '.gs'
      + "\r\nWeb page under scrape: " + url
      + "\r\nLine: " + e.lineNumber);
    outputText(url, 'Scrape error for this page cause of malformed html!', clear);
    // A failed page contributes no links.
    return [];
  }
}
/**
 * Parses HTML with the Xml service and returns the text of all its nodes.
 *
 * NOTE(review): this relies on the legacy Xml service, which appears to be
 * deprecated; confirm it is still available before depending on this.
 *
 * @param {string} html Markup to parse.
 * @return {string} Text collected by getTextFromNode.
 */
function getTextFromHtml(html) {
  var parsed = Xml.parse(html, true);
  var rootElement = parsed.getElement();
  return getTextFromNode(rootElement);
}
/**
 * Recursively collects the text below a parsed XML node.
 *
 * @param {Object} x Node whose toString() names its kind.
 * @return {string} The node's own XML string for an 'XmlText' node, the
 *     space-joined text of all child nodes for an 'XmlElement', and an empty
 *     string for anything else.
 */
function getTextFromNode(x) {
  var kind = x.toString();
  if (kind === 'XmlText') {
    return x.toXmlString();
  }
  if (kind === 'XmlElement') {
    var parts = x.getNodes().map(function (child) {
      return getTextFromNode(child);
    });
    return parts.join(' ');
  }
  return '';
}
/**
 * Writes one scraped page into the active document: a " * <url>" line styled
 * as HEADING2, followed by the page text.
 *
 * @param {string} url Address shown in the heading.
 * @param {string} text Page text to append.
 * @param {*} clear Truthy to empty the document first; otherwise a horizontal
 *     rule is appended to separate this page from the previous content.
 */
function outputText(url, text, clear) {
  var body = DocumentApp.getActiveDocument().getBody();
  if (!clear) {
    body.appendHorizontalRule();
  } else {
    body.clear();
  }
  body.appendParagraph(' * ' + url)
    .setHeading(DocumentApp.ParagraphHeading.HEADING2);
  body.appendParagraph(text);
}
/**
 * Checks whether a string looks like a URL.
 *
 * Accepted shape: optional scheme, optional "user[:password]@", a dotted host
 * ending in a 2-4 character label, and optional port, path, query and
 * fragment.
 *
 * Two corrections over the earlier pattern:
 *  - percent-escapes need real hex digits ([a-fA-F\d]); the previous class
 *    "A-f" also let through G-Z and several punctuation characters;
 *  - the user-info part ends with "@" rather than "#".
 *
 * @param {string} url Text to validate.
 * @return {boolean} true when the text matches the URL pattern.
 */
function isValidURL(url) {
  // Named urlPattern so the built-in RegExp constructor is not shadowed.
  var urlPattern = /^(([\w]+:)?\/\/)?(([\d\w]|%[a-fA-F\d]{2,2})+(:([\d\w]|%[a-fA-F\d]{2,2})+)?@)?([\d\w][-\d\w]{0,253}[\d\w]\.)+[\w]{2,4}(:[\d]+)?(\/([-+_~.\d\w]|%[a-fA-F\d]{2,2})*)*(\?(&?([-+_~.\d\w]|%[a-fA-F\d]{2,2})=?)*)?(#([-+_~.\d\w]|%[a-fA-F\d]{2,2})*)?$/;
  return urlPattern.test(url);
}