Prevent UrlFetchApp creating special entities - html

Is there any way to prevent Apps Script URlFetchApp from creating special characters? For example on a webpage there might be an apostrophe but when I look in the fetched page source that apostrophe would be written as '

Use this decode function to decode your string:
function myFunction(str) {
var str = "'<a>Content © <#>&<&#># </a>"
var decoded = decode(str);
Logger.log(decoded)
}
function decode(str) {
return str.replace(/&#(\d+);/g, function(match, dec) {
return String.fromCharCode(dec);
}).replace(/(&#x(\d+);)/g, function(match, str1) {
var hex = match.toString().match(/(\d+)/g)[0];
var str = '';
for (var n = 0; n < hex.length; n += 2) {
str += String.fromCharCode(parseInt(hex.substr(n, 2), 16));
}
return str;
});
}
This will decode the HTML entities in JavaScript. Combined both my references. Decodes ones with x prepended and the others without.
Reference:
https://stackoverflow.com/a/23270912/16132436
https://www.codegrepper.com/code-examples/javascript/hex+to+ascii+function+javascript

Related

xpath in apps script?

I made a formula to extract some Wikipedia data in Google Seets which works fine. Here is the formula:
=regexreplace(join("",flatten(IMPORTXML(D2,".//p[preceding-sibling::h2[1][contains(., 'Geography')]]"))),"\[[^\]]+\]","")&char(10)&char(10)&iferror(regexreplace(join("",flatten(IMPORTXML(D2,".//p[preceding-sibling::h2[1][contains(., 'Education')]]"))),"\[[^\]]+\]",""))
Where D2 is a URL like https://en.wikipedia.org/wiki/Abbeville,_Alabama
This extracts some Geography and Education data from the Wikipedia page. Trouble is that importxml only runs a few times before it dies due to quota.
So I thought maybe better to use Apps Script where there are much higher limits on fetching and parsing. I could not see a good way however of using Xpath in Apps Script. Older posts on the web discuss using a deprecated service called Xml but it seems to no longer work. There is a Service called XmlService which looks like it may do the job but you can't just plug in an Xpath. It looks like a lot of sweating to get to the result. Any solutions out there where you can just plug in Xpath?
Here is an alternative solution I actually do in a case like this.
I have used XmlService but only for parsing the content, not for using Xpath. This makes use of the element tags and so far pretty consistent on my tests. Although, it might need tweaks when certain tags are in the result and you might have to include them into the exclusion condition.
Tested the code below in both links:
https://en.wikipedia.org/wiki/Abbeville,_Alabama#Geography
https://en.wikipedia.org/wiki/Montgomery,_Alabama#Education
My test shows that the formula above used did not return the proper output from the 2nd link while the code does. (Maybe because it was too long)
Code:
function getGeoAndEdu(path) {
var data = UrlFetchApp.fetch(path).getContentText();
// wikipedia is divided into sections, if output is cut, increase the number
var regex = /.{1,100000}/g;
var results = [];
// flag to determine if matches should be added
var foundFlag = false;
do {
m = regex.exec(data);
if (foundFlag) {
// if another header is found during generation of data, stop appending the matches
if (matchTag(m[0], "<h2>"))
foundFlag = false;
// exclude tables, sub-headers and divs containing image description
else if(matchTag(m[0], "<div") || matchTag(m[0], "<h3") ||
matchTag(m[0], "<td") || matchTag(m[0], "<th"))
continue;
else
results.push(m[0]);
}
// start capturing if either IDs are found
if (m != null && (matchTag(m[0], "id=\"Geography\"") ||
matchTag(m[0], "id=\"Education\""))) {
foundFlag = true;
}
} while (m);
var output = results.map(function (str) {
// clean tags for XmlService
str = str.replace(/<[^>]*>/g, '').trim();
decode = XmlService.parse('<d>' + str + '</d>')
// convert html entity codes (e.g.  ) to text
return decode.getRootElement().getText();
// filter blank results due to cleaning and empty sections
// separate data and remove citations before returning output
}).filter(result => result.trim().length > 1).join("\n").replace(/\[\d+\]/g, '');
return output;
}
// check if tag is found in string
function matchTag(string, tag) {
var regex = RegExp(tag);
return string.match(regex) && string.match(regex)[0] == tag;
}
Output:
Difference:
Formula ending output
Script ending output
Education ending in wikipedia
Note:
You still have quota when using UrlFetchApp but should be better than IMPORTXML's limit depending on the type of your account.
Reference:
Apps Script Quotas
Sorry I got very busy this week so I didn't reply. I took a look at your answer which seems to work fine, but it was quite code heavy. I wanted something I would understand so I coded my own solution. not that mine is any simpler. It's just my own code so it's easier for me to follow:
function getTextBetweenTags(html, paramatersInFirstTag, paramatersInLastTag) { //finds text values between 2 tags and removes internal tags to leave plain text.
//eg getTextBetweenTags(html,[['class="mw-headline"'],['id="Geography"']],[['class="wikitable mw-collapsible mw-made-collapsible"']])
// **Note: you may want to replace &#number; with ascII number
var openingTagPos = null;
var closingTagPos = null;
var previousChar = '';
var readingTag = false;
var newTag = '';
var tagEnd = false;
var regexFirstTagParams = [];
var regexLastTagParams = [];
//prepare regexes to test for parameters in opening and closing tags. put regexes in arrays so each condition can be tested separately
for (var i in paramatersInFirstTag) {
regexFirstTagParams.push(new RegExp(escapeRegex(paramatersInFirstTag[i][0])))
}
for (var i in paramatersInLastTag) {
regexLastTagParams.push(new RegExp(escapeRegex(paramatersInLastTag[i][0])))
}
var startTagIndex = null;
var endTagIndex = null;
var matches = 0;
for (var i = 0; i < html.length - 1; i++) {
var nextChar = html.substr(i, 1);
if (nextChar == '<' && previousChar != '\\') {
readingTag = true;
}
if (nextChar == '>' && previousChar != '\\') { //if end of tag found, check tag matches start or end tag
readingTag = false;
newTag += nextChar;
//test for firstTag
if (startTagIndex == null) {
var alltestsPass = true;
for (var j in regexFirstTagParams) {
if (!regexFirstTagParams[j].test(newTag)) alltestsPass = false;
}
if (alltestsPass) {
startTagIndex = i + 1;
//console.log('Start Tag',startTagIndex)
matches++;
}
}
//test for lastTag
else if (startTagIndex != null) {
var alltestsPass = true;
for (var j in regexLastTagParams) {
if (!regexLastTagParams[j].test(newTag)) alltestsPass = false;
}
if (alltestsPass) {
endTagIndex = i + 1;
matches++;
}
}
if(startTagIndex && endTagIndex) break;
newTag = '';
}
if (readingTag) newTag += nextChar;
previousChar = nextChar;
}
if (matches < 2) return 'No matches';
else return html.substring(startTagIndex, endTagIndex).replace(/<[^>]+>/g, '');
}
function escapeRegex(string) {
if (string == null) return string;
return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
}
My function requires an array of attributes for the start tag and an array of attributes for the end tag. It gets any text in between and removes any tags found inbetween. One issue I also noticed was there were often special characters (eg  ) so they need to be replaced. I did that outside the scope of the function above.
The function could be easily improved to check the tag type (eg h2), but it wasn't necessary for the wikipedia case.
Here is a function where I called the above function. the html variable is just the result of UrlFetchApp.fetch('some wikipedia city url').getContextText();
function getWikiTexts(html) {
var geography = getTextBetweenTags(html, [['class="mw-headline"'], ['id="Geography']], [['class="mw-headline"']]);
var economy = getTextBetweenTags(html, 'span', [['class="mw-headline"'], ['id="Economy']], 'span', [['class="mw-headline"']])
var education = getTextBetweenTags(html, 'span', [['class="mw-headline"'], ['id="Education']], 'span', [['class="mw-headline"']])
var returnString = '';
if (geography != 'No matches' && !/Wikipedia/.test(geography)) returnString += geography + '\n';
if (economy != 'No matches' && !/Wikipedia/.test(economy)) returnString += economy + '\n';
if (education != 'No matches' && !/Wikipedia/.test(education)) returnString += education + '\n';
return returnString
}
Thanks for posting your answer.

Script that would find and mark the same words in the paragraph

I'm a fiction writer and I used to do my writing in MS Word. I've written some macros to help me edit the fiction text and one of them check the paragraph and marks (red) the duplicate (or triplicate words, etc). Example:
"I came **home**. And while at **home** I did this and that."
Word "home" is used twice and worth checking if I really can't change the sentence.
Now I mostly use google documents for writing, but I still have to do my editing in MS Word, mostly just because of this macro - I am not able to program it in the google script.
function PobarvajBesede() {
var doc = DocumentApp.getActiveDocument();
var cursor = DocumentApp.getActiveDocument().getCursor();
var surroundingText = cursor.getSurroundingText().getText();
var WordsString = WORDS(surroundingText);
Logger.log(WordsString);
//so far, so good. But this doesn't work:
var SortedWordsString = SORT(WordsString[1],1,False);
// and I'm lost.
}
function WORDS(input) {
var input = input.toString();
var inputSplit = input.split(" ");
// Logger.log(inputSplit);
inputSplit = inputSplit.toString();
var punctuationless = inputSplit.replace(/[.,\/#!$%\?^&\*;:{}=\-_`~()]/g," ");
var finalString = punctuationless.replace(/\s{2,}/g," ");
finalString = finalString.toLowerCase();
return finalString.split(" ") ;
}
If I could only get a list of words (in uppercase, longer than 3 characters), sorted by the number of their appearances in the logger, it would help me a lot:
HOME (2)
AND (1)
...
Thank you.
Flow:
Transform the string to upper case and sanitize the string of all non ascii characters
After splitting the string to word array, reduce the array to a object of word:count
Map the reduced object to a 2D array [[word,count of this word],[..],...] and sort the array by the inner array's count.
Snippet:
function wordCount(str) {
str = str || 'I came **home**. And while at **home** I did this and that.';
var countObj = str
.toUpperCase() //'I CAME **HOME**...'
.replace(/[^A-Z ]/g, '') //'I CAME HOME...'
.split(' ') //['I', 'CAME',..]
.reduce(function(obj, word) {
if (word.length >= 3) {
obj[word] = obj[word] ? ++obj[word] : 1;
}
return obj;
}, {}); //{HOME:2,DID:1}
return Object.keys(countObj)
.map(function(word) {
return [word, countObj[word]];
}) //[['HOME',2],['CAME',1],...]
.sort(function(a, b) {
return b[1] - a[1];
});
}
console.info(wordCount());
To read and practice:
Object
Array methods
This is a combination of TheMaster answer and some of my work. I need to learn more about the way he did it so I spent some learning time today. This function eliminates some problems I was having the carriage returns and it also removes items that only appear once. You should probably pick TheMasters solution as I couldn't have done it without his work.
function getDuplicateWords() {
var str=DocumentApp.getActiveDocument().getBody().getText();
var countObj = str
.toUpperCase()
.replace(/\n/g,' ')
.replace(/[^A-Z ]/g, '')
.split(' ')
.reduce(function(obj, word) {
if (word.length >= 2) {
obj[word] = obj[word] ? ++obj[word] : 1;
}
return obj;
}, {});
var oA=Object.keys(countObj).map(function(word){return [word, countObj[word]];}).filter(function(elem){return elem[1]>1;}).sort(function(a,b){return b[1]-a[1]});
var userInterface=HtmlService.createHtmlOutput(oA.join("<br />"));
DocumentApp.getUi().showSidebar(userInterface);
}
function onOpen() {
DocumentApp.getUi().createMenu('MyMenu')
.addItem('Get Duplicates','getDuplicateWords' )
.addToUi();
}
And yes I was having problems with get the results to change in my last solution.

Hash txt strings in GAS, incorrect line ending

I want hash (md5) some txt strings in GAS, and have a problem, may be
incorrect line ending.
Example:
test
test
correct hash 76ce9f441de2ed5de337d391ad4516b7
using GAS i getting wrong hash: e8230113fbba92427c1c41cf34a80c13
function test() {
var data = 'test\
test';
Logger.log(data.MD5());
return (data.MD5());
}
String.prototype.MD5 = function(charset, toByte) {
charset = charset || Utilities.Charset.UTF_8;
var digest = Utilities.computeDigest(Utilities.DigestAlgorithm.MD5, this, charset);
if (toByte) return digest;
var __ = '';
for (i = 0; i < digest.length; i++) {
var byte = digest[i];
if (byte < 0) byte += 256;
var bStr = byte.toString(16);
if (bStr.length == 1) bStr = '0' + bStr;
__ += bStr;
}
return __;
}
As already #Ameen mentioned you are checking different strings
function test(){
var s1 = 'test\ntest';
var s2 = 'test\r\ntest';
Logger.log(s1.MD5() === '76ce9f441de2ed5de337d391ad4516b7');
Logger.log(s2.MD5() === '76ce9f441de2ed5de337d391ad4516b7');
}
[19-03-22 18:03:11:441 MSK] false
[19-03-22 18:03:11:442 MSK] true
A string containing "\r\n" for non-Unix platforms, or a string containing "\n" for Unix platforms.
It seems you're working under Windows.

Understanding ES6 tagged template literal

Following code snippet is used on Mozilla (https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Template_literals) to explain Tagged Template literal, please help me understand what following function is doing, i am unable to get the actual flow of the function, since they have used keys.foreach and when i inspected in Chrome, keys was a function, so not able to understand
function template(strings, ...keys) {
return (function(...values) {
var dict = values[values.length - 1] || {};
var result = [strings[0]];
keys.forEach(function(key, i) {
var value = Number.isInteger(key) ? values[key] : dict[key];
result.push(value, strings[i + 1]);
});
return result.join('');
});
}
var t1Closure = template`${0}${1}${0}!`;
t1Closure('Y', 'A'); // "YAY!"
var t2Closure = template`${0} ${'foo'}!`;
t2Closure('Hello', {foo: 'World'}); // "Hello World!"
Most of the complexity in the example comes from the overloaded function and the forEach invocation, not from the tagged template literals. It might better have been written as two separate cases:
function dictionaryTemplate(strings, ...keys) {
return function(dict) {
var result = "";
for (var i=0; i<keys.length; i++)
result += strings[i] + dict[keys[i]];
result += strings[i];
return result;
};
}
const t = dictionaryTemplate`${0} ${'foo'}!`;
t({0: 'Hello', foo: 'World'}); // "Hello World!"
function argumentsTemplate(strings, ...keys) {
is (!keys.every(Number.isInteger))
throw new RangeError("The keys must be integers");
return function(...values) {
var result = "";
for (var i=0; i<keys.length; i++)
result += strings[i] + values[keys[i]];
result += strings[i];
return result;
};
}
const t = argumentsTemplate`${0}${1}${0}!`;
t('Y', 'A'); // "YAY!"
Template is a custom function defined by us to parse the template string, whenever a function is used to parse the template stringThe first argument of a tag function contains an array of string values. The remaining arguments are related to the expressions. so here specifically we have written the function to that given output I had got confused because when in inspected keys inside the forEach, i got a function in console, but inspecting the function before forEach gave keys as the array of configurable string ${0} and ${1} in first example

GoogleScript Spreadsheet Custom Function Handling a range of cells and getting their values

I have a Goggle Spreadsheet with some data, and I want to write a custom function to use in the sheet, which accepts a range of cells and a delimiter character, takes each cell value, splits it by the delimiter, and counts the total.
For example
Column A has the following values in rows 1-3: {"Sheep","Sheep,Dog","Cat"}
My function would be called like this: =CountDelimitedValues(A1:A3;",");
It should return the value: 4 (1+2+1)
The problem I am having is in my custom script I get errors like
"TypeError: cannot get function GetValues from type Sheep"
This is my current script:
function CountArrayList(arrayList, delimiter) {
var count = 0;
//for (i=0; i<array.length; i++)
//{
//count += array[i].split(delimiter).length;
//}
var newArray = arrayList.GetValues();
return newArray.ToString();
//return count;
}
I understand that the parameter arraylist is receiving an array of objects from the spreadsheet, however I don't know how to get the value out of those objects, or perhaps cast them into strings.
Alternatively I might be going about this in the wrong way? I have another script which extracts the text from a cell between two characters which works fine for a single cell. What is it about a range of cells that is different?
That's something you can achieve without using script but plain old formula's:
=SUM(ARRAYFORMULA(LEN(A1:A3)-LEN(SUBSTITUTE(A1:A3; ","; "")) + 1))
Credit goes here: https://webapps.stackexchange.com/q/37744/29140
something like this works :
function CountArrayList(arrayList) {
return arrayList.toString().split(',').length
}
wouldn't it be sufficient ?
edit Oooops, sorry I forgot the user defined delimiter, so like this
function CountArrayList(arrayList,del) {
return arrayList.toString().split(del).length
}
usage : =CountArrayList(A1:C1;",")
NOTE : in this example above it would be dangerous to use another delimiter than "," since the toString() joins the array elements with commas... if you really need to do so try using a regex to change the commas to what you use and apply the split on that.
try like this :
function CountArrayList(arrayList,del) {
return arrayList.toString().replace(/,/g,del).split(del).length
}
Another solution I have was that I needed to implicitly cast the objects in the array being passed as a string.
For example this function accepts the array of cells, and outputs their contents as a string with del as the delimiter (similar to the String.Split() function). Note the TrimString function and that it is being passed an element of the array.
function ArrayToString(array,del) {
var string = "";
for (i=0; i < array.length; i++) {
if (array[i] != null) {
var trimmedString = TrimString(array[i]);
if (trimmedString != "") {
if (string.length > 0) {
string += del;
}
string += trimmedString;
}
}
}
return string;
}
Below is the TrimString function.
function TrimString(string) {
var value = "";
if (string != "" && string != null) {
var newString = "";
newString += string;
var frontStringTrimmed = newString.replace(/^\s*/,"");
var backStringTrimmed = frontStringTrimmed.replace(/\s*$/,"");
value = backStringTrimmed;
}
return value;
}
What I found is that this code threw a TypeError unless I included the declaration of the newString variable, and added the array element object to it, implicitly casting the array element object as a string. Otherwise the replace() functions could not be called.