When you set the default view in admin settings, and you have multiple views checked to be available for user to switch, the default view doesn't get highlighted as "active".
To achieve this, in previous versions of Telescope (before RefactorScope) I modified the file client/components/menu/menu_component.js adding this (I had TOP as my default view, so I shamefully hardcoded it):
if (currentPath === "/" && getRoute(this) === "/top") {
itemClass += " item-active"
}
I understand, that editing Telescope source files was not wise thing to do, but it was the quickest and simplest solution.
Now, after the big Telescope refactor, I would like to do it the right way.
But the ultimate question is: what is the right way to do this?
After some digging in Telescope source, I came to this solution:
Create a file named i.e. custom_view_menu.js in your custom package with this content:
/**
 * Resolve the path a menu item points to.
 *
 * @param {Object} item - menu item; `item.route` is either a function that
 *   returns the path or a value handed to `Router.path()`.
 * @returns {*} the result of `item.route()` or of `Router.path(item.route)`.
 */
getRoute = function (item) {
  if (typeof item.route === "function") {
    // The item computes its own path; call it as a method so `this` stays the item.
    return item.route();
  }
  return Router.path(item.route);
};
Template.menuItem.helpers({
  /**
   * Build the CSS class string for a menu item.
   *
   * Each fragment is prefixed with a space and the fragments are concatenated
   * in this order: admin marker, active marker for a route match, active
   * marker for the default view on "/", and finally the item's own class.
   *
   * @returns {string} the accumulated class string (may be empty).
   */
  itemClass: function () {
    var fragments = [];
    var currentPath = Router.current().location.get().path;

    if (this.adminOnly) {
      fragments.push(" item-admin");
    }

    // The item is current when its route equals the path, either directly or
    // as an absolute URL. substr(1) drops the leading "/" so the absolute URL
    // does not contain two slashes in a row.
    var isCurrent = getRoute(this) === currentPath;
    if (!isCurrent) {
      isCurrent = getRoute(this) === Meteor.absoluteUrl() + currentPath.substr(1);
    }
    if (isCurrent) {
      fragments.push(" item-active");
    }

    // On the root path, also highlight the view configured as the default.
    if (this.label === Settings.get("defaultView") && currentPath === "/") {
      fragments.push(" item-active");
    }

    if (this.itemClass) {
      fragments.push(" " + this.itemClass);
    }

    return fragments.join("");
  }
});
It's essentially copied from the original source (https://github.com/TelescopeJS/Telescope/blob/master/packages/telescope-core/lib/client/templates/menu/menu_component.js) with this snippet added:
if (this.label === Settings.get("defaultView") && currentPath === "/") {
itemClass += " item-active";
}
I hope that it will help someone and I wasn't the only one trying to do this :)
Related
I'm using AngularJS ver. 1.2.15 on my project. And, I have a select element on one of my views as per below:
<select class="select-white form-control form-select" id="cat2_{{feed.id}}" ng-model="feed.operationstatusid" ng-change="updateCategoryAndStatus(feed, true)"></select>
And, I'm feeding this element like this:
/**
 * Rebuild the option list of the select element `#cat2_<feed.id>` and pick its value.
 *
 * Reads `$scope.category2`, `$scope.catTitleRulesTwo` and `lang.SelectSubCategory`
 * from the enclosing scope and manipulates the DOM through jQuery.
 *
 * @param {number} cat1Id - when greater than -1 the full option list is built;
 *   otherwise only the placeholder option is appended.
 * @param {Object} feed - the code reads `id`, `operationstatusid` and `title`;
 *   `operationstatusid` is overwritten when a title rule selects a status.
 *
 * NOTE(review): the options are appended with jQuery while the select is bound
 * with ng-model; Angular is presumably unaware of these options -- confirm
 * whether that explains the value being reset after a digest.
 */
function SetCategory2(cat1Id, feed) {
var feedId = feed.id;
// Status id chosen by a title rule; stays -1 when no rule picked one.
var fromRuleOpStatusId = -1;
// Start from an empty option list.
$('#cat2_' + feedId).find('option').remove();
if (cat1Id > -1) {
// Placeholder option (value 0) followed by one option per entry of $scope.category2.
$('#cat2_' + feedId).append($('<option></option>').text(lang.SelectSubCategory).val(0));
$.each($scope.category2, function (index, cat2Item) {
$('#cat2_' + feedId).append($('<option></option>').text(cat2Item.statusdescription).val(cat2Item.id));
});
var isselected = false;
$.each($scope.category2, function (index, cat2Item) {
// An item equal to the feed's current status is selected directly.
if (feed.operationstatusid == cat2Item.id) {
$('#cat2_' + feedId).val(cat2Item.id);
fromRuleOpStatusId = -1;
isselected = true;
}
else {
// Otherwise try the title rules against the lower-cased feed title.
var feedStr = "";
if (feed.title != undefined && feed.title != null) {
feedStr = feed.title.toLowerCase();
}
if ($scope.catTitleRulesTwo) {
$.each($scope.catTitleRulesTwo, function (r_index, r_item) {
// Only rules whose titleCode equals this item are considered, and only while nothing is selected yet.
if (cat2Item.id == r_item.titleCode && !isselected) {
if (feedStr != undefined && feedStr != null && r_item != undefined && r_item != null) {
// NOTE(review): this patches String.prototype on every pass through the loop.
String.prototype.contains = function (str) { return this.toLowerCase().indexOf(str) !== -1; };
// `text` is not read by the visible code; presumably the evaluated rule expressions refer to it -- confirm.
var text = feedStr;
// NOTE(review): eval executes r_item.ruleexpression as code and can see every local name in scope,
// so renaming locals here may break stored rules; the origin of the expressions should be trusted.
if (eval(r_item.ruleexpression)) {
$('#cat2_' + feedId).val(cat2Item.id);
fromRuleOpStatusId = cat2Item.id;
isselected = true;
}
}
}
});
}
}
});
// Persist a rule-derived status on the feed object.
if (fromRuleOpStatusId != -1) {
feed.operationstatusid = fromRuleOpStatusId;
}
}
else {
// No first-level category: only the placeholder option is offered.
$('#cat2_' + feedId).append($('<option></option>').text(lang.SelectSubCategory).val(0));
}
}
I am aware of the facts about eval function, but the project I'm working on is quite old, so does the code. Anyway, this is about business logic and quite irrelevant with the thing I'm going to ask (or so I was thinking).
As you can see I'm appending all the options before I set the value of the selectbox with using .val(...). I have also checked that values do match along with the data types. But, when I observe this function step by step, I saw that selected value does show up without flaw. After the code finish with my above mentioned function (SetCategory2), code goes through on of the function located on AngularJS file, named xhr.onreadystatechange. It's not a long function, so I'm sharing it also on below.
// Excerpt quoted from the AngularJS source. `xhr`, `status`, `ABORTED`,
// `callback` and `completeRequest` are defined outside this excerpt.
xhr.onreadystatechange = function() {
// Only act once the request has reached readyState 4.
if (xhr && xhr.readyState == 4) {
var responseHeaders = null,
response = null;
// Headers and body are read only when the request was not aborted;
// otherwise both stay null.
if(status !== ABORTED) {
responseHeaders = xhr.getAllResponseHeaders();
// Use `xhr.response` when the property exists, else fall back to `responseText`.
response = ('response' in xhr) ? xhr.response : xhr.responseText;
}
// Hand the outcome to completeRequest; `status` takes precedence over xhr.status when truthy.
completeRequest(callback,
status || xhr.status,
response,
responseHeaders);
}
};
After the code released from this function, respective selectbox's value is pointed at the empty option.
I have run into topics which talks about this behaviour might due to invalid option-value match, but as I described above, I append all my options before deciding the value. So, I can't figure out what I'm missing.
Thank you in advance.
I have a simple VIN Decoder script that I built for my Vehicle DB Sheet. I want to allow other sheets to use the functions defined in the script without copying the code to the script containers for each spreadsheet. I guess I essentially want to have a private (to my account or domain) add-on. I have tried reading about how to deploy an add-on to Google Workplace but all the tutorials are either old or just provide sample code that doesn't answer how to do it. I am sure this is not a huge project to deploy this code as an add-on. Anyone?
Here is the code I am trying to deploy...
// Base URL of the NHTSA vPIC API; it already ends with a slash.
const nhtsaGateway = 'https://vpic.nhtsa.dot.gov/api/';
// DecodeVin endpoint, relative to the gateway. It has no leading slash so that
// gateway + endpoint does not produce "api//vehicles".
const nhtsaVINDecode = 'vehicles/DecodeVin/';
/**
 * Decode a VIN through the NHTSA endpoint built from `nhtsaGateway` and
 * `nhtsaVINDecode`, logging the response as it is inspected.
 *
 * @param {string} [theVIN] - VIN to decode; when omitted a built-in test VIN is used.
 * @param {string} [theVariable] - value of the `Variable` field to look for in
 *   the response's `Results`; when omitted the whole parsed response is returned.
 * @returns {*} the parsed response (no variable requested), the `Value` of the
 *   first matching result, or undefined when no result matches.
 */
function decodeVIN(theVIN, theVariable) {
  var vin = theVIN;
  if (typeof vin === 'undefined') {
    vin = 'WD4PF0CD3KP053982';
    Logger.log('No VIN Submitted -- Assuming this is a test\nUsing Test VIN = [' + vin + ']');
  }

  var reply = UrlFetchApp.fetch(nhtsaGateway + nhtsaVINDecode + vin + '?format=JSON');
  var decoded = JSON.parse(reply.getContentText());
  Logger.log(decoded.Message);

  if (typeof theVariable === 'undefined') {
    Logger.log(decoded);
    return decoded;
  }

  // Walk the results in order, logging every entry visited, and stop at the
  // first one whose Variable equals the requested name.
  var hit = decoded.Results.find(function (entry, position) {
    Logger.log('<<<' + position + '>>>');
    Logger.log(entry.Value);
    Logger.log(entry.ValueId);
    Logger.log(entry.Variable);
    Logger.log(entry.VariableId);
    var isWanted = entry.Variable === theVariable;
    if (isWanted) {
      Logger.log('Found theVariable = ' + entry.Variable);
    }
    return isWanted;
  });

  if (hit === undefined) {
    Logger.log('We should not be here --> ' + theVariable + ' <-- is not defined in the NHTSA response.');
    return;
  }
  Logger.log(hit.Value);
  return hit.Value;
}
// Thin wrappers around decodeVIN, one per NHTSA variable, so each value can be
// requested by name (for example from a spreadsheet formula).

/** @returns the 'Model Year' value decoded for theVIN. */
function vinYear(theVIN) { return decodeVIN(theVIN, 'Model Year'); }

/** @returns the 'Make' value decoded for theVIN. */
function vinMake(theVIN) { return decodeVIN(theVIN, 'Make'); }

/** @returns the 'Series' value decoded for theVIN. */
function vinSeries(theVIN) { return decodeVIN(theVIN, 'Series'); }

/** @returns the 'Model' value decoded for theVIN. */
function vinModel(theVIN) { return decodeVIN(theVIN, 'Model'); }

/**
 * @returns the 'Gross Vehicle Weight Rating From' value decoded for theVin.
 * The VIN passed by the caller is used; previously a hard-coded VIN was
 * decoded regardless of the argument.
 */
function vinGVWR(theVin) { return decodeVIN(theVin, 'Gross Vehicle Weight Rating From'); }
So the usage in the target spreadsheet would be this formula in a cell
=vinModel("1FTYR2CM2KKB15306")
you don't need to make an addon or an extension, just a library:
https://developers.google.com/apps-script/guides/libraries
Try adding the name of the library in front of your function after adding the library, e.g.
Mylibrary.decodeVIN()
I made a formula to extract some Wikipedia data in Google Sheets which works fine. Here is the formula:
=regexreplace(join("",flatten(IMPORTXML(D2,".//p[preceding-sibling::h2[1][contains(., 'Geography')]]"))),"\[[^\]]+\]","")&char(10)&char(10)&iferror(regexreplace(join("",flatten(IMPORTXML(D2,".//p[preceding-sibling::h2[1][contains(., 'Education')]]"))),"\[[^\]]+\]",""))
Where D2 is a URL like https://en.wikipedia.org/wiki/Abbeville,_Alabama
This extracts some Geography and Education data from the Wikipedia page. Trouble is that importxml only runs a few times before it dies due to quota.
So I thought maybe better to use Apps Script where there are much higher limits on fetching and parsing. I could not see a good way however of using Xpath in Apps Script. Older posts on the web discuss using a deprecated service called Xml but it seems to no longer work. There is a Service called XmlService which looks like it may do the job but you can't just plug in an Xpath. It looks like a lot of sweating to get to the result. Any solutions out there where you can just plug in Xpath?
Here is an alternative solution I actually do in a case like this.
I have used XmlService but only for parsing the content, not for using Xpath. This makes use of the element tags and so far pretty consistent on my tests. Although, it might need tweaks when certain tags are in the result and you might have to include them into the exclusion condition.
Tested the code below in both links:
https://en.wikipedia.org/wiki/Abbeville,_Alabama#Geography
https://en.wikipedia.org/wiki/Montgomery,_Alabama#Education
My test shows that the formula above used did not return the proper output from the 2nd link while the code does. (Maybe because it was too long)
Code:
/**
 * Fetch a page and return the plain text of its "Geography" and "Education"
 * sections.
 *
 * The page is scanned chunk by chunk (one regex match per line, capped at
 * 100000 characters). Capturing starts after a chunk containing
 * id="Geography" or id="Education" and stops at the next chunk containing
 * "<h2>". Chunks containing "<div", "<h3", "<td" or "<th" are skipped.
 *
 * @param {string} path - URL passed to UrlFetchApp.fetch.
 * @returns {string} captured lines joined with "\n", with tags and numeric
 *   citation markers such as [12] removed.
 */
function getGeoAndEdu(path) {
  const data = UrlFetchApp.fetch(path).getContentText();
  // wikipedia is divided into sections, if output is cut, increase the number
  const regex = /.{1,100000}/g;
  const results = [];
  // flag to determine if matches should be added
  let foundFlag = false;
  let m;
  // Stop as soon as exec() runs out of matches. This avoids reading m[0] from
  // a null match when a captured section extends to the end of the page.
  while ((m = regex.exec(data)) !== null) {
    const chunk = m[0];
    if (foundFlag) {
      if (matchTag(chunk, "<h2>")) {
        // another header ends the section being captured
        foundFlag = false;
      } else if (matchTag(chunk, "<div") || matchTag(chunk, "<h3") ||
                 matchTag(chunk, "<td") || matchTag(chunk, "<th")) {
        // exclude tables, sub-headers and divs containing image description
        continue;
      } else {
        results.push(chunk);
      }
    }
    // start capturing if either ID is found (this may be the same chunk as
    // the header that just ended the previous section)
    if (matchTag(chunk, "id=\"Geography\"") || matchTag(chunk, "id=\"Education\"")) {
      foundFlag = true;
    }
  }
  return results
    .map(function (str) {
      // strip tags, then let XmlService convert HTML entity codes (e.g. &amp;) to text
      const stripped = str.replace(/<[^>]*>/g, '').trim();
      const decoded = XmlService.parse('<d>' + stripped + '</d>');
      return decoded.getRootElement().getText();
    })
    // filter blank results due to cleaning and empty sections
    .filter(result => result.trim().length > 1)
    // separate data and remove citations before returning output
    .join("\n")
    .replace(/\[\d+\]/g, '');
}
// check if tag is found in string
/**
 * Check whether `tag` occurs literally in `string`.
 *
 * The tag is compared as plain text rather than compiled into a regular
 * expression, so characters such as ".", "(" or "[" in the tag cannot change
 * the result or throw.
 *
 * @param {string} string - text to search.
 * @param {string} tag - literal text to look for.
 * @returns {boolean} true when `string` contains `tag`.
 */
function matchTag(string, tag) {
  return string.includes(tag);
}
Output:
Difference:
Formula ending output
Script ending output
Education ending in wikipedia
Note:
You still have quota when using UrlFetchApp but should be better than IMPORTXML's limit depending on the type of your account.
Reference:
Apps Script Quotas
Sorry I got very busy this week so I didn't reply. I took a look at your answer which seems to work fine, but it was quite code heavy. I wanted something I would understand so I coded my own solution. not that mine is any simpler. It's just my own code so it's easier for me to follow:
/**
 * Find the text between two tags and strip the tags inside it, leaving plain text.
 *
 * Example:
 *   getTextBetweenTags(html, [['class="mw-headline"'], ['id="Geography"']],
 *                      [['class="wikitable mw-collapsible mw-made-collapsible"']])
 *
 * Note: HTML entities (&#number;) are not decoded here.
 *
 * @param {string} html - markup to scan.
 * @param {Array<Array<string>>} paramatersInFirstTag - each entry's first
 *   element is a literal that must appear in the opening tag; all must match.
 * @param {Array<Array<string>>} paramatersInLastTag - same, for the tag that
 *   ends the range. Only tags after the opening tag are considered.
 * @returns {string} text after the opening tag up to and including the ending
 *   tag, with every tag removed, or 'No matches' when either tag is not found.
 */
function getTextBetweenTags(html, paramatersInFirstTag, paramatersInLastTag) {
  // One regex per required literal, so each condition can be tested separately.
  var toRegexes = function (params) {
    return Array.from(params || []).map(function (param) {
      return new RegExp(escapeRegex(param[0]));
    });
  };
  var tagMatchesAll = function (tag, regexes) {
    return regexes.every(function (regex) { return regex.test(tag); });
  };

  var firstTagRegexes = toRegexes(paramatersInFirstTag);
  var lastTagRegexes = toRegexes(paramatersInLastTag);
  var startTagIndex = null;
  var endTagIndex = null;
  var readingTag = false;
  var currentTag = '';
  var previousChar = '';

  // Scan every character, including the last one: the ending tag's ">" may be
  // the final character of the input.
  for (var i = 0; i < html.length; i++) {
    var currentChar = html.charAt(i);
    var isEscaped = previousChar === '\\';
    if (currentChar === '<' && !isEscaped) {
      readingTag = true;
    }
    if (currentChar === '>' && !isEscaped) {
      // End of a tag: check it against the opening filter first, and against
      // the ending filter once the opening tag has been found.
      readingTag = false;
      currentTag += currentChar;
      if (startTagIndex === null) {
        if (tagMatchesAll(currentTag, firstTagRegexes)) {
          startTagIndex = i + 1;
        }
      } else if (tagMatchesAll(currentTag, lastTagRegexes)) {
        endTagIndex = i + 1;
        break;
      }
      currentTag = '';
    }
    if (readingTag) {
      currentTag += currentChar;
    }
    previousChar = currentChar;
  }

  if (startTagIndex === null || endTagIndex === null) {
    return 'No matches';
  }
  return html.substring(startTagIndex, endTagIndex).replace(/<[^>]+>/g, '');
}
/**
 * Escape regular-expression metacharacters so the text can be embedded in a
 * RegExp as a literal.
 *
 * @param {?string} string - text to escape; null and undefined are returned unchanged.
 * @returns {?string} the text with each metacharacter preceded by a backslash.
 */
function escapeRegex(string) {
  return string == null
    ? string
    : string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
}
My function requires an array of attributes for the start tag and an array of attributes for the end tag. It gets any text in between and removes any tags found in between. One issue I also noticed was that there were often special characters (e.g. HTML entities such as `&#160;`), so they need to be replaced. I did that outside the scope of the function above.
The function could be easily improved to check the tag type (eg h2), but it wasn't necessary for the wikipedia case.
Here is a function where I called the above function. The html variable is just the result of UrlFetchApp.fetch('some wikipedia city url').getContentText();
/**
 * Collect the Geography, Economy and Education sections of a page.
 *
 * Each section is the text between the headline tag carrying the section id
 * and the next headline tag, as extracted by getTextBetweenTags. A section is
 * left out when it is not found or when its text contains "Wikipedia".
 *
 * @param {string} html - page markup.
 * @returns {string} the kept sections, each followed by a newline.
 */
function getWikiTexts(html) {
  var headline = ['class="mw-headline"'];
  // The id literals have no closing quote, so they match any id that starts
  // with the section name -- presumably intentional; confirm.
  var sectionIds = ['Geography', 'Economy', 'Education'];
  var returnString = '';
  sectionIds.forEach(function (sectionId) {
    // getTextBetweenTags takes (html, firstTagParams, lastTagParams); every
    // section is requested with exactly that shape.
    var text = getTextBetweenTags(html, [headline, ['id="' + sectionId]], [headline]);
    if (text != 'No matches' && !/Wikipedia/.test(text)) {
      returnString += text + '\n';
    }
  });
  return returnString;
}
Thanks for posting your answer.
Why does HTML5 validation fail when having duplicate element IDs but on different <template>. I'm planning to use only one template at a time so the actual DOM ID won't be duplicated.
something like:
<template id="companyAccount">
<li><label>Company: <input type="text" id="account_name"></label></li>
<li><label>Street: <input id="account_street" ...
...
</template>
<template id="residentialAccount">
<li><label>Name: <input type="text" id="account_name"></label></li>
<li><label>Street: <input id="account_street" ...
...
</template>
<script>
...
let template = $(isResidential ? '#residentialAccount' : '#companyAccount').get(0).content;
$('#account_info').empty().append(template.cloneNode(true));
</script>
As TJBlackman mentioned each value for id attributes must be unique. However when working with code it's up to you to determine if the code being imported contains a duplicate id. Additionally I've pretty much stopped using validators for the most part as they aren't well maintained (and the W3C's CSS validator is atrocious) and the consoles, proper error handling and using the XML parser for HTML5 will tell you pretty much everything you need to know.
You have a couple of options. You can use data-account="residential" (don't make the mistake of using camelCasing as that will eventually put you in direct conflict with the standards bodies) and detect the attribute via document.querySelectorAll:
/**
 * Run a CSS selector against the document.
 *
 * The query is executed once. If it throws (for example because the selector
 * is invalid) the error is logged, `sound.notice()` is called and false is
 * returned.
 *
 * @param {string} o - CSS selector.
 * @returns {NodeList|boolean} the matching nodes, or false on failure.
 */
function $(o)
{
  try
  {
    var found = document.querySelectorAll(o);
    return found || false;
  }
  catch (err)
  {
    console.log('Error: "'+o+'" is not a valid CSS selector.');
    sound.notice();
    return false;
  }
}
Usage: $('[data-account="residential"]')[0].length and $('[data-account="residential"]')[0].value.
Alternatively you can do what I do with my platform which has things tightly integrated though this is the function I use that you might decide to slim down for your specific purposes. You can use document.createTreeWalker and literally go through every single element to scan for walker.currentNode.hasAttribute('id') to test against document.getElementById. This code is used before importing XML in to the DOM on my platform:
/******** part of larger ajax() function ********/
// Excerpt only: `xhr`, `xml`, `option`, `id_container` and
// `param_id_container_pos` are defined in the part of ajax() not shown here.
if (xhr.readyState == 4 && xhr.status != 204)
{}
//This code occurs within the above condition.
// NOTE(review): the function defined below is named ajax_id_duplication_prevention;
// the name called here lacks the leading "a" -- confirm whether this is a typo.
var r = jax_id_duplication_prevention(xhr.responseXML,param_id_container_pos,id_container);
// The import only proceeds when the duplicate-id check returned a truthy value.
if (r)
{
// Each branch places the first element of `xml` relative to id_container.
// NOTE(review): the 'after' branch inserts the node directly, whereas every
// other branch wraps it in document.importNode -- confirm this is intended.
if (param_id_container_pos=='after') {id_container.parentNode.insertBefore(xml.getElementsByTagName('*')[0],id_container.nextSibling);}
else if (param_id_container_pos=='before') {id_container.parentNode.insertBefore(document.importNode(xml.getElementsByTagName('*')[0],true),id_container);}
else if (param_id_container_pos=='first')
{
if (id_container.childNodes.length > 0) {id_container.insertBefore(document.importNode(xml.getElementsByTagName('*')[0],true),id_container.firstChild);}
else {id_container.appendChild(document.importNode(xml.getElementsByTagName('*')[0],true));}
}
else if (param_id_container_pos=='inside') {id_container.appendChild(document.importNode(xml.getElementsByTagName('*')[0],true));}
else if (param_id_container_pos=='replace') {id_container.parentNode.replaceChild(document.importNode(xml.getElementsByTagName('*')[0],true),id_container);}
else if (param_id_container_pos=='fragment')
{
// 'fragment' stores the parsed response on option.fragment instead of inserting it,
// then calls id_container when it is a function.
if (option.fragment) {delete option.fragment;}
option.fragment = document.importNode(new DOMParser().parseFromString(xhr.responseText,'application/xml').childNodes[0],true);
if (id_container && typeof id_container == 'function') {id_container();}
}
// NOTE(review): the message reads `id_container_pos`, while every branch above
// tests `param_id_container_pos` -- confirm which name exists in scope.
else {alert('Error: unknown position to import data to: '+id_container_pos);}
}
/******** part of larger ajax() function ********/
/**
 * Check an XML document for id conflicts before it is imported into the page.
 *
 * Relies on helpers defined elsewhere (`id_`, `element_del`, `change`,
 * `is_node_parent`, `in_array`, `modal`, `ajax`, `option`, ...). `id_` is
 * presumably a lookup of a DOM element by id -- confirm.
 *
 * @param xml - document whose elements are walked with a TreeWalker.
 * @param param_id_container_pos - import position keyword; 'replace' is
 *   special-cased below.
 * @param id_container - target element, or a string that is resolved through `id_`.
 * @returns {boolean} `re`: true unless one of the checks below set it to false.
 */
function ajax_id_duplication_prevention(xml,param_id_container_pos,id_container)
{
var re = true;
// Resolve a string id to whatever `id_` returns for it.
if (typeof id_container == 'string' && id_container.length > 0 && id_(id_container)) {id_container = id_(id_container);}
if (typeof option.id_fade == 'string' && option.id_fade.length > 0 && id_(option.id_fade)) {element_del(option.id_fade); option.id_fade = '';}
if (typeof xml.firstChild.hasAttribute == 'function')
{
if (xml.firstChild.hasAttribute('id') && xml.firstChild.getAttribute('id').length > 0 && id_(xml.firstChild.getAttribute('id')) && id_(xml.firstChild.id).parentNode.id=='liquid') {change(xml.firstChild.id,'fade');}
// NOTE(review): `!x.parentNode.id=='liquid'` negates the id first and then compares
// a boolean with 'liquid', which is always false; as written this branch never sets
// `re = false` and control always reaches the else-if. Likely meant `!=` -- confirm.
if (xml.firstChild.hasAttribute('id') && xml.firstChild.getAttribute('id').length > 0 && id_(xml.firstChild.id) && !id_(xml.firstChild.id).parentNode.id=='liquid') {re = false;}
else if (typeof document.createTreeWalker=='function')
{
// ids already seen inside the document being imported
var idz = [];
try
{
var walker = document.createTreeWalker(xml,NodeFilter.SHOW_ELEMENT,null,false);
while (walker.nextNode())
{
// Only elements with a non-empty id attribute are examined.
if (walker.currentNode.hasAttribute('id') && walker.currentNode.getAttribute('id').length > 0)
{
if (walker.currentNode.id==undefined && walker.currentNode.nodeName.toLowerCase()=='parsererror') {console.log('Error: a parser error was detected.');}
// NOTE(review): `document.serializeToString` is called on `document`; the code further
// down uses `new XMLSerializer().serializeToString` -- confirm this call works.
else if (walker.currentNode.id==undefined) {alert('walker.currentNode.nodeName = '+walker.currentNode.nodeName+'\n\n'+document.serializeToString(xml));}
else
{
// Walk the children of the element with id 'liquid' and delete those selected by the conditions below.
for (var i = 0; i<id_('liquid').childNodes.length; i++)
{
if (id_('liquid').childNodes[i].nodeType==1 && id_(walker.currentNode.id) && is_node_parent(walker.currentNode.id,id_('liquid').childNodes[i]) && (param_id_container_pos!='replace' || walker.currentNode.id!=id_container.id))
{
if (param_id_container_pos != 'replace' && id_container != walker.currentNode.id) {element_del(id_('liquid').childNodes[i]);}//If changing operator test: ajax('get','?ajax=1&web3_url=/'+url_section()+'/'+url_page(),'replace',push_current_id());
}
}
// NOTE(review): `n` is not read anywhere in this function.
var n = id_(walker.currentNode.id);
// The same id occurring twice within the imported document: report it and stop walking.
if (in_array(walker.currentNode.id,idz))
{
var fd = new FormData();
fd.append('ajax','error_xml');
fd.append('post_error','Duplicate id <code>'+walker.currentNode.id+'</code>.');
fd.append('post_url',url_window().split(url_base())[1].split('?')[0]);
fd.append('post_xml',new XMLSerializer().serializeToString(xml));
if (fd) {ajax('post',path+'/themes/',fd);}
modal.alert('Error: can not import XML, the id \''+walker.currentNode.id+'\' was detected twice in the layer being imported. Duplicated ID\'s break expected functionality and are illegal. While the XML content was not imported it is still possible that the related request was successful. It is possible to override this problem by simply doing a full request (press the Go button in your browser\'s graphic user interface) however if the id is referenced programmatically the website may exhibit unusual behavior.');
// NOTE(review): this break makes the setTimeout below unreachable, so its
// `re = false` never runs and `re` is left unchanged on this path -- confirm intent.
break;
setTimeout(function()
{
history.back();
push_reload();
console.log('Developer: duplicate id '+walker.currentNode.id+' was encounterted.');
if (status >= 9) {modal.xml('Duplicate ID Error', '%3Cp%3EError%3A%20the%20id%20%3Ccode%3E'+walker.currentNode.id+'%3C%2Fcode%3E%20occurred%20twice%20and%20therefore%20the%20page%20can%20not%20be%20viewed.%3C%2Fp%3E%3Cp%3EFor%20debugging%20and%20fixing%20purposes%20you%20should%20consider%20opening%20the%20URL%20in%20a%20new%20tab.%3C%2Fp%3E%3Cdiv%20class%3D%22center%20margin%22%3E%3Cinput%20onclick%3D%22modal.close()%3B%22%20tabindex%3D%223%22%20type%3D%22button%22%20value%3D%22Close%22%20%2F%3E%3C%2Fdiv%3E');}
else {modal.xml('Duplicate ID Error', '%3Cp%3EError%3A%20the%20id%20%3Ccode%3E'+walker.currentNode.id+'%3C%2Fcode%3E%20occurred%20twice%20and%20therefore%20the%20page%20can%20not%20be%20viewed.%3C%2Fp%3E%3Cdiv%20class%3D%22center%20margin%22%3E%3Cinput%20onclick%3D%22modal.close()%3B%22%20tabindex%3D%223%22%20type%3D%22button%22%20value%3D%22Close%22%20%2F%3E%3C%2Fdiv%3E');}
re = false;
},4000);
}
else {idz.push(walker.currentNode.id);}
}
// The id already exists in the page and the import is not a 'replace' of / inside
// that element: refuse the import.
if (id_(walker.currentNode.id) && (param_id_container_pos!='replace' && walker.currentNode.id!=id_container.id && !is_node_parent(walker.currentNode.id,id_container)))
{//ajax replace (carousel loader) complications if changed.
re = false;
modal.alert('Error: unable to import page, the id \''+walker.currentNode.id+'\' already exists in the DOM.');
break;
}
}
}
}
// NOTE(review): every exception thrown inside the walk is silently discarded here.
catch (err) {}//IE9
}
}
return re;
}
Regardless of how you approach addressing the issue it is not one addressed by the standards bodies and must be explicitly handled by developers. Failure to handle duplicate id attribute/values will result in the wrong element being chosen at some point which may quietly accrue compromised/malformed data over time that no one might notice for years and thus could easily hinder if not outright destroy any business relations effected by it. Good luck!
I want to see how much of my team's code has been integrated into a large scale site.
I believe I can achieve this (albeit roughly), by getting statistics on the number of occurrences certain CSS selectors appear across all the HTML pages. I have some unique CSS class selectors that I would like to use when scraping the site to analyze:
On how many pages the selector occurs.
On any page it does, how many times.
I've looked around but can't find any tools - does anyone know of any, or could suggest any idea's that may help me quickly achieve this ?
Thanks in advance.
Thanks to everyone for their advice.
In the end I decided that there was no one tool that could help me gather the statistics in the way I described so I already started to build up the application I needed in Node. Although I've not used Node before I've found it quick to grasp with an intermediate knowledge of Javascript.
For anyone looking to do the same:
I've used Simplecrawler to run over the site and Cheerio to find selectors and from this I can create a simple report created in Json using FS.
I'd recommend using Google Apps Script. You might manage to crawl the site's pages and count the CSS selector occurrences with a regex. Modify the following code to search each page for the CSS selector. The code explanation is here.
Code
/**
 * Add the 'New scrape web docs' menu to the document UI, with a single
 * 'Enter Url' item that is bound to the function named 'showPrompt'.
 */
function onOpen() {
  var ui = DocumentApp.getUi(); // Or DocumentApp or FormApp.
  var menu = ui.createMenu('New scrape web docs');
  var menuWithItem = menu.addItem('Enter Url', 'showPrompt');
  menuWithItem.addToUi();
}
/**
 * Ask for a website URL and crawl it: the entered page is scraped first
 * (clearing the document), then every inner link it yields is scraped in
 * turn, following newly discovered links until none are left.
 *
 * Links are the values returned by scrapeAndPaste and are appended to the
 * entered URL as-is. A scrape that returns nothing (for example because it
 * failed) contributes no links, and each link is queued at most once.
 */
function showPrompt() {
  var ui = DocumentApp.getUi();
  var result = ui.prompt(
    'Scrape whole website into text!',
    'Please enter website url (with http(s)://):',
    ui.ButtonSet.OK_CANCEL);

  // Process the user's response.
  var button = result.getSelectedButton();
  var url = result.getResponseText();
  if (button != ui.Button.OK) {
    return; // anything other than "OK": nothing to do
  }
  if (!isValidURL(url)) {
    ui.alert('Your url is not valid.');
    return;
  }

  var base_url = url;
  var links = [];               // inner urls waiting to be scraped
  var processed_urls = [url];   // everything already scraped

  // Queue only links that are neither waiting nor already processed.
  var enqueue = function (found) {
    (found || []).forEach(function (item) {
      if (links.indexOf(item) === -1 && processed_urls.indexOf(item) === -1) {
        links.push(item);
      }
    });
  };

  // first run clears the document
  enqueue(scrapeAndPaste(url, 1));

  while (links.length) {
    var link = links.shift(); // take the oldest queued link
    processed_urls.push(link);
    // second and consecutive runs do not clear the document
    enqueue(scrapeAndPaste(base_url + link, 0));
  }
}
/**
 * Fetch a page, write its text into the current document and return the
 * inner links found in it.
 *
 * @param {string} url - page to fetch.
 * @param {*} clear - passed through to outputText (truthy clears the document first).
 * @returns {string[]} href values that do not start with '#', 'http' or
 *   'mailto:' and do not contain '.pdf'. On any error a report is emailed to
 *   the active user, an error note is written to the document and an empty
 *   array is returned, so callers can always treat the result as a list.
 */
function scrapeAndPaste(url, clear) {
  try {
    var html = UrlFetchApp.fetch(url).getContentText();

    // some html pre-processing: keep only what follows </head> ...
    if (html.indexOf('</head>') !== -1) {
      html = html.split('</head>')[1];
    }
    // ... and cut everything after the first </body>
    if (html.indexOf('</body>') !== -1) {
      html = html.split('</body>')[0] + '</body>';
    }

    // fetch inner links
    var inner_links_arr = [];
    var linkRegExp = /href="(.*?)"/gi;
    var match;
    while ((match = linkRegExp.exec(html)) !== null) {
      var href = match[1];
      var isInnerLink = href.indexOf('#') !== 0
        && href.indexOf('http') !== 0
        && href.indexOf('mailto:') !== 0
        && href.indexOf('.pdf') === -1;
      if (isInnerLink) {
        inner_links_arr.push(href);
      }
    }

    var text = getTextFromHtml(html);
    outputText(url, text, clear); // output text into the current document with given url
    return inner_links_arr;
  } catch (e) {
    MailApp.sendEmail(Session.getActiveUser().getEmail(), "Scrape error report at "
      + Utilities.formatDate(new Date(), "GMT", "yyyy-MM-dd HH:mm:ss"),
      "\r\nMessage: " + e.message
      + "\r\nFile: " + e.fileName + '.gs'
      + "\r\nWeb page under scrape: " + url
      + "\r\nLine: " + e.lineNumber);
    outputText(url, 'Scrape error for this page cause of malformed html!', clear);
    // A failed page has no links to offer.
    return [];
  }
}
/**
 * Parse markup with `Xml.parse(html, true)` and return the text of the
 * resulting root element, as produced by getTextFromNode.
 *
 * @param {string} html - markup to parse.
 * @returns {string} text extracted from the parsed tree.
 */
function getTextFromHtml(html) {
  var parsedDoc = Xml.parse(html, true);
  var rootElement = parsedDoc.getElement();
  return getTextFromNode(rootElement);
}
/**
 * Recursively collect the text of a parsed node.
 *
 * The node kind is taken from `x.toString()`:
 *  - 'XmlText'    -> the node's `toXmlString()`
 *  - 'XmlElement' -> the texts of its child nodes joined with a space
 *  - anything else -> ''
 *
 * @param {Object} x - parsed node.
 * @returns {string} the collected text.
 */
function getTextFromNode(x) {
  var kind = x.toString();
  if (kind === 'XmlText') {
    return x.toXmlString();
  }
  if (kind === 'XmlElement') {
    var childTexts = x.getNodes().map(getTextFromNode);
    return childTexts.join(' ');
  }
  return '';
}
/**
 * Append one scraped page to the active document.
 *
 * @param {string} url - written as a HEADING2 paragraph, prefixed with ' * '.
 * @param {string} text - written as the paragraph after the heading.
 * @param {*} clear - truthy: the body is cleared first; falsy: a horizontal
 *   rule is appended to separate this page from the previous content.
 */
function outputText(url, text, clear) {
  var body = DocumentApp.getActiveDocument().getBody();
  if (!clear) {
    body.appendHorizontalRule();
  } else {
    body.clear();
  }
  body.appendParagraph(' * ' + url)
    .setHeading(DocumentApp.ParagraphHeading.HEADING2);
  body.appendParagraph(text);
}
/**
 * Test whether a string has the shape of a URL: optional scheme, a dotted
 * host name ending in a 2-4 character label, optional port, path, query and
 * fragment.
 *
 * Percent-escapes must be "%" followed by two hexadecimal digits; the hex
 * class is [a-fA-F\d] (the earlier "A-f" range also accepted G-Z and some
 * punctuation).
 *
 * NOTE(review): the optional credentials group ends with "#"; URL credentials
 * are normally terminated by "@" -- confirm which was intended.
 *
 * @param {string} url - text to test.
 * @returns {boolean} true when the whole string matches the pattern.
 */
function isValidURL(url){
  // Named `urlPattern` so the global RegExp constructor is not shadowed.
  var urlPattern = /^(([\w]+:)?\/\/)?(([\d\w]|%[a-fA-F\d]{2,2})+(:([\d\w]|%[a-fA-F\d]{2,2})+)?#)?([\d\w][-\d\w]{0,253}[\d\w]\.)+[\w]{2,4}(:[\d]+)?(\/([-+_~.\d\w]|%[a-fA-F\d]{2,2})*)*(\?(&?([-+_~.\d\w]|%[a-fA-F\d]{2,2})=?)*)?(#([-+_~.\d\w]|%[a-fA-F\d]{2,2})*)?$/;
  return urlPattern.test(url);
}