Scrape site to report css selector occurrence in HTML - html

I want to see how much of my team's code has been integrated into a large scale site.
I believe I can achieve this (albeit roughly), by getting statistics on the number of occurrences certain CSS selectors appear across all the HTML pages. I have some unique CSS class selectors that I would like to use when scraping the site to analyze:
On how many pages the selector occurs.
On any page it does, how many times.
I've looked around but can't find any tools - does anyone know of any, or could suggest any idea's that may help me quickly achieve this ?
Thanks in advance.

Thanks to everyone for their advice.
In the end I decided that there was no one tool that could help me gather the statistics in the way I described so I already started to build up the application I needed in Node. Although I've not used Node before I've found it quick to grasp with an intermediate knowledge of Javascript.
For anyone looking to do the same:
I've used Simplecrawler to run over the site and Cheerio to find selectors and from this I can create a simple report created in Json using FS.

I'd recommend you to use Google App Scripting. You might manage to crawl site's pages and count the CSS selector occurrences with regex. Modify he following code to search each page for CSS selector. The code explanation is here.
Code
function onOpen() {
DocumentApp.getUi() // Or DocumentApp or FormApp.
.createMenu('New scrape web docs')
.addItem('Enter Url', 'showPrompt')
.addToUi();
}
function showPrompt() {
var ui = DocumentApp.getUi();
var result = ui.prompt(
'Scrape whole website into text!',
'Please enter website url (with http(s)://):',
ui.ButtonSet.OK_CANCEL);
// Process the user's response.
var button = result.getSelectedButton();
var url = result.getResponseText();
var links=[];
var base_url = url;
if (button == ui.Button.OK) { // User clicked "OK".
if(!isValidURL(url))
{
ui.alert('Your url is not valid.');
}
else {
// gather initial links
var inner_links_arr = scrapeAndPaste(url, 1); // first run and clear the document
links = links.concat(inner_links_arr); // append an array to all the links
var new_links=[]; // array for new links
var processed_urls =[url]; // processed links
var link, current;
while (links.length)
{
link = links.shift(); // get the most left link (inner url)
processed_urls.push(link);
current = base_url + link;
new_links = scrapeAndPaste(current, 0); // second and consecutive runs we do not clear up the document
//ui.alert('Processed... ' + current + '\nReturned links: ' + new_links.join('\n') );
// add new links into links array (stack) if appropriate
for (var i in new_links){
var item = new_links[i];
if (links.indexOf(item) === -1 && processed_urls.indexOf(item) === -1)
links.push(item);
}
/* // alert message for debugging
ui.alert('Links in stack: ' + links.join(' ')
+ '\nTotal links in stack: ' + links.length
+ '\nProcessed: ' + processed_urls.join(' ')
+ '\nTotal processed: ' + processed_urls.length);
*/
}
}
}
}
function scrapeAndPaste(url, clear) {
var text;
try {
var html = UrlFetchApp.fetch(url).getContentText();
// some html pre-processing
if (html.indexOf('</head>') !== -1 ){
html = html.split('</head>')[1];
}
if (html.indexOf('</body>') !== -1 ){ // thus we split the body only
html = html.split('</body>')[0] + '</body>';
}
// fetch inner links
var inner_links_arr= [];
var linkRegExp = /href="(.*?)"/gi; // regex expression object
var match = linkRegExp.exec(html);
while (match != null) {
// matched text: match[0]
if (match[1].indexOf('#') !== 0
&& match[1].indexOf('http') !== 0
//&& match[1].indexOf('https://') !== 0
&& match[1].indexOf('mailto:') !== 0
&& match[1].indexOf('.pdf') === -1 ) {
inner_links_arr.push(match[1]);
}
// match start: match.index
// capturing group n: match[n]
match = linkRegExp.exec(html);
}
text = getTextFromHtml(html);
outputText(url, text, clear); // output text into the current document with given url
return inner_links_arr; //we return all inner links of this doc as array
} catch (e) {
MailApp.sendEmail(Session.getActiveUser().getEmail(), "Scrape error report at "
+ Utilities.formatDate(new Date(), "GMT", "yyyy-MM-dd HH:mm:ss"),
"\r\nMessage: " + e.message
+ "\r\nFile: " + e.fileName+ '.gs'
+ "\r\nWeb page under scrape: " + url
+ "\r\nLine: " + e.lineNumber);
outputText(url, 'Scrape error for this page cause of malformed html!', clear);
}
}
function getTextFromHtml(html) {
return getTextFromNode(Xml.parse(html, true).getElement());
}
function getTextFromNode(x) {
switch(x.toString()) {
case 'XmlText': return x.toXmlString();
case 'XmlElement': return x.getNodes().map(getTextFromNode).join(' ');
default: return '';
}
}
function outputText(url, text, clear){
var body = DocumentApp.getActiveDocument().getBody();
if (clear){
body.clear();
}
else {
body.appendHorizontalRule();
}
var section = body.appendParagraph(' * ' + url);
section.setHeading(DocumentApp.ParagraphHeading.HEADING2);
body.appendParagraph(text);
}
function isValidURL(url){
var RegExp = /^(([\w]+:)?\/\/)?(([\d\w]|%[a-fA-f\d]{2,2})+(:([\d\w]|%[a-fA-f\d]{2,2})+)?#)?([\d\w][-\d\w]{0,253}[\d\w]\.)+[\w]{2,4}(:[\d]+)?(\/([-+_~.\d\w]|%[a-fA-f\d]{2,2})*)*(\?(&?([-+_~.\d\w]|%[a-fA-f\d]{2,2})=?)*)?(#([-+_~.\d\w]|%[a-fA-f\d]{2,2})*)?$/;
if(RegExp.test(url)){
return true;
}else{
return false;
}
}

Related

Is there any way to see the names of a website's network requests with Google apps scripts?

So here's what I'm trying to do:
There's a website called Torah Anytime (https://www.torahanytime.com/) which publishes audio files (I guess you can call them podcasts, the website refers to them as shiurim, shiur being hebrew for song, or in this case, audio) on a daily basis. I would like to create a script that downloads the audio of specific speakers and then emails those files to me. The way I'm accomplishing this is with Google Apps Scripts. Torah Anytime allows you to follow specific speakers and to get email notifications when a speaker you're following puts out a new podcast. Here is the code that I have so far:
function main() {
var emails = getemails();
for (var i = 0; i < emails.length; i++) {
var email = emails[i].getMessages();
if (email[0].getFrom() == "TorahAnytime Following <following#torahanytime.com>"){
var title = getTitle(email);
var shiurID = getShiurID(email);
var downloadLink = "https://dl.torahanytime.com/audio/" + shiurID;
var shiur = downloadShiur(downloadLink);
shiur.setName(title);
var emailSent = emailShiur(shiur);
if (emailSent) {email[0].moveToTrash();
Logger.log("Email moved to Trash");}
}
}
}
function getemails() {
var label = GmailApp.getUserLabelByName("TA Speeches");
return label.getThreads();
}
function getTitle(email) {
body = email[0].getPlainBody();
var begIndex = body.indexOf("from") + 4;
var endIndex = body.indexOf("on ");
var title = body.substring(begIndex, endIndex).toLowerCase().replaceAll(" ", "-").replace(/(\r\n|\n|\r)/gm, "-");
begIndex = body.indexOf("called ") + 7;
endIndex = body.indexOf(" [");
title += "-" + body.substring(begIndex, endIndex).toLowerCase().replaceAll(" ", "-").replace(/(\r\n|\n|\r)/gm, "-") + ".mp3";
return title;
}
function getShiurID(email) {
body = email[0].getPlainBody();
var begIndex = body.indexOf("[") + 1;
var endIndex = body.indexOf("]");
var link = body.substring(begIndex, endIndex).replaceAll("?v", "?a");
console.log(link);
var mainLink = UrlFetchApp.fetch(link);
//here I somehow need to get the link being used to stream that particular audio file
}
function getIDName(email) {
body = email[0].getPlainBody();
var begIndex = body.indexOf("ID ") + 3;
var endIndex = body.indexOf(" and");
return body.substring(begIndex, endIndex);
}
function downloadShiur(downloadLink) {
var audio = UrlFetchApp.fetch(downloadLink);
return audio.getBlob().getAs('audio/mp3');
}
function emailShiur(shiur) {
const maxFileSize = 26214400;
if (shiur.getBytes().length <= maxFileSize) {
MailApp.sendEmail("[Email addressed removed]", "TA Shiur (File)", "Enjoy!", {
attachments: [shiur],
name: 'Automatic Emailer Script'
});
return true;
} else {
MailApp.sendEmail("[Email addressed removed]", "TA Shiur (File)", "Error, File too large to email", {
name: 'Automatic Emailer Script'
});
return false;
}
}
My issue is that the URL to download the file is not in the HTML, so I don't know how to get to it using GAS. If you use chrome's dev-tools, you can see the URL right there in the network tab Example of output I see that I want to get. Does anyone know of any way that I can get the information that I see in chrome's dev-tools network tab (the name's of the URLs being received) using GAS? Thank you!

xpath in apps script?

I made a formula to extract some Wikipedia data in Google Seets which works fine. Here is the formula:
=regexreplace(join("",flatten(IMPORTXML(D2,".//p[preceding-sibling::h2[1][contains(., 'Geography')]]"))),"\[[^\]]+\]","")&char(10)&char(10)&iferror(regexreplace(join("",flatten(IMPORTXML(D2,".//p[preceding-sibling::h2[1][contains(., 'Education')]]"))),"\[[^\]]+\]",""))
Where D2 is a URL like https://en.wikipedia.org/wiki/Abbeville,_Alabama
This extracts some Geography and Education data from the Wikipedia page. Trouble is that importxml only runs a few times before it dies due to quota.
So I thought maybe better to use Apps Script where there are much higher limits on fetching and parsing. I could not see a good way however of using Xpath in Apps Script. Older posts on the web discuss using a deprecated service called Xml but it seems to no longer work. There is a Service called XmlService which looks like it may do the job but you can't just plug in an Xpath. It looks like a lot of sweating to get to the result. Any solutions out there where you can just plug in Xpath?
Here is an alternative solution I actually do in a case like this.
I have used XmlService but only for parsing the content, not for using Xpath. This makes use of the element tags and so far pretty consistent on my tests. Although, it might need tweaks when certain tags are in the result and you might have to include them into the exclusion condition.
Tested the code below in both links:
https://en.wikipedia.org/wiki/Abbeville,_Alabama#Geography
https://en.wikipedia.org/wiki/Montgomery,_Alabama#Education
My test shows that the formula above used did not return the proper output from the 2nd link while the code does. (Maybe because it was too long)
Code:
function getGeoAndEdu(path) {
var data = UrlFetchApp.fetch(path).getContentText();
// wikipedia is divided into sections, if output is cut, increase the number
var regex = /.{1,100000}/g;
var results = [];
// flag to determine if matches should be added
var foundFlag = false;
do {
m = regex.exec(data);
if (foundFlag) {
// if another header is found during generation of data, stop appending the matches
if (matchTag(m[0], "<h2>"))
foundFlag = false;
// exclude tables, sub-headers and divs containing image description
else if(matchTag(m[0], "<div") || matchTag(m[0], "<h3") ||
matchTag(m[0], "<td") || matchTag(m[0], "<th"))
continue;
else
results.push(m[0]);
}
// start capturing if either IDs are found
if (m != null && (matchTag(m[0], "id=\"Geography\"") ||
matchTag(m[0], "id=\"Education\""))) {
foundFlag = true;
}
} while (m);
var output = results.map(function (str) {
// clean tags for XmlService
str = str.replace(/<[^>]*>/g, '').trim();
decode = XmlService.parse('<d>' + str + '</d>')
// convert html entity codes (e.g.  ) to text
return decode.getRootElement().getText();
// filter blank results due to cleaning and empty sections
// separate data and remove citations before returning output
}).filter(result => result.trim().length > 1).join("\n").replace(/\[\d+\]/g, '');
return output;
}
// check if tag is found in string
function matchTag(string, tag) {
var regex = RegExp(tag);
return string.match(regex) && string.match(regex)[0] == tag;
}
Output:
Difference:
Formula ending output
Script ending output
Education ending in wikipedia
Note:
You still have quota when using UrlFetchApp but should be better than IMPORTXML's limit depending on the type of your account.
Reference:
Apps Script Quotas
Sorry I got very busy this week so I didn't reply. I took a look at your answer which seems to work fine, but it was quite code heavy. I wanted something I would understand so I coded my own solution. not that mine is any simpler. It's just my own code so it's easier for me to follow:
function getTextBetweenTags(html, paramatersInFirstTag, paramatersInLastTag) { //finds text values between 2 tags and removes internal tags to leave plain text.
//eg getTextBetweenTags(html,[['class="mw-headline"'],['id="Geography"']],[['class="wikitable mw-collapsible mw-made-collapsible"']])
// **Note: you may want to replace &#number; with ascII number
var openingTagPos = null;
var closingTagPos = null;
var previousChar = '';
var readingTag = false;
var newTag = '';
var tagEnd = false;
var regexFirstTagParams = [];
var regexLastTagParams = [];
//prepare regexes to test for parameters in opening and closing tags. put regexes in arrays so each condition can be tested separately
for (var i in paramatersInFirstTag) {
regexFirstTagParams.push(new RegExp(escapeRegex(paramatersInFirstTag[i][0])))
}
for (var i in paramatersInLastTag) {
regexLastTagParams.push(new RegExp(escapeRegex(paramatersInLastTag[i][0])))
}
var startTagIndex = null;
var endTagIndex = null;
var matches = 0;
for (var i = 0; i < html.length - 1; i++) {
var nextChar = html.substr(i, 1);
if (nextChar == '<' && previousChar != '\\') {
readingTag = true;
}
if (nextChar == '>' && previousChar != '\\') { //if end of tag found, check tag matches start or end tag
readingTag = false;
newTag += nextChar;
//test for firstTag
if (startTagIndex == null) {
var alltestsPass = true;
for (var j in regexFirstTagParams) {
if (!regexFirstTagParams[j].test(newTag)) alltestsPass = false;
}
if (alltestsPass) {
startTagIndex = i + 1;
//console.log('Start Tag',startTagIndex)
matches++;
}
}
//test for lastTag
else if (startTagIndex != null) {
var alltestsPass = true;
for (var j in regexLastTagParams) {
if (!regexLastTagParams[j].test(newTag)) alltestsPass = false;
}
if (alltestsPass) {
endTagIndex = i + 1;
matches++;
}
}
if(startTagIndex && endTagIndex) break;
newTag = '';
}
if (readingTag) newTag += nextChar;
previousChar = nextChar;
}
if (matches < 2) return 'No matches';
else return html.substring(startTagIndex, endTagIndex).replace(/<[^>]+>/g, '');
}
function escapeRegex(string) {
if (string == null) return string;
return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
}
My function requires an array of attributes for the start tag and an array of attributes for the end tag. It gets any text in between and removes any tags found inbetween. One issue I also noticed was there were often special characters (eg  ) so they need to be replaced. I did that outside the scope of the function above.
The function could be easily improved to check the tag type (eg h2), but it wasn't necessary for the wikipedia case.
Here is a function where I called the above function. the html variable is just the result of UrlFetchApp.fetch('some wikipedia city url').getContextText();
function getWikiTexts(html) {
var geography = getTextBetweenTags(html, [['class="mw-headline"'], ['id="Geography']], [['class="mw-headline"']]);
var economy = getTextBetweenTags(html, 'span', [['class="mw-headline"'], ['id="Economy']], 'span', [['class="mw-headline"']])
var education = getTextBetweenTags(html, 'span', [['class="mw-headline"'], ['id="Education']], 'span', [['class="mw-headline"']])
var returnString = '';
if (geography != 'No matches' && !/Wikipedia/.test(geography)) returnString += geography + '\n';
if (economy != 'No matches' && !/Wikipedia/.test(economy)) returnString += economy + '\n';
if (education != 'No matches' && !/Wikipedia/.test(education)) returnString += education + '\n';
return returnString
}
Thanks for posting your answer.

Google app script - getting all hyperlinks from document [duplicate]

Given a "normal document" in Google Docs/Drive (e.g. paragraphs, lists, tables) which contains external links scattered throughout the content, how do you compile a list of links present using Google Apps Script?
Specifically, I want to update all broken links in the document by searching for oldText in each url and replace it with newText in each url, but not the text.
I don't think the replacing text section of the Dev Documentation is what I need -- do I need to scan every element of the doc? Can I just editAsText and use an html regex? Examples would be appreciated.
This is only mostly painful! Code is available as part of a gist.
Yeah, I can't spell.
getAllLinks
Here's a utility function that scans the document for all LinkUrls, returning them in an array.
/**
* Get an array of all LinkUrls in the document. The function is
* recursive, and if no element is provided, it will default to
* the active document's Body element.
*
* #param {Element} element The document element to operate on.
* .
* #returns {Array} Array of objects, vis
* {element,
* startOffset,
* endOffsetInclusive,
* url}
*/
function getAllLinks(element) {
var links = [];
element = element || DocumentApp.getActiveDocument().getBody();
if (element.getType() === DocumentApp.ElementType.TEXT) {
var textObj = element.editAsText();
var text = element.getText();
var inUrl = false;
for (var ch=0; ch < text.length; ch++) {
var url = textObj.getLinkUrl(ch);
if (url != null) {
if (!inUrl) {
// We are now!
inUrl = true;
var curUrl = {};
curUrl.element = element;
curUrl.url = String( url ); // grab a copy
curUrl.startOffset = ch;
}
else {
curUrl.endOffsetInclusive = ch;
}
}
else {
if (inUrl) {
// Not any more, we're not.
inUrl = false;
links.push(curUrl); // add to links
curUrl = {};
}
}
}
if (inUrl) {
// in case the link ends on the same char that the element does
links.push(curUrl);
}
}
else {
var numChildren = element.getNumChildren();
for (var i=0; i<numChildren; i++) {
links = links.concat(getAllLinks(element.getChild(i)));
}
}
return links;
}
findAndReplaceLinks
This utility builds on getAllLinks to do a find & replace function.
/**
* Replace all or part of UrlLinks in the document.
*
* #param {String} searchPattern the regex pattern to search for
* #param {String} replacement the text to use as replacement
*
* #returns {Number} number of Urls changed
*/
function findAndReplaceLinks(searchPattern,replacement) {
var links = getAllLinks();
var numChanged = 0;
for (var l=0; l<links.length; l++) {
var link = links[l];
if (link.url.match(searchPattern)) {
// This link needs to be changed
var newUrl = link.url.replace(searchPattern,replacement);
link.element.setLinkUrl(link.startOffset, link.endOffsetInclusive, newUrl);
numChanged++
}
}
return numChanged;
}
Demo UI
To demonstrate the use of these utilities, here are a couple of UI extensions:
function onOpen() {
// Add a menu with some items, some separators, and a sub-menu.
DocumentApp.getUi().createMenu('Utils')
.addItem('List Links', 'sidebarLinks')
.addItem('Replace Link Text', 'searchReplaceLinks')
.addToUi();
}
function searchReplaceLinks() {
var ui = DocumentApp.getUi();
var app = UiApp.createApplication()
.setWidth(250)
.setHeight(100)
.setTitle('Change Url text');
var form = app.createFormPanel();
var flow = app.createFlowPanel();
flow.add(app.createLabel("Find: "));
flow.add(app.createTextBox().setName("searchPattern"));
flow.add(app.createLabel("Replace: "));
flow.add(app.createTextBox().setName("replacement"));
var handler = app.createServerHandler('myClickHandler');
flow.add(app.createSubmitButton("Submit").addClickHandler(handler));
form.add(flow);
app.add(form);
ui.showDialog(app);
}
// ClickHandler to close dialog
function myClickHandler(e) {
var app = UiApp.getActiveApplication();
app.close();
return app;
}
function doPost(e) {
var numChanged = findAndReplaceLinks(e.parameter.searchPattern,e.parameter.replacement);
var ui = DocumentApp.getUi();
var app = UiApp.createApplication();
sidebarLinks(); // Update list
var result = DocumentApp.getUi().alert(
'Results',
"Changed "+numChanged+" urls.",
DocumentApp.getUi().ButtonSet.OK);
}
/**
* Shows a custom HTML user interface in a sidebar in the Google Docs editor.
*/
function sidebarLinks() {
var links = getAllLinks();
var sidebar = HtmlService
.createHtmlOutput()
.setTitle('URL Links')
.setWidth(350 /* pixels */);
// Display list of links, url only.
for (var l=0; l<links.length; l++) {
var link = links[l];
sidebar.append('<p>'+link.url);
}
DocumentApp.getUi().showSidebar(sidebar);
}
I offer another, shorter answer for your first question, concerning iterating through all links in a document's body. This instructive code returns a flat array of links in the current document's body, where each link is represented by an object with entries pointing to the text element (text), the paragraph element or list item element in which it's contained (paragraph), the offset index in the text where the link appears (startOffset) and the URL itself (url). Hopefully, you'll find it easy to suit it for your own needs.
It uses the getTextAttributeIndices() method rather than iterating over every character of the text, and is thus expected to perform much more quickly than previously written answers.
EDIT: Since originally posting this answer, I modified the function a couple of times. It now also (1) includes the endOffsetInclusive property for each link (note that it can be null for links that extend to the end of the text element - in this case one can use link.text.length-1 instead); (2) finds links in all sections of the document, not only the body, and (3) includes the section and isFirstPageSection properties to indicate where the link is located; (4) accepts the argument mergeAdjacent, which when set to true, will return only a single link entry for a continuous stretch of text linked to the same URL (which would be considered separate if, for instance, part of the text is styled differently than another part).
For the purpose of including links under all sections, a new utility function, iterateSections(), was introduced.
/**
* Returns a flat array of links which appear in the active document's body.
* Each link is represented by a simple Javascript object with the following
* keys:
* - "section": {ContainerElement} the document section in which the link is
* found.
* - "isFirstPageSection": {Boolean} whether the given section is a first-page
* header/footer section.
* - "paragraph": {ContainerElement} contains a reference to the Paragraph
* or ListItem element in which the link is found.
* - "text": the Text element in which the link is found.
* - "startOffset": {Number} the position (offset) in the link text begins.
* - "endOffsetInclusive": the position of the last character of the link
* text, or null if the link extends to the end of the text element.
* - "url": the URL of the link.
*
* #param {boolean} mergeAdjacent Whether consecutive links which carry
* different attributes (for any reason) should be returned as a single
* entry.
*
* #returns {Array} the aforementioned flat array of links.
*/
function getAllLinks(mergeAdjacent) {
var links = [];
var doc = DocumentApp.getActiveDocument();
iterateSections(doc, function(section, sectionIndex, isFirstPageSection) {
if (!("getParagraphs" in section)) {
// as we're using some undocumented API, adding this to avoid cryptic
// messages upon possible API changes.
throw new Error("An API change has caused this script to stop " +
"working.\n" +
"Section #" + sectionIndex + " of type " +
section.getType() + " has no .getParagraphs() method. " +
"Stopping script.");
}
section.getParagraphs().forEach(function(par) {
// skip empty paragraphs
if (par.getNumChildren() == 0) {
return;
}
// go over all text elements in paragraph / list-item
for (var el=par.getChild(0); el!=null; el=el.getNextSibling()) {
if (el.getType() != DocumentApp.ElementType.TEXT) {
continue;
}
// go over all styling segments in text element
var attributeIndices = el.getTextAttributeIndices();
var lastLink = null;
attributeIndices.forEach(function(startOffset, i, attributeIndices) {
var url = el.getLinkUrl(startOffset);
if (url != null) {
// we hit a link
var endOffsetInclusive = (i+1 < attributeIndices.length?
attributeIndices[i+1]-1 : null);
// check if this and the last found link are continuous
if (mergeAdjacent && lastLink != null && lastLink.url == url &&
lastLink.endOffsetInclusive == startOffset - 1) {
// this and the previous style segment are continuous
lastLink.endOffsetInclusive = endOffsetInclusive;
return;
}
lastLink = {
"section": section,
"isFirstPageSection": isFirstPageSection,
"paragraph": par,
"textEl": el,
"startOffset": startOffset,
"endOffsetInclusive": endOffsetInclusive,
"url": url
};
links.push(lastLink);
}
});
}
});
});
return links;
}
/**
* Calls the given function for each section of the document (body, header,
* etc.). Sections are children of the DocumentElement object.
*
* #param {Document} doc The Document object (such as the one obtained via
* a call to DocumentApp.getActiveDocument()) with the sections to iterate
* over.
* #param {Function} func A callback function which will be called, for each
* section, with the following arguments (in order):
* - {ContainerElement} section - the section element
* - {Number} sectionIndex - the child index of the section, such that
* doc.getBody().getParent().getChild(sectionIndex) == section.
* - {Boolean} isFirstPageSection - whether the section is a first-page
* header/footer section.
*/
function iterateSections(doc, func) {
// get the DocumentElement interface to iterate over all sections
// this bit is undocumented API
var docEl = doc.getBody().getParent();
var regularHeaderSectionIndex = (doc.getHeader() == null? -1 :
docEl.getChildIndex(doc.getHeader()));
var regularFooterSectionIndex = (doc.getFooter() == null? -1 :
docEl.getChildIndex(doc.getFooter()));
for (var i=0; i<docEl.getNumChildren(); ++i) {
var section = docEl.getChild(i);
var sectionType = section.getType();
var uniqueSectionName;
var isFirstPageSection = (
i != regularHeaderSectionIndex &&
i != regularFooterSectionIndex &&
(sectionType == DocumentApp.ElementType.HEADER_SECTION ||
sectionType == DocumentApp.ElementType.FOOTER_SECTION));
func(section, i, isFirstPageSection);
}
}
I was playing around and incorporated #Mogsdad's answer -- here's the really complicated version:
var _ = Underscorejs.load(); // loaded via http://googleappsdeveloper.blogspot.com/2012/11/using-open-source-libraries-in-apps.html, rolled my own
var ui = DocumentApp.getUi();
// #region --------------------- Utilities -----------------------------
var gDocsHelper = (function(P, un) {
// heavily based on answer https://stackoverflow.com/a/18731628/1037948
var updatedLinkText = function(link, offset) {
return function() { return 'Text: ' + link.getText().substring(offset,100) + ((link.getText().length-offset) > 100 ? '...' : ''); }
}
P.updateLink = function updateLink(link, oldText, newText, start, end) {
var oldLink = link.getLinkUrl(start);
if(0 > oldLink.indexOf(oldText)) return false;
var newLink = oldLink.replace(new RegExp(oldText, 'g'), newText);
link.setLinkUrl(start || 0, (end || oldLink.length), newLink);
log(true, "Updating Link: ", oldLink, newLink, start, end, updatedLinkText(link, start) );
return { old: oldLink, "new": newLink, getText: updatedLinkText(link, start) };
};
// moving this reused block out to 'private' fn
var updateLinkResult = function(text, oldText, newText, link, urls, sidebar, updateResult) {
// and may as well update the link while we're here
if(false !== (updateResult = P.updateLink(text, oldText, newText, link.start, link.end))) {
sidebar.append('<li>' + updateResult['old'] + ' → ' + updateResult['new'] + ' at ' + updateResult['getText']() + '</li>');
}
urls.push(link.url); // so multiple links get added to list
};
P.updateLinksMenu = function() {
// https://developers.google.com/apps-script/reference/base/prompt-response
var oldText = ui.prompt('Old link text to replace').getResponseText();
var newText = ui.prompt('New link text to replace with').getResponseText();
log('Replacing: ' + oldText + ', ' + newText);
var sidebar = gDocUiHelper.createSidebar('Update All Links', '<h3>Replacing</h3><p><code>' + oldText + '</code> → <code>' + newText + '</code></p><hr /><ol>');
// current doc available to script
var doc = DocumentApp.getActiveDocument().getBody();//.getActiveSection();
// Search until a link is found
var links = P.findAllElementsFor(doc, function(text) {
var i = -1, n = text.getText().length, link = false, url, urls = [], updateResult;
// note: the following only gets the FIRST link in the text -- while(i < n && !(url = text.getLinkUrl(i++)));
// scan the text element for links
while(++i < n) {
// getLinkUrl will continue to get a link while INSIDE the stupid link, so only do this once
if(url = text.getLinkUrl(i)) {
if(false === link) {
link = { start: i, end: -1, url: url };
// log(true, 'Type: ' + text.getType(), 'Link: ' + url, function() { return 'Text: ' + text.getText().substring(i,100) + ((n-i) > 100 ? '...' : '')});
}
else {
link.end = i; // keep updating the end position until we leave
}
}
// just left the link -- reset link tracking
else if(false !== link) {
// and may as well update the link while we're here
updateLinkResult(text, oldText, newText, link, urls, sidebar);
link = false; // reset "counter"
}
}
// once we've reached the end of the text, must also check to see if the last thing we found was a link
if(false !== link) updateLinkResult(text, oldText, newText, link, urls, sidebar);
return urls;
});
sidebar.append('</ol><p><strong>' + links.length + ' links reviewed</strong></p>');
gDocUiHelper.attachSidebar(sidebar);
log(links);
};
P.findAllElementsFor = function(el, test) {
// generic utility function to recursively find all elements; heavily based on https://stackoverflow.com/a/18731628/1037948
var results = [], searchResult = null, i, result;
// https://developers.google.com/apps-script/reference/document/body#findElement(ElementType)
while (searchResult = el.findElement(DocumentApp.ElementType.TEXT, searchResult)) {
var t = searchResult.getElement().editAsText(); // .asParagraph()
// check to add to list
if(test && (result = test(t))) {
if( _.isArray(result) ) results = results.concat(result); // could be big? http://jsperf.com/self-concatenation/
else results.push(result);
}
}
// recurse children if not plain text item
if(el.getType() !== DocumentApp.ElementType.TEXT) {
i = el.getNumChildren();
var result;
while(--i > 0) {
result = P.findAllElementsFor(el.getChild(i));
if(result && result.length > 0) results = results.concat(result);
}
}
return results;
};
return P;
})({});
// really? it can't handle object properties?
function gDocsUpdateLinksMenu() {
gDocsHelper.updateLinksMenu();
}
gDocUiHelper.addMenu('Zaus', [ ['Update links', 'gDocsUpdateLinksMenu'] ]);
// #endregion --------------------- Utilities -----------------------------
And I'm including the "extra" utility classes for creating menus, sidebars, etc below for completeness:
var log = function() {
// return false;
var args = Array.prototype.slice.call(arguments);
// allowing functions delegates execution so we can save some non-debug cycles if code left in?
if(args[0] === true) Logger.log(_.map(args, function(v) { return _.isFunction(v) ? v() : v; }).join('; '));
else
_.each(args, function(v) {
Logger.log(_.isFunction(v) ? v() : v);
});
}
// #region --------------------- Menu -----------------------------
var gDocUiHelper = (function(P, un) {
P.addMenuToSheet = function addMenu(spreadsheet, title, items) {
var menu = ui.createMenu(title);
// make sure menu items are correct format
_.each(items, function(v,k) {
var err = [];
// provided in format [ [name, fn],... ] instead
if( _.isArray(v) ) {
if ( v.length === 2 ) {
menu.addItem(v[0], v[1]);
}
else {
err.push('Menu item ' + k + ' missing name or function: ' + v.join(';'))
}
}
else {
if( !v.name ) err.push('Menu item ' + k + ' lacks name');
if( !v.functionName ) err.push('Menu item ' + k + ' lacks function');
if(!err.length) menu.addItem(v.name, v.functionName);
}
if(err.length) {
log(err);
ui.alert(err.join('; '));
}
});
menu.addToUi();
};
// list of things to hook into
var initializers = {};
P.addMenu = function(menuTitle, menuItems) {
if(initializers[menuTitle] === un) {
initializers[menuTitle] = [];
}
initializers[menuTitle] = initializers[menuTitle].concat(menuItems);
};
P.createSidebar = function(title, content, options) {
var sidebar = HtmlService
.createHtmlOutput()
.setTitle(title)
.setWidth( (options && options.width) ? width : 350 /* pixels */);
sidebar.append(content);
if(options && options.on) DocumentApp.getUi().showSidebar(sidebar);
// else { sidebar.attach = function() { DocumentApp.getUi().showSidebar(this); }; } // should really attach to prototype...
return sidebar;
};
P.attachSidebar = function(sidebar) {
DocumentApp.getUi().showSidebar(sidebar);
};
P.onOpen = function() {
var spreadsheet = SpreadsheetApp.getActive();
log(initializers);
_.each(initializers, function(v,k) {
P.addMenuToSheet(spreadsheet, k, v);
});
};
return P;
})({});
// #endregion --------------------- Menu -----------------------------
/**
* A special function that runs when the spreadsheet is open, used to add a
* custom menu to the spreadsheet.
*/
function onOpen() {
gDocUiHelper.onOpen();
}
Had some trouble getting Mogsdad's solution to work. Specifically it misses links which end their parent element so there isn't a trailing non-link character to terminate it. I've implemented something which addresses this and returns a standard range element. Sharing here incase someone finds it useful.
function getAllLinks(element) {
var rangeBuilder = DocumentApp.getActiveDocument().newRange();
// Parse the text iteratively to find the start and end indices for each link
if (element.getType() === DocumentApp.ElementType.TEXT) {
var links = [];
var string = element.getText();
var previousUrl = null; // The URL of the previous character
var currentLink = null; // The latest link being built
for (var charIndex = 0; charIndex < string.length; charIndex++) {
var currentUrl = element.getLinkUrl(charIndex);
// New URL means create a new link
if (currentUrl !== null && previousUrl !== currentUrl) {
if (currentLink !== null) links.push(currentLink);
currentLink = {};
currentLink.url = String(currentUrl);
currentLink.startOffset = charIndex;
}
// In a URL means extend the end of the current link
if (currentUrl !== null) {
currentLink.endOffsetInclusive = charIndex;
}
// Not in a URL means close and push the link if ready
if (currentUrl === null) {
if (currentLink !== null) links.push(currentLink);
currentLink = null;
}
// End the loop and go again
previousUrl = currentUrl;
}
// Handle the end case when final character is a link
if (currentLink !== null) links.push(currentLink);
// Convert the links into a range before returning
links.forEach(function(link) {
rangeBuilder.addElement(element, link.startOffset, link.endOffsetInclusive);
});
}
// If not a text element then recursively get links from child elements
else if (element.getNumChildren) {
for (var i = 0; i < element.getNumChildren(); i++) {
rangeBuilder.addRange(getAllLinks(element.getChild(i)));
}
}
return rangeBuilder.build();
}
You are right ... search and replace is not applicable here.
Use setLinkUrl() https://developers.google.com/apps-script/reference/document/container-element#setLinkUrl(String)
Basically you have to iterate through the elements recursively (elements can contain elements) and for each
use getLinkUrl() to get the oldText
if not null , setLinkUrl(newText) .... leaves displayed text unchanged
This Excel macro lists the links from a Word doc. You'd need to copy your data into a Word doc first.
Sub getLinks()
Dim wApp As Word.Application, wDoc As Word.Document
Dim i As Integer, r As Range
Const filePath = "C:\test\test.docx"
Set wApp = CreateObject("Word.Application")
'wApp.Visible = True
Set wDoc = wApp.Documents.Open(filePath)
Set r = Range("A1")
For i = 1 To wDoc.Hyperlinks.Count
r = wDoc.Hyperlinks(i).Address
Set r = r.Offset(1, 0)
Next i
wApp.Quit
Set wDoc = Nothing
Set wApp = Nothing
End Sub
Here's a quick and dirty way to accomplish the same goal with no scripting:
From Google Docs, save the document in RTF format.
In your editor of choice, edit the links in the RTF file (in my case, I wanted to modify all the hyperlinks, so I used Emacs and regexp-replace). Save the file when you're done.
Create a fresh, new Google Doc, and from the menu, select File>Open and open the RTF file. Docs will convert your edited RTF file back into a proper Google Doc, restoring all formatting.
Google Docs' RTF format is pretty complete--I haven't noticed any loss of fidelity in making the round trip, and it has the advantage of fully exposing all the hyperlinks, formatting, and everything else about the document in a form that's easy to edit and to apply regex tools to.

Get All Links in a Document

Given a "normal document" in Google Docs/Drive (e.g. paragraphs, lists, tables) which contains external links scattered throughout the content, how do you compile a list of links present using Google Apps Script?
Specifically, I want to update all broken links in the document by searching for oldText in each url and replace it with newText in each url, but not the text.
I don't think the replacing text section of the Dev Documentation is what I need -- do I need to scan every element of the doc? Can I just editAsText and use an html regex? Examples would be appreciated.
This is only mostly painful! Code is available as part of a gist.
Yeah, I can't spell.
getAllLinks
Here's a utility function that scans the document for all LinkUrls, returning them in an array.
/**
* Get an array of all LinkUrls in the document. The function is
* recursive, and if no element is provided, it will default to
* the active document's Body element.
*
* #param {Element} element The document element to operate on.
* .
* #returns {Array} Array of objects, vis
* {element,
* startOffset,
* endOffsetInclusive,
* url}
*/
function getAllLinks(element) {
var links = [];
element = element || DocumentApp.getActiveDocument().getBody();
if (element.getType() === DocumentApp.ElementType.TEXT) {
var textObj = element.editAsText();
var text = element.getText();
var inUrl = false;
for (var ch=0; ch < text.length; ch++) {
var url = textObj.getLinkUrl(ch);
if (url != null) {
if (!inUrl) {
// We are now!
inUrl = true;
var curUrl = {};
curUrl.element = element;
curUrl.url = String( url ); // grab a copy
curUrl.startOffset = ch;
}
else {
curUrl.endOffsetInclusive = ch;
}
}
else {
if (inUrl) {
// Not any more, we're not.
inUrl = false;
links.push(curUrl); // add to links
curUrl = {};
}
}
}
if (inUrl) {
// in case the link ends on the same char that the element does
links.push(curUrl);
}
}
else {
var numChildren = element.getNumChildren();
for (var i=0; i<numChildren; i++) {
links = links.concat(getAllLinks(element.getChild(i)));
}
}
return links;
}
findAndReplaceLinks
This utility builds on getAllLinks to do a find & replace function.
/**
* Replace all or part of UrlLinks in the document.
*
* #param {String} searchPattern the regex pattern to search for
* #param {String} replacement the text to use as replacement
*
* #returns {Number} number of Urls changed
*/
function findAndReplaceLinks(searchPattern,replacement) {
var links = getAllLinks();
var numChanged = 0;
for (var l=0; l<links.length; l++) {
var link = links[l];
if (link.url.match(searchPattern)) {
// This link needs to be changed
var newUrl = link.url.replace(searchPattern,replacement);
link.element.setLinkUrl(link.startOffset, link.endOffsetInclusive, newUrl);
numChanged++
}
}
return numChanged;
}
Demo UI
To demonstrate the use of these utilities, here are a couple of UI extensions:
function onOpen() {
// Add a menu with some items, some separators, and a sub-menu.
DocumentApp.getUi().createMenu('Utils')
.addItem('List Links', 'sidebarLinks')
.addItem('Replace Link Text', 'searchReplaceLinks')
.addToUi();
}
function searchReplaceLinks() {
var ui = DocumentApp.getUi();
var app = UiApp.createApplication()
.setWidth(250)
.setHeight(100)
.setTitle('Change Url text');
var form = app.createFormPanel();
var flow = app.createFlowPanel();
flow.add(app.createLabel("Find: "));
flow.add(app.createTextBox().setName("searchPattern"));
flow.add(app.createLabel("Replace: "));
flow.add(app.createTextBox().setName("replacement"));
var handler = app.createServerHandler('myClickHandler');
flow.add(app.createSubmitButton("Submit").addClickHandler(handler));
form.add(flow);
app.add(form);
ui.showDialog(app);
}
// ClickHandler to close dialog
function myClickHandler(e) {
var app = UiApp.getActiveApplication();
app.close();
return app;
}
function doPost(e) {
var numChanged = findAndReplaceLinks(e.parameter.searchPattern,e.parameter.replacement);
var ui = DocumentApp.getUi();
var app = UiApp.createApplication();
sidebarLinks(); // Update list
var result = DocumentApp.getUi().alert(
'Results',
"Changed "+numChanged+" urls.",
DocumentApp.getUi().ButtonSet.OK);
}
/**
* Shows a custom HTML user interface in a sidebar in the Google Docs editor.
*/
function sidebarLinks() {
var links = getAllLinks();
var sidebar = HtmlService
.createHtmlOutput()
.setTitle('URL Links')
.setWidth(350 /* pixels */);
// Display list of links, url only.
for (var l=0; l<links.length; l++) {
var link = links[l];
sidebar.append('<p>'+link.url);
}
DocumentApp.getUi().showSidebar(sidebar);
}
I offer another, shorter answer for your first question, concerning iterating through all links in a document's body. This instructive code returns a flat array of links in the current document's body, where each link is represented by an object with entries pointing to the text element (text), the paragraph element or list item element in which it's contained (paragraph), the offset index in the text where the link appears (startOffset) and the URL itself (url). Hopefully, you'll find it easy to suit it for your own needs.
It uses the getTextAttributeIndices() method rather than iterating over every character of the text, and is thus expected to perform much more quickly than previously written answers.
EDIT: Since originally posting this answer, I modified the function a couple of times. It now also (1) includes the endOffsetInclusive property for each link (note that it can be null for links that extend to the end of the text element - in this case one can use link.text.length-1 instead); (2) finds links in all sections of the document, not only the body, and (3) includes the section and isFirstPageSection properties to indicate where the link is located; (4) accepts the argument mergeAdjacent, which when set to true, will return only a single link entry for a continuous stretch of text linked to the same URL (which would be considered separate if, for instance, part of the text is styled differently than another part).
For the purpose of including links under all sections, a new utility function, iterateSections(), was introduced.
/**
* Returns a flat array of links which appear in the active document's body.
* Each link is represented by a simple Javascript object with the following
* keys:
* - "section": {ContainerElement} the document section in which the link is
* found.
* - "isFirstPageSection": {Boolean} whether the given section is a first-page
* header/footer section.
* - "paragraph": {ContainerElement} contains a reference to the Paragraph
* or ListItem element in which the link is found.
* - "text": the Text element in which the link is found.
* - "startOffset": {Number} the position (offset) in the link text begins.
* - "endOffsetInclusive": the position of the last character of the link
* text, or null if the link extends to the end of the text element.
* - "url": the URL of the link.
*
* #param {boolean} mergeAdjacent Whether consecutive links which carry
* different attributes (for any reason) should be returned as a single
* entry.
*
* #returns {Array} the aforementioned flat array of links.
*/
function getAllLinks(mergeAdjacent) {
var links = [];
var doc = DocumentApp.getActiveDocument();
iterateSections(doc, function(section, sectionIndex, isFirstPageSection) {
if (!("getParagraphs" in section)) {
// as we're using some undocumented API, adding this to avoid cryptic
// messages upon possible API changes.
throw new Error("An API change has caused this script to stop " +
"working.\n" +
"Section #" + sectionIndex + " of type " +
section.getType() + " has no .getParagraphs() method. " +
"Stopping script.");
}
section.getParagraphs().forEach(function(par) {
// skip empty paragraphs
if (par.getNumChildren() == 0) {
return;
}
// go over all text elements in paragraph / list-item
for (var el=par.getChild(0); el!=null; el=el.getNextSibling()) {
if (el.getType() != DocumentApp.ElementType.TEXT) {
continue;
}
// go over all styling segments in text element
var attributeIndices = el.getTextAttributeIndices();
var lastLink = null;
attributeIndices.forEach(function(startOffset, i, attributeIndices) {
var url = el.getLinkUrl(startOffset);
if (url != null) {
// we hit a link
var endOffsetInclusive = (i+1 < attributeIndices.length?
attributeIndices[i+1]-1 : null);
// check if this and the last found link are continuous
if (mergeAdjacent && lastLink != null && lastLink.url == url &&
lastLink.endOffsetInclusive == startOffset - 1) {
// this and the previous style segment are continuous
lastLink.endOffsetInclusive = endOffsetInclusive;
return;
}
lastLink = {
"section": section,
"isFirstPageSection": isFirstPageSection,
"paragraph": par,
"textEl": el,
"startOffset": startOffset,
"endOffsetInclusive": endOffsetInclusive,
"url": url
};
links.push(lastLink);
}
});
}
});
});
return links;
}
/**
* Calls the given function for each section of the document (body, header,
* etc.). Sections are children of the DocumentElement object.
*
* #param {Document} doc The Document object (such as the one obtained via
* a call to DocumentApp.getActiveDocument()) with the sections to iterate
* over.
* #param {Function} func A callback function which will be called, for each
* section, with the following arguments (in order):
* - {ContainerElement} section - the section element
* - {Number} sectionIndex - the child index of the section, such that
* doc.getBody().getParent().getChild(sectionIndex) == section.
* - {Boolean} isFirstPageSection - whether the section is a first-page
* header/footer section.
*/
function iterateSections(doc, func) {
// get the DocumentElement interface to iterate over all sections
// this bit is undocumented API
var docEl = doc.getBody().getParent();
var regularHeaderSectionIndex = (doc.getHeader() == null? -1 :
docEl.getChildIndex(doc.getHeader()));
var regularFooterSectionIndex = (doc.getFooter() == null? -1 :
docEl.getChildIndex(doc.getFooter()));
for (var i=0; i<docEl.getNumChildren(); ++i) {
var section = docEl.getChild(i);
var sectionType = section.getType();
var uniqueSectionName;
var isFirstPageSection = (
i != regularHeaderSectionIndex &&
i != regularFooterSectionIndex &&
(sectionType == DocumentApp.ElementType.HEADER_SECTION ||
sectionType == DocumentApp.ElementType.FOOTER_SECTION));
func(section, i, isFirstPageSection);
}
}
I was playing around and incorporated #Mogsdad's answer -- here's the really complicated version:
var _ = Underscorejs.load(); // loaded via http://googleappsdeveloper.blogspot.com/2012/11/using-open-source-libraries-in-apps.html, rolled my own
var ui = DocumentApp.getUi();
// #region --------------------- Utilities -----------------------------
var gDocsHelper = (function(P, un) {
// heavily based on answer https://stackoverflow.com/a/18731628/1037948
var updatedLinkText = function(link, offset) {
return function() { return 'Text: ' + link.getText().substring(offset,100) + ((link.getText().length-offset) > 100 ? '...' : ''); }
}
P.updateLink = function updateLink(link, oldText, newText, start, end) {
var oldLink = link.getLinkUrl(start);
if(0 > oldLink.indexOf(oldText)) return false;
var newLink = oldLink.replace(new RegExp(oldText, 'g'), newText);
link.setLinkUrl(start || 0, (end || oldLink.length), newLink);
log(true, "Updating Link: ", oldLink, newLink, start, end, updatedLinkText(link, start) );
return { old: oldLink, "new": newLink, getText: updatedLinkText(link, start) };
};
// moving this reused block out to 'private' fn
var updateLinkResult = function(text, oldText, newText, link, urls, sidebar, updateResult) {
// and may as well update the link while we're here
if(false !== (updateResult = P.updateLink(text, oldText, newText, link.start, link.end))) {
sidebar.append('<li>' + updateResult['old'] + ' → ' + updateResult['new'] + ' at ' + updateResult['getText']() + '</li>');
}
urls.push(link.url); // so multiple links get added to list
};
P.updateLinksMenu = function() {
// https://developers.google.com/apps-script/reference/base/prompt-response
var oldText = ui.prompt('Old link text to replace').getResponseText();
var newText = ui.prompt('New link text to replace with').getResponseText();
log('Replacing: ' + oldText + ', ' + newText);
var sidebar = gDocUiHelper.createSidebar('Update All Links', '<h3>Replacing</h3><p><code>' + oldText + '</code> → <code>' + newText + '</code></p><hr /><ol>');
// current doc available to script
var doc = DocumentApp.getActiveDocument().getBody();//.getActiveSection();
// Search until a link is found
var links = P.findAllElementsFor(doc, function(text) {
var i = -1, n = text.getText().length, link = false, url, urls = [], updateResult;
// note: the following only gets the FIRST link in the text -- while(i < n && !(url = text.getLinkUrl(i++)));
// scan the text element for links
while(++i < n) {
// getLinkUrl will continue to get a link while INSIDE the stupid link, so only do this once
if(url = text.getLinkUrl(i)) {
if(false === link) {
link = { start: i, end: -1, url: url };
// log(true, 'Type: ' + text.getType(), 'Link: ' + url, function() { return 'Text: ' + text.getText().substring(i,100) + ((n-i) > 100 ? '...' : '')});
}
else {
link.end = i; // keep updating the end position until we leave
}
}
// just left the link -- reset link tracking
else if(false !== link) {
// and may as well update the link while we're here
updateLinkResult(text, oldText, newText, link, urls, sidebar);
link = false; // reset "counter"
}
}
// once we've reached the end of the text, must also check to see if the last thing we found was a link
if(false !== link) updateLinkResult(text, oldText, newText, link, urls, sidebar);
return urls;
});
sidebar.append('</ol><p><strong>' + links.length + ' links reviewed</strong></p>');
gDocUiHelper.attachSidebar(sidebar);
log(links);
};
P.findAllElementsFor = function(el, test) {
// generic utility function to recursively find all elements; heavily based on https://stackoverflow.com/a/18731628/1037948
var results = [], searchResult = null, i, result;
// https://developers.google.com/apps-script/reference/document/body#findElement(ElementType)
while (searchResult = el.findElement(DocumentApp.ElementType.TEXT, searchResult)) {
var t = searchResult.getElement().editAsText(); // .asParagraph()
// check to add to list
if(test && (result = test(t))) {
if( _.isArray(result) ) results = results.concat(result); // could be big? http://jsperf.com/self-concatenation/
else results.push(result);
}
}
// recurse children if not plain text item
if(el.getType() !== DocumentApp.ElementType.TEXT) {
i = el.getNumChildren();
var result;
while(--i > 0) {
result = P.findAllElementsFor(el.getChild(i));
if(result && result.length > 0) results = results.concat(result);
}
}
return results;
};
return P;
})({});
// really? it can't handle object properties?
function gDocsUpdateLinksMenu() {
gDocsHelper.updateLinksMenu();
}
gDocUiHelper.addMenu('Zaus', [ ['Update links', 'gDocsUpdateLinksMenu'] ]);
// #endregion --------------------- Utilities -----------------------------
And I'm including the "extra" utility classes for creating menus, sidebars, etc below for completeness:
var log = function() {
// return false;
var args = Array.prototype.slice.call(arguments);
// allowing functions delegates execution so we can save some non-debug cycles if code left in?
if(args[0] === true) Logger.log(_.map(args, function(v) { return _.isFunction(v) ? v() : v; }).join('; '));
else
_.each(args, function(v) {
Logger.log(_.isFunction(v) ? v() : v);
});
}
// #region --------------------- Menu -----------------------------
var gDocUiHelper = (function(P, un) {
P.addMenuToSheet = function addMenu(spreadsheet, title, items) {
var menu = ui.createMenu(title);
// make sure menu items are correct format
_.each(items, function(v,k) {
var err = [];
// provided in format [ [name, fn],... ] instead
if( _.isArray(v) ) {
if ( v.length === 2 ) {
menu.addItem(v[0], v[1]);
}
else {
err.push('Menu item ' + k + ' missing name or function: ' + v.join(';'))
}
}
else {
if( !v.name ) err.push('Menu item ' + k + ' lacks name');
if( !v.functionName ) err.push('Menu item ' + k + ' lacks function');
if(!err.length) menu.addItem(v.name, v.functionName);
}
if(err.length) {
log(err);
ui.alert(err.join('; '));
}
});
menu.addToUi();
};
// list of things to hook into
var initializers = {};
P.addMenu = function(menuTitle, menuItems) {
if(initializers[menuTitle] === un) {
initializers[menuTitle] = [];
}
initializers[menuTitle] = initializers[menuTitle].concat(menuItems);
};
P.createSidebar = function(title, content, options) {
var sidebar = HtmlService
.createHtmlOutput()
.setTitle(title)
.setWidth( (options && options.width) ? width : 350 /* pixels */);
sidebar.append(content);
if(options && options.on) DocumentApp.getUi().showSidebar(sidebar);
// else { sidebar.attach = function() { DocumentApp.getUi().showSidebar(this); }; } // should really attach to prototype...
return sidebar;
};
P.attachSidebar = function(sidebar) {
DocumentApp.getUi().showSidebar(sidebar);
};
P.onOpen = function() {
var spreadsheet = SpreadsheetApp.getActive();
log(initializers);
_.each(initializers, function(v,k) {
P.addMenuToSheet(spreadsheet, k, v);
});
};
return P;
})({});
// #endregion --------------------- Menu -----------------------------
/**
* A special function that runs when the spreadsheet is open, used to add a
* custom menu to the spreadsheet.
*/
function onOpen() {
gDocUiHelper.onOpen();
}
Had some trouble getting Mogsdad's solution to work. Specifically it misses links which end their parent element so there isn't a trailing non-link character to terminate it. I've implemented something which addresses this and returns a standard range element. Sharing here incase someone finds it useful.
function getAllLinks(element) {
var rangeBuilder = DocumentApp.getActiveDocument().newRange();
// Parse the text iteratively to find the start and end indices for each link
if (element.getType() === DocumentApp.ElementType.TEXT) {
var links = [];
var string = element.getText();
var previousUrl = null; // The URL of the previous character
var currentLink = null; // The latest link being built
for (var charIndex = 0; charIndex < string.length; charIndex++) {
var currentUrl = element.getLinkUrl(charIndex);
// New URL means create a new link
if (currentUrl !== null && previousUrl !== currentUrl) {
if (currentLink !== null) links.push(currentLink);
currentLink = {};
currentLink.url = String(currentUrl);
currentLink.startOffset = charIndex;
}
// In a URL means extend the end of the current link
if (currentUrl !== null) {
currentLink.endOffsetInclusive = charIndex;
}
// Not in a URL means close and push the link if ready
if (currentUrl === null) {
if (currentLink !== null) links.push(currentLink);
currentLink = null;
}
// End the loop and go again
previousUrl = currentUrl;
}
// Handle the end case when final character is a link
if (currentLink !== null) links.push(currentLink);
// Convert the links into a range before returning
links.forEach(function(link) {
rangeBuilder.addElement(element, link.startOffset, link.endOffsetInclusive);
});
}
// If not a text element then recursively get links from child elements
else if (element.getNumChildren) {
for (var i = 0; i < element.getNumChildren(); i++) {
rangeBuilder.addRange(getAllLinks(element.getChild(i)));
}
}
return rangeBuilder.build();
}
You are right ... search and replace is not applicable here.
Use setLinkUrl() https://developers.google.com/apps-script/reference/document/container-element#setLinkUrl(String)
Basically you have to iterate through the elements recursively (elements can contain elements) and for each
use getLinkUrl() to get the oldText
if not null , setLinkUrl(newText) .... leaves displayed text unchanged
This Excel macro lists the links from a Word doc. You'd need to copy your data into a Word doc first.
Sub getLinks()
Dim wApp As Word.Application, wDoc As Word.Document
Dim i As Integer, r As Range
Const filePath = "C:\test\test.docx"
Set wApp = CreateObject("Word.Application")
'wApp.Visible = True
Set wDoc = wApp.Documents.Open(filePath)
Set r = Range("A1")
For i = 1 To wDoc.Hyperlinks.Count
r = wDoc.Hyperlinks(i).Address
Set r = r.Offset(1, 0)
Next i
wApp.Quit
Set wDoc = Nothing
Set wApp = Nothing
End Sub
Here's a quick and dirty way to accomplish the same goal with no scripting:
From Google Docs, save the document in RTF format.
In your editor of choice, edit the links in the RTF file (in my case, I wanted to modify all the hyperlinks, so I used Emacs and regexp-replace). Save the file when you're done.
Create a fresh, new Google Doc, and from the menu, select File>Open and open the RTF file. Docs will convert your edited RTF file back into a proper Google Doc, restoring all formatting.
Google Docs' RTF format is pretty complete--I haven't noticed any loss of fidelity in making the round trip, and it has the advantage of fully exposing all the hyperlinks, formatting, and everything else about the document in a form that's easy to edit and to apply regex tools to.

Problem in getting right result for select box

I am using jQuery as:
$(document).ready(function(){
test("price");
alert("hi");
$("#item2").change(function()
{
sort= $("#item2").val();
test(sort);
});
});
Function test() is some JavaScript function, my problem is when page loads function calls by "price" parameter. Now when I select some item from select box function test() is called using sort parameter (verify by alert box). but I am not getting the correct result. I mean when I select option from select box than also my result of test() is as with "price" , I suppose it might be the problem because of jQuery's $(document).ready(function(){,. test() function make some html code based on the parameter and show it on the web page.
Please suggest me what can be the solution
EDIT:
function test() is :
function test(sort)
{
<%
Ampliflex ms = Ampliflex.getInstance();
String solrIP = ms.getSolrIP();
String solrPort = ms.getSolrPort();
String rows = ms.getSearchResultCount();
%>
solrIP='<%= solrIP %>'; // get Solr IP address
solrPort='<%= solrPort %>'; // get Solr Port number
rows='<%= rows %>'; // get number of results to return
solrURL="http://"+solrIP+":"+solrPort;
var query="${searchStr}"; // get the query string entered by ECommerce user
query=query.replace(/[^a-zA-Z 0-9*?:.+-^""_]+/g,''); // Remove special characters
query=query.replace(/\*+/g,'*'); // Replace multiple occurrence of "*" with single "*"
var newquery=query;
if(parseInt(query)==NaN)
{
var lowerCaseQuery=query.toLowerCase();
newquery=lowerCaseQuery;
}
else{
var lowerCaseQuery=query;
}
// sort= document.getElementById("item2").value;
$.getJSON(solrURL+"/solr/db/select/?qt=dismax&wt=json&&start=0&rows="+rows+"&q="+lowerCaseQuery+"&hl=true&hl.fl=text&hl.usePhraseHighlighter=true&sort="+sort+" desc&json.wrf=?", function(result){
var highlight = new Array(result.response.numFound);
$.each(result.highlighting, function(i, hitem){
var rg = /<em>(.*?)<\/em>/g;
var res = new Array();
var match = rg.exec(hitem.text[0]);
while(match != null){
res.push(match[1])
match = rg.exec(hitem.text[0]);
}
highlight[i]=res[0]
for (j=1 ;j<res.length;j++)
{
highlight[i]= highlight[i]+","+res[j];
}
});
var html="<table><tr>"
var count=0;
var alt="NoImage";
var size="3pt";
var id;
var flag=1; // Flag for error messages
border="1";
// If no search results
if(result.response.numFound==0)
{
var msg= "<hr /><font size="+size+" >We're sorry, we found no results for <b>"+document.getElementById("queryString").value+"</font><hr />";
}
else
{
/* var msg= "<hr /><font size="+size+" >Total Results Found <b> "+ result.response.numFound+"</b> for "+"<b>"+document.getElementById("queryString").value+"</b> keyword</font><hr /> ";*/
if (newquery==lowerCaseQuery)
{
var msg= "<hr /><font size="+size+" >Total Results Found <b> "+ result.response.numFound+"</b> for "+"<b>"+query+"</b> </font><hr /> ";
}
else
{
var msg= "<hr /><font size="+size+" >There were no exact matches for <b> "+ query+"</b> , so we searched automatically for "+"<b>"+query+"</b> and yielded "+result.response.numFound+" result(s)</font><hr /> ";
}
// Parse solr response and display it on web page
$.each(result.response.docs, function(i,item){
var word = new Array();
word=highlight[item["UID_PK"]].split(",");
var result="";
var j=0;
for (j=0 ;j<=item.text.length;j++)
{
result = result+item.text[j]+"<br>";
}
for (j=0 ;j<word.length;j++)
{
result=result.replace(word[j],'<em>' + word[j] + '</em>');
}
html+="<td><table>";
var src=item.image;
id="id";
if(src!= null && src!= ""){
html+="<p><tr><td><br>"+"<img id= "+id+ " src="+src+ " border="+border+ " /></td></tr>";
count=count+1;
html += "<tr><td><b>ImagePath</b> "+ item.image+"</td></tr>";
}
// If not insert a default image
else
{
src="images/products/default.jpg";
html+="<tr><td><br><p>"+"<img id= "+id+ " src="+src+ " border="+border+" /></td></tr>";
count=count+1;
html += "<tr><td><b>ImagePath</b> "+"No image path found" +"</td></tr>";
}
html += "<tr><td>UID_PK: "+ item.UID_PK+"</td></tr>";
html += "<tr><td>Name: "+ item.name+"</td></tr>";
html+="<tr><td><b>Price: $"+item.price+"</td></tr>";
html+="<tr><td> "+result+"<br></td></tr>";
html+="</p></table></td>"
if(count%3==0)
{
html+="</tr>"
html+="<tr>"
}
});
html+="</table>"
}
$("#text_container").html(msg);
$("#result").append(html);
}
});
});
}
Your question isn't particularly clear, but your alert code only fires when the document is ready - it is not inside the "change" event function.
Try using the following to see what value is being returned when you change the select box:
$(document).ready(function(){
test("price");
$("#item2").change(function()
{
sort= $("#item2").val();
alert(sort);
test(sort);
});
});
When changing the select box, you should get an alert with the value you have chosen, which will help you understand why the test() function isn't functioning as you expect.
If you amend your question to include the HTML of the select box and the test() function itself I will amend my answer to help.
The JQuery code that you have posted is working fine. Demo: http://jsfiddle.net/DtnUr/
We need more details to figure out the issue, such as your HTML code and JS functions.