How do I isolate Drive file ID from URL using regex? - google-apps-script

I am trying to get the file name for a file I only have the Google Drive URL for. I am currently using a Google Sheets regexextract function to extract the file ID from the URL, and then using the script to find the file based on the ID, but would prefer to do the regex within the script.
I've looked through various posts on here to try to figure it out with no luck. Hopefully someone can spell out what I have done wrong in trying to incorporate the regex.
var sheet = SpreadsheetApp.getActive().getSheetByName("Test1");
var link1 = sheet.getRange("N2:N").getValues();
var regex_ids = new RegExp("/file/d/[a-zA-Z0-9]g");
var links = regex_ids.exec(link1);
var filenames = [];
for (var i = 0; i < links.length; i++) {
var url = links[i][0];
if (url != "") {
var filename = DriveApp.getFileById(links[i][0]).getName();
filenames.push([filename]);
}
}
var startRow = 2; // print in row 2 since row 1 is the header row
var fileNameColumn = 18; // Column B = column 2
var destination = sheet.getRange(startRow, fileNameColumn, filenames.length, filenames[0].length);
destination.setValues(filenames);
}
Currently I am stuck with error "TypeError: Cannot read property "length" from null. (line 7, file "Code")" because the regex is not configured correctly.

Issues/Solution:
Invalid Syntax:
Regexp() accepts a regex string and a flag string as arguments, while the code provides a concatenated regex flag string.
exec() accepts a string argument, while the code provides a 2D array.
Insufficient regex:
Filename IDs also contain underscores _.
Regex should capture() only the ID. The regex provided also captures /file/d
ID contains more than 1 character. Use +
Snippet:
var link1 = sheet.getRange("N2:N" + sheet.getLastRow()).getValues();//modified
var regex_ids = /\/file\/d\/([^\/]+)/;//or new RegExp("/file/d/([a-zA-Z0-9_]+)","g");() =capture ids
//var links = regex_ids.exec(link1);
var filenames = [];
for (var i = 0; i < link1.length; i++) {//modified;loop through values2D array
var url = link1[i][0];//modified;
var preId = regex_ids.exec(url);//added;
var id;
if (preId && (id=preId[1])) {//modified; [1] = first capture group
var filename = DriveApp.getFileById(id).getName();//modified
filenames.push([filename]);
} else {
filenames.push([""]);
}
}
References:
Regex guide
Regexp#exec

Related

XmlService.parse() not able to handle HTML tables

I am looking for help from this community regarding the below issue.
// I am searching my Gmail inbox for a specific email
function getWeeklyEmail() {
var emailFilter = 'newer_than:7d AND label:inbox AND "Report: Launchpad filter"';
var threads = GmailApp.search(emailFilter, 0, 5);
var messages=[];
threads.forEach(function(threads)
{
messages.push(threads.getMessages()[0]);
});
return messages;
}
// Trying to parse the HTML table contained within the email
function getParsedMsg() {
var messages = getWeeklyEmail();
var msgbody = messages[0].getBody();
var doc = XmlService.parse(msgbody);
var html = doc.getRootElement();
var tables = doc.getDescendants();
var templ = HtmlService.createTemplateFromFile('Messages1');
templ.tables = [];
return templ.evaluate();
}
The debugger crashes when I try to step over the XmlService.parse function. The msgbody of the email contains both text and HTML formatted table. I am getting the following error: TypeError: Cannot read property 'getBody' of undefined (line 19, file "Code")
If I remove the getParsedMsg function and instead just display the content of the email, I get the email body along with the element tags etc in html format.
Workaround
Hi ! The issue you are experiencing is due to (as you previously mentioned) XmlService only recognising canonical XML rather than HTML. One possible workaround to solve this issue is to search in the string you are obtaining with getBody() for your desired tags.
In your case your main issue is var doc = XmlService.parse(msgbody);. To solve it you could iterate through the whole string looking for the table tags you need using Javascript search method. Here is an example piece of code retrieving an email with a single table:
function getWeeklyEmail() {
var emailFilter = 'newer_than:7d AND label:inbox AND "Report: Launchpad filter"';
var threads = GmailApp.search(emailFilter, 0, 5);
var messages=[];
threads.forEach(function(threads)
{
messages.push(threads.getMessages()[0]);
});
return messages;
}
// Trying to parse the HTML table contained within the email
function getParsedMsg() {
var messages = getWeeklyEmail();
var msgbody = messages[0].getBody();
var indexOrigin = msgbody.search('<table');
var indexEnd = msgbody.search('</table');
// Get what is in between those indexes of the string.
// I am adding 8 as it indexEnd only gets the first index of </table
// i.e the one before <
var Table = msgbody.substring(indexOrigin,indexEnd+8);
Logger.log(Table);
}
If you are looking for more than one table in your message, you can change getParsedMsg to the following:
function getParsedMsg() {
// If you are not sure about how many you would be expecting, use an approximate number
var totalTables = 2;
var messages = getWeeklyEmail();
var msgbody = messages[0].getBody();
var indexOrigin = msgbody.indexOf('<table');
var indexEnd = msgbody.indexOf('</table');
var Table = []
for(i=0;i<totalTables;i++){
// go over each stable and store their strings in elements of an array
var start = msgbody.indexOf('<table', (indexOrigin + i))
var end = msgbody.indexOf('</table', (indexEnd + i))
Table.push(msgbody.substring(start,end+8));
}
Logger.log(Table);
}
This will let you store each table in an element of an array. If you want to use these you would just need to retrieve the elements of this array and use them accordingly (for exaple to use them as HTML tables.
I hope this has helped you. Let me know if you need anything else or if you did not understood something. :)

Not Able to Scrape table in Google Sheets

With the help of this SO questionsI am trying to scrape the following website. I would like the two teams and the time. For example, the first entry would be Chicago | Miami | 12:30 PM, and the last entry would be Colorado | Arizona | 10:10 PM. My code is as follows
function espn_schedule() {
var url = "http://www.espn.com/mlb/schedule/_/date/20180329";
var content = UrlFetchApp.fetch(url).getContentText();
var scraped = Parser.data(content).from('class="schedule has-team-logos align-left"').to('</tbody>').iterate();
var res = [];
var temp = [];
var away_ticker = "";
scraped.forEach(function(e){
var away_team = Parser.data(e).from('href="mlb/team/_/name/').to('"').build();
var time = Parser.data(e).from('a data-dateformat="time1"').to('</a>').build();
if (away_ticker == "") away_ticker = away_team;
if (away_team != away_ticker) {
temp.splice(1, 0, away_ticker);
res.push(temp);
temp = [];
away_ticker = away_team;
temp.push(time);
}
});
var ss = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Schedule");
ss.getRange(ss.getLastRow() + 1, 1, res.length, res[0].length).setValues(res);
}
I get the following error:
TypeError: Cannot read property "length" from undefined. (line 42, file "Code")
Here is a modified solution that works
function espn_schedule() {
var url = "http://www.espn.com/mlb/schedule/_/date/20180329";
var content = UrlFetchApp.fetch(url).getContentText();
var e = Parser.data(content).from('class="schedule has-team-logos align-left"').to('</tbody>').build();
var res = [];
//Logger.log(scraped[0])
var temp = [];
var away_ticker = "";
var teams = Parser.data(e).from('<abbr title="').to('">').iterate();
Logger.log(teams)
var time = Parser.data(e).from('data-date="').to('">').iterate()
Logger.log(time)
for( var i = 0; i<teams.length ; i = i+2)
{
res[i/2] = []
res[i/2][0] = teams[i]
res[i/2][1] = teams[i+1]
res[i/2][2] = new Date(time[i/2]).toLocaleTimeString('en-US')
}
Logger.log(res)
var ss = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Schedule");
ss.getRange(ss.getLastRow() + 1, 1, res.length, res[0].length).setValues(res);
}
Modification explained:
1) Since you access only the first table you don't need to iterate during parsing and just get the first table. Also, since you get just the first table, you don't need to use forEach to loop through each element.
var e = Parser.data(content)
.from('class="schedule has-team-logos align-left"')
.to('</tbody>')
.build(); //Use build instead of iterate
2) Instead of parsing the HTML link to get the team name, you can use <abbr title=" element to scrape the name. Furthermore, you can iterate over all the team names in the table to get an array of team names.
var teams = Parser.data(e).from('<abbr title="').to('">').iterate();
3) Similar to the above modification, you can get the time by using the data-date tag. This gives you date which can read by Date() class. Again, we iterate over the table to get all the times
var time = Parser.data(e).from('data-date="').to('">').iterate()
4) Finally, we use for loop to rearrange the teams and time in the array called res. This allows for inserting the data into the sheet directly.
for( var i = 0; i<teams.length ; i = i+2) //each loop adds 2 to the counter
{
res[i/2] = []
res[i/2][0] = teams[i] //even team (starts at zero)
res[i/2][1] = teams[i+1] //vs odd teams
res[i/2][2] = new Date(time[i/2]).toLocaleTimeString('en-US')
}
Reference:
Date(),Date.toLocaleTimeString()
Edit:
Reason for error, in the below code
Parser.data(e).from('href="mlb/team/_/name/').to('"').build()
you are looking for string 'href="mlb/team/_/name/', however it should be href="/mlb/team/_/name/'. Note the difference mlb vs /mlb.
Secondly, in the following code
Parser.data(e).from('a data-dateformat="time1"').to('</a>').build();
The string should be a data-dateFormat, when you inspect the website it shown as dateformat. However, when you call it using URLfetch and log the text, it is shown as dateFormat

Google Sheets Logger values into rows

I am pulling data from a .csv file generated after a URL request using UrlFetchApp.fetch and storing the data using the logger.
Is there a way to write each row of fetched data from the logger into separate rows in a spreadsheet?
I did this quite a while ago. It's how I import data from website visitor logs. To review them. I don't use commas because you find them in the data too often. I use 3 tildes '~~~' instead and the lines are separated with a line feed '\n'.
But I basically split the lines and then fields on each line into array and feed them into the sheets one line at a time and usually I deal with 20 or 30 files about 100K or less. And I find that it loads the files very quickly.
function importData1(myFolderID,myFolderName,myFileName) {
var myFolderID = typeof(myFolderID) !== 'undefined' ? myFolderID : 'FolderID';
var myFileName = typeof(myFileName) !== 'undefined' ? myFileName : '';
if(myFileName && myFolderID)
{
var fi = DriveApp.getFolderById(myFolderID).getFilesByName(myFileName); // Selected IPLogYYMMDD.txt file
var ssid = SpreadsheetApp.getActive().getId();
var ss = SpreadsheetApp.openById(ssid);
if (fi.hasNext() ) // proceed if file exists in the IPlogs folder
{
var file = fi.next();
var data = file.getBlob().getDataAsString();
var lines = data.split('\n');
var newsheet = ss.insertSheet(myFolderName + '/' + myFileName);
var j=0;
for (var i=0; i<lines.length; i++ )
{
var fields = lines[i].split('~~~');
if(fields.length>=8)//There's supposed to be 8 or 9 fields
{
Logger.log('i=' + i + 'fields.length=' + fields.length);
newsheet.getRange(j+1, 1, 1,fields.length).setValues([fields]);
j=j+1;
}
}
}
}
else
{
displayStatus('Error Importing Data','Either Folder or File not found in importData1');
}
Most of the variables are easy to figure out. You'll probably be able to adapt it to what you need. And probably there will be several optional answers for you to chose from.

Error in Google Sheets Script when parsing XML

I have this function running in a Google Sheets script that pulls HTML from subreddits and returns them to a spreadsheet. It works for me some/most of the time, but other times I get an error "Could not parse text. (line 13)" which is the line with var doc = Xml.parse(page, true);. Any idea why this is happening or is this just a bug with Google Scripts? Here's the code that works...sometimes.
function getRedditHTML() {
var entries_array = [];
var subreddit_array = ['https://www.reddit.com/r/news/','https://www.reddit.com/r/funny/','https://www.reddit.com/r/science/'];
for (var s = 0; s < subreddit_array.length; s++) {
var page = UrlFetchApp.fetch(subreddit_array[s]);
//this is Line 13 that is breaking
var doc = Xml.parse(page, true);
var bodyHtml = doc.html.body.toXmlString();
doc = XmlService.parse(bodyHtml);
var root = doc.getRootElement();
var entries = getElementsByClassName(root,'thing');
for (var i = 0; i < entries.length; i++) {
var title = getElementsByClassName(entries[i],'title');
title = XmlService.getRawFormat().format(title[1]).replace(/<[^>]*>/g, "");
var link = getElementsByClassName(entries[i],'comments');
link = link[0].getAttribute('href').getValue();
var rank = getElementsByClassName(entries[i],'rank');
rank = rank[0].getValue();
var likes = getElementsByClassName(entries[i],'likes');
likes = likes[0].getValue();
entries_array.push([rank, likes, title, link]);
}
}
return entries_array.sort(function (a, b) {
return b[1] - a[1];
});
}
Here is what I found upon playing with importXML (my usual way of doing this) - for some reason I cannot narrow down - it DOES appear to randomly stall out and return null for a few minutes - so I'm guessing the issue with your thing is not the code but that the site or google temporarily blocks/won't return the data -
however I found the JSON endpoint to the piece you want - and I noticed that when XML went down - the JSON didnt.
You can take that and fix it to push your own array of topics/urls - I just left it for one link for now to show you how the URL breaks down and where it should be modified:
The URL is 'https://www.reddit.com/r/news/hot.json?raw_json=1&subredditName=news&sort=top&t=day&feature=link_preview&sr_detail=true&app=mweb-client
News is mentioned in 2 places so just modify all your URLs to follow that method - you can easily load that javascript in a browser to see all the fields available
Also the portion hot.json is where you can change whether you want the ranked list (called hot), or new,top,promoted, etc. you just change that keyword.
Score is the same as the upvotes/likes
function getSubReddit() {
var ss = SpreadsheetApp.getActiveSpreadsheet();
var sheet = ss.getActiveSheet(); //get Active sheet
var subject = 'news';
var url = 'https://www.reddit.com/r/' + subject + '/hot.json?raw_json=1&subredditName=' + subject + '&sort=top&t=day&feature=link_preview&sr_detail=true&app=mweb-client'; //json endpoint for data
var response = UrlFetchApp.fetch(url); // get api endpoint
var json = response.getContentText(); // get the response content as text
var redditData = JSON.parse(json); //parse text into json
Logger.log(redditData); //log data to logger to check
//create empty array to hold data points
var statsRows = [];
var date = new Date(); //create new date for timestamp
//The following lines push the parsed json into empty stats array
for (var j=0;j<25;j++){
for (var i =0;i<25;i++){
var stats=[];
stats.push(date);//timestamp
stats.push(i+1);
stats.push(redditData.data.children[i].data.score); //score
stats.push(redditData.data.children[i].data.title); //title
stats.push(redditData.data.children[i].data.url); //article url
// stats.push('http://www.reddit.com' + redditData.data.children[i].data.permalink); //reddit permalink
statsRows.push(stats)
}
//append the stats array to the active sheet
sheet.appendRow(statsRows[j])
}
}

Need to edit gdoc using a googleappscript

Is it possible that I could add the body of a gdoc into an email? I kinda have an idea of how to do it but I am not completely sure. I have written this code below to kinda help me. I am new to this and I have managed to have a few scripts running, but I am completely lost on this one. I have watched several videos and this is what I was able to do. The code is below.
Basically what I want to do is to be able to have a user input his name and another variable and then go to the google doc file and change it to the value that was input and then put it back in an email and send it to an address... Any ideas of what I am doing wrong or where should I start?? Thanks in advance.
function gsnot() {
var emailaddress="albdominguez25#gmail.net";
var sub="Subject1";
var pattern = Browser.inputBox("Enter your name");
var pattern2 = Browser.inputBox("Enter the minutes:");
var templateDocID= ScriptProperties.getProperty("EmailTemplateDocId");
var doc = DocumentApp.openById(templateDocID);
var body = doc.getActiveSection()
var html = "";
var keys = {
name: pattern,
min: pattern2,
};
for ( var k in keys ) {
body.replaceText("%" + k + "%", keys[k]);
doc.saveAndClose();
html = getDocAsHtml(docId);
DocsList.getFileById(docId).setTrashed(true);
return html;
var emailaddress="albdominguez25#gmail.net";
var sub="Subject1";
MailApp.sendEmail(emailaddress,sub, {htmlBody: body});}}
You might want to change your code by reading the body from the document into a variable, doing the replace on the variable and inserting that into your email. For example:
function gsnot() {
var emailaddress = "albdominguez25#gmail.net";
var sub = "New Subject";
var pattern = Browser.inputBox("Enter your name:");
var pattern2 = Browser.inputBox("Enter the minutes:");
var templateDocID = ScriptProperties.getProperty("EmailTemplateDocId");
var doc = DocumentApp.openById(templateDocID);
var body = doc.getText();
var replacement;
var k;
var keys = {
name: pattern,
min: pattern2
};
for (k in keys) {
if (keys.hasOwnProperty(k)) {
replacement = new RegExp("%" + k + "%",'g');
body = body.replace(replacement, keys[k]);
}
}
MailApp.sendEmail(emailaddress,sub, '', {htmlBody: body});
}
A few notes:
It is good form to have all your var statements at the beginning of
the function
When you use for-in (eg. for (k in keys) ), it returns all properties of the object. You only want the ones you assigned. This is the reason for: for (k in keys)
You had the mail sending for each property, I believe you wanted it outside the for-in loop so it only sent after all the replacements were completed.
Using replace(), you need to create a regular expression object that is set to global
or it will only replace the first instance of the pattern (you have name twice).
In your parameters for sendEmail(), even if you are using the htmlBody option, you need to specify a plain text body. I used empty quotes ''.