Can I ungzip files in Google Drive with apps script? - google-apps-script

This is a follow-up to this thread.
I am trying to use the code provided by @July.Tech there, but I keep getting an "unknown compression method" error or an "incorrect header check" error. The error came up when either of two different gzip methods were used to create the compressed file, so I would think the file is correctly gzipped.
Any suggestions? (My input file is gzipped so I cannot use Utilities.unzip().)
Here is the entire code:
// Replace with the id of the folder where gzipped csv reports are saved.
var reports_folder_id = 'xxxxx';
// Name of the gzipped CSV file to import.
var report_name = 'xxxxxx.gz';

/**
 * Loads the gzipped CSV report from Drive, inflates it with pako, and
 * returns the decompressed text ('' when the file is not found).
 *
 * Fix vs. the original: the blob must be read with getBytes().
 * getDataAsString() decodes the binary gzip stream as text and corrupts it,
 * which is what produced the "unknown compression method" /
 * "incorrect header check" errors.
 *
 * @returns {string} the decompressed file contents, or '' if absent.
 */
function importData() {
  var fSource = DriveApp.getFolderById(reports_folder_id);
  var fi = fSource.getFilesByName(report_name); // latest report file
  // Load pako at runtime. NOTE(review): eval of remotely fetched code is a
  // security risk — acceptable only because the CDN source is trusted;
  // prefer bundling the library in the project.
  eval(UrlFetchApp.fetch('https://cdn.rawgit.com/nodeca/pako/master/dist/pako.js').getContentText());
  var decoded = '';
  if (fi.hasNext()) { // proceed if report_name file exists in the reports folder
    var file = fi.next();
    var charData = file.getBlob().getBytes(); // raw bytes — NOT getDataAsString()
    // Apps Script bytes are signed (-128..127); pako expects unsigned 0..255.
    var binData = [];
    for (var i = 0; i < charData.length; i++) {
      binData.push(charData[i] < 0 ? charData[i] + 256 : charData[i]);
    }
    var data = pako.ungzip(binData);
    for (var i = 0; i < data.length; i++) {
      decoded += String.fromCharCode(data[i]);
    }
  }
  return decoded;
}
If there is no suggestion for fixing the above, any ideas on how to ungzip a Google Drive file programmatically?
Thanks.

As of January 19, 2018 (see release notes) Apps Script now supports gzip compression accessible using the following Utilities methods:
gzip(blob)
ungzip(blob)

Running the supplied example code indeed results in unknown compression method error in my environment as well.
Try changing
var charData = file.getBlob().getDataAsString(); // same error if .getBytes() is used
To
var charData = file.getBlob().getBytes();
So that is
/**
 * Demonstrates inflating a gzipped CSV stored in Drive with the pako library
 * and logging the decompressed text.
 */
function myFunction() {
  reports_folder_id = '<FOLDER_ID>';
  report_name = 'zip3.csv.gz'; // name of gzipped CSV file
  var sourceFolder = DriveApp.getFolderById(reports_folder_id);
  var fileIter = sourceFolder.getFilesByName(report_name);
  // Pull in pako at runtime from the CDN.
  eval(UrlFetchApp.fetch('https://cdn.rawgit.com/nodeca/pako/master/dist/pako.js').getContentText());
  if (fileIter.hasNext()) {
    var report = fileIter.next();
    var gzBlob = report.getBlob();
    var rawBytes = gzBlob.getBytes();
    // Apps Script bytes are signed (-128..127); pako expects unsigned 0..255.
    var unsignedBytes = [];
    for (var idx = 0; idx < rawBytes.length; idx++) {
      var b = rawBytes[idx];
      unsignedBytes.push(b < 0 ? b + 256 : b);
    }
    var inflated = pako.inflate(unsignedBytes);
    // Rebuild the text one character per decompressed byte.
    var decoded = '';
    for (var pos = 0; pos < inflated.length; pos++) {
      decoded += String.fromCharCode(inflated[pos]);
    }
    Logger.log(decoded);
  }
}
Try running this script on a subset of your original "GlicemiaMisurazioni.csv.gz" file: https://drive.google.com/file/d/0B8geUNXmd4J2YzJoemFLMnBTbVU/view?usp=sharing
(I truncated the original csv to 32 rows to speed up execution for the sake of test – the original takes way too long to run)
Checking the logs shows that the uncompressing worked:
4;02/07/2017 03.00.30;;;158.0;158.0;1;0M0000UMQ5D;;;0;;
4;02/07/2017 02.59.30;;;158.0;158.0;1;0M0000UMQ5D;;;0;;
4;02/07/2017 02.58.30;;;159.0;159.0;1;0M0000UMQ5D;;;0;;
4;02/07/2017 02.57.30;;;159.0;159.0;1;0M0000UMQ5D;;;0;;
4;02/07/2017 02.56.30;;;158.0;158.0;1;0M0000UMQ5D;;;0;;
4;02/07/2017 02.56.00;;;;;0;;0.4;Novorapid ;0;Left flank;Test

You need to find out if pako supports gzip compression. If not, you should look for another compression package that supports gzip.

Related

Use importData to fetch and parse JSON

I had the following function running perfectly:
// NOTE(review): fragment from the question — the enclosing function header and
// the catch body are truncated in the post, so the braces do not balance here.
var ss = SpreadsheetApp.getActiveSpreadsheet();
var habSheet = ss.getSheetByName("Harvests");
// Count the non-empty cells of column B (habitat ids).
var bVals = habSheet.getRange("b2:b").getValues();
var habs = bVals.filter(String).length;
// NOTE(review): "B2:B"+habs+1 concatenates left to right, producing e.g.
// "B2:B121" when habs is 12 — presumably ("B2:B" + (habs + 1)) was intended;
// TODO confirm with the author.
var habitats = habSheet.getRange("B2:B"+habs+1).getDisplayValues();
var data = [];
// Trait headers live in row 1, columns D through U.
var traitNames = habSheet.getRange("D1:U1").getValues();
// Pre-size a habs x traitCount grid, defaulting every cell to [""].
var values = new Array(habs);
for (i = 0; i < habs; i++) {
values[i] = new Array(traitNames[0].length);
for (j=0; j<traitNames[0].length; j++){
values[i][j] = [""];
}
}
var rawData = "";
// One single-cell row per habitat for the fetched names.
var names = new Array(habs);
for (i = 0; i < habs; i++) {
names[i] = new Array(1);
}
// One UrlFetch call per habitat — this loop is what exhausts the daily quota.
for (i=0; i<habs; i++){
try{
rawData = UrlFetchApp.fetch("https://api.genopets.me/habitat/"+habitats[i]);
data[i] = JSON.parse(rawData.getContentText());
names[i][0] = data[i].name;
// Copy each attribute value into the column whose header matches trait_type.
for (j=0; j<data[i].attributes.length; j++){
value = data[i].attributes[j].value;
trait = data[i].attributes[j].trait_type;
for (k=0; k<=21; k++){
if (traitNames[0][k] == trait){
values[i][k] = value;
}
}
}
}
catch(err){
But I'm exceeding max fetch calls daily. I'm in an emergency situation because this needs to run again within an hour.
I'm trying to build a temporary fix, so I'm using importData to call the API with the following formula:
=join(",",IMPORTDATA("https://api.genopets.me/habitat/"&B2,","))
Then, I want to just replace rawData in the code with this imported data. However, now it comes in as text and can't be parsed in the same way. Is there a quick way to force it into JSON format or otherwise convert to a dictionary as before so that I can parse it with the same code?
I'm getting stuck because .name, .length, etc. are failing as the "rawData" is now just a string.
This is the code snippet I'm playing with to try and get this right and build the quick patch for right now:
// for (i=0; i<habs; i++){
var i=0;
// Read the =JOIN(",", IMPORTDATA(...)) result from cell AL1 as plain text.
importData = habSheet.getRange("AL1").getDisplayValue();
// NOTE(review): JSON.stringify() applied to a string only wraps it in quotes;
// it does not build an object. data[i] therefore remains a string below, so
// .name and .attributes are undefined — JSON.parse() would be needed instead,
// and even that fails here because IMPORTDATA reformats/strips characters
// from the payload (see the accepted explanation).
rawData = JSON.stringify(importData);
// Logger.log(rawData);
data[i] = rawData;
// data[i] = JSON.parse(rawData.getContentText());
names[i][0] = data[i].name;
// Map each attribute value onto the column whose header matches trait_type.
for (j=0; j<data[i].attributes.length; j++){
value = data[i].attributes[j].value;
trait = data[i].attributes[j].trait_type;
for (k=0; k<=21; k++){
if (traitNames[0][k] == trait){
values[i][k] = value;
}
}
}
I've tried as above, and also without stringify, but I can't get this yet.
For reference, this is an example of the API response:
https://api.genopets.me/habitat/7vTz9dniU14Egpt8XHkMxP1x36BLRd15C11eUTaWhB19
Appreciate any help!
I have done a lot of testing to find a simple workaround, but could not find one: neither the string resulting from =join(",",IMPORTDATA(url,",")) nor the output of any of the other =IMPORTXXX functions will work for your code. When these IMPORT functions are used, the data is interpreted and certain characters are removed or values are reformatted, so it is NOT recommended to use them here.
Since you mentioned the message you are getting is related to quota limits you should consider splitting the load of this script in multiple Apps Script projects. As a possible immediate solution you can make a copy of the script (or file bound to the script), authorize the new copy and try again.
To increase performance you could try using the calls in bulk, use this other function fetchAll (https://developers.google.com/apps-script/reference/url-fetch/url-fetch-app#fetchallrequests). There is a 100 request limit for this method. This will result in the same quota usage.

How to extract files from .tar archive with Google Apps Script

Good day all,
I'm trying to get a tar.gz attachment from Gmail, extract the file and save it to Google Drive. It's a daily auto generated report which I'm getting, compressed due to >25mb raw size.
I got this so far:
// Reads Setup!B2 (Gmail label) and Setup!B5 (Drive folder name), then saves
// the attachments of the most recent matching email from the last 24h to
// Drive, un-gzipping any .tar.gz attachment.
var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Setup");
var gmailLabels = sheet.getRange("B2:B2").getValue(); //I have my Gmail Label stored here
var driveFolder = sheet.getRange("B5:B5").getValue(); //I have my GDrive folder name stored here
// apply label filter, search only last 24hrs mail
var filter = "has:attachment label:" + gmailLabels + " after:" + Utilities.formatDate(new Date(new Date().getTime() - 1 * (24 * 60 * 60 * 1000)), "GMT", "yyyy/MM/dd");
var threads = GmailApp.search(filter, 0, 1); // check only 1 email at a time
// Reuse the destination folder when it exists, otherwise create it.
var folder = DriveApp.getFoldersByName(driveFolder);
if (folder.hasNext()) {
  folder = folder.next();
} else {
  folder = DriveApp.createFolder(driveFolder);
}
if (threads.length > 0) { // fix: guard against an empty search result
  var message = threads[0].getMessages()[0];
  var desc = message.getSubject() + " #" + message.getId();
  var att = message.getAttachments();
  for (var z = 0; z < att.length; z++) {
    var attName = att[z].getName();
    // Classify by file name. indexOf() does a literal substring match; the
    // original search() treated '.' as a regex wildcard and, with "> 0",
    // missed a match at position 0.
    var fileType;
    if (attName.indexOf('csv') !== -1) {
      fileType = "csv";
    } else if (attName.indexOf('tar.gz') !== -1) {
      fileType = "gzip";
    } else {
      // fix: the original read threads[x] with an undefined x (ReferenceError).
      // NOTE(review): skipLabel must be defined elsewhere in the project.
      threads[0].addLabel(skipLabel);
      continue;
    }
    // save the file to GDrive
    try {
      var file = folder.createFile(att[z]);
      file.setDescription(desc);
    } catch (e) {
      Logger.log(e.toString());
    }
    // extract if gzip
    if (fileType == 'gzip') {
      var ungzippedFile = Utilities.ungzip(file);
      try {
        var gz_file = folder.createFile(ungzippedFile);
        gz_file.setDescription(desc);
      } catch (e) {
        Logger.log(e.toString());
      }
    }
  }
}
Everything works fine, but in the last step it only decompresses the .gz file saving .tar file in the Drive. What can I do with it next? The .tar file contains a .csv file which I need to extract and process afterwards.
I should probably add that I'm limited to use GAS only.
Any help warmly appreciated.
How about this answer? Unfortunately, in the current stage, there are no methods for extracting files from a tar file in Google Apps Script, yet. But fortunately, from wiki of tar, we can retrieve the structure of the tar data. I implemented this method with Google Apps Script using this structure data.
1. Unarchive of tar data:
Before you run this script, please set the file ID of tar file to run(). Then, run run().
Sample script:
/**
 * Splits a tar-format blob into its member files.
 *
 * A tar archive is a sequence of 512-byte records: one (or more) header
 * records followed by the file data padded up to a 512-byte boundary.
 * This parser consumes the byte array record by record.
 *
 * @param {Blob} blob - blob whose content type contains "application/x-tar".
 * @returns {Object[]} array of {fileInf: header fields, file: Blob} entries.
 * @throws {Error} when the blob's mimeType is not tar.
 */
function tarUnarchiver(blob) {
var mimeType = blob.getContentType();
if (!mimeType || !~mimeType.indexOf("application/x-tar")) {
throw new Error("Inputted blob is not mimeType of tar. mimeType of inputted blob is " + mimeType);
}
// Tar record size: headers and data are aligned to 512-byte chunks.
var baseChunkSize = 512;
var byte = blob.getBytes();
var res = [];
do {
var headers = [];
do {
// Consume one 512-byte header record from the front of the array.
var chunk = byte.splice(0, baseChunkSize);
var headerStruct = {
// Bytes 0-99: file name, NUL-padded; strip the trailing zero bytes.
filePath: function(b) {
var r = [];
for (var i = b.length - 1; i >= 0; i--) {
if (b[i] != 0) {
r = b.slice(0, i + 1);
break;
}
}
return r;
}(chunk.slice(0, 100)),
// Bytes 124-134: file size as an octal ASCII string.
fileSize: chunk.slice(124, 124 + 11),
// Byte 156: type flag ("5" marks a directory entry).
fileType: Utilities.newBlob(chunk.slice(156, 156 + 1)).getDataAsString(),
};
// Decode each header field from bytes to a string; parse the size as octal.
Object.keys(headerStruct).forEach(function(e) {
var t = Utilities.newBlob(headerStruct[e]).getDataAsString();
if (e == "fileSize") t = parseInt(t, 8);
headerStruct[e] = t;
});
headers.push(headerStruct);
// Directory headers ("5") carry no data; keep reading headers until a
// regular file's header is reached.
} while (headerStruct.fileType == "5");
var lastHeader = headers[headers.length - 1];
var filePath = lastHeader.filePath.split("/");
// Take exactly fileSize bytes of data and name the blob after the last
// path segment; infer its content type from the extension.
var blob = Utilities.newBlob(byte.splice(0, lastHeader.fileSize)).setName(filePath[filePath.length - 1]).setContentTypeFromExtension();
// Skip the zero padding that rounds the data up to a 512-byte boundary.
byte.splice(0, Math.ceil(lastHeader.fileSize / baseChunkSize) * baseChunkSize - lastHeader.fileSize);
res.push({fileInf: lastHeader, file: blob});
// The archive ends with zero-filled records, so stop at a leading 0 byte.
} while (byte[0] != 0);
return res;
}
// Following function is a sample script for using tarUnarchiver().
// Please modify this to your situation.
function run() {
// When you want to extract the files from .tar.gz file, please use the following script.
var id = "### file ID of .tar.gz file ###";
var gz = DriveApp.getFileById(id).getBlob().setContentTypeFromExtension();
var blob = Utilities.ungzip(gz).setContentTypeFromExtension();
// When you want to extract the files from .tar file, please use the following script.
var id = "### file ID of .tar file ###";
var blob = DriveApp.getFileById(id).getBlob().setContentType("application/x-tar");
// Extract files from a tar data.
var res = tarUnarchiver(blob);
// If you want to create the extracted files to Google Drive, please use the following script.
res.forEach(function(e) {
DriveApp.createFile(e.file);
});
// You can see the file information by below script.
Logger.log(res);
}
2. Modification of your script:
If this script is used for your script, for example, how about this? tarUnarchiver() of above script is used. But I'm not sure how you want to use this script. So please think of this as a sample.
Sample script:
// extract if gzip
if (fileType == 'gzip' ){
var unzipped = Utilities.ungzip(file);
try {
// Tag the un-gzipped blob as tar so tarUnarchiver() accepts it,
// then write every extracted member into the destination folder.
var tarBlob = unzipped.setContentType("application/x-tar"); // Added
var entries = tarUnarchiver(tarBlob); // Added
for (var t = 0; t < entries.length; t++) { // Added
folder.createFile(entries[t].file); // Added
} // Added
}
catch (e) {
Logger.log(e.toString());
}
}
In this modified script, the blob of ungzippedFile (tar data) is put to my script and run tarUnarchiver(). Then, each file is created to the folder.
Note:
When you run this script, if an error related to mimeType occurs, please set the mimeType of "tar" to the input blob.
As the method for setting the mimeType, you can use as follows.
blob.setContentTypeFromExtension() Ref
blob.setContentType("application/x-tar") Ref
It might have already been got the mimeType in the blob. At that time, setContentTypeFromExtension() and setContentType() are not required.
If you want to retrieve the file path of each file, please check the response from tarUnarchiver(). You can see it as a property of fileInf from the response.
Limitations:
When this script is used, there is the limitations as follows. These limitations are due to Google's specification.
About the file size, when the size of tar data is over 50 MB (52,428,800 bytes), an error related to size limitation occurs.
When the size of extracted file is over 50 MB, an error occurs.
When a single file size of extracted file is near to 50 MB, there is a case that an error occurs.
In my environment, I could confirm that a size of 49 MB can be extracted, but at exactly 50 MB an error occurred.
Reference:
tar (wiki)
In my environment, I could confirm that the script worked. But if this script didn't work, I apologize. At that time, can you provide a sample tar file? I would like to check it and modify the script.

Google Sheets Logger values into rows

I am pulling data from a .csv file generated after a URL request using UrlFetchApp.fetch and storing the data using the logger.
Is there a way to write each row of fetched data from the logger into separate rows in a spreadsheet?
I did this quite a while ago. It's how I import data from website visitor logs. To review them. I don't use commas because you find them in the data too often. I use 3 tildes '~~~' instead and the lines are separated with a line feed '\n'.
But I basically split the lines and then fields on each line into array and feed them into the sheets one line at a time and usually I deal with 20 or 30 files about 100K or less. And I find that it loads the files very quickly.
/**
 * Imports a '~~~'-delimited log file from a Drive folder into a new sheet of
 * the active spreadsheet, one row per log line with at least 8 fields.
 * NOTE(review): the function's closing brace is truncated in the post.
 *
 * @param {string} myFolderID - Drive folder id (defaults to 'FolderID').
 * @param {string} myFolderName - used only to build the new sheet's name.
 * @param {string} myFileName - name of the log file to import.
 */
function importData1(myFolderID,myFolderName,myFileName) {
// Fall back to a placeholder folder id / empty file name when omitted.
var myFolderID = typeof(myFolderID) !== 'undefined' ? myFolderID : 'FolderID';
var myFileName = typeof(myFileName) !== 'undefined' ? myFileName : '';
if(myFileName && myFolderID)
{
var fi = DriveApp.getFolderById(myFolderID).getFilesByName(myFileName); // Selected IPLogYYMMDD.txt file
var ssid = SpreadsheetApp.getActive().getId();
var ss = SpreadsheetApp.openById(ssid);
if (fi.hasNext() ) // proceed if file exists in the IPlogs folder
{
var file = fi.next();
var data = file.getBlob().getDataAsString();
var lines = data.split('\n');
// One new sheet per imported file, named "<folder>/<file>".
var newsheet = ss.insertSheet(myFolderName + '/' + myFileName);
var j=0;
for (var i=0; i<lines.length; i++ )
{
// Fields are separated by '~~~' because commas occur in the data itself.
var fields = lines[i].split('~~~');
if(fields.length>=8)//There's supposed to be 8 or 9 fields
{
Logger.log('i=' + i + 'fields.length=' + fields.length);
// One setValues() call per row — works, but batching all rows into a
// single setValues() call would be much faster for large files.
newsheet.getRange(j+1, 1, 1,fields.length).setValues([fields]);
j=j+1;
}
}
}
}
else
{
displayStatus('Error Importing Data','Either Folder or File not found in importData1');
}
Most of the variables are easy to figure out. You'll probably be able to adapt it to what you need. And probably there will be several optional answers for you to chose from.

How to get the number of pages in Google Docs via Google script?

What should I do to get amount of pages in Google Docs (when converted to PDF) via Google script?
I have tried this, but it returns 0 instead of the number of pages.
/**
 * Counts the pages of the active Document by exporting it as PDF and
 * scanning the raw bytes for "Pages/Count N" markers; the largest count
 * found is returned (0 when no marker is present).
 */
function getNumPages()
{
  var pdfBlob = DocumentApp.getActiveDocument().getAs("application/pdf");
  var pdfText = pdfBlob.getDataAsString();
  var countPattern = /Pages\/Count (\d+)/g;
  var maxCount = 0;
  // Walk every occurrence of the marker; keep the highest count seen.
  for (var hit = countPattern.exec(pdfText); hit !== null; hit = countPattern.exec(pdfText)) {
    Logger.log("MATCH = " + hit[1]);
    var candidate = parseInt(hit[1]);
    if (candidate > maxCount) {
      maxCount = candidate;
    }
  }
  Logger.log("pages = " + maxCount);
  return maxCount;
}
Your regular expression expects a string like Pages/Count 3 in the PDF file. Logging the contents of the file with Logger.log(data) shows there isn't such a string. Instead, I find the number of pages near the beginning of the file:
<< /Linearized 1 /L 18937 /H [ 687 137 ] /O 10 /E 17395 /N 3 /T 18641 >>
The number following /N is the number of pages. Here is a function extracting it:
/**
 * Returns the page count of the active Document by exporting it as PDF and
 * reading the /N entry of the linearization dictionary (e.g. "/N 3").
 *
 * Fix vs. the original: data.match() returns null when the marker is absent
 * (not every PDF export is linearized), which made m[1] throw a TypeError.
 * Returns 0 in that case, matching the companion implementation's contract.
 *
 * @returns {number} page count, or 0 when no /N entry is found.
 */
function getNumPages() {
  var blob = DocumentApp.getActiveDocument().getAs("application/pdf");
  var data = blob.getDataAsString();
  var m = data.match(/ \/N (\d+) /); // null when the PDF has no /N entry
  var pages = m ? parseInt(m[1], 10) : 0;
  Logger.log("pages = " + pages);
  return pages;
}
/**
 * Counts the pages of the Document with the given id by exporting it to PDF
 * and counting "/Type /Page" objects ("[^s]" rejects the "/Pages" tree node).
 * Returns 0 when the export cannot be scanned or no page objects are found.
 *
 * @param {string} docId - id of the Document to inspect.
 * @returns {number} number of pages, or 0 on failure.
 */
function getNumPages(docId) {
  var total = 0;
  var pdf = DocumentApp.openById(docId).getAs("application/pdf");
  var contents = pdf.getDataAsString();
  try {
    var hits = contents.match(/\/Type[\s]*\/Page[^s]/g);
    total = hits.length; // throws when hits is null; caught below
  } catch(err) {
    // NOOP — fall through and report 0 pages.
  }
  return total;
}

How to get CSV data as a string from GmailAttachment?

I'd like to grab gmail attachments that are CSV files then import them into a google sheet.
Here's where I'm stuck - turning the attachment into a string. I think I have a blob to which the method getContentAsString applies, but apparently I still have the type GmailAttachment because I'm getting this error:
TypeError: Cannot find function getContentAsString in object GmailAttachment.
here's the relevant code:
//************** get the attachments ***************************************
var attachments = [];
var files = [];
for (var i = 0; i < 4; i ++) {
attachments = messages[i].getAttachments();
for (var k = 0; k < attachments.length; k = k+2) { //2 attachments per message, but I only want the 1st one
j = k/2;
files[j] = attachments[k].copyBlob();
Logger.log('Message "%s" contains the attachment "%s" (%s bytes)',
messages[i].getSubject(), files[j].getName(), files[j].getSize());
}
}
var csvFile = "";
for (var i = 0; i < files.length; i++) {
csvFile = files[i].getContentAsString();
}
why is .copyBlob() not returning a blob, but in this case a GmailAttachemnt, and how can I fix it?
is the problem here:
files[j] = attachments[k].copyBlob();
?
by the way I also tried getBlob() instead of copyBlob() and it returned a type error at this line above.
using copyBlob() I get the typeError at this line:
csvFile = files[i].getContentAsString();
Thanks for your help!
In order to get the string value of a blob you should use the getDataAsString() method of the Blob class. The .getContentAsString() method is from the DocsList Service, which is deprecated.
You could also use the getDataAsString() method from the GmailAttachment class itself. Hope that helps!