I'm attempting to gather all of the posts submitted to a particular subreddit using the code found here: https://www.labnol.org/internet/web-scraping-reddit/28369/ However, the execution time limit is reached well before this completes.
I'm looking for a way to extend the script's run time, ideally without needing my intervention at all once I click run.
const getThumbnailLink_ = url => {
  if (!/^http/.test(url)) return '';
  return `=IMAGE("${url}")`;
};

const getHyperlink_ = (url, text) => {
  if (!/^http/.test(url)) return '';
  return `=HYPERLINK("${url}", "${text}")`;
};

const writeDataToSheets_ = data => {
  const values = data.map(r => [
    new Date(r.created_utc * 1000),
    r.title,
    getThumbnailLink_(r.thumbnail),
    getHyperlink_(r.url, 'Link'),
    getHyperlink_(r.full_link, 'Comments')
  ]);
  const sheet = SpreadsheetApp.getActiveSheet();
  sheet.getRange(sheet.getLastRow() + 1, 1, values.length, values[0].length).setValues(values);
  SpreadsheetApp.flush();
};

const isRateLimited_ = () => {
  const response = UrlFetchApp.fetch('https://api.pushshift.io/meta');
  const { server_ratelimit_per_minute: limit } = JSON.parse(response);
  return limit < 1;
};

const getAPIEndpoint_ = (subreddit, before = '') => {
  const fields = ['title', 'created_utc', 'url', 'thumbnail', 'full_link'];
  const size = 10000;
  const base = 'https://api.pushshift.io/reddit/search/submission';
  const params = { subreddit, size, fields: fields.join(',') };
  if (before) params.before = before;
  const query = Object.keys(params)
    .map(key => `${key}=${params[key]}`)
    .join('&');
  return `${base}?${query}`;
};

const scrapeReddit = (subreddit = 'AskMen') => {
  let before = '';
  do {
    const apiUrl = getAPIEndpoint_(subreddit, before);
    const response = UrlFetchApp.fetch(apiUrl);
    const { data } = JSON.parse(response);
    const { length } = data;
    before = length > 0 ? String(data[length - 1].created_utc) : '';
    if (length > 0) {
      writeDataToSheets_(data);
    }
  } while (before !== '' && !isRateLimited_());
};
Generally, it's better practice to optimize your script so that it doesn't hit the execution-time limit defined by your quota. In your case, one solution is to reduce the batch size per execution: the code in the reference you linked fetches 1000 posts per batch, while your code fetches 10000.
Try smaller values and check whether the script's execution time stays within the quota:
const getAPIEndpoint_ = (subreddit, before = '') => {
  const fields = ['title', 'created_utc', 'url', 'thumbnail', 'full_link'];
  const size = 1000;
  const base = 'https://api.pushshift.io/reddit/search/submission';
  const params = { subreddit, size, fields: fields.join(',') };
  if (before) params.before = before;
  const query = Object.keys(params)
    .map(key => `${key}=${params[key]}`)
    .join('&');
  return `${base}?${query}`;
};
But if you have a business need to exceed your quota, you can upgrade to Google Workspace Basic, Business, or Enterprise, depending on how much you need to increase your quota and how much you are willing to pay.
See here for more information about the different accounts and pricing.
Tanaike helped me with this amazing script using Cheerio. The original was for Letterboxd, but this one pulls in a watchlist from Trakt.tv (sample watchlist).
As it stands, it pulls in the watched date and the title, but I'd also like to pull in the content from the meta tag for each item:
<meta content="8 Million Ways to Die (1986)" itemprop="name">
I tried using $('[itemprop="name"]').attr('content');, but it doesn't accept the .attr piece.
Here is the full script as it is now, returning the watched date in Col1 and the title in Col2.
/**
 * Returns Trakt watchlist by username
 * @param pages enter the number of pages in the list. Default is 10
 * @customfunction
 */
function TRAKT(pages = 10) {
  const username = `jerrylaslow`;
  const maxPage = pages;
  const reqs = [...Array(maxPage)].map((_, i) => ({ url: `https://trakt.tv/users/` + username + `/history/all/added?page=${i + 1}`, muteHttpExceptions: true }));
  return UrlFetchApp.fetchAll(reqs).flatMap((r, i) => {
    if (r.getResponseCode() != 200) {
      return [["Values couldn't be retrieved.", reqs[i].url]];
    }
    const $ = Cheerio.load(r.getContentText());
    const ar = $(`a.titles-link > h3.ellipsify, h4 > span.format-date`).toArray();
    return [...Array(Math.ceil(ar.length / 2))].map((_) => {
      const temp = ar.splice(0, 2);
      const watchDate = Utilities.formatDate(new Date($(temp[1]).text().trim().replace(/T|Z/g, " ")), "GMT", "yyyy-MM-dd");
      const title = $(temp[0]).text().trim();
      return [watchDate, title];
    });
  });
}
The values can be pulled with this, so I know there isn't any sort of blocking in play.
=IMPORTXML(
  "https://trakt.tv/users/jerrylaslow/history",
  "//meta[@itemprop='name']/@content")
Any help is appreciated.
In order to achieve your goal, in your script, how about the following modification?
Modified script:
function TRAKT(pages = 10) {
  const username = `jerrylaslow`;
  const maxPage = pages;
  const reqs = [...Array(maxPage)].map((_, i) => ({ url: `https://trakt.tv/users/` + username + `/history/all/added?page=${i + 1}`, muteHttpExceptions: true }));
  return UrlFetchApp.fetchAll(reqs).flatMap((r, i) => {
    if (r.getResponseCode() != 200) {
      return [["Values couldn't be retrieved.", reqs[i].url]];
    }
    const $ = Cheerio.load(r.getContentText());
    const ar = $(`a.titles-link > h3.ellipsify, h4 > span.format-date`).toArray();
    return [...Array(Math.ceil(ar.length / 2))].map((_) => {
      const temp = ar.splice(0, 2);
      const c = $(temp[0]).parent('a').parent('div').parent('div').find('meta').toArray().find(ff => $(ff).attr("itemprop") == "name"); // Added
      const watchDate = Utilities.formatDate(new Date($(temp[1]).text().trim().replace(/T|Z/g, " ")), "GMT", "yyyy-MM-dd");
      const title = $(temp[0]).text().trim();
      return [watchDate, title, c ? $(c).attr("content") : ""]; // Modified
    });
  });
}
When this modified script is run, the value of content is put into the 3rd column. If you want to put it into a different column, please modify return [watchDate, title, c ? $(c).attr("content") : ""];.
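For example, a sketch of a modified return line that would put the meta content into the 2nd column instead (the ordering here is just an illustration):

return [watchDate, c ? $(c).attr("content") : "", title]; // content in Col2, title in Col3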
I am trying to decrypt AES with Google Apps Script (GAS). The target of the decryption is a document file retrieved via the Amazon Selling Partner API.
The key, IV, and URL are obtained from the API, and I want to decrypt the data downloaded from that URL using the key and IV.
However, the decrypted text is either empty or garbled.
Can you please tell me what is wrong with the following code? It uses cCryptoGS, which is a wrapper library for CryptoJS.
const decrypt_test = () => {
  const url = 'https://tortuga-prod-fe.s3-us-west-2.amazonaws.com/%2FNinetyDays/amzn1.tortuga.3.5d4685fe-cdf1-4f37-8dfc-a25b85468e34.T1J5QXLEXAMPLE';
  const response = UrlFetchApp.fetch(url);
  const file = response.getContentText();
  const key = 'xiZ8FGT6pYo49ZwfvAplJxKgO0qW46Morzs5aEXAMPLE';
  const iv = 'aoGh0rhbB3ALlCFKiEXAMPLE';
  const enc_key = cCryptoGS.CryptoJS.enc.Base64.parse(key);
  const enc_iv = cCryptoGS.CryptoJS.enc.Base64.parse(iv);
  const cipherParams = cCryptoGS.CryptoJS.lib.CipherParams.create({
    ciphertext: file // cCryptoGS.CryptoJS.enc.Base64.parse(file)
  });
  console.log(`enc_key_length:${enc_key.words.length}`);
  console.log(`enc_iv_length:${enc_iv.words.length}`);
  const decryptedMessage = cCryptoGS.CryptoJS.AES.decrypt(cipherParams, enc_key, { iv: enc_iv, mode: cCryptoGS.CryptoJS.mode.CBC }).toString();
  console.log(`decryptedMessage:${decryptedMessage}`);
  return decryptedMessage;
};
[output]
2021/06/20 20:04:04 debug enc_key_length:8
2021/06/20 20:04:04 debug enc_iv_length:4
2021/06/20 20:04:04 debug decryptedMessage:bfc095f3ecec221e8585ceb68031078d25112f5f26ea2c1f80470f5f4f19f2e1c2cd94638e8666c3486fa29191b568bcd9e8d5a3bdcbbc05456f0567bb6cdae675fa044f94e560379d16b1d370cd7c4a9c5afbbcf4fde2694ed01c1b7950eaabc65e46c4640d8f0814bfe66e8ae65f7768136ac4615624be25373d665ee8fde82742e26664d7c09c61ac8994dc3052f0f22d5042f0b407d696e3c84a3906350dc60c46001ef7865d0c6594c57c5af22616688e028f52d4f12b538d0580c420fdcb0ee61287d4ee2629cd7d39f739d63e84dd75e948eaffb4383076f0c66997
The following code solved the problem. The key changes were to read the response as raw bytes with getBlob().getBytes() instead of getContentText() (which mangles binary data), and to hex-encode those bytes before handing them to CryptoJS as the ciphertext:
const decrypt_test = () => {
  const url = 'https://tortuga-prod-fe.s3-us-west-2.amazonaws.com/%2FNinetyDays/EXAMPLE';
  let options = {
    'method': 'get',
    'muteHttpExceptions': true,
  };
  const response = UrlFetchApp.fetch(url, options);
  const file = response.getBlob().getBytes(); // raw bytes, not text
  const key = 'xiZ8FGT6pYo49ZwfvAplJxKgO0qW46MoEXAMPLE';
  const iv = 'aoGh0rhbB3ALlCFKiuJEXAMPLE';
  const enc_key = cCryptoGS.CryptoJS.enc.Base64.parse(key);
  const enc_iv = cCryptoGS.CryptoJS.enc.Base64.parse(iv);
  const cipherParams = cCryptoGS.CryptoJS.lib.CipherParams.create({
    ciphertext: cCryptoGS.CryptoJS.enc.Hex.parse(hexes(file))
  });
  const decryptedMessage = cCryptoGS.CryptoJS.AES.decrypt(cipherParams, enc_key,
    { iv: enc_iv, mode: cCryptoGS.CryptoJS.mode.CBC }).toString();
  console.log(`decryptedMessage:${decryptedMessage}`);
  const bin = bytes(decryptedMessage);
  const myBlob = Utilities.newBlob(bin, MimeType.PLAIN_TEXT, "decrypted.csv");
  DriveApp.createFile(myBlob);
};
// Converts a hex string into an array of byte values.
const bytes = (hexstr) => {
  const ary = [];
  for (let i = 0; i < hexstr.length; i += 2) {
    ary.push(parseInt(hexstr.substr(i, 2), 16));
  }
  return ary;
};

// Converts a (possibly signed) byte array into a hex string.
const hexes = (ary) => {
  return ary.map((e) => ('00' + (e < 0 ? e += 0x0100 : e).toString(16)).slice(-2)).join('');
};
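For reference, a quick sanity check of the two helpers (the values are made up for illustration):

console.log(hexes([72, 105, -1])); // "4869ff" - negative Apps Script bytes are mapped into 0x00-0xFF
console.log(bytes('4869ff'));      // [72, 105, 255]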
I have uploaded an Excel file and need to read it and get the count of the number of rows in it.
I have tried the code below but it doesn't seem to work.
I have been getting many errors, like "_fs.readFileSync is not a function" and "f.slice is not a function", and currently I am stuck at "input.replace is not a function". Please let me know where I am going wrong or what actually needs to be done.
Component.ts
proceedUpload() {
  if (this.file != null) {
    var workbook = XLSX.read(this.file);
    var firstSheet = workbook.SheetNames[0];
    var excelRows = XLSX.utils.sheet_to_json(workbook.Sheets[firstSheet]);
    var number = excelRows.length();
    if (number > 0) {
      this.uploadMessage = true;
    } else {
      this.uploadMessage = false;
    }
    let formData: FormData = new FormData();
    formData.append('Files', this.file);
    this.http.post("Url", formData).map(res => res.json())
      .catch(error => Observable.throw(error))
      .subscribe(
        data => console.log('success'),
        error => console.log(error)
      );
  } else {
    alert("Please upload a file!");
  }
}
Component.html
<div *ngIf="uploadMessage">
  <p>{{number}} uploaded!!</p>
</div>
const table = XLSX.readFile('mytable.xlsx');
const sheet = table.Sheets[table.SheetNames[0]];
// '!ref' holds the sheet's used range (e.g. "A1:C10");
// range.e.r is the zero-based index of its last row.
var range = XLSX.utils.decode_range(sheet['!ref']);
console.log(range.e.r);
Alternatively, get the rows as objects:

let excelRowsObjArr = XLSX.utils.sheet_to_row_object_array(workbook.Sheets[firstSheet]);

On this one you can do:

excelRowsObjArr.length
For more info you can go here: https://www.aspsnippets.com/Articles/Read-Parse-Excel-File-XLS-and-XLSX-using-JavaScript.aspx
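Putting the two approaches together (a sketch; mytable.xlsx and the presence of a header row are assumptions):

const wb = XLSX.readFile('mytable.xlsx');
const ws = wb.Sheets[wb.SheetNames[0]];

// Total rows in the used range: the last row index is zero-based, so add 1.
const totalRows = XLSX.utils.decode_range(ws['!ref']).e.r + 1;

// Data rows as objects: the first row is consumed as the headers,
// so this is typically totalRows - 1.
const dataRows = XLSX.utils.sheet_to_row_object_array(ws).length;

console.log(totalRows, dataRows);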
I cannot seem to get this script to work for 'statistics':
function searchByKeyword2() {
  var results = YouTube.Search.list('statistics', { q: 'dogs', maxResults: 5 });
  for (var i in results.items) {
    var item = results.items[i];
    Logger.log(item);
  }
}
I can use 'id', 'snippet', or 'id, snippet', but I cannot get it to work with 'statistics'. I've been looking for an answer for hours, but I haven't found anything. Any clues?
Per the API documentation, YouTube.Search returns results for Videos, Channels, and Playlists. Not all of these resources have a statistics node, so the YouTube.Search endpoint does not allow querying for it - only for id and snippet.
For resources that do track statistics, like Videos, you query them directly. Since Videos.list does not search, you need to search first and then pass in the relevant video IDs. Note that you can change the search order (and many other search properties) - full details are available in the API reference - but the default sort is 'relevance'.
As an example:
function getVideoStatistics(videoIds) {
  const options = {
    id: videoIds.join(","),
    fields: "nextPageToken,items(id,statistics)"
  };
  const results = [];
  do {
    var search = YouTube.Videos.list('statistics', options);
    if (search.items && search.items.length)
      Array.prototype.push.apply(results, search.items);
    options.pageToken = search.nextPageToken;
  } while (options.pageToken);
  return results;
}

function getVideosFromQuery(query, maxResults) {
  const options = {
    q: query,
    maxResults: maxResults,
    type: 'video',
    fields: "nextPageToken,pageInfo/totalResults,items(id/videoId,snippet(title,channelTitle))"
  };
  const results = [];
  do {
    var search = YouTube.Search.list('snippet', options);
    if (search.items && search.items.length)
      Array.prototype.push.apply(results, search.items);
    options.pageToken = search.nextPageToken;
  } while (options.pageToken && results.length < search.pageInfo.totalResults && results.length < maxResults);
  return results;
}

function foo() {
  var someQuery = "something";
  var searchResults = getVideosFromQuery(someQuery, 50);
  var ids = searchResults.map(function (videoSearchResult) {
    return videoSearchResult.id.videoId;
  });
  var stats = getVideoStatistics(ids);
  console.log({ message: "query video statistics", searchResults: searchResults, statistics: stats });
}
Is there a fast way to programmatically export all responses from a Google Form to a CSV? Something like "Export responses to CSV" invoked via Apps Script.
Right now I'm doing it in a rock-art way (roughly the sketch after this list):
1. Iterate over the forms I want to export (~75)
2. Open each form: var form = FormApp.openById(formId);
3. Get the responses: var formReponses = form.getResponses(); (from 0 to 700 responses per form)
4. Iterate over the responses and get the item responses: var preguntes = formReponses[r].getItemResponses();
5. Convert each itemResponse to CSV/JSON
6. Export the responses to a Drive file
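A minimal sketch of that procedure (the form IDs, destination folder, and serialization are assumptions for illustration):

function exportAllForms() {
  const formIds = ['form-id-1', 'form-id-2']; // hypothetical IDs; ~75 of them in practice
  const folder = DriveApp.getFolderById('dest-folder-id'); // hypothetical destination folder
  formIds.forEach(formId => {
    const form = FormApp.openById(formId);
    // One CSV line per response, one serialized answer per item.
    const rows = form.getResponses().map(response =>
      response.getItemResponses()
        .map(item => JSON.stringify(item.getResponse()))
        .join(',')
    );
    folder.createFile(formId + '.csv', rows.join('\n'), MimeType.CSV);
  });
}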
This is extremely slow, and additionally it hangs over and over, so I had to export the responses in chunks of 50 and save them in separate Drive files. On the next execution (after letting the servers cool down for a while), I run the script again, skipping the number of responses already found in the chunk file.
Additionally, I'm not sure that Google keeps the response order consistent when calling form.getResponses(); (actually, I've found that if the form has been modified, the order is not the same).
Is there a better way to do it?
With the help of @JackBrown I've managed to write a Chrome extension to download the responses (maybe soon on GitHub). It waits for each download in the formIds object to finish and then moves on to the next one:
'use strict';

function startDownload() {
  const formIds = {
    'Downloads-subfolder-here': {
      'Download-filename-here': '1-cx-aSAMrTK0IHsQkE... {form-id here}',
      'Another-filename-here': '...-dnqdpnEso {form-id here}',
      // ...
    },
    'Another-subfolder-here': {
      'Download-filename-here': '1-cx-aSAMrTK0IHsQkE... {form-id here}',
      'Another-filename-here': '...-dnqdpnEso {form-id here}',
      // ...
    },
  };
  // Flatten the nested map into a list of download descriptors.
  const destFolders = Object.keys(formIds);
  const downloads = [];
  for (let t = 0, tl = destFolders.length; t < tl; t += 1) {
    const destFolder = destFolders[t];
    const forms = Object.keys(formIds[destFolder]);
    for (let f = 0, fl = forms.length; f < fl; f += 1) {
      const formName = forms[f];
      downloads.push({
        destFolder,
        formName,
        url: `https://docs.google.com/forms/d/${formIds[destFolder][formName]}/downloadresponses?tz_offset=-18000000`,
        filename: `myfolder/${destFolder}/${formName.replace(/\//g, '_')}.csv`,
      });
    }
  }
  const event = new Event('finishedDownload');
  const eventInterrupt = new Event('interruptedDownload');
  let currId;
  // Signal completion or interruption of the download currently in flight.
  chrome.downloads.onChanged.addListener((downloadDelta) => {
    if (downloadDelta.id === currId) {
      if (downloadDelta.state && downloadDelta.state.current === 'complete') {
        document.dispatchEvent(event);
      } else if (downloadDelta.state && downloadDelta.state.current === 'interrupted') {
        console.log(downloadDelta);
        document.dispatchEvent(eventInterrupt);
      }
    }
  });
  // Chain the downloads so each one starts only after the previous finished.
  downloads.reduce((promise, actual) => {
    return promise.then((last) => (last ? new Promise((resolve) => {
      const { url, filename, destFolder, formName } = actual;
      function listener() {
        document.removeEventListener('finishedDownload', listener);
        document.removeEventListener('interruptedDownload', interrupt);
        resolve(true);
      }
      function interrupt() {
        document.removeEventListener('finishedDownload', listener);
        document.removeEventListener('interruptedDownload', interrupt);
        resolve(false);
      }
      console.log(`Processing ${destFolder}, ${formName}: ${url}`);
      document.addEventListener('finishedDownload', listener);
      document.addEventListener('interruptedDownload', interrupt);
      chrome.downloads.download({ url, filename }, (downloadId) => {
        currId = downloadId;
        if (!downloadId) {
          console.log('Error downloading...');
          console.log(chrome.runtime.lastError);
          resolve(false);
        }
      });
    }) : Promise.resolve(false)));
  }, Promise.resolve(true));
}

chrome.browserAction.onClicked.addListener((/*tab*/) => startDownload());
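For this to work, the extension's manifest needs the downloads permission and a browser action; a minimal Manifest V2 sketch (the name and file names are assumptions):

{
  "manifest_version": 2,
  "name": "Form responses downloader",
  "version": "1.0",
  "permissions": ["downloads"],
  "background": { "scripts": ["background.js"], "persistent": true },
  "browser_action": { "default_title": "Download form responses" }
}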