I have this function running in a Google Sheets script that pulls HTML from subreddits and returns the results to a spreadsheet. It works for me some or most of the time, but other times I get the error "Could not parse text. (line 13)", which is the line with var doc = Xml.parse(page, true);. Any idea why this is happening, or is this just a bug with Google Apps Script? Here's the code that works... sometimes.
function getRedditHTML() {
var entries_array = [];
var subreddit_array = ['https://www.reddit.com/r/news/','https://www.reddit.com/r/funny/','https://www.reddit.com/r/science/'];
for (var s = 0; s < subreddit_array.length; s++) {
var page = UrlFetchApp.fetch(subreddit_array[s]);
//this is Line 13 that is breaking
var doc = Xml.parse(page, true);
var bodyHtml = doc.html.body.toXmlString();
doc = XmlService.parse(bodyHtml);
var root = doc.getRootElement();
var entries = getElementsByClassName(root,'thing');
for (var i = 0; i < entries.length; i++) {
var title = getElementsByClassName(entries[i],'title');
title = XmlService.getRawFormat().format(title[1]).replace(/<[^>]*>/g, "");
var link = getElementsByClassName(entries[i],'comments');
link = link[0].getAttribute('href').getValue();
var rank = getElementsByClassName(entries[i],'rank');
rank = rank[0].getValue();
var likes = getElementsByClassName(entries[i],'likes');
likes = likes[0].getValue();
entries_array.push([rank, likes, title, link]);
}
}
return entries_array.sort(function (a, b) {
return b[1] - a[1];
});
}
Here is what I found upon playing with importXML (my usual way of doing this). For some reason I cannot narrow down, it DOES appear to randomly stall out and return null for a few minutes, so I'm guessing the issue is not your code but that the site or Google temporarily blocks or won't return the data.
However, I found the JSON endpoint for the piece you want, and I noticed that when the XML went down, the JSON didn't.
You can take that and adapt it to push your own array of topics/URLs; I just left it as one link for now to show you how the URL breaks down and where it should be modified.
The URL is 'https://www.reddit.com/r/news/hot.json?raw_json=1&subredditName=news&sort=top&t=day&feature=link_preview&sr_detail=true&app=mweb-client'
news is mentioned in two places, so just modify all your URLs to follow that pattern; you can easily load that JSON in a browser to see all the fields available.
Also, the hot.json portion is where you change whether you want the ranked list (called hot), or new, top, promoted, etc.; you just change that keyword.
score is the same as the upvotes/likes.
function getSubReddit() {
var ss = SpreadsheetApp.getActiveSpreadsheet();
var sheet = ss.getActiveSheet(); //get Active sheet
var subject = 'news';
var url = 'https://www.reddit.com/r/' + subject + '/hot.json?raw_json=1&subredditName=' + subject + '&sort=top&t=day&feature=link_preview&sr_detail=true&app=mweb-client'; //json endpoint for data
var response = UrlFetchApp.fetch(url); // get api endpoint
var json = response.getContentText(); // get the response content as text
var redditData = JSON.parse(json); //parse text into json
Logger.log(redditData); //log data to logger to check
//create empty array to hold data points
var statsRows = [];
var date = new Date(); //create new date for timestamp
//The following lines push the parsed json into empty stats array
for (var i = 0; i < redditData.data.children.length; i++){
var post = redditData.data.children[i].data;
var stats = [];
stats.push(date); //timestamp
stats.push(i+1); //rank
stats.push(post.score); //score
stats.push(post.title); //title
stats.push(post.url); //article url
// stats.push('http://www.reddit.com' + post.permalink); //reddit permalink
statsRows.push(stats);
//append the row to the active sheet
sheet.appendRow(stats);
}
}
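As noted above, you can adapt this to your own array of subreddits. Here is a minimal sketch, assuming each subreddit's JSON response has the same data.children shape; the getSubReddits name and the subreddit list are just examples:
// Sketch: fetch several subreddits with the same JSON endpoint pattern
// (assumed response shape; subreddit names taken from the question).
function getSubReddits() {
  var sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
  var subjects = ['news', 'funny', 'science'];
  var date = new Date();
  var statsRows = [];
  for (var s = 0; s < subjects.length; s++) {
    var url = 'https://www.reddit.com/r/' + subjects[s] + '/hot.json?raw_json=1&subredditName=' + subjects[s] + '&sort=top&t=day&feature=link_preview&sr_detail=true&app=mweb-client';
    var redditData = JSON.parse(UrlFetchApp.fetch(url).getContentText());
    var children = redditData.data.children;
    for (var i = 0; i < children.length; i++) {
      var post = children[i].data;
      statsRows.push([date, subjects[s], i + 1, post.score, post.title, post.url]);
    }
  }
  // one batched write instead of one appendRow call per row
  sheet.getRange(sheet.getLastRow() + 1, 1, statsRows.length, statsRows[0].length).setValues(statsRows);
}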
Related
I've written a custom Google Apps Script that pulls some data (2 columns wide, 50-100 rows long, but this varies) from an API as a 2D array, parses it as JSON, and then pastes it into a Google Sheet.
I can run the script from the editor and it works OK. But when I try to run it from a custom menu, or when I run the debugger, I get the following error:
'Cannot find method getRange(number,number,(class),number) (line 43)'
Line 43 is the last line of the code.
sheet.getRange(3,1,dataSet.length,2).setValues(rows);
It seems that the issue is that getRange is not able to use the variable length of the dataset (the number of rows) to set the number of rows in the range where the data is to be pasted.
I cannot work out how to fix this - can anyone else see what I am doing wrong? Thanks for taking a look.
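For what it's worth, getRange(row, column, numRows, numColumns) should accept a plain number for numRows, so I'd expect a call like this minimal sketch (with made-up data) to work:
var rows = [['Acme Ltd', 1000], ['Globex', 2500]]; // made-up 2-column data for illustration
sheet.getRange(3, 1, rows.length, 2).setValues(rows); // rows.length is a plain number here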
//custom menu
function onOpen() {
var ui = SpreadsheetApp.getUi();
ui.createMenu('XXXX Data')
.addItem('Credit Limits','CREDITLIMITS')
.addToUi();
}
function CREDITLIMITS() {
var ss = SpreadsheetApp.getActiveSpreadsheet(); //get active spreadsheet
var sheet = ss.getActiveSheet();
// var sheet = ss.getSheetByName('data'); //get sheet by name from active spreadsheet
// URL and params for the API
var USERNAME = 'XXXXXXX';
var PASSWORD = 'XXXXXXXXXXXXX';
var url = 'https://api.XXXX.com/api/v1/XXX/?where=type=%27XXXXXXX%27'; // var url="http://example.com/feeds?type=json"; // Paste your JSON URL here
var authHeader = 'Basic ' + Utilities.base64Encode(USERNAME + ':' + PASSWORD);
var params = {'method': 'GET','muteHttpExceptions': true,'headers': {'Authorization': authHeader,} };
//call the XXXX API
var response = UrlFetchApp.fetch(url, params); // get api endpoint
var json = response.getContentText(); // get the response content as text
var dataAll = JSON.parse(json); //parse text into json
var dataSet = dataAll;
//create empty array to hold data points
var rows=[],
data;
//loop over the retrun events
for (i=0; i < dataSet.length; i++) {
data = dataSet[i];
//push a row of data as 2d array
rows.push([data.company, data.creditLimit]);
}
// clear any previous content
sheet.getRange(1,1,500,10).clearContent();
// write data to sheet
sheet.getRange(3,1,dataSet.length,2).setValues(rows);
}
In a spreadsheet cell I have a link to a Google Doc, and I want to retrieve it in order to open the Google Doc and modify it. So in my cell I've put this format, https://docs.google.com/document/d/1Gb48I...... (without the /edit at the end), and I've tried:
var body = '=HYPERLINK("'data[n][COLUMN_URL-1]'+'/edit'")'.getBody();
// this one works but I want to use data[n][COLUMN_URL-1] because I have several Google Docs links
var body = DocumentApp.openByUrl('https://docs.google.com/document/d/1Gb48IXos......../edit').getBody();
if(body){
//edit the Google doc
If you have ideas on what to do, thank you, because with the concatenation of /edit (the openByUrl line above) it works.
Edit: The cell is not formatted (hyperlinked) or a formula; I have only the https://...
Edit 2: I've tried the code from Suhail Ansari (thank you), but I get an error that the document is missing, and I get nothing in the logs. If you have other ideas:
var COLUMN_URL = 10 ;
function getIdFrom(url) {
var id = "";
var parts = url.split(/^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?/);
if (url.indexOf('?id=') >= 0){
id = (parts[6].split("=")[1]).replace("&usp","");
return id;
} else {
id = parts[5].split("/");
//Using sort to get the id as it is the longest element.
var sortArr = id.sort(function(a,b){return b.length - a.length});
id = sortArr[0];
return id;
}
}
var sheet = SpreadsheetApp.openById(SPREADSHEET_ID).getSheetByName(SHEET_NAME);
var numRows = sheet.getLastRow();
var lastColumn = sheet.getLastColumn();
var data = sheet.getRange(1,1,numRows,lastColumn).getDisplayValues();
for(n=1;n < data.length;n++) {
var URL =data[n][COLUMN_URL-1];
Logger.log('The URL ',URL);
var id = getIdFrom(URL);
Logger.log('The ID ',id);
var body = DocumentApp.openById(id).getBody();
if(body)
{......//edit Google Doc
This is the 10th column, with the URLs of the Google Docs.
In fact I have docs.google.com/open?id=1qo0B_HCbjcBrYyJ ... as the URL for the other rows; I entered the 3rd one by hand, as you can see in the picture.
Edit 3: I am very confused at this point, because the column I have in the spreadsheet comes from a loop where I put the URL of the Google Doc in the cell like this: var trange = sheet.getRange.. trange.setValue(doc.getUrl()); and now, if I look at the table, I have the links in this format:
https://docs.google.com/open?id=1RYRVotAq6IOz5tys1krnENfgN_pU0KYuUzR24i
So now, if I want to get the Google Doc back, I have the following:
// var body = DocumentApp.openByUrl('https://docs.google.com/open?id=1Xb4QjiWwpn2TJ8-9kkIhU6m1rgmb48g6xYVopN').getBody();
//var body = DocumentApp.openByUrl('https://docs.google.com/1Xb4QjiWwpn2TJ8-9kkIhU6m1rgmb48g6xYVopN/edit').getBody();
var body = DocumentApp.openByUrl('https://docs.google.com/document/d/1Xb4QjiWwpn2TJ8-9kkIhU6m1rgmb48g6xYVopN/edit').getBody();
Only the 3rd one works. Where am I going wrong? Because I want to loop with for(n=1; n < data.length; n++) {... and use var URL = data[n][COLUMN_URL-1];
You could add some code for getting the Doc ID from the URL and then open the doc by using
DocumentApp.openById(id);
Here is a link to do that.
Sample code is below, with help from the answer linked above:
function myFunction() {
var URL = SpreadsheetApp.getActiveSheet().getRange('A1').getValue();
var id = getIdFrom(URL);
var doc = DocumentApp.openById(id);
var body = doc.getBody();
Logger.log(body.getText());
}
function getIdFrom(url) {
var id = "";
var parts = url.split(/^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?/);
if (url.indexOf('?id=') >= 0){
id = (parts[6].split("=")[1]).replace("&usp","");
return id;
} else {
id = parts[5].split("/");
//Using sort to get the id as it is the longest element.
var sortArr = id.sort(function(a,b){return b.length - a.length});
id = sortArr[0];
return id;
}
}
Demo Google Sheet
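If you need this for every row rather than a single cell, here is a minimal sketch that loops over the URL column with the same getIdFrom helper; the COLUMN_URL value and range follow the question's setup, and the function name is just an example:
// Sketch: open each Doc listed in column 10, skipping header row and empty cells.
function openDocsFromColumn() {
  var COLUMN_URL = 10; // column holding the Doc URLs, as in the question
  var sheet = SpreadsheetApp.getActiveSheet();
  var data = sheet.getRange(1, 1, sheet.getLastRow(), sheet.getLastColumn()).getDisplayValues();
  for (var n = 1; n < data.length; n++) {
    var url = data[n][COLUMN_URL - 1];
    if (!url) continue; // skip empty cells
    var body = DocumentApp.openById(getIdFrom(url)).getBody();
    Logger.log(body.getText());
  }
}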
With the help of this SO question, I am trying to scrape the following website. I would like the two teams and the time. For example, the first entry would be Chicago | Miami | 12:30 PM, and the last entry would be Colorado | Arizona | 10:10 PM. My code is as follows:
function espn_schedule() {
var url = "http://www.espn.com/mlb/schedule/_/date/20180329";
var content = UrlFetchApp.fetch(url).getContentText();
var scraped = Parser.data(content).from('class="schedule has-team-logos align-left"').to('</tbody>').iterate();
var res = [];
var temp = [];
var away_ticker = "";
scraped.forEach(function(e){
var away_team = Parser.data(e).from('href="mlb/team/_/name/').to('"').build();
var time = Parser.data(e).from('a data-dateformat="time1"').to('</a>').build();
if (away_ticker == "") away_ticker = away_team;
if (away_team != away_ticker) {
temp.splice(1, 0, away_ticker);
res.push(temp);
temp = [];
away_ticker = away_team;
temp.push(time);
}
});
var ss = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Schedule");
ss.getRange(ss.getLastRow() + 1, 1, res.length, res[0].length).setValues(res);
}
I get the following error:
TypeError: Cannot read property "length" from undefined. (line 42, file "Code")
Here is a modified solution that works
function espn_schedule() {
var url = "http://www.espn.com/mlb/schedule/_/date/20180329";
var content = UrlFetchApp.fetch(url).getContentText();
var e = Parser.data(content).from('class="schedule has-team-logos align-left"').to('</tbody>').build();
var res = [];
//Logger.log(scraped[0])
var temp = [];
var away_ticker = "";
var teams = Parser.data(e).from('<abbr title="').to('">').iterate();
Logger.log(teams)
var time = Parser.data(e).from('data-date="').to('">').iterate()
Logger.log(time)
for( var i = 0; i<teams.length ; i = i+2)
{
res[i/2] = []
res[i/2][0] = teams[i]
res[i/2][1] = teams[i+1]
res[i/2][2] = new Date(time[i/2]).toLocaleTimeString('en-US')
}
Logger.log(res)
var ss = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Schedule");
ss.getRange(ss.getLastRow() + 1, 1, res.length, res[0].length).setValues(res);
}
Modification explained:
1) Since you access only the first table, you don't need to iterate during parsing; you can just get the first table. Also, since you get just the first table, you don't need to use forEach to loop through each element.
var e = Parser.data(content)
.from('class="schedule has-team-logos align-left"')
.to('</tbody>')
.build(); //Use build instead of iterate
2) Instead of parsing the HTML link to get the team name, you can use the <abbr title=" element to scrape the name. Furthermore, you can iterate over all the team names in the table to get an array of team names.
var teams = Parser.data(e).from('<abbr title="').to('">').iterate();
3) Similar to the above modification, you can get the time by using the data-date attribute. This gives you a date string that can be read by the Date() class. Again, we iterate over the table to get all the times.
var time = Parser.data(e).from('data-date="').to('">').iterate()
4) Finally, we use a for loop to arrange the teams and times in the array called res. This allows the data to be inserted into the sheet directly.
for( var i = 0; i<teams.length ; i = i+2) //each loop adds 2 to the counter
{
res[i/2] = []
res[i/2][0] = teams[i] //even team (starts at zero)
res[i/2][1] = teams[i+1] //vs odd teams
res[i/2][2] = new Date(time[i/2]).toLocaleTimeString('en-US')
}
Reference:
Date(),Date.toLocaleTimeString()
Edit:
Reason for the error: in the code below,
Parser.data(e).from('href="mlb/team/_/name/').to('"').build()
you are looking for the string 'href="mlb/team/_/name/', however it should be 'href="/mlb/team/_/name/'. Note the difference: mlb vs /mlb.
Secondly, in the following code
Parser.data(e).from('a data-dateformat="time1"').to('</a>').build();
The string should be 'a data-dateFormat': when you inspect the website it is shown as dateformat, but when you fetch the page with UrlFetchApp and log the text, it is shown as dateFormat.
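For completeness, here is a sketch of the original parsing calls with those two strings corrected, assuming the fetched markup really matches the casing described above:
// corrected search strings for the original approach (unverified sketch)
var away_team = Parser.data(e).from('href="/mlb/team/_/name/').to('"').build();
var time = Parser.data(e).from('a data-dateFormat="time1"').to('</a>').build();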
First, I'm not a native English speaker, so there may be some mistakes. And I'm a newbie.
function myFunction() {
var response = UrlFetchApp.fetch("https://api.ookami.me/v1/news/public?sport_id=1");
var json = JSON.parse(response.getContentText());
var news = JSON.parse(json.getContentText());
Logger.log("id");
Logger.log("url");
Logger.log("image");
}
I wrote "news" in the code at line 8, and that time the log displayed; but when I wrote it again, it didn't display.
And the log showed id, url, image and so on, so I added the code at line 6 and wrote the code at lines 8 to 10. But the log displayed "undefined".
I want to get the id, url, and image data from this API with GAS, and export the data to a spreadsheet.
You're almost there:
At this line:
var json = JSON.parse(response.getContentText());
You now have all the data parsed into an object.
If you want the news element, you need to change your line var news = JSON.parse(json.getContentText()); to var news = json.news;
Then you can loop through all the values:
for(var i = 0; i < news.length; i++) {
var obj = news[i];
Logger.log(obj.summary);
}
To export to spreadsheet, you just need to populate an Array inside the loop, and it should look like this:
var rows = []; // new values
for(var i = 0; i < news.length; i++) {
var obj = news[i];
rows.push([obj.id, obj.image,obj.url]); //your JSON entities here
}
Logger.log(rows);
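To then write those rows to a sheet, a minimal sketch (assuming the active sheet is the target and rows is non-empty) would be:
var sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
// one batched write; rows must be a rectangular 2D array
sheet.getRange(sheet.getLastRow() + 1, 1, rows.length, rows[0].length).setValues(rows);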
I want to parse the data I get from UrlFetch into a spreadsheet, but all I'm getting is undefined. Can someone show me what I'm doing wrong?
The xml is at the address https://dl.dropbox.com/u/11787731/Minecraft/bans.xml
function runevery15mins() {
var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("MC Bans");
sheet.clearContents();
var banURL = "https://dl.dropbox.com/u/11787731/Minecraft/bans.xml";
var banXML = UrlFetchApp.fetch(banURL).getContentText();
var banDOC = Xml.parse(banXML, false);
var mcuser = banDOC.bans;
var x = 0;
for(var c=0; c>mcuser.length;c++){
var name = mcuser.getElement("username")[c].getText();
var date = mcuser.getElement("date")[c].getText();
var reason = mcuser.getElement("reason")[c].getText();
var duration = mcuser.getElement("duration")[c].getText();
}
sheet.appendRow([name, date, reason, duration]);
}
You have some small errors in your code.
For example, the second argument in the for loop needs to be c<mcuser.length.
Using the Xml service documentation, this worked for me
function runevery15mins() {
var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("MC Bans");
sheet.clearContents();
var banURL = "https://dl.dropbox.com/u/11787731/Minecraft/bans.xml";
var banXML = UrlFetchApp.fetch(banURL).getContentText();
var banDOC = Xml.parse(banXML, false);
// Get all the child nodes from the document element that are 'user' nodes
var mcusers = banDOC.getElement().getElements('user');
for(var c=0; c<mcusers.length;c++){
var user = mcusers[c];
var name = user.getElement('username').getText();
var date = user.getElement('date').getText();
var reason = user.getElement('reason').getText();
var duration = user.getElement('duration').getText();
sheet.appendRow([name, date, reason, duration]);
}
}
Note for example that the sheet.appendRow line is INSIDE the loop, not outside as you had it before. I also deleted the x variable, since I didn't see any purpose for it.
I also created a user variable, which is an XmlElement, to make it easier to understand how to get the different contents of each node.
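For reference, here is a rough equivalent using XmlService instead of the legacy Xml service; this is only a sketch that assumes the same bans XML structure, with user children holding username, date, reason, and duration elements:
// Sketch: same logic rewritten with XmlService (assumed XML structure as above).
function runevery15minsXmlService() {
  var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("MC Bans");
  sheet.clearContents();
  var banXML = UrlFetchApp.fetch("https://dl.dropbox.com/u/11787731/Minecraft/bans.xml").getContentText();
  var root = XmlService.parse(banXML).getRootElement();
  var users = root.getChildren('user'); // assumes <user> children under the root element
  for (var c = 0; c < users.length; c++) {
    var user = users[c];
    sheet.appendRow([
      user.getChild('username').getText(),
      user.getChild('date').getText(),
      user.getChild('reason').getText(),
      user.getChild('duration').getText()
    ]);
  }
}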
You were almost there.
Looks like there was another array you needed to drill down into. Also, your call back to the spreadsheet should be in the loop. Try this:
...
var mcuser = banDOC.bans.user;
for(var i in mcuser){
var name = mcuser[i].getElement("username").getText();
var date = mcuser[i].getElement("date").getText();
var reason = mcuser[i].getElement("reason").getText();
var duration = mcuser[i].getElement("duration").getText();
sheet.appendRow([name, date, reason, duration])
}