Not Able to Scrape table in Google Sheets - google-apps-script

With the help of this SO questionsI am trying to scrape the following website. I would like the two teams and the time. For example, the first entry would be Chicago | Miami | 12:30 PM, and the last entry would be Colorado | Arizona | 10:10 PM. My code is as follows
function espn_schedule() {
var url = "http://www.espn.com/mlb/schedule/_/date/20180329";
var content = UrlFetchApp.fetch(url).getContentText();
var scraped = Parser.data(content).from('class="schedule has-team-logos align-left"').to('</tbody>').iterate();
var res = [];
var temp = [];
var away_ticker = "";
scraped.forEach(function(e){
var away_team = Parser.data(e).from('href="mlb/team/_/name/').to('"').build();
var time = Parser.data(e).from('a data-dateformat="time1"').to('</a>').build();
if (away_ticker == "") away_ticker = away_team;
if (away_team != away_ticker) {
temp.splice(1, 0, away_ticker);
res.push(temp);
temp = [];
away_ticker = away_team;
temp.push(time);
}
});
var ss = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Schedule");
ss.getRange(ss.getLastRow() + 1, 1, res.length, res[0].length).setValues(res);
}
I get the following error:
TypeError: Cannot read property "length" from undefined. (line 42, file "Code")

Here is a modified solution that works
function espn_schedule() {
var url = "http://www.espn.com/mlb/schedule/_/date/20180329";
var content = UrlFetchApp.fetch(url).getContentText();
var e = Parser.data(content).from('class="schedule has-team-logos align-left"').to('</tbody>').build();
var res = [];
//Logger.log(scraped[0])
var temp = [];
var away_ticker = "";
var teams = Parser.data(e).from('<abbr title="').to('">').iterate();
Logger.log(teams)
var time = Parser.data(e).from('data-date="').to('">').iterate()
Logger.log(time)
for( var i = 0; i<teams.length ; i = i+2)
{
res[i/2] = []
res[i/2][0] = teams[i]
res[i/2][1] = teams[i+1]
res[i/2][2] = new Date(time[i/2]).toLocaleTimeString('en-US')
}
Logger.log(res)
var ss = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Schedule");
ss.getRange(ss.getLastRow() + 1, 1, res.length, res[0].length).setValues(res);
}
Modification explained:
1) Since you access only the first table you don't need to iterate during parsing and just get the first table. Also, since you get just the first table, you don't need to use forEach to loop through each element.
var e = Parser.data(content)
.from('class="schedule has-team-logos align-left"')
.to('</tbody>')
.build(); //Use build instead of iterate
2) Instead of parsing the HTML link to get the team name, you can use <abbr title=" element to scrape the name. Furthermore, you can iterate over all the team names in the table to get an array of team names.
var teams = Parser.data(e).from('<abbr title="').to('">').iterate();
3) Similar to the above modification, you can get the time by using the data-date tag. This gives you date which can read by Date() class. Again, we iterate over the table to get all the times
var time = Parser.data(e).from('data-date="').to('">').iterate()
4) Finally, we use for loop to rearrange the teams and time in the array called res. This allows for inserting the data into the sheet directly.
for( var i = 0; i<teams.length ; i = i+2) //each loop adds 2 to the counter
{
res[i/2] = []
res[i/2][0] = teams[i] //even team (starts at zero)
res[i/2][1] = teams[i+1] //vs odd teams
res[i/2][2] = new Date(time[i/2]).toLocaleTimeString('en-US')
}
Reference:
Date(),Date.toLocaleTimeString()
Edit:
Reason for error, in the below code
Parser.data(e).from('href="mlb/team/_/name/').to('"').build()
you are looking for string 'href="mlb/team/_/name/', however it should be href="/mlb/team/_/name/'. Note the difference mlb vs /mlb.
Secondly, in the following code
Parser.data(e).from('a data-dateformat="time1"').to('</a>').build();
The string should be a data-dateFormat, when you inspect the website it shown as dateformat. However, when you call it using URLfetch and log the text, it is shown as dateFormat

Related

Parse XML to Google Spreadsheet in google Apps Script

I need to parse a XML file to Google Spreadsheet. I need all the data from each row "row".
Every URL should have its own row in spreadsheet for all its values.
XML File, example:
<response>
<method>domain.urls</method>
<answer>
<row url="https://www.example.com/1" top10="3048" top100="4490" visindex="9.1068505804717"/>
<row url="https://www.example.com/2" top10="2633" top100="2720" visindex="8.6659210425021"/>
<row url="https://www.example.com/3" top10="875" top100="964" visindex="2.7381900000597"/>
</answer>
<credits used="4"/>
</response>
I started with this function and got one value back (yay!)
for (var i = 0; i < items.length; i++) {
if(items[i].getName() == 'answer'){
var answer = items[i].getChildren();
return answer[0].getAttribute('visindex').getValue();
}
}
Tis function writes the value (answer) to spreadhseet
var seoValue = getSeoValue(apikey, seoMetric, keyword, country);
outputSheet.getRange(outputLastRow, 6 + i ).setValue(seoValue/1); //aktuell nur 1 outputwert
}
// increase the last output row by one
outputLastRow++;
}
I dont knwo how to collect all the values from a row and save them to spreadhseet.
Output spreadhsheet example:
INPUT - (excerpt)
<row url="https://www.example.com/1" top10="3048" top100="4490" visindex="9.1068505804717"/>
<row url="https://www.example.com/2" top10="2633" top100="2720" visindex="8.6659210425021"/>
<row url="https://www.example.com/3" top10="875" top100="964" visindex="2.7381900000597"/>
OUTPUT - Row A1 | B1 | C1 | D1
values row-1 -> URL-1-value | top-10-value-1 | top-100-value-1 | visindex-value-1
values row-2 -> URL-2-value | top-10-value-2 | top-100-value-2 | visindex-value-2
And one more thing that kills me: as far as I understand, I need to convert the URL to a string.
Apps Script has an XML Service that you can use to parse data. Here's a way you can do it based on one of the examples there. You can just paste it on a new Sheet's Apps Script project to test and modify at your convenience.
function xmlParser() {
//input should be your xml file as text
let xml = '<response><method>domain.urls</method><answer><row url="https://www.example.com/1" top10="3048" top100="4490" visindex="9.1068505804717"/><row url="https://www.example.com/2" top10="2633" top100="2720" visindex="8.6659210425021"/><row url="https://www.example.com/3" top10="875" top100="964" visindex="2.7381900000597"/></answer><credits used="4"/></response>';
let document = XmlService.parse(xml); //have the XML service parse the document
let root = document.getRootElement(); //get the root element of the document
let answers = root.getChild("answer").getChildren("row"); //gets the 'answer' node, and a list of its subnodes, note that we use getChildren() to get them all in an array
//now the answers array contains each <row> element with all its attributes
const list = [] //we create an array that will hold the data
answers.forEach(function (row) {
//forEach function that iterates through all the row nodes and uses
//getAttribute() to get their values based on the names we know already
//we push each element to our list array
list.push([row.getAttribute("url").getValue(), row.getAttribute("top10").getValue(), row.getAttribute("top100").getValue(), row.getAttribute("visindex").getValue()])
}
)
writeToSheet(list) // after the array is populated you can call another function to paste in the Sheet
}
function writeToSheet(list) {
//first set a range where you will paste the data. You have to define the length with the input array
//the first two parameters are "1, 1" for row 1, column 1, but you can change this depending on your needs.
let range = SpreadsheetApp.getActiveSheet().getRange(1, 1, list.length, list[0].length)
//once we have the array set you can just call setValues() on it which pastes the array on its own
range.setValues(list)
}
Output looks like this:
References:
XML Service
getRange()
setValues()
Holy frak that worked Daniel.
I put some scripts together and it gives me what I need.
Google Apps Script: SISTRIX API Call page.urls. Its crap but output is okay.
function getData() {
var spreadSheet = SpreadsheetApp.getActiveSpreadsheet();
var inputSheet = spreadSheet.getSheets()[0];
var outputSheet = spreadSheet.getSheets()[1];
// get the last non-empty row number in the input sheet
var inputLastRow = inputSheet.getLastRow();
// get the first empty row number in the output sheet
var outputLastRow = outputSheet.getLastRow() + 1;
// get the api key from the input sheet
var apikey = inputSheet.getRange('A2').getValue();
//var week = getWeek();
// get the input for queries
var inputs = inputSheet.getRange('A11:E' + inputLastRow).getValues();
// specify the SISTRIX KPIs for the client and the competitor(s)
var clientSeoMetrics = inputSheet.getRange('A5').getValue().split(',');
// loop over rows in the input
for (var row = 0; row < inputLastRow - 10; row++) {
// specify inputs - which column means what
//var keyword = inputs[row][0].toLowerCase();
//var country = inputs[row][2].toLowerCase();
var domain = inputs[row][0].toLowerCase();
var limit = inputs[row][1];
//write the basic information to output
//outputSheet.getRange('A'+outputLastRow).setValue(keyword);
//B Suchvolumen
//outputSheet.getRange('C'+outputLastRow).setValue(country.toUpperCase());
//D wird Search Intent
//outputSheet.getRange('E'+outputLastRow).setValue(week);
// check if competition or client and take proper KPIs
var seoMetrics;
seoMetrics = clientSeoMetrics; //eigenltich unnoetig - evtl. fuer intent sinnvoll (if intent dann)
// loop over seometrics - falls weitere Metriken
for (var i = 0; i < seoMetrics.length; i++) {
var seoMetric = seoMetrics[i];
if (seoMetric == ""){
break;
}
// run seoMetric query
//var seoValue = getSeoValue(apikey, seoMetric, domain, limit);
//outputSheet.getRange(outputLastRow, 5 + i ).setValue(seoValue/1); //aktuell nur 1 outputwert
//var seoValue = getSeoValue(apikey, seoMetric, domain, limit);
//outputSheet.getRange(outputLastRow, 5 + i ).setValue(seoValue/1); //aktuell nur 1 outputwert
var seoValue = [] ;
var seoValue = getSeoValue(apikey, seoMetric, domain, limit);
}
// increase the last output row by one
outputLastRow++;
}
function getSeoValue(apikey, seoMetric, domain, limit){
var url = "https://api.sistrix.com/"+seoMetric+"?domain="+domain+"&api-key="+apikey+"&country=de&limit="+limit;
var xml = UrlFetchApp.fetch(url).getContentText();
var document = XmlService.parse(xml);
var root = document.getRootElement(); //get the root element of the document
var answers = root.getChild("answer").getChildren("row"); //gets the 'answer' node, and a list of its subnodes, note that we use getChildren() to get them all in an array
//now the answers array contains each <row> element with all its attributes
const list = [] //we create an array that will hold the data
answers.forEach(function (row) {
//forEach function that iterates through all the row nodes and uses
//getAttribute() to get their values based on the names we know already
//we push each element to our list array
list.push([row.getAttribute("url").getValue(), row.getAttribute("top10").getValue(), row.getAttribute("top100").getValue(), row.getAttribute("visindex").getValue()])
}
)
writeToSheet(list) // after the array is populated you can call another function to paste in the Sheet
}
function writeToSheet(list) {
//let range = SpreadsheetApp.getActiveSheet().getRange(1, 1, list.length, list[0].length)
//range.setValues(list)
//outputSheet.getRange(outputLastRow, 5 + i ).setValue(list/1); //aktuell nur 1 outputwert
let range = outputSheet.getRange(outputLastRow, 1, list.length, list[0].length)
range.setValues(list)
}
}

How can I use a variable as a parameter in getSheetByName?

I have the following code which generates some gslides slides from data in a spreadsheet. The code is attached to the gslides file and it works when the parameters for link/year are entered as strings. However, these parameters will sometimes vary from a set list (e.g. pull from a different sheet or from a different file) so the code sometimes needs to be edited.
I want to share this with colleagues who don't know how to use app scripts and so tried to edit it so that the link/year can be pulled from a textbox in one of the slides in the gslides file. They are pulling through ok but when I try to use it as a parameter I get an error for line 11 (var values = sheet.getRange('F2:aa2').getValues();) as the sheet returns as null.
I've tried wrapping it in quotes (" ' " + year + " ' ") but that doesn't work either.
Please let me know if I need to include more info or a link to the sheets/slides files.
function generateStarters() {
var deck = SlidesApp.getActivePresentation();
var slides = deck.getSlides();
var link = slides[2].getPageElements()[2].asShape().getText().asString()
var year = slides[2].getPageElements()[4].asShape().getText().asString()
var dataSpreadsheetUrl = link
var ss = SpreadsheetApp.openByUrl(dataSpreadsheetUrl);
var sheet = ss.getSheetByName(year);
var values = sheet.getRange('F2:aa2').getValues();
var templateSlide = slides[1];
var presLength = slides.length;
values.forEach(function(page){
if(page[0]){
var Q1 = page[0];
var A1 = page[1];
var Q2 = page[4];
var A2 = page[5];
var Q3 = page[8];
var A3 = page[9];
var Q4 = page[12];
var A4 = page[13];
var QK = page[16];
var AK = page[17];
var QF = page[20];
var AF = page[21];
templateSlide.duplicate(); //duplicate the template page
slides = deck.getSlides(); //update the slides array for indexes and length
newSlide = slides[3]; // declare the new page to update
var shapes = (newSlide.getShapes());
shapes.forEach(function(shape){
shape.getText().replaceAllText('{{q1}}',Q1);
shape.getText().replaceAllText('{{a1}}',A1);
shape.getText().replaceAllText('{{q2}}',Q2);
shape.getText().replaceAllText('{{a2}}',A2);
shape.getText().replaceAllText('{{q3}}',Q3);
shape.getText().replaceAllText('{{a3}}',A3);
shape.getText().replaceAllText('{{q4}}',Q4);
shape.getText().replaceAllText('{{a4}}',A4);
shape.getText().replaceAllText('{{qk}}',QK);
shape.getText().replaceAllText('{{ak}}',AK);
shape.getText().replaceAllText('{{qf}}',QF);
shape.getText().replaceAllText('{{af}}',AF);
});
presLength = slides.length;
newSlide.move(presLength);
}
});
} ```
The string from the slides file seems to always have a space at the end (from the textbox) so I just needed to add an extra line of code to remove that space.

Error in Google Sheets Script when parsing XML

I have this function running in a Google Sheets script that pulls HTML from subreddits and returns them to a spreadsheet. It works for me some/most of the time, but other times I get an error "Could not parse text. (line 13)" which is the line with var doc = Xml.parse(page, true);. Any idea why this is happening or is this just a bug with Google Scripts? Here's the code that works...sometimes.
function getRedditHTML() {
var entries_array = [];
var subreddit_array = ['https://www.reddit.com/r/news/','https://www.reddit.com/r/funny/','https://www.reddit.com/r/science/'];
for (var s = 0; s < subreddit_array.length; s++) {
var page = UrlFetchApp.fetch(subreddit_array[s]);
//this is Line 13 that is breaking
var doc = Xml.parse(page, true);
var bodyHtml = doc.html.body.toXmlString();
doc = XmlService.parse(bodyHtml);
var root = doc.getRootElement();
var entries = getElementsByClassName(root,'thing');
for (var i = 0; i < entries.length; i++) {
var title = getElementsByClassName(entries[i],'title');
title = XmlService.getRawFormat().format(title[1]).replace(/<[^>]*>/g, "");
var link = getElementsByClassName(entries[i],'comments');
link = link[0].getAttribute('href').getValue();
var rank = getElementsByClassName(entries[i],'rank');
rank = rank[0].getValue();
var likes = getElementsByClassName(entries[i],'likes');
likes = likes[0].getValue();
entries_array.push([rank, likes, title, link]);
}
}
return entries_array.sort(function (a, b) {
return b[1] - a[1];
});
}
Here is what I found upon playing with importXML (my usual way of doing this) - for some reason I cannot narrow down - it DOES appear to randomly stall out and return null for a few minutes - so I'm guessing the issue with your thing is not the code but that the site or google temporarily blocks/won't return the data -
however I found the JSON endpoint to the piece you want - and I noticed that when XML went down - the JSON didnt.
You can take that and fix it to push your own array of topics/urls - I just left it for one link for now to show you how the URL breaks down and where it should be modified:
The URL is 'https://www.reddit.com/r/news/hot.json?raw_json=1&subredditName=news&sort=top&t=day&feature=link_preview&sr_detail=true&app=mweb-client
News is mentioned in 2 places so just modify all your URLs to follow that method - you can easily load that javascript in a browser to see all the fields available
Also the portion hot.json is where you can change whether you want the ranked list (called hot), or new,top,promoted, etc. you just change that keyword.
Score is the same as the upvotes/likes
function getSubReddit() {
var ss = SpreadsheetApp.getActiveSpreadsheet();
var sheet = ss.getActiveSheet(); //get Active sheet
var subject = 'news';
var url = 'https://www.reddit.com/r/' + subject + '/hot.json?raw_json=1&subredditName=' + subject + '&sort=top&t=day&feature=link_preview&sr_detail=true&app=mweb-client'; //json endpoint for data
var response = UrlFetchApp.fetch(url); // get api endpoint
var json = response.getContentText(); // get the response content as text
var redditData = JSON.parse(json); //parse text into json
Logger.log(redditData); //log data to logger to check
//create empty array to hold data points
var statsRows = [];
var date = new Date(); //create new date for timestamp
//The following lines push the parsed json into empty stats array
for (var j=0;j<25;j++){
for (var i =0;i<25;i++){
var stats=[];
stats.push(date);//timestamp
stats.push(i+1);
stats.push(redditData.data.children[i].data.score); //score
stats.push(redditData.data.children[i].data.title); //title
stats.push(redditData.data.children[i].data.url); //article url
// stats.push('http://www.reddit.com' + redditData.data.children[i].data.permalink); //reddit permalink
statsRows.push(stats)
}
//append the stats array to the active sheet
sheet.appendRow(statsRows[j])
}
}

How to Iterate to output data to Google Sheets with App-Script

Using Google App Script, when I used Logger.log() the for loop iterates properly and I get results for each value. When I try to output this to a google sheet only the last value for each variable is output over and over again for the number of goals.length.
Any help is very much appreciated!
function listGoals() {
var sheet = SpreadsheetApp.getActiveSheet();
var filterList = Analytics.Management.Goals.list(accountId, webPropertyId, profileId)
var goals = filterList.items;
for (var i = 0, goal; goal = goals[i]; i++) {
var accountId = goal.accountId;
var propertyId = goal.webPropertyId;
var goalNumber = goal.id;
var goalName = goal.name;
Logger.log('accountId: ' + accountId);
Logger.log('profileId: ' + propertyId);
Logger.log('goal number: ' + goalNumber);
Logger.log('goal name: ' + goalName);
//Logger.log prints for each result
sheet.getRange(1,1,goals.length).setValue(goalNumber);
sheet.getRange(1,2,goals.length).setValue(goalName);
//this only prints out the last value of goalNumber and goalName to the sheet
}
}
It doesn't only print the last results, it just keeps overwriting the old result with the new one.
goals.length only helps if you then supply an array of arrays containing the values looking as such:
[[1, "Goal 1"],
[2, "Goal 2"]]
If you want to print out a list of goalNumber and goalName you need to offset the cell to write in every time.
something like
sheet.getRange(1+i,1).setValue(goalNumber);
sheet.getRange(1+i,2).setValue(goalName);
To speed up the process a bit and not do two calls for every goal you can store the id name pairs as arrays within an array and do one final setValues call after the loop finishes executing.
function listGoals() {
var sheet = SpreadsheetApp.getActiveSheet();
var filterList = Analytics.Management.Goals.list(accountId, webPropertyId, profileId)
var goals = filterList.items;
var goalsToWrite = [];
for (var i = 0, goal; goal = goals[i]; i++) {
goalsToWrite.push([goal.id, goal.name]);
}
sheet.getRange(1, 1, goals.length, 2).setValues(goalsToWrite);
}

Parsing XML Data that I receive from UrlFetch

I want to parse the data I get from UrlFetch into a spreadsheet, but all I'm getting is undefined can someone show me what i'm doing wrong
The xml is at the address https://dl.dropbox.com/u/11787731/Minecraft/bans.xml
function runevery15mins() {
var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("MC Bans");
sheet.clearContents();
var banURL = "https://dl.dropbox.com/u/11787731/Minecraft/bans.xml";
var banXML = UrlFetchApp.fetch(banURL).getContentText();
var banDOC = Xml.parse(banXML, false);
var mcuser = banDOC.bans;
var x = 0;
for(var c=0; c>mcuser.length;c++){
var name = mcuser.getElement("username")[c].getText();
var date = mcuser.getElement("date")[c].getText();
var reason = mcuser.getElement("reason")[c].getText();
var duration = mcuser.getElement("duration")[c].getText();
}
sheet.appendRow([name, date, reason, duration]);
}
You have some small errors in your code.
For example, the second argument in the for loop needs to be c<mcuser.length.
Using the Xml service documentation, this worked for me
function runevery15mins() {
var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("MC Bans");
sheet.clearContents();
var banURL = "https://dl.dropbox.com/u/11787731/Minecraft/bans.xml";
var banXML = UrlFetchApp.fetch(banURL).getContentText();
var banDOC = Xml.parse(banXML, false);
// Get all the child nodes from the document element that are 'user' nodes
var mcusers = banDOC.getElement().getElements('user');
for(var c=0; c<mcusers.length;c++){
var user = mcusers[c];
var name = user.getElement('username').getText();
var date = user.getElement('date').getText();
var reason = user.getElement('reason').getText();
var duration = user.getElement('duration').getText();
sheet.appendRow([name, date, reason, duration]);
}
}
Note for example that the sheet.appendRow line is INSIDE the loop, not outside as you had it before. I also deleted the X variable, since I didn't see any purpose for it.
I also created a user variable, which is an XmlElement, to make it easier to understand how to get the different contents of each node.
You were almost there.
Looks like there was another array you needed to drill down into. Also, your call back to the spreadsheet should be in the loop. Try this:
...
var mcuser = banDOC.bans.user;
for(var i in mcuser){
var name = mcuser[i].getElement("username").getText();
var date = mcuser[i].getElement("date").getText();
var reason = mcuser[i].getElement("reason").getText();
var duration = mcuser[i].getElement("duration").getText();
sheet.appendRow([name, date, reason, duration])
}