May be this question already asked, but that won't solve my problem.
I try to save data's into google spreadsheet using google app script. But it shows Exceeded memory limit error.
following my code:
//new
function getNewTitle() {
var url = "https://www.reddit.com/r/DigitalMarketing.rss?limit=100&after=0";
var fromText = '</updated><title>';
var toText = '</title>';
var content = UrlFetchApp.fetch(url).getContentText();
var scraped = Parser.data(content).from(fromText).to(toText).iterate();
return scraped;
}
function getNewContent() {
var url = "https://www.reddit.com/r/DigitalMarketing.rss?limit=10&after=0";
var content = UrlFetchApp.fetch(url).getContentText();
var document = XmlService.parse(content);
var root = document.getRootElement();
var atom = XmlService.getNamespace('http://www.w3.org/2005/Atom');
Logger.log(atom);
var fromText = '<content type="html"><!-- SC_OFF --><div class="md"><p>';
var toText = '</div>';
var scraped = Parser.data(content).from(fromText).to(toText).iterate();
return scraped;
}
function getNewLink() {
var url = "https://www.reddit.com/r/DigitalMarketing.rss?limit=10&after=0";
var fromText = '<link href="';
var toText = '" /><updated>';
var content = UrlFetchApp.fetch(url).getContentText();
var scraped = Parser.data(content).from(fromText).to(toText).iterate();
return scraped;
}
function SAVE_DATA() {
var sheet = SpreadsheetApp.openById('1No3m_FnhyxIaxj2zSlbHrg8HLBJULGQ2bda65hpKlyY').getSheetByName('sample');
var content = getNewContent();
var title = getNewTitle();
var link = getNewLink();
Logger.log(title[1]);
for(var i =0; i < title.length; i++) {
sheet.appendRow([ 'Reddit','wordpress', title[i], link[i], content[i]]);
}
}
//new
In my above code am tried to save the data from url.
But i get Exceeded memory limit error.
In my Log i got this message
[18-07-21 05:33:29:719 PDT] [Namespace: prefix "" is mapped to URI "http://www.w3.org/2005/Atom"]
Please help me to fix this error...!
Thanks in advance.
I think that the reason of the error is that </div> of var toText = '</div>'; is not included in content retrieved from https://www.reddit.com/r/DigitalMarketing.rss?limit=10&after=0. So how about this modification?
Modification points :
</div> of var toText = '</div>'; is not included in content. So in this modification, I used </content>. Because you are using '<content type="html"><!-- SC_OFF --><div class="md"><p>' for fromText.
setValues() instead of appendRow() is used for putting the values.
You can see the difference of the cost between setValues() and appendRow() at here.
Modified script :
1. For getNewContent()
Please modify from
From :
var toText = '</div>';
To :
var toText = '</content>';
2. For SAVE_DATA()
Please modify as follows.
function SAVE_DATA() {
var sheet = SpreadsheetApp.openById('1No3m_FnhyxIaxj2zSlbHrg8HLBJULGQ2bda65hpKlyY').getSheetByName('sample');
var content = getNewContent();
var title = getNewTitle();
var link = getNewLink();
var values = title.map(function(e, i){return [e, link[i], content[i]]});
sheet.getRange(sheet.getLastRow() + 1, 1, values.length, values[0].length).setValues(values);
}
Note :
In this modification, I used var toText = '</content>'; for getNewContent(). If you want to retrieve other range of the site, please modify this.
About the URL, limit=100 for the title is set. But limit=10 is set for the link and content. So when the values are retrieved and put them to Spreadsheet, link and content become undefined from 11 row.
If you have already known this, please ignore this.
Reference :
Easy data scraping with Google Apps Script in 5 minutes
Parser is a GAS library. You can check at here.
If I misunderstand your question, I'm sorry.
Related
I need to parse this xml by Google Script. jsonformatter.org tells me that the XML is valid
I want to get text of ICO but //var ico = root.getChild('Ares_odpovedi').getChild('Odpoved').getChild('VBAS').getChild('ICO').getText(); is throwing an error
The full code is
function getARES() {
var url = 'https://wwwinfo.mfcr.cz/cgi-bin/ares/darv_bas.cgi?'
+ 'ico=06018025'
+ '&xml=1';
var response = UrlFetchApp.fetch(url);
var responseText = response.getContentText(); //.replace(/D:/g,'');
var document = XmlService.parse(responseText);
var root = document.getRootElement();
var ico_tmp0 = root.getName(); // value is "Ares_odpovedi"
var ico_tmp1 = root.getContentSize(); // value is 3
var ico_tmp2 = root.getChild('Ares_odpovedi'); // value is null
var ico_tmp3 = root.getChild('Odpoved'); // value is null
//var ico = root.getChild('Ares_odpovedi').getChild('Odpoved').getChild('VBAS').getChild('ICO').getText();
//var ico = root.getChild('Odpoved').getChild('VBAS').getChild('ICO').getText();
Logger.log(response);
Logger.log(" ");
Logger.log(responseText);
}
I believe your goal as follows.
You want to retrieve the text of ICO using Google Apps Script.
In this case, it is required to use the name space when getChild is used. When this is reflected to your script, it becomes as follows.
Modified script:
function getARES() {
var url = 'https://wwwinfo.mfcr.cz/cgi-bin/ares/darv_bas.cgi?'
+ 'ico=06018025'
+ '&xml=1';
var response = UrlFetchApp.fetch(url);
var responseText = response.getContentText(); //.replace(/D:/g,'');
var document = XmlService.parse(responseText);
var root = document.getRootElement();
// I modified below script.
var ns1 = XmlService.getNamespace("/ares/xml_doc/schemas/ares/ares_answer_basic/v_1.0.3");
var ns2 = XmlService.getNamespace("/ares/xml_doc/schemas/ares/ares_datatypes/v_1.0.3");
var res = root.getChild("Odpoved", ns1).getChild("VBAS", ns2).getChild("ICO", ns2).getText();
Logger.log(res)
}
When above script is run, 06018025 is retrieved.
When http://wwwinfo.mfcr.cz/cgi-bin/ares/darv_bas.cgi?ico=27074358&xml=1 is used as the URL of UrlFetchApp.fetch, 27074358 is obtained.
References:
XML Service
Added:
From your replying of Any idea why var res2 = root.getChild("Odpoved", ns1).getChild("VBAS", ns2).getChild("DIC", ns2).getText(); does not work?, now I noticed that your question had been changed.
In your question, you wanted to retrieve the value of ICO. But in the case for retrieving the value of DIC, it is required to check the structure of XML. Because in your script in your question, the XML from var url = 'https://wwwinfo.mfcr.cz/cgi-bin/ares/darv_bas.cgi?' + 'ico=06018025' + '&xml=1'; doesn't include the value of DIC. I think that this is the reason of your issue.
When you want to retrieve the value of DIC from http://wwwinfo.mfcr.cz/cgi-bin/ares/darv_bas.cgi?ico=27074358&xml=1, please use the following script.
Modified script:
function getARES() {
var url = 'http://wwwinfo.mfcr.cz/cgi-bin/ares/darv_bas.cgi?ico=27074358&xml=1'; // <--- Modified
var response = UrlFetchApp.fetch(url);
var responseText = response.getContentText(); //.replace(/D:/g,'');
var document = XmlService.parse(responseText);
var root = document.getRootElement();
var ns1 = XmlService.getNamespace("/ares/xml_doc/schemas/ares/ares_answer_basic/v_1.0.3");
var ns2 = XmlService.getNamespace("/ares/xml_doc/schemas/ares/ares_datatypes/v_1.0.3");
var res = root.getChild("Odpoved", ns1).getChild("VBAS", ns2).getChild("DIC", ns2).getText(); // <--- Modified
Logger.log(res) // In this case, CZ27074358 is retrieved.
}
Note:
About the name space, these threads might be useful.
What are XML namespaces for?
How does XPath deal with XML namespaces?
The script is triggered when a Google form is submitted and then auto-fills a Google doc.
It worked perfectly before I added var servicesPTY = e.values[117]; and replaced all the placeholders perfectly. But as soon as I add it then the executions indicator show completed but no documents are produced anymore. The document has placeholders that look like this: {{servicesPTY}} {{regNumberPTY}} {{tradingNamePTY}}
And the code looks like this:
function myFormSubmitPTY(e) {
var regNumberPTY = e.values[112];
var taxNumberPTY = e.values[111];
var tradingNamePTY = e.values[113];
var servicesPTY = e.values[117];
var file = DriveApp.getFileById("16OwyBIZAD2pwkuUXZnYSj-9WB6ObGGRXiEjDLa1tcjw");
var folder = DriveApp.getFolderById("1kogpJdxHLwuEhbVyh2oiIgTPH0SNac2m");
var copy = file.makeCopy(tradingNamePTY, folder);
var doc = DocumentApp.openById(copy.getId());
var body = doc.getBody();
if (type == "PTY (LTD)") {
body.replaceText("{{servicesPTY}}",servicesPTY);
body.replaceText("{{regNumberPTY}}", regNumberPTY);
body.replaceText("{{tradingNamePTY}}", tradingNamePTY);
doc.saveAndClose();
}
}
This works for me:
function testmyFormSubmit() {
var e={values:["one","two","three","four"]};
myFormSubmitPTY(e);
}
var type="PTY (LTD)";//global
function myFormSubmitPTY(e) {
var regNumberPTY = e.values[0];
var taxNumberPTY = e.values[1];
var tradingNamePTY = e.values[2];
var servicesPTY = e.values[3];
var file = DriveApp.getFileById("fileid");
var folder = DriveApp.getFolderById("folderid");
var copy = file.makeCopy(tradingNamePTY, folder);
var doc = DocumentApp.openById(copy.getId());
var body = doc.getBody();
if (type=="PTY (LTD)") {
body.replaceText("{{servicesPTY}}",servicesPTY);
body.replaceText("{{regNumberPTY}}", regNumberPTY);
body.replaceText("{{tradingNamePTY}}", tradingNamePTY);
doc.saveAndClose();
}
}
file name: three
pattern order:
{{servicesPTY}}
{{regNumberPTY}}
{{tradingNamePTY}}
output order:
four
one
three
I must have deselected the trigger for PTY. When I looked at the stackdriver logs I noticed PTY had none. Must have happened in the early hours of the morning. Thanks though. You guys put me on the right track
I am trying to scrape a table of price data from this website using the following code;
function scrapeData() {
// Retrieve table as a string using Parser.
var url = "https://stooq.com/q/d/?s=barc.uk&i=d";
var fromText = '<td align="center" id="t03">';
var toText = '</td>';
var content = UrlFetchApp.fetch(url).getContentText();
var scraped = Parser.data(content).from(fromText).to(toText).build();
//Parse table using XmlService.
var root = XmlService.parse(scraped).getRootElement();
}
I have taken this method from an approach I used in a similar question here however its failing on this particular url and giving me the error;
Error on line 1: Content is not allowed in prolog. (line 12, file "Stooq")
In related questions here and here they talk of textual content that is not accepted being submitted to the parser however, I am unable to apply the solutions in these questions to my own problem. Any help would be much appreciated.
How about this modification?
Modification points:
In this case, it is required to modify the retrieved HTML values. For example, when var content = UrlFetchApp.fetch(url).getContentText() is run, each attribute value is not enclosed. These are required to be modified.
There is a merged column in the header.
When above points are reflected to the script, it becomes as follows.
Modified script:
function scrapeData() {
// Retrieve table as a string using Parser.
var url = "https://stooq.com/q/d/?s=barc.uk&i=d";
var fromText = '#d9d9d9}</style>';
var toText = '<table';
var content = UrlFetchApp.fetch(url).getContentText();
var scraped = Parser.data(content).from(fromText).to(toText).build();
// Modify values
scraped = scraped.replace(/=([a-zA-Z0-9\%-:]+)/g, "=\"$1\"").replace(/nowrap/g, "");
// Parse table using XmlService.
var root = XmlService.parse(scraped).getRootElement();
// Retrieve header and modify it.
var headerTr = root.getChild("thead").getChildren();
var res = headerTr.map(function(e) {return e.getChildren().map(function(f) {return f.getValue()})});
res[0].splice(7, 0, "Change");
// Retrieve values.
var valuesTr = root.getChild("tbody").getChildren();
var values = valuesTr.map(function(e) {return e.getChildren().map(function(f) {return f.getValue()})});
Array.prototype.push.apply(res, values);
// Put the result to the active spreadsheet.
var ss = SpreadsheetApp.getActiveSheet();
ss.getRange(1, 1, res.length, res[0].length).setValues(res);
}
Note:
Before you run this modified script, please install the GAS library of Parser.
This modified script is not corresponding to various URL. This can be used for the URL in your question. If you want to retrieve values from other URL, please modify the script.
Reference:
Parser
XmlService
If this was not what you want, I'm sorry.
In a Spreadsheet cell I have a link to a Google doc and I want to recuperate it in order to open the Google Doc and modify it. So in my cell I've put this format https://docs.google.com/document/d/1Gb48I...... (without the /edit in the end) and I've tried
var body = '=HYPERLINK("'data[n][COLUMN_URL-1]'+'/edit'")'.getBody();
var body = '=HYPERLINK("'data[n][COLUMN_URL-1]'+'/edit'")'.getBody();
// this one works but I want to use data[n][COLUMN_URL-1] because I have several Google Docs links
var body = DocumentApp.openByUrl('https://docs.google.com/document/d/1Gb48IXos......../edit').getBody();
if(body){
//edit the Google doc
If you have ideas what to do thank you because with the concatenation of the /edit(3rd line of code) it works
Edit: The cell is not formatted(hyperlinked)/formula i have only the https://...
Edit 2: I've tried with a code from Suhail Ansari thank you but I have an error that the document is missing and I don't have nothing for the logs if you have others ideas:
var COLUMN_URL = 10 ;
function getIdFrom(url) {
var id = "";
var parts = url.split(/^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?/);
if (url.indexOf('?id=') >= 0){
id = (parts[6].split("=")[1]).replace("&usp","");
return id;
} else {
id = parts[5].split("/");
//Using sort to get the id as it is the longest element.
var sortArr = id.sort(function(a,b){return b.length - a.length});
id = sortArr[0];
return id;
}
}
var sheet = SpreadsheetApp.openById(SPREADSHEET_ID).getSheetByName(SHEET_NAME);
var numRows = sheet.getLastRow();
var lastColumn = sheet.getLastColumn();
var data = sheet.getRange(1,1,numRows,lastColumn).getDisplayValues();
for(n=1;n < data.length;n++) {
var URL =data[n][COLUMN_URL-1];
Logger.log('The URL ',URL);
var id = getIdFrom(URL);
Logger.log('The ID ',id);
var body = DocumentApp.openById(id).getBody();
if(body)
{......//edit Google Doc
This is the 10 column with the URLs from the Google Docs
In fact I have docs.google.com/open?id=1qo0B_HCbjcBrYyJ ... as URL for the others rows I tried to do by hand for 3rd one as you can see in the picture
Edit 3: So I am very confused in this moment because the column I have in the spreadsheet commes from a loop where I put the URL of the Google Docs in the cell like this : var trange = sheet.getRange.. trange.setValue(doc.getUrl()); and now if i look at the table i have the links in this format
https://docs.google.com/open?id=1RYRVotAq6IOz5tys1krnENfgN_pU0KYuUzR24i
so now if i want to get back the Google Doc I have the following :
// var body = DocumentApp.openByUrl('https://docs.google.com/open?id=1Xb4QjiWwpn2TJ8-9kkIhU6m1rgmb48g6xYVopN').getBody();
//var body = DocumentApp.openByUrl('https://docs.google.com/1Xb4QjiWwpn2TJ8-9kkIhU6m1rgmb48g6xYVopN/edit').getBody();
var body = DocumentApp.openByUrl('https://docs.google.com/document/d/1Xb4QjiWwpn2TJ8-9kkIhU6m1rgmb48g6xYVopN/edit').getBody();
only the 3rd one works.Where I am doing wrong? Because I want to
for(n=1;n < data.length;n++) {... make a loop for and var URL =
data[n][COLUMN_URL-1];
You could add some code for getting the Doc ID from the URL and then open the doc by using
DocumentApp.openById(id);
Here is a link to do that.
A sample code below with help from the above linked answer:
function myFunction() {
var URL = SpreadsheetApp.getActiveSheet().getRange('A1').getValue();
var id = getIdFrom(URL);
var doc = DocumentApp.openById(id);
var body = doc.getBody();
Logger.log(body.getText());
}
function getIdFrom(url) {
var id = "";
var parts = url.split(/^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?/);
if (url.indexOf('?id=') >= 0){
id = (parts[6].split("=")[1]).replace("&usp","");
return id;
} else {
id = parts[5].split("/");
//Using sort to get the id as it is the longest element.
var sortArr = id.sort(function(a,b){return b.length - a.length});
id = sortArr[0];
return id;
}
}
Demo Google Sheet
I have this function running in a Google Sheets script that pulls HTML from subreddits and returns them to a spreadsheet. It works for me some/most of the time, but other times I get an error "Could not parse text. (line 13)" which is the line with var doc = Xml.parse(page, true);. Any idea why this is happening or is this just a bug with Google Scripts? Here's the code that works...sometimes.
function getRedditHTML() {
var entries_array = [];
var subreddit_array = ['https://www.reddit.com/r/news/','https://www.reddit.com/r/funny/','https://www.reddit.com/r/science/'];
for (var s = 0; s < subreddit_array.length; s++) {
var page = UrlFetchApp.fetch(subreddit_array[s]);
//this is Line 13 that is breaking
var doc = Xml.parse(page, true);
var bodyHtml = doc.html.body.toXmlString();
doc = XmlService.parse(bodyHtml);
var root = doc.getRootElement();
var entries = getElementsByClassName(root,'thing');
for (var i = 0; i < entries.length; i++) {
var title = getElementsByClassName(entries[i],'title');
title = XmlService.getRawFormat().format(title[1]).replace(/<[^>]*>/g, "");
var link = getElementsByClassName(entries[i],'comments');
link = link[0].getAttribute('href').getValue();
var rank = getElementsByClassName(entries[i],'rank');
rank = rank[0].getValue();
var likes = getElementsByClassName(entries[i],'likes');
likes = likes[0].getValue();
entries_array.push([rank, likes, title, link]);
}
}
return entries_array.sort(function (a, b) {
return b[1] - a[1];
});
}
Here is what I found upon playing with importXML (my usual way of doing this) - for some reason I cannot narrow down - it DOES appear to randomly stall out and return null for a few minutes - so I'm guessing the issue with your thing is not the code but that the site or google temporarily blocks/won't return the data -
however I found the JSON endpoint to the piece you want - and I noticed that when XML went down - the JSON didnt.
You can take that and fix it to push your own array of topics/urls - I just left it for one link for now to show you how the URL breaks down and where it should be modified:
The URL is 'https://www.reddit.com/r/news/hot.json?raw_json=1&subredditName=news&sort=top&t=day&feature=link_preview&sr_detail=true&app=mweb-client
News is mentioned in 2 places so just modify all your URLs to follow that method - you can easily load that javascript in a browser to see all the fields available
Also the portion hot.json is where you can change whether you want the ranked list (called hot), or new,top,promoted, etc. you just change that keyword.
Score is the same as the upvotes/likes
function getSubReddit() {
var ss = SpreadsheetApp.getActiveSpreadsheet();
var sheet = ss.getActiveSheet(); //get Active sheet
var subject = 'news';
var url = 'https://www.reddit.com/r/' + subject + '/hot.json?raw_json=1&subredditName=' + subject + '&sort=top&t=day&feature=link_preview&sr_detail=true&app=mweb-client'; //json endpoint for data
var response = UrlFetchApp.fetch(url); // get api endpoint
var json = response.getContentText(); // get the response content as text
var redditData = JSON.parse(json); //parse text into json
Logger.log(redditData); //log data to logger to check
//create empty array to hold data points
var statsRows = [];
var date = new Date(); //create new date for timestamp
//The following lines push the parsed json into empty stats array
for (var j=0;j<25;j++){
for (var i =0;i<25;i++){
var stats=[];
stats.push(date);//timestamp
stats.push(i+1);
stats.push(redditData.data.children[i].data.score); //score
stats.push(redditData.data.children[i].data.title); //title
stats.push(redditData.data.children[i].data.url); //article url
// stats.push('http://www.reddit.com' + redditData.data.children[i].data.permalink); //reddit permalink
statsRows.push(stats)
}
//append the stats array to the active sheet
sheet.appendRow(statsRows[j])
}
}