How to rescale range of numbers shifting the centre in spark/scala? - function

Which function in Spark can transform/rescale values from an arbitrary range (e.g. -infinity to +infinity, or -2 to 130) onto a range whose maximum I define?
In the example below, I want to ensure that 55 maps to 100, and that 100+ maps to 0.
before | after
45-55 | 90-100
35-44 | 80-89
...
100+ or < 0| 0-5
is any of the ML features functions useful?

I was able to solve it, thanks #user6910411 for your help.
You can use a dense or sparse vector depending on the data, replace MinMaxScaler with MaxAbsScaler, and extract the values using linalg.Vectors or DenseVector.
The idea is to split the data at the required centre point, negate the scores of one half so that it is scaled in reverse, then scale both halves and merge the DataFrames.
// Rescale `score` so that the centre value 55 maps to 100 and values away from it map towards 0.
// Approach: split the rows at the centre, negate the upper half so that min-max scaling runs in
// reverse for it, scale each half to [0, 100] independently, then merge the two halves.
// NOTE(review): assumes `df` has `id` and `score` columns and that spark.implicits._ is in scope
// (as in spark-shell) for the 'symbol column syntax — confirm.
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.feature.MaxAbsScaler
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.DenseVector
// col, lit and round are used below and must be imported alongside udf.
import org.apache.spark.sql.functions.{col, lit, round, udf}

// Pulls a single element out of a vector column so it can be stored as a plain numeric column.
val vectorToColumn = udf { (x: DenseVector, index: Int) => x(index) }

// The names say "50" but the split point is 55. The upper half is negated so that, after
// min-max scaling, the score closest to 55 becomes the maximum of its half.
val gt50 = df.filter("score >= 55").select('id, ('score * -1).as("score"))
val lt50 = df.filter("score < 55")

// MinMaxScaler works on vector columns, so wrap `score` in a one-element vector.
val assembler = new VectorAssembler()
  .setInputCols(Array("score"))
  .setOutputCol("features")
val ass_lt50 = assembler.transform(lt50)
val ass_gt50 = assembler.transform(gt50)

val scaler = new MinMaxScaler()
  .setInputCol("features")
  .setOutputCol("featuresScaled")
  .setMax(100)
  .setMin(0)

// Each half is fitted separately so that each one spans the full 0-100 range on its own.
val feat_lt50 = scaler.fit(ass_lt50).transform(ass_lt50).drop('score)
val feat_gt50 = scaler.fit(ass_gt50).transform(ass_gt50).drop('score)

val scaled_lt50 = feat_lt50.select('id, round(
  vectorToColumn(col("featuresScaled"), lit(0))).as("scaled_score"))
val scaled_gt50 = feat_gt50.select('id, round(
  vectorToColumn(col("featuresScaled"), lit(0))).as("scaled_score"))

// `union` replaces the deprecated `unionAll`; both append rows by column position.
val scaled = scaled_lt50.union(scaled_gt50)

Related

Why is RandomCrop with size 84 and padding 8 returning an image size of 84 and not 100 in pytorch?

I was using the mini-imagenet data set and noticed this line of code:
elif data_augmentation == 'lee2019:
normalize = Normalize(
mean=[120.39586422 / 255.0, 115.59361427 / 255.0, 104.54012653 / 255.0],
std=[70.68188272 / 255.0, 68.27635443 / 255.0, 72.54505529 / 255.0],
)
train_data_transforms = Compose([
ToPILImage(),
RandomCrop(84, padding=8),
ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
RandomHorizontalFlip(),
ToTensor(),
normalize,
])
test_data_transforms = Compose([
normalize,
])
but when I checked the image size it was 84 instead of 100 (after adding padding):
X.size()=torch.Size([50, 3, 84, 84])
what is going on with this? Shouldn't it be 100?
reproduction:
import random
from typing import Callable
import learn2learn as l2l
import numpy as np
import torch
from learn2learn.data import TaskDataset, MetaDataset, DataDescription
from learn2learn.data.transforms import TaskTransform
from torch.utils.data import Dataset
class IndexableDataSet(Dataset):
    """Expose a sequence of data sets through the torch ``Dataset`` interface.

    Indexing returns the whole data set stored at that position, not a single example.
    """

    def __init__(self, datasets):
        """
        :param datasets: container of data sets; it must support ``len()`` and integer indexing.
        """
        self.datasets = datasets

    def __len__(self) -> int:
        """Return how many data sets are held."""
        return len(self.datasets)

    def __getitem__(self, idx: int):
        """Return the data set stored at position ``idx``."""
        return self.datasets[idx]
class SingleDatasetPerTaskTransform(Callable):
    """
    Transform that samples a data set first, then creates a task (e.g. n-way, k-shot) and finally
    applies the remaining task transforms.
    """

    def __init__(self, indexable_dataset: IndexableDataSet, cons_remaining_task_transforms: Callable):
        """
        :param indexable_dataset: collection of data sets to sample from; it is wrapped in a MetaDataset.
        :param cons_remaining_task_transforms: constructor that builds the remaining task transforms. Cannot be a list
        of transforms because we don't know a priori which data set we will use. So this function should be of
        type MetaDataset -> list[TaskTransforms], i.e. given the data set it returns the transforms for it.
        """
        self.indexable_dataset = MetaDataset(indexable_dataset)
        self.cons_remaining_task_transforms = cons_remaining_task_transforms

    def __call__(self, task_description: list):
        """
        Sample one data set uniformly at random and build a task description from it.

        :param task_description: ignored; it is replaced by a fresh description holding the index of the
        sampled data set.
        :return: the description produced by the last of the remaining task transforms (None if there are none).
        """
        # - ideally this sampling would live in its own callable transform, but the l2l transforms are
        #   bound to a data set when they are constructed, not dynamically at call time.
        i = random.randint(0, len(self.indexable_dataset) - 1)
        task_description = [DataDescription(index=i)]  # using this to follow the l2l convention

        # - get the sampled data set
        dataset_index = task_description[0].index
        dataset = self.indexable_dataset[dataset_index]
        dataset = MetaDataset(dataset)

        # - use the sampled data set to create the task; the chain starts from an empty description
        remaining_task_transforms: list[TaskTransform] = self.cons_remaining_task_transforms(dataset)
        description = None
        for transform in remaining_task_transforms:
            description = transform(description)
        return description
def sample_dataset(dataset):
    """
    Build a transform that picks a random index into ``dataset``.

    :param dataset: any sized container; only its length is used.
    :return: a callable that prints its argument and returns a one-element list holding a
    DataDescription whose index was drawn uniformly from ``range(len(dataset))``.
    """

    def sample_random_dataset(x):
        print(f'{x=}')
        i = random.randint(0, len(dataset) - 1)
        return [DataDescription(index=i)]

    return sample_random_dataset
def get_task_transforms(dataset: IndexableDataSet) -> list[TaskTransform]:
    """
    Build the task-transform chain: a random data set sampler followed by the l2l transforms.

    :param dataset: data set handed to the sampler and to every l2l transform.
    :return: the transforms in the order they are meant to be applied.
    """
    transforms = [
        sample_dataset(dataset),
        l2l.data.transforms.NWays(dataset, n=5),
        l2l.data.transforms.KShots(dataset, k=5),
        l2l.data.transforms.LoadData(dataset),
        l2l.data.transforms.RemapLabels(dataset),
        l2l.data.transforms.ConsecutiveLabels(dataset),
    ]
    return transforms
def print_datasets(dataset_lst: list):
    """Print every data set in ``dataset_lst``, each surrounded by blank lines."""
    for dataset in dataset_lst:
        print(f'\n{dataset=}\n')
def get_indexable_list_of_datasets_mi_and_cifarfs(root: str = '~/data/l2l_data/') -> IndexableDataSet:
    """
    Load mini-imagenet and cifar-fs through the l2l benchmarks and bundle them together.

    :param root: directory passed to both l2l taskset loaders.
    :return: an IndexableDataSet holding [mini-imagenet, cifar-fs], each wrapped in a MetaDataset.
    """
    from learn2learn.vision.benchmarks import mini_imagenet_tasksets
    datasets, transforms = mini_imagenet_tasksets(root=root)
    # Only the data set behind the first taskset is kept; the returned transforms are not used.
    mi = datasets[0].dataset

    from learn2learn.vision.benchmarks import cifarfs_tasksets
    datasets, transforms = cifarfs_tasksets(root=root)
    cifarfs = datasets[0].dataset

    dataset_list = [mi, cifarfs]
    dataset_list = [l2l.data.MetaDataset(dataset) for dataset in dataset_list]
    dataset = IndexableDataSet(dataset_list)
    return dataset
# -- tests
def loop_through_l2l_indexable_datasets_test():
    """
    Sample a few tasks from the mini-imagenet + cifar-fs indexable data set and print their shapes.

    :return: None; the results are only printed.
    """
    # - for determinism
    random.seed(0)
    torch.manual_seed(0)
    np.random.seed(0)

    # - options for number of tasks/meta-batch size
    batch_size: int = 10

    # - create indexable data set
    indexable_dataset: IndexableDataSet = get_indexable_list_of_datasets_mi_and_cifarfs()

    # - get task transforms; built per sampled data set because the l2l transforms are bound to one
    def get_remaining_transforms(dataset: MetaDataset) -> list[TaskTransform]:
        remaining_task_transforms = [
            l2l.data.transforms.NWays(dataset, n=5),
            l2l.data.transforms.KShots(dataset, k=5),
            l2l.data.transforms.LoadData(dataset),
            l2l.data.transforms.RemapLabels(dataset),
            l2l.data.transforms.ConsecutiveLabels(dataset),
        ]
        return remaining_task_transforms

    task_transforms: TaskTransform = SingleDatasetPerTaskTransform(indexable_dataset, get_remaining_transforms)

    # - the single transform object is handed over as-is (not wrapped in a list)
    taskset: TaskDataset = TaskDataset(dataset=indexable_dataset, task_transforms=task_transforms)

    # - loop through tasks
    for task_num in range(batch_size):
        print(f'{task_num=}')
        X, y = taskset.sample()
        print(f'{X.size()=}')
        print(f'{y.size()=}')
        print(f'{y=}')
        print()

    print('-- end of test --')
# -- Run experiment
if __name__ == "__main__":
    import time
    from uutils import report_times

    start = time.time()
    # - run experiment
    loop_through_l2l_indexable_datasets_test()
    # - Done (the trailing \a rings the terminal bell)
    print(f"\nSuccess Done!: {report_times(start)}\a")
context: https://github.com/learnables/learn2learn/issues/333
crossposted:
https://discuss.pytorch.org/t/why-is-randomcrop-with-size-84-and-padding-8-returning-an-image-size-of-84-and-not-100-in-pytorch/151463
https://www.reddit.com/r/pytorch/comments/uno1ih/why_is_randomcrop_with_size_84_and_padding_8/
The padding is applied to the input image or tensor before applying the random crop. Ultimately, the output image has a spatial size equal to that of the provided size(s) given to the T.RandomCrop function since the operation is performed after.
After all, it makes more sense to pad the input image rather than the cropped image, doesn't it?

Create nested JSON of all rows having same Id: DataFrame

I have a DataFrame df4 with three columns:
id annotating entity
data having JSON Array data
executor_id as string value
Code to create same is as follow:
// Build the sample input: one source row, serialised to JSON and collected per id,
// once tagged with executor "e1" and once with "e2".
val df1 = Seq((1, "n1", "d1")).toDF("id", "number", "data")
// df2 and df3 are identical except for the executor_id literal.
val df2 = df1.withColumn("data", to_json(struct($"number", $"data"))).groupBy("id").agg(collect_list($"data").alias("data")).withColumn("executor_id", lit("e1"))
val df3 = df1.withColumn("data", to_json(struct($"number", $"data"))).groupBy("id").agg(collect_list($"data").alias("data")).withColumn("executor_id", lit("e2"))
// df4 holds one row per (id, executor_id) pair.
val df4 = df2.union(df3)
Content of DF4 is like
scala> df4.show(false)
+---+-----------------------------+-----------+
|id |data |executor_id|
+---+-----------------------------+-----------+
|1 |[{"number":"n1","data":"d1"}]|e1 |
|1 |[{"number":"n1","data":"d1"}]|e2 |
+---+-----------------------------+-----------+
I have to create new json data with executor_id as key and data as json data, group by id. Resultant dataFrame like
+---+------------------------------------------------------------------------+
|id |new_data |
+---+------------------------------------------------------------------------+
|1 |{"e1":[{"number":"n1","data":"d1"}], "e2":[{"number":"n1","data":"d1"}]}|
+---+------------------------------------------------------------------------+
Versions:
Spark: 2.2
Scala: 2.11
I have been struggling with this problem for the past three days and was finally able to work around it using a UserDefinedAggregateFunction. Here is the sample code:
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
/**
 * Aggregates (key, array-of-JSON-strings) pairs into one JSON object string of the form
 * {"key1":[...],"key2":[...]}.
 *
 * The array elements are assumed to already be valid JSON fragments; they are joined verbatim.
 * Keys are wrapped in double quotes without escaping. If the same key occurs more than once
 * within a group, the value seen last replaces the earlier one.
 */
class CustomAggregator extends UserDefinedAggregateFunction {

  // Input columns: the key and the array of JSON strings belonging to it.
  override def inputSchema: StructType =
    StructType(Array(StructField("key", StringType), StructField("value", ArrayType(StringType))))

  // Intermediate state: key -> array of JSON strings.
  override def bufferSchema: StructType = StructType(
    Array(StructField("mapData", MapType(StringType, ArrayType(StringType))))
  )

  // The aggregate produces a single JSON string.
  override def dataType: DataType = StringType

  override def deterministic: Boolean = true

  // Start from an empty map whose value type matches bufferSchema (array of strings, not String).
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = Map.empty[String, Seq[String]]
  }

  // Add the incoming (key, values) pair; the value column is an array, so it is read as a Seq.
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer(0) = buffer.getMap[String, Seq[String]](0) + (input.getString(0) -> input.getSeq[String](1))
  }

  // Combine two partial maps; on a key clash the entry from buffer2 wins.
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getMap[String, Seq[String]](0) ++ buffer2.getMap[String, Seq[String]](0)
  }

  // Render the map as a JSON object without building intermediate buffers.
  override def evaluate(buffer: Row): Any = {
    buffer.getMap[String, Seq[String]](0)
      .map { case (key, values) => "\"" + key + "\":[" + values.mkString(",") + "]" }
      .mkString("{", ",", "}")
  }
}
Now use customAggregator,
// One aggregator instance can be reused; it is applied per id to the (executor_id, data) pairs.
val ca = new CustomAggregator
val df5 = df4.groupBy("id").agg(ca(col("executor_id"), col("data")).as("jsonData"))
Resultant DF is
scala> df5.show(false)
+---+-----------------------------------------------------------------------+
|id |jsonData |
+---+-----------------------------------------------------------------------+
|1 |{"e1":[{"number":"n1","data":"d1"}],"e2":[{"number":"n1","data":"d1"}]}|
+---+-----------------------------------------------------------------------+
Even though I have solved this problem, I am not sure whether this is the right way to do it. The reasons for my doubts are:
In places, I have used Any. I don't feel that this is correct.
For each evaluation, I am creating a ListBuffer and many other objects. I am not sure about the performance of the code.
I still have to test the code with many data types (double, date, nested JSON, etc.) as data.

Dumping numpy arrays into neural network

I am trying to solve the titanic machine learning challenge from kaggle using a neural network. I removed most of the irrelevant data and converted the useful data into a 2D numpy array while the survival is converted into a 1D numpy array. For some reason it throws an error saying dimension 0 in both shape must be equal, I've been trying to solve it for quite a while and I hope that you guys can help out.
TensorFlowNumpy.py
import tensorflow as tf
def numpy2tensor(numpy):
    """Wrap a NumPy array in a constant tensor.

    :param numpy: array-like value to embed in the graph (the name shadows the ``numpy``
        module inside this function; it is kept for callers that pass it by keyword).
    :return: the ``tf.constant`` built from ``numpy``.
    """
    # Creating a constant only adds a node to the graph, so no session is opened here.
    return tf.constant(numpy)
def tensor2numpy(tensor):
    """Evaluate ``tensor`` in a short-lived session and return the result.

    :param tensor: tensor to evaluate.
    :return: whatever ``tensor.eval`` yields for it.
    """
    # The context manager closes the session once the value has been fetched.
    with tf.Session() as sess:
        return tensor.eval(session=sess)
Dataset.py
import pandas
import numpy as np
# Load the training data once; both the features and the labels are taken from this frame.
dataset = pandas.read_csv('train.csv')
# Drop the identifier, the label and the columns that are not used as model inputs.
dataset2 = dataset.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Fare', 'Cabin', 'Embarked'], axis=1)
# Missing values become 0 so that the frame can be converted to a float array.
dataset3 = dataset2.fillna(0)
# `survive` is kept as a module-level name; it refers to the frame already loaded above
# instead of reading the CSV a second time.
survive = dataset
survival = np.float32(survive.Survived)
# NOTE(review): this conversion needs every remaining column to be numeric — confirm for 'Sex'.
dataset4 = np.float32(dataset3)
MainCode.py
import tensorflow as tf
import numpy
from dataset import dataset4,survival
from sklearn.model_selection import train_test_split
from TensorFlowNumpy import numpy2tensor
# Hold out 20% of the rows for testing.
train_x,test_x,train_y,test_y = train_test_split(dataset4,survival,test_size
= 0.2)
# Constant-tensor copies of the splits (numpy2tensor wraps its argument in tf.constant).
tensor_train_x = numpy2tensor(train_x)
tensor_train_y = numpy2tensor(train_y)
tensor_test_x = numpy2tensor(test_x)
tensor_test_y = numpy2tensor(test_y)
# Layer widths of the two hidden layers and the number of output units.
n_nodes_hl1 = 10
n_nodes_hl2 = 10
n_classes = 2
# Placeholders are created without a shape, so nothing is checked until values are fed.
x = tf.placeholder(tf.float32)
y = tf.placeholder(tf.float32)
def neural_network_model(data, n_features=5):
    """Build a two-hidden-layer ReLU network and return its output logits.

    :param data: input tensor with ``n_features`` columns.
    :param n_features: width of the input; defaults to 5, the value that used to be hard-coded.
    :return: tensor of un-normalised scores with ``n_classes`` columns.
    """
    hidden_1_layer = {'weights': tf.Variable(tf.random_normal([n_features, n_nodes_hl1])),
                      'biases': tf.Variable(tf.random_normal([n_nodes_hl1]))}
    hidden_2_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
                      'biases': tf.Variable(tf.random_normal([n_nodes_hl2]))}
    output_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl2, n_classes])),
                    'biases': tf.Variable(tf.random_normal([n_classes]))}

    l1 = tf.add(tf.matmul(data, hidden_1_layer['weights']), hidden_1_layer['biases'])
    l1 = tf.nn.relu(l1)
    l2 = tf.add(tf.matmul(l1, hidden_2_layer['weights']), hidden_2_layer['biases'])
    l2 = tf.nn.relu(l2)
    # No activation on the last layer: the loss applies softmax to these logits itself.
    output = tf.matmul(l2, output_layer['weights']) + output_layer['biases']
    return output
def train_neural_network(x):
    """Train the network with full-batch gradient descent and print the test accuracy.

    :param x: input placeholder that the network is built on and that data is fed through.

    Reads the module-level ``y`` placeholder, ``n_classes`` and the NumPy splits
    ``train_x``/``train_y``/``test_x``/``test_y``.
    """
    prediction = neural_network_model(x)
    # The loss is defined on the placeholder `y`, so the same graph serves training and evaluation.
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
    optimizer1 = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
    hm_epochs = 100

    # The labels are a 1-D vector of 0/1 values, while the loss and the argmax below expect one
    # row per sample with n_classes columns: one-hot encode them by indexing an identity matrix.
    one_hot = numpy.eye(n_classes, dtype=numpy.float32)
    train_labels = one_hot[train_y.astype(int)]
    test_labels = one_hot[test_y.astype(int)]

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(hm_epochs):
            # Feed values must be NumPy arrays; tf.Tensor objects are rejected by feed_dict.
            _, epoch_loss = sess.run([optimizer1, cost],
                                     feed_dict={x: train_x, y: train_labels})
            print('Epoch', epoch+1, 'completed out of', hm_epochs, 'loss:', epoch_loss)
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('Accuracy:', accuracy.eval({x: test_x, y: test_labels}))


# Build the graph on the placeholder so that differently sized train/test batches can be fed.
train_neural_network(x)
I have faced this error several times; the problem is in the code rather than in the library. I didn't look through your code thoroughly, as I am leaving for the day, but I suspect that your dependent (output) variable has shape [1, 712] when it should be [712, 1], so try to fix that somewhere in the code. In other words, you have one row with 712 columns, but it should be 712 rows with 1 column (the output). Please mark this as the answer if it helps. Ping me tomorrow if the problem still exists and I will take a look at it.

Parsing epoch milliseconds from json with Spark 2

Has anyone parsed a millisecond timestamp using from_json in Spark 2+? How's it done?
So Spark changed the TimestampType to parse epoch numerical values as being in seconds instead of millis in v2.
My input is a hive table that has a json formatted string in a column which I'm trying to parse like this:
val spark = SparkSession
.builder
.appName("Problematic Timestamps")
.enableHiveSupport()
.getOrCreate()
import spark.implicits._
val schema = StructType(
StructField("categoryId", LongType) ::
StructField("cleared", BooleanType) ::
StructField("dataVersion", LongType) ::
StructField("details", DataTypes.createArrayType(StringType)) ::
…
StructField("timestamp", TimestampType) ::
StructField("version", StringType) :: Nil
)
val item_parsed =
spark.sql("select * FROM source.jsonStrInOrc")
.select('itemid, 'locale,
from_json('internalitem, schema)
as 'internalitem,
'version, 'createdat, 'modifiedat)
val item_flattened = item_parsed
.select('itemid, 'locale,
$"internalitem.*",
'version as'outer_version, 'createdat, 'modifiedat)
This can parse a row with a column containing:
{"timestamp": 1494790299549, "cleared": false, "version": "V1", "dataVersion": 2, "categoryId": 2641, "details": [], …}
And that gives me timestamp fields like 49338-01-08 00:39:09.0 from a value 1494790299549 which I'd rather read as: 2017-05-14 19:31:39.549
Now I could set the schema for timestamp to be a long, then divide the value by 1000 and cast to a timestamp, but then I'd have 2017-05-14 19:31:39.000 not 2017-05-14 19:31:39.549. I'm having trouble figuring out how I could either:
Tell from_json to parse a millisecond timestamp (maybe by subclassing the TimestampType in some way to use in the schema)
Use a LongType in the schema and cast that to a Timestamp which preserves the milliseconds.
Addendum on UDFs
I found that trying to do the division in the select and then casting didn't look clean to me, though it's a perfectly valid method. I opted for a UDF that used a java.sql.timestamp which is actually specified in epoch milliseconds.
import java.sql.Timestamp
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{explode, from_json, udf}
import org.apache.spark.sql.types.
{BooleanType, DataTypes, IntegerType, LongType,
StringType, StructField, StructType, TimestampType}
// UDF that turns an epoch-milliseconds Long into a java.sql.Timestamp, keeping the millisecond part.
val tsmillis = udf { t: Long => new Timestamp (t) }
val spark = SparkSession
.builder
.appName("Problematic Timestamps")
.enableHiveSupport()
.getOrCreate()
import spark.implicits._
val schema = StructType(
StructField("categoryId", LongType) ::
StructField("cleared", BooleanType) ::
StructField("dataVersion", LongType) ::
StructField("details", DataTypes.createArrayType(StringType)) ::
…
StructField("timestamp", LongType) ::
StructField("version", StringType) :: Nil
)
val item_parsed =
spark.sql("select * FROM source.jsonStrInOrc")
.select('itemid, 'locale,
from_json('internalitem, schema)
as 'internalitem,
'version, 'createdat, 'modifiedat)
val item_flattened = item_parsed
.select('itemid, 'locale,
$"internalitem.categoryId", $"internalitem.cleared",
$"internalitem.dataVersion", $"internalitem.details",
tsmillis($"internalitem.timestamp"),
$"internalitem.version",
'version as'outer_version, 'createdat, 'modifiedat)
See how that's in the select.
I think it would be worthwhile to do a performance test to see if using withcolumn division and casting is faster than the udf.
Now I could set the schema for timestamp to be a long, then divide the value by 1000
Actually this is exactly what you need, just keep the types right. Let's say you have only a Long timestamp field:
val df = spark.range(0, 1).select(lit(1494790299549L).alias("timestamp"))
// df: org.apache.spark.sql.DataFrame = [timestamp: bigint]
If you divide by 1000:
val inSeconds = df.withColumn("timestamp_seconds", $"timestamp" / 1000)
// org.apache.spark.sql.DataFrame = [timestamp: bigint, timestamp_seconds: double]
you'll get timestamp in seconds as double (note that this is SQL, not Scala behavior).
All that is left is the cast (Spark < 3.1)
inSeconds.select($"timestamp_seconds".cast("timestamp")).show(false)
// +-----------------------+
// |timestamp_seconds |
// +-----------------------+
// |2017-05-14 21:31:39.549|
// +-----------------------+
or (Spark >= 3.1) timestamp_seconds (or directly timestamp_millis)
import org.apache.spark.sql.functions.{expr, timestamp_seconds}
inSeconds.select(timestamp_seconds($"timestamp_seconds")).show(false)
// +------------------------------------+
// |timestamp_seconds(timestamp_seconds)|
// +------------------------------------+
// |2017-05-14 21:31:39.549 |
// +------------------------------------+
df.select(expr("timestamp_millis(timestamp)")).show(false)
// +---------------------------+
// |timestamp_millis(timestamp)|
// +---------------------------+
// |2017-05-14 21:31:39.549 |
// +---------------------------+

Can I export a tensorflow summary to CSV?

Is there a way to extract scalar summaries to CSV (preferably from within tensorboard) from tfevents files?
Example code
The following code generates tfevent files in a summary_dir within the same directory. Suppose you let it run and you find something interesting. You want to get the raw data for further investigation. How would you do that?
#!/usr/bin/env python
"""A very simple MNIST classifier."""
import argparse
import sys
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
ce_with_logits = tf.nn.softmax_cross_entropy_with_logits
FLAGS = None
def inference(x):
    """
    Build the inference graph: a single linear layer mapping 784 inputs to 10 logits.

    Parameters
    ----------
    x : placeholder
        Input batch with 784 columns.

    Returns
    -------
    Output tensor with the computed logits.
    """
    # Weights and bias both start at zero.
    W = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, W) + b
    return y
def loss(logits, labels):
    """
    Calculate the loss from the logits and the labels.

    Parameters
    ----------
    logits : Logits tensor, float - [batch_size, NUM_CLASSES].
    labels : Labels tensor, float - [batch_size, NUM_CLASSES], one-hot encoded
        (softmax cross-entropy compares two tensors of the same shape).

    Returns
    -------
    Scalar tensor holding the mean softmax cross-entropy over the batch.
    """
    cross_entropy = tf.reduce_mean(ce_with_logits(labels=labels,
                                                  logits=logits))
    return cross_entropy
def training(loss, learning_rate=0.5):
    """
    Set up the training Ops.

    Parameters
    ----------
    loss : Loss tensor, from loss().
    learning_rate : The learning rate to use for gradient descent.

    Returns
    -------
    train_op: The Op for training.
    """
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_step = optimizer.minimize(loss)
    return train_step
def main(_):
    """Train the MNIST classifier and log train/test accuracy summaries every 100 steps.

    The positional argument supplied by ``tf.app.run`` is not used; the data directory is
    read from the module-level ``FLAGS``.
    """
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # Create the model
    x = tf.placeholder(tf.float32, [None, 784])
    y = inference(x)

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 10])
    loss_ = loss(logits=y, labels=y_)
    train_step = training(loss_)

    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    with tf.name_scope('accuracy'):
        tf.summary.scalar('accuracy', accuracy)
    merged = tf.summary.merge_all()

    sess = tf.InteractiveSession()
    train_writer = tf.summary.FileWriter('summary_dir/train', sess.graph)
    test_writer = tf.summary.FileWriter('summary_dir/test', sess.graph)
    try:
        tf.global_variables_initializer().run()

        for train_step_i in range(100000):
            if train_step_i % 100 == 0:
                # The same summary op is evaluated on both splits and written to separate
                # writers, so the two curves show up as separate runs.
                summary, acc = sess.run([merged, accuracy],
                                        feed_dict={x: mnist.test.images,
                                                   y_: mnist.test.labels})
                test_writer.add_summary(summary, train_step_i)
                summary, acc = sess.run([merged, accuracy],
                                        feed_dict={x: mnist.train.images,
                                                   y_: mnist.train.labels})
                train_writer.add_summary(summary, train_step_i)
            batch_xs, batch_ys = mnist.train.next_batch(100)
            sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

        print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                            y_: mnist.test.labels}))
    finally:
        # Closing the writers flushes pending events to the tfevents files, even on failure.
        train_writer.close()
        test_writer.close()
        sess.close()
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir',
                        type=str,
                        default='/tmp/tensorflow/mnist/input_data',
                        help='Directory for storing input data')
    # Arguments that are not recognised here are forwarded to tf.app.run.
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
While the answer here works as requested within TensorBoard, it only allows downloading a CSV for a single run of a single tag.
If you have, for example, 10 tags and 20 runs (which is not much at all), you would need to do the above step 200 times (that alone will probably take you more than an hour).
If you then, for some reason, would like to actually do something with the data for all runs of a single tag, you would need to write some weird CSV accumulation script or copy everything by hand (which will probably cost you more than a day).
Therefore I would like to add a solution that extracts a CSV file for every tag with all runs contained. Column headers are the run path names and row indices are the run step numbers.
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
def _list_runs(dpath):
    """Return the run entries of ``dpath`` in sorted order, skipping the ``csv`` output folder."""
    # to_csv writes its results into <dpath>/csv; without this filter a second invocation
    # would treat that folder as a run and fail on its missing scalar tags.
    return sorted(
        name for name in os.listdir(dpath)
        if not (name == 'csv' and os.path.isdir(os.path.join(dpath, name)))
    )


def tabulate_events(dpath):
    """Collect the scalar values of every run below ``dpath``, grouped by tag.

    All runs must expose the same scalar tags and log them at the same steps.

    :param dpath: directory whose entries are the individual runs.
    :return: ``(out, steps)`` where ``out[tag]`` is a list with one row per step, each row
        holding one value per run (in sorted run order), and ``steps`` are the step numbers
        of the last tag processed.
    """
    summary_iterators = [EventAccumulator(os.path.join(dpath, dname)).Reload()
                         for dname in _list_runs(dpath)]

    tags = summary_iterators[0].Tags()['scalars']
    for it in summary_iterators:
        assert it.Tags()['scalars'] == tags

    out = defaultdict(list)
    steps = []
    for tag in tags:
        steps = [e.step for e in summary_iterators[0].Scalars(tag)]
        # Walk all runs in lockstep; every run must report the same step at each position.
        for events in zip(*[acc.Scalars(tag) for acc in summary_iterators]):
            assert len(set(e.step for e in events)) == 1
            out[tag].append([e.value for e in events])

    return out, steps


def to_csv(dpath):
    """Write one CSV per scalar tag, with steps as rows and runs as columns.

    :param dpath: directory whose entries are the individual runs; the files are written
        to the locations returned by ``get_file_path``.
    """
    # Same listing (and order) as tabulate_events, so the column labels match the values.
    dirs = _list_runs(dpath)

    d, steps = tabulate_events(dpath)
    tags, values = zip(*d.items())
    np_values = np.array(values)

    for index, tag in enumerate(tags):
        df = pd.DataFrame(np_values[index], index=steps, columns=dirs)
        df.to_csv(get_file_path(dpath, tag))
def get_file_path(dpath, tag):
    """Return the CSV path for ``tag`` inside ``<dpath>/csv``, creating that folder if needed.

    :param dpath: base directory of the summaries.
    :param tag: scalar tag name; every ``/`` in it is replaced by ``_`` to form the file name.
    :return: path of the form ``<dpath>/csv/<tag>.csv``.
    """
    file_name = tag.replace("/", "_") + '.csv'
    folder_path = os.path.join(dpath, 'csv')
    # exist_ok avoids the race between checking for the folder and creating it.
    os.makedirs(folder_path, exist_ok=True)
    return os.path.join(folder_path, file_name)
if __name__ == '__main__':
    # Placeholder: point this at the directory that contains the run folders.
    path = "path_to_your_summaries"
    to_csv(path)
My solution builds upon: https://stackoverflow.com/a/48774926/2230045
EDIT:
I created a more sophisticated version and released it on GitHub: https://github.com/Spenhouet/tensorboard-aggregator
This version aggregates multiple tensorboard runs and is able to save the aggregates to a new tensorboard summary or as a .csv file.
Just check the "Data download links" option on the upper-left in TensorBoard, and then click on the "CSV" button that will appear under your scalar summary.
Here is my solution, which is based on the previous solutions but can scale up.
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
def tabulate_events(dpath):
    """Export the scalars of every run below ``dpath`` and return them as DataFrames.

    For each run that has scalar tags, a ``<run name>.csv`` file is written to the current
    working directory. Runs without scalar tags are reported and left out of the result.

    :param dpath: directory whose entries are the individual runs.
    :return: dict mapping run name to a DataFrame indexed by (tag, 'value' | 'wall_time')
        with one column per step.
    """
    final_out = {}
    for dname in os.listdir(dpath):
        print(f"Converting run {dname}", end="")
        ea = EventAccumulator(os.path.join(dpath, dname)).Reload()
        tags = ea.Tags()['scalars']

        if not tags:
            # Nothing to export; skip the run so that no undefined or stale frame is stored.
            print('- Not scalers to write')
            continue

        out = {}
        for tag in tags:
            events = list(ea.Scalars(tag))
            steps = [event.step for event in events]
            tag_values = [event.value for event in events]
            wall_time = [event.wall_time for event in events]
            # One column per step, holding that step's value and wall time.
            out[tag] = pd.DataFrame(data=dict(zip(steps, np.array([tag_values, wall_time]).transpose())),
                                    columns=steps, index=['value', 'wall_time'])

        df = pd.concat(out.values(), keys=out.keys())
        df.to_csv(f'{dname}.csv')
        print("- Done")
        final_out[dname] = df
    return final_out
if __name__ == '__main__':
    # Placeholder: point this at the directory that contains the run folders.
    path = "youre/path/here"
    steps = tabulate_events(path)
    # Stack the per-run frames into one file, keyed by run name.
    pd.concat(steps.values(), keys=steps.keys()).to_csv('all_result.csv')
Very minimal example:
import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
# Export the "train_loss" scalar of a single run to CSV.
log_dir = "lightning_logs/version_1"
event_accumulator = EventAccumulator(log_dir)
# Reload() reads the event files; it has to run before Scalars() is queried.
event_accumulator.Reload()
events = event_accumulator.Scalars("train_loss")
# Each event carries the step it was logged at and the logged value.
x = [x.step for x in events]
y = [x.value for x in events]
df = pd.DataFrame({"step": x, "train_loss": y})
df.to_csv("train_loss.csv")
print(df)
step train_loss
0 0 700.491516
1 1 163.593246
2 2 146.365448
3 3 153.830215
...
Plotting loss vs epochs example:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
# Plot one logged metric against the epoch number and save both the data and the figure.
log_dir = "lightning_logs/version_1"
y_key = "val_loss"
event_accumulator = EventAccumulator(log_dir)
event_accumulator.Reload()
# Steps at which the "epoch" scalar was logged; used to pick the matching metric values.
steps = {x.step for x in event_accumulator.Scalars("epoch")}
x = list(range(len(steps)))
# NOTE(review): x and y only have equal length if the metric was logged at every step
# in `steps` — confirm for your logger, otherwise the DataFrame construction fails.
y = [x.value for x in event_accumulator.Scalars(y_key) if x.step in steps]
df = pd.DataFrame({"epoch": x, y_key: y})
df.to_csv(f"{y_key}.csv")
fig, ax = plt.subplots()
sns.lineplot(data=df, x="epoch", y=y_key)
fig.savefig("plot.png", dpi=300)
Just to add to #Spen's answer,
in case you want to export the data when the runs have varying numbers of steps:
this will make one large CSV file.
You might need to change the keys for it to work for you.
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
import glob
import pandas as pd
# Every sub-directory of the current working directory is treated as one TensorBoard run.
listOutput = glob.glob("*/")
# Scalar tags to export; adjust these to the tags that your runs actually log.
keys = ['loss', 'mean_absolute_error', 'val_loss', 'val_mean_absolute_error']

listDF = []
for tb_output_folder in listOutput:
    print(tb_output_folder)
    x = EventAccumulator(path=tb_output_folder)
    x.Reload()

    # One column per tag. Within a run all tags must have the same number of entries,
    # but different runs may differ in length because the frames are stacked row-wise.
    printOutDict = {key: [e.value for e in x.Scalars(key)] for key in keys}
    # Label every row with the run it came from.
    printOutDict['Name'] = [tb_output_folder] * len(printOutDict[keys[0]])

    listDF.append(pd.DataFrame(data=printOutDict))

df = pd.concat(listDF)
df.to_csv('Output.csv')