Groupby aggregations and missing combinations of values - vega-lite

I recently started tinkering with Vega-Lite templates to make a confusion matrix for an open-source data science software called DVC. You can see the template in my PR here, but I'll also repeat a simplified version below:
{
...
"data": {
"values": [
{"actual": "Wake", "predicted": "Wake", "rev": "HEAD"},
{"actual": "Wake", "predicted": "Deep", "rev": "HEAD"},
{"actual": "Light", "predicted": "Wake", "rev": "HEAD"},
{"actual": "REM", "predicted": "Light", "rev": "HEAD"},
....
],
},
"spec": {
"transform": [
{
"aggregate": [{"op": "count", "as": "xy_count"}],
"groupby": ["actual", "predicted"],
},
{
"joinaggregate": [
{"op": "max", "field": "xy_count", "as": "max_count"}
],
"groupby": [],
},
{
"calculate": "datum.xy_count / datum.max_count",
"as": "percent_of_max",
},
],
"encoding": {
"x": {"field": "predicted", "type": "nominal", "sort": "ascending"},
"y": {"field": "actual", "type": "nominal", "sort": "ascending"},
},
"layer": [
{
"mark": "rect",
"width": 300,
"height": 300,
"encoding": {
"color": {
"field": "xy_count",
"type": "quantitative",
"title": "",
"scale": {"domainMin": 0, "nice": True},
}
},
},
{
"mark": "text",
"encoding": {
"text": {
"field": "xy_count",
"type": "quantitative"
},
"color": {
"condition": {
"test": "datum.xy_count / datum.max_count > 0.5",
"value": "white"
},
"value": "black"
}
}
}
]
}
}
So, since I'm doing a groupby aggregation, it's possible for there to be cells in the confusion matrix with no entries. Here's an example output: link
How can I fill in these cells with "fallback" or something. I also looked at using pivot and impute, but couldn't quite figure it out. Help much appreciated :)

You can do this by adding two Impute transforms to the end of your sequence of transforms:
{"impute": "xy_count", "groupby": ["actual"], "key": "predicted", "keyvals": ["Deep", "Light", "Wake", "REM"], "value": 0},
{"impute": "xy_count", "groupby": ["predicted"], "key": "actual", "keyvals": ["Deep", "Light", "Wake", "REM"], "value": 0}
The keyvals specify which missing values you would like to be imputed on each axis; you can leave it out if at least one of the groups is present for each keyval.

Related

Does anyone have an explanation as to why this graph is being made on vega?

Am trying to make a graph in vegalite whereby i show the evolution of stock prices overtime. Intuitively this should be very easy however for some reason only two lines seem to get output and they aren't reflective of the stock prices at all. Is there something wrong with my data or am i missing something quite basic?
{"$schema": "https://vega.github.io/schema/vega-lite/v5.json",
"title": {
"text": "Cases: UK nations",
"subtitle": [
"New cases by publish date, rolling rate"
],
"subtitleFontStyle": "italic",
"subtitleFontSize": 10,
"anchor": "start",
"color": "black"
},
"background": "whitesmoke",
"width": 800,
"height": 600,
"data": {
"url": "https://raw.githubusercontent.com/andrewsnowdon/andrewsnowdon.github.io/main/graph1megasheet.csv",
"format": {"type": "csv"}},
"layer": [
{
"encoding": {
"x": {"field": "Date", "type": "temporal"},
"y": {"field": "Open", "type": "quantitative"},
"color": {
"field":"Stockname",
"type": "nominal"
}
},
"layer": [
{"mark": "line"},
{
"params": [
{
"name": "label",
"select": {
"type": "point",
"encodings": ["x"],
"nearest": true,
"on": "mouseover"
}
}
],
"mark": "point",
"encoding": {
"opacity": {
"condition": {"param": "label", "empty": false, "value": 1},
"value": 0
}
}
}
]
},
{
"transform": [{"filter": {"param": "label", "empty": true}}],
"layer": [
{
"mark": {"type": "rule", "color": "grey"},
"encoding": {
"x": {"type": "temporal", "field": "Date", "aggregate": "min"}
}
},
{
"encoding": {
"text": {"type": "quantitative", "field": "Open"},
"x": {"type": "temporal", "field": "Date", "title": "Month"},
"y": {
"type": "quantitative",
"field": "Open",
"title": "Price"
}
},
"layer": [
{
"mark": {
"type": "text",
"stroke": "white",
"strokeWidth": 0.5,
"align": "left",
"dx": 5,
"dy": -5
}
},
{
"mark": {"type": "text", "align": "left", "dx": 5, "dy": -5},
"encoding": {"color": {"type": "quantitative"}}
}
]
}
]
}
],
"config": {}
}
Four of your stocks have identical data, so the lines are hidden below the last one drawn. You can see this by faceting your dataset:
{
"data": {
"url": "https://raw.githubusercontent.com/andrewsnowdon/andrewsnowdon.github.io/main/graph1megasheet.csv",
"format": {"type": "csv", "parse": {"Date": "date:'%d/%m/%Y'"}}
},
"mark": "line",
"encoding": {
"x": {"field": "Date", "type": "temporal"},
"y": {"field": "Open", "type": "quantitative"},
"facet": {"field": "Stockname", "type": "nominal", "columns": 3}
}
}
Notice the parse argument to the data format; this is required for correct parsing of your date entries (as mentioned in https://stackoverflow.com/a/70658380/2937831).

Slider for chloropeth map Vega-lite

I am trying to add a slider for my chloropeth map of Europe in vega-lite, to filter the data by year. I currently have a map which just shows data from 2019 (colour coded), and I am trying to make a slider so I can change years and see how the colours have changed over time.
Here is my code so far:
{
"$schema": "https://vega.github.io/schema/vega-lite/v5.json",
"width": 300,
"height": 300,
"data": {
"url": "https://raw.githubusercontent.com/leakyMirror/map-of-europe/master/TopoJSON/europe.topojson",
"format": {"type": "topojson", "feature": "europe"}
},
"transform": [
{
"lookup": "properties.NAME",
"from": {
"data": {
"url": "https://raw.githubusercontent.com//jamesjeffery77/jamesjeffery77.github.io/main/share-electricity-low-carbon_fullDataset.csv"
},
"key": "country",
"fields": ["percentage"]
}
}
],
"params": [
{
"name": "year",
"value": 2019,
"bind": {
"input": "range",
"min": 1985,
"max": 2019,
"step": 1,
"name": "Select the year:"
}
}
],
"projection": {"type": "naturalEarth1"},
"mark": "geoshape",
"encoding": {
"color": {
"field": "percentage",
"type": "quantitative"},
"tooltip": [
{"field": "properties.NAME", "type": "nominal", "title": "country"},
{"field": "percentage", "type": "quantitative"}
]
}
}
I have been able to do make a bar chart using the same data which updates as I move the slider.The {"filter": "datum.year==year"} is what makes my bar chart able to do this, however it does not work on my chloropeth map (I have tried to add this within the "transform" array in both, with success for my bar chart). Here is the code for my bar chart in case that helps.
{
"$schema": "https://vega.github.io/schema/vega-lite/v5.json",
"description": "",
"title": {
"text": "Thisisatest",
"subtitle":
"hellohello Source: OurWorldInData",
"subtitleFontStyle": "italic",
"subtitleFontSize": 10,
"anchor": "start",
"color": "black"
},
"data": {
"url": "https://raw.githubusercontent.com//jamesjeffery77/jamesjeffery77.github.io/main/share-electricity-low-carbon_fullDataset.csv"
},
"height": 300,
"width": 350,
"mark": {"type": "bar", "color": "skyblue"},
"transform": [
{"filter": "datum.year==year"},
{"filter": {
"field": "country",
"oneOf": [
"United Kingdom", "Spain", "France", "Netherlands", "Portugal", "Italy", "Poland", "Albania", "Germany", "Belgium", "Austria", "Denmark"]}
}
],
"params": [
{
"name": "year",
"value": 2019,
"bind": {
"input": "range",
"min": 1985,
"max": 2019,
"step": 1,
"name": "Select the year:"
}
}
],
"encoding": {
"y": {
"field": "percentage",
"type": "quantitative",
"title": "Percentage of low carbon energy",
"axis": {"grid": false}
},
"x": {
"field": "country",
"type": "nominal",
"title": "",
"axis": {"grid": false, "labelAngle": 20},
"sort": "-y"
},
"tooltip": [
{"field": "country", "title": "Country"},
{"field": "percentage", "title": "percentage of low carbon energy"}
]
}
}
What am I am doing wrong? Would appreciate any help! :)
Thanks
It was almost correct but in your lookup transform you need to provide the fields which you are going to be required in your final data like year field. Since there was no year field the filter transform was not working.
Below is the modified config or refer editor:
{
"$schema": "https://vega.github.io/schema/vega-lite/v5.json",
"width": 300,
"height": 300,
"data": {
"url": "https://raw.githubusercontent.com//jamesjeffery77/jamesjeffery77.github.io/main/share-electricity-low-carbon_fullDataset.csv"
},
"transform": [
{
"lookup": "country",
"from": {
"data": {
"url": "https://raw.githubusercontent.com/leakyMirror/map-of-europe/master/TopoJSON/europe.topojson",
"format": {"type": "topojson", "feature": "europe"}
},
"key": "properties.NAME",
"fields": ["properties", "type", "geometry"]
}
},
{"filter": "datum.year==year"}
],
"params": [
{
"name": "year",
"value": 2019,
"bind": {
"input": "range",
"min": 1985,
"max": 2030,
"step": 1,
"name": "Select the year:"
}
}
],
"projection": {"type": "naturalEarth1"},
"mark": "geoshape",
"encoding": {
"color": {"field": "percentage", "type": "quantitative"},
"tooltip": [
{"field": "properties.NAME", "type": "nominal", "title": "country"},
{"field": "percentage", "type": "quantitative"}
]
}
}
Edit
I have inverted the main data and lookup data, which seems to bring all the years for your countries. Let me know if this works.

Vega Lite: Normalized Stacked Bar Chart + Overlay percentages as text

I have a stacked normalized bar chart similar to this:
https://vega.github.io/editor/#/examples/vega-lite/stacked_bar_normalize
I'm trying to show the related percentages (per bar segment) as text on the bars similar to: https://gist.github.com/pratapvardhan/00800a4981d43a84efdba0c4cf8ee2e1
I tried adding a transform field to calculate the percentages, but still couldn't get it to work after hours of trying.
I'm lost help 🥺
My best try:
{
"description":
"A bar chart showing the US population distribution of age groups and gender in 2000.",
"data": {
"url": "data/population.json"
},
"transform": [
{"filter": "datum.year == 2000"},
{"calculate": "datum.sex == 2 ? 'Female' : 'Male'", "as": "gender"},
{
"stack": "people",
"offset": "normalize",
"as": ["v1", "v2"],
"groupby": ["age"],
"sort": [{"field": "gender", "order": "descending"}]
}
],
"encoding": {
"y": {
"field": "v1",
"type": "quantitative",
"title": "population"
},
"y2": {"field": "v2"},
"x": {
"field": "age",
"type": "ordinal"
},
"color": {
"field": "gender",
"type": "nominal",
"scale": {
"range": ["#675193", "#ca8861"]
}
}
},
"layer":[
{ "mark": "bar"},
{"mark": {"type": "text", "dx": 0, "dy": 0},
"encoding": {
"color":{"value":"black"},
"text": { "field": "v1", "type": "quantitative", "format": ".1f"}}
}
]
}
You can use a joinaggregate transform to normalize each group, and then use "format": ".1%" to display fractions as percents. Using this, there is no need to manually compute the stack transform; it is simpler to specify the stack via the encoding, as in the example you linked to.
Here is the result (open in editor):
{
"description": "A bar chart showing the US population distribution of age groups and gender in 2000.",
"data": {"url": "data/population.json"},
"transform": [
{"filter": "datum.year == 2000"},
{"calculate": "datum.sex == 2 ? 'Female' : 'Male'", "as": "gender"},
{
"joinaggregate": [{"op": "sum", "field": "people", "as": "total"}],
"groupby": ["age"]
},
{"calculate": "datum.people / datum.total", "as": "fraction"}
],
"encoding": {
"y": {
"aggregate": "sum",
"field": "people",
"title": "population",
"stack": "normalize"
},
"order": {"field": "gender", "sort": "descending"},
"x": {"field": "age", "type": "ordinal"},
"color": {
"field": "gender",
"type": "nominal",
"scale": {"range": ["#675193", "#ca8861"]}
}
},
"layer": [
{"mark": "bar"},
{
"mark": {"type": "text", "dx": 20, "dy": 0, "angle": 90},
"encoding": {
"color": {"value": "white"},
"text": {"field": "fraction", "type": "quantitative", "format": ".1%"}
}
}
]
}

Text overlay in heatmap

Is it possible to overlay text in selectable heatmap
https://vega.github.io/editor/#/examples/vega-lite/selection_heatmap
If I plot a heatmap of actual vs predicted for binary in the selectable heatmap, can I overlay text as True Positives, FP, TN, FN with corresponding values
and turn off legend
Here full code
{
"$schema": "https://vega.github.io/schema/vega-lite/v2.4.json",
"data": {
"values": [
{"actual": "Good", "predicted": "Bad", "count": 6386},
{"actual": "Bad", "predicted": "Good", "count": 1261},
{"actual": "Bad", "predicted": "Bad", "count": 6386},
{"actual": "Good", "predicted": "Good", "count": 24030}
]
},
"mark": {"type": "rect", "strokeWidth": 2},
"encoding": {
"y": {
"field": "actual",
"type": "nominal"
},
"x": {
"field": "predicted",
"type": "nominal"
},
"fill": {
"field": "count",
"type": "quantitative"
},
"config": {
"scale": {
"bandPaddingInner": 0,
"bandPaddingOuter": 0
}
}
}
Output like
Is it possible to extract values from index and feed to count values
"url" : {
"%context%": true,
"index": "index",
"body": {
"size":1000,
"_source": ["modelMetrics"],
}
}
"format": {"property": "hits.hits"}
},
where index has
"_source" : {
"modelMetrics" : {
"TN" : 110868,
"FP" : 6386,
"FN" : 1261,
"TP" : 24030,
}
}
You can do this using a Layered chart construct. For example (view in editor):
{
"$schema": "https://vega.github.io/schema/vega-lite/v2.4.json",
"data": {
"values": [
{"actual": "Good", "predicted": "Bad", "label": "FP", "count": 6386},
{"actual": "Bad", "predicted": "Good", "label": "FN", "count": 1261},
{"actual": "Bad", "predicted": "Bad", "label": "TN", "count": 6386},
{"actual": "Good", "predicted": "Good", "label": "TP", "count": 24030}
]
},
"encoding": {
"y": {"field": "actual", "type": "nominal"},
"x": {"field": "predicted", "type": "nominal"}
},
"layer": [
{
"mark": {"type": "rect", "strokeWidth": 2},
"encoding": {
"fill": {"field": "count", "type": "quantitative", "legend": null}
}
},
{
"mark": {"type": "text", "dy": -5},
"encoding": {
"text": {"field": "label", "type": "nominal"},
"color": {
"condition": {"test": "datum.count < 10000", "value": "black"},
"value": "white"
}
}
},
{
"mark": {"type": "text", "dy": 5},
"encoding": {
"text": {"field": "count", "type": "nominal"},
"color": {
"condition": {"test": "datum.count < 10000", "value": "black"},
"value": "white"
}
}
}
],
"width": 100,
"height": 100
}

Vega lite select N number of objects (count)

I just started using Vega lite and was wondering how to cut out everything after my 10th object (I have thousands of rows and am just interested in the top 10).
This is what I have so far:
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.json",
"data": {
"url": "https://raw.githubusercontent.com/DanStein91/Info-vis/master/anage.csv",
"format": {
"type": "csv"
}
},
"transform": [
{
"filter": {
"field": "Female_maturity_(days)",
"gt": 0
}
}
],
"title": {
"text": "",
"anchor": "middle"
},
"mark": "bar",
"encoding": {
"y": {
"field": "Common_name",
"type": "nominal",
"sort": {
"op": "mean",
"field": "Female_maturity_(days)",
"order": "descending"
}
},
"x": {
"field": "Female_maturity_(days)",
"type": "quantitative"
}
},
"config": {}
}
You can follow the Filtering Top K Items example from the documentation. The result looks something like this (view in vega editor):
{
"data": {
"url": "https://raw.githubusercontent.com/DanStein91/Info-vis/master/anage.csv",
"format": {"type": "csv", "parse": {"Female_maturity_(days)": "number"}}
},
"transform": [
{
"window": [{"op": "rank", "as": "rank"}],
"sort": [{"field": "Female_maturity_(days)", "order": "descending"}]
},
{"filter": "datum.rank <= 10"}
],
"mark": "bar",
"encoding": {
"y": {
"field": "Common_name",
"type": "nominal",
"sort": {
"op": "mean",
"field": "Female_maturity_(days)",
"order": "descending"
}
},
"x": {"field": "Female_maturity_(days)", "type": "quantitative"}
},
"title": {"text": "", "anchor": "middle"}
}
One note: when doing transforms on CSV data (as opposed to JSON data), it's important to use format.parse to specify the desired data type for the columns: by default, CSV columns are interpreted as strings, which can cause sorting-based operations to behave in unexpected ways.