How to use conditional statements in a Vega-Lite transform

My sample source code is as follows:
{
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  "description": "A simple bar chart with embedded data.",
  "data": {
    "values": [
      {"a": "A", "b": 302},
      {"a": "B", "b": 2794},
      {"a": "C", "b": 96237},
      {"a": "D", "b": 766995},
      {"a": "E", "b": 7691230},
      {"a": "F", "b": 59755899},
      {"a": "G", "b": 229910863},
      {"a": "H", "b": 9342989068},
      {"a": "I", "b": 19617657788},
      {"a": "J", "b": 140800000001}
    ]
  },
  "encoding": {
    "x": {"field": "a", "type": "nominal", "axis": {"labelAngle": 0}},
    "y": {"field": "b", "type": "quantitative"}
  },
  "layer": [
    {"mark": "bar"},
    {
      "mark": {
        "type": "text",
        "align": "center",
        "baseline": "middle",
        "dx": 0,
        "dy": -5
      },
      "encoding": {
        "text": {"field": "b", "type": "quantitative"}
      }
    }
  ]
}
I want Vega-Lite to generate labels for the text mark using the following logic:
if abs(b) >= 1 and abs(b) <= 999 then format ''
else if abs(b) >= 1000 and abs(b) <= 9999 then format '.2s'
else if abs(b) >= 10000 and abs(b) <= 99999 then format '.3s'
else if abs(b) >= 100000 and abs(b) <= 999999 then format '.4s'
else if abs(b) >= 1000000 and abs(b) <= 9999999 then format '.2s'
else if abs(b) >= 10000000 and abs(b) <= 99999999 then format '.3s'
else if abs(b) >= 100000000 and abs(b) <= 999999999 then format '.4s'
else if abs(b) >= 1000000000 and abs(b) <= 9999999999 then format '.2s'
else if abs(b) >= 10000000000 and abs(b) <= 99999999999 then format '.3s'
else format '.4s'
The following is my desired result:
| a | b            | desired format |
|---|--------------|----------------|
| A | 302          | 302            |
| B | 2794         | 2.8k           |
| C | 96237        | 96.2k          |
| D | 766995       | 767.0k         |
| E | 7691230      | 7.7M           |
| F | 59755899     | 59.8M          |
| G | 229910863    | 229.9M         |
| H | 9342989068   | 9.3G           |
| I | 19617657788  | 19.6G          |
| J | 140800000001 | 140.8G         |
Wahab Memon showed me how to use conditional statements in a different question, Vega-lite to show text values in SI units, but I don't know how to expand that to multiple statements. I also can't find any documentation that demonstrates it. I was wondering if I can use if or switch statements to achieve this.
Can someone please point me in the right direction? Thank you in advance!

The following are possible ways to define your conditions using ternary expressions in calculate, which work the same as if-else:
"transform": [
{
"calculate": " 0 < datum.b && datum.b < 999 ? format(datum.b,'.1s') : 999 < datum.b && datum.b < 9999? format(datum.b,'.2s') : 9999 < datum.b && datum.b < 9999? format(datum.b,'.3s') : format(datum.b,'.4s')",
"as": "textValue"
}
],
Another, cleaner option is to use multiple calculate transforms:
"transform": [
{
"calculate": " 0 < datum.b && datum.b < < 999 ? format(datum.b,'.1s') : datum.textValue",
"as": "textValue"
}, {
"calculate": "999 < datum.b && datum.b < < 9999? format(datum.b,'.2s') : datum.textValue",
"as": "textValue"
}, {
"calculate": "9999 < datum.b && datum.b < < 9999? format(datum.b,'.3s') : datum.textValue",
"as": "textValue"
}
],
Edit
Refer to the snippet below:
{
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  "description": "A simple bar chart with embedded data.",
  "data": {
    "values": [
      {"a": "A", "b": 302},
      {"a": "B", "b": 2794},
      {"a": "C", "b": 96237},
      {"a": "D", "b": 766995},
      {"a": "E", "b": 7691230},
      {"a": "F", "b": 59755899},
      {"a": "G", "b": 229910863},
      {"a": "H", "b": 9342989068},
      {"a": "I", "b": 19617657788},
      {"a": "J", "b": 140800000001}
    ]
  },
  "transform": [
    {
      "calculate": "0 < datum.b && datum.b <= 999 ? format(datum.b,'') : datum.b",
      "as": "textVal"
    },
    {
      "calculate": "1000 <= datum.b && datum.b <= 9999 ? format(datum.b,'.2s') : datum.textVal",
      "as": "textVal"
    },
    {
      "calculate": "10000 <= datum.b && datum.b <= 99999 ? format(datum.b,'.3s') : datum.textVal",
      "as": "textVal"
    },
    {
      "calculate": "100000 <= datum.b && datum.b <= 999999 ? format(datum.b,'.4s') : datum.textVal",
      "as": "textVal"
    },
    {
      "calculate": "1000000 <= datum.b && datum.b <= 9999999 ? format(datum.b,'.2s') : datum.textVal",
      "as": "textVal"
    },
    {
      "calculate": "10000000 <= datum.b && datum.b <= 99999999 ? format(datum.b,'.3s') : datum.textVal",
      "as": "textVal"
    },
    {
      "calculate": "100000000 <= datum.b && datum.b <= 999999999 ? format(datum.b,'.4s') : datum.textVal",
      "as": "textVal"
    },
    {
      "calculate": "1000000000 <= datum.b && datum.b <= 9999999999 ? format(datum.b,'.2s') : datum.textVal",
      "as": "textVal"
    },
    {
      "calculate": "10000000000 <= datum.b && datum.b <= 99999999999 ? format(datum.b,'.3s') : datum.textVal",
      "as": "textVal"
    },
    {
      "calculate": "100000000000 <= datum.b && datum.b <= 999999999999 ? format(datum.b,'.4s') : datum.textVal",
      "as": "textVal"
    }
  ],
  "encoding": {
    "x": {"field": "a", "type": "nominal", "axis": {"labelAngle": 0}},
    "y": {"field": "b", "type": "quantitative"}
  },
  "layer": [
    {"mark": {"type": "bar", "tooltip": true}},
    {
      "mark": {
        "type": "text",
        "align": "center",
        "baseline": "middle",
        "dx": 0,
        "dy": -5,
        "tooltip": true
      },
      "encoding": {"text": {"field": "textVal"}}
    }
  ]
}
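As a compact alternative (not part of the original answer): if you generate your specs from Python, the whole ladder can collapse into a single calculate step that derives the d3-format precision from the value's order of magnitude. The sketch below uses Altair, the Python Vega-Lite API; it assumes positive values like the sample data, and the expression relies on Vega's log() function and LN10 constant, with a small epsilon to guard against floating-point error at exact powers of ten.
import altair as alt
import pandas as pd

df = pd.DataFrame({
    "a": list("ABCDEFGHIJ"),
    "b": [302, 2794, 96237, 766995, 7691230, 59755899,
          229910863, 9342989068, 19617657788, 140800000001],
})

# floor(log10(b)) is the base-10 exponent; (exponent % 3) + 2 yields
# '.2s' for 1.0k-9.9k, '.3s' for 10k-99k, '.4s' for 100k-999k, and so on,
# so every label keeps exactly one digit after the decimal point.
expr = ("datum.b < 1000 ? format(datum.b, '') : "
        "format(datum.b, '.' + ((floor(log(datum.b) / LN10 + 1e-9) % 3) + 2) + 's')")

base = alt.Chart(df).transform_calculate(textVal=expr)
bars = base.mark_bar().encode(
    x=alt.X("a:N", axis=alt.Axis(labelAngle=0)),
    y="b:Q",
)
labels = base.mark_text(align="center", baseline="middle", dy=-5).encode(
    x="a:N", y="b:Q", text="textVal:N",
)
chart = bars + labels  # chart.save("chart.html") to render it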

Related

Compare two JSON files and match entries with the same value

I need to compare two JSON files on the keys a, b, c, d, e.
If all keys match for an entry between JSON file 1 and JSON file 2,
then I should find the delta between platform_time (field "a") for this entry,
and then delete these entries from JSON file 1 and JSON file 2.
(Both JSON files have 10000000000 entries.)
So here we should match:
1) one[0] and two[1]
2) one[1] and two[0]
Data for json one and json two.
First file:
"one": [
{
"a" : "2022-09-12 00:00:00.000",
"b" : "apple",
"c" : "1",
"d" : "2022-09-11 23:59:59.997",
"e" : 88
},
{
"a" : "2022-09-12 00:00:00.000",
"b" : "orange",
"c" : "2",
"d" : "2022-09-11 23:59:59.997",
"e" : 87
},
{
"a" : "2022-09-12 00:00:10.001",
"b" : "apple",
"c" : "6",
"d" : "2022-09-11 23:59:59.997",
"e" : 88
},...]
Second file:
"two": [
{
"a" : "2022-09-12 00:00:30.000",
"b" : "orange",
"c" : "2",
"d" : "2022-09-11 23:59:59.997",
"e" : 87
},
{
"a" : "2022-09-12 00:00:10.001",
"b" : "apple",
"c" : "1",
"d" : "2022-09-11 23:59:59.997",
"e" : 88
},
{
"a" : "2022-09-12 00:00:30.000",
"b" : "orange",
"c" : "200",
"d" : "2021-09-11 23:59:59.997",
"e" : 81
},...
]
I started doing something like this, but iterating over all elements takes too much time.
Could you please help me optimize my code?
import datetime
import json
import random

lst_in_seconds = []

f = open('one_all.json')
one = json.load(f)
f.close()
f1 = open('two_all.json')
two = json.load(f1)
f1.close()

counter_one_better = 0
counter_two_better = 0
counter_the_same = 0

for k in range(len(one['one'])):
    for i in range(len(two['two'])):
        if (one['one'][k]['b'] == two['two'][i]['b']
                and one['one'][k]['e'] == two['two'][i]['e']
                and one['one'][k]['d'] == two['two'][i]['d']
                and one['one'][k]['c'] == two['two'][i]['c']):
            if one['one'][k]['a'] < two['two'][i]['a']:
                # one better than two
                delt_one = datetime.datetime.strptime(one['one'][k]['a'], '%Y-%m-%d %H:%M:%S.%f')
                delt_two = datetime.datetime.strptime(two['two'][i]['a'], '%Y-%m-%d %H:%M:%S.%f')
                delta = delt_two - delt_one
                diff_in_seconds = delta.total_seconds()
                lst_in_seconds.append(diff_in_seconds)
                counter_one_better += 1
                two['two'][i]['b'] = random.randint(0, 100000)
                break
            elif one['one'][k]['a'] == two['two'][i]['a']:
                # same
                delt_one = datetime.datetime.strptime(one['one'][k]['a'], '%Y-%m-%d %H:%M:%S.%f')
                delt_two = datetime.datetime.strptime(two['two'][i]['a'], '%Y-%m-%d %H:%M:%S.%f')
                delta = delt_two - delt_one
                diff_in_seconds = delta.total_seconds()
                lst_in_seconds.append(diff_in_seconds)
                counter_the_same += 1
                two['two'][i]['b'] = random.randint(0, 100000)
                break
            else:
                # two better than one
                delt_one = datetime.datetime.strptime(one['one'][k]['a'], '%Y-%m-%d %H:%M:%S.%f')
                delt_two = datetime.datetime.strptime(two['two'][i]['a'], '%Y-%m-%d %H:%M:%S.%f')
                delta = delt_one - delt_two
                diff_in_seconds = -delta.total_seconds()
                lst_in_seconds.append(diff_in_seconds)
                counter_two_better += 1
                two['two'][i]['b'] = random.randint(0, 100000)
                break

#print('counter_the_same', counter_the_same, 'count')
#print('counter_one_better', counter_one_better, 'count')
#print('counter_two_better', counter_two_better, 'count', '\n')
print('one better than two in ', round((counter_one_better / (counter_two_better + counter_one_better + counter_the_same)) * 100, 4), '% case')
print('the same ', round((counter_the_same / (counter_two_better + counter_one_better + counter_the_same)) * 100, 4), '% case')
print('two better than one in ', round((counter_two_better / (counter_two_better + counter_one_better + counter_the_same)) * 100, 4), '% case', '\n')
You can reduce the time taken to compare the files by pre-processing the two files into dictionaries keyed on the values to be compared. Then, for each entry in one, you can look up the entries in two which have the same b, c, d, and e values and compare the times. Note that you can make your code a lot more "DRY" by noting that the only difference between the three branches of the if is which counter is updated.
from collections import defaultdict
import datetime
import random

one = {"one": [
    {
        "a": "2022-09-12 00:00:00.000",
        "b": "apple",
        "c": "1",
        "d": "2022-09-11 23:59:59.997",
        "e": 88
    },
    {
        "a": "2022-09-12 00:00:00.000",
        "b": "orange",
        "c": "2",
        "d": "2022-09-11 23:59:59.997",
        "e": 87
    },
    {
        "a": "2022-09-12 00:00:10.001",
        "b": "apple",
        "c": "6",
        "d": "2022-09-11 23:59:59.997",
        "e": 88
    }
]}

two = {"two": [
    {
        "a": "2022-09-12 00:00:30.000",
        "b": "orange",
        "c": "2",
        "d": "2022-09-11 23:59:59.997",
        "e": 87
    },
    {
        "a": "2022-09-12 00:00:10.001",
        "b": "apple",
        "c": "1",
        "d": "2022-09-11 23:59:59.997",
        "e": 88
    },
    {
        "a": "2022-09-12 00:00:30.000",
        "b": "orange",
        "c": "200",
        "d": "2021-09-11 23:59:59.997",
        "e": 81
    }
]}

compare_keys = ['b', 'c', 'd', 'e']
value_key = 'a'
set_key = 'b'

ones = defaultdict(list)
for o in one['one']:
    ones[tuple(o[k] for k in compare_keys)].append(o[value_key])

twos = defaultdict(list)
for t in two['two']:
    twos[tuple(t[k] for k in compare_keys)].append({k: t[k] for k in [value_key, set_key]})

counter_one_better = 0
counter_two_better = 0
counter_the_same = 0
lst_in_seconds = []

for k, o in ones.items():
    t = twos.get(k)
    if t is None:
        continue
    for o1 in o:
        for i, t2 in enumerate(t):
            delt_one = datetime.datetime.strptime(o1, '%Y-%m-%d %H:%M:%S.%f')
            delt_two = datetime.datetime.strptime(t2[value_key], '%Y-%m-%d %H:%M:%S.%f')
            delta = delt_two - delt_one
            diff_in_seconds = delta.total_seconds()
            lst_in_seconds.append(diff_in_seconds)
            counter_one_better += diff_in_seconds < 0
            counter_the_same += diff_in_seconds == 0
            counter_two_better += diff_in_seconds > 0
            t[i][set_key] = random.randint(0, 100000)

print(counter_one_better, counter_the_same, counter_two_better)
print(lst_in_seconds)

# reconstruct the two dict
new_two = {'two': [dict([*zip(compare_keys, k), *v.items()]) for k in twos for v in twos[k]]}
Output (for your sample data):
# counter_one_better, counter_the_same, counter_two_better
0 0 2
# lst_in_seconds
[10.001, 30.0]
# new_two
{
  "two": [
    {
      "b": 4459,
      "c": "2",
      "d": "2022-09-11 23:59:59.997",
      "e": 87,
      "a": "2022-09-12 00:00:30.000"
    },
    {
      "b": 93855,
      "c": "1",
      "d": "2022-09-11 23:59:59.997",
      "e": 88,
      "a": "2022-09-12 00:00:10.001"
    },
    {
      "b": "orange",
      "c": "200",
      "d": "2021-09-11 23:59:59.997",
      "e": 81,
      "a": "2022-09-12 00:00:30.000"
    }
  ]
}
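One further tweak worth trying, sketched under the same assumptions as the code above (it reuses one, two, compare_keys, value_key and set_key from it): datetime.strptime is comparatively expensive, so parse each timestamp once while building the dictionaries instead of on every comparison in the matching loop.
from collections import defaultdict
import datetime

FMT = '%Y-%m-%d %H:%M:%S.%f'

# Store pre-parsed datetimes so the matching loop never calls strptime.
ones = defaultdict(list)
for o in one['one']:
    ones[tuple(o[k] for k in compare_keys)].append(
        datetime.datetime.strptime(o[value_key], FMT))

twos = defaultdict(list)
for t in two['two']:
    entry = {k: t[k] for k in [value_key, set_key]}
    entry[value_key] = datetime.datetime.strptime(entry[value_key], FMT)
    twos[tuple(t[k] for k in compare_keys)].append(entry)

# The matching loop can then compute (t2[value_key] - o1).total_seconds()
# directly; convert back with .strftime(FMT) if you rebuild new_two afterwards.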

jq select filter chain

I have the following array of objects that I would like to filter down, like this:
1. LOGGEDIN == 0
2. Timestamp older than 5 minutes
3. IDLETIME > 60 && CPULOAD < 200
For the second filter I'd like to not consider the objects filtered out by the first filter, and for the third filter I'd like to not consider the objects filtered out by the second. I tried to get the selection with jq:
1. jq '.[] | select(.LOGGEDIN=="0")'
2. jq '.[] | select(.TIMESTAMP | fromdateiso8601 < '$FIVEMINAGO')'
3. jq '.[] | select(.IDLETIME |tonumber > 60) | select(.CPULOAD |tonumber < 200)'
I'd like to wrap these up so that I end up with one array of objects matching the filters and another array of objects that do not. I'm on a Mac, using zsh.
[
  {
    "SERIAL": "XXXSERIAL1XXX",
    "TIMESTAMP": "2020-12-17 18:45:14",
    "EMAIL": "email1@mydomain.com",
    "LOGGEDIN": "0",
    "IDLETIME": "122",
    "CPULOAD": "2",
    "BLOCKED": "0"
  },
  {
    "SERIAL": "XXXSERIAL2XXX",
    "TIMESTAMP": "2020-12-17 18:43:29",
    "EMAIL": "email2@mydomain.com",
    "LOGGEDIN": "1",
    "IDLETIME": "0",
    "CPULOAD": "0",
    "BLOCKED": "0"
  },
  {
    "SERIAL": "XXXSERIAL3XXX",
    "TIMESTAMP": "2020-12-17 18:46:37",
    "EMAIL": "email1@mydomain.com",
    "LOGGEDIN": "1",
    "IDLETIME": "0",
    "CPULOAD": "0",
    "BLOCKED": "0"
  },
  {
    "SERIAL": "XXXSERIAL4XXX",
    "TIMESTAMP": "2020-12-17 18:45:23",
    "EMAIL": "email3@mydomain.com",
    "LOGGEDIN": "0",
    "IDLETIME": "0",
    "CPULOAD": "13",
    "BLOCKED": "0"
  },
  {
    "SERIAL": "XXXSERIAL5XXX",
    "TIMESTAMP": "2020-12-17 18:47:02",
    "EMAIL": "email2@mydomain.com",
    "LOGGEDIN": "1",
    "IDLETIME": "0",
    "CPULOAD": "0",
    "BLOCKED": "0"
  },
  {
    "SERIAL": "XXXSERIAL6XXX",
    "TIMESTAMP": "2020-12-17 18:43:42",
    "EMAIL": "email3@mydomain.com",
    "LOGGEDIN": "1",
    "IDLETIME": "10",
    "CPULOAD": "20",
    "BLOCKED": "0"
  },
  {
    "SERIAL": "XXXSERIAL7XXX",
    "TIMESTAMP": "2020-12-17 18:43:29",
    "EMAIL": "email4@mydomain.com",
    "LOGGEDIN": "1",
    "IDLETIME": "0",
    "CPULOAD": "0",
    "BLOCKED": "0"
  },
  {
    "SERIAL": "XXXSERIAL8XXX",
    "TIMESTAMP": "2020-12-17 18:46:02",
    "EMAIL": "email4@mydomain.com",
    "LOGGEDIN": "0",
    "IDLETIME": "0",
    "CPULOAD": "0",
    "BLOCKED": "0"
  },
  {
    "SERIAL": "XXXSERIAL9XXX",
    "TIMESTAMP": "2020-12-17 18:45:23",
    "EMAIL": "email1@mydomain.com",
    "LOGGEDIN": "0",
    "IDLETIME": "443",
    "CPULOAD": "666",
    "BLOCKED": "0"
  }
]
Problems with the snippets you posted:
Don't try to generate code in the shell! Use --arg (or some other mechanism) to pass values to your program instead.
Your timestamps are not valid ISO8601 timestamps, much less what fromdateiso8601 expects.
| has the lowest precedence other than ;, so .IDLETIME | tonumber > 60 means .IDLETIME | ( tonumber > 60 ), but you want ( .IDLETIME | tonumber ) > 60.
We can start with this:
jq --arg TSCUT "$( date --date='5 minutes ago' +%s )" '
  group_by(
    .LOGGEDIN == "0" and
    ( .TIMESTAMP | sub(" "; "T") + "Z" | fromdateiso8601 ) < ( $TSCUT | tonumber ) and
    ( .IDLETIME | tonumber ) > 60 and
    ( .CPULOAD | tonumber ) < 200
  )
'
jqplay
The above segregates the matching records from those that don't match, but we could end up with any of the following:
[ ]
[ [...matches...] ]
[ [...non-matches...] ]
[ [...non-matches...], [...matches...] ]
This isn't very useful. As such, I propose the following:
jq --arg TSCUT "$( date --date='5 minutes ago' +%s )" '
  map(
    ._f = (
      .LOGGEDIN == "0" and
      ( .TIMESTAMP | sub(" "; "T") + "Z" | fromdateiso8601 ) < ( $TSCUT | tonumber ) and
      ( .IDLETIME | tonumber ) > 60 and
      ( .CPULOAD | tonumber ) < 200
    )
  ) |
  . as $a |
  {
    "matches": [ $a[] | select( ._f ) | del(._f) ],
    "non-matches": [ $a[] | select( ._f | not ) | del(._f) ]
  }
'
jqplay
I assumed that "$( ... )" means the same thing in zsh as it does in the POSIX shell. Note also that the GNU-style date --date='5 minutes ago' used above isn't available in macOS's BSD date, where date -v-5M +%s is the equivalent. Adjust as needed.
Thanks to @oguz ismail for pointing out group_by, even though I retain my original solution.
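If you'd rather do the final split outside jq, here is a hypothetical Python equivalent of the same predicate (the file name input.json and the naive local-time cutoff are assumptions; the sample timestamps carry no timezone):
import datetime
import json

with open('input.json') as f:
    records = json.load(f)

cutoff = datetime.datetime.now() - datetime.timedelta(minutes=5)

def matches(r):
    # Same three conditions as the jq filter above.
    ts = datetime.datetime.strptime(r['TIMESTAMP'], '%Y-%m-%d %H:%M:%S')
    return (r['LOGGEDIN'] == '0' and ts < cutoff
            and int(r['IDLETIME']) > 60 and int(r['CPULOAD']) < 200)

result = {
    'matches': [r for r in records if matches(r)],
    'non-matches': [r for r in records if not matches(r)],
}
print(json.dumps(result, indent=2))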

Couchbase N1QL group-by in a sub-document

Given the data model below:
{
"events": [
{
"customerId": "a",
"type": "credit" ,
"value": 10
},
{
"customerId": "a",
"type": "credit" ,
"value": 10
},
{
"customerId": "b",
"type": "credit" ,
"value": 5
},
{
"customerId": "b",
"type": "credit" ,
"value": 5
}
]
}
how can I query the sum of credits by customerId? I.e.:
[
  {
    "customerId": "a",
    "total": 20
  },
  {
    "customerId": "b",
    "total": 10
  }
]
Use a subquery expression for per-document aggregation:
SELECT d.*,
       (SELECT e.customerId, SUM(e.`value`) AS total
        FROM d.events AS e
        WHERE ......
        GROUP BY e.customerId) AS events
FROM default AS d
WHERE ...........;
For the whole query:
SELECT e.customerId, SUM(e.`value`) AS total
FROM default AS d
UNNEST d.events AS e
WHERE ......
GROUP BY e.customerId;
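UNNEST joins each document to the elements of its own events array, producing one row per event, which is what lets GROUP BY aggregate across documents. Against the sample document (with the placeholder WHERE clauses removed), the whole-query version should return the totals requested in the question:
[
  {"customerId": "a", "total": 20},
  {"customerId": "b", "total": 10}
]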

JSON iteration in Spark

Input JSON file:
{
  "CarBrands": [
    {
      "model": "audi",
      "make": " (YEAR == \"2009\" AND CONDITION in (\"Y\") AND RESALE in (\"2015\")) ",
      "service": {"first": null, "second": [], "third": []},
      "dealerspot": [
        {"dealername": ["\"first\"", "\"abc\""]},
        {"dealerlat": ["\"45.00\"", "\"38.00\""]}
      ],
      "type": "ok",
      "plate": true
    },
    {
      "model": "bmw",
      "make": " (YEAR == \"2010\" AND CONDITION OR (\"N\") AND RESALE in (\"2016\")) ",
      "service": {"first": null, "second": [], "third": []},
      "dealerspot": [
        {"dealerlat": ["\"99.00\"", "\"38.00\""]},
        {"dealername": ["\"sports\"", "\"abc\""]}
      ],
      "type": "ok",
      "plate": true
    },
    {
      "model": "toy",
      "make": " (YEAR == \"2013\" AND CONDITION in (\"Y\") AND RESALE in (\"2018\")) ",
      "service": {"first": null, "second": [], "third": []},
      "dealerspot": [
        {"dealerlat": ["\"35.00\"", "\"38.00\""]},
        {"dealername": ["\"nelson\"", "\"abc\""]}
      ],
      "type": "ok",
      "plate": true
    }
  ]
}
Expected output:
+-------+------------+-----------+
| model | dealername | dealerlat |
+-------+------------+-----------+
| audi  | first      | 45        |
| bmw   | sports     | 99        |
| toy   | nelson     | 35        |
+-------+------------+-----------+
import sparkSession.implicits._

val tagsDF = sparkSession.read
  .option("multiLine", true)
  .option("inferSchema", true)
  .json("src/main/resources/carbrands.json")
val df = tagsDF.select(explode($"CarBrands") as "car_brands")
val dfd = df
  .withColumn("_tmp", split($"car_brands.make", "\""))
  .select(
    $"car_brands.model".as("model"),
    $"car_brands.dealerspot.dealername"(0)(0).as("dealername"),
    $"car_brands.dealerspot.dealerlat"(0)(0).as("dealerlat")
  )
Note: since the dealername and dealerlat positions are not fixed, the index (0)(0) doesn't produce the desired output. Please help.
You can convert dealerspot into a JSON string and then use a JSONPath with get_json_object():
import org.apache.spark.sql.functions.{get_json_object,to_json,trim,explode}
val df1 = (tagsDF.withColumn("car_brands", explode($"CarBrands"))
  .select("car_brands.*")
  .withColumn("dealerspot", to_json($"dealerspot")))
//+--------------------+--------------------+-----+-----+----------+----+
//| dealerspot| make|model|plate| service|type|
//+--------------------+--------------------+-----+-----+----------+----+
//|[{"dealername":["...| (YEAR == "2009" ...| audi| true|[, [], []]| ok|
//|[{"dealerlat":["\...| (YEAR == "2010" ...| bmw| true|[, [], []]| ok|
//|[{"dealerlat":["\...| (YEAR == "2013" ...| toy| true|[, [], []]| ok|
//+--------------------+--------------------+-----+-----+----------+----+
df1.select(
  $"model",
  trim(get_json_object($"dealerspot", "$[*].dealername[0]"), "\"\\") as "dealername",
  trim(get_json_object($"dealerspot", "$[*].dealerlat[0]"), "\"\\") as "dealerlat"
).show
//+-----+----------+---------+
//|model|dealername|dealerlat|
//+-----+----------+---------+
//| audi| first| 45.00|
//| bmw| sports| 99.00|
//| toy| nelson| 35.00|
//+-----+----------+---------+
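For anyone on PySpark, here is a rough Python sketch of the same approach; the regexp_replace stands in for Scala's two-argument trim and simply strips the embedded quotes and backslashes.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

tags = (spark.read.option("multiLine", True)
        .json("src/main/resources/carbrands.json"))

df1 = (tags.withColumn("car_brands", F.explode("CarBrands"))
       .select("car_brands.*")
       .withColumn("dealerspot", F.to_json("dealerspot")))

def clean(path):
    # Extract with a JSONPath, then strip the embedded quotes/backslashes.
    return F.regexp_replace(F.get_json_object(F.col("dealerspot"), path), r'[\\"]', "")

df1.select(
    F.col("model"),
    clean("$[*].dealername[0]").alias("dealername"),
    clean("$[*].dealerlat[0]").alias("dealerlat"),
).show()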

Extract key from JSON string in MySQL

My table contains strings in JSON format. I need to get the sum and average of the values for each id.
+----+------------------------------------------------------------------------------------+------------+
| id | json_data | subject_id |
+----+------------------------------------------------------------------------------------+------------+
| 1 | {"id": "a", "value": "30"}, {"id": "b", "value": "20"}, {"id": "c", "value": "30"} | 1 |
+----+------------------------------------------------------------------------------------+------------+
| 2 | {"id": "a", "value": "40"}, {"id": "b", "value": "50"}, {"id": "c", "value": "60"} | 1 |
+----+------------------------------------------------------------------------------------+------------+
| 3 | {"id": "a", "value": "20"} | 1 |
+----+------------------------------------------------------------------------------------+------------+
The expected result is:
{"id": "a", "sum": 90, "avg": 30},
{"id": "b", "sum": 70, "avg": 35},
{"id": "c", "sum": 120, "avg": 40}
I've tried
SELECT (
  JSON_OBJECT('id', id, 'sum', sum_data, 'avg', avg_data)
) FROM (
  SELECT
    JSON_EXTRACT(json_data, "$.id") as id,
    SUM(JSON_EXTRACT(json_data, "$.sum_data")) as sum_data,
    AVG(JSON_EXTRACT(json_data, "$.avg_data")) as avg_data
  FROM Details
  GROUP BY JSON_EXTRACT(json_data, "$.id")
) as t
But no luck. How can I sort this out?
The input JSON needs to be corrected first (each row must hold a valid JSON array):
create table json_sum (id int primary key auto_increment, json_data json);
insert into json_sum values (0,'[{"id": "a", "value": "30"}, {"id": "b", "value": "20"}, {"id": "c", "value": "30"}]');
insert into json_sum values (0,'[{"id": "a", "value": "40"}, {"id": "b", "value": "50"}, {"id": "c", "value": "60"}]');
insert into json_sum values (0,'[{"id": "a", "value": "20"}]');
select
json_object("id", jt.id, "sum", sum(jt.value), "avg", avg(jt.value))
from json_sum, json_table(json_data, "$[*]" columns (
row_id for ordinality,
id varchar(10) path "$.id",
value varchar(10) path "$.value")
) as jt
group by jt.id
Output:
json_object("id", jt.id, "sum", sum(jt.value), "avg", avg(jt.value))
{"id": "a", "avg": 30.0, "sum": 90.0}
{"id": "b", "avg": 35.0, "sum": 70.0}
{"id": "c", "avg": 45.0, "sum": 90.0}