How to get average values from dictionary-looking values using SQL? - mysql

My dataframe looks like this:
id value
a 0:3,1:0,2:0,3:4
a 0:0,1:0,2:2,3:0
a 0:0,1:5,2:4,3:0
I want to write a query that returns the average of the keys in the value column, weighted by their counts.
So for example for 0:3,1:0,2:0,3:4 it must be (0+0+0+3+3+3+3)/7 = 1.71.
For 0:0,1:0,2:2,3:0 it must be (2+2)/2=2.
For 0:0,1:5,2:4,3:0 it must be (1+1+1+1+1+2+2+2+2)/9 = 1.44.
So the desired result is:
id value
a 1.71
a 2.00
a 1.44
How can I do that? Are there SQL functions to get this result?

See this DBFIDDLE
code:
CREATE PROCEDURE `avg_dict`(s varchar(100))
BEGIN
SET @result = CONCAT('SELECT (', replace(replace(s, ":","*"),",","+"), ')/(',regexp_replace(s,",?[0-9]:","+"),')');
PREPARE stmt FROM @result;
EXECUTE stmt ;
DEALLOCATE PREPARE stmt;
END
results:

stmt                                 output
CALL avg_dict("0:3,1:0,2:0,3:4");    1.7143
CALL avg_dict("0:0,1:0,2:2,3:0");    2.0000
CALL avg_dict("0:0,1:5,2:4,3:0");    1.4444
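
For reference, this is the statement the procedure builds and executes for the first input (a sketch you can run on its own; the leading + in the denominator is just a harmless unary plus):

-- what avg_dict("0:3,1:0,2:0,3:4") prepares and executes:
SELECT (0*3+1*0+2*0+3*4)/(+3+0+0+4);   -- 12/7 = 1.7143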

With some combination of split, transform and repeat you can achieve your goal:
WITH dataset(id, value) AS (
    VALUES ('a', '0:3,1:0,2:0,3:4'),
           ('a', '0:0,1:0,2:2,3:0'),
           ('a', '0:0,1:5,2:4,3:0')
)
SELECT id,
       reduce(arr, 0.0, (s, x) -> s + x, s -> s) / cardinality(arr)
FROM (
    SELECT *,
           flatten(
               transform(
                   transform(
                       split(value, ','),
                       s -> split(s, ':')
                   ),
                   arr -> repeat(
                       cast(arr[1] AS INTEGER),
                       cast(arr[2] AS INTEGER)
                   )
               )
           ) AS arr
    FROM dataset
)
Output:
id | _col1
a  | 1.7142857142857142
a  | 2.0
a  | 1.4444444444444444
Note:
The outer select can be substituted with array_average, but I used the select because Athena's version of Presto does not support it.
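Where array_average is supported, the outer aggregation can be replaced along these lines (a sketch; it reuses the same inner subquery that builds arr):

SELECT id,
       array_average(arr)
FROM (
    SELECT *,
           flatten(
               transform(
                   transform(split(value, ','), s -> split(s, ':')),
                   arr -> repeat(cast(arr[1] AS INTEGER), cast(arr[2] AS INTEGER))
               )
           ) AS arr
    FROM dataset
)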
UPD
Another version, which can be more performant because it avoids materializing the repeated array elements:
SELECT id,
       reduce(
           arr,
           CAST(ROW(0.0, 0) AS ROW(sum DOUBLE, count INTEGER)),
           (s, r) -> CAST(
               ROW(r.num * r.count + s.sum, s.count + r.count) AS ROW(sum DOUBLE, count INTEGER)
           ),
           s -> IF(s.count = 0, NULL, s.sum / s.count)
       )
FROM (
    SELECT *,
           transform(
               split(value, ','),
               s -> CAST(
                   ROW(
                       CAST(split(s, ':')[1] AS DOUBLE),
                       CAST(split(s, ':')[2] AS INTEGER)
                   ) AS ROW(num DOUBLE, count INTEGER)
               )
           ) AS arr
    FROM dataset
)

Related

How to split column values by comma and return it as an array

As you can see below, I have a Name column. I want to split it by / and return the values in an array.
MyTable

Id | Name
1  | John/Warner/Jacob
2  | Kol
If I write a query as
Select Id, Name from MyTable
it will return
{
"id": 1,
"name": "John/Warner/Jacob",
},
{
"id": 2,
"name": "Kol",
},
Which query should I write to get the result below?
{
"id": 1,
"name": ["John", "Warner", "Jacob"],
},
{
"id": 2,
"name": ["Kol"] ,
},
Don't think you can return an array in the query itself, but you could do this...
SELECT id,
SUBSTRING_INDEX(name, '/', 1)
AS name_part_1,
SUBSTRING_INDEX(name, '/', -1)
AS name_part_2
FROM tableName;
The only way to build it as an array would be to process the result accordingly in whatever language you are using.
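That said, if a JSON-style array coming straight out of MySQL is good enough, one workaround (a sketch, assuming MySQL 5.7+ and names that never contain double quotes or backslashes) is to build the array text and cast it:

SELECT Id,
       CAST(CONCAT('["', REPLACE(Name, '/', '","'), '"]') AS JSON) AS Name
FROM MyTable;
-- 'John/Warner/Jacob' -> ["John", "Warner", "Jacob"], 'Kol' -> ["Kol"]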
You can define a function split, which is based on the fact that substring_index(substring_index(name,'/',x),'/',-1) will return the x-th part of a name when separated by '/'.
CREATE FUNCTION `test`.`SPLIT`(s varchar(200), c char, i integer) RETURNS varchar(200) CHARSET utf8mb4
DETERMINISTIC
BEGIN
DECLARE retval varchar(200);
WITH RECURSIVE split as (
select 1 as x,substring_index(substring_index(s,c,1),c,-1) as y, s
union all
select x+1,substring_index(substring_index(s,c,x+1),c,-1),s from split where x<= (LENGTH(s) - LENGTH(REPLACE(s,c,'')))
)
SELECT y INTO retval FROM split WHERE x=i ;
return retval;
END
and then do:
with mytable as (
select 1 as Id, 'John/Warner/Jacob' as Name
union all
select 2, 'Kol')
select
id, split(Name,'/',x) as name
from mytable
cross join (select 1 as x union all select 2 union all select 3) x
order by id, name;
output:

Id | name
1  | Jacob
1  | John
1  | Warner
2  | [NULL]
2  | [NULL]
2  | Kol
It is, of course, possible to refine this, and leave out the NULL values ...
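For example, one such refinement is simply to filter on the same expression (a sketch; note it calls split twice per row):

with mytable as (
  select 1 as Id, 'John/Warner/Jacob' as Name
  union all
  select 2, 'Kol')
select id, split(Name,'/',x) as name
from mytable
cross join (select 1 as x union all select 2 union all select 3) x
where split(Name,'/',x) is not null
order by id, name;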
I will not convert this output to JSON for you ...

Nested SELECT statements and reading in nested JSON file in SQL Server

The discussed problem has been partly solved here:
Read in nested JSON file in SQL Server
but now the JSON file was extended with more objects of different format.
Declare @json nvarchar(max)
SELECT @json =
N'{
"Model": {
"Energy-X/A": {
"x": 1,
"y": 2,
"z": 3
},
"Energy-X/B": {
"x": 4,
"y": 5,
"z": 6
}
},
"Energy":
{
"Energy-X/A": [
[
100.123456, null
],
[
101.123456, null
]
],
"Energy-X/B": [
[
102.123456, null
],
[
103.123456, null
]
]
}
}'
select * from openjson(@json, '$.Model')
with (x [int] '$."Energy-X/A".x',
y [int] '$."Energy-X/A".y',
z [int] '$."Energy-X/A".z',
x [int] '$."Energy-X/B".x',
y [int] '$."Energy-X/B".y',
z [int] '$."Energy-X/B".z'
);
select commaDelimited.* from openjson (@json)
with (energyXA nvarchar(max) '$.Energy."Energy-X/A"' as json,
energyXB nvarchar(max) '$.Energy."Energy-X/B"' as json
) as energy
cross apply (
select
(select string_agg(isnull(value, 'null'), ',') from openjson(energyXA, '$[0]')),
(select string_agg(isnull(value, 'null'), ',') from openjson(energyXB, '$[0]'))
union all
select
(select string_agg(isnull(value, 'null'), ',') from openjson(energyXA, '$[1]')),
(select string_agg(isnull(value, 'null'), ',') from openjson(energyXB, '$[1]'))
) commaDelimited ([Energy-X/A], [Energy-X/B]);
The solution works and the values can be extracted, but now I want to combine both SELECT statements into one query and construct a correlated subquery. The columns should appear when "Energy-X/A" and "Energy-X/B" match, like:
Energy-X/A       | Energy-X/A       | x | y | z
100.123456, null | 101.123456, null | 1 | 2 | 3

Energy-X/B       | Energy-X/B       | x | y | z
102.123456, null | 103.123456, null | 4 | 5 | 6
or, even better, the output could combine the values of Energy-X/A and Energy-X/B in one separate column (using a delimiter such as a semicolon):
Energy-X/A                          | x | y | z
100.123456, null ; 101.123456, null | 1 | 2 | 3

Energy-X/B                          | x | y | z
102.123456, null ; 103.123456, null | 4 | 5 | 6
I am grateful for any help!
Since you changed your expected results significantly, I've completely re-written your query.
Start by unpivoting the A and B values into separate rows using a (values) table and json_query.
Then read those columns using openjson.
In the case of Energy you need two levels of aggregation also, in order to get your second expected result.
select
commaDelimited.*,
model.*
from (values
(json_query(@json, '$.Model."Energy-X/A"'), json_query(@json, '$.Energy."Energy-X/A"')),
(json_query(@json, '$.Model."Energy-X/B"'), json_query(@json, '$.Energy."Energy-X/B"'))
) j(model, energy)
outer apply openjson(j.model)
with (
x int,
y int,
z int
) model
outer apply (
select
Energy = string_agg(c.Energy, ' ; ')
from openjson(j.energy) energy
cross apply (
select
Energy = string_agg(isnull(Xinner.value, 'null'), ', ')
from openjson(energy.value) Xinner
) c
) commaDelimited;
db<>fiddle

Parse JSON list with no key in PLSQL

What I'm trying to do is fill up a table with the data from a JSON. The file is formatted like this.
[
{
"name": "Victor",
"age": "20"
},
{
"name": "Ana",
"age": "23"
}
]
I can't change how it's formatted.
I tried using APEX_JSON to parse it and add row by row, but I can't even use the GET_COUNT, none of the paths I tried worked.
The database is an Oracle 11g, so there's no JSON_TABLE
--oracle 12c or later
SELECT *
FROM JSON_TABLE (
         '[{"name":"Victor", "age":"20"},{"name":"Ana", "age":"23"}]',
         '$[*]'
         COLUMNS (
             NAME VARCHAR2(2000) PATH '$.name',
             AGE VARCHAR2(2000) PATH '$.age'))
--oracle 11g
SELECT *
FROM XMLTABLE (
'/json/row'
PASSING apex_json.to_xmltype (
'[{"name":"Victor", "age":"20"},{"name":"Ana", "age":"23"}]')
COLUMNS
NAME VARCHAR2 (2000) PATH '/row/name',
AGE VARCHAR2 (2000) PATH '/row/age')
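In practice you would point the query at your own column rather than a literal. A sketch for the 12c JSON_TABLE version, assuming a hypothetical table your_table with an id column and a VARCHAR2/CLOB column json_col holding the document:

SELECT t.id, j.name, j.age
FROM your_table t,
     JSON_TABLE(t.json_col, '$[*]'
         COLUMNS (
             name VARCHAR2(2000) PATH '$.name',
             age  VARCHAR2(2000) PATH '$.age')) j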
You can use XMLTABLE along with the APEX_JSON.TO_XMLTYPE() function in order to simulate JSON_TABLE, such as:
WITH t(jsCol) AS
(
SELECT '[
{
"name": "Victor",
"age": "20"
},
{
"name": "Anna",
"age": "23"
}
]'
FROM dual
)
SELECT name, age
FROM t,
XMLTABLE('/json/row'
PASSING APEX_JSON.TO_XMLTYPE(jsCol)
COLUMNS
name VARCHAR2(100) PATH 'name',
age VARCHAR2(100) PATH 'age'
)
NAME   | AGE
Victor | 20
Anna   | 23
With APEX_JSON you can do something like this:
DECLARE
l_json_text VARCHAR2(32767);
l_json_values apex_json.t_values;
BEGIN
l_json_text := '[
{"name":"Victor", "age":"20"},
{"name":"Ana", "age":"23"}
]
';
apex_json.parse(
p_values => l_json_values,
p_source => l_json_text
);
DBMS_OUTPUT.put_line('----------------------------------------');
FOR r IN 1 .. nvl(apex_json.get_count(p_path => '.', p_values => l_json_values),0) loop
dbms_output.put_line(apex_json.get_varchar2(p_path => '[%d].name', p0 => r, p_values => l_json_values));
dbms_output.put_line(apex_json.get_varchar2(p_path => '[%d].age', p0 => r, p_values => l_json_values));
/* insert into your_table
(name,
age
)
VALUES
(
apex_json.get_varchar2(p_path => '[%d].name', p0 => r, p_values => l_json_values),
apex_json.get_varchar2(p_path => '[%d].age', p0 => r, p_values => l_json_values)
);
*/
END loop;
DBMS_OUTPUT.put_line('----------------------------------------');
END;
/
If you can find a proper JSON parser then you should use that; however, if one is not available, you could parse it yourself. From Oracle 11gR2, you can use:
INSERT INTO table_name (name, age)
WITH jsondata (json) AS (
SELECT '[
{"name":"Victor", "age":"20"},
{"name":"Ana", "age":"23"},
{
"name":"Betty",
"age":"24"
},
{
"age":"25",
"name":"Carol"
}
]' FROM DUAL
),
data (json, items, i, name, age) AS (
SELECT json,
REGEXP_COUNT(
json,
'\{\s*"name"\s*:\s*"(.*?)"\s*,\s*"age"\s*:\s*"(.*?)"\s*\}'
|| '|\{\s*"age"\s*:\s*"(.*?)"\s*,\s*"name"\s*:\s*"(.*?)"\s*\}',
1,
'n'
),
1,
REGEXP_SUBSTR(
REGEXP_SUBSTR(
json,
'\{\s*"name"\s*:\s*"(.*?)"\s*,\s*"age"\s*:\s*"(.*?)"\s*\}'
|| '|\{\s*"age"\s*:\s*"(.*?)"\s*,\s*"name"\s*:\s*"(.*?)"\s*\}',
1,
1,
'n'
),
'"name"\s*:\s*"(.*?)"',
1,
1,
'n',
1
),
REGEXP_SUBSTR(
REGEXP_SUBSTR(
json,
'\{\s*"name"\s*:\s*"(.*?)"\s*,\s*"age"\s*:\s*"(.*?)"\s*\}'
|| '|\{\s*"age"\s*:\s*"(.*?)"\s*,\s*"name"\s*:\s*"(.*?)"\s*\}',
1,
1,
'n'
),
'"age"\s*:\s*"(.*?)"',
1,
1,
'n',
1
)
FROM jsondata
UNION ALL
SELECT json,
items,
i + 1,
REGEXP_SUBSTR(
REGEXP_SUBSTR(
json,
'\{\s*"name"\s*:\s*"(.*?)"\s*,\s*"age"\s*:\s*"(.*?)"\s*\}'
|| '|\{\s*"age"\s*:\s*"(.*?)"\s*,\s*"name"\s*:\s*"(.*?)"\s*\}',
1,
i + 1,
'n'
),
'"name"\s*:\s*"(.*?)"',
1,
1,
'n',
1
),
REGEXP_SUBSTR(
REGEXP_SUBSTR(
json,
'\{\s*"name"\s*:\s*"(.*?)"\s*,\s*"age"\s*:\s*"(.*?)"\s*\}'
|| '|\{\s*"age"\s*:\s*"(.*?)"\s*,\s*"name"\s*:\s*"(.*?)"\s*\}',
1,
i + 1,
'n'
),
'"age"\s*:\s*"(.*?)"',
1,
1,
'n',
1
)
FROM data
WHERE i < items
)
SELECT name, age
FROM data;
(Note: the regular expression does not handle escaped quotes in the strings as I am assuming they will not occur in names; however, if they do then instead of .*? you can use (\\(["\/bfnrt]|u[0-9a-fA-F]{4})|[^"])* to match either an escape sequence or a non-quote character.)
Which, given the table:
CREATE TABLE table_name (name VARCHAR2(30), age NUMBER);
Then after the insert:
SELECT * FROM table_name;
Outputs:
NAME   | AGE
Victor | 20
Ana    | 23
Betty  | 24
Carol  | 25
db<>fiddle here
Last time I did that with a CLOB variable.
Try to do it like this:
DECLARE
json_body clob := '[
{"name":"Victor", "age":"20"},
{"name":"Ana", "age":"23"}
]';
BEGIN
FOR items IN (SELECT *
FROM
JSON_TABLE(json_body FORMAT JSON,'$[*]'
COLUMNS (
name_ varchar (200) PATH '$.name',
age_ varchar (200) PATH '$.age')))
LOOP
INSERT INTO T_DATA (
name,
age
) VALUES (
items.name_,
items.age_
);
END LOOP;
END;
/
This will put your data into a table and then you can play with it:
select * from T_DATA;
Resulting in:

name   | age
Victor | 20
Ana    | 23

Optimize - Function that SELECTs from TEMP TABLE within loop to get averages of JSON values

I have a MySQL function that runs as part of a larger query reading a few million records. In order to detect anomalies, I'm figuring out the average change over time. The data in the table is stored as JSON objects with UNIX timestamps as the key for up to 30 days.
As an example, the input (input_array) would look something like:
[{"1532944806": 16}, {"1533031206": 14}, {"1533117605": 13}, {"1533204305": 12}, {"1533290708": 10}, {"1533463506": 9}, {"1533549907": 9}, {"1533636306": 9}, {"1533722707": 9}, {"1533809108": 9}, {"1533895506": 9}, {"1533981906": 8}, {"1534068306": 7}, {"1534154706": 7}, {"1534241108": 7}, {"1534590304": 7}, {"1534673106": 12}, {"1534759508": 6}, {"1534845905": 7}, {"1534932306": 7}, {"1535018707": 5}, {"1535105106": 3}, {"1535191505": 7}, {"1535277907": 6}, {"1535364305": 7}, {"1535450706": 2}, {"1535537107": 1}]
I'm only looking to average decreasing changes - not any change that increases over a day.
I'm checking that a value for the previous day exists, and if so, I'm calculating the change and adding it to a temporary table that then gets queried to select the average.
So far I have:
CREATE FUNCTION `daily_averages`(input_array JSON) RETURNS int(4)
READS SQL DATA
DETERMINISTIC
BEGIN
DECLARE array_length INTEGER(2);
DECLARE prev_value INTEGER(4);
DECLARE idx INTEGER(4);
DROP TEMPORARY TABLE IF EXISTS collection;
CREATE TEMPORARY TABLE collection (`change` INTEGER(4) SIGNED DEFAULT 0);
SELECT JSON_LENGTH(input_array) INTO array_length;
SET idx = 0;
WHILE idx < array_length DO
SELECT
IF(idx-1 > -1,
CONVERT(
JSON_EXTRACT(
JSON_EXTRACT(
JSON_EXTRACT( input_array, CONCAT( '$[', idx-1, ']' ) )
, '$.*'
)
, '$[0]'
), SIGNED INTEGER
)
, -1
)
INTO prev_value;
INSERT INTO collection
SELECT (prev_value -
(
CONVERT(
JSON_EXTRACT(
JSON_EXTRACT(
JSON_EXTRACT( input_array, CONCAT( '$[', idx, ']' ) )
, '$.*'
)
, '$[0]'
), SIGNED INTEGER
)
)
)
FROM DUAL
WHERE prev_value > 0;
SET idx = idx + 1;
END WHILE;
RETURN (SELECT AVG(`change`) FROM collection WHERE `change` > -1);
END
With about 2.7 million records, it takes about 20 minutes to run currently. I'm looking to optimize this or re-write it by avoiding the DROP/CREATE overhead.
It seems unnecessary to create a table just to calculate an average; it's simple to do in the loop. Instead of inserting each value into a table, add it to a total variable. At the end, return total/count.
Since you're totalling the differences between consecutive values, you can simply carry the previous value forward in a variable as you go.
You can also use SET statements to assign variables, rather than SELECT ... INTO variable.
DECLARE array_length INTEGER(2);
DECLARE prev_value INTEGER(4);
DECLARE idx INTEGER(4);
DECLARE total INTEGER(4);
DECLARE counter INTEGER(4);
DECLARE cur_value INTEGER(4);
SET array_length = JSON_LENGTH(input_array);
SET total = 0;
SET counter = 0;
-- Initialize prev_value to the first element
SET prev_value = CONVERT(
JSON_EXTRACT(
JSON_EXTRACT(
JSON_EXTRACT( input_array, '$[0]' )
, '$.*'
)
, '$[0]'
), SIGNED INTEGER
);
SET idx = 1;
WHILE idx < array_length DO
SET cur_value = CONVERT(
JSON_EXTRACT(
JSON_EXTRACT(
JSON_EXTRACT( input_array, CONCAT( '$[', idx, ']' ) )
, '$.*'
)
, '$[0]'
), SIGNED INTEGER
);
IF cur_value < prev_value
THEN
SET total = total + (prev_value - cur_value);
SET counter = counter + 1;
END IF;
SET prev_value = cur_value;
SET idx = idx + 1;
END WHILE;
RETURN total / counter;
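A quick sanity check after recompiling the function (a sketch; the timestamp keys are placeholders, since the function only reads the values):

SELECT daily_averages('[{"100": 16}, {"200": 14}, {"300": 12}]');
-- two decreases of 2 each, so this should return 2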
Digging inside a million JSON strings. I'm amazed it took only 20 minutes.
As you insert the rows, do some calculations and store the results somewhere. Then use that for doing the monitoring.
Even if you can't do it as you insert the rows, do it only to the 'new' rows. Again save the previous info somewhere.
As for DROP/CREATE: that can be sped up by having a permanent table, then using only TRUNCATE TABLE at the start of each proc call.
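A sketch of that change (note the backticks, since CHANGE is a reserved word, and that TRUNCATE performs an implicit commit, so it is fine in a stored procedure but not inside a stored function; there DELETE FROM is the closest substitute):

-- one-time setup, outside the routine:
CREATE TABLE IF NOT EXISTS collection (`change` INT SIGNED DEFAULT 0);

-- at the start of each call, instead of DROP TEMPORARY TABLE / CREATE TEMPORARY TABLE:
TRUNCATE TABLE collection;
-- or, inside a stored function: DELETE FROM collection;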
The (4) in INTEGER(4) does not mean anything. You will always get a 32-bit integer. (This note probably has no impact on the proc.)

Parsing JSON string in Oracle

I have a JSON string in one column in an Oracle 10g database, like:
[{"id":"1","contactBy":"Rajesh Kumar"},{"id":"2","contactBy":"Rakesh Kumar"}]
I have to get the value for ContactBy in that column for one of the reports.
Is there any built-in function to parse the JSON string in Oracle 10g, or any user-defined function to parse the string?
As said by Jens in comments, JSON support is only available from 12c, but you can use regular expressions as a workaround to get what you want:
select regexp_replace(regexp_substr('[{"id": "1", "contactBy":"Rajesh Kumar"},{"id": "2","contactBy": "Emmanuel Test"}]',
'"contactBy":\s*("(\w| )*")', 1, level),
'"contactBy":\s*"((\w| )*)"', '\1', 1, 1) contact
from dual
connect by regexp_substr('[{"id": "1","contactBy":"Rajesh Kumar"},{"id": "2","contactBy": "Emmanuel Test"}]', '"contactBy":\s*("(\w| )*")', 1, level) is not null
;
EDIT: request modified to both handle special characters and display the answers in a single row:
select listagg(contact, ', ') within group (order by lev)
from
(
select regexp_replace(regexp_substr('[{"id": "1", "contactBy":"Rajesh Kumar"},{"id": "2","contactBy": "Emmanuel Test+-"}]',
'"contactBy":\s*(".*?")', 1, level),
'"contactBy":\s*"(.*?)"', '\1', 1, 1) contact, level lev
from dual
connect by regexp_substr('[{"id": "1","contactBy":"Rajesh Kumar"},{"id": "2","contactBy": "Emmanuel Test+-"}]', '"contactBy":\s*(".*?")', 1, level) is not null
)
;
@Emmanuel your code really helped a lot, thank you very much. But your query is taking too much time, so I changed it to a function, which will return the required values.
CREATE OR REPLACE FUNCTION SFGETCRCONTACTBY(INCRID NUMBER) RETURN VARCHAR2 AS
TEMPINT NUMBER :=0;
OUTPUT VARCHAR2(10000) ;
TEMPVAR VARCHAR2(1000);
BEGIN
SELECT REGEXP_COUNT(CR_CONTACT_BY, '"contactBy":\S*(".*?")')
INTO TEMPINT
FROM T_LOAN_REQUEST_MARKET WHERE CR_ID=INCRID;
WHILE TEMPINT > 0
LOOP
SELECT REGEXP_REPLACE(REGEXP_SUBSTR(CR_CONTACT_BY, '"contactBy":\S*(".*?")', 1,TEMPINT), '"contactBy":\S*"(.*?)"', '\1', 1, 1) INTO TEMPVAR
FROM T_LOAN_REQUEST_MARKET WHERE CR_ID=INCRID;
IF OUTPUT IS NULL THEN
OUTPUT := TEMPVAR;
ELSE
OUTPUT := OUTPUT ||',' || TEMPVAR;
END IF;
TEMPINT := TEMPINT-1;
END LOOP;
RETURN OUTPUT;
END;
/
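One way to use it for the report (a sketch against the table referenced in the function):

SELECT CR_ID, SFGETCRCONTACTBY(CR_ID) AS contact_by
FROM T_LOAN_REQUEST_MARKET;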