How to fill missing dates in a SQL table - MySQL

I have a SQL table with discontinuous dates:
CREATE TABLE IF NOT EXISTS date_test1 ( items CHAR(8), trade_date DATE );
INSERT INTO `date_test1` VALUES ('a', '2020-03-20');
INSERT INTO `date_test1` VALUES ('b', '2020-03-20');
INSERT INTO `date_test1` VALUES ('a', '2020-03-21');
INSERT INTO `date_test1` VALUES ('c', '2020-03-22');
INSERT INTO `date_test1` VALUES ('d', '2020-03-22');
INSERT INTO `date_test1` VALUES ('a', '2020-03-25');
INSERT INTO `date_test1` VALUES ('e', '2020-03-26');
In this table, '2020-03-23' and '2020-03-24' are missing. I want to fill them with the previous day's data, which in this table is the '2020-03-22' rows.
Expected result: the same table with the rows ('c', '2020-03-23'), ('d', '2020-03-23'), ('c', '2020-03-24') and ('d', '2020-03-24') added.
The number of consecutive missing dates and the number of records per day are both variable.
So how can I do this in MySQL?

This solution uses Python and assumes the table is small enough for all rows to be read into memory. I do not warrant this code free from defects; use at your own risk. So I suggest you run it against a copy of your table, or make a backup first.
This code uses the pymysql driver.
import sys
from datetime import timedelta
from itertools import groupby

import pymysql

conn = pymysql.connect(db='x', user='x', password='x', charset='utf8mb4', use_unicode=True)
cursor = conn.cursor()
# must be sorted by date:
cursor.execute('select items, trade_date from date_test1 order by trade_date, items')
rows = cursor.fetchall()  # tuples: (str, datetime.date)
if len(rows) == 0:
    sys.exit(0)
# gather the rows into sublists, one per distinct trade_date:
groups = []
for k, g in groupby(rows, key=lambda row: row[1]):
    groups.append(list(g))
one_day = timedelta(days=1)
previous_group = groups.pop(0)
next_date = previous_group[0][1]
for group in groups:
    next_date = next_date + one_day
    while group[0][1] != next_date:
        # missing date: copy the previous day's rows
        for row in previous_group:
            cursor.execute('insert into date_test1(items, trade_date) values(%s, %s)', (row[0], next_date))
            print('inserting', row[0], next_date)
        conn.commit()
        next_date = next_date + one_day
    previous_group = group
Prints:
inserting c 2020-03-23
inserting d 2020-03-23
inserting c 2020-03-24
inserting d 2020-03-24
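As a side note, if a single day can contain many rows, the per-row execute() calls in the inner loop could be batched with pymysql's executemany(). A sketch of a drop-in replacement for the inner for loop:
# Batch all of the copied rows for one missing day into a single round trip.
cursor.executemany(
    'insert into date_test1(items, trade_date) values(%s, %s)',
    [(row[0], next_date) for row in previous_group],
)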
Discussion
With your sample data, after the rows are fetched, rows is:
(('a', datetime.date(2020, 3, 20)), ('b', datetime.date(2020, 3, 20)), ('a', datetime.date(2020, 3, 21)), ('c', datetime.date(2020, 3, 22)), ('d', datetime.date(2020, 3, 22)), ('a', datetime.date(2020, 3, 25)), ('e', datetime.date(2020, 3, 26)))
After the following is run:
groups = []
for k, g in groupby(rows, key=lambda row: row[1]):
    groups.append(list(g))
groups is:
[[('a', datetime.date(2020, 3, 20)), ('b', datetime.date(2020, 3, 20))], [('a', datetime.date(2020, 3, 21))], [('c', datetime.date(2020, 3, 22)), ('d', datetime.date(2020, 3, 22))], [('a', datetime.date(2020, 3, 25))], [('e', datetime.date(2020, 3, 26))]]
That is, all the tuples with the same date are grouped together in a list, so it becomes easier to detect missing dates.
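As an aside, on MySQL 8+ the same gap-filling can be done in pure SQL with a recursive calendar CTE, with no Python involved. A sketch under that assumption (again, test against a copy of the table first):
INSERT INTO date_test1 (items, trade_date)
WITH RECURSIVE cal (d) AS (
    -- every date from the first to the last trade_date
    SELECT MIN(trade_date) FROM date_test1
    UNION ALL
    SELECT d + INTERVAL 1 DAY FROM cal
    WHERE d < (SELECT MAX(trade_date) FROM date_test1)
)
SELECT t.items, cal.d
FROM cal
JOIN date_test1 t
  ON t.trade_date = (SELECT MAX(trade_date) FROM date_test1 WHERE trade_date <= cal.d)
WHERE cal.d NOT IN (SELECT trade_date FROM date_test1);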


sqlalchemy update using list of tuples

Is there an efficient way to update rows based on a list of tuples in SQLAlchemy?
If it's a single row, then I can simply do:
session.query(table).filter(table.id == 10).update({'values': 'x'})
session.commit()
However, the data I'm getting is a list of tuples:
[(10, 'x'), (20, 'y'), (30, 'z'), (40, 'p')]
and the table has IDs 10, 20, 30, 40, etc.
Is there an efficient way to update these instead of issuing multiple individual updates?
You can convert the list of tuples to a list of dicts and then use update() with bindparam() as illustrated in the tutorial:
from pprint import pprint

import sqlalchemy as sa

engine = sa.create_engine("sqlite://")

tbl = sa.Table(
    "tbl",
    sa.MetaData(),
    sa.Column("id", sa.Integer, primary_key=True, autoincrement=False),
    sa.Column("value", sa.String(50)),
)
tbl.create(engine)

with engine.begin() as conn:
    conn.execute(
        tbl.insert(),
        [
            {"id": 10, "value": "old_10"},
            {"id": 20, "value": "old_20"},
            {"id": 30, "value": "old_30"},
        ],
    )

with engine.begin() as conn:
    # initial state
    print(conn.execute(sa.select(tbl)).all())
    # [(10, 'old_10'), (20, 'old_20'), (30, 'old_30')]

    new_data = [(10, "x"), (20, "y"), (30, "z")]
    params = [dict(tbl_id=a, new_value=b) for (a, b) in new_data]
    pprint(params, sort_dicts=False)
    """
    [{'tbl_id': 10, 'new_value': 'x'},
     {'tbl_id': 20, 'new_value': 'y'},
     {'tbl_id': 30, 'new_value': 'z'}]
    """
    upd = (
        sa.update(tbl)
        .values(value=sa.bindparam("new_value"))
        .where(tbl.c.id == sa.bindparam("tbl_id"))
    )
    print(upd)
    # UPDATE tbl SET value=:new_value WHERE tbl.id = :tbl_id
    conn.execute(upd, params)

    # check results
    print(conn.execute(sa.select(tbl)).all())
    # [(10, 'x'), (20, 'y'), (30, 'z')]
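Since the question uses a Session, note that the same statement and parameter list can also be executed through the ORM session rather than a Core connection. A minimal sketch, assuming SQLAlchemy 1.4+:
from sqlalchemy.orm import Session

# session.execute() accepts the same UPDATE with bindparam()s plus the
# list of parameter dicts, and runs it in executemany mode.
with Session(engine) as session:
    session.execute(upd, params)
    session.commit()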

How to generate a JSON value array in Azure SQL

DECLARE @segArr NVARCHAR(max);
SET @segArr = N'[1,2,3]';

DECLARE @segTb TABLE (
    k INT,
    v NVARCHAR(20)
);
INSERT INTO @segTb
VALUES
    (0, 'a'),
    (1, 'b'),
    (2, 'c'),
    (3, 'd'),
    (4, 'e'),
    (5, 'f');

SELECT t.v
FROM @segTb t
JOIN OPENJSON(@segArr) a ON a.[key] = t.k
FOR JSON AUTO;
I have a simple table with a key-value-like structure, and a JSON array that is a list of the keys of the values I want.
The SELECT statement produces the desired rows, but the JSON format is wrong: it outputs an array of objects.
[
{
"v": "a"
},
{
"v": "b"
},
{
"v": "c"
}
]
But what I needed is an array of direct values.
[ "a", "b", "c" ]
You can use more conventional string manipulation methods to create JSON arrays in Azure SQL DB, such as STRING_AGG, which aggregates strings with a given separator (e.g. a comma), and QUOTENAME, which surrounds strings with a given quote character. A simple example:
SELECT QUOTENAME( STRING_AGG( QUOTENAME(v, '"'), ',' ), '[' ) AS yourArray
FROM
(
    SELECT t.v
    FROM @segTb t
    INNER JOIN OPENJSON(@segArr) a ON a.[key] = t.k
) x;
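Note that QUOTENAME does not escape double quotes embedded in the values themselves; STRING_ESCAPE(v, 'json') can help there. Alternatively, newer Azure SQL Database builds have native JSON aggregates; assuming JSON_ARRAYAGG is available, it produces the bare array directly:
-- Sketch, assuming JSON_ARRAYAGG is available in your database.
SELECT JSON_ARRAYAGG(t.v) AS yourArray
FROM @segTb t
INNER JOIN OPENJSON(@segArr) a ON a.[key] = t.k;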
For your particular example you could instead create a user-defined table type with CREATE TYPE. This has the advantage that you can add a primary key to the type, guaranteeing duplicates cannot be added, giving the optimizer a bit more information at run time, and using native relational abilities rather than bolted-on NoSQL ones. A simple example which should run end to end:
IF NOT EXISTS ( SELECT * FROM sys.types st INNER JOIN sys.schemas ss ON st.schema_id = ss.schema_id WHERE st.name = N'tvpItems' AND ss.name = N'dbo' )
    CREATE TYPE dbo.tvpItems AS TABLE
    (
        k INT PRIMARY KEY
    );
GO

DECLARE @items AS dbo.tvpItems;
INSERT INTO @items VALUES ( 1 ), ( 2 ), ( 3 );

DECLARE @segTb TABLE (
    k INT,
    v NVARCHAR(20)
);
INSERT INTO @segTb
VALUES
    (0, 'a'),
    (1, 'b'),
    (2, 'c'),
    (3, 'd'),
    (4, 'e'),
    (5, 'f');

SELECT t.v
FROM @segTb t
INNER JOIN @items i ON i.k = t.k
FOR JSON AUTO;
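As a usage note, table types like this are typically passed as READONLY parameters to stored procedures. A hypothetical sketch, where dbo.Segments stands in for a permanent version of the key-value table and the procedure name is illustrative:
CREATE PROCEDURE dbo.GetSegmentValues @items dbo.tvpItems READONLY
AS
SELECT s.v
FROM dbo.Segments s
INNER JOIN @items i ON i.k = s.k;
GO
DECLARE @ids AS dbo.tvpItems;
INSERT INTO @ids VALUES ( 1 ), ( 2 ), ( 3 );
EXEC dbo.GetSegmentValues @items = @ids;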

Parse data from a CSV file into a given format using Prolog

I have a CSV file that contains data as:
  A B C
A - 4 5
B 8 - 6
C 2 3 -
I want to have facts in the following form:
num(a,b,4).
num(a,c,5).
num(b,a,8).
num(b,c,6).
num(c,a,2).
num(c,b,3).
There should not be facts for identical letters, like num(a,a,-).
I am using Prolog's csv_read_file as:
csv_read_file(Path, Rows, [functor(num), arity(4)]), maplist(assert, Rows).
and it's giving me the output:
Rows = [num('', 'A', 'B', 'C'), num('A', -, 4, 5), num('B', 8, -, 6), num('C', 2, 3, -)]
It seems to be a basic question, but I am not able to work out the conditions to perform this. Any help will be highly appreciated.
As per Isabelle Newbie's answer:
Open :- csv_read_file('Path', Rows, [functor(num), arity(4)]), table_entry(Rows, Row).
header_row_entry(Header, Row, Entry) :-
    arg(1, Row, RowName),
    functor(Header, _, Arity),
    between(2, Arity, ArgIndex),
    arg(ArgIndex, Header, ColumnName),
    arg(ArgIndex, Row, Value),
    Entry = num(RowName, ColumnName, Value),
    writeln(Entry).

table_entry(Entries, Entry) :-
    Entries = [Header | Rows],
    member(Row, Rows),
    header_row_entry(Header, Row, Entry).
Now, can anyone explain how and where I should use maplist to convert the rows into facts (neglecting the filtering of '-' and lowercasing for now), so that when I query:
?-num(A,B,X).
I should get:
X=4
The next task is to implement a depth-first search algorithm on it. Any details regarding this will be highly appreciated.
Consider a table header num('', 'A', 'B', 'C') and a row in the table num('B', 8, -, 6). From this you want to compute a table entry identified by the row's name, which here is 'B', and by a column name: the column name being 'A' for the first value (8), 'B' for the second (-), 'C' for the third (6).
Here's a simple way to do this, involving some typing and the obligatory copy-and-paste errors:
header_row_entry(Header, Row, Entry) :-
    Header = num('', ColumnName, _, _),
    Row = num(RowName, Value, _, _),
    Entry = num(RowName, ColumnName, Value).
header_row_entry(Header, Row, Entry) :-
    Header = num('', _, ColumnName, _),
    Row = num(RowName, _, Value, _),
    Entry = num(RowName, ColumnName, Value).
header_row_entry(Header, Row, Entry) :-
    Header = num('', _, _, ColumnName),
    Row = num(RowName, _, _, Value),
    Entry = num(RowName, ColumnName, Value).
This enumerates all the entries in a row on backtracking:
?- Header = num('', 'A', 'B', 'C'), Row = num('B', 8, -, 6),
header_row_entry(Header, Row, Entry).
Header = num('', 'A', 'B', 'C'),
Row = num('B', 8, -, 6),
Entry = num('B', 'A', 8) ;
Header = num('', 'A', 'B', 'C'),
Row = num('B', 8, -, 6),
Entry = num('B', 'B', -) ;
Header = num('', 'A', 'B', 'C'),
Row = num('B', 8, -, 6),
Entry = num('B', 'C', 6).
To enumerate all the entries in an entire table, it remains to enumerate all rows and then enumerate the row entries as above. Here it is:
table_entry(Entries, Entry) :-
    Entries = [Header | Rows],
    member(Row, Rows),
    header_row_entry(Header, Row, Entry).
And now, given your table:
?- Table = [num('', 'A', 'B', 'C'), num('A', -, 4, 5), num('B', 8, -, 6), num('C', 2, 3, -)], table_entry(Table, Entry).
Table = [num('', 'A', 'B', 'C'), num('A', -, 4, 5), num('B', 8, -, 6), num('C', 2, 3, -)],
Entry = num('A', 'A', -) ;
Table = [num('', 'A', 'B', 'C'), num('A', -, 4, 5), num('B', 8, -, 6), num('C', 2, 3, -)],
Entry = num('A', 'B', 4) ;
Table = [num('', 'A', 'B', 'C'), num('A', -, 4, 5), num('B', 8, -, 6), num('C', 2, 3, -)],
Entry = num('A', 'C', 5) ;
Table = [num('', 'A', 'B', 'C'), num('A', -, 4, 5), num('B', 8, -, 6), num('C', 2, 3, -)],
Entry = num('B', 'A', 8) ;
Table = [num('', 'A', 'B', 'C'), num('A', -, 4, 5), num('B', 8, -, 6), num('C', 2, 3, -)],
Entry = num('B', 'B', -) . % etc.
Depending on what you want exactly, it remains to lowercase the row and column names (the irritatingly named downcase_atom in SWI-Prolog, for example) and filter out the - entries. You can then assert the entries using a failure-driven loop or by collecting all of them using findall and asserting using maplist.
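For instance, the collect-then-assert variant could look like this. A minimal sketch, assuming the table_entry/2 predicate above and SWI-Prolog's downcase_atom/2:
assert_table(Table) :-
    findall(num(R, C, V),
            ( table_entry(Table, num(R0, C0, V)),
              V \== (-),                % filter out the '-' entries
              downcase_atom(R0, R),     % 'A' -> a
              downcase_atom(C0, C)
            ),
            Entries),
    maplist(assertz, Entries).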
Now that we have a working solution, we might want header_row_entry to be a bit nicer. We can use arg/3 to capture more explicitly that we are trying to pair a column name and a value that are at the same argument position in their respective header and row terms:
header_row_entry(Header, Row, Entry) :-
    arg(1, Row, RowName),
    functor(Header, _, Arity),
    between(2, Arity, ArgIndex),
    arg(ArgIndex, Header, ColumnName),
    arg(ArgIndex, Row, Value),
    Entry = num(RowName, ColumnName, Value).
This is shorter than the above and applicable to any number of columns in the table.

MySQL SUM() with exclusion of records

I need to calculate fin_sum per visit_number and industry code, but I need to exclude from the calculation any records whose id appears in another table; if an id does not exist in that table, its records should be included.
I have a table with the following structure:
create temporary table client_transactions_final
(
id int,
fin_amount decimal (6,2),
ind_code varchar(10),
visit_number int
);
insert into client_transactions_final values
(1, 100, 'Ind 1', 2),
(1, 300, 'Ind 2', 3),
(2, 300, 'Ind 3', 4),
(2, 100, 'Ind 1', 2),
(3, 300, 'Ind 2', 3),
(4, 300, 'Ind 3', 5),
(5, 100, 'Ind 1', 2),
(6, 300, 'Ind 2', 5),
(6, 300, 'Ind 3', 4);
create temporary table term_map
(
id int
);
insert into term_map values
(2),
(4);
From this table I am running a SELECT which does a SUM grouped by visit_number and industry code:
SELECT visit_number,
       CASE WHEN id IN (SELECT id FROM term_map) THEN
           -- sum(fin_amount), but do not include ids from SELECT id FROM term_map
       ELSE
           SUM(fin_amount)
       END revenue
FROM client_transactions_final
GROUP BY visit_number, ind_code
However, I need to calculate fin_sum per visit_number and industry code while excluding records whose id is in the other table, and including those whose id is not. I have already tried different approaches but none of them work. Any ideas how to do it?
Try this:
SELECT visit_number,
       SUM(fin_amount) AS revenue
FROM client_transactions_final
WHERE id NOT IN (SELECT id FROM term_map)
GROUP BY visit_number, ind_code;
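One caveat: if term_map.id can ever be NULL, NOT IN returns no rows at all, because comparisons with NULL are never true. An anti-join with NOT EXISTS avoids that pitfall; an equivalent sketch:
SELECT visit_number,
       SUM(fin_amount) AS revenue
FROM client_transactions_final t
WHERE NOT EXISTS (SELECT 1 FROM term_map m WHERE m.id = t.id)
GROUP BY visit_number, ind_code;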

Select/Query calculate time between timestamps without weekends

The problem presented is to calculate, for each row returned, the time ("ResponseTime") between two timestamps ("StartDateTime" and "EndDateTime"), excluding weekends. It does not take work hours or holidays into consideration.
Weekends in this case are defined as Saturday 00:00:00 to Sunday 23:59:59.
I had a tough time coming up with a solution for this question, so I thought I would share my final product. I found lots of solutions online, but most either used a calendar table, which I couldn't use in this application, or had logic I didn't understand. My solution is shared below. Please feel free to offer your own solution to the problem or to correct any errors you see in my code. Regards,
EDIT: As per comments provided by @JuanCarlosOropeza, the solution I presented is not optimal. I am providing sample data so he can put forward a different solution. If anyone else has improvements, feel free to participate.
CREATE TABLE SourceTable
(`id` int, `StartDateTime` datetime, `EndDateTime` datetime)
;
INSERT INTO SourceTable
(`id`, `StartDateTime`, `EndDateTime`)
VALUES
(1, '2016-09-20 12:52:00', '2016-09-23 13:15:00'),
(2, '2016-09-19 19:15:00', '2016-09-22 19:15:00'),
(3, '2016-09-01 10:35:00', '2016-09-06 13:15:00'),
(4, '2016-09-26 10:34:00', '2016-09-29 11:25:00'),
(5, '2016-09-01 13:01:00', '2016-09-06 14:55:00'),
(6, '2016-09-05 02:21:00', '2016-09-08 19:15:00'),
(7, '2016-09-27 14:14:00', '2016-10-01 19:15:00'),
(8, '2016-09-27 04:18:00', '2016-09-30 14:15:00'),
(9, '2016-09-01 14:50:00', '2016-09-06 17:25:00'),
(10, '2016-09-20 12:52:00', '2016-09-23 13:15:00'),
(11, '2016-09-26 02:14:00', '2016-09-29 10:15:00'),
(12, '2016-09-01 12:04:00', '2016-09-06 17:05:00'),
(13, '2016-09-20 15:30:00', '2016-09-23 15:15:00'),
(14, '2016-09-02 16:04:00', '2016-09-07 20:55:00'),
(15, '2016-09-23 10:41:00', '2016-09-28 13:05:00'),
(16, '2016-09-27 16:28:00', '2016-10-01 13:15:00'),
(17, '2016-09-27 15:33:00', '2016-10-01 22:45:00'),
(18, '2016-09-20 12:53:00', '2016-09-23 13:25:00'),
(19, '2016-09-19 13:49:00', '2016-09-22 13:05:00'),
(20, '2016-09-20 13:46:00', '2016-09-23 13:15:00'),
(21, '2016-09-01 16:32:00', '2016-09-06 18:05:00'),
(22, '2016-09-01 10:35:00', '2016-09-06 22:45:00'),
(23, '2016-09-26 12:40:00', '2016-09-29 12:35:00'),
(24, '2016-09-27 10:37:00', '2016-09-30 21:25:00'),
(25, '2016-09-27 09:41:00', '2016-09-30 15:15:00'),
(26, '2016-09-16 02:09:00', '2016-09-21 10:05:00'),
(27, '2016-09-20 15:13:00', '2016-09-23 15:15:00'),
(28, '2016-09-20 15:30:00', '2016-09-23 15:15:00'),
(29, '2016-09-27 09:55:00', '2016-09-30 13:25:00'),
(30, '2016-09-27 04:18:00', '2016-09-30 14:15:00')
;
I created this solution based on the following logic assumptions (a worked example follows the list).
StartDateTime always occurs before EndDateTime (though I had some rows where it didn't, and the time difference was still calculated correctly)
Week StartDateTime occurred: WEEK(StartDateTime,1)
Week EndDateTime occurred: WEEK(EndDateTime,1)
Start of weekend of week StartDateTime: ADDDATE(TIMESTAMP(DATE(StartDateTime),'00:00:00'),5-WEEKDAY(StartDateTime))
Start of workweek after first weekend: ADDDATE(TIMESTAMP(DATE(StartDateTime),'00:00:00'),7-WEEKDAY(StartDateTime))
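As a worked example of the last two expressions: for StartDateTime = '2016-09-20 12:52:00' (a Tuesday, so WEEKDAY() returns 1), the weekend start evaluates to ADDDATE('2016-09-20 00:00:00', 4) = '2016-09-24 00:00:00' (a Saturday), and the following work week starts at ADDDATE('2016-09-20 00:00:00', 6) = '2016-09-26 00:00:00' (a Monday).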
Full Query:
SELECT
    id,
    StartDateTime,
    EndDateTime,
    CASE
        WHEN ( WEEK(EndDateTime,1) = WEEK(StartDateTime,1) )
        THEN
            CASE
                WHEN ( StartDateTime >= ADDDATE(TIMESTAMP(DATE(StartDateTime),'00:00:00'), 5-WEEKDAY(StartDateTime)) )
                THEN SEC_TO_TIME(0)
                ELSE
                    CASE
                        WHEN ( EndDateTime >= ADDDATE(TIMESTAMP(DATE(StartDateTime),'00:00:00'), 5-WEEKDAY(StartDateTime)) )
                        THEN ( TIMEDIFF(ADDDATE(TIMESTAMP(DATE(StartDateTime),'00:00:00'), 5-WEEKDAY(StartDateTime)), StartDateTime) )
                        ELSE ( TIMEDIFF(EndDateTime, StartDateTime) )
                    END
            END
        ELSE
            CASE
                WHEN ( StartDateTime >= ADDDATE(TIMESTAMP(DATE(StartDateTime),'00:00:00'), 5-WEEKDAY(StartDateTime)) )
                THEN
                    CASE
                        WHEN ( EndDateTime >= ADDDATE(ADDDATE(TIMESTAMP(DATE(StartDateTime),'00:00:00'), 5-WEEKDAY(StartDateTime)), (WEEK(EndDateTime,1) - WEEK(StartDateTime,1)) * 7) )
                        THEN ( SEC_TO_TIME(120*3600*(WEEK(EndDateTime,1) - WEEK(StartDateTime,1))) )
                        ELSE ( SEC_TO_TIME(120*3600*(WEEK(EndDateTime,1) - WEEK(StartDateTime,1) - 1) + TIME_TO_SEC(TIMEDIFF(EndDateTime, ADDDATE(ADDDATE(TIMESTAMP(DATE(StartDateTime),'00:00:00'), 7-WEEKDAY(StartDateTime)), 7*(WEEK(EndDateTime,1) - WEEK(StartDateTime,1) - 1))))) )
                    END
                ELSE
                    CASE
                        WHEN ( EndDateTime >= ADDDATE(ADDDATE(TIMESTAMP(DATE(StartDateTime),'00:00:00'), 5-WEEKDAY(StartDateTime)), (WEEK(EndDateTime,1) - WEEK(StartDateTime,1)) * 7) )
                        THEN ( SEC_TO_TIME(120*3600*(WEEK(EndDateTime,1) - WEEK(StartDateTime,1)) + TIME_TO_SEC(TIMEDIFF(ADDDATE(TIMESTAMP(DATE(StartDateTime),'00:00:00'), 5-WEEKDAY(StartDateTime)), StartDateTime))) )
                        ELSE ( SEC_TO_TIME(120*3600*(WEEK(EndDateTime,1) - WEEK(StartDateTime,1) - 1) + TIME_TO_SEC(TIMEDIFF(EndDateTime, ADDDATE(ADDDATE(TIMESTAMP(DATE(StartDateTime),'00:00:00'), 7-WEEKDAY(StartDateTime)), 7*(WEEK(EndDateTime,1) - WEEK(StartDateTime,1) - 1)))) + TIME_TO_SEC(TIMEDIFF(ADDDATE(TIMESTAMP(DATE(StartDateTime),'00:00:00'), 5-WEEKDAY(StartDateTime)), StartDateTime))) )
                    END
            END
    END AS ResponseTime
FROM SourceTable;
The first CASE checks whether both timestamps fall in the same week. The second layer checks whether StartDateTime falls on the first weekend. The third layer checks whether EndDateTime falls on a weekend. Based on these checks, the correct calculation is selected.
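As a worked example of the first branch: for row 1, StartDateTime = '2016-09-20 12:52:00' and EndDateTime = '2016-09-23 13:15:00' fall in the same week and both precede the weekend, so ResponseTime is simply TIMEDIFF('2016-09-23 13:15:00', '2016-09-20 12:52:00') = '72:23:00'. Two caveats to be aware of (general MySQL behavior, not specific to this query): TIME values are capped at '838:59:59', so TIMEDIFF and SEC_TO_TIME clip spans longer than about 34 days, and WEEK()-based arithmetic breaks for intervals that cross a year boundary.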