MySQL slow performance with GROUP BY / ORDER BY

I am using MySQL 5.7. I have a table with 7,006,500 rows. My query does a GROUP BY and, for each group, fetches the row with the maximum count, on a column that is already indexed, but it still takes a long time to execute. Below are my query, execution plan, and table schema.
Table Schema
CREATE TABLE templog (
id bigint(20) unsigned NOT NULL AUTO_INCREMENT,
userid bigint(12) unsigned NOT NULL,
type tinyint(3) NOT NULL DEFAULT '0',
os tinyint(4) NOT NULL DEFAULT '0',
day date DEFAULT NULL,
activetime smallint(5) unsigned NOT NULL DEFAULT '0',
createdat datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
timegroupid tinyint(4) NOT NULL DEFAULT '0',
PRIMARY KEY (`id`),
KEY templog_type_IDX (`type`,`day`,`userid`,`timegroupid`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=7006500 DEFAULT CHARSET=utf8;
My Query:-
SELECT SQL_NO_CACHE y.userid, y.timegroupid as besttime,y.cnt
FROM (
SELECT @row_number := CASE WHEN @userid = x.userid THEN @row_number + 1 ELSE 1 END AS row_number,
@userid := x.userid AS userid, x.cnt, x.timegroupid
FROM (
SELECT userid, timegroupid ,COUNT(userid) as cnt
from templog
where type = 3
AND day BETWEEN '2020-01-01' AND '2020-01-20'
AND userid < 771267
GROUP by userid, timegroupid
ORDER by userid DESC ,cnt DESC
) x,
( SELECT @row_number:=0, @userid:='') AS t
) y
where y.row_number = 1
ORDER by y.userid DESC
LIMIT 1000;
Query Explain format:
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "12.00"
},
"ordering_operation": {
"using_filesort": true,
"table": {
"table_name": "y",
"access_type": "ref",
"possible_keys": [
"<auto_key0>"
],
"key": "<auto_key0>",
"used_key_parts": [
"row_number"
],
"key_length": "9",
"ref": [
"const"
],
"rows_examined_per_scan": 10,
"rows_produced_per_join": 10,
"filtered": "100.00",
"cost_info": {
"read_cost": "10.00",
"eval_cost": "2.00",
"prefix_cost": "12.00",
"data_read_per_join": "320"
},
"used_columns": [
"row_number",
"userid",
"cnt",
"timegroupid"
],
"attached_condition": "((`y`.`row_number` <=> 1))",
"materialized_from_subquery": {
"using_temporary_table": true,
"dependent": false,
"cacheable": true,
"query_block": {
"select_id": 2,
"cost_info": {
"query_cost": "6441.25"
},
"nested_loop": [
{
"table": {
"table_name": "t",
"access_type": "system",
"rows_examined_per_scan": 1,
"rows_produced_per_join": 1,
"filtered": "100.00",
"cost_info": {
"read_cost": "0.00",
"eval_cost": "0.20",
"prefix_cost": "0.00",
"data_read_per_join": "16"
},
"used_columns": [
"#row_number:=0",
"#userid:=''"
],
"materialized_from_subquery": {
"using_temporary_table": true,
"dependent": false,
"cacheable": true,
"query_block": {
"select_id": 4,
"message": "No tables used"
}
}
}
},
{
"table": {
"table_name": "x",
"access_type": "ALL",
"rows_examined_per_scan": 25725,
"rows_produced_per_join": 25725,
"filtered": "100.00",
"cost_info": {
"read_cost": "1296.25",
"eval_cost": "5145.00",
"prefix_cost": "6441.25",
"data_read_per_join": "602K"
},
"used_columns": [
"userid",
"timegroupid",
"cnt"
],
"materialized_from_subquery": {
"using_temporary_table": true,
"dependent": false,
"cacheable": true,
"query_block": {
"select_id": 3,
"cost_info": {
"query_cost": "140807.11"
},
"ordering_operation": {
"using_filesort": true,
"grouping_operation": {
"using_temporary_table": true,
"using_filesort": false,
"table": {
"table_name": "templog",
"access_type": "range",
"possible_keys": [
"templog_type_IDX"
],
"key": "templog_type_IDX",
"used_key_parts": [
"type",
"day"
],
"key_length": "13",
"rows_examined_per_scan": 694718,
"rows_pr
oduced_per_join": 25725,
"filtered": "33.33",
"using_index": true,
"cost_info": {
"read_cost": "1863.51",
"eval_cost": "5145.03",
"prefix_cost": "140807.11",
"data_read_per_join": "803K"
},
"used_columns": [
"id",
"userid",
"type",
"day",
"timegroupid"
],
"attached_condition": "((`templog`.`type` = 3) and (`templog`.`day` between '2020-01-01' and '2020-01-20') and (`templog`.`userid` < 771267))"
}
}
}
}
}
}
}
]
}
}
}
}
}
}
Is there any other way to optimize this query, change the index order, or rewrite the query in another way for better performance?

Do not count on @variables working like you would expect them to. I think the next version is beginning to disallow them.
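On MySQL 8.0+, the same "keep row 1 per user" logic can be written with a window function instead of @variables. A rough sketch (not the poster's query, and untested against this data):
SELECT userid, timegroupid AS besttime, cnt
FROM (
    SELECT userid, timegroupid, COUNT(*) AS cnt,
           ROW_NUMBER() OVER (PARTITION BY userid ORDER BY COUNT(*) DESC) AS rn
    FROM templog
    WHERE type = 3
      AND day BETWEEN '2020-01-01' AND '2020-01-20'
      AND userid < 771267
    GROUP BY userid, timegroupid
) x
WHERE rn = 1
ORDER BY userid DESC
LIMIT 1000;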
The optimizer is free to throw away the ORDER BY in the derived table. This will lead to wrong results. Tacking on a large LIMIT to the subquery may prevent that.
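For example, the derived table from the question could carry its ORDER BY like this (a sketch of the workaround; the limit value is arbitrary, just "large enough"):
SELECT userid, timegroupid, COUNT(userid) AS cnt
FROM templog
WHERE type = 3
  AND day BETWEEN '2020-01-01' AND '2020-01-20'
  AND userid < 771267
GROUP BY userid, timegroupid
ORDER BY userid DESC, cnt DESC
LIMIT 9999999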
Build and maintain a "summary table". This can significantly speed up this and similar queries.
CREATE TABLE Summary (
userid ...,
timegroupid ...,
type ...,
day ...,
cnt SMALLINT UNSIGNED NOT NULL, -- COUNT(*)
tottime INT UNSIGNED NOT NULL, -- SUM(activetime)
PRIMARY KEY(timegroupid, userid, type, day)
);
However, without understanding the data better, I cannot predict whether this table will be noticeably smaller than the original. If it is not significantly smaller, this summary table will not be practical.
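As a sketch of how such a table could be kept current (assuming a nightly job that loads the previous day; the ON DUPLICATE KEY branch only matters if a day can be re-processed):
INSERT INTO Summary (userid, timegroupid, type, day, cnt, tottime)
    SELECT userid, timegroupid, type, day, COUNT(*), SUM(activetime)
    FROM templog
    WHERE day = CURDATE() - INTERVAL 1 DAY
    GROUP BY userid, timegroupid, type, day
ON DUPLICATE KEY UPDATE
    cnt = cnt + VALUES(cnt),
    tottime = tottime + VALUES(tottime);
The original query would then aggregate over Summary (summing cnt per userid, timegroupid for the 20-day range) instead of scanning templog.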
I added another tag -- follow it for more discussion of groupwise-max.


Why does selecting one more column make MySQL use index condition pushdown?

In the test database employees, the employees table has 300,000+ rows.
CREATE TABLE employees (
emp_no INT NOT NULL,
birth_date DATE NOT NULL,
first_name VARCHAR(14) NOT NULL,
last_name VARCHAR(16) NOT NULL,
gender ENUM ('M','F') NOT NULL,
hire_date DATE NOT NULL,
PRIMARY KEY (emp_no)
);
alter table employees add index IDX_LAST_NAME(last_name);
explain
select last_name, emp_no
from employees
where last_name in ('Aamodt', 'Aaaa', 'Ab', 'Ac')
order by last_name asc, emp_no asc
limit 100;
shows
It uses a covering index for the WHERE filter and the ORDER BY.
explain format=JSON
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "43.97"
},
"ordering_operation": {
"using_filesort": false,
"table": {
"table_name": "employees",
"access_type": "range",
"possible_keys": [
"IDX_LAST_NAME"
],
"key": "IDX_LAST_NAME",
"used_key_parts": [
"last_name"
],
"key_length": "66",
"rows_examined_per_scan": 208,
"rows_produced_per_join": 208,
"filtered": "100.00",
"using_index": true,
"cost_info": {
"read_cost": "23.17",
"eval_cost": "20.80",
"prefix_cost": "43.97",
"data_read_per_join": "27K"
},
"used_columns": [
"emp_no",
"last_name"
],
"attached_condition": "(`employees`.`employees`.`last_name` in ('Aamodt','Aaaa','Ab','Ac'))"
}
}
}
}
While
explain
select last_name, emp_no, first_name
from employees
where last_name in ('Aamodt', 'Aaaa', 'Ab', 'Ac')
order by last_name asc, emp_no asc
limit 100;
shows
explain format=JSON
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "168.10"
},
"ordering_operation": {
"using_filesort": false,
"table": {
"table_name": "employees",
"access_type": "range",
"possible_keys": [
"IDX_LAST_NAME"
],
"key": "IDX_LAST_NAME",
"used_key_parts": [
"last_name"
],
"key_length": "66",
"rows_examined_per_scan": 208,
"rows_produced_per_join": 208,
"filtered": "100.00",
"index_condition": "(`employees`.`employees`.`last_name` in ('Aamodt','Aaaa','Ab','Ac'))",
"cost_info": {
"read_cost": "147.30",
"eval_cost": "20.80",
"prefix_cost": "168.10",
"data_read_per_join": "27K"
},
"used_columns": [
"emp_no",
"first_name",
"last_name"
]
}
}
}
}
I think it uses the IDX_LAST_NAME index for the WHERE filter and the ORDER BY, but it also needs to retrieve the full table rows.
But why does the second SQL use index condition pushdown? Considering performance, does the second SQL only cost more than the first because it has to retrieve the full rows?
These must be InnoDB tables. Indexes in that storage engine always have the PK appended to them, so your first query can be covered by the index on last_name.
Your second query probably uses your index on last_name for two things: finding the right rows and handling the ORDER BY last_name ... LIMIT 100 operation without sorting the entire table. But it still must hit the clustered index implementing the primary key to retrieve the first_name.
To see more details use EXPLAIN ANALYZE FORMAT=TREE on MySQL or ANALYZE FORMAT=JSON on MariaDB to retrieve your execution plan.
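For instance, on MySQL 8.0.18+ (this syntax does not exist on 5.x):
EXPLAIN ANALYZE
SELECT last_name, emp_no, first_name
FROM employees
WHERE last_name IN ('Aamodt', 'Aaaa', 'Ab', 'Ac')
ORDER BY last_name ASC, emp_no ASC
LIMIT 100;
The tree output includes actual row counts and timings per step, so you can see where the time really goes.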
To know whether your query is unacceptably slow, you need to test it. If your second query is unacceptably slow you can create an index to cover it.
CREATE INDEX l_e_f ON employees (last_name, emp_no, first_name);

Query runs duration: fast / fetch: slow, but GROUP BY (or any aggregate function) makes it duration: slow / fetch: fast

I have a table process_execution_data (29.4M rows) that consists of
id | key_name | value | piid | created | modified | process_id
-------+------------+------------+------------+-------------+-------------+-----------
<int> | <varchar> | <longtext> | <varchar> | <datetime> | <datetime> | <int(11)>
Basically, this table holds the key/value pairs of the variables that exist during an execution (piid) of a process (process_id).
There are lots of indexes (single-column and composite); pretty much all columns and combinations of columns are covered.
An "average" process/piid has roughly 60 distinct key_name values.
For this specific database/table, there are roughly 30 different process_id values.
SELECT
p.name as process_name,
ped.value as fw_spaceleft_disk_C
FROM process_execution_data ped
INNER JOIN process p ON ped.process_id = p.id -- This table 'process' is just a {id:name} table, 30~ rows
WHERE
ped.key_name = 'fw_disk_space_left_c'
This query runs (Duration: 0.375s, Fetch: ~40s) and returns 238k rows.
If I add a GROUP BY ped.process_id it now returns the expected 30 rows, but takes (Duration: 50s, Fetch: 0.000s)
EXPLAIN SELECT with the GROUP BY.
id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra
---+-------------+-------+------------+------------+---------------+---------------+---------+-------------------+-------+----------+----------------------------------------------
1 | SIMPLE | p | | index | id_name,id | id_name | 519 | | 66 | 100.00 | Using index; Using temporary; Using filesort
1 | SIMPLE | ped | | ref | (5 indexes)** | processid_key | 519 | brain.p.id,const | 31 | 100.00 |
**all,key_piid_modified_created,processid_key,processid,key
My current understanding is that Duration is the time it takes the database/engine to process the query and produce the beginning of the results, whereas Fetch is how long it takes to completely send/transfer all the rows of data.
Therefore, if a query has a fast Duration but a long Fetch, the query itself is optimized/fast but the result set is simply too large; if that is addressed somehow (e.g. by reducing the number of rows returned), it should become Duration: fast and Fetch: fast, and the problem is solved.
What I don't quite understand is how the engine can process a query that produces 283k rows of data in a blazing-fast 0.375s, yet when it is told to GROUP BY it takes more than 20x that time to "simply" group the results.
Questions:
a) Is my understanding of Duration/Fetch wrong?
b) Is it "normal" for an extremely fast query (Duration 0.375s) to become slow (Duration 50s) because of a GROUP BY?
c) Is there anything that can be optimized? 40-50s (the sum of Duration and Fetch, for either query) feels longer than it should be.
Additional data as requested by Rick James
SHOW CREATE TABLE `process_execution_data`
>>>
CREATE TABLE `process_execution_data` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`key_name` varchar(512) NOT NULL,
`value` longtext,
`piid` varchar(512) NOT NULL,
`created` datetime DEFAULT NULL,
`modified` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
`process_id` int(11) DEFAULT NULL,
`flags` varchar(512) DEFAULT NULL,
PRIMARY KEY (`id`),
KEY `all` (`process_id`,`key_name`,`modified`,`created`),
KEY `key_piid_modified_created` (`key_name`,`piid`,`modified`,`created`),
KEY `processid_key` (`process_id`,`key_name`),
KEY `processid` (`process_id`),
KEY `key` (`key_name`),
KEY `piid` (`piid`),
KEY `created` (`created`),
KEY `modified` (`modified`)
) ENGINE=InnoDB AUTO_INCREMENT=31134333 DEFAULT CHARSET=latin1
SHOW CREATE TABLE `process`
>>>
CREATE TABLE `process` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`name` varchar(512) DEFAULT NULL,
`status` varchar(512) DEFAULT NULL,
`description` varchar(512) DEFAULT NULL,
`bpm` varchar(512) DEFAULT NULL,
KEY `id_name` (`id`,`name`),
KEY `id` (`id`),
KEY `name` (`name`)
) ENGINE=InnoDB AUTO_INCREMENT=261 DEFAULT CHARSET=latin1
and
EXPLAIN FORMAT=JSON SELECT
p.name as process_name,
ped.value as fw_spaceleft_disk_C
FROM process_execution_data ped
INNER JOIN process p ON ped.process_id = p.id -- This table 'process' is just a {id:name} table, 30~ rows
WHERE
ped.key_name = 'fw_disk_space_left_c'
>>>
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "2.42"
},
"nested_loop": [
{
"table": {
"table_name": "ped",
"access_type": "ref",
"possible_keys": [
"all",
"key_piid_modified_created",
"processid_key",
"processid",
"key"
],
"key": "key_piid_modified_created",
"used_key_parts": [
"key"
],
"key_length": "514",
"ref": [
"const"
],
"rows_examined_per_scan": 1,
"rows_produced_per_join": 1,
"filtered": "100.00",
"cost_info": {
"read_cost": "1.00",
"eval_cost": "0.20",
"prefix_cost": "1.20",
"data_read_per_join": "2K"
},
"used_columns": [
"key",
"value",
"process_id"
],
"attached_condition": "(`brain`.`ped`.`process_id` is not null)"
}
},
{
"table": {
"table_name": "p",
"access_type": "ref",
"possible_keys": [
"id_name",
"id"
],
"key": "id_name",
"used_key_parts": [
"id"
],
"key_length": "4",
"ref": [
"brain.ped.process_id"
],
"rows_examined_per_scan": 1,
"rows_produced_per_join": 1,
"filtered": "100.00",
"using_index": true,
"cost_info": {
"read_cost": "1.00",
"eval_cost": "0.22",
"prefix_cost": "2.42",
"data_read_per_join": "2K"
},
"used_columns": [
"id",
"name"
]
}
}
]
}
}
EXPLAIN FORMAT=JSON SELECT
p.name as process_name,
ped.value as fw_spaceleft_disk_C
FROM process_execution_data ped
INNER JOIN process p ON ped.process_id = p.id -- This table 'process' is just a {id:name} table, 30~ rows
WHERE
ped.key_name = 'fw_disk_space_left_c'
GROUP BY ped.process_id
>>>
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "2.42"
},
"grouping_operation": {
"using_temporary_table": true,
"using_filesort": true,
"nested_loop": [
{
"table": {
"table_name": "ped",
"access_type": "ref",
"possible_keys": [
"all",
"key_piid_modified_created",
"processid_key",
"processid",
"key"
],
"key": "key_piid_modified_created",
"used_key_parts": [
"key"
],
"key_length": "514",
"ref": [
"const"
],
"rows_examined_per_scan": 1,
"rows_produced_per_join": 1,
"filtered": "100.00",
"index_condition": "(`brain`.`ped`.`key` <=> ''fw_disk_space_left_c'')",
"cost_info": {
"read_cost": "1.00",
"eval_cost": "0.20",
"prefix_cost": "1.20",
"data_read_per_join": "2K"
},
"used_columns": [
"id",
"key",
"value",
"process_id"
],
"attached_condition": "(`brain`.`ped`.`process_id` is not null)"
}
},
{
"table": {
"table_name": "p",
"access_type": "ref",
"possible_keys": [
"id_name",
"id"
],
"key": "id_name",
"used_key_parts": [
"id"
],
"key_length": "4",
"ref": [
"brain.ped.process_id"
],
"rows_examined_per_scan": 1,
"rows_produced_per_join": 1,
"filtered": "100.00",
"using_index": true,
"cost_info": {
"read_cost": "1.00",
"eval_cost": "0.22",
"prefix_cost": "2.42",
"data_read_per_join": "2K"
},
"used_columns": [
"id",
"name"
]
}
}
]
}
}
}
In process, I would change
KEY `id_name` (`id`,`name`),
KEY `id` (`id`),
to
PRIMARY KEY(id)
unless you have some reason not to.
Meanwhile, ped needs
INDEX(key_name, process_id, value)
and I would drop these
KEY `processid_key` (`process_id`,`key_name`),
KEY `processid` (`process_id`),
as being redundant with all.
The column order in an INDEX is important. More discussion: Index Cookbook
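Concretely, those changes might look like this (a sketch; it assumes process.id values are in fact unique, and value is LONGTEXT, so it can only go into an index as a prefix, here arbitrarily 100 characters):
ALTER TABLE process
    DROP KEY id_name,
    DROP KEY id,
    ADD PRIMARY KEY (id);
ALTER TABLE process_execution_data
    ADD KEY key_process_value (key_name, process_id, value(100)),
    DROP KEY processid_key,
    DROP KEY processid;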
Duration/Fetch
Those are cryptic terms used by Explain. I look only at the total time. They may be saying that the GROUP BY has to generate a temp table and that messes with their timers.

Different speed of theoretically equal queries on MySQL

I have found a strange speed issue with one of my MySQL queries when run on two different columns, date_from vs date_to.
The table structure is the following:
create table if not exists table1 (
id int unsigned,
field2 int,
field3 varchar(32),
date_from date not null,
date_to date not null,
field6 text
);
create unique index idx_uniq_table1 on table1 (id, field2, field3, date_from);
create index idx_table1_id on table1 (id);
create index idx_table1_field2 on table1 (field2);
create index idx_table1_field3 on table1 (field3);
create index idx_table1_date_from on table1 (date_from);
create index idx_table1_date_to on table1 (date_to);
When I run this query using date_from, execution time is 1.487 seconds:
select field3, min(date_from) from table1 group by field3;
When I run this other query using date_to, execution time is 13.804 seconds, almost 10 times slower:
select field3, max(date_to) from table1 group by field3;
Both columns are NOT NULL, so there are no empty values.
The table has ~7M rows.
The only difference that I see between these two columns is that date_from appears in the unique index, but, as far as I know, that shouldn't make a difference unless filtering by all four columns in that index.
Am I missing anything?
This is the explain of the date_from column:
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "889148.90"
},
"grouping_operation": {
"using_filesort": false,
"table": {
"table_name": "table1",
"access_type": "index",
"possible_keys": [
"idx_uniq_table1",
"idx_table1_id",
"idx_table1_field2",
"idx_table1_field3",
"idx_table1_date_from",
"idx_table1_date_to"
],
"key": "idx_table1_field3",
"used_key_parts": [
"field3"
],
"key_length": "130",
"rows_examined_per_scan": 5952609,
"rows_produced_per_join": 5952609,
"filtered": "100.00",
"using_index": true,
"cost_info": {
"read_cost": "293888.00",
"eval_cost": "595260.90",
"prefix_cost": "889148.90",
"data_read_per_join": "908M"
},
"used_columns": [
"id",
"field2",
"field3",
"date_from"
]
}
}
}
}
This is the explain of the date_to column:
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "889148.90"
},
"grouping_operation": {
"using_filesort": false,
"table": {
"table_name": "table1",
"access_type": "index",
"possible_keys": [
"idx_uniq_table1",
"idx_table1_id",
"idx_table1_field2",
"idx_table1_field3",
"idx_table1_date_from",
"idx_table1_date_to"
],
"key": "idx_table1_field3",
"used_key_parts": [
"field3"
],
"key_length": "130",
"rows_examined_per_scan": 5952609,
"rows_produced_per_join": 5952609,
"filtered": "100.00",
"cost_info": {
"read_cost": "293888.00",
"eval_cost": "595260.90",
"prefix_cost": "889148.90",
"data_read_per_join": "908M"
},
"used_columns": [
"id",
"field2",
"field3",
"date_from",
"date_to"
]
}
}
}
}
The only difference I see is in used_columns, at the end, where one contains date_to and the other doesn't.
Naughty. There is no PRIMARY KEY.
Since the "used columns" does not seem to agree with the queries, I don't want to try to explain the timing difference.
Replace the index on field3 by these two:
INDEX(field3, date_from)
INDEX(field3, date_to)
Those will speed up your two Selects.
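In DDL form, that might look like this (a sketch; index names are illustrative):
create index idx_table1_field3_from on table1 (field3, date_from);
create index idx_table1_field3_to on table1 (field3, date_to);
drop index idx_table1_field3 on table1;
With those in place, each GROUP BY query can typically be resolved with a loose index scan (EXPLAIN shows "Using index for group-by") rather than walking all ~6M index entries.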
In addition to Rick's answer about the proper index for your criteria: the reason for the speed difference is that with the index that contained both field3 and date_from, the engine was able to use the data within the index itself instead of having to go to the raw data pages that contain the entire record. The index that only had date_to still had to go to every raw data record to get field3, and that is what took the time.
That is why you use covering indexes: the index holds every piece of data the query needs. That is not to say you want an index with 20 columns, but for columns that are common filtering criteria, this is exactly why you add them.

Avoid table scan and use index instead in query

I am designing a new database and have noticed my queries are not scaling as well as they should be. When my aggregations involve hundreds of records I am seeing significant increases in response times. I am wondering if my query is deeply flawed or if I am just not using the right index.
I have done a lot of tweaking to my query but have not come up with a way to eliminate doing a full table scan and instead use an index. When I use a tool similar to EXPLAIN on my query I see the following:
Full table scans are generally inefficient, avoid using them.
Your query uses MySQL's 'filesort' operation. This tends to slow down queries.
Your query uses MySQL's temporary tables. This can require extra I/O and tends to slow down queries.
Table:
CREATE TABLE `indexTable` (
`id` int(10) unsigned NOT NULL,
`userId` int(10) unsigned NOT NULL,
`col1` varbinary(320) NOT NULL,
`col2` tinyint(3) unsigned NOT NULL,
`col3` tinyint(3) unsigned NOT NULL,
`createdAt` bigint(20) unsigned NOT NULL,
`updatedAt` bigint(20) unsigned NOT NULL,
`metadata` json NOT NULL,
PRIMARY KEY (`id`,`userId`,`col1`,`col2`,`col3`),
KEY `createdAt` (`createdAt`),
KEY `id_userId_col1_col2_createdAt` (`id`,`userId`,`col1`,`col2`,`createdAt`),
KEY `col1_col2_createdAt` (`col1`,`col2`,`createdAt`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8
Query:
SELECT t1.id, t1.userId, t1.col1, t1.col2, t1.col3, t1.metadata
FROM indexTable as t1
INNER JOIN(
SELECT col1, col2, MAX(createdAt) AS maxCreatedAt
FROM indexTable
WHERE id = ? AND userId = ?
GROUP BY col1, col2
ORDER BY maxCreatedAt
LIMIT 10 OFFSET 0) AS sub
ON t1.col1 = sub.col1
AND t1.col2 = sub.col2
AND t1.createdAt = sub.maxCreatedAt
WHERE t1.id = ? AND t1.userId = ?
ORDER BY t1.createdAt;
PK: id, userId, col1, col2, col3
Index: createdAt
Explain:
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "34.50"
},
"ordering_operation": {
"using_temporary_table": true,
"using_filesort": true,
"cost_info": {
"sort_cost": "10.00"
},
"nested_loop": [
{
"table": {
"table_name": "sub",
"access_type": "ALL",
"rows_examined_per_scan": 10,
"rows_produced_per_join": 10,
"filtered": "100.00",
"cost_info": {
"read_cost": "10.50",
"eval_cost": "2.00",
"prefix_cost": "12.50",
"data_read_per_join": "3K"
},
"used_columns": [
"col1",
"col2",
"maxCreatedAt"
],
"attached_condition": "(`sub`.`maxCreatedAt` is not null)",
"materialized_from_subquery": {
"using_temporary_table": true,
"dependent": false,
"cacheable": true,
"query_block": {
"select_id": 2,
"cost_info": {
"query_cost": "10.27"
},
"ordering_operation": {
"using_filesort": true,
"grouping_operation": {
"using_temporary_table": true,
"using_filesort": false,
"table": {
"table_name": "indexTable",
"access_type": "ref",
"possible_keys": [
"PRIMARY",
"createdAt",
"id_userId_col1_col2_createdAt",
"col1_col2_createdAt"
],
"key": "PRIMARY",
"used_key_parts": [
"id",
"userId"
],
"key_length": "8",
"ref": [
"const",
"const"
],
"rows_examined_per_scan": 46,
"rows_produced_per_join": 46,
"filtered": "100.00",
"cost_info": {
"read_cost": "1.07",
"eval_cost": "9.20",
"prefix_cost": "10.27",
"data_read_per_join": "16K"
},
"used_columns": [
"id",
"userId",
"createdAt",
"col1",
"col2",
"col3"
],
"attached_condition": "((`MyDB`.`indexTable`.`id` <=> 53) and (`MyDB`.`indexTable`.`userId` <=> 549814))"
}
}
}
}
}
}
},
{
"table": {
"table_name": "t1",
"access_type": "ref",
"possible_keys": [
"PRIMARY",
"createdAt",
"id_userId_col1_col2_createdAt",
"col1_col2_createdAt"
],
"key": "id_userId_col1_col2_createdAt",
"used_key_parts": [
"id",
"userId",
"col1",
"col2",
"createdAt"
],
"key_length": "339",
"ref": [
"const",
"const",
"sub.col1",
"sub.col2",
"sub.maxCreatedAt"
],
"rows_examined_per_scan": 1,
"rows_produced_per_join": 10,
"filtered": "100.00",
"cost_info": {
"read_cost": "10.00",
"eval_cost": "2.00",
"prefix_cost": "24.50",
"data_read_per_join": "3K"
},
"used_columns": [
"id",
"userId",
"createdAt",
"updatedAt",
"col1",
"col2",
"col3",
"metadata",
]
}
}
]
}
}
}
This query finds the most recent record in the grouping of col1 and col2, orders by createdAt, and limits the entries to 10.
The "derived" table (subquery) needs this composite index:
INDEX(id, userid, -- in either order
col1, col2, -- in this order
createdAt) -- to make it "covering"
With that index, it probably will not do a full table scan. However, it will still involve a filesort, because the ORDER BY is not the same as the GROUP BY and it is an aggregate.
t1 needs
INDEX(col1, col2, -- in either order
createdAt)
sub,maxCreatedAt -- typo??
ORDER BY t1.createdAt -- another necessary filesort.
Do not be afraid of filesorts, especially when there are only 10 rows (as in the second case).
Without seeing SHOW CREATE TABLE, I cannot say whether the "filesort" and the "temporary table" touched the disk at all, or was done in RAM.
FORCE INDEX is almost always a bad idea -- even if it helps today, it may hurt tomorrow.
The Optimizer will deliberately (and rightly) use a table scan if too much of the table needs to be looked at -- it is faster than bouncing between the index and the data.
I was able to solve this issue by updating my query to include id and userId in the GROUP BY. I was then able to join on the two additional columns and for some reason that made MySQL use the right index.
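For reference, a sketch of that rewrite (reconstructed from the description above; the final query itself was not posted):
SELECT t1.id, t1.userId, t1.col1, t1.col2, t1.col3, t1.metadata
FROM indexTable AS t1
INNER JOIN(
    SELECT id, userId, col1, col2, MAX(createdAt) AS maxCreatedAt
    FROM indexTable
    WHERE id = ? AND userId = ?
    GROUP BY id, userId, col1, col2
    ORDER BY maxCreatedAt
    LIMIT 10 OFFSET 0) AS sub
ON t1.id = sub.id
AND t1.userId = sub.userId
AND t1.col1 = sub.col1
AND t1.col2 = sub.col2
AND t1.createdAt = sub.maxCreatedAt
ORDER BY t1.createdAt;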

Optimization of query using covering indices

I have the following query with a subquery and self join:
SELECT bucket.patient_sid AS sid
FROM
(SELECT clinical_data.patient_sid,
clinical_data.lft,
clinical_data.rgt
FROM clinical_data INNER JOIN
(SELECT clinical_data.patient_sid,
clinical_data.lft,
clinical_data.rgt,
clinical_data.attribute_id
FROM clinical_data
WHERE clinical_data.attribute_id = '33' AND clinical_data.string_value = '2160-0') AS attribute
ON clinical_data.patient_sid = attribute.patient_sid
AND clinical_data.lft >= attribute.lft
AND clinical_data.rgt <= attribute.rgt
WHERE clinical_data.attribute_id = '36') AS bucket;
I have the following indices defined on this:
KEY `idx_bucket` (`attribute_id`,`string_value`)
KEY `idx_self_join` (`patient_sid`,`attribute_id`,`lft`,`rgt`)
When I look at the query using EXPLAIN, the subquery using the covering index idx_bucket is definitely optimized, but the self join and WHERE clause are not. Furthermore, why does it report that only patient_sid and attribute_id are used in used_key_parts, while an attached_condition is shown for lft and rgt (what does this mean?)? Both lft and rgt are just defined as integers with no special properties, so why aren't they being used in my covering index?
Even more strange is when I define
KEY `idx_self_join` (`patient_sid`,`lft`,`rgt`,`attribute_id`)
only patient_sid is registered in used_key_parts. Furthermore filtered drops to 1.60% from 11.00%!
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "645186.71"
},
"nested_loop": [
{
"table": {
"table_name": "clinical_data",
"access_type": "ref",
"possible_keys": [
"fk_attribute_idx",
"idx_value_string",
"idx_value_double",
"idx_bucket",
"idx_self_join_idx"
],
"key": "idx_bucket",
"used_key_parts": [
"attribute_id",
"string_value"
],
"key_length": "308",
"ref": [
"const",
"const"
],
"rows_examined_per_scan": 126402,
"rows_produced_per_join": 126402,
"filtered": "100.00",
"cost_info": {
"read_cost": "126402.00",
"eval_cost": "25280.40",
"prefix_cost": "151682.40",
"data_read_per_join": "46M"
},
"used_columns": [
"patient_sid",
"string_value",
"attribute_id",
"lft",
"rgt"
],
"attached_condition": "(`ns_large2`.`clinical_data`.`patient_sid` is not null)"
}
},
{
"table": {
"table_name": "clinical_data",
"access_type": "ref",
"possible_keys": [
"fk_attribute_idx",
"idx_value_string",
"idx_value_double",
"idx_bucket",
"idx_self_join_idx"
],
"key": "idx_self_join_idx",
"used_key_parts": [
"attribute_id",
"patient_sid"
],
"key_length": "10",
"ref": [
"const",
"ns_large2.clinical_data.patient_sid"
],
"rows_examined_per_scan": 14,
"rows_produced_per_join": 201169,
"filtered": "11.11",
"using_index": true,
"cost_info": {
"read_cost": "131327.39",
"eval_cost": "40233.83",
"prefix_cost": "645186.71",
"data_read_per_join": "73M"
},
"used_columns": [
"patient_sid",
"attribute_id",
"lft",
"rgt"
],
"attached_condition": "((`ns_large2`.`clinical_data`.`lft` >= `ns_large2`.`clinical_data`.`lft`) and (`ns_large2`.`clinical_data`.`rgt` <= `ns_large2`.`clinical_data`.`rgt`))"
}
}
]
}
}
Here's your basic JOIN:
SELECT
FROM clinical_data cd1
JOIN clinical_data cd2
ON cd1.patient_sid = cd2.patient_sid
AND cd2.attribute_id = '33'
WHERE cd1.attribute_id = '36'
Here's what I finally came up with:
SELECT
cd1.patient_sid as sid
FROM clinical_data cd1
JOIN clinical_data cd2
ON cd1.patient_sid = cd2.patient_sid
AND cd1.lft >= cd2.lft
AND cd1.rgt <= cd2.rgt
WHERE cd1.attribute_id = '36'
AND cd2.attribute_id = '33'
AND cd2.string_value = '2160-0'
"Used_columns" says that it is 'covering'. The final "used key parts" are not all used as a "key" because they are needed in a "range", not '='.
Get rid of the outer query:
SELECT clinical_data.patient_sid, clinical_data.lft, clinical_data.rgt
FROM clinical_data
INNER JOIN
( SELECT clinical_data.patient_sid, clinical_data.lft, clinical_data.rgt,
clinical_data.attribute_id
FROM clinical_data
WHERE clinical_data.attribute_id = '33'
AND clinical_data.string_value = '2160-0'
) AS attribute ON clinical_data.patient_sid = attribute.patient_sid
AND clinical_data.lft >= attribute.lft
AND clinical_data.rgt <= attribute.rgt
WHERE clinical_data.attribute_id = '36'
Sorry, but the lft-rgt schema is not very efficient.