Query doesn't end if limit is not added - AWS RDS MariaDB - mysql

I am currently migrating some databases from a physical server running MySQL over to AWS RDS. The current server uses MySQL 5.5 and the new RDS one uses MariaDB 10.1. Everything had been going smoothly until I tried to run an application against the new server. The following query will not complete even after 10 minutes, unless I add a large LIMIT to it.
SELECT al.*
FROM US.Products p
JOIN US.Products_Contributors pc ON p.Product_id = pc.Product_id
JOIN US.Contributors c ON pc.Contributor_Id = c.Contributor_Id
JOIN US.Products_Category pca ON pca.Product_id = p.Product_id
JOIN US.Categories ca ON ca.Category_Id = pca.Category_Id
JOIN US.Asset_Links al ON (al.Asset_link_Id = p.Product_id) OR (al.Asset_link_Id = c.Contributor_ID)
WHERE p.Product_ISBN13 is not null
AND (
ca.Category_Code_3 in ("JNF","JUV")
)
AND al.Asset_Link_Table in ("Contributors","Products")
AND al.Asset_id != 0
GROUP BY al.Asset_Links_Id;
The query on the old server completes in around 11 seconds. If I add 'LIMIT 900000' to the query on the new server, it finishes in around 7 seconds. There are ~800,000 rows returned before the GROUP BY and ~150,000 rows after it. If I set the LIMIT to anything over 900000 then the query will not complete.
Things I have tried:
Been through the DB parameters and increased buffer sizes on the new server to match the old server
Profiling queries (a sketch of how this was captured follows this list): all of the time is spent in 'Copying to tmp table', which consistently takes the same amount of time when a LIMIT clause is added, as long as it is below 900000.
EXPLAIN outputs the same on both servers
Repairing and optimising tables on new server
Increasing RDS instance size - it's now db.m4.xlarge
Setting RDS disk to Provisioned IOPS at 1000 IOPS
Running the query on a different MySQL 5.5 server (not RDS): it runs the same as it does on my original source server.
Installing MySQL 5.7 on my EC2 instance: it also runs the query very slowly unless the LIMIT is added.
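For reference, a minimal sketch of how the per-stage profiling above can be captured (SHOW PROFILE is available on MySQL 5.5/5.7 and MariaDB 10.1, though newer releases deprecate it in favour of the Performance Schema):
SET profiling = 1;
-- run the slow query here, then:
SHOW PROFILES;              -- lists profiled statements with their query IDs
SHOW PROFILE FOR QUERY 1;   -- per-stage breakdown, e.g. 'Copying to tmp table'
SET profiling = 0;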
So it seems to me that this is an issue with the MySQL version being used? But why would adding the LIMIT resolve the issue? And why does it only work up to 900000?
Any help would be greatly appreciated!
Thanks
UPDATE: 16:19 17/12/2017
EXPLAIN on original server MySQL 5.5:
EXPLAIN on RDS MariaDB 10.1 (No LIMIT):
{
"query_block": {
"select_id": 1,
"filesort": {
"temporary_table": {
"function": "buffer",
"table": {
"table_name": "p",
"access_type": "ALL",
"possible_keys": ["PRIMARY", "ISBN", "Product_Id"],
"rows": 120108,
"filtered": 99.999,
"attached_condition": "(p.Product_ISBN13 is not null)"
},
"table": {
"table_name": "pc",
"access_type": "ref",
"possible_keys": ["Products_contributor", "Contributor_Id"],
"key": "Products_contributor",
"key_length": "4",
"used_key_parts": ["Product_Id"],
"ref": ["US.p.Product_Id"],
"rows": 1,
"filtered": 100
},
"table": {
"table_name": "c",
"access_type": "eq_ref",
"possible_keys": ["PRIMARY"],
"key": "PRIMARY",
"key_length": "4",
"used_key_parts": ["Contributor_Id"],
"ref": ["US.pc.Contributor_Id"],
"rows": 1,
"filtered": 100,
"using_index": true
},
"table": {
"table_name": "pca",
"access_type": "ref",
"possible_keys": ["Products_Category", "Category_Id"],
"key": "Products_Category",
"key_length": "4",
"used_key_parts": ["Product_Id"],
"ref": ["US.p.Product_Id"],
"rows": 2,
"filtered": 100
},
"table": {
"table_name": "ca",
"access_type": "eq_ref",
"possible_keys": ["PRIMARY"],
"key": "PRIMARY",
"key_length": "4",
"used_key_parts": ["Category_Id"],
"ref": ["US.pca.Category_Id"],
"rows": 1,
"filtered": 100,
"index_condition": "(ca.Category_Id = pca.Category_Id)",
"attached_condition": "(ca.Category_Code_3 in ('JNF','JUV'))"
},
"block-nl-join": {
"table": {
"table_name": "al",
"access_type": "ALL",
"possible_keys": ["Asset_Link_Id", "Asset_Id"],
"rows": 908975,
"filtered": 95.517,
"attached_condition": "((al.Asset_Link_Table in ('Contributors','Products')) and (al.Asset_Id <> 0))"
},
"buffer_type": "flat",
"buffer_size": "1024Kb",
"join_type": "BNL",
"attached_condition": "((al.Asset_Link_Id = p.Product_Id) or (al.Asset_Link_Id = pc.Contributor_Id))"
}
}
}
}
}
EXPLAIN RDS MariaDB 10.1 (LIMIT 908974):
{
"query_block": {
"select_id": 1,
"filesort": {
"temporary_table": {
"function": "buffer",
"table": {
"table_name": "p",
"access_type": "ALL",
"possible_keys": ["PRIMARY", "ISBN", "Product_Id"],
"rows": 120108,
"filtered": 99.999,
"attached_condition": "(p.Product_ISBN13 is not null)"
},
"table": {
"table_name": "pc",
"access_type": "ref",
"possible_keys": ["Products_contributor", "Contributor_Id"],
"key": "Products_contributor",
"key_length": "4",
"used_key_parts": ["Product_Id"],
"ref": ["US.p.Product_Id"],
"rows": 1,
"filtered": 100
},
"table": {
"table_name": "c",
"access_type": "eq_ref",
"possible_keys": ["PRIMARY"],
"key": "PRIMARY",
"key_length": "4",
"used_key_parts": ["Contributor_Id"],
"ref": ["US.pc.Contributor_Id"],
"rows": 1,
"filtered": 100,
"using_index": true
},
"table": {
"table_name": "pca",
"access_type": "ref",
"possible_keys": ["Products_Category", "Category_Id"],
"key": "Products_Category",
"key_length": "4",
"used_key_parts": ["Product_Id"],
"ref": ["US.p.Product_Id"],
"rows": 2,
"filtered": 100
},
"table": {
"table_name": "ca",
"access_type": "eq_ref",
"possible_keys": ["PRIMARY"],
"key": "PRIMARY",
"key_length": "4",
"used_key_parts": ["Category_Id"],
"ref": ["US.pca.Category_Id"],
"rows": 1,
"filtered": 100,
"index_condition": "(ca.Category_Id = pca.Category_Id)",
"attached_condition": "(ca.Category_Code_3 in ('JNF','JUV'))"
},
"range-checked-for-each-record": {
"keys": ["Asset_Link_Id", "Asset_Id"],
"table": {
"table_name": "al",
"access_type": "ALL",
"possible_keys": ["Asset_Link_Id", "Asset_Id"],
"key": "Asset_Id",
"key_length": "4",
"used_key_parts": ["Asset_Id"],
"rows": 908975,
"filtered": 95.517
}
}
}
}
}
}
What I did notice is that when I set the limit to any number that is one less than the reported number of joined rows, the query uses "Range checked for each record (index map: 0x6)", whereas on MySQL 5.5 it uses that whether there is a LIMIT or not. I have found that if I add FORCE INDEX (Asset_Link_Id) to the last join it will always use the "Range checked..." plan and the query then completes.
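For clarity, here is a sketch of the query with that hint applied (the index name Asset_Link_Id is taken from the possible_keys in the EXPLAIN output above):
SELECT al.*
FROM US.Products p
JOIN US.Products_Contributors pc ON p.Product_id = pc.Product_id
JOIN US.Contributors c ON pc.Contributor_Id = c.Contributor_Id
JOIN US.Products_Category pca ON pca.Product_id = p.Product_id
JOIN US.Categories ca ON ca.Category_Id = pca.Category_Id
JOIN US.Asset_Links al FORCE INDEX (Asset_Link_Id)
    ON (al.Asset_link_Id = p.Product_id) OR (al.Asset_link_Id = c.Contributor_ID)
WHERE p.Product_ISBN13 IS NOT NULL
AND ca.Category_Code_3 IN ('JNF','JUV')
AND al.Asset_Link_Table IN ('Contributors','Products')
AND al.Asset_id != 0
GROUP BY al.Asset_Links_Id;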
While modifying and optimising all the queries is the ideal solution, it is not the best in this case. The reason I don't really want to modify the query is that the server I am migrating has hundreds of different scripts and applications on it, and if I have to modify a lot of queries in lots of different applications it is going to take me a very long time and I won't meet the migration deadline. So at this point, if this behaviour cannot be controlled by a setting, I will probably resort to using MySQL 5.5 on the new server instead of MariaDB 10.1.
Can it be explained why the query optimiser chooses a different route with a large / undefined limit in 5.7 when it didn't in 5.5? Also, having read about dynamic range and the join buffer, why is it slower using the buffer than the range check? From what I have read I would have thought the buffer would be more performant.

(This Answer does not directly address the "why did this slow down" question, but, as a consolation prize, addresses other performance issues.)
I see what looks like two many-to-many mapping tables. The typical implementation of such is less efficient than it could be.
Please follow the tips in https://mariadb.com/kb/en/library/building-the-best-index-for-a-given-select/#many-to-many-mapping-table -- then see if performance improves with or without the LIMIT.
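As a rough sketch of what those tips amount to for one of the mapping tables here (the real definition of Products_Contributors is not shown in the question, so the column types are assumptions; the point is the composite primary key plus the reverse-order secondary index, with no surrogate id column):
CREATE TABLE US.Products_Contributors (
    Product_Id     INT UNSIGNED NOT NULL,
    Contributor_Id INT UNSIGNED NOT NULL,
    PRIMARY KEY (Product_Id, Contributor_Id),    -- covers lookups by product
    INDEX (Contributor_Id, Product_Id)           -- covers lookups by contributor
) ENGINE=InnoDB;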
The EXPLAIN is likely to change; let's see it.
Profiling -- Yeah, that is usually useless; it has a couple of uninformative messages where it spends 99% of its time.
Do not increase buffer sizes to the point of causing swapping; that will hurt a lot.
On the versions that allow for such, please provide EXPLAIN FORMAT=JSON SELECT ...
OR is often a performance killer; turn it into UNION:
( SELECT ...
JOIN US.Asset_Links al ON al.Asset_link_Id = p.Product_id
...
) UNION DISTINCT
( SELECT ...
JOIN US.Asset_Links al ON al.Asset_link_Id = c.Contributor_ID
...
)
(Because of GROUP BY, I may not have mapped the OR into UNION correctly.)
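For reference, a fuller sketch built from the original query, keeping every join and filter in both branches so the semantics stay as close as possible to the OR version (whether the per-branch GROUP BY plus UNION DISTINCT reproduces the original grouping exactly still needs checking against the data):
( SELECT al.*
  FROM US.Products p
  JOIN US.Products_Contributors pc ON p.Product_id = pc.Product_id
  JOIN US.Contributors c ON pc.Contributor_Id = c.Contributor_Id
  JOIN US.Products_Category pca ON pca.Product_id = p.Product_id
  JOIN US.Categories ca ON ca.Category_Id = pca.Category_Id
  JOIN US.Asset_Links al ON al.Asset_link_Id = p.Product_id
  WHERE p.Product_ISBN13 IS NOT NULL
    AND ca.Category_Code_3 IN ('JNF','JUV')
    AND al.Asset_Link_Table IN ('Contributors','Products')
    AND al.Asset_id != 0
  GROUP BY al.Asset_Links_Id )
UNION DISTINCT
( SELECT al.*
  FROM US.Products p
  JOIN US.Products_Contributors pc ON p.Product_id = pc.Product_id
  JOIN US.Contributors c ON pc.Contributor_Id = c.Contributor_Id
  JOIN US.Products_Category pca ON pca.Product_id = p.Product_id
  JOIN US.Categories ca ON ca.Category_Id = pca.Category_Id
  JOIN US.Asset_Links al ON al.Asset_link_Id = c.Contributor_ID
  WHERE p.Product_ISBN13 IS NOT NULL
    AND ca.Category_Code_3 IN ('JNF','JUV')
    AND al.Asset_Link_Table IN ('Contributors','Products')
    AND al.Asset_id != 0
  GROUP BY al.Asset_Links_Id );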

Related

How come using limit dramatically speeds up a simple but long select query

I stumbled upon this today and was quite shocked. When searching Google I normally see the question reversed, as in: adding a limit caused the query to return more slowly.
I have a MySQL table with a few million rows in it.
The PK is id and as such it's a unique index.
When I performed a query of the form select a, b, c, ... from table where id in (1, 2, 3, ..., 5000) it took about 15-20 minutes to fetch all results.
However, when I simply added limit 1000000 at the end (I deliberately used a far larger number than needed), it returned in a few seconds.
I know that using a limit smaller than the number of returned rows helps, as the query returns as soon as the "quota" is filled, but here I can't find the reason for such a dramatic improvement.
Can anyone please explain it?
Should I just add a limit to every query to improve its performance?
Why doesn't MySQL perform the search the same way with and without it?
Update
As requested, the EXPLAIN for each:
With limit (takes a few seconds)
{
"id" : 1,
"select_type" : "SIMPLE",
"table" : "table",
"partitions" : null,
"type" : "range",
"possible_keys" : "PRIMARY",
"key" : "PRIMARY",
"key_len" : "4",
"ref" : null,
"rows" : 4485,
"filtered" : 100.0,
"Extra" : "Using where"
}
Without limit (takes 15-20 minutes)
{
"id" : 1,
"select_type" : "SIMPLE",
"table" : "table",
"partitions" : null,
"type" : "ALL",
"possible_keys" : "PRIMARY",
"key" : null,
"key_len" : null,
"ref" : null,
"rows" : 69950423,
"filtered" : 50.0,
"Extra" : "Using where"
}
I'm not fluent in this, but it looks like it used the key when I used the limit but didn't when I ran the query without it.
There are possibly other relevant differences in the filtered and type fields, but I don't know what they mean.
How come?
Update 2
A lot of questions asked so I'll attempt to provide details for all.
The MySQL version is 8.0.28 and the table engine is InnoDB.
I've run the tests a few times, one after the other, not just once.
Running the same EXPLAIN with fewer (10) values in the IN clause returned the same result both with the limit and without it!
{
"id" : 1,
"select_type" : "SIMPLE",
"table" : "table",
"partitions" : null,
"type" : "range",
"possible_keys" : "PRIMARY",
"key" : "PRIMARY",
"key_len" : "4",
"ref" : null,
"rows" : 10,
"filtered" : 100.0,
"Extra" : "Using where"
}
Now the FORMAT=JSON (with redacted parts):
Without limit
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "8369910.88"
},
"table": {
"table_name": "table",
"access_type": "ALL",
"possible_keys": [
"PRIMARY"
],
"rows_examined_per_scan": 70138598,
"rows_produced_per_join": 35069299,
"filtered": "50.00",
"cost_info": {
"read_cost": "4862980.98",
"eval_cost": "3506929.90",
"prefix_cost": "8369910.88",
"data_read_per_join": "558G"
},
"used_columns": [...],
"attached_condition": "(`db`.`table`.`id` in (...))"
}
}
}
With limit
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "8371410.92"
},
"table": {
"table_name": "table",
"access_type": "range",
"possible_keys": [
"PRIMARY"
],
"key": "PRIMARY",
"used_key_parts": [
"id"
],
"key_length": "4",
"rows_examined_per_scan": 4485,
"rows_produced_per_join": 35069255,
"filtered": "100.00",
"cost_info": {
"read_cost": "4864485.17",
"eval_cost": "3506925.54",
"prefix_cost": "8371410.92",
"data_read_per_join": "558G"
},
"used_columns": [...],
"attached_condition": "(`db`.`table`.`id` in (...))"
}
}
}
As there is a very long thread in the comments under the post, I will just add the answer here (it is both mine and Bill's): the issue is the very long argument list in the IN() part of the statement.
The fix is to raise the range_optimizer_max_mem_size parameter to accommodate more inputs in IN(), because exceeding that limit causes a full table scan.
The range optimizer reserves memory for range scanning, so not having enough of that memory configured will result in a full table scan.
Now, why does the LIMIT clause make a difference? This part I would guess:
LIMIT forces MySQL to use a different range scan type
LIMIT actually caps the number of rows that will be returned, so MySQL knows it will not return more than X; without the limit it assumes it could return 69,950,423 rows, which would exceed some of the other memory limits you have set up. It is worth trying with a limit equal to the number of rows in the table.
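A sketch of how to inspect and raise that limit (the 16 MB figure below is purely illustrative, not a recommendation; size it to your longest IN() lists, and note that SET GLOBAL needs the SUPER or SYSTEM_VARIABLES_ADMIN privilege):
SHOW VARIABLES LIKE 'range_optimizer_max_mem_size';
SET SESSION range_optimizer_max_mem_size = 16 * 1024 * 1024;  -- affects only this session
SET GLOBAL  range_optimizer_max_mem_size = 16 * 1024 * 1024;  -- affects new connections
-- When the limit is exceeded MySQL should record a warning that range
-- optimization was not done for the query; check with:
SHOW WARNINGS;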

MySQL query 1000x slower with slightly higher LIMIT (not offset)

I have a MySQL database with about 12 million records. Now I use the following query to fetch the required rows from that database:
SELECT date_time, price_l0, amount_l0, price_l1, amount_l1, price_l2, amount_l2, price_l3, /* 34 more columns */
FROM book_states
WHERE date_time > ? and
date_time < ? and
bookID = ?
ORDER BY date_time ASC
LIMIT 4350
The problem is that with a LIMIT of about 4340 this query takes about 0.002/0.15 seconds to run. However, with a limit of, say, 4350 it takes 3.0/0.15 seconds (!) to run.
If I select fewer columns, the threshold between a very fast and a very slow query is slightly higher, but it takes 3 seconds or more even if I select only one column once the LIMIT is above 5000.
Now I suspect this is a MySQL setup problem or some sort of RAM limitation, but since I am not a MySQL expert by any means, I'm asking you to explain what causes this drastic performance issue.
EDIT:
This is the JSON EXPLAIN output for a query that takes 3 seconds:
{
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "282333.60"
},
"ordering_operation": {
"using_filesort": true,
"table": {
"table_name": "book_states",
"access_type": "ref",
"possible_keys": [
"index1",
"index2",
"index3"
],
"key": "index2",
"used_key_parts": [
"bookID"
],
"key_length": "2",
"ref": [
"const"
],
"rows_examined_per_scan": 235278,
"rows_produced_per_join": 81679,
"filtered": "34.72",
"index_condition": "(`datastore`.`book_states`.`bookID` <=> 29)",
"cost_info": {
"read_cost": "235278.00",
"eval_cost": "16335.84",
"prefix_cost": "282333.60",
"data_read_per_join": "14M"
},
"used_columns": [
"id",
"date_time",
"bookID"
],
"attached_condition": "((`datastore`.`book_states`.`date_time` > '2018-09-28T16:18:49') and (`datastore`.`book_states`.`date_time` < '2018-09-29T23:18:49'))"
}
}
}
}
The best index for your query is on (bookID, date_time). Note the order of the columns; it is quite important.
MySQL is struggling to optimize your query with the indexes on-hand. It can select the records, using the date_time part of your mentioned index (or using an index on bookId) and then sort the results.
Or, it can scan your compound index (which has records ordered by date/time), filtering out the unneeded books as it goes.
Choosing between these two methods is what you are (presumably) seeing. Which is better depends on the gathered statistics, and they necessarily provide only partial information.
So, switch the columns in the index and the problem should go away, at least for this particular query.
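A sketch of the suggested index (the index name is arbitrary); with (bookID, date_time) MySQL can filter on bookID and read rows already ordered by date_time, so the ORDER BY ... LIMIT no longer needs a filesort:
ALTER TABLE book_states
    ADD INDEX idx_bookid_datetime (bookID, date_time);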

How to create a calculated item using item.create?

I am familiar with creating a calculated item with the GUI, but now I want to do it with the API. There is a "formula" field for an item object, but its type is int/float as stated in the documentation. So where do I put the formula needed by a calculated item?
As per the Zabbix manual, it's the params field.
The string that you put in "params" goes into "Formula".
Tested on version 6
"params": "last(//mysql.innodb_buffer_pool_pages_total) -
last(//mysql.innodb_buffer_pool_pages_free)"
Full context:
{
"jsonrpc": "2.0",
"method": "item.create",
"params": {
"name": "MySQL: Buffer pool utilization",
"key_": "mysql.buffer_pool_utilization",
"params": "last(//mysql.innodb_buffer_pool_pages_total) -
last(//mysql.innodb_buffer_pool_pages_free)"
"hostid": "30074",
"type": 15,
"value_type": 3,
"interfaceid": "30084",
"tags": [
{
"tag": "Disc usage"
},
{
"tag": "Equipment",
"value": "Workstation"
}
],
"delay": "30s"
},
"auth": "038e1d7b1735c6a5436ee9eae095879e",
"id": 1
}

Couchbase N1QL query generally slow

I have been using Couchbase for quite some time, but I have never really experienced it being fast. It is rather exceptionally slow.
I wonder what setting I am missing.
I have a root server with the following specs:
Intel® Xeon® E5-2680V4 (4 Cores)
12 GB DDR4 ECC
60 GB SSD
I'm running Couchbase 4.5.1-2844 Community Edition (build-2844)
with 7.05 GB RAM allocated.
The bucket has 1 Data Node and uses 4.93 GB with 3,093,889 Documents.
The bucket Type is "Couchbase" with Cache Metadata set to "Value Ejection". Replicas are disabled. Disk I/O Optimization is set to Low. Flushing is not enabled.
All 3 million documents look similar to this one:
{
"discovered_by": 0,
"color": "FFBA00",
"updated_at": "2018-01-18T21:40:17.361Z",
"replier": 0,
"message": "Irgendwas los hier in Luckenwalde?🤔",
"children": "",
"view_count": 0,
"post_own": "FFBA00",
"user_handle": "oj",
"vote_count": [
{
"timestamp": "2018-01-19 09:48:48",
"votes": 0
}
],
"child_count": 3,
"share_count": 0,
"oj_replied": false,
"location": {
"loc_coordinates": {
"lat": 0,
"lng": 0
},
"loc_accuracy": 0,
"country": "",
"name": "Luckenwalde",
"city": ""
},
"tags": [],
"post_id": "59aef043f087270016dc5836",
"got_thanks": false,
"image_headers": "",
"cities": [
"Luckenwalde"
],
"pin_count": 0,
"distance": "friend",
"image_approved": false,
"created_at": "2017-09-05T18:43:15.904Z",
"image_url": ""
}
And a query could look like this
select COUNT(*) from sauger where color = 'FFBA00'
Without an index it fails to execute (timeout) via the Couchbase web app, but with an index
CREATE INDEX color ON sauger(color)
the query takes up to 16 seconds at first, and after a few tries it takes 2 to 3 seconds each time.
There are 6 different colour strings (like "FFBA00") and the result of the query is 466920 (which is about a sixth of the total documents).
An EXPLAIN of the above query gives me this:
[
{
"plan": {
"#operator": "Sequence",
"~children": [
{
"#operator": "IndexCountScan",
"covers": [
"cover ((`sauger`.`color`))",
"cover ((meta(`sauger`).`id`))"
],
"index": "color",
"index_id": "cc3524c6d5a8ef94",
"keyspace": "sauger",
"namespace": "default",
"spans": [
{
"Range": {
"High": [
"\"FFBA00\""
],
"Inclusion": 3,
"Low": [
"\"FFBA00\""
]
}
}
],
"using": "gsi"
},
{
"#operator": "IndexCountProject",
"result_terms": [
{
"expr": "count(*)"
}
]
}
]
},
"text": "select COUNT(*) from sauger where color = 'FFBA00'"
}
]
Everything is set up correctly, but such simple queries still take awfully long (and there is nothing else writing to or reading from the database, and the server it's running on is totally idle).
Make sure you don't have a primary index. That will consume a lot of the index service's memory. Your statement saying the query times out without the index makes me think there's a primary index, otherwise the query would fail immediately.
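A sketch of how to check for and remove a primary index in N1QL (keyspace name taken from the question; the USING GSI clause assumes the primary index is a GSI index):
SELECT name, keyspace_id, is_primary FROM system:indexes WHERE keyspace_id = 'sauger';
DROP PRIMARY INDEX ON sauger USING GSI;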
Edit: Adding more details on Primary Indexes from the Indexing Best Practices blog post
Avoid Primary Keys in Production
Unexpected full primary scans are possible, and any possibility of such occurrences should be removed by avoiding primary indexes altogether in production. N1QL index selection is a rule-based system for now that checks for a possible index that will satisfy the query; if there is no such index, it resorts to using the primary index. The primary index has all the keys of the documents, so the query will fetch all keys from the primary index and then hop to the Data Service to fetch the documents and apply filters. As you can see, this is a very expensive operation and should be avoided at all costs.
If there are no Primary Indexes created, and the query is not able to find a matching index to serve the query, then the Query Service errors with the following message. This is helpful and should help you in creating the required Secondary index suitably:
“No index available on keyspace travel-sample that matches your query. Use CREATE INDEX or CREATE PRIMARY INDEX to create an index, or check that your expected index is online.”

Cloudant Selector Query

I would like to query a Cloudant DB using a selector. For example, as shown below, the user would like to find loansBorrowed whose amount exceeds a number. How do I access the array in a Cloudant selector to find a specific record?
{
"_id": "65c5e4c917781f7365f4d814f6e1665f",
"_rev": "2-73615006996721fef9507c2d1dacd184",
"userprofile": {
"name": "tom",
"age": 30,
"employer": "Microsoft"
},
"loansBorrowed": [
{
"loanamount": 5000,
"loandate": "01/01/2001",
"repaymentdate": "01/01/2001",
"rateofinterest": 5.6,
"activeStatus": true,
"penalty": {
"penalty-amount": 500,
"reasonforPenalty": "Exceeded the date by 10 days"
}
},
{
"loanamount": 3000,
"loandate": "01/01/2001",
"repaymentdate": "01/01/2001",
"rateofinterest": 5.6,
"activeStatus": true,
"penalty": {
"penalty-amount": 400,
"reasonforPenalty": "Exceeded the date by 10 days"
}
},
{
"loanamount": 2000,
"loandate": "01/01/2001",
"repaymentdate": "01/01/2001",
"rateofinterest": 5.6,
"activeStatus": true,
"penalty": {
"penalty-amount": 500,
"reasonforPenalty": "Exceeded the date by 10 days"
}
}
]
}
If you use the default Cloudant Query index (type text, index everything):
{
"index": {},
"type": "text"
}
Then the following query selector should work to find e.g. all documents with a loanamount > 1000:
"loansBorrowed": { "$elemMatch": { "loanamount": { "$gt": 1000 } } }
I'm not sure that you can coax Cloudant Query to only index nested fields within an array so, if you don't need the flexibility of the "index everything" approach, you're probably better off creating a Cloudant Search index which indexes just the specific fields you need.
While Will's answer works, I wanted to let you know that you have other indexing options with Cloudant Query for handling arrays. This blog has the details on various tradeoffs (https://cloudant.com/blog/mango-json-vs-text-indexes/), but long story short, I think this might be the best indexing option for you:
{
"index": {
"fields": [
{"name": "loansBorrowed.[].loanamount", "type": "number"}
]
},
"type": "text"
}
Unlike Will's index-everything approach, here you're only indexing a specific field, and if the field contains an array, you're also indexing every element in the array. Particularly for "type": "text" indexes on large datasets, specifying a field to index will save you index-build time and storage space. Note that text indexes that specify a field must use the following form in the "fields" array: {"name": "fieldname", "type": "<boolean|number|string>"}
So then the corresponding Cloudant Query "selector": statement would be this:
{
"selector": {
"loansBorrowed": {"$elemMatch": {"loanamount": {"$gt": 4000}}}
},
"fields": [
"_id",
"userprofile.name",
"loansBorrowed"
]
}
Also note that you don't have to include "fields": as part of your "selector": statement, but I did here to only project certain parts of the JSON. If you omit it from your "selector": statement, the entire document will be returned.