Mysql optimization and explode - mysql

I have the following query that displays the top 10 most drawn pairs of numbers from the whole
table
-- Top 10 most frequent number pairs in the `power` table.
-- Every branch of the UNION emits (id, p) for one combination of two power columns; p is written
-- "smaller-larger" so that both orderings of a pair collapse into the same key.
-- The dotted line below is a placeholder for the column combinations left out of the post.
select
p, count(p) as frequency
from
(SELECT
id,
CASE power1 <= power2 WHEN TRUE THEN CONCAT(power1,"-",power2) ELSE CONCAT(power2,"-",power1)
END p
FROM power
UNION
SELECT
id,
CASE power1<=power3 WHEN TRUE THEN CONCAT(power1,"-",power3) ELSE CONCAT(power3,"-",power1) END p
FROM power
UNION
SELECT
id,
CASE power1<=power4 WHEN TRUE THEN CONCAT(power1,"-",power4) ELSE CONCAT(power4,"-",power1) END p
FROM power
UNION
...............................................
SELECT
id,
CASE power19<=power20 WHEN TRUE THEN CONCAT(power19,"-",power20) ELSE CONCAT(power20,"-",power19)
END p
FROM power) as b
group by
p
order by
frequency desc, p asc
limit
0, 10
How can I impose a limit to take just the first 100 lines in descending order by ID? The query would be like this:
ORDER BY id LIMIT 0,100
But I haven't been able to adapt it for the above.
Could the code be optimized more than that?
power1, power2, etc. are columns of the table. Would it work if I instead stored each draw as a single string such as "3,4,5,6" and split ("exploded") it on "," so that power1 becomes 3, power2 becomes 4, and so on?
I mean the table format to look something like this :
table2
LATER EDIT :
I have table like this :
Table: data
+----+----+-----+
| id | nr | set |
+----+----+-----+
| 1 | 52 | 1 |
| 2 | 47 | 1 |
| 3 | 4 | 1 |
| 4 | 3 | 1 |
| 5 | 77 | 1 |
| 6 | 71 | 1 |
| 7 | 6 | 1 |
| 8 | 41 | 1 |
| 9 | 15 | 1 |
| 10 | 79 | 1 |
| 11 | 35 | 2 |
| 12 | 50 | 2 |
| 13 | 16 | 2 |
| 14 | 1 | 2 |
| 15 | 32 | 2 |
| 16 | 77 | 2 |
| 17 | 30 | 2 |
| 18 | 7 | 2 |
| 19 | 20 | 2 |
| 20 | 28 | 2 |
| .. | .. | ... |
+----+----+-----+
I have like 34360 id
And the following query :
-- 20 most frequent number pairs, counted over the sets touched by the 1000 newest rows of `data`.
SELECT
    recent.nr AS num_1,
    other.nr  AS num_2,
    COUNT(*)  AS total
FROM (
    -- newest 1000 rows by id
    SELECT id, nr, `set`
    FROM data
    ORDER BY id DESC
    LIMIT 1000
) AS recent
INNER JOIN data AS other
    ON  other.`set` = recent.`set`
    AND other.nr > recent.nr   -- smaller number first, so a pair is matched once per set
GROUP BY recent.nr, other.nr
ORDER BY total DESC
LIMIT 20
And is working fine !
I would like to know how I can find the pairs of numbers that have not been drawn together for the longest time. For example:
1,42 (together, as a pair) has not been drawn for 24 draws
32,45-as a pair as well-has not been drawn for 22 draws
etc

Consider the following:
Un-normalised:
id power1 power2 power3 power4
1 4 9 10 16
2 6 12 15 19
3 2 4 6 7
4 3 8 15 17
5 2 10 11 14
6 4 10 12 19
7 1 4 9 11
Normalised:
id power value
1 1 4
1 2 9
1 3 10
1 4 16
2 1 6
2 2 12
2 3 15
2 4 19
3 1 2
3 2 4
3 3 6
3 4 7
4 1 3
4 2 8
4 3 15
4 4 17
5 1 2
5 2 10
5 3 11
5 4 14
6 1 4
6 2 10
6 3 12
6 4 19
7 1 1
7 2 4
7 3 9
7 4 11
So...
-- Fixture: the normalised layout, one row per (draw id, position) instead of one column per position.
DROP TABLE IF EXISTS my_table;

CREATE TABLE my_table (
    id    INT NOT NULL,
    power INT NOT NULL,
    value INT NOT NULL,
    PRIMARY KEY (id, power)
);

INSERT INTO my_table (id, power, value) VALUES
    (1, 1, 4), (1, 2, 9),  (1, 3, 10), (1, 4, 16),
    (2, 1, 6), (2, 2, 12), (2, 3, 15), (2, 4, 19),
    (3, 1, 2), (3, 2, 4),  (3, 3, 6),  (3, 4, 7),
    (4, 1, 3), (4, 2, 8),  (4, 3, 15), (4, 4, 17),
    (5, 1, 2), (5, 2, 10), (5, 3, 11), (5, 4, 14),
    (6, 1, 4), (6, 2, 10), (6, 3, 12), (6, 4, 19),
    (7, 1, 1), (7, 2, 4),  (7, 3, 9),  (7, 4, 11);

-- How often each pair of values appears in the same draw.
-- Joining on a strict position order pairs every two positions of a draw exactly once.
-- LEAST/GREATEST put the smaller value first; that only matters if a lower position can hold
-- a larger value than a higher position.
SELECT
    LEAST(lo.value, hi.value)    AS a,
    GREATEST(lo.value, hi.value) AS b,
    COUNT(*)                     AS freq
FROM my_table AS lo
INNER JOIN my_table AS hi
    ON  hi.id = lo.id
    AND hi.power > lo.power
GROUP BY
    LEAST(lo.value, hi.value),
    GREATEST(lo.value, hi.value)
ORDER BY
    freq DESC,
    a,
    b;
+----+----+------+
| a | b | freq |
+----+----+------+
| 4 | 9 | 2 |
| 4 | 10 | 2 |
| 12 | 19 | 2 |
| 1 | 4 | 1 |
| 1 | 9 | 1 |
| 1 | 11 | 1 |
| 2 | 4 | 1 |
| 2 | 6 | 1 |
| 2 | 7 | 1 |
| 2 | 10 | 1 |
| 2 | 11 | 1 |
| 2 | 14 | 1 |
| 3 | 8 | 1 |
| 3 | 15 | 1 |
| 3 | 17 | 1 |
| 4 | 6 | 1 |
| 4 | 7 | 1 |
| 4 | 11 | 1 |
| 4 | 12 | 1 |
| 4 | 16 | 1 |
| 4 | 19 | 1 |
| 6 | 7 | 1 |
| 6 | 12 | 1 |
| 6 | 15 | 1 |
| 6 | 19 | 1 |
| 8 | 15 | 1 |
| 8 | 17 | 1 |
| 9 | 10 | 1 |
| 9 | 11 | 1 |
| 9 | 16 | 1 |
| 10 | 11 | 1 |
| 10 | 12 | 1 |
| 10 | 14 | 1 |
| 10 | 16 | 1 |
| 10 | 19 | 1 |
| 11 | 14 | 1 |
| 12 | 15 | 1 |
| 15 | 17 | 1 |
| 15 | 19 | 1 |
+----+----+------+

While I fully agree with @Strawberry about normalising your data, the following is an example of how you could do it with your current data structure (not tested).
-- Top 10 most frequent number pairs among the newest rows of `power`.
-- Both derived tables unpivot power1..power20 into (id, power_col, power_val) rows and keep the
-- 2000 rows with the highest ids (presumably 100 draws x 20 columns, assuming one row per draw).
-- UNION ALL is enough: the branches already differ in power_col, so there is nothing to de-duplicate
-- (assuming id is unique in `power`).
SELECT
    CASE
        WHEN a.power_val <= b.power_val THEN CONCAT(a.power_val, '-', b.power_val)
        ELSE CONCAT(b.power_val, '-', a.power_val)
    END AS p,
    COUNT(a.id) AS frequency
FROM
(
    SELECT id, 1 AS power_col, power1 AS power_val FROM power UNION ALL
    SELECT id, 2, power2 FROM power UNION ALL
    SELECT id, 3, power3 FROM power UNION ALL
    SELECT id, 4, power4 FROM power UNION ALL
    SELECT id, 5, power5 FROM power UNION ALL
    SELECT id, 6, power6 FROM power UNION ALL
    SELECT id, 7, power7 FROM power UNION ALL
    SELECT id, 8, power8 FROM power UNION ALL
    SELECT id, 9, power9 FROM power UNION ALL
    SELECT id, 10, power10 FROM power UNION ALL
    SELECT id, 11, power11 FROM power UNION ALL
    SELECT id, 12, power12 FROM power UNION ALL
    SELECT id, 13, power13 FROM power UNION ALL
    SELECT id, 14, power14 FROM power UNION ALL
    SELECT id, 15, power15 FROM power UNION ALL
    SELECT id, 16, power16 FROM power UNION ALL
    SELECT id, 17, power17 FROM power UNION ALL
    SELECT id, 18, power18 FROM power UNION ALL
    SELECT id, 19, power19 FROM power UNION ALL
    SELECT id, 20, power20 FROM power
    ORDER BY id DESC   -- applies to the whole UNION, not just the last branch
    LIMIT 2000
) a
INNER JOIN
(
    SELECT id, 1 AS power_col, power1 AS power_val FROM power UNION ALL
    SELECT id, 2, power2 FROM power UNION ALL
    SELECT id, 3, power3 FROM power UNION ALL
    SELECT id, 4, power4 FROM power UNION ALL
    SELECT id, 5, power5 FROM power UNION ALL
    SELECT id, 6, power6 FROM power UNION ALL
    SELECT id, 7, power7 FROM power UNION ALL
    SELECT id, 8, power8 FROM power UNION ALL
    SELECT id, 9, power9 FROM power UNION ALL
    SELECT id, 10, power10 FROM power UNION ALL
    SELECT id, 11, power11 FROM power UNION ALL
    SELECT id, 12, power12 FROM power UNION ALL
    SELECT id, 13, power13 FROM power UNION ALL
    SELECT id, 14, power14 FROM power UNION ALL
    SELECT id, 15, power15 FROM power UNION ALL
    SELECT id, 16, power16 FROM power UNION ALL
    SELECT id, 17, power17 FROM power UNION ALL
    SELECT id, 18, power18 FROM power UNION ALL
    SELECT id, 19, power19 FROM power UNION ALL
    SELECT id, 20, power20 FROM power
    ORDER BY id DESC
    LIMIT 2000
) b
    ON  a.id = b.id
    -- "<" instead of "!=": with "!=" every pair is matched twice (col i/col j and col j/col i),
    -- which doubles every frequency.
    AND a.power_col < b.power_col
GROUP BY p
ORDER BY frequency DESC, p ASC
LIMIT 0, 10
Note using normalised data structures would likely be far quicker.
EDIT
Think something like the following might give you what you need.
The big sub query is to get every possible combination (idea is to also cope with pairs that have never been used), with the first number being smaller than the 2nd just for consistency. This is then joined against the tables of data to get the matching numbers and the respective id fields. Then uses MIN to get the smallest id:-
-- For every possible pair of numbers (smaller number first), the smallest id of a row holding the
-- first number in a set that also holds the second number. Pairs never drawn together are kept and
-- get NULL.
-- MIN gives the earliest such row; to find how long ago a pair was LAST drawn together, use
-- MAX(d1.id) (or MAX(d1.`set`)) instead.
SELECT all_combo.num_1,
       all_combo.num_2,
       MIN(d1.id)
FROM
(
    -- every combination of two distinct numbers that occur in the data
    SELECT sub0.nr AS num_1,
           sub1.nr AS num_2
    FROM
    (
        SELECT DISTINCT nr
        FROM data
    ) sub0
    INNER JOIN
    (
        SELECT DISTINCT nr
        FROM data
    ) sub1
        ON sub0.nr < sub1.nr
) all_combo
-- d1/d2 are joined to each other first, so a match exists only when BOTH numbers are in one set;
-- chaining two separate LEFT JOINs would let d1 match sets that do not contain num_2.
LEFT OUTER JOIN
(
    data d1
    INNER JOIN data d2
        ON d2.`set` = d1.`set`
)
    ON  d1.nr = all_combo.num_1
    AND d2.nr = all_combo.num_2
GROUP BY all_combo.num_1,
         all_combo.num_2

Related

Adding a moving average column to a table using values from previous 2 entries

I currently have the following simplified tables in my database. The points table contains rows of points awarded to each user for every bid form they have voted in.
I would like to add a column to this table that for each row, it shows the AVERAGE of the previous TWO points awarded to THAT user.
Users
+----+----------------------+
| id | name |
+----+----------------------+
| 1 | Flossie Schamberger |
| 2 | Lawson Graham |
| 3 | Hadley Reilly |
+----+----------------------+
Bid Forms
+----+-----------------+
| id | name |
+----+-----------------+
| 1 | Summer 2017 |
| 2 | Winter 2017 |
| 3 | Summer 2018 |
| 4 | Winter 2019 |
| 5 | Summer 2019 |
+----+-----------------+
Points
+-----+---------+--------------------+------------+------------+
| id | user_id | leave_bid_forms_id | bid_points | date |
+-----+---------+--------------------+------------+------------+
| 1 | 1 | 1 | 6 | 2016-06-19 |
| 2 | 2 | 1 | 8 | 2016-06-19 |
| 3 | 3 | 1 | 10 | 2016-06-19 |
| 4 | 1 | 2 | 4 | 2016-12-18 |
| 5 | 2 | 2 | 8 | 2016-12-18 |
| 6 | 3 | 2 | 4 | 2016-12-18 |
| 7 | 1 | 3 | 10 | 2017-06-18 |
| 8 | 2 | 3 | 12 | 2017-06-18 |
| 9 | 3 | 3 | 4 | 2017-06-18 |
| 10 | 1 | 4 | 4 | 2017-12-17 |
| 11 | 2 | 4 | 4 | 2017-12-17 |
| 12 | 3 | 4 | 2 | 2017-12-17 |
| 13 | 1 | 5 | 16 | 2018-06-17 |
| 14 | 2 | 5 | 12 | 2018-06-17 |
| 15 | 3 | 5 | 10 | 2018-06-17 |
+-----+---------+--------------------+------------+------------+
For each row in the points table I would like an average_points column to be calculated like follows.
The average point column is the average of that users PREVIOUS 2 points. So for the first entry in the table for each user, the average is obviously 0 because there were no previous points awarded to them.
The previous 2 points for each user should be determined using the date column.
The table below is what I would like to have as the final output.
For clarity, to the side of the table, I have added the calculation and numbers used to arrive at the value in the averaged_points column.
+-----+---------+--------------------+------------+-----------------+
| id | user_id | leave_bid_forms_id | date | averaged_points |
+-----+---------+--------------------+------------+-----------------+
| 1 | 1 | 1 | 2016-06-19 | 0 | ( 0 + 0 ) / 2
| 2 | 2 | 1 | 2016-06-19 | 0 | ( 0 + 0 ) / 2
| 3 | 3 | 1 | 2016-06-19 | 0 | ( 0 + 0 ) / 2
| 4 | 1 | 2 | 2016-12-18 | 3 | ( 6 + 0 ) / 2
| 5 | 2 | 2 | 2016-12-18 | 4 | ( 8 + 0 ) / 2
| 6 | 3 | 2 | 2016-12-18 | 5 | ( 10 + 0) / 2
| 7 | 1 | 3 | 2017-06-18 | 5 | ( 4 + 6 ) / 2
| 8 | 2 | 3 | 2017-06-18 | 8 | ( 8 + 8 ) / 2
| 9 | 3 | 3 | 2017-06-18 | 7 | ( 4 + 10) / 2
| 10 | 1 | 4 | 2017-12-17 | 7 | ( 10 + 4) / 2
| 11 | 2 | 4 | 2017-12-17 | 10 | ( 12 + 8) / 2
| 12 | 3 | 4 | 2017-12-17 | 4 | ( 4 + 4 ) / 2
| 13 | 1 | 5 | 2018-06-17 | 7 | ( 4 + 10) / 2
| 14 | 2 | 5 | 2018-06-17 | 8 | ( 4 + 12) / 2
| 15 | 3 | 5 | 2018-06-17 | 3 | ( 2 + 4 ) / 2
+-----+---------+--------------------+------------+-----------------+
I've been trying to use subqueries to solve this issue as AVG doesn't seem to be affected by any LIMIT clause I have.
So far I have come up with
-- Attempt at a per-row average of the previous two bid_points.
-- NOTE: the inner query never refers to p1, so it is not correlated with the outer row; it takes
-- the two rows with the latest date in the whole table, and every output row gets that same average.
select id, user_id, leave_bid_forms_id, `date`,
(
SELECT
AVG(bid_points)
FROM (
Select `bid_points`
FROM points as p2
ORDER BY p2.date DESC
Limit 2
) as thing
) AS average_points
from points as p1
This is in this sqlfiddle but to be honest I'm out of my depth here.
Am I on the right path? Wondering if someone would be able to show me where I need to tweak things please!
Thanks.
EDIT
Using the answer below as a basis, I was able to tweak the SQL to work with the tables provided in the original sqlfiddle.
I have added that to this sqlfiddle to show it working
The corrected sql to match the code above is
-- Average of the two awards that precede each row for the same user (ordered by date).
-- A missing previous award counts as 0 and the sum is always divided by 2, which is what the
-- expected-output table asks for: (0 + 0) / 2 = 0 for a user's first award and (6 + 0) / 2 = 3
-- for the second. Dividing by the number of non-NULL values would give 6 for that second row.
select with_prev.*,
       (coalesce(with_prev.points_1, 0) + coalesce(with_prev.points_2, 0)) / 2 as prev_2_avg
from (select p.*,
             -- most recent earlier award of this user
             (select p2.bid_points
              from points p2
              where p2.user_id = p.user_id
                and p2.date < p.date
              order by p2.date desc, p2.id desc   -- id breaks ties between equal dates
              limit 1
             ) as points_1,
             -- the award before that one: skip 1 row, take 1
             (select p2.bid_points
              from points p2
              where p2.user_id = p.user_id
                and p2.date < p.date
              order by p2.date desc, p2.id desc
              limit 1, 1
             ) as points_2
      from points as p
     ) with_prev;
I am, however, about to ask another question about the best way to make this dynamic in the number of previous points that need to be averaged.
You can use window functions, which were introduced in MySQL 8.
-- MySQL 8+: average of the (up to) two rows that precede the current one for the same user.
-- Uses the question's schema (table `points`, column `bid_points`).
-- The frame excludes the current row; a user's first row has an empty frame, so AVG returns NULL
-- there, and the second row is averaged over a single value.
select p.*,
       avg(p.bid_points) over (partition by p.user_id
                               order by p.`date`
                               rows between 2 preceding and 1 preceding
                              ) as prev_2_avg
from points p;
In earlier versions, this is a real pain, because MySQL does not support nested correlation clauses. One method is with a separate column for each one:
-- Pre-8.0 variant: fetch the two preceding awards as separate columns, then average whichever exist.
-- Uses the question's schema (table `points`, column `bid_points`).
-- The divisor is the number of non-NULL previous awards (a boolean evaluates to 1 or 0), so a
-- user's first row divides by zero and yields NULL.
select with_prev.*,
       ( (coalesce(with_prev.points_1, 0) + coalesce(with_prev.points_2, 0)) /
         ( (with_prev.points_1 is not null) + (with_prev.points_2 is not null) )
       ) as prev_2_avg
from (select p.*,
             -- most recent earlier award of this user
             (select p2.bid_points
              from points p2
              where p2.user_id = p.user_id and
                    p2.date < p.date
              order by p2.date desc
              limit 1
             ) as points_1,
             -- the award before that one: skip 1 row, take 1
             (select p2.bid_points
              from points p2
              where p2.user_id = p.user_id and
                    p2.date < p.date
              order by p2.date desc
              limit 1, 1
             ) as points_2
      from points p
     ) with_prev;

Not able to order column in mysql

I am using this table from the Northwind dataset (can be generated from query below)
+-----------+-----------+
| NumOrders | CustCount |
+-----------+-----------+
| 1 | 1 |
| 2 | 2 |
| 3 | 7 |
| 4 | 6 |
| 5 | 10 |
| 6 | 8 |
| 7 | 7 |
| 8 | 4 |
| 9 | 5 |
| 10 | 11 |
| 11 | 4 |
| 12 | 3 |
| 13 | 3 |
| 14 | 6 |
| 15 | 3 |
| 17 | 1 |
| 18 | 3 |
| 19 | 2 |
| 28 | 1 |
| 30 | 1 |
| 31 | 1 |
+-----------+-----------+
And I want to write a query to provide a histogram of the number of x people who made y number of orders
-- Buckets customers by the number of orders they placed and sums the customer counts per bucket.
-- NOTE(review): "alias = expression" (CustomerCount = ..., CustCount = ...) is the SQL Server way
-- of naming a column; MySQL would parse it as an equality comparison -- confirm the target engine.
-- There is no ORDER BY, so the order in which the buckets come back is not guaranteed.
select
case
when NumOrders > 0 and NumOrders <= 5 then '0 - 5'
when NumOrders > 5 and NumOrders <=10 then '6 - 10'
else '10+'
end as Bucket,
CustomerCount = sum(CustCount)
from (
select
NumOrders,
CustCount = count(*)
from (
select *
from (
select
CustomerID,
count(*) as NumOrders
from orders
group by CustomerID
) c
) b
group by NumOrders
)a
group by
(
case
when NumOrders > 0 and NumOrders <= 5 then '0 - 5'
when NumOrders > 5 and NumOrders <=10 then '6 - 10'
else '10+'
end
)
From the query above I am getting this output, which is ordered incorrectly.
+--------+---------------+
| Bucket | CustomerCount |
+--------+---------------+
| 0 - 5 | 26 |
| 10+ | 28 |
| 6 - 10 | 35 |
+--------+---------------+
I would like it to be ordered as
+--------+---------------+
| Bucket | CustomerCount |
+--------+---------------+
| 0 - 5 | 26 |
| 6 - 10 | 35 |
| 10+ | 28 |
+--------+---------------+
Can someone suggest how to order it correctly?
You just need
ORDER BY MIN(NumOrders) -- NumOrders is not a grouping column, so sort by the smallest value inside each bucket
at the very end of your query
I can't see what part of the problem this fails to solve...
-- Fixture: the NumOrders / CustCount table from the question.
DROP TABLE IF EXISTS my_table;

CREATE TABLE my_table
(NumOrders SERIAL PRIMARY KEY
,CustCount INT NOT NULL
);

INSERT INTO my_table (NumOrders, CustCount) VALUES
(1 ,1),
(2 ,2),
(3 ,7),
(4 ,6),
(5 ,10),
(6 ,8),
(7 ,7),
(8 ,4),
(9 ,5),
(10,11),
(11,4),
(12,3),
(13,3),
(14,6),
(15,3),
(17,1),
(18,3),
(19,2),
(28,1),
(30,1),
(31,1);

-- Number of fixture rows per bucket, listed in ascending bucket order.
-- Sorting by MIN(numorders) keeps the ordering deterministic and valid under ONLY_FULL_GROUP_BY;
-- a bare numorders is not a grouping column.
SELECT CASE WHEN numorders BETWEEN 0 AND 5 THEN '0-5'
            WHEN numorders BETWEEN 6 AND 10 THEN '6-10'
            ELSE '+10' END bucket
     , COUNT(*) total
  FROM my_table
 GROUP
    BY bucket
 ORDER
    BY MIN(numorders);
+--------+-------+
| bucket | total |
+--------+-------+
| 0-5 | 5 |
| 6-10 | 5 |
| +10 | 11 |
+--------+-------+

SUM from the results of a subquery of N results as max for each user

Let's suppose this schema:
-- One row per test result of a user.
CREATE TABLE test
(
    test_Id INT NOT NULL PRIMARY KEY AUTO_INCREMENT,
    user_Id INT NOT NULL,
    date DATE,
    result VARCHAR(255) NOT NULL   -- no comma after the last column: a trailing comma is a syntax error
) engine=innodb;
My goal is to pick up at most the last 5 results for each distinct user_Id, ordered from newest to oldest. In addition, based on the result column, I want to calculate a ratio over those last results, so that I can pick the 3 users with the best ratio.
So let's take this data as example:
test_Id | user_Id | date | result
1 | 1 |2016-09-05 | A
2 | 3 |2016-09-13 | A
3 | 3 |2016-09-30 | A
4 | 4 |2016-09-22 | A
5 | 4 |2016-09-11 | C
6 | 7 |2016-09-18 | D
7 | 4 |2016-09-08 | B
8 | 6 |2016-09-20 | E
9 | 7 |2016-09-16 | A
10 | 7 |2016-09-29 | E
11 | 7 |2016-09-23 | A
12 | 7 |2016-09-16 | B
13 | 4 |2016-09-15 | B
14 | 7 |2016-09-07 | C
15 | 7 |2016-09-09 | A
16 | 3 |2016-09-26 | A
17 | 4 |2016-09-11 | C
18 | 4 |2016-09-30 | E
What I have been able to achieve is this query:
-- Numbers each user's results from newest to oldest with user variables and flags A/B results.
-- User variables are written with '@'; a '#' would start a comment in MySQL and cut the
-- statement short.
SELECT p.user_Id, p.RowNumber, p.date, p.result,
SUM(CASE WHEN p.result='A' OR p.result='B'
THEN 1 ELSE 0 END) as avg
FROM (
SELECT @row_num := IF(@prev_value=user_Id,@row_num+1,1)
AS RowNumber, test_Id, user_Id, date, result,
@prev_value := user_Id
FROM test,
(SELECT @row_num := 1) x,
(SELECT @prev_value := '') y
WHERE @prev_value < 5
ORDER BY user_Id, YEAR(date) DESC, MONTH(date) DESC,
DAY(date) DESC
) p
WHERE p.RowNumber <=10
-- grouping by test_Id as well makes every group a single row, so the SUM is only ever 0 or 1
GROUP BY p.user_Id, p.test_Id
ORDER BY p.user_Id, p.RowNumber;
This query provides me this kind of output:
RowNumber |test_Id | user_Id | date | result | avg
1 | 1 | 1 |2016-09-05 | A | 1
1 | 3 | 3 |2016-09-30 | A | 1
2 | 16 | 3 |2016-09-26 | A | 1
3 | 2 | 3 |2016-09-13 | A | 1
1 | 18 | 4 |2016-09-30 | E | 0
2 | 4 | 4 |2016-09-22 | A | 1
3 | 13 | 4 |2016-09-15 | B | 1
4 | 5 | 4 |2016-09-11 | C | 0
5 | 17 | 4 |2016-09-11 | C | 0
1 | 8 | 6 |2016-09-20 | E | 0
1 | 10 | 7 |2016-09-29 | E | 0
2 | 11 | 7 |2016-09-23 | A | 1
3 | 6 | 7 |2016-09-18 | D | 0
4 | 9 | 7 |2016-09-16 | A | 1
5 | 12 | 7 |2016-09-16 | B | 1
What I was expecting is that in the avg column would get the total of the results for each user that match the condition (A or B value), to be able to calculate a ratio from the 5 results for each user_id. (0, 0.2, 0.4, 0.6, 0.8, 1).
Something like this:
RowNumber |test_Id | user_Id | date | result | avg
1 | 1 | 1 |2016-09-05 | A | 1
1 | 3 | 3 |2016-09-30 | A | 3
2 | 16 | 3 |2016-09-26 | A | 3
3 | 2 | 3 |2016-09-13 | A | 3
1 | 18 | 4 |2016-09-30 | E | 2
2 | 4 | 4 |2016-09-22 | A | 2
3 | 13 | 4 |2016-09-15 | B | 2
4 | 5 | 4 |2016-09-11 | C | 2
5 | 17 | 4 |2016-09-11 | C | 2
1 | 8 | 6 |2016-09-20 | E | 0
1 | 10 | 7 |2016-09-29 | E | 3
2 | 11 | 7 |2016-09-23 | A | 3
3 | 6 | 7 |2016-09-18 | D | 3
4 | 9 | 7 |2016-09-16 | A | 3
5 | 12 | 7 |2016-09-16 | B | 3
Am I being restricted by the GROUP BY p.user_Id, p.test_Id clause when doing the SUM? I tried the query with only user_Id as GROUP BY clause and only test_Id too as GROUP BY clause, without getting the expected results.
I think you need to calculate the avg and then join
-- The (at most) 5 newest results per user, each row carrying the number of A/B results among
-- those 5. The per-user total is computed in its own derived table and joined back.
-- User variables are written with '@'; a '#' would start a comment in MySQL.
select a.rn, a.test_id, a.user_id, a.date, a.result, u.avg
from
(
    -- number every user's rows from newest to oldest
    select t1.*
         , if(t1.user_id <> @p, @rn := 1, @rn := @rn + 1) rn
         , @p := t1.user_id p
    from (select @rn := 0, @p := '') rn, test t1
    order by t1.user_id, t1.date desc
) a
join
(
    -- per user: how many of the 5 newest results are 'A' or 'B'
    select s.user_id
         , sum(case when s.result = 'A' or s.result = 'B' then 1 else 0 end) as avg
    from
    (
        select t1.*
             , if(t1.user_id <> @p, @rn := 1, @rn := @rn + 1) rn
             , @p := t1.user_id p
        from (select @rn := 0, @p := '') rn, test t1
        order by t1.user_id, t1.date desc
    ) s
    where s.rn <= 5
    group by s.user_id
) u on u.user_id = a.user_id
where a.rn <= 5

How do i perform grouped ranking in MySQL with row ties?

ID_STUDENT | ID_CLASS | GRADE | RANK
2 | 1 | 90 | 1
1 | 1 | 90 | 1
3 | 1 | 90 | 1
4 | 1 | 70 | 4
6 | 2 | 90 | 1
1 | 2 | 80 | 2
5 | 2 | 78 | 3
7 | 3 | 90 | 1
6 | 3 | 50 | 2
How should i sort and rank the data to get the above result? Thanks in advance
One method is with a subquery:
-- Rank with ties inside each class: 1 + the number of classmates with a strictly higher grade,
-- so equal grades share a rank and the next rank is skipped (90, 90, 90, 70 -> 1, 1, 1, 4).
select sc.*,
       (select count(*) + 1
        from studentclass sc2
        where sc2.id_class = sc.id_class
          and sc2.grade > sc.grade
       ) as `rank`   -- RANK is a reserved word since MySQL 8.0.2, hence the backticks
from studentclass sc
order by sc.id_class, sc.grade desc;   -- best grade first within each class, as in the expected output
In ANSI SQL (and most other databases), this is provided by the rank() function. You might be interested in the differences between rank(), dense_rank(), and row_number().

Complex SQL query suggestions please

I have three tables with schema as below:
Table: Apps
| ID (bigint) | USERID (Bigint)| START_TIME (datetime) |
-------------------------------------------------------------
| 1 | 13 | 2013-05-03 04:42:55 |
| 2 | 13 | 2013-05-12 06:22:45 |
| 3 | 13 | 2013-06-12 08:44:24 |
| 4 | 13 | 2013-06-24 04:20:56 |
| 5 | 13 | 2013-06-26 08:20:26 |
| 6 | 13 | 2013-09-12 05:48:27 |
Table: Hosts
| ID (bigint) | APPID (Bigint)| DEVICE_ID (Bigint) |
-------------------------------------------------------------
| 1 | 1 | 1 |
| 2 | 2 | 1 |
| 3 | 1 | 1 |
| 4 | 3 | 3 |
| 5 | 1 | 4 |
| 6 | 2 | 3 |
Table: Usage
| ID (bigint) | APPID (Bigint)| HOSTID (Bigint) | Factor (varchar) |
-------------------------------------------------------------------------------------
| 1 | 1 | 1 | Low |
| 2 | 1 | 3 | High |
| 3 | 2 | 2 | Low |
| 4 | 3 | 4 | Medium |
| 5 | 1 | 5 | Low |
| 6 | 2 | 2 | Medium |
Now, given a userid as input, I want to get the count of Usage rows (across all apps) for each "Factor", month by month, for the last 6 months.
If a DEVICE_ID appears more than once in a month (based on START_TIME, found by joining Apps and Hosts), only the latest rows of Usage (based on the combination of Apps, Hosts and Usage) should be considered when calculating the count.
Example output of the query for the above example should be: (for input user id=13)
| MONTH | USAGE_COUNT | FACTOR |
-------------------------------------------------------------
| 5 | 0 | High |
| 6 | 0 | High |
| 7 | 0 | High |
| 8 | 0 | High |
| 9 | 0 | High |
| 10 | 0 | High |
| 5 | 2 | Low |
| 6 | 0 | Low |
| 7 | 0 | Low |
| 8 | 0 | Low |
| 9 | 0 | Low |
| 10 | 0 | Low |
| 5 | 1 | Medium |
| 6 | 1 | Medium |
| 7 | 0 | Medium |
| 8 | 0 | Medium |
| 9 | 0 | Medium |
| 10 | 0 | Medium |
How is this calculated?
For Month May 2013 (05-2013), there are two Apps from table Apps
In table Hosts , these apps are associated with device_id's 1,1,1,4,3
For this month (05-2013) for device_id=1, the latest value of start_time is: 2013-05-12 06:22:45 (from tables hosts,apps), so in table Usage, look for combination of appid=2&hostid=2 for which there are two rows one with factor Low and other Medium,
For this month (05-2013) for device_id=4, by following same procedure we get one entry i.e 0 Low
Similarly all the values are calculated.
To get the last 6 months via query i'm trying to get it with the following:
-- Month numbers of the current month and the five months before it.
SELECT MONTH(DATE_ADD(NOW(), INTERVAL aInt MONTH)) AS aMonth
FROM
(
    SELECT 0 AS aInt UNION SELECT -1 UNION SELECT -2 UNION SELECT -3 UNION SELECT -4 UNION SELECT -5
) AS last_six   -- MySQL requires every derived table to have an alias
Please check sqlfiddle: http://sqlfiddle.com/#!2/55fc2
Because the calculation you're doing involves the same join multiple times, I started by creating a view.
-- Apps started within the last 7 months, with their hosts and usage rows where present
-- (outer joins keep apps that have no host or no usage row).
CREATE VIEW `app_host_usage` AS
SELECT
    app.id      AS appid,
    host.id     AS hostid,
    use_row.id  AS usageid,
    app.userid,
    app.start_time,
    host.device_id,
    use_row.factor
FROM apps AS app
LEFT JOIN hosts AS host
    ON host.appid = app.id
LEFT JOIN `usage` AS use_row
    ON  use_row.appid  = app.id
    AND use_row.hostid = host.id
WHERE app.start_time > DATE_SUB(NOW(), INTERVAL 7 MONTH)
The WHERE condition is there because I made the assumption that you don't want July 2005 and July 2006 to be grouped together in the same count.
With that view in place, the query becomes
-- Report for user 13: one row per (factor, month) over the last six months, counting the distinct
-- devices that have a matching row in `usageids`.
-- months x factors is a full cross product (the JOIN has no ON clause), and `usageids` is
-- outer-joined, so combinations without a match still appear; COUNT(DISTINCT ...) ignores the NULL
-- device_id there and returns 0.
-- (MONTH(NOW()) + aInt + 11) % 12 + 1 turns the offsets 0..-5 into month numbers 1..12, wrapping
-- around the start of the year.
SELECT months.Month, COUNT(DISTINCT device_id), factors.factor
FROM
(
-- Get the last six months
SELECT (MONTH(NOW()) + aInt + 11) % 12 + 1 "Month" FROM
(SELECT 0 AS aInt UNION SELECT -1 UNION SELECT -2 UNION SELECT -3 UNION SELECT -4 UNION SELECT -5) LastSix
) months
JOIN
(
-- Get all known factors
SELECT DISTINCT factor FROM `usage`
) factors
LEFT OUTER JOIN
(
-- Get factors for each device...
SELECT
MONTH(start_time) "Month",
device_id,
factor
FROM app_host_usage a
WHERE userid=13
AND start_time IN (
-- ...where the corresponding usage row is connected
-- to an app row with the highest start time of the
-- month for that device.
SELECT MAX(start_time)
FROM app_host_usage a2
WHERE a2.device_id = a.device_id
GROUP BY MONTH(start_time)
)
GROUP BY MONTH(start_time), device_id, factor
) usageids ON usageids.Month = months.Month
AND usageids.factor = factors.factor
GROUP BY factors.factor, months.Month
ORDER BY factors.factor, months.Month
which is insanely complicated, but I've tried to comment explaining what each part does. See this sqlfiddle: http://sqlfiddle.com/#!2/5c871/1/0