Get 100 rows with maximum of 10 rows per group - mysql

I have the following query, and I would like to get 100 items from the database, but host_id is in the urls table many times, and I would like to get a maximum of 10 unique rows from that table per host_id.
-- URLs that were never run or were last run 30+ days ago and are not ignored,
-- joined to their host. Caps the total at 100 rows; nothing here limits the
-- number of rows per host.
SELECT *
FROM urls
INNER JOIN hosts USING (host_id)
WHERE ignore_url <> 1
  AND (
        last_run_date IS NULL
        OR last_run_date <= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
      )
LIMIT 100
So, I would like:
Maximum Results = 100
Max Rows Per Host = 10
I am not sure what I would need to do to accomplish this task. Is there a way to do this without a subquery?
Hosts Table
-- One row per host: auto-increment primary key, unique host name.
CREATE TABLE `hosts` (
    `host_id`      INT(10) UNSIGNED    NOT NULL AUTO_INCREMENT,
    `host`         VARCHAR(50)         NOT NULL,
    `last_fetched` DATETIME            NOT NULL DEFAULT CURRENT_TIMESTAMP,
    `ignore_host`  TINYINT(1) UNSIGNED NOT NULL,
    PRIMARY KEY (`host_id`),
    UNIQUE KEY `host` (`host`)
)
Urls Table
-- One row per URL; a URL is unique per (host_id, path, query).
-- last_run_date stays NULL until the URL has been run for the first time.
CREATE TABLE `urls` (
    `url_id`        INT(10) UNSIGNED    NOT NULL AUTO_INCREMENT,
    `parent_url_id` INT(10) UNSIGNED    NOT NULL,
    `scheme`        VARCHAR(5)          NOT NULL,
    `host_id`       INT(10) UNSIGNED    NOT NULL,
    `path`          VARCHAR(500)        NOT NULL,
    `query`         VARCHAR(500)        NOT NULL,
    `date_found`    TIMESTAMP           NOT NULL DEFAULT CURRENT_TIMESTAMP,
    `last_run_date` DATETIME            NULL DEFAULT NULL,
    `ignore_url`    TINYINT(1) UNSIGNED NOT NULL,
    PRIMARY KEY (`url_id`),
    UNIQUE KEY `host_path_query` (`host_id`, `path`, `query`)
)

That's it (I hope).
I can't test it for real because I have no data. Please test it and let me know whether it works.
-- Up to 100 stale, non-ignored URLs with at most 10 rows per host.
-- The inner query numbers each host's rows 1..n (ordered by last_run_date)
-- using MySQL user variables; the outer query keeps numbers 1-10 and joins
-- the host data.
-- NOTE(review): MySQL does not guarantee the evaluation order of user-variable
-- assignments relative to ORDER BY. On MySQL 8.0+ prefer
-- ROW_NUMBER() OVER (PARTITION BY host_id ORDER BY last_run_date).
SELECT *
FROM (
    SELECT
        -- restart the counter at 1 whenever the host changes
        @nr := IF(@lasthost = u.host_id, @nr + 1, 1) AS nr,
        u.*,
        -- remember this row's host for the comparison on the next row
        @lasthost := u.host_id AS lasthost
    FROM urls AS u
    CROSS JOIN (SELECT @nr := 0, @lasthost := -1) AS tmp
    WHERE (
            u.last_run_date IS NULL
            OR u.last_run_date <= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
          )
      AND u.ignore_url != 1
    ORDER BY u.host_id, u.last_run_date
) AS t
LEFT JOIN hosts USING (host_id)
WHERE t.nr <= 10
LIMIT 100;
OK, here is how it works.
First:
I select only the rows that match your query's conditions and order them by host_id and last_run_date, so that all rows of one host are adjacent.
-- Step 1: the candidate rows only, sorted so that all rows of one host are
-- adjacent. No counters are involved at this stage.
SELECT
    u.*
FROM urls AS u
WHERE (
        u.last_run_date IS NULL
        OR u.last_run_date <= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
      )
  AND u.ignore_url != 1
ORDER BY u.host_id, u.last_run_date
Second:
I add two variables, nr and lasthost, and initialise them inside the SELECT. nr is incremented on every row and reset to 1 whenever host_id changes, so the rows of each host_id end up numbered from 1 to n.
-- Step 2: the same rows, numbered 1..n within each host_id.
-- nr restarts at 1 whenever host_id differs from the previous row's host_id,
-- which is remembered in @lasthost.
-- NOTE(review): relies on rows being processed in ORDER BY order, which MySQL
-- does not guarantee for user-variable assignments.
SELECT
    @nr := IF(@lasthost = u.host_id, @nr + 1, 1) AS nr,
    u.*,
    @lasthost := u.host_id AS lasthost
FROM urls AS u
CROSS JOIN (SELECT @nr := 0, @lasthost := -1) AS tmp
WHERE (
        u.last_run_date IS NULL
        OR u.last_run_date <= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
      )
  AND u.ignore_url != 1
ORDER BY u.host_id, u.last_run_date
Third:
I wrap this query in an outer SELECT so that I can join your second table, keep only the rows whose number is below 11, and limit the whole result to 100 rows.
-- Step 3: wrap the numbered rows, keep at most 10 per host, attach the host
-- columns and cap the whole result at 100 rows.
-- NOTE(review): MySQL does not guarantee the evaluation order of user-variable
-- assignments relative to ORDER BY. On MySQL 8.0+ prefer
-- ROW_NUMBER() OVER (PARTITION BY host_id ORDER BY last_run_date).
SELECT *
FROM (
    SELECT
        -- restart the counter at 1 whenever the host changes
        @nr := IF(@lasthost = u.host_id, @nr + 1, 1) AS nr,
        u.*,
        -- remember this row's host for the comparison on the next row
        @lasthost := u.host_id AS lasthost
    FROM urls AS u
    CROSS JOIN (SELECT @nr := 0, @lasthost := -1) AS tmp
    WHERE (
            u.last_run_date IS NULL
            OR u.last_run_date <= DATE_SUB(CURDATE(), INTERVAL 30 DAY)
          )
      AND u.ignore_url != 1
    ORDER BY u.host_id, u.last_run_date
) AS t
LEFT JOIN hosts USING (host_id)
WHERE t.nr <= 10
LIMIT 100;
Thats all

So you need a limited JOIN. Another guess:
-- Hosts joined to their urls, restricted to the host_ids found in 10 urls rows.
-- NOTE(review): this limits WHICH hosts are returned (those of 10 arbitrary
-- urls rows); it does not cap the number of rows per host.
SELECT *
FROM hosts
INNER JOIN urls
    ON urls.host_id = hosts.host_id
WHERE urls.host_id IN (
    -- MySQL rejects LIMIT directly inside an IN (...) subquery, so the limited
    -- set is wrapped in a derived table.
    SELECT first_urls.host_id
    FROM (
        SELECT host_id
        FROM urls
        LIMIT 0, 10
    ) AS first_urls
)
LIMIT 0, 100

Related

SQL query to select all rows with max column value

-- Activity log: one row per request/response payload, keyed by an
-- auto-increment id.
CREATE TABLE `user_activity` (
    `id`         int(11) unsigned            NOT NULL AUTO_INCREMENT,
    `user_id`    int(11)                     DEFAULT NULL,
    `type`       enum('request','response')  DEFAULT NULL,
    `data`       longtext                    NOT NULL,
    `created_at` datetime                    DEFAULT NULL,
    `source`     varchar(255)                DEFAULT NULL,
    `task_name`  varchar(255)                DEFAULT NULL,
    PRIMARY KEY (`id`)
);
I have this data:-
Now I need to select all rows for user_id=527 where created_at value is the maximum. So I need the last 3 rows in this image.
I wrote this query:-
-- Every row for user 527 / source 'E1' / the two listed tasks whose created_at
-- equals the newest created_at among those same rows.
SELECT *
FROM user_activity
WHERE user_id = 527
  AND source = 'E1'
  AND task_name IN ('GetReportTask', 'StopMonitoringUserTask')
  AND created_at = (
        -- same filter as the outer query, reduced to its newest timestamp
        SELECT MAX(created_at)
        FROM user_activity
        WHERE user_id = 527
          AND source = 'E1'
          AND task_name IN ('GetReportTask', 'StopMonitoringUserTask')
      );
This is very inefficient because I am running the exact same query again as an inner query except that it disregards created_at. What's the right way to do this?
I would use a correlated subquery:
-- Same result via a correlated subquery: user_id and source are written once
-- in the outer query and the inner query picks them up from the outer row.
SELECT act.*
FROM user_activity AS act
WHERE act.user_id = 527
  AND act.source = 'E1'
  AND act.task_name IN ('GetReportTask', 'StopMonitoringUserTask')
  AND act.created_at = (
        SELECT MAX(peer.created_at)
        FROM user_activity AS peer
        WHERE peer.user_id = act.user_id
          AND peer.source = act.source
          AND peer.task_name IN ('GetReportTask', 'StopMonitoringUserTask')
      );
Although this might seem inefficient, you can create an index on user_activity(user_id, source, task_name, created_at). With this index, the query should have decent performance.
Order by created_at desc and limit your query to return 1 row.
-- Newest matching row only. LIMIT 1 returns exactly one row, so when several
-- rows share the newest created_at only one of them comes back.
SELECT *
FROM user_activity
WHERE task_name IN ('GetReportTask', 'StopMonitoringUserTask')
  AND source = 'E1'
  AND user_id = 527
ORDER BY created_at DESC
LIMIT 1;
I used EverSQL and applied my own changes to come up with this single-select query that uses self-join:-
-- Anti-join form: keep a row only when no later row exists for the same
-- user_id/source among the two tasks. Because of SELECT *, the (all-NULL)
-- columns of the right-hand side are part of the output too.
SELECT *
FROM user_activity AS cur
LEFT JOIN user_activity AS later
    ON later.user_id = cur.user_id
   AND later.source = cur.source
   AND later.task_name IN ('GetReportTask', 'StopMonitoringUserTask')
   AND later.created_at > cur.created_at
WHERE cur.user_id = 527
  AND cur.source = 'E1'
  AND cur.task_name IN ('GetReportTask', 'StopMonitoringUserTask')
  -- no later row matched
  AND later.created_at IS NULL;
However, I noticed that the response times of both queries were similar. I tried to use Explain to identify any performance differences; and from what I understood from its output, there are no noticeable differences because proper indexing is in place. So for readability and maintainability, I'll just use the nested query.

Load top 5 records per date

I have a table, in which there are date wise quiz score of different users. I want to load top 5 scorers for every date.
Table sample create statement:
-- One score row per subscriber (msisdn) per date.
CREATE TABLE `subscriber_score` (
    `msisdn`                varchar(25) COLLATE utf8_unicode_ci NOT NULL,
    `date`                  date    NOT NULL,
    `score`                 int(11) NOT NULL DEFAULT '0',
    `total_questions_sent`  int(11) NOT NULL DEFAULT '0',
    `total_correct_answers` int(11) NOT NULL DEFAULT '0',
    `total_wrong_answers`   int(11) NOT NULL DEFAULT '0',
    PRIMARY KEY (`msisdn`, `date`),
    KEY `fk_subscriber_score_subscriber1` (`msisdn`),
    CONSTRAINT `fk_subscriber_score_subscriber1`
        FOREIGN KEY (`msisdn`)
        REFERENCES `subscriber` (`msisdn`)
        ON DELETE NO ACTION
        ON UPDATE NO ACTION
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
Query which I have tried:
-- Subscribers holding the highest score of each date in the window.
-- Only rows equal to the daily MAX(score) survive, i.e. the top scorer(s).
SELECT
    s.msisdn       AS msisdn,
    s.name         AS name,
    s.gender       AS gender,
    winners.score  AS score,
    winners.date   AS winning_date
FROM subscriber AS s
INNER JOIN (
    SELECT
        ss.msisdn,
        daily_max.date,
        daily_max.score
    FROM subscriber_score AS ss
    INNER JOIN (
        -- best score per date
        SELECT date, MAX(score) AS score
        FROM subscriber_score
        WHERE date > '2014-10-10'
          AND date < '2014-11-10'
        GROUP BY date
    ) AS daily_max
        ON ss.date = daily_max.date
       AND ss.score = daily_max.score
) AS winners
    ON s.msisdn = winners.msisdn
ORDER BY winning_date
Actual output: Only one top scorer for every date is shown.
Wanted Output Top 5(or say 10) records for every date are required.
I think you can do this using variables to assign each row a row number, then filter the top 5 for each date.
-- Top 5 scores per date inside the window, with subscriber details.
-- The inner query walks the rows ordered by date and score (highest first)
-- and numbers them 1..n per date with user variables; the outer query keeps
-- numbers 1-5.
-- NOTE(review): MySQL does not guarantee the evaluation order of user-variable
-- assignments relative to ORDER BY. On MySQL 8.0+ prefer ROW_NUMBER().
SELECT
    s.name   AS name,
    s.gender AS gender,
    s.msisdn,
    ranked.winning_date,
    ranked.score
FROM (
    SELECT
        ss.msisdn,
        ss.score,
        -- restart at 1 whenever the date differs from the previous row's date
        @r := CASE WHEN ss.date = @d THEN @r + 1 ELSE 1 END AS RowNum,
        -- remember this row's date for the next comparison
        @d := ss.date AS winning_date
    FROM subscriber_score AS ss
    CROSS JOIN (SELECT @d := '', @r := 0) AS v
    WHERE ss.date > '2014-10-10'
      AND ss.date < '2014-11-10'
    ORDER BY ss.date, ss.score DESC
) AS ranked
INNER JOIN subscriber AS s
    ON s.msisdn = ranked.msisdn
WHERE ranked.RowNum <= 5
ORDER BY ranked.winning_date, ranked.score DESC;
Example on SQL Fiddle
Refer to this query; it is not complete, but I hope it helps:
-- Five highest scores for a single day.
-- 'somedate' is a placeholder: substitute a real date literal.
SELECT score
FROM subscriber_score
WHERE date = 'somedate'
ORDER BY score DESC
LIMIT 5
-- Top 5 score ranks per date using DENSE_RANK.
-- Rows with equal scores share a rank, so a date can return more than
-- five rows.
SELECT
    bc.msisdn AS msisdn,
    bc.name   AS name,
    bc.gender AS gender,
    ab.score  AS score,
    ab.date   AS winning_date
FROM (
    SELECT
        msisdn,
        date,
        score,
        DENSE_RANK() OVER (PARTITION BY date ORDER BY score DESC) AS rnk
    FROM subscriber_score
) ab
INNER JOIN subscriber bc
    ON bc.msisdn = ab.msisdn
WHERE ab.rnk <= 5
ORDER BY winning_date;
This is how you can solve the problem in Oracle SQL.
Try the query below:
-- Top 5 scorers per date inside the window using ROW_NUMBER.
-- No GROUP BY in the derived table: every score row must reach the window
-- function so it can be numbered within its date.
SELECT
    subscriber.msisdn AS msisdn,
    subscriber.name   AS name,
    subscriber.gender AS gender,
    tmp.score         AS score,
    tmp.date          AS winning_date
FROM subscriber
INNER JOIN (
    SELECT
        msisdn,
        date,
        score,
        ROW_NUMBER() OVER (PARTITION BY date ORDER BY score DESC) AS rn
    FROM subscriber_score
    WHERE date > '2014-10-10'
      AND date < '2014-11-10'
) tmp
    ON subscriber.msisdn = tmp.msisdn
   AND tmp.rn <= 5

Select statement joining 2 tables, searching by date, and status

OK I think I have messed up somewhere but maybe someone can spot my error, because I have little clue of what I am doing.
I have 2 Tables Players and RegionPlayer (see bottom for structure)
I am trying to find regions where none of the players have been seen in a while. A player who is on vacation is allowed 58 days of absence; otherwise the limit is only 8 days.
If none of the players on a region have been seen in that time, I want the sql search to return the regionID, as well as the most recent person on that region who was seen.
Now I think that way to do this is to get 2 results from each region, each providing me the most recent player seen who was on vacation, and who was not on vacation.
But while, I thought this would give me that, it doesn't seem to.
-- One output row per (Regionid, Vacation) among region links with Status = 1.
-- key, Name and Seen are neither grouped nor aggregated, so which player of
-- each group is shown is decided by the server.
SELECT
    rp.Regionid,
    p.key,
    p.Name,
    p.Seen,
    p.Vacation
FROM RegionPlayer AS rp
INNER JOIN Players AS p
    ON p.Key = rp.Playerid
WHERE rp.Status = 1
GROUP BY rp.Regionid DESC, p.Vacation DESC
ORDER BY p.Seen DESC
Then I am going to need to be able to tell who has not been seen in a while, this should give me that.
Now I know I can link both queries together, but I have no idea how, it has been many years since I last had to put this much effort into sql statements.
-- Keys of players not seen within their allowance:
-- Vacation != 1 -> last seen 8 or more days ago,
-- Vacation != 0 -> last seen 58 or more days ago.
SELECT p.key
FROM Players AS p
WHERE (p.Vacation != 1 AND p.Seen <= NOW() - INTERVAL 8 DAY)
   OR (p.Vacation != 0 AND p.Seen <= NOW() - INTERVAL 58 DAY)
Is There a better way of doing this, I sort of remember things like views, and store procedures, and functions, would one or more of them be better?
Table Structure.
Please forgive, the names, of the tables and some of the structure, This is an example of why deciding things late at night after 1/2 a bottle of wine is a bad idea.
-- Players: Seen holds the last-seen timestamp, Vacation is a one-character flag
-- (the queries in this thread compare it with 0 and 1).
-- NOTE(review): the zero-date defaults below may be rejected when the server
-- runs with NO_ZERO_DATE in strict mode - confirm the target sql_mode.
CREATE TABLE IF NOT EXISTS `Players` (
    `key`      int(11)      NOT NULL,
    `Name`     varchar(255) NOT NULL,
    `Vacation` varchar(1)   NOT NULL,
    `Seen`     timestamp    NOT NULL DEFAULT '0000-00-00 00:00:00',
    `Modified` timestamp    NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
);

-- RegionPlayer: links a player (Playerid) to a region (Regionid).
-- Status defaults to '1'; the queries in this thread filter on Status = 1.
CREATE TABLE IF NOT EXISTS `RegionPlayer` (
    `Key`      int(11)    NOT NULL,
    `Playerid` int(11)    NOT NULL,
    `Regionid` int(11)    NOT NULL,
    `Type`     varchar(1) NOT NULL,
    `Status`   int(1)     NOT NULL DEFAULT '1',
    `Modified` timestamp  NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    `Created`  timestamp  NOT NULL DEFAULT '0000-00-00 00:00:00'
);
I've put up an SQLFiddle.
The query that answers your basic requirement, which seems to be: list all regions that have no active player seen in the last 8 days and no vacated player seen in the last 58 days, giving also the data of the last seen player in that region:
-- Regions in which no Status = 1 player is still inside the allowance
-- (8 days when Vacation != 1, 58 days when Vacation != 0), together with the
-- most recently seen Status = 1 player of each such region.
-- The last-seen player is selected explicitly through MAX(Seen) rather than
-- through GROUP BY with non-aggregated columns, which returns an arbitrary
-- player per region.
-- Players tied on the newest Seen value of a region are all returned.
SELECT
    rp.Regionid,
    p.`Key`,
    p.Name,
    p.Vacation,
    p.Seen
FROM RegionPlayer AS rp
INNER JOIN Players AS p
    ON p.`key` = rp.Playerid
WHERE rp.Status = 1
  -- p is the most recently seen Status = 1 player of its region
  AND p.Seen = (
        SELECT MAX(p2.Seen)
        FROM RegionPlayer AS rp2
        INNER JOIN Players AS p2
            ON p2.`key` = rp2.Playerid
        WHERE rp2.Regionid = rp.Regionid
          AND rp2.Status = 1
      )
  -- and no Status = 1 player of the region is still inside the allowance
  AND NOT EXISTS (
        SELECT 1
        FROM RegionPlayer AS rp3
        INNER JOIN Players AS p3
            ON p3.`key` = rp3.Playerid
        WHERE rp3.Regionid = rp.Regionid
          AND rp3.Status = 1
          AND NOT (
                (p3.Vacation != 1 AND p3.Seen <= NOW() - INTERVAL 8 DAY)
                OR (p3.Vacation != 0 AND p3.Seen <= NOW() - INTERVAL 58 DAY)
              )
      );
I inferred from your SQL that only RegionPlayer rows with a Status of 1 should be considered.
On the SQLFiddle I've created a few regions with different combinations, and this query does its job.
As to your first SQL statement. You say it doesn't work as expected, but to me it seems to do it... the last seen active player and last seen vacated player for each region. The sorting may not make it very readable, but it does do that.
Try this
-- Stale players (outside their 8-day / 58-day allowance) joined to their
-- Status = 1 region links, one output row per (Regionid, Vacation).
-- NOTE(review): key, Name and Seen are neither grouped nor aggregated, so this
-- only runs with ONLY_FULL_GROUP_BY disabled and the player shown per group
-- is not guaranteed to be the most recently seen one.
SELECT
    RegionPlayer.Regionid,
    m.`key`,
    m.Name,
    m.Seen,
    m.Vacation
FROM RegionPlayer
INNER JOIN (
    SELECT
        Players.`key`,
        Players.Name,
        Players.Seen,
        Players.Vacation
    FROM Players
    WHERE (Players.Vacation != 1 AND Players.Seen <= NOW() - INTERVAL 8 DAY)
       OR (Players.Vacation != 0 AND Players.Seen <= NOW() - INTERVAL 58 DAY)
) m
    ON m.`key` = RegionPlayer.Playerid
WHERE RegionPlayer.Status = 1
GROUP BY RegionPlayer.Regionid, m.Vacation
ORDER BY m.Seen DESC

MySQL: basic SELECT with a != condition

-- Still-valid classifieds of user 1 whose price is -2.
SELECT
    id,
    company_id,
    user_id,
    valid_until,
    price
FROM `classifieds`
WHERE `price` = -2
  AND `user_id` = 1
  AND `valid_until` > NOW()
this query gives me one row, and it works ok...
as soon as i add a statement
-- Same filter plus company_id <> 23.
-- A comparison with NULL is never true, so rows whose company_id is NULL are
-- filtered out here as well.
SELECT
    id,
    company_id,
    user_id,
    valid_until,
    price
FROM `classifieds`
WHERE `price` = -2
  AND `user_id` = 1
  AND `valid_until` > NOW()
  AND `company_id` <> 23
it gives me nothing!!!
the row I am getting in first query has company_id NULL
id company_id user_id valid_until price
35136 NULL 1 2012-02-12 08:06:37 -2
Please advise
EDIT:
I don't want to lose the rows that have company_id NULL. I want ALL rows (NULL and not NULL) except those whose company_id is 23.
-- Excludes company 23 only: rows with a NULL company_id are kept through the
-- explicit IS NULL branch.
SELECT
    id,
    company_id,
    user_id,
    valid_until,
    price
FROM `classifieds`
WHERE `price` = -2
  AND `user_id` = 1
  AND `valid_until` > NOW()
  AND (`company_id` <> 23 OR `company_id` IS NULL)
this seems to do the trick

Sum amount of overlapping datetime ranges in MySQL

I have a question that is almost the same as Sum amount of overlapping datetime ranges in MySQL, so I'm reusing part of his text, hope that is ok...
I have a table of events, each with a StartTime and EndTime (as type DateTime) in a MySQL Table.
I'm trying to output the sum of overlapping times for each type of event and the number of events that overlapped.
What is the most efficient / simple way to perform this query in MySQL?
-- Events with a type and a start/end time; EndTime is nullable.
CREATE TABLE IF NOT EXISTS `events` (
    `EventID`   int(10) unsigned NOT NULL auto_increment,
    `EventType` int(10) unsigned NOT NULL,
    `StartTime` datetime         NOT NULL,
    `EndTime`   datetime         default NULL,
    PRIMARY KEY (`EventID`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 AUTO_INCREMENT=37 ;
-- Sample data: three overlapping events of type 1, two of type 3.
INSERT INTO `events`
    (`EventID`, `EventType`, `StartTime`, `EndTime`)
VALUES
    (10001, 1, '2009-02-09 03:00:00', '2009-02-09 10:00:00'),
    (10002, 1, '2009-02-09 05:00:00', '2009-02-09 09:00:00'),
    (10003, 1, '2009-02-09 07:00:00', '2009-02-09 09:00:00'),
    (10004, 3, '2009-02-09 11:00:00', '2009-02-09 13:00:00'),
    (10005, 3, '2009-02-09 12:00:00', '2009-02-09 14:00:00');
# if the query was run using the data above,
# the table below would be the desired output
# Number of Overlapped Events , The event type, | Total Amount of Time those events overlapped.
1,1, 03:00:00
2,1, 02:00:00
3,1, 02:00:00
1,3, 01:00:00
There is a really beautiful solution given there by Mark Byers and I'm wondering if that one can be extended to include "Event Type".
His solution without event type was:
-- Total time during which exactly N events were running, for every N.
-- Times1 and Times2 are the same list of distinct start/end instants, numbered
-- newest-first. Joining rownum = rownum + 1 pairs each instant (Times1) with
-- the next later one (Times2), giving consecutive time slices. Each slice is
-- joined to the events that cover it and the covering events are counted.
-- NOTE(review): the numbering relies on user variables being assigned in
-- ORDER BY order, which MySQL does not guarantee.
SELECT
    `COUNT`,
    SEC_TO_TIME(SUM(Duration))
FROM (
    SELECT
        COUNT(*) AS `Count`,
        UNIX_TIMESTAMP(Times2.Time) - UNIX_TIMESTAMP(Times1.Time) AS Duration
    FROM (
        SELECT @rownum1 := @rownum1 + 1 AS rownum, `Time`
        FROM (
            SELECT DISTINCT(StartTime) AS `Time` FROM events
            UNION
            SELECT DISTINCT(EndTime) AS `Time` FROM events
        ) AS AllTimes
        CROSS JOIN (SELECT @rownum1 := 0) AS Rownum
        ORDER BY `Time` DESC
    ) AS Times1
    INNER JOIN (
        SELECT @rownum2 := @rownum2 + 1 AS rownum, `Time`
        FROM (
            SELECT DISTINCT(StartTime) AS `Time` FROM events
            UNION
            SELECT DISTINCT(EndTime) AS `Time` FROM events
        ) AS AllTimes
        CROSS JOIN (SELECT @rownum2 := 0) AS Rownum
        ORDER BY `Time` DESC
    ) AS Times2
        ON Times1.rownum = Times2.rownum + 1
    INNER JOIN events
        ON Times1.Time >= events.StartTime
       AND Times2.Time <= events.EndTime
    -- rownum already identifies the slice; the two times are listed so the
    -- Duration expression is valid under ONLY_FULL_GROUP_BY
    GROUP BY Times1.rownum, Times1.Time, Times2.Time
) Totals
GROUP BY `Count`
-- For every event: how many other events of the same EventType overlap it
-- (occurrence) and the summed length of those pairwise overlaps (duration).
-- Events without any overlapping partner do not appear in the result.
SELECT
    COUNT(*) AS occurrence,
    sub.event_id,
    SEC_TO_TIME(
        SUM(LEAST(sub.e1end, sub.e2end) - GREATEST(sub.e1start, sub.e2start))
    ) AS duration
FROM (
    SELECT
        e1.EventID                   AS event_id,
        UNIX_TIMESTAMP(e1.StartTime) AS e1start,
        UNIX_TIMESTAMP(e1.EndTime)   AS e1end,
        UNIX_TIMESTAMP(e2.StartTime) AS e2start,
        UNIX_TIMESTAMP(e2.EndTime)   AS e2end
    FROM events AS e1
    INNER JOIN events AS e2
        ON e1.EventType = e2.EventType
       -- never pair an event with itself
       AND e1.EventID <> e2.EventID
       -- the two intervals intersect
       AND NOT (e1.StartTime > e2.EndTime OR e1.EndTime < e2.StartTime)
) AS sub
GROUP BY sub.event_id
ORDER BY occurrence DESC