Group database rows by one of two columns - mysql

The goal
I am trying to write a query to find duplicate rows. A row is duplicate when either Column A or Column B is the same.
Writing it so that both need to be the same is easy; just a simple GROUP BY A, B.
However, filtering by just one of the two is proving to be a bit more difficult. How would one go about doing this?
I've tried the following:
-- Asker's attempt: for every distinct (a, b) pair, count the rows that share
-- either value with it and keep only the pairs whose count exceeds 1.
select distinct a as col_a,
b as col_b,
(
-- scalar subquery correlated on the outer aliases col_a / col_b;
-- the count includes the outer row itself, hence the "> 1" filter below
select count(*)
from table_name
where a = col_a
or b = col_b
) as duplicate_count
from table_name
having duplicate_count > 1;
but it does not feel like the right way to go about this and with 84.000 rows it is also very slow.
Example
With the following table:
+----+------------------------+---+---------+
| id | name | a | b |
+----+------------------------+---+---------+
| 1 | Lorem ipsum | 1 | Donec |
+----+------------------------+---+---------+
| 2 | dolor sit | 2 | rhoncus |
+----+------------------------+---+---------+
| 3 | amet | 3 | rhoncus |
+----+------------------------+---+---------+
| 4 | consectetur adipiscing | 1 | primis |
+----+------------------------+---+---------+
| 5 | vulputate cursus | 4 | Aliquam |
+----+------------------------+---+---------+
Either result 1 or 4 (same A) and either result 2 or 3 (same B) should be returned, both with a duplicate_count of 2.
Which one of the two "duplicates" is returned does not matter.
Versions
On my local machine I use MySQL 5.7.24.
I just checked the live server, it uses 10.1.43-MariaDB.

You already know that this query:
-- (a, b) pairs that occur on more than one row.
SELECT
    t.a,
    t.b
FROM tablename AS t
GROUP BY
    t.a,
    t.b
HAVING COUNT(*) > 1
returns duplicates with both a and b equal.
You can get the rest of the duplicates for your requirement with EXISTS:
-- Rows that share exactly one of the two columns with some other row:
-- same a with a different b, or same b with a different a.
SELECT
    cur.a,
    cur.b
FROM tablename AS cur
WHERE EXISTS (
    SELECT 1
    FROM tablename AS other
    WHERE (other.a = cur.a AND other.b <> cur.b)
       OR (other.a <> cur.a AND other.b = cur.b)
)
Or if you want them all use UNION ALL:
-- First branch: (a, b) pairs repeated in full.
-- Second branch: rows sharing only one of the two columns with another row.
-- UNION ALL keeps the output of both branches as is.
SELECT
    grp.a,
    grp.b
FROM tablename AS grp
GROUP BY
    grp.a,
    grp.b
HAVING COUNT(*) > 1
UNION ALL
SELECT
    cur.a,
    cur.b
FROM tablename AS cur
WHERE EXISTS (
    SELECT 1
    FROM tablename AS other
    WHERE (other.a = cur.a AND other.b <> cur.b)
       OR (other.a <> cur.a AND other.b = cur.b)
)
Update:
If you have an ID column then use EXISTS like this:
-- Every row for which a different row (other id) has the same a or the same b.
SELECT cur.*
FROM tablename AS cur
WHERE EXISTS (
    SELECT 1
    FROM tablename AS other
    WHERE other.id <> cur.id
      AND (other.a = cur.a OR other.b = cur.b)
)
Or if you want just 1 of the duplicates use id > t.id instead of id <> t.id.
See the demo.
Or with a self join:
-- Self-join variant: pair each row with every other row (different id)
-- that has the same a or the same b.
-- DISTINCT is required: a row with several matching partners would otherwise
-- be returned once per partner instead of once.
select distinct t.*
from tablename t inner join tablename tt
on (tt.a = t.a or tt.b = t.b) and tt.id <> t.id

Following solution works :
Another demo with a line that has duplication in a and b
-- Temporary table holding, for each (a, b) pair that occurs more than once,
-- its a value and the number of rows in the pair.
CREATE TEMPORARY TABLE ab_duplicates (
    a INTEGER
) AS
SELECT
    t.a,
    COUNT(*) AS cnt
FROM tablename AS t
GROUP BY
    t.a,
    t.b
HAVING cnt > 1;

-- Index used by the join on ab_duplicates.a in the follow-up query.
ALTER TABLE ab_duplicates ADD INDEX (a);
-- User variables are written @name; a leading # would start a comment in MySQL.
-- Both branches rely on the assignments being evaluated in select-list order
-- over the sorted rows, which MySQL does not guarantee.

-- Select duplicates for a, but not for a and b
-- @rn restarts at 1 whenever a changes, so rn = 2 marks the second row of each
-- run of equal a values; ab_exists is NULL when a has no entry in ab_duplicates.
SELECT id, name, a, b
FROM (SELECT x.*, t.id, t.name, t.a, t.b,
             @rn := IF(t.a = @a, @rn + 1, 1) rn,
             @a := t.a,
             ab.a as ab_exists
      FROM (select @a := null, @rn := 0) x,
           tablename t
           LEFT JOIN ab_duplicates ab on ab.a = t.a
      ORDER BY a
     ) a_duplicates
where rn = 2 and ab_exists is null
UNION
-- union duplicates for b, including duplicates for a and b
-- same numbering technique, this time restarting whenever b changes
SELECT id, name, a, b
FROM (SELECT x.*, t.id, t.name, t.a, t.b,
             @rn := IF(t.b = @b, @rn + 1, 1) rn,
             @b := t.b
      FROM (select @b := null, @rn := 0) x,
           tablename t
      ORDER BY b
     ) b_and_ab_duplicates
where rn = 2;
Previous solutions that only worked in some edge cases
Using group by and count() :
First finding ids with duplicates for a :
-- Lowest id and row count of every group of rows sharing the same a;
-- groups made of a single row are discarded.
-- this will work better if you have an index starting with a
SELECT
    MIN(t.id) AS id,
    COUNT(*) AS cnt
FROM tablename AS t
GROUP BY t.a
HAVING cnt > 1
Same with b :
-- Lowest id and row count of every group of rows sharing the same b;
-- groups made of a single row are discarded.
-- this will work better if you have an index starting with b
SELECT
    MIN(t.id) AS id,
    COUNT(*) AS cnt
FROM tablename AS t
GROUP BY t.b
HAVING cnt > 1
First solution :
UNION gives you the ids where there are duplicates for a or b (requires 2 indices):
-- (lowest id, group size) for groups duplicated on a, then for groups
-- duplicated on b; UNION drops (id, cnt) pairs produced by both branches.
SELECT
    MIN(by_a.id) AS id,
    COUNT(*) AS cnt
FROM tablename AS by_a
GROUP BY by_a.a
HAVING cnt > 1
UNION
SELECT
    MIN(by_b.id) AS id,
    COUNT(*) AS cnt
FROM tablename AS by_b
GROUP BY by_b.b
HAVING cnt > 1
Use the ids to filter the table, if you need more data from the table :
-- Full rows whose id is the lowest id of a group duplicated on a or on b.
-- The branches return only the id: with an extra count column, UNION would
-- de-duplicate (id, cnt) pairs, so an id leading an a-group and a b-group of
-- different sizes would come out twice and its row would be joined twice.
SELECT tablename.*
FROM (
    SELECT min(id) id from tablename t group by a having count(*) > 1
    UNION
    SELECT min(id) id from tablename t group by b having count(*) > 1
) as ids
JOIN tablename on tablename.id = ids.id
Now this might not use an index, but you can use a temporary table to have one :
First solution, using a temporary table (might be faster) :
-- using a temporary table to set an index
CREATE TEMPORARY TABLE ids (
    -- adds an index on id, for the JOIN in the result query
    `id` INTEGER PRIMARY KEY
) as
SELECT id
FROM (
    -- duplicates on a, requires an index (a) on tablename
    SELECT min(id) id from tablename t group by a having count(*) > 1
    -- removes duplicates between both part of the UNION : this might be slow
    -- if there cannot be duplicates on a and b at the same time, consider using UNION ALL
    -- The branches return only the id (no count column): UNION compares whole
    -- rows, so (id, cnt) pairs with different counts would both survive and the
    -- repeated id would violate the PRIMARY KEY above.
    UNION
    -- duplicates on b, requires an index (b) on tablename
    SELECT min(id) id from tablename t group by b having count(*) > 1
) tempids;
SELECT tablename.*
FROM ids -- using the temporary table, MUST be in the same database connection, will filter duplicates
JOIN tablename on tablename.id = ids.id;
I do not know whether setting the index when creating the temporary table is better than adding it after populating the data:
-- you might want to postpone the index after the ids are set
-- using a temporary table to set an index
CREATE TEMPORARY TABLE ids2 (
    `id` INTEGER
) as
SELECT id
FROM (
    -- duplicates on a, requires an index (a) on tablename
    SELECT min(id) id from tablename t group by a having count(*) > 1
    -- removes duplicates between both part of the UNION : this might be slow
    -- if there cannot be duplicates on a and b at the same time, consider using UNION ALL
    -- The branches return only the id (no count column): UNION compares whole
    -- rows, so (id, cnt) pairs with different counts would both survive and the
    -- same row would then be returned twice by the final JOIN.
    UNION
    -- duplicates on b, requires an index (b) on tablename
    SELECT min(id) id from tablename t group by b having count(*) > 1
) tempids;
-- index added once the table is populated
ALTER TABLE ids2 ADD INDEX (id);
SELECT tablename.*
FROM ids2 -- using the temporary table, MUST be in the same database connection, will filter duplicates
JOIN tablename on tablename.id = ids2.id;
With mariadb 10.2, or mysql 8 you could use window function (I guess).
Another solution : using vars :
-- User variables are written @name; a leading # would start a comment in MySQL.
-- Each branch numbers the rows inside runs of equal a (resp. b) values and keeps
-- the second row of every run, i.e. one row per duplicated value.
-- Relies on the assignments being evaluated in select-list order over the
-- sorted rows, which MySQL does not guarantee.
SELECT id, name, a, b, rn
FROM (SELECT *,
             @rn := IF(a = @a, @rn + 1, 1) rn,
             @a := a
      FROM (select @a := null, @rn := 0) x,
           tablename
      ORDER BY a
     ) a_duplicates
where rn = 2
UNION
SELECT id, name, a, b, rn
FROM (SELECT *,
             @rn := IF(b = @b, @rn + 1, 1) rn,
             @b := b
      FROM (select @b := null, @rn := 0) x,
           tablename
      ORDER BY b
     ) b_duplicates
where rn = 2
Demo : with some extra steps to understand
Edit: this only works if you don't have rows where both a and b are duplicated, which is the case in the example.

Related

get the range of sequence values in table column

I have a list of value in my column. And want to query the range.
Eg. If values are 1,2,3,4,5,9,11,12,13,14,17,18,19
I want to display
1-5,9,11-14,17-19
Assuming that each value is stored on a separate row, you can use some gaps-and-island technique here:
-- Gaps-and-islands: consecutive values share the same (val - row number)
-- difference, so grouping on it yields one row per run of consecutive values.
SELECT
    CASE
        WHEN MIN(numbered.val) <> MAX(numbered.val)
            THEN CONCAT(MIN(numbered.val), '-', MAX(numbered.val))
        ELSE MIN(numbered.val)
    END AS val_range
FROM (
    SELECT
        val,
        ROW_NUMBER() OVER (ORDER BY val) AS rn
    FROM mytable
) AS numbered
GROUP BY numbered.val - numbered.rn
ORDER BY MIN(numbered.val)
The idea is to build groups of consecutive values by taking the difference between the value and an incrementing rank, which is computed using row_number() (available in MySQL 8.0):
Demo on DB Fiddle:
| val_range |
| :-------- |
| 1-5 |
| 9 |
| 11-14 |
| 17-19 |
In earlier versions, you can emulate row_number() with a correlated subquery, or a user variable. The second option goes like:
-- Same gaps-and-islands grouping, with row_number() emulated by a user variable.
-- User variables are written @name; a leading # would start a comment in MySQL.
-- The counter is incremented once per row read from the pre-sorted derived
-- table; this relies on that row order being preserved, which is not guaranteed.
select case when min(val) <> max(val)
            then concat(min(val), '-', max(val))
            else min(val)
       end val_range
from (select @rn := 0) x
cross join (
    select val, @rn := @rn + 1 rn
    from (select val from mytable order by val) t
) t
group by val - rn
order by min(val)
As a complement to other answers:
-- Range starts are values with no predecessor (val - 1 absent); range ends are
-- values with no successor (val + 1 absent). Each start is paired with the
-- smallest end that is not below it.
SELECT
    dn.val AS dnval,
    MIN(up.val) AS upval
FROM mytable AS up
INNER JOIN mytable AS dn
    ON dn.val <= up.val
WHERE NOT EXISTS (
          SELECT 1 FROM mytable AS succ WHERE succ.val = up.val + 1
      )
  AND NOT EXISTS (
          SELECT 1 FROM mytable AS pred WHERE pred.val = dn.val - 1
      )
GROUP BY dn.val
ORDER BY dn.val;
1 5
9 9
11 14
17 19
Needless to say, using an OLAP function as @GNB does is orders of magnitude more efficient.
A short article on how to mimic OLAP functions in MySQL < 8 can be found at:
mysql-row_number
Fiddle
EDIT:
If another dimension is introduced (in this case p), something like:
-- Same start/end pairing as the single-dimension query, applied separately
-- within each value of p.
SELECT
    dn.p,
    dn.val AS dnval,
    MIN(up.val) AS upval
FROM mytable AS up
INNER JOIN mytable AS dn
    ON dn.val <= up.val
   AND dn.p = up.p
WHERE NOT EXISTS (
          SELECT 1
          FROM mytable AS succ
          WHERE succ.val = up.val + 1
            AND succ.p = up.p
      )
  AND NOT EXISTS (
          SELECT 1
          FROM mytable AS pred
          WHERE pred.val = dn.val - 1
            AND pred.p = dn.p
      )
GROUP BY
    dn.p,
    dn.val
ORDER BY
    dn.p,
    dn.val;
can be used, see Fiddle2

MySQL behaviour when using ANY_VALUE multiple times

I want to get a random row for each group when using GROUP BY in MySQL 5.7. The most clean way to do it from my research is doing something like this:
-- Schematic query from the question (the "..." stands for the remaining columns):
-- one ANY_VALUE() call per selected column, grouped by a single column.
SELECT ANY_VALUE(column_1), ANY_VALUE(column_2), ..., ANY_VALUE(column_n)
FROM table
GROUP BY column
Since there is no syntax for something like ANY_VALUE(*) or ANY_VALUE(column_1, column2, ..., column_n) I am left confused if with the above query each value can come from a different row, or if all ANY_VALUE fields will come from the same row.
If you want a random row, use row_number():
-- Number the rows of each group in random order and keep the first one.
SELECT ranked.*
FROM (
    SELECT
        src.*,
        ROW_NUMBER() OVER (PARTITION BY column ORDER BY RAND()) AS seqnum
    FROM t AS src
) AS ranked
WHERE ranked.seqnum = 1;
I am guessing that this is also faster than group by, but you can check if that is the case.
In MySQL 5.7, you can use variables:
-- MySQL 5.7 variant: rows are pre-sorted by group and then randomly; @rn counts
-- rows within the current group (@c remembers the group), so rn = 1 is one
-- random row per group.
-- User variables are written @name (# would start a comment), and rand() needs
-- its parentheses to be a function call rather than a column reference.
select t.*
from (select t.*,
             (@rn := if(@c = column, @rn + 1,
                        if(@c := column, 1, 1)  -- new group: remember it, restart at 1
                       )
             ) as rn
      from (select t.* from t order by column, rand()) t cross join
           (select @c := '', @rn := 0) params
     ) t
where rn = 1;
Assuming the following schema and sample data:
-- Sample schema: auto-numbered rows carrying a group id and a value.
CREATE TABLE tbl (
    id  INT AUTO_INCREMENT,
    grp INT NOT NULL,
    val INT NOT NULL,
    PRIMARY KEY (id),
    INDEX (grp)
);

-- Sample data: three rows in group 1, two rows in group 2.
INSERT INTO tbl (grp, val)
VALUES
    (1, 1),
    (1, 2),
    (1, 3),
    (2, 1),
    (2, 2);
Get distinct groups in a derived table (or use the base table for groups, if you have). Get a random primary key in a subquery in SELECT clause with ORDER BY rand() LIMIT 1. Then join the result as a derived table with the base table.
-- For every distinct grp, pick one random id with a correlated subquery,
-- then join back to tbl to fetch the full row.
SELECT t.*
FROM (
    SELECT (
        SELECT pick.id
        FROM tbl AS pick
        WHERE pick.grp = grps.grp
        ORDER BY RAND()
        LIMIT 1
    ) AS id
    FROM (
        SELECT DISTINCT grp
        FROM tbl
    ) AS grps
) AS chosen
INNER JOIN tbl AS t USING (id);
Result would be something like
| id | grp | val |
| --- | --- | --- |
| 2 | 1 | 2 |
| 4 | 2 | 1 |
View on DB Fiddle

SELECT Current and Previous row WHERE condition

id value
---------
1 a
2 b
3 c
4 a
5 t
6 y
7 a
I want to select all rows where the value is 'a' and the row before it
id value
---------
1 a
3 c
4 a
6 y
7 a
I looked into
but I want to get all such rows in one query.
Please help me start
Thank you
I think the easiest way might be to use variables:
-- Walk the rows from the highest id down: an 'a' row resets the counter to 1,
-- every following row increments it, so rn = 2 is the row just before an 'a'.
-- Fixes over the original text:
--   * user variables are written @rn (and the assignment target was missing it);
--   * the assignment expression is closed with its own parenthesis;
--   * the counter is seeded with 2 instead of 0, otherwise the rows that come
--     after the last 'a' would be numbered 1 and 2 and be selected by mistake.
-- `table` is the question's placeholder for the real table name.
-- Relies on the assignment being evaluated in the ORDER BY order, which MySQL
-- does not guarantee.
select t.*
from (select t.*,
             (@rn := if(value = 'a', 1, @rn + 1)) as rn
      from table t cross join
           (select @rn := 2) params
      order by id desc
     ) t
where rn in (1, 2)
order by id;
An alternative method uses a correlated subquery to get the previous value and then uses this in the where clause:
-- A row is wanted when it is an 'a' row or when the row right after it (next
-- higher id) is an 'a' row. The correlated subquery therefore fetches the value
-- of the NEXT row; looking at the previous row's value would return the row
-- after each 'a' instead of the row before it.
-- `table` is the question's placeholder for the real table name.
select t.*
from (select t.*,
             (select t2.value
              from table t2
              where t2.id > t.id
              order by t2.id asc
              limit 1
             ) as next_value
      from table t
     ) t
where value = 'a' or next_value = 'a';
With an index on id, this might even be faster than the method using variables.

my sql query to find sort two columns independently

there is a table having two columns say id and name , i want both columns to be sorted.
table :
id name
3 y
2 z
1 x
output should be
id name
1 x
2 y
3 z
can anybody do it in single sql query ???
You need to do weird stuff, because what you want to do is weird.
-- Number the ids in ascending order and, independently, the names in ascending
-- order, then pair the n-th id with the n-th name.
-- User variables are written @name (# would start a comment in MySQL); the
-- position alias is rn because ROW is a reserved word in MySQL 8.
-- Relies on the counters being incremented in ORDER BY order, which MySQL does
-- not guarantee.
select b1.id, b2.name from
(
    select @row := @row + 1 as rn, id
    from broken, (select @row := 0) rr
    order by id asc
) b1
inner join
(
    select @row2 := @row2 + 1 as rn, name
    from broken, (select @row2 := 0) rr
    order by name asc
) b2
on b1.rn = b2.rn
demo fiddle: http://sqlfiddle.com/#!9/4d47c/7
-- SQL Server variant (#temp is a local temporary table; `table` stands for the
-- real table name).
-- Step 1: store every row with its position by ID and its position by name.
Select *
, row_number() over (order by ID) as IDRow
, row_number() over (order by name) as NameRow
into #temp
from table;

-- Step 2: pair the n-th ID with the n-th name.
-- The ORDER BY columns are qualified: IDRow and NameRow exist on both sides of
-- the self-join, so unqualified references are ambiguous.
select a.ID, b.Name from #temp a
full outer join #temp b
on a.IDRow = b.NameRow
order by a.IDRow, b.NameRow
If you wanted, you could do this with subqueries instead of the temp table, but it'll probably be faster this way.

mysql table with duplicate records

I have a table
email(email varchar(30),id integer(10),duplicated varchar(10))
with records
sai@gmail.com 101 null
kiran@gmail.com 102 null
sai123@gmail.com 103 null
sai@gmail.com 101 null
kiran@gmail.com 102 null
Now my question is: I need to get "yes" in the duplicated column for the second occurrence of each of the two duplicated records, so the output table should be
sai@gmail.com 101 null
kiran@gmail.com 102 null
sai123@gmail.com 103 null
sai@gmail.com 101 yes
kiran@gmail.com 102 yes
Try this
-- Flag every row whose email occurs more than once; all other rows get NULL.
-- The duplicated emails are computed in a derived table and joined in, because
-- MySQL rejects a subquery that reads the table being updated (error 1093).
-- This also gives the updated table the alias `e` and closes the CASE with END,
-- both of which the original statement lacked.
update email e
left join (
    select email
    from email
    group by email
    having count(*) > 1
) dup on dup.email = e.email
set e.duplicated = case when dup.email is not null then 'yes' else null end;
edited: this will update table
You can try this query for viewing:
-- View-only version: number all rows (rnum) in id order, find for each
-- (email, id) the first rnum and the number of occurrences, and report "yes"
-- for every occurrence except the first one.
-- User variables are written @i; a leading # would start a comment in MySQL.
-- Both derived tables must number the rows identically for the comparison with
-- min_rnum to work; rows with equal id have no defined relative order.
select numerated.email, numerated.id, (case when cnt=1 OR numerated.rnum=grouped.min_rnum then null else "yes" end) as duplicated
from
(select @i := @i + 1 as rnum, email.* from email, (select @i:=0) as c order by id) as numerated
left join
(select email, id, min(rnum) as min_rnum, count(rnum) as cnt
 from (select @i := @i + 1 as rnum, email.* from email, (select @i:=0) as c order by id) as numerated
 group by email, id
) as grouped
on numerated.email=grouped.email and numerated.id=grouped.id
order by id;
Could you explain your situation in details? It looks like it needs another solution, not just SELECT query.
And try this one for updating:
-- Update version of the numbering query. User variables are written @i; a
-- leading # would start a comment in MySQL. Apart from that the statement is
-- unchanged: it depends on the exact order in which MySQL evaluates the
-- assignments and visits the rows, so it should not be relied upon.
update email u, (select @i:=0) urnum
set
-- leaves id unchanged when evaluated left to right; its purpose is to advance
-- the row counter @i once per updated row
id = id + (@i:=@i + 1) - @i,
duplicated = (
    select duplicated from (
        select
            numerated.email,
            numerated.id,
            (case when cnt=1 OR numerated.rnum=grouped.min_rnum then null else "yes" end) as duplicated,
            rnum
        from
        (select @i := @i + 1 as rnum, email.* from email, (select @i:=0) as c ) as numerated
        left join
        (select email, id, min(rnum) as min_rnum, count(rnum) as cnt
         from (select @i := @i + 1 as rnum, email.* from email, (select @i:=0) as c ) as numerated
         group by email, id
        ) as grouped
        on numerated.email=grouped.email and numerated.id=grouped.id
        order by rnum
    ) found_duplicates
    -- match the row being updated by its content and by its position (@i)
    where u.email=found_duplicates.email and u.id=found_duplicates.id and @i=found_duplicates.rnum
    limit 1
)
;
It looks like it works, but you shouldn't rely on it.
If it is possible, you should do any of this:
1. change table structure - add unique field
2. change table filling logic - check uniqueness before inserting new row and insert it with proper 'duplicates' field value;
3. repopulate via temporary table like this:
-- Build the corrected rows in a temporary table; the <...> placeholder stands
-- for the SELECT version of the query shown earlier in this answer.
CREATE TEMPORARY TABLE tmp_email AS <... 'SELECT' version of my query ...>;
-- Empty the original table, then copy the corrected rows back into it.
TRUNCATE TABLE email;
INSERT INTO email SELECT * FROM tmp_email;