mysql table with duplicate records - mysql

I have a table
email(email varchar(30),id integer(10),duplicated varchar(10))
with records
sai#gmail.com 101 null
kiran#gmail.com 102 null
sai123#gmail.com 103 null
sai#gmail.com 101 null
kiran#gmail.com 102 null
Now my question is i need to get "yes" in the duplicated column for the two duplicated records for the second time. so, the output table should be
sai#gmail.com 101 null
kiran#gmail.com 102 null
sai123#gmail.com 103 null
sai#gmail.com 101 yes
kiran#gmail.com 102 yes

Try this
update email set duplicated =
(case when (select count(*) from email x where x.email = e.email) > 1 then "yes" else null)
edited: this will update table

You can try this query for viewing:
select numerated.email, numerated.id, (case when cnt=1 OR numerated.rnum=grouped.min_rnum then null else "yes" end) as duplicated
from
(select #i := #i + 1 as rnum, email.* from email, (select #i:=0) as c order by id) as numerated
left join
(select email, id, min(rnum) as min_rnum, count(rnum) as cnt
from (select #i := #i + 1 as rnum, email.* from email, (select #i:=0) as c order by id) as numerated
group by email, id
) as grouped
on numerated.email=grouped.email and numerated.id=grouped.id
order by id;
Could you explain your situation in details? It looks like it needs another solution, not just SELECT query.
And try this one for updating:
update email u, (select #i:=0) urnum
set
id = id + (#i:=#i + 1) - #i,
duplicated = (
select duplicated from (
select
numerated.email,
numerated.id,
(case when cnt=1 OR numerated.rnum=grouped.min_rnum then null else "yes" end) as duplicated,
rnum
from
(select #i := #i + 1 as rnum, email.* from email, (select #i:=0) as c ) as numerated
left join
(select email, id, min(rnum) as min_rnum, count(rnum) as cnt
from (select #i := #i + 1 as rnum, email.* from email, (select #i:=0) as c ) as numerated
group by email, id
) as grouped
on numerated.email=grouped.email and numerated.id=grouped.id
order by rnum
) found_duplicates
where u.email=found_duplicates.email and u.id=found_duplicates.id and #i=found_duplicates.rnum
limit 1
)
;
It looks like it works, but you shouldn't rely on it.
If it is possible, you should do any of this:
1. change table structure - add unique field
2. change table filling logic - check uniqueness before inserting new row and insert it with proper 'duplicates' field value;
3. repopulate via temporary table like this:
CREATE TEMPORARY TABLE tmp_email AS <... 'SELECT' version of my query ...>;
TRUNCATE TABLE email;
INSERT INTO email SELECT * FROM tmp_email;

Related

Group database rows by one of two columns

The goal
I am trying to write a query to find duplicate rows. A row is duplicate when either Column A or Column B is the same.
Writing it so that both need to be the same is easy; just a simple GROUP BY A, B.
However, filtering by just one of the two is proving to be a bit more difficult. How would one go about doing this?
I've tried the following:
select distinct a as col_a,
b as col_b,
(
select count(*)
from table_name
where a = col_a
or b = col_b
) as duplicate_count
from table_name
having duplicate_count > 1;
but it does not feel like the right way to go about this and with 84.000 rows it is also very slow.
Example
With the following table:
+----+------------------------+---+---------+
| id | name | a | b |
+----+------------------------+---+---------+
| 1 | Lorem ipsum | 1 | Donec |
+----+------------------------+---+---------+
| 2 | dolor sit | 2 | rhoncus |
+----+------------------------+---+---------+
| 3 | amet | 3 | rhoncus |
+----+------------------------+---+---------+
| 4 | consectetur adipiscing | 1 | primis |
+----+------------------------+---+---------+
| 5 | vulputate cursus | 4 | Aliquam |
+----+------------------------+---+---------+
Either result 1 or 4 (same A) and either result 2 or 3 (same B) should be returned, both with a duplicate_count of 2.
Which one of the two "duplicates" is returned does not matter.
Versions
On my local machine I use MySQL 5.7.24.
I just checked the live server, it uses 10.1.43-MariaDB.
You already know that this query:
select a, b
from tablename
group by a, b
having count(*) > 1
returns duplicates with both a and b equal.
You can get the rest of the duplicates for your requirement with EXISTS:
select t.a, t.b
from tablename t
where exists (
select 1 from tablename
where (a = t.a and b <> t.b) or (a <> t.a and b = t.b)
)
Or if you want them all use UNION ALL:
select a, b
from tablename
group by a, b
having count(*) > 1
union all
select t.a, t.b
from tablename t
where exists (
select 1 from tablename
where (a = t.a and b <> t.b) or (a <> t.a and b = t.b)
)
Update:
If you have an ID column then use EXISTS like this:
select t.*
from tablename t
where exists (
select 1 from tablename
where id <> t.id and (a = t.a or b = t.b)
)
Or if you want just 1 of the duplicates use id > t.id instead of id <> t.id.
See the demo.
Or with a self join:
select t.*
from tablename t inner join tablename tt
on (tt.a = t.a or tt.b = t.b) and tt.id <> t.id
Following solution works :
Another demo with a line that has duplication in a and b
CREATE TEMPORARY TABLE ab_duplicates (
a INTEGER
) AS
SELECT a, count(*) as cnt
FROM tablename
group by a, b
Having cnt > 1;
ALTER TABLE ab_duplicates ADD INDEX (a);
-- Select duplicates for a, but not for a and b
SELECT id, name, a, b
FROM (SELECT x.*, t.id, t.name, t.a, t.b,
#rn := IF(t.a = #a, #rn + 1, 1) rn,
#a := t.a,
ab.a as ab_exists
FROM (select #a := null, #rn := 0) x,
tablename t
LEFT JOIN ab_duplicates ab on ab.a = t.a
ORDER BY a
) a_duplicates
where rn = 2 and ab_exists is null
UNION
-- union duplicates for b, including duplicates for a and b
SELECT id, name, a, b
FROM (SELECT x.*, t.id, t.name, t.a, t.b,
#rn := IF(t.b = #b, #rn + 1, 1) rn,
#b := t.b
FROM (select #b := null, #rn := 0) x,
tablename t
ORDER BY b
) b_and_ab_duplicates
where rn = 2;
Previous solutions that only worked in some edge cases
Using group by and count() :
First finding ids with duplicates for a :
SELECT min(id) id, count(*) cnt from tablename t group by a having cnt > 1
-- this will work better if you have an index starting with a
Same with b :
SELECT min(id) id, count(*) cnt from tablename t group by b having cnt > 1
-- this will work better if you have an index starting with b
First solution :
Union gives you ids where there are duplicates for a or b requires 2 indices)
SELECT min(id) id, count(*) cnt from tablename t group by a having cnt > 1
UNION
SELECT min(id) id, count(*) cnt from tablename t group by b having cnt > 1
Use the ids to filter the table, if you need more data from the table :
SELECT tablename.*
FROM (
SELECT min(id) id, count(*) cnt from tablename t group by a having cnt > 1
UNION
SELECT min(id) id, count(*) cnt from tablename t group by b having cnt > 1
) as ids
JOIN tablename on tablename.id = ids.id
Now this might not use an index, but you can use a temporary table to have one :
First solution, using a temporary table (might be faster) :
-- using a temporary table to set an index
CREATE TEMPORARY TABLE ids (
-- adds an index on id, for the JOIN in the result query
`id` INTEGER PRIMARY KEY
) as
SELECT id
FROM (
-- duplicates on a, requires an index (a) on tablename
SELECT min(id) id, count(*) cnt from tablename t group by a having cnt > 1
-- removes duplicates between both part of the UNION : this might be slow
-- if there cannot be duplicates on a and b at the same time, consider using UNION ALL
UNION
-- duplicates on b, requires an index (b) on tablename
SELECT min(id) id, count(*) cnt from tablename t group by b having cnt > 1
) tempids;
SELECT tablename.*
FROM ids -- using the temporary table, MUST be in the same database connection, will filter duplicates
JOIN tablename on tablename.id = ids.id;
I do not know if setting the index on the temporary table is better then setting one after populating the data :
-- you might want to postpone the index after the ids are set
-- using a temporary table to set an index
CREATE TEMPORARY TABLE ids2 (
`id` INTEGER
) as
SELECT id
FROM (
-- duplicates on a, requires an index (a) on tablename
SELECT min(id) id, count(*) cnt from tablename t group by a having cnt > 1
-- removes duplicates between both part of the UNION : this might be slow
-- if there cannot be duplicates on a and b at the same time, consider using UNION ALL
UNION
-- duplicates on b, requires an index (b) on tablename
SELECT min(id) id, count(*) cnt from tablename t group by b having cnt > 1
) tempids;
ALTER TABLE ids2 ADD INDEX (id);
SELECT tablename.*
FROM ids2 -- using the temporary table, MUST be in the same database connection, will filter duplicates
JOIN tablename on tablename.id = ids2.id;
With mariadb 10.2, or mysql 8 you could use window function (I guess).
Another solution : using vars :
SELECT id, name, a, b, rn
FROM (SELECT *,
#rn := IF(a = #a, #rn + 1, 1) rn,
#a := a
FROM (select #a := null, #rn := 0) x,
tablename
ORDER BY a
) a_duplicates
where rn = 2
UNION
SELECT id, name, a, b, rn
FROM (SELECT *,
#rn := IF(b = #b, #rn + 1, 1) rn,
#b := b
FROM (select #b := null, #rn := 0) x,
tablename
ORDER BY b
) b_duplicates
where rn = 2
Demo : with some extra steps to understand
Edit : this only works if you don t have lines where a and b are duplicates. Which is the case in the example.

Check if a user was "active" in multiple rows - MySQL

How would I go about creating group_ids in the following example based on the area(s) the users are active in?
group_id rep_id area datebegin dateend
1 1000 A 1/1/15 1/1/16
1 1000 B 1/1/15 1/1/16
2 1000 C 1/2/16 12/31/99
In the table you can see that rep 1000 was active in both A and B between 1/15 and 1/16. How would I go about coding the group_id field to group by datebegin & dateend?
Thanks for any help.
You can use variables in order to enumerate groups of records having identical rep_id, datebegin, dateend values:
SELECT rep_id, datebegin, dateend,
#rn := IF(#rep_id <> rep_id,
IF(#rep_id := rep_id, 1, 1),
#rn + 1) AS rn
FROM (
SELECT rep_id, datebegin, dateend
FROM mytable
GROUP BY rep_id, datebegin, dateend) AS t
CROSS JOIN (SELECT #rep_id := 0, #rn := 0) AS v
ORDER BY rep_id, datebegin
Output:
rep_id, datebegin, dateend, rn
-----------------------------------
1000, 2015-01-01, 2016-01-01, 1
1000, 2016-02-01, 2099-12-03, 2
You can use the above query as a derived table and join back to the original table. rn field is the group_id field you are looking for.
You can use variables to assign groups. As you said, only if the date_begin and date_end exactly match for 2 rows, they would be in the same group. Else a new group starts.
select rep_id,area,date_begin,date_end,
,case when #repid <> rep_id then #rn:=1 --reset the group to 1 when rep_id changes
when #repid=rep_id and #begin=date_begin and #end=date_end then #rn:=#rn --if rep_id,date_begin and date_end match use the same #rn previously assigned
else #rn:=#rn+1 --else increment #rn by 1
end as group_id
,#begin:=date_begin
,#end:=date_end
,#repid:=rep_id
from t
cross join (select #rn:=0,#begin:='',#end:='',#repid:=-1) r
order by rep_id,date_begin,date_end
The above query includes variables in the output. To only get the group_id use
select rep_id,area,date_begin,date_end,group_id
from (
select rep_id,area,date_begin,date_end
,case when #repid <> rep_id then #rn:=1
when #repid=rep_id and #begin=date_begin and #end=date_end then #rn:=#rn
else #rn:=#rn+1
end as group_id
,#begin:=date_begin
,#end:=date_end
,#repid:=rep_id
from t
cross join (select #rn:=0,#begin:='',#end:='',#repid:=-1) r
order by rep_id,date_begin,date_end
) x

Select first N messages each user receives

I have a table that stores messages sent to users, the layout is as follows
id (auto-incrementing) | message_id | user_id | datetime_sent
I'm trying to find the first N message_id's that each user has received, but am completely stuck. I can do it easily on a per-user basis (when defining the user ID in the query), but not for all users.
Things to note:
Many users can get the same message_id
Message ID's aren't sent sequentially (i.e. we can send message 400 before message 200)
This is a read only mySQL database
EDIT: On second thought I removed this bit but have added it back in since someone was kind enough to work on it
The end goal is to see what % of users opened one of the first N messages they received.
That table of opens looks like this:
user_id | message_id | datetime_opened
This is an untested answer to the original question (with 2 tables and condition on first 5):
SELECT DISTINCT user_id
FROM (
SELECT om.user_id,
om.message_id,
count(DISTINCT sm2.message_id) messages_before
FROM opened_messages om
INNER JOIN sent_messages sm
ON om.user_id = sm.user_id
AND om.message_id = sm.message_id
LEFT JOIN sent_messages sm2
ON om.user_id = sm2.user_id
AND sm2.datetime_sent < sm.datetime_sent
GROUP BY om.user_id,
om.message_id
HAVING messages_before < 5
) AS base
The subquery joins in sm2 to count the number of preceding messages that were sent to the same user, and then the having clause makes sure that there are fewer than 5 earlier messages sent. As for the same user there might be multiple messages (up to 5) with that condition, the outer query only lists the unique users that comply to the condition.
To get the first N (here 2) messages, try
SELECT
user_id
, message_id
FROM (
SELECT
user_id
, message_id
, id
, (CASE WHEN #user_id != user_id THEN #rank := 1 ELSE #rank := #rank + 1 END) AS rank,
(CASE WHEN #user_id != user_id THEN #user_id := user_id ELSE #user_id END) AS _
FROM (SELECT * FROM MessageSent ORDER BY user_id, id) T
JOIN (SELECT #cnt := 0) c
JOIN (SELECT #user_id := 0) u
) R
WHERE rank < 3
ORDER BY user_id, id
;
which uses a RANK substitute, derived from #Seaux response to Does mysql have the equivalent of Oracle's “analytic functions”?
To extend this to your original question, just add the appropriate calculation:
SELECT
COUNT(DISTINCT MO.user_id) * 100 /
(SELECT COUNT(DISTINCT user_id)
FROM (
SELECT
user_id
, message_id
, id
, (CASE WHEN #user_id != user_id THEN #rank := 1 ELSE #rank := #rank + 1 END) AS rank,
(CASE WHEN #user_id != user_id THEN #user_id := user_id ELSE #user_id END) AS _
FROM (SELECT * FROM MessageSent ORDER BY user_id, id) T
JOIN (SELECT #cnt := 0) c
JOIN (SELECT #user_id := 0) u
) R2
WHERE rank < 3
) AS percentage_who_read_one_of_the_first_messages
FROM MessageOpened MO
JOIN
(SELECT
user_id
, message_id
FROM (
SELECT
user_id
, message_id
, id
, (CASE WHEN #user_id != user_id THEN #rank := 1 ELSE #rank := #rank + 1 END) AS rank,
(CASE WHEN #user_id != user_id THEN #user_id := user_id ELSE #user_id END) AS _
FROM (SELECT * FROM MessageSent ORDER BY user_id, id) T
JOIN (SELECT #cnt := 0) c
JOIN (SELECT #user_id := 0) u
) R
WHERE rank < 3) MR
ON MO.user_id = MR.user_id
AND MO.message_id = MR.message_id
;
With no CTEs in MySQL, and being in a read-only database - I see no way around having the above query twice in the statement.
See it in action: SQL Fiddle.
Please comment if and as this requires adjustment / further detail.

Mysql query counter x/y

It's possible to create a query that return the x/y number of records?
Eg.
I have table like this
ID | id_user | id_event
23 | 3 | 1
24 | 3 | 1
25 | 3 | 1
26 | 4 | 2
27 | 4 | 2
I will return something that looks like this:
Event
id_user 3 -> **1/3**
id_user 3 -> **2/3**
id_user 3 -> **3/3**
id_user 4 -> **1/2**
id_user 4 -> **2/2**
Any suggestion is appreciated!
Try this
SET #id_event := 0;
SELECT CONCAT('id_user ', id_user ,'->','**', (#id_event := #id_event + 1) ,'/', id_user ,** ) from table
This is probably a duplicate to this question.
SELECT CONCAT('id_user ',id_user,' -> **',rank,'/',group_total,'**') FROM (
SELECT id,
group_total,
CASE id_user
WHEN #id_user THEN
CASE id_event
WHEN #id_event THEN #rowno := #rowno + 1
ELSE #rowno := 1
END
ELSE #rowno :=1
END AS rank,
#id_user := id_user AS id_user,
#id_event := id_event AS id_event
FROM event_table
JOIN (SELECT id_user, id_event, COUNT(*) group_total FROM event_table GROUP BY id_user, id_event) t USING (id_user, id_event)
JOIN (SELECT #rowno := 0, #id_user := 0, #id_event := 0) r
ORDER BY id_user, id_event
) c;
Assuming you want output like this:
id_user < id_user > ** serial number of event related to this user / total events related to this user **
You can accomplish such result by the following query:
SELECT
CONCAT('id_user ',UE.id_user,' -> **',IF(#userID = UE.id_user, #eventNumber := #eventNumber + 1, #eventNumber := 1),'/',t.totalEvents,'**') AS output,
#userID := UE.id_user
FROM (SELECT #userID := -1, #eventNumber := 1) var,user_events UE
INNER JOIN
(
SELECT
id_user,
COUNT(id_event) totalEvents
FROM user_events
GROUP BY id_user
) AS t
ON UE.id_user = t.id_user
ORDER BY UE.id_user;
SQL FIDDLE DEMO
More:
SQL FIDDLE DEMO 2
This particular fiddle returns only the desired output column whereas the first fiddle contains one extra column
I played a little bit and that would be my solution:
SELECT id, id_user, id_event, if(#n = a.id_event, #c:=#c+1, if(#n:=a.id_event, #c:=1, #c:=1)) as count, (SELECT count(*) from TABLE b WHERE a.id_user = b.id_user) as total, from TABLE a join (SELECT #n:= "", #c:=1) c
It just have two if conditions for counting a #c up if #n and id_user matches if not #n become id_user and #c is 1 again. The join is for initialize the var in the same query.
Thx to that question, i found the answer to a questions that i asked 4 days ago.

query to add incremental field based on GROUP BY

Have a table photos
photos.id
photos.user_id
photos.order
A) Is it possible via a single query to group all photos by user and then update the order 1,2,3..N ?
B) added twist, what if some of the photos already have an order value associated? Make sure that the new photos.order never gets repeated and fills in ant orders lower or higher than those existing (as best as possible)
My only thought is just to run a script on this and loop through it and re'order' everything?
photos.id int(10)
photos.created_at datetime
photos.order int(10)
photos.user_id int(10)
Right now data may look like this
user_id = 1
photo_id = 1
order = NULL
user_id = 2
photo_id = 2
order = NULL
user_id = 1
photo_id = 3
order = NULL
the desired result would be
user_id = 1
photo_id = 1
order = 1
user_id = 2
photo_id = 2
order = 1
user_id = 1
photo_id = 3
order = 2
A)
You can use a variable that increments with each row and resets with each user_ID to get the row count.
SELECT ID,
User_ID,
`Order`
FROM ( SELECT #r:= IF(#u = User_ID, #r + 1,1) AS `Order`,
ID,
User_ID,
#u:= User_ID
FROM Photos,
(SELECT #r:= 1) AS r,
(SELECT #u:= 0) AS u
ORDER BY User_ID, ID
) AS Photos
Example on SQL Fiddle
B)
My First solution was to just add Order to the sorting that adds the row number, therefore anything with an Order Gets sorted by its order first, but this only works if your ordering system has no gaps and starts at 1:
SELECT ID,
User_ID,
RowNumber AS `Order`
FROM ( SELECT #r:= IF(#u = User_ID, #r + 1,1) AS `RowNumber`,
ID,
User_ID,
#u:= User_ID
FROM Photos,
(SELECT #i:= 1) AS r,
(SELECT #u:= 0) AS u
ORDER BY User_ID, `Order`, ID
) AS Photos
ORDER BY `User_ID`, `Order`
Example using Order Field
ORDERING WITH GAPS
I have eventually found a way of maintaining the sort order even when there are gaps in the sequence.
SELECT ID, User_ID, `Order`
FROM Photos
WHERE `Order` IS NOT NULL
UNION ALL
SELECT Photos.ID,
Photos.user_ID,
Numbers.RowNum
FROM ( SELECT ID,
User_ID,
#r1:= IF(#u1 = User_ID,#r1 + 1,1) AS RowNum,
#u1:= User_ID
FROM Photos,
(SELECT #r1:= 0) AS r,
(SELECT #u1:= 0) AS u
WHERE `Order` IS NULL
ORDER BY User_ID, ID
) AS Photos
INNER JOIN
( SELECT User_ID,
RowNum,
#r2:= IF(#u2 = User_ID,#r2 + 1,1) AS RowNum2,
#u2:= User_ID
FROM ( SELECT DISTINCT p.User_ID, o.RowNum
FROM Photos AS p,
( SELECT #i:= #i + 1 AS RowNum
FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY,
( SELECT #i:= 0) AS i
) AS o
WHERE RowNum <= (SELECT COUNT(*) FROM Photos P1 WHERE p.User_ID = p1.User_ID)
AND NOT EXISTS
( SELECT 1
FROM Photos p2
WHERE p.User_ID = p2.User_ID
AND o.RowNum = p2.`Order`
)
AND p.`Order` IS NULL
ORDER BY User_ID, RowNum
) AS p,
(SELECT #r2:= 0) AS r,
(SELECT #u2:= 0) AS u
ORDER BY user_ID, RowNum
) AS numbers
ON Photos.User_ID = numbers.User_ID
AND photos.RowNum = numbers.RowNum2
ORDER BY User_ID, `Order`
However as you can see this is pretty complicated. This works by treating those with an order value separately to those without. The top query just ranks all photos with no order value in order of ID for each user. The bottom query uses a cross join to generates a sequential list from 1 to n for each user ID (up to the number of entries for each User_ID). So with a data set like this:
ID User_ID Order
1 1 NULL
2 2 NULL
3 1 NULL
4 1 1
5 1 3
6 2 2
7 2 3
It would generate
UserID RowNum
1 1
1 2
1 3
1 4
2 1
2 2
2 3
It then uses NOT EXISTS to elimiate all combinations already used by Photos with a non null order, and ranked in order of RowNum partitioned by User_ID giving
UserID RowNum Rownum2
1 2 1
1 4 2
2 1 1
The RowNum2 value can then be matched with the rownum value achieved in the from subquery, giving the correct order value. Long winded, but it works.
Example on SQL Fiddle
Worked for me. I needed to increment version grouping by 4 fields (host, folder, fileName, status) and sort by 1 (downloadedAtTicks).
This is is my SELECT
SET #status := NULL;
SET #version := NULL;
SELECT
id,
host,
folder,
fileName,
status,
downloadedAtTicks,
version,
IF(IF(status IS NULL, 0, status) = #status, #version := #version + 1, #version := 0) AS varVersion,
#status := IF(status IS NULL, 0, status) AS varStatus
FROM csvsource
ORDER BY host, folder, fileName, status, downloadedAtTicks;
And this is my UPDATE
SET #status := NULL;
SET #version := NULL;
UPDATE
csvsource csv,
(SELECT
id,
IF(IF(status IS NULL, 0, status) = #status, #version := #version + 1, #version := 0) AS varVersion,
#status := IF(status IS NULL, 0, status) AS varStatus
FROM csvsource
ORDER BY host, folder, fileName, status, downloadedAtTicks) AS sub
SET
csv.version = sub.varVersion
WHERE csv.id = sub.id;