MySQL update join query to solve duplicate Values - mysql

I have a Categories table which has some duplicate Categories as described below,
`Categories`
+========+============+============+
| cat_id | cat_name | item_count |
+========+============+============+
| 1 | Category 1 | 2 |
| 2 | Category 1 | 1 |
| 3 | Category 2 | 2 |
| 4 | Category 3 | 1 |
| 5 | Category 3 | 1 |
+--------+------------+------------+
Here is another junction table which relates to another Items table. The item_count in the first table is the total number of items per cat_id.
`Junction`
+========+=========+
| cat_id | item_id |
+========+=========+
| 1 | 100 |
| 1 | 101 |
| 2 | 102 |
| 3 | 103 |
| 3 | 104 |
| 4 | 105 |
| 5 | 106 |
+--------+---------+
How do I add or combine those items from the duplicate Categories into ones each having maximum item_count among their duplicates? (e.g. Category 1).
Also, if the item_count is the same for those duplicate ones, then the Category with maximum cat_id will be chosen and item_count will be combined to that record. (e.g. Category 3).
Note: Instead of removing the duplicate records, the item_count will
be set to 0.
Below is the expected result.
+========+============+============+
| cat_id | cat_name | item_count |
+========+============+============+
| 1 | Category 1 | 3 |
| 2 | Category 1 | 0 |
| 3 | Category 2 | 2 |
| 4 | Category 3 | 0 |
| 5 | Category 3 | 2 |
+--------+------------+------------+
+========+=========+
| cat_id | item_id |
+========+=========+
| 1 | 100 |
| 1 | 101 |
| 1 | 102 |
| 3 | 103 |
| 3 | 104 |
| 5 | 105 |
| 5 | 106 |
+--------+---------+
In the result, there are two duplicates Category 1 and Category 3. And we have 2 scenarios,
cat_id=2 is eliminated because its item_count=1 is less than
that of cat_id=1 which is item_count=2.
cat_id=4 is eliminated even though its item_count is the same
as that of cat_id=5 since 5 is the maximum among duplicate
Category 3.
Please help me if any query that can join and update both tables in order to solve the duplicates.

Here's a SELECT. You can figure out to adapt it to an UPDATE ;-)
I've ignored the junction table for simplicity.
-- One output row per category: the group's summed item_count on the winning
-- duplicate (highest item_count, ties broken by highest cat_id), 0 on the rest.
SELECT
    member.cat_id,
    member.cat_name,
    (member.cat_id = winner.cat_id) * member.new_count AS item_count
FROM categories AS winner
-- Anti-join: `better` is any same-named row that outranks `winner`;
-- keeping only rows with no such match leaves exactly one winner per name.
LEFT JOIN categories AS better
    ON better.cat_name = winner.cat_name
   AND (
           better.item_count > winner.item_count
        OR (better.item_count = winner.item_count AND better.cat_id > winner.cat_id)
       )
-- Every row of the winner's group, carrying the group's total.
LEFT JOIN (
    SELECT
        c.cat_id,
        totals.cat_name,
        totals.new_count
    FROM categories AS c
    INNER JOIN (
        SELECT
            cat_name,
            SUM(item_count) AS new_count
        FROM categories
        GROUP BY cat_name
    ) AS totals
        ON totals.cat_name = c.cat_name
) AS member
    ON member.cat_name = winner.cat_name
WHERE better.cat_id IS NULL;
+--------+------------+------------+
| cat_id | cat_name | item_count |
+--------+------------+------------+
| 1 | Category 1 | 3 |
| 2 | Category 1 | 0 |
| 3 | Category 2 | 2 |
| 4 | Category 3 | 0 |
| 5 | Category 3 | 2 |
+--------+------------+------------+

DELIMITER $$
DROP PROCEDURE IF EXISTS cursor_proc $$
-- cursor_proc: collapses duplicate category names in Categories.
-- For each cat_name the "winner" is the row with the highest item_count
-- (ties broken by the highest cat_id). The winner receives the sum of the
-- group's item_count values; every other row of the group is set to 0.
-- Only the Categories table is written; no other table is touched.
CREATE PROCEDURE cursor_proc()
BEGIN
    DECLARE v_cat_id INT;
    DECLARE v_cat_name VARCHAR(255);
    DECLARE v_item_count INT;
    DECLARE v_prev_cat_name VARCHAR(255) DEFAULT NULL;
    -- cat_id of the current group's winner; NULL until the first row is read
    DECLARE v_best_cat_id INT DEFAULT NULL;
    DECLARE v_total_items INT DEFAULT 0;
    -- set to TRUE by the handler when the cursor runs out of rows
    DECLARE exit_loop BOOLEAN DEFAULT FALSE;

    -- Rows arrive grouped by name with the winner first in each group,
    -- so the first row of a group is always the one that keeps the total.
    DECLARE categories_cursor CURSOR FOR
        SELECT cat_id, cat_name, item_count
        FROM Categories
        ORDER BY cat_name, item_count DESC, cat_id DESC;

    DECLARE CONTINUE HANDLER FOR NOT FOUND SET exit_loop = TRUE;

    OPEN categories_cursor;

    categories_loop: LOOP
        FETCH categories_cursor INTO v_cat_id, v_cat_name, v_item_count;

        IF exit_loop THEN
            LEAVE categories_loop;
        END IF;

        -- <=> is NULL-safe, so the very first row and NULL names are handled.
        IF v_best_cat_id IS NULL OR NOT (v_prev_cat_name <=> v_cat_name) THEN
            -- A new group starts: write the finished group's total to its winner.
            IF v_best_cat_id IS NOT NULL THEN
                UPDATE Categories
                SET item_count = v_total_items
                WHERE cat_id = v_best_cat_id;
            END IF;

            SET v_prev_cat_name = v_cat_name;
            SET v_best_cat_id = v_cat_id;
            SET v_total_items = COALESCE(v_item_count, 0);
        ELSE
            -- A duplicate of the current winner: fold its count in, then zero it.
            SET v_total_items = v_total_items + COALESCE(v_item_count, 0);

            UPDATE Categories
            SET item_count = 0
            WHERE cat_id = v_cat_id;
        END IF;
    END LOOP categories_loop;

    CLOSE categories_cursor;

    -- The last group has no following row to trigger its flush.
    IF v_best_cat_id IS NOT NULL THEN
        UPDATE Categories
        SET item_count = v_total_items
        WHERE cat_id = v_best_cat_id;
    END IF;
END $$
DELIMITER ;

It's not pretty and copied in part from Strawberry's SELECT
-- Multi-table UPDATE: writes the merged count into categories and repoints
-- junction rows at the winning cat_id of each duplicate group.
UPDATE categories AS cat
INNER JOIN junction AS jun
    ON jun.cat_id = cat.cat_id
INNER JOIN (
    -- One row per category: its new count (0 for non-winners), the winner's
    -- cat_id for its name group, and its own cat_id.
    SELECT
        (member.cat_id = winner.cat_id) * member.new_count AS c,
        winner.cat_id AS newcatid,
        member.cat_id AS oldcatid
    FROM categories AS winner
    LEFT JOIN categories AS better
        ON better.cat_name = winner.cat_name
       AND (
               better.item_count > winner.item_count
            OR (better.item_count = winner.item_count AND better.cat_id > winner.cat_id)
           )
    LEFT JOIN (
        SELECT
            c.cat_id,
            totals.cat_name,
            totals.new_count
        FROM categories AS c
        INNER JOIN (
            SELECT
                cat_name,
                SUM(item_count) AS new_count
            FROM categories
            GROUP BY cat_name
        ) AS totals
            ON totals.cat_name = c.cat_name
    ) AS member
        ON member.cat_name = winner.cat_name
    WHERE better.cat_id IS NULL
) AS sourceX
    ON sourceX.oldcatid = cat.cat_id
SET cat.item_count = sourceX.c,
    jun.cat_id = sourceX.newcatid

I think it's better to do what you want one step at time:
First, get data you need:
-- Per cat_name group: the highest cat_id and the summed item_count.
SELECT Max(`cat_id`), sum(`item_count`) FROM `Categories` GROUP BY `cat_name`
With these data you'll be able to check if update was correctly done.
Then, with a loop on acquired data, update:
-- Gives the highest cat_id of the '#cat_name' group the group's summed
-- item_count. Both lookups are wrapped in a derived table so the statement
-- does not select directly from the table it is updating.
UPDATE Categories
SET item_count = (
        SELECT group_total.Tot
        FROM (
            SELECT SUM(`item_count`) AS Tot
            FROM `Categories`
            WHERE `cat_name` = '#cat_name'
        ) AS group_total
    )
WHERE cat_id = (
        SELECT group_max.MaxId
        FROM (
            SELECT MAX(cat_id) AS MaxId
            FROM Categories
            WHERE `cat_name` = '#cat_name'
        ) AS group_max
    )
Pay attention, if you run twice this code the result will be wrong.
Finally, set others Ids to 0
-- Zeroes item_count on every row of the '#cat_name' group except the one with
-- the highest cat_id. The lookup reads Categories (not `items`) and uses the
-- same '#cat_name' placeholder as the outer filter, so both refer to one group.
UPDATE Categories
SET item_count = 0
WHERE `cat_name` = '#cat_name'
  AND cat_id <> (
        SELECT MaxId
        FROM (
            SELECT MAX(cat_id) AS MaxId
            FROM Categories
            WHERE `cat_name` = '#cat_name'
        ) AS tmp2
      )

Related

MySQL efficient data to junction table query

The problem:
I want to move the links of the categories from the table companies_1 into the company_categories table. The company_id in the company_categories table need to be equal to the id of the companies_2 table. The records of the companies_1 and the companies_2 table are linked by the "name"-column.
The current code below took me over a night, still unfinished! I want to learn to be more efficient and speed this progress up. I feel like there is very much to optimize because there are A LOT of company records.
Another issue was that i found no way how to check where my query was while looping (resulting in no way to check the progress). Because the progress took so long i killed the query and I'm searching for a better way to solve this issue.
The information:
There is a table with companies like:
----------------------------------------
| companies_1 |
----------------------------------------
| id | category_id | name |
----------------------------------------
| 1 | 1 | example-1 |
| 2 | 2 | example-1 |
| 3 | 1 | example-2 |
| 4 | 2 | example-2 |
| 5 | 3 | example-2 |
| 6 | 1 | example-3 |
----------------------------------------
A table with the DISTINCT company names:
-------------------------
| companies_2 |
-------------------------
| id | name |
-------------------------
| 1 | example-1 |
| 2 | example-2 |
| 3 | example-3 |
-------------------------
A categories table, like:
-------------------------
| categories |
-------------------------
| id | name |
-------------------------
And a junction table, like:
---------------------------------
| company_categories |
---------------------------------
| company_id | category_id |
---------------------------------
The current code:
This code works, but is far from efficient.
DELIMITER $$
DROP PROCEDURE IF EXISTS fill_junc_table$$
-- fill_junc_table: copies every (company, category) link from company_old
-- into company_categories, resolving the company id through
-- companies.site_href = company_old.href.
-- Returns a single-row result set with column `r` = number of rows inserted.
--
-- This is one set-based INSERT ... SELECT instead of nested LIMIT i,1 loops.
-- The loops scanned by offset without an ORDER BY (non-deterministic and
-- quadratic), re-inserted links whenever an href occurred more than once, and
-- reused the previous company id when an href had no match in companies.
CREATE PROCEDURE fill_junc_table()
BEGIN
    INSERT INTO company_categories (company_id, category_id)
    SELECT
        c.id,
        co.category_id
    FROM company_old AS co
    INNER JOIN companies AS c
        ON c.site_href = co.href;

    -- ROW_COUNT() reports the rows affected by the INSERT just above.
    SELECT ROW_COUNT() AS r;
END$$
DELIMITER ;
CALL fill_junc_table();
Edit (new idea):
I am going to test another way to solve this problem by fully copying the companies_1 table with the following columns (company_id empty on copy):
---------------------------------------------
| company_id | category_id | name |
---------------------------------------------
Then, I will loop through the companies_2 table to fill the correct company_id related to the name-column.
I hope you can give your thoughts about this. When I finish my test I will leave the result over here for others.
To clarify, I don't see any PIVOT transformation in the company_categories. What I see is you want a JUNCTION TABLE because it seems that companies and categories tables have many-to-many relationship.
In your case, you have company which has multiple categories. And you also have categories assigned to multiple companies.
Now, based on your requirement:
I want to move the links of the categories from the table companies_1
into the company_categories table. The company_id in the
company_categories table need to be equal to the id of the companies_2
table. The records of the companies_1 and the companies_2 table are
linked by the "name"-column.
I arrived with this query:
-- Builds the junction rows: each companies_1 link, keyed by the id that
-- companies_2 holds for the same company name.
INSERT INTO company_categories (company_id, category_id)
SELECT
    dedup.id,
    legacy.category_id
FROM companies_1 AS legacy
INNER JOIN companies_2 AS dedup
    ON dedup.name = legacy.name
Let me know if this works. The nested loops that you created will really take a while.
As @DanielE pointed out, this query works on the assumption that company_categories is empty. Otherwise we will need to use UPDATE.
Why not just update companies_1?
-- Converts companies_1 in place into the junction table.
-- Each statement is terminated so the four can be run as one script.
ALTER TABLE companies_1 ADD (company_id INT);
-- Intentionally no WHERE: every row gets the id companies_2 holds for its name.
UPDATE companies_1 SET company_id = (SELECT id FROM companies_2 WHERE name = companies_1.name);
ALTER TABLE companies_1 DROP name, RENAME TO company_categories;
SELECT id, category_id, company_id FROM `company_categories`;
Output
id category_id company_id
1 1 1
2 2 1
3 1 2
4 2 2
5 3 2
6 1 3

2 You can't specify target table 'p' for update in FROM clause SQL2.sql 7 15

this is my query:
-- This is the statement that fails with
-- "You can't specify target table 'p' for update in FROM clause":
-- the IN subquery reads `pupils`, the same table the UPDATE modifies.
UPDATE pupils p
SET p.rollIdentity = NULL
WHERE p.id IN (SELECT pup469.id FROM pupils pup469
inner JOIN pupils pup470 ON pup470.rollIdentity = pup469.rollIdentity
where pup469.school_id = 469 and pup469.year = 10
AND pup470.school_id = 470 AND pup470.year = 3)
so basically, I need to update only the pupils that have the same rollIdentity in the other school.
I read that I should use exist instead, but I don't fully understand this, can somebody explain further, please? thanks
If you bury your sub query a bit deeper
-- Self-contained demo: wrapping the self-join in one more derived table
-- lets the UPDATE read from the table it is modifying.
DROP TABLE IF EXISTS t;

CREATE TABLE t (
    id INT,
    school_id INT,
    rollIdentity INT,
    yr INT
);

INSERT INTO t (id, school_id, rollIdentity, yr)
VALUES
    (1, 470, 10, 3),
    (2, 470, NULL, 2),
    (3, 469, 10, 10),
    (4, 34, 10, 4);

UPDATE t AS p
SET p.rollIdentity = NULL
WHERE p.id IN (
    SELECT matched.id
    FROM (
        -- school-469 / year-10 rows whose rollIdentity also appears on a
        -- school-470 / year-3 row
        SELECT here.id
        FROM t AS here
        INNER JOIN t AS there
            ON there.rollIdentity = here.rollIdentity
        WHERE here.school_id = 469
          AND here.yr = 10
          AND there.school_id = 470
          AND there.yr = 3
    ) AS matched
);

SELECT id, school_id, rollIdentity, yr FROM t;
You get this
+------+-----------+--------------+------+
| id | school_id | rollIdentity | yr |
+------+-----------+--------------+------+
| 1 | 470 | 10 | 3 |
| 2 | 470 | NULL | 2 |
| 3 | 469 | NULL | 10 |
| 4 | 34 | 10 | 4 |
+------+-----------+--------------+------+
4 rows in set (0.00 sec)
But why would you not want id 1 to change?

Mysql recursive query to get parent category

I have 3 tables:
users
user_groups
groups
A user can be in multiple (sub)groups. They are stored in the user_groups table like this
+--------------+--------------+---------------+
| id | user_id | group_id |
+--------------+--------------+---------------+
| 1 | 1 | 23 |
+--------------+--------------+---------------+
| 2 | 2 | 24 |
-----------------------------------------------
Now in my groups table, the top categories are parent_id = 0
+--------------+--------------+---------------+
| id | parent_id | name |
+--------------+--------------+---------------+
| 1 | 2 | Group 1.1 |
+--------------+--------------+---------------+
| 2 | 0 | Group 1 |
+--------------+--------------+---------------+
| 3 | 2 | Group 1.2 |
+--------------+--------------+---------------+
| 4 | 3 | Group 1.2.1 |
+--------------+--------------+---------------+
| 5 | 2 | Group 1.3 |
+--------------+--------------+---------------+
Now I want to build a query which gives me all the parent groups for all users. I did some research about recursive queries and I found this particular post:
How to create a MySQL hierarchical recursive query
But I have no idea how I should approach this when I join the tables.
This is what I got so far:
-- Users created within the last 365 days, with the name and parent_id of
-- each group they are directly assigned to (NULL when they have none).
SELECT
    u.`id`,
    u.`first_name`,
    u.`last_name`,
    u.`email`,
    u.`language`,
    g.`name`,
    g.`parent_id`
FROM `users` AS u
LEFT JOIN `user_groups` AS ug
    ON ug.`user_id` = u.`id`
LEFT JOIN `groups` AS g
    ON g.`id` = ug.`group_id`
WHERE u.`created` BETWEEN DATE_SUB(NOW(), INTERVAL 365 DAY) AND NOW()
But this query just gets me the name and the id of the subgroup. What I want is the top level group.
Thanks for the help!
The typical solution is to create a stored function that returns the top-level group for any given group by tracing the parentage up until it finds a row with parent_id = 0.
Then you can apply that function to each group that the user is a member of, and select the distinct set of top level groups.
Something like this should work for you:
delimiter $$
drop function if exists get_top_level_group_id $$
-- get_top_level_group_id: walks `groups`.parent_id upward from p_group_id and
-- returns the id of the first group whose parent_id is 0.
-- Returns -1 when a group id on the path does not exist in `groups`.
-- READS SQL DATA is declared because the function only reads; without a
-- characteristic, CREATE FUNCTION is rejected when binary logging is enabled.
-- NOTE(review): a cycle in parent_id would loop forever; assumes the
-- hierarchy is acyclic.
create function get_top_level_group_id (p_group_id int) returns int
reads sql data
begin
    declare v_group_id int;
    declare v_parent_id int;
    -- a SELECT ... INTO that finds no row ends the function with -1
    declare exit handler for not found
    begin
        return -1;
    end;

    set v_group_id = p_group_id;
    set v_parent_id = p_group_id;

    -- each pass moves one level up; v_group_id trails v_parent_id by one step
    while v_parent_id != 0
    do
        set v_group_id = v_parent_id;
        select `parent_id`
        into v_parent_id
        from `groups`
        where id = v_group_id;
    end while;

    return v_group_id;
end $$
delimiter ;
Then you can update your query like this to get those users and their distinct top-level groups:
-- Users created within the last 365 days, paired with the distinct
-- top-level group ids resolved from each of their group memberships.
SELECT DISTINCT
    u.`id`,
    u.`first_name`,
    u.`last_name`,
    u.`email`,
    u.`language`,
    get_top_level_group_id(ug.`group_id`) AS top_level_group_id
FROM `users` AS u
LEFT JOIN `user_groups` AS ug
    ON ug.`user_id` = u.`id`
WHERE u.`created` BETWEEN DATE_SUB(NOW(), INTERVAL 365 DAY) AND NOW()

how to update a field in reverse order to other field in a MYSQL table

I have a table with fields like this
lev_1_id,lev_1_seq,lev_1_new_seq,lev_2_id,lev_2_seq,lev_2_new_seq
284e777e,1,null,b4dce5bb,1,null<br>
284e777e,1,null,dfd158ed,2,null<br>
284e777e,1,null,fedbf511,3,null<br>
0c7e0938,2,null,2333f431,1,null<br>
0c7e0938,2,null,808734fa,2,null<br>
0c7e0938,2,null,2504e0de,3,null<br>
And now I want to update the lev_1_new_seq and, lev_2_new_seq by reversing the values in lev_1_seq and lev_2_seq respectively.
After updating the fields, the table should look like this:
lev_1_id,lev_1_seq,lev_1_new_seq,lev_2_id,lev_2_seq,lev_2_new_seq
284e777e,1,2,b4dce5bb,1,3<br>
284e777e,1,2,dfd158ed,2,2<br>
284e777e,1,2,fedbf511,3,1<br>
0c7e0938,2,1,2333f431,1,3<br>
0c7e0938,2,1,808734fa,2,2<br>
0c7e0938,2,1,2504e0de,3,1<br>
can anyone help me with updating the fields?
Thanks in advance!
This solution works if initial ordering was done by id. 2 - the column you updating, 1 - column that needs to be in reverse order, tmp - table, id - unique key, initial sorting column.
-- Stored-routine body (the CREATE header is not shown).
-- Walks `tmp` by position: for the i-th row in ascending `id` order it writes
-- into column `2` the value of column `1` taken from the i-th row in
-- descending `id` order, i.e. column `1` reversed.
BEGIN
-- p: value of `1` read from the descending side; v: id of the row to update
DECLARE p INT;
DECLARE v INT;
-- n: total row count; i: current zero-based offset
DECLARE n INT DEFAULT 0;
DECLARE i INT DEFAULT 0;
SELECT COUNT(*) FROM `tmp` INTO n;
SET i=0;
WHILE i<n DO
-- i-th row counted from the end
select `1` from `tmp` order by `id` desc limit i,1 into p;
-- i-th row counted from the start
select `id` from `tmp` order by `id` limit i,1 into v;
update `tmp` set `2` = p where `id` = v;
SET i = i + 1;
END WHILE;
End
For your table: 1 = lev_1_seq, 2 = lev_1_new_seq, id = lev_1_id
It appears that you should be able to calculate the "new" sequences by subtracting the existing sequences from the relevant maximums + 1.
e.g. the max(lev_1_seq) + 1 = 3, so for an existing value of 2: 3-2 = 1, for an existing value of 1: 3-1 = 2
-- Reverses both sequences: new = (max + 1) - old.
-- lev1maxseq is taken over the whole table; lev2maxseq per lev_1_id.
UPDATE table1 AS target
INNER JOIN (
    SELECT
        src.lev_1_id,
        overall.lev1maxseq,
        MAX(src.lev_2_seq) + 1 AS lev2maxseq
    FROM table1 AS src
    CROSS JOIN (
        SELECT MAX(lev_1_seq) + 1 AS lev1maxseq
        FROM table1
    ) AS overall
    GROUP BY
        src.lev_1_id,
        overall.lev1maxseq
) AS bounds
    ON bounds.lev_1_id = target.lev_1_id
SET
    target.lev_1_new_seq = bounds.lev1maxseq - target.lev_1_seq,
    target.lev_2_new_seq = bounds.lev2maxseq - target.lev_2_seq
;
see this sqlfiddle
results:
| lev_1_id | lev_1_seq | lev_1_new_seq | lev_2_id | lev_2_seq | lev_2_new_seq |
|----------|-----------|---------------|----------|-----------|---------------|
| 284e777e | 1 | 2 | b4dce5bb | 1 | 3 |
| 284e777e | 1 | 2 | dfd158ed | 2 | 2 |
| 284e777e | 1 | 2 | fedbf511 | 3 | 1 |
| 0c7e0938 | 2 | 1 | 2333f431 | 1 | 3 |
| 0c7e0938 | 2 | 1 | 808734fa | 2 | 2 |
| 0c7e0938 | 2 | 1 | 2504e0de | 3 | 1 |

MySQL Multiplication of Hierarchical data in a single parent-children table

I am working on a project whose MySQL database contains two tables; people and percentages.
people table:
+----+------+--------+
| ID | Name | Parent |
+----+------+--------+
| 1 | A | 0 |
| 2 | B | 1 |
| 3 | C | 2 |
| 4 | D | 3 |
| 5 | E | 1 |
| 6 | F | 0 |
+----+------+--------+
Percentages table:
+----+------------+
| ID | Percentage |
+----+------------+
| 1 | 70% |
| 2 | 60% |
| 3 | 10% |
| 4 | 5% |
| 5 | 40% |
| 6 | 30% |
+----+------------+
The query result I am seeking should be as the following:
+----+------------+----------------+--------+
| ID | Percentage | Calculation | Actual |
+----+------------+----------------+--------+
| 1 | 70 | 70% | 70.00% |
| 2 | 60 | 70%*60% | 42.00% |
| 3 | 10 | 70%*60%*10% | 4.20% |
| 4 | 5 | 70%*60%*10%*5% | 0.21% |
| 5 | 40 | 70%*40% | 28.00% |
| 6 | 30 | 30% | 30.00% |
+----+------------+----------------+--------+
The Calculation column is only for elaboration. Is there any MySQL technique that i could use to achieve this hierarchical query? Even if the percentages table might contain multiple entries (percentages) for the same person ?
A solution is to use the function described at the following link for hierarchical queries:
http://explainextended.com/2009/03/19/hierarchical-queries-in-mysql-adding-ancestry-chains/
Instead of making a PATH though, you will want to calculate the multiplication.
SOLUTION SCRIPT
Copy and paste this directly in a mysql console. I have not had much luck in workbench. Additionally, this can be further optimized by combining hierarchy_sys_connect_by_path_percentage and hierarchy_sys_connect_by_path_percentage_result into one stored procedure. Unfortunately this may be quite slow for giant data sets.
Setup Table and Data
-- Fixture for the hierarchy examples. IF EXISTS lets the script run on a
-- fresh schema; column lists keep the INSERTs valid if columns are added.
drop table if exists people;
drop table if exists percentages;

create table people
(
    id int,
    name varchar(10),
    parent int
);

create table percentages
(
    id int,
    percentage float
);

insert into people (id, name, parent) values (1, ' A ', 0);
insert into people (id, name, parent) values (2, ' B ', 1);
insert into people (id, name, parent) values (3, ' C ', 2);
insert into people (id, name, parent) values (4, ' D ', 3);
insert into people (id, name, parent) values (5, ' E ', 1);
insert into people (id, name, parent) values (6, ' F ', 0);

-- percentages are stored as fractions (0.70 = 70%)
insert into percentages (id, percentage) values (1, 0.70);
insert into percentages (id, percentage) values (2, 0.60);
insert into percentages (id, percentage) values (3, 0.10);
-- NOTE(review): 0.5 is 50%, while the question lists 5% for id 4; the value
-- is kept because the result tables shown with this script were produced
-- with 0.5.
insert into percentages (id, percentage) values (4, 0.5);
insert into percentages (id, percentage) values (5, 0.40);
insert into percentages (id, percentage) values (6, 0.30);
DELIMITER $$
DROP FUNCTION IF EXISTS `hierarchy_sys_connect_by_path_percentage`$$
-- hierarchy_sys_connect_by_path_percentage: returns the percentages on the
-- path from the topmost ancestor down to `node`, joined with `delimiter`
-- (for example '0.7*0.6*0.1').
--   delimiter  text placed between consecutive percentages
--   node       people.id to start from; when NULL the session variable @id
--              is used instead
-- The walk stops, and the accumulated path is returned, as soon as a lookup
-- finds no row (no parent row, no percentage row, or the row equal to
-- @start_with when that session variable is set).
-- The session variables are written with '@'; a '#' prefix would start a
-- MySQL comment and cut the statement off.
CREATE FUNCTION hierarchy_sys_connect_by_path_percentage(
    `delimiter` TEXT,
    node INT)
RETURNS TEXT
NOT DETERMINISTIC
READS SQL DATA
BEGIN
    DECLARE _path TEXT;
    DECLARE _id INT;
    DECLARE _percentage FLOAT;
    -- the only way out of the LOOP below: a SELECT ... INTO that finds nothing
    DECLARE EXIT HANDLER FOR NOT FOUND RETURN _path;

    SET _id = COALESCE(node, @id);

    -- seed the path with the node's own percentage
    SELECT Percentage
    INTO _path
    FROM percentages
    WHERE id = _id;

    LOOP
        -- step up to the parent, unless the current row is @start_with
        SELECT parent
        INTO _id
        FROM people
        WHERE id = _id
          AND COALESCE(id <> @start_with, TRUE);

        SELECT Percentage
        INTO _percentage
        FROM percentages
        WHERE id = _id;

        -- ancestors are prepended so the root ends up leftmost
        SET _path = CONCAT(_percentage, `delimiter`, _path);
    END LOOP;
END $$
DROP FUNCTION IF EXISTS `hierarchy_sys_connect_by_path_percentage_result`$$
-- hierarchy_sys_connect_by_path_percentage_result: returns the product of the
-- percentages on the path from `node` up to its topmost ancestor.
--   node  people.id to start from; when NULL the session variable @id is used
-- The walk stops, and the accumulated product is returned, as soon as a
-- lookup finds no row (no parent row, no percentage row, or the row equal to
-- @start_with when that session variable is set).
-- The session variables are written with '@'; a '#' prefix would start a
-- MySQL comment and cut the statement off.
CREATE FUNCTION hierarchy_sys_connect_by_path_percentage_result(
    node INT)
RETURNS FLOAT
NOT DETERMINISTIC
READS SQL DATA
BEGIN
    -- running product; declared TEXT and converted implicitly by the
    -- multiplication and by the RETURN
    DECLARE _path TEXT;
    DECLARE _id INT;
    DECLARE _percentage FLOAT;
    -- the only way out of the LOOP below: a SELECT ... INTO that finds nothing
    DECLARE EXIT HANDLER FOR NOT FOUND RETURN _path;

    SET _id = COALESCE(node, @id);

    -- seed the product with the node's own percentage
    SELECT Percentage
    INTO _path
    FROM percentages
    WHERE id = _id;

    LOOP
        -- step up to the parent, unless the current row is @start_with
        SELECT parent
        INTO _id
        FROM people
        WHERE id = _id
          AND COALESCE(id <> @start_with, TRUE);

        SELECT Percentage
        INTO _percentage
        FROM percentages
        WHERE id = _id;

        SET _path = _percentage * _path;
    END LOOP;
END $$
DELIMITER ;
Query
-- Per person: own percentage, the ancestry chain rendered as text, and the
-- product of that chain.
SELECT
    person.id AS ID,
    pct.Percentage,
    hierarchy_sys_connect_by_path_percentage('*', person.id) AS Calculation,
    hierarchy_sys_connect_by_path_percentage_result(person.id) AS Actual
FROM people AS person
INNER JOIN percentages AS pct
    ON pct.id = person.id;
Result
+------+------------+-----------------+--------------------+
| ID | Percentage | Calculation | Actual |
+------+------------+-----------------+--------------------+
| 1 | 0.7 | 0.7 | 0.699999988079071 |
| 2 | 0.6 | 0.7*0.6 | 0.419999986886978 |
| 3 | 0.1 | 0.7*0.6*0.1 | 0.0419999994337559 |
| 4 | 0.5 | 0.7*0.6*0.1*0.5 | 0.0210000015795231 |
| 5 | 0.4 | 0.7*0.4 | 0.280000001192093 |
| 6 | 0.3 | 0.3 | 0.300000011920929 |
+------+------------+-----------------+--------------------+
Formatting the numbers is trivial so I leave it to you...
More important are optimizations to make less calls on the database.
Step 1 - Create a MySQL function to return the family tree as a comma delimited TEXT column:
DELIMITER //
-- fnFamilyTree: returns `id` followed by the ids of its ancestors in
-- people.parent order, comma-separated (for example '4,3,2,1').
-- The walk ends at a parent that is NULL or not greater than 0, or when the
-- current id has no row in people.
-- Local variables replace session variables so nothing leaks into the
-- caller's session, and the NOT FOUND handler ends the walk for an unknown id
-- instead of leaving the loop variable unchanged and looping forever.
CREATE FUNCTION fnFamilyTree ( id INT ) RETURNS TEXT
READS SQL DATA
BEGIN
    DECLARE v_tree TEXT;
    DECLARE v_qid INT;
    -- no row for the current id: force the loop condition to fail
    DECLARE CONTINUE HANDLER FOR NOT FOUND SET v_qid = -1;

    SET v_tree = id;
    SET v_qid = id;

    WHILE (v_qid > 0) DO
        -- a NULL parent is mapped to -1 so it also terminates the walk
        SELECT COALESCE(p.parent, -1)
        INTO v_qid
        FROM people p
        WHERE p.id = v_qid LIMIT 1;

        IF ( v_qid > 0 ) THEN
            SET v_tree = CONCAT(v_tree, ',', v_qid);
        END IF;
    END WHILE;

    RETURN v_tree;
END
//
DELIMITER ;
Then use the following SQL to retrieve your results:
-- Per person: own percentage, the percentages of everyone in their family
-- tree joined with '*', and the product of those percentages
-- (EXP(SUM(LOG(x))) multiplies the values of a group).
SELECT
    person.id,
    person.percentage,
    GROUP_CONCAT(link.percentage SEPARATOR '*') AS Calculations,
    EXP(SUM(LOG(link.percentage))) AS Actual
FROM (
    SELECT
        who.id,
        own.percentage,
        fnFamilyTree(who.id) AS FamilyTree
    FROM people AS who
    INNER JOIN percentages AS own
        ON own.id = who.id
) AS person
-- one joined row per id listed in the person's FamilyTree string
INNER JOIN percentages AS link
    ON FIND_IN_SET(link.id, person.FamilyTree) > 0
GROUP BY
    person.id,
    person.percentage
;
SQLFiddle at http://sqlfiddle.com/#!2/9da5b/12
Results:
+------+----------------+-----------------+----------------+
| ID | Percentage | Calculations | Actual |
+------+----------------+-----------------+----------------+
| 1 | 0.699999988079 | 0.7 | 0.699999988079 |
| 2 | 0.600000023842 | 0.7*0.6 | 0.420000009537 |
| 3 | 0.10000000149 | 0.7*0.6*0.1 | 0.04200000158 |
| 4 | 0.5 | 0.1*0.5*0.7*0.6 | 0.02100000079 |
| 5 | 0.40000000596 | 0.4*0.7 | 0.279999999404 |
| 6 | 0.300000011921 | 0.3 | 0.300000011921 |
+------+----------------+-----------------+----------------+
MySQL is a Relational DBS. Your requirements needs a Graph database.
However if you stay at MySQL there exists a few methods to add a few graph features. One of them is the concept of Nested Sets. But I don't suggest that, as it adds a lot of complexity.
-- Fixed-depth version: follows at most three ancestor links by self-joining
-- people, then multiplies the percentages along the path. A missing ancestor
-- contributes a factor of 1. Percentages are treated as whole numbers
-- (divided by 100) and the result is scaled back to a percentage with two
-- decimals.
SELECT
    person.id,
    ROUND(
        p0.percentage / 100
        * COALESCE(p1.percentage / 100, 1)
        * COALESCE(p2.percentage / 100, 1)
        * COALESCE(p3.percentage / 100, 1)
        * 100,
        2
    ) AS x
FROM people AS person
LEFT JOIN people AS parent1
    ON parent1.id = person.parent
LEFT JOIN people AS parent2
    ON parent2.id = parent1.parent
LEFT JOIN people AS parent3
    ON parent3.id = parent2.parent
LEFT JOIN percentages AS p0
    ON p0.id = person.id
LEFT JOIN percentages AS p1
    ON p1.id = parent1.id
LEFT JOIN percentages AS p2
    ON p2.id = parent2.id
LEFT JOIN percentages AS p3
    ON p3.id = parent3.id
;
Consider switching to Postgres9, which supports recursive queries:
-- PostgreSQL: walks the tree top-down from the roots (parent = 0), carrying
-- the visited ids, the percentages seen so far and their running product.
WITH RECURSIVE recp AS (
    -- anchor: root people
    SELECT
        root.id,
        root.name,
        root.parent,
        array[root.id] AS anti_loop,
        array[root_pct.percentage] AS percentages,
        root_pct.percentage AS final_pr
    FROM people AS root
    INNER JOIN percentages AS root_pct
        ON root_pct.id = root.id
    WHERE parent = 0

    UNION ALL

    -- step: children of rows already found; an id already on the path is
    -- skipped so a cyclic parent chain cannot recurse forever
    SELECT
        child.id,
        child.name,
        child.parent,
        recp.anti_loop || child.id,
        recp.percentages || child_pct.percentage,
        recp.final_pr * child_pct.percentage
    FROM people AS child
    INNER JOIN percentages AS child_pct
        ON child_pct.id = child.id
    INNER JOIN recp
        ON recp.id = child.parent
       AND child.id != ALL(recp.anti_loop)
)
SELECT
    id,
    name,
    array_to_string(anti_loop, ' <- ') AS path,
    array_to_string(percentages::numeric(10,2)[], ' * ') AS percentages_str,
    final_pr
FROM recp
ORDER BY anti_loop
Check out sqlFiddle demo