|
1 | 1 | import torch |
2 | 2 | from pytest import mark, raises |
| 3 | +from torch.autograd import grad |
3 | 4 | from torch.testing import assert_close |
4 | 5 |
|
5 | 6 | from torchjd import mtl_backward |
6 | | -from torchjd.aggregation import MGDA, Aggregator, Mean, Random, UPGrad |
| 7 | +from torchjd.aggregation import MGDA, Aggregator, Mean, Random, Sum, UPGrad |
7 | 8 |
|
8 | 9 |
|
9 | 10 | @mark.parametrize("aggregator", [Mean(), UPGrad(), MGDA(), Random()]) |
@@ -557,3 +558,118 @@ def test_default_shared_params_overlapping_with_default_tasks_params_fails(): |
557 | 558 | features=[f], |
558 | 559 | aggregator=UPGrad(), |
559 | 560 | ) |
| 561 | + |
| 562 | + |
def test_repeated_losses():
    """
    Verifies that mtl_backward works when the same loss appears several times in
    ``losses``. Since torch.autograd.backward would sum the gradients of repeated
    losses, it is natural for autojac to sum the task-specific gradients, and to
    compute and aggregate a Jacobian with one row per repeated tensor for the
    shared gradients.
    """

    shared_param = torch.tensor([1.0, 2.0], requires_grad=True)
    task1_param = torch.tensor([1.0, 2.0], requires_grad=True)
    task2_param = torch.tensor([3.0, 4.0], requires_grad=True)

    feat_a = torch.tensor([-1.0, 1.0]) @ shared_param
    feat_b = (shared_param**2).sum() + shared_param.norm()
    loss1 = feat_a * task1_param[0] + feat_b * task1_param[1]
    loss2 = feat_a * task2_param[0] + feat_b * task2_param[1]

    # Reference gradients: loss1 is counted twice, matching the repeated entry.
    expected_grads = [
        grad([loss1, loss1, loss2], [shared_param], retain_graph=True)[0],
        grad([loss1, loss1], [task1_param], retain_graph=True)[0],
        grad([loss2], [task2_param], retain_graph=True)[0],
    ]

    mtl_backward(
        losses=[loss1, loss1, loss2],
        features=[feat_a, feat_b],
        aggregator=Sum(),
        retain_graph=True,
    )

    for param, expected in zip([shared_param, task1_param, task2_param], expected_grads):
        assert_close(param.grad, expected)
| 590 | + |
| 591 | + |
def test_repeated_features():
    """
    Verifies that mtl_backward works when the same feature appears several times in
    ``features``. Repeated features are a bit more tricky: we differentiate with
    respect to them (where the repetition shouldn't matter) and we also
    differentiate them (where each repetition should add an extra row to the
    Jacobian).
    """

    shared_param = torch.tensor([1.0, 2.0], requires_grad=True)
    task1_param = torch.tensor([1.0, 2.0], requires_grad=True)
    task2_param = torch.tensor([3.0, 4.0], requires_grad=True)

    feat_a = torch.tensor([-1.0, 1.0]) @ shared_param
    feat_b = (shared_param**2).sum() + shared_param.norm()
    loss1 = feat_a * task1_param[0] + feat_b * task1_param[1]
    loss2 = feat_a * task2_param[0] + feat_b * task2_param[1]

    repeated_features = [feat_a, feat_a, feat_b]

    # Manually chain the two differentiation stages that mtl_backward performs:
    # losses -> features, then features -> shared params.
    jac_outputs = grad([loss1, loss2], repeated_features, retain_graph=True)
    expected_grads = {
        "shared": grad(repeated_features, [shared_param], jac_outputs, retain_graph=True)[0],
        "task1": grad([loss1], [task1_param], retain_graph=True)[0],
        "task2": grad([loss2], [task2_param], retain_graph=True)[0],
    }

    mtl_backward(losses=[loss1, loss2], features=repeated_features, aggregator=Sum())

    assert_close(shared_param.grad, expected_grads["shared"])
    assert_close(task1_param.grad, expected_grads["task1"])
    assert_close(task2_param.grad, expected_grads["task2"])
| 620 | + |
| 621 | + |
def test_repeated_shared_params():
    """
    Verifies that mtl_backward works when some shared params are repeated. Since
    these are tensors with respect to which we differentiate, to match the
    behavior of torch.autograd.backward, the repetition should not affect the
    result.
    """

    shared_param = torch.tensor([1.0, 2.0], requires_grad=True)
    task1_param = torch.tensor([1.0, 2.0], requires_grad=True)
    task2_param = torch.tensor([3.0, 4.0], requires_grad=True)

    feat_a = torch.tensor([-1.0, 1.0]) @ shared_param
    feat_b = (shared_param**2).sum() + shared_param.norm()
    loss1 = feat_a * task1_param[0] + feat_b * task1_param[1]
    loss2 = feat_a * task2_param[0] + feat_b * task2_param[1]

    # Reference gradients are computed without any repetition.
    expected_grads = [
        grad([loss1, loss2], [shared_param], retain_graph=True)[0],
        grad([loss1], [task1_param], retain_graph=True)[0],
        grad([loss2], [task2_param], retain_graph=True)[0],
    ]

    mtl_backward(
        losses=[loss1, loss2],
        features=[feat_a, feat_b],
        aggregator=Sum(),
        shared_params=[shared_param, shared_param],
    )

    for param, expected in zip([shared_param, task1_param, task2_param], expected_grads):
        assert_close(param.grad, expected)
| 648 | + |
| 649 | + |
def test_repeated_task_params():
    """
    Verifies that mtl_backward works when some task-specific params are repeated
    for some task. Since these are tensors with respect to which we differentiate,
    to match the behavior of torch.autograd.backward, the repetition should not
    affect the result.
    """

    shared_param = torch.tensor([1.0, 2.0], requires_grad=True)
    task1_param = torch.tensor([1.0, 2.0], requires_grad=True)
    task2_param = torch.tensor([3.0, 4.0], requires_grad=True)

    feat_a = torch.tensor([-1.0, 1.0]) @ shared_param
    feat_b = (shared_param**2).sum() + shared_param.norm()
    loss1 = feat_a * task1_param[0] + feat_b * task1_param[1]
    loss2 = feat_a * task2_param[0] + feat_b * task2_param[1]

    # Reference gradients are computed without any repetition.
    expected_grads = [
        grad([loss1, loss2], [shared_param], retain_graph=True)[0],
        grad([loss1], [task1_param], retain_graph=True)[0],
        grad([loss2], [task2_param], retain_graph=True)[0],
    ]

    mtl_backward(
        losses=[loss1, loss2],
        features=[feat_a, feat_b],
        aggregator=Sum(),
        tasks_params=[[task1_param, task1_param], [task2_param]],
    )

    for param, expected in zip([shared_param, task1_param, task2_param], expected_grads):
        assert_close(param.grad, expected)
0 commit comments