From 67cc8f5e3b7ccab9a1bc0738692c2a42cfc44dbb Mon Sep 17 00:00:00 2001
From: Puning97 <114408373+Puning97@users.noreply.github.com>
Date: Mon, 11 May 2026 23:12:41 +0800
Subject: [PATCH 1/4] update es, satimp, and eua

We update the following components:

Extraction Strength (ES) metric for retain data.

Hyperparameter default setting for SatImp

New method EUA, which was accepted at ICML 2026
---
 configs/eval/tofu.yaml                        |  1 +
 .../retain_extraction_strength.yaml           | 15 +++++
 configs/trainer/EUA.yaml                      | 15 +++++
 configs/trainer/SatImp.yaml                   |  2 +-
 src/evals/metrics/__init__.py                 |  2 +
 src/evals/metrics/memorization.py             | 49 ++++++++++++++++
 src/trainer/__init__.py                       |  2 +
 src/trainer/unlearn/eua.py                    | 36 ++++++++++++
 src/trainer/utils.py                          | 56 +++++++++++++++++++
 9 files changed, 177 insertions(+), 1 deletion(-)
 create mode 100644 configs/eval/tofu_metrics/retain_extraction_strength.yaml
 create mode 100644 configs/trainer/EUA.yaml
 create mode 100644 src/trainer/unlearn/eua.py

diff --git a/configs/eval/tofu.yaml b/configs/eval/tofu.yaml
index 29e05e488..e1d4fd368 100644
--- a/configs/eval/tofu.yaml
+++ b/configs/eval/tofu.yaml
@@ -11,6 +11,7 @@ defaults: # include all defined metrics files
     - model_utility # populated in the metrics key as metrics.model_utility
     - privleak
     - extraction_strength
+    - retain_extraction_strength
     # - exact_memorization
     # - mia_min_k_plus_plus
     # - mia_min_k
diff --git a/configs/eval/tofu_metrics/retain_extraction_strength.yaml b/configs/eval/tofu_metrics/retain_extraction_strength.yaml
new file mode 100644
index 000000000..981851211
--- /dev/null
+++ b/configs/eval/tofu_metrics/retain_extraction_strength.yaml
@@ -0,0 +1,15 @@
+# @package eval.tofu.metrics.retain_extraction_strength
+defaults:
+  - ../../data/datasets@datasets: TOFU_QA_retain_eval
+  - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
+  # ^ get default dataset and generation config information
+
+handler: retain_extraction_strength
+batch_size: ${eval.tofu.batch_size}
+
+datasets:
+  TOFU_QA_retain_eval:
+    args:
+      hf_args:
+        name: "retain_perturbed"
+      question_key: ${eval.tofu.question_key}
\ No newline at end of file
diff --git a/configs/trainer/EUA.yaml b/configs/trainer/EUA.yaml
new file mode 100644
index 000000000..ad331b269
--- /dev/null
+++ b/configs/trainer/EUA.yaml
@@ -0,0 +1,15 @@
+defaults:
+  - GradDiff
+
+handler: EUA
+
+args: # HuggingFace TrainingArguments
+  learning_rate: 1e-5
+  num_train_epochs: 10
+
+method_args:
+  beta1: 0.1
+  beta2: 1.0
+  alpha: 1.0   #retain_loss
+  gamma: 0.05    #forget_loss
+  retain_loss_type: NLL
\ No newline at end of file
diff --git a/configs/trainer/SatImp.yaml b/configs/trainer/SatImp.yaml
index f8d9c757b..3e27d8e14 100644
--- a/configs/trainer/SatImp.yaml
+++ b/configs/trainer/SatImp.yaml
@@ -9,7 +9,7 @@ args: # HuggingFace TrainingArguments
 
 method_args:
   beta1: 5.0
-  beta2: 1.0
+  beta2: 0.5
   alpha: 1.0
   gamma: 0.1
   retain_loss_type: NLL
\ No newline at end of file
diff --git a/src/evals/metrics/__init__.py b/src/evals/metrics/__init__.py
index 5afb04243..967e89a7c 100644
--- a/src/evals/metrics/__init__.py
+++ b/src/evals/metrics/__init__.py
@@ -7,6 +7,7 @@
     rouge,
     truth_ratio,
     extraction_strength,
+    retain_extraction_strength,
     exact_memorization,
 )
 from evals.metrics.privacy import ks_test, privleak, rel_diff
@@ -62,6 +63,7 @@ def get_metrics(metric_cfgs: DictConfig, **kwargs):
 _register_metric(rel_diff)
 _register_metric(exact_memorization)
 _register_metric(extraction_strength)
+_register_metric(retain_extraction_strength)
 
 # Register MIA metrics
 _register_metric(mia_loss)
diff --git a/src/evals/metrics/memorization.py b/src/evals/metrics/memorization.py
index c7bbe386c..9ddeb0a64 100644
--- a/src/evals/metrics/memorization.py
+++ b/src/evals/metrics/memorization.py
@@ -267,3 +267,52 @@ def _extraction_strength(model, batch):
     )
     es_values = aggregate_to_1D(es_values)
     return {"agg_value": np.mean(es_values), "value_by_index": scores_by_index}
+
+@unlearning_metric(name="retain_extraction_strength")
+def retain_extraction_strength(model, **kwargs):
+    data = kwargs["data"]
+    collator = kwargs["collators"]
+    batch_size = kwargs["batch_size"]
+    dataloader = DataLoader(data, batch_size=batch_size, collate_fn=collator)
+
+    def _extraction_strength(model, batch):
+        log_probs_batch, labels_batch = tokenwise_vocab_logprobs(
+            model, batch, grad=False, return_labels=True
+        )
+        es_batch = []
+        for log_probs, labels in zip(log_probs_batch, labels_batch):
+            valid_len = len(labels)
+            preds = torch.argmax(log_probs, dim=-1)  # greedy prediction
+            for k in range(valid_len):  # find first k with matching suffix
+                suff_preds = preds[k:]
+                suff_labels = labels[k:]
+                if torch.equal(suff_preds, suff_labels):
+                    break
+            if valid_len == 0:
+                # Rarely, tokenization can result in a mismatch with no valid target
+                # tokens for loss computation (see preprocess_chat_instance() for
+                # reference). ES is undefined here, so record None; the aggregation
+                # below skips None scores instead of counting them as ES=0.
+                logger.warning(
+                    "ES score for an instance is marked None, due to "
+                    "tokenization issues that resulted in no valid target tokens."
+                )
+                es_batch.append({"score": None})
+            else:
+                es_score = 1 - (k / valid_len)
+                es_batch.append({"score": es_score})
+        return es_batch
+
+    fun_args = {}
+    scores_by_index = run_batchwise_evals(
+        model, dataloader, _extraction_strength, fun_args, "Calculating ES"
+    )
+    es_values = np.array(
+        [
+            evals["score"]
+            for evals in scores_by_index.values()
+            if evals["score"] is not None
+        ]
+    )
+    es_values = aggregate_to_1D(es_values)
+    return {"agg_value": np.mean(es_values), "value_by_index": scores_by_index}
diff --git a/src/trainer/__init__.py b/src/trainer/__init__.py
index 447b2d2dc..be7e5be37 100644
--- a/src/trainer/__init__.py
+++ b/src/trainer/__init__.py
@@ -15,6 +15,7 @@
 from trainer.unlearn.satimp import SatImp
 from trainer.unlearn.wga import WGA
 from trainer.unlearn.pdu import PDU
+from trainer.unlearn.eua import EUA
 
 
 import logging
@@ -99,3 +100,4 @@ def load_trainer(
 _register_trainer(SatImp)
 _register_trainer(WGA)
 _register_trainer(PDU)
+_register_trainer(EUA)
diff --git a/src/trainer/unlearn/eua.py b/src/trainer/unlearn/eua.py
new file mode 100644
index 000000000..cdab20c7a
--- /dev/null
+++ b/src/trainer/unlearn/eua.py
@@ -0,0 +1,36 @@
+from trainer.unlearn.grad_diff import GradDiff
+import torch
+import torch.nn.functional as F
+from trainer.utils import compute_eua_loss
+
+class EUA(GradDiff):
+    def __init__(
+        self, beta1=0.3, beta2=1.0, gamma=1.0, alpha=0.1, *args, **kwargs
+    ):  # attention, satimp requires two beta!!!!
+        super().__init__(*args, **kwargs)
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.gamma = gamma
+        self.alpha = alpha
+        if self.ref_model is None:
+            self.ref_model = self._prepare_ref_model(self.model)
+    
+    def compute_loss(self, model, inputs, return_outputs=False):
+        forget_inputs = inputs["forget"]
+        forget_inputs = {
+            "input_ids": forget_inputs["input_ids"],
+            "attention_mask": forget_inputs["attention_mask"],
+            "labels": forget_inputs["labels"],
+        }
+
+        retain_inputs = inputs["retain"]
+        retain_inputs = {
+            "input_ids": retain_inputs["input_ids"],
+            "attention_mask": retain_inputs["attention_mask"],
+            "labels": retain_inputs["labels"],
+        }
+        retain_loss = self.compute_retain_loss(model=model, retain_inputs=retain_inputs)
+        eua_loss, outputs = compute_eua_loss(model=model, forget_inputs=forget_inputs, retain_inputs=retain_inputs, beta1=self.beta1, beta2=self.beta2, ref_model=self.ref_model)
+        loss = self.gamma * eua_loss + self.alpha * retain_loss
+
+        return (loss, outputs) if return_outputs else loss
diff --git a/src/trainer/utils.py b/src/trainer/utils.py
index 5bdb328f4..9da1342e5 100644
--- a/src/trainer/utils.py
+++ b/src/trainer/utils.py
@@ -132,3 +132,59 @@ def compute_satimp_loss(model, inputs, beta1, beta2):
         shift_labels.view(-1) != -100
     ].mean()
     return forget_loss, outputs
+
+def compute_eua_loss(model, forget_inputs, retain_inputs,beta1, beta2, ref_model=None):
+    def get_preference_tensors(logits, ratio=0.1):
+        assert 0 < ratio < 1
+        dim = logits.shape[1]
+        k = int(dim * ratio)
+        if k == 0:
+            raise ValueError("ratio too small, leading k=0.")
+
+        # top ratio%
+        topk_values, topk_indices = torch.topk(logits, k, dim=1)
+        preference_positive = torch.zeros_like(logits)
+        preference_positive.scatter_(1, topk_indices, topk_values)
+
+        # bottom ratio%
+        bottomk_values, bottomk_indices = torch.topk(-logits, k, dim=1)
+        preference_negative = torch.zeros_like(logits)
+        preference_negative.scatter_(1, bottomk_indices, logits.gather(1, bottomk_indices))
+
+        return preference_positive, preference_negative
+    #forget
+    outputs = model(**forget_inputs)
+    labels = forget_inputs["labels"]
+    labels = labels.to(outputs.logits.device)
+
+    shift_logits = outputs.logits[..., :-1, :].contiguous()
+    shift_labels = labels[..., 1:].contiguous()
+    en_out = -torch.logsumexp(shift_logits.view(-1, shift_logits.size(-1))/beta2, dim=1)
+
+    #retain
+    retain_outputs = model(**retain_inputs)
+    retain_labels = retain_inputs["labels"]
+    retain_labels = retain_labels.to(retain_outputs.logits.device)
+    
+    shift_retain_logits = retain_outputs.logits[..., :-1, :].contiguous()
+    shift_retain_labels = retain_labels[..., 1:].contiguous()
+    en_in = -torch.logsumexp(shift_retain_logits.view(-1, shift_retain_logits.size(-1))/beta2, dim=1)
+
+    with torch.no_grad():
+        forget_outputs_oracle = ref_model(**forget_inputs)
+        retain_outputs_oracle = ref_model(**retain_inputs)
+        retain_logits_oracle = retain_outputs_oracle.logits[..., :-1, :].contiguous()
+        forget_logits_oracle = forget_outputs_oracle.logits[..., :-1, :].contiguous()
+
+        forget_positive, forget_negative = get_preference_tensors(forget_logits_oracle.view(-1, forget_logits_oracle.size(-1)),ratio=beta1)
+        retain_positive, retain_negative = get_preference_tensors(retain_logits_oracle.view(-1, retain_logits_oracle.size(-1)),ratio=beta1)
+
+        margin_out = -torch.logsumexp(forget_negative/beta2, dim=1)
+        margin_in =  -torch.logsumexp(retain_positive/beta2, dim=1)
+
+    eua_loss = (torch.pow(F.relu(en_in-margin_in), 2)[shift_retain_labels.view(-1) != -100].mean() + torch.pow(F.relu(margin_out-en_out), 2)[shift_labels.view(-1) != -100].mean())
+    return eua_loss, outputs
+
+
+
+

From 272e87b1712f1266acae58eb3246b7a75e9c82dd Mon Sep 17 00:00:00 2001
From: Puning97 <114408373+Puning97@users.noreply.github.com>
Date: Fri, 15 May 2026 21:37:14 +0800
Subject: [PATCH 2/4] Update SatImp and ES for Retain

---
 README.md                         |  63 ++++++++++++++++--------------
 configs/trainer/SatImp.yaml       |   8 ++--
 src/evals/metrics/memorization.py |   1 +
 src/trainer/__init__.py           |   2 -
 src/trainer/unlearn/.DS_Store     | Bin 0 -> 6148 bytes
 src/trainer/unlearn/eua.py        |  36 -----------------
 src/trainer/unlearn/satimp.py     |   2 +-
 src/trainer/utils.py              |  56 --------------------------
 8 files changed, 40 insertions(+), 128 deletions(-)
 create mode 100644 src/trainer/unlearn/.DS_Store
 delete mode 100644 src/trainer/unlearn/eua.py

diff --git a/README.md b/README.md
index 293bf34ca..1385f90a9 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,10 @@
 <div align="center">
 
-![*Open*Unlearning](assets/banner.png)
+![OpenUnlearning](assets/banner.png)
 
 <h3><strong>An easily extensible framework unifying LLM unlearning evaluation benchmarks.</strong></h3>
 
-  <div style="display: flex; gap: 10px; justify-content: center; align-items: center;">
+<div style="display: flex; gap: 10px; justify-content: center; align-items: center;">
     <a href="https://arxiv.org/abs/2506.12618"><img src="https://img.shields.io/badge/arXiv-Report-b31b1b?logo=arxiv&logoColor=white" alt="arXiv Paper"/></a>
     <a href="https://github.com/locuslab/open-unlearning"><img src="https://img.shields.io/github/stars/locuslab/open-unlearning?style=social" alt="GitHub Repo stars"/></a>
     <a href="https://github.com/locuslab/open-unlearning/actions"><img src="https://github.com/locuslab/open-unlearning/actions/workflows/tests.yml/badge.svg" alt="Build Status"/></a>
@@ -21,7 +21,6 @@
 
 We provide efficient and streamlined implementations of the TOFU, MUSE and WMDP unlearning benchmarks while supporting 12+ unlearning methods, 5+ datasets, 10+ evaluation metrics, and 7+ LLM architectures. Each of these can be easily extended to incorporate more variants.
 
-
 We invite the LLM unlearning community to collaborate by adding new benchmarks, unlearning methods, datasets and evaluation metrics here to expand OpenUnlearning's features, gain feedback from wider usage and drive progress in the field.
 
 ---
@@ -37,14 +36,14 @@ We invite the LLM unlearning community to collaborate by adding new benchmarks,
 🚨 Our paper `OpenUnlearning: Accelerating LLM Unlearning via Unified Benchmarking of Methods and Metrics` is now out on [arXiv](https://arxiv.org/abs/2506.12618).
 
 🌟 **Highlights:**
+
 - A detailed technical report on OpenUnlearning covering the design, features, and implementation.
-- A meta-evaluation framework for benchmarking unlearning evaluations across 450+ models, open-sourced on HuggingFace 🤗: [TOFU Models w & w/o Knowledge](https://huggingface.co/collections/open-unlearning/tofu-models-w-and-w-o-knowledge-6861e4d935eb99ba162e55cd), [TOFU Unlearned Models](https://huggingface.co/collections/open-unlearning/tofu-unlearned-models-6860f6cf3fe35d0223d92e88).
+- A meta-evaluation framework for benchmarking unlearning evaluations across 450+ models, open-sourced on HuggingFace 🤗: [TOFU Models w &amp; w/o Knowledge](https://huggingface.co/collections/open-unlearning/tofu-models-w-and-w-o-knowledge-6861e4d935eb99ba162e55cd), [TOFU Unlearned Models](https://huggingface.co/collections/open-unlearning/tofu-unlearned-models-6860f6cf3fe35d0223d92e88).
 - Results benchmarking 8 diverse unlearning methods in one place using 10 evaluation metrics on TOFU.
 
 <details>
 <summary><b>Older Updates</b></summary>
 
-
 #### [May 19, 2025]
 
 - **More Methods!** Added support for unlearning methods [UNDIAL](https://aclanthology.org/2025.naacl-long.444/) and [AltPO](https://aclanthology.org/2025.coling-main.252/).
@@ -55,6 +54,7 @@ We invite the LLM unlearning community to collaborate by adding new benchmarks,
 - **More evaluations!**  The [`lm-evaluation-harness`](https://github.com/EleutherAI/lm-evaluation-harness) toolkit has been integrated into OpenUnlearning, enabling WMDP evaluations and support for popular general LLM benchmarks, including MMLU, GSM8K, and others.
 
 #### [Apr 6, 2025]
+
 - **More Metrics!** Added 6 Membership Inference Attacks (MIA) (LOSS, ZLib, Reference, GradNorm, MinK, and MinK++), along with Extraction Strength (ES) and  Exact Memorization (EM) as additional evaluation metrics.
 - **More TOFU Evaluations!** Now includes a holdout set and supports MIA attack-based evaluation. You can now compute MUSE's privleak on TOFU.
 - **More Documentation!** [`docs/links.md`](docs/links.md) contains resources for each of the implemented features and other useful LLM unlearning resources.
@@ -62,12 +62,15 @@ We invite the LLM unlearning community to collaborate by adding new benchmarks,
 Be sure to run `python setup_data.py` immediately after merging the latest version. This is required to refresh the downloaded eval log files and ensure they're compatible with the latest evaluation metrics.
 
 #### [Mar 27, 2025]
+
 - **More Documentation: easy contributions and the leaderboard functionality**: We've updated the documentation to make contributing new unlearning methods and benchmarks much easier. Users can document additions better and also update a leaderboard with their results. See [this section](#-how-to-contribute) for details.
 
 #### [Mar 9, 2025]
+
 - **More Methods!** Added support for [RMU](https://arxiv.org/abs/2403.03218) (representation-engineering based unlearning).
 
-#### [Feb 27, 2025]  
+#### [Feb 27, 2025]
+
 ⚠️ **Repository Update**: This repo replaces the original TOFU codebase at [`github.com/locuslab/tofu`](https://github.com/locuslab/tofu), which is no longer maintained.
 
 </details>
@@ -78,18 +81,18 @@ Be sure to run `python setup_data.py` immediately after merging the latest versi
 
 We provide several variants for each of the components in the unlearning pipeline.
 
-| **Component**          | **Available Options** |
-|------------------------|----------------------|
-| **Benchmarks**        | [TOFU](https://arxiv.org/abs/2401.06121), [MUSE](https://muse-bench.github.io/), [WMDP](https://www.wmdp.ai/) |
-| **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO, RMU, UNDIAL, AltPO, SatImp, WGA, CE-U, PDU |
-| **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, Knowledge QA-ROUGE, Model Utility, Forget Quality, TruthRatio, Extraction Strength, Exact Memorization, 6 MIA attacks, [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) |
-| **Datasets**          | MUSE-News (BBC), MUSE-Books (Harry Potter), TOFU (different splits), WMDP-Bio, WMDP-Cyber |
-| **Model Families**    | TOFU: Llama-3.2, Llama-3.1, Llama-2; MUSE: Llama-2; Additional: Phi-3.5, Phi-1.5, Gemma, Zephyr |
+| **Component**          | **Available Options**                                                                                                                                                                                                           |
+| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **Benchmarks**         | [TOFU](https://arxiv.org/abs/2401.06121), [MUSE](https://muse-bench.github.io/), [WMDP](https://www.wmdp.ai/)                                                                                                                                  |
+| **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO, RMU, UNDIAL, AltPO, SatImp, WGA, CE-U, PDU                                                                                                                                                    |
+| **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, Knowledge QA-ROUGE, Model Utility, Forget Quality, TruthRatio, Extraction Strength, Exact Memorization, 6 MIA attacks, [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) |
+| **Datasets**           | MUSE-News (BBC), MUSE-Books (Harry Potter), TOFU (different splits), WMDP-Bio, WMDP-Cyber                                                                                                                                             |
+| **Model Families**     | TOFU: Llama-3.2, Llama-3.1, Llama-2; MUSE: Llama-2; Additional: Phi-3.5, Phi-1.5, Gemma, Zephyr                                                                                                                                       |
 
 ---
 
-
 ## 📌 Table of Contents
+
 - 📖 [Overview](#-overview)
 - 📢 [Updates](#-updates)
 - 🗃️ [Available Components](#%EF%B8%8F-available-components)
@@ -101,7 +104,7 @@ We provide several variants for each of the components in the unlearning pipelin
   - 📜 [Running Baseline Experiments](#-running-baseline-experiments)
 - ➕ [How to Contribute](#-how-to-contribute)
 - 📚 [Further Documentation](#-further-documentation)
-- 🔗 [Support & Contributors](#-support--contributors)
+- 🔗 [Support &amp; Contributors](#-support--contributors)
 - 📝 [Citing this work](#-citing-this-work)
 - 🤝 [Acknowledgements](#-acknowledgements)
 - 📄 [License](#-license)
@@ -129,7 +132,7 @@ python setup_data.py --eval # saves/eval now contains evaluation results of the
 
 ### 🔄 Updated TOFU benchmark
 
-We've updated Open-Unlearning's TOFU benchmark target models to use a wider variety of newer architectures with sizes varying from 1B to 8B. These include Llama 3.2 1B, Llama 3.2 3B, Llama 3.1 8B, and the original Llama-2 7B (re-created) target models from [the old version of TOFU](github.com/locuslab/tofu). 
+We've updated Open-Unlearning's TOFU benchmark target models to use a wider variety of newer architectures with sizes varying from 1B to 8B. These include Llama 3.2 1B, Llama 3.2 3B, Llama 3.1 8B, and the original Llama-2 7B (re-created) target models from [the old version of TOFU](https://github.com/locuslab/tofu).
 
 For each architecture, we have finetuned with four different splits of the TOFU datasets: `full`, `retain90`, `retain95`, `retain99`, for a total of 16 finetuned models. The first serves as the target (base model for unlearning) and the rest are retain models used to measure performance against for each forget split. These models are on [HuggingFace](`https://huggingface.co/collections/open-unlearning/tofu-new-models-67bcf636334ea81727573a9f0`) and the paths to these models can be set in the experimental configs or in command-line overrides.
 
@@ -172,8 +175,8 @@ python src/eval.py --config-name=eval.yaml experiment=eval/tofu/default \
 
 For more details about creating and running evaluations, refer [`docs/evaluation.md`](docs/evaluation.md).
 
-
 ### 📜 Running Baseline Experiments
+
 The scripts below execute standard baseline unlearning experiments on the TOFU and MUSE datasets, evaluated using their corresponding benchmarks. The expected results for these are in [`docs/repro.md`](docs/repro.md).
 
 ```bash
@@ -189,20 +192,20 @@ The above scripts are not tuned and uses default hyper parameter settings. We en
 
 If you are interested in contributing to our work, please have a look at [`contributing.md`](docs/contributing.md) guide.
 
-
 ## 📚 Further Documentation
 
 For more in-depth information on specific aspects of the framework, refer to the following documents:
 
-| **Documentation**                              | **Contains**                                                                                                       |
-|------------------------------------------------|--------------------------------------------------------------------------------------------------------------------|
-| [`docs/contributing.md`](docs/contributing.md)       | Instructions on how to add new methods, benchmarks, components such as trainers, benchmarks, metrics, models, datasets, etc.              |
-| [`docs/evaluation.md`](docs/evaluation.md)       | Detailed instructions on creating and running evaluation metrics and benchmarks.                                     |
-| [`docs/experiments.md`](docs/experiments.md)     | Guide on running experiments in various configurations and settings, including distributed training, fine-tuning, and overriding arguments. |
-| [`docs/hydra.md`](docs/hydra.md)                 | A short tutorial on Hydra features, Hydra is the configuration management package we use extensively.                                  |
-| [`community/leaderboard.md`](community/leaderboard.md)             | Reference results from various unlearning methods run using this framework on TOFU and MUSE benchmarks.              |
-| [`docs/links.md`](docs/links.md)             | List of all links to the research papers or other sources the implemented features are sourced from.              |
-| [`docs/repro.md`](docs/repro.md)            | Results are provided solely for reproducibility purposes, without any parameter tuning.             |
+| **Documentation**                               | **Contains**                                                                                                                          |
+| ----------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| [`docs/contributing.md`](docs/contributing.md)         | Instructions on how to add new methods, benchmarks, components such as trainers, benchmarks, metrics, models, datasets, etc.                |
+| [`docs/evaluation.md`](docs/evaluation.md)             | Detailed instructions on creating and running evaluation metrics and benchmarks.                                                            |
+| [`docs/experiments.md`](docs/experiments.md)           | Guide on running experiments in various configurations and settings, including distributed training, fine-tuning, and overriding arguments. |
+| [`docs/hydra.md`](docs/hydra.md)                       | A short tutorial on Hydra features, Hydra is the configuration management package we use extensively.                                       |
+| [`community/leaderboard.md`](community/leaderboard.md) | Reference results from various unlearning methods run using this framework on TOFU and MUSE benchmarks.                                     |
+| [`docs/links.md`](docs/links.md)                       | List of all links to the research papers or other sources the implemented features are sourced from.                                        |
+| [`docs/repro.md`](docs/repro.md)                       | Results are provided solely for reproducibility purposes, without any parameter tuning.                                                     |
+
 ---
 
 ## 🔗 Support & Contributors
@@ -239,18 +242,20 @@ If you use OpenUnlearning in your research, please make sure to cite our OpenUnl
   url={https://arxiv.org/abs/2407.06460}
 }
 ```
+
 </details>
 
 ---
 
 ### 🤝 Acknowledgements
 
-- This repo is inspired from [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory). 
-- The [TOFU](https://github.com/locuslab/tofu) and [MUSE](https://github.com/swj0419/muse_bench) benchmarks served as the foundation for our re-implementation. 
+- This repo is inspired from [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory).
+- The [TOFU](https://github.com/locuslab/tofu) and [MUSE](https://github.com/swj0419/muse_bench) benchmarks served as the foundation for our re-implementation.
 
 ---
 
 ### 📄 License
+
 This project is licensed under the MIT License. See the [`LICENSE`](LICENSE) file for details.
 
 ---
diff --git a/configs/trainer/SatImp.yaml b/configs/trainer/SatImp.yaml
index 3e27d8e14..8da6b1336 100644
--- a/configs/trainer/SatImp.yaml
+++ b/configs/trainer/SatImp.yaml
@@ -8,8 +8,8 @@ args: # HuggingFace TrainingArguments
   num_train_epochs: 5
 
 method_args:
-  beta1: 5.0
-  beta2: 0.5
-  alpha: 1.0
-  gamma: 0.1
+  beta1: 4.0
+  beta2: 0.1
+  alpha: 0.1
+  gamma: 1.0
   retain_loss_type: NLL
\ No newline at end of file
diff --git a/src/evals/metrics/memorization.py b/src/evals/metrics/memorization.py
index 9ddeb0a64..c70b2f6d2 100644
--- a/src/evals/metrics/memorization.py
+++ b/src/evals/metrics/memorization.py
@@ -268,6 +268,7 @@ def _extraction_strength(model, batch):
     es_values = aggregate_to_1D(es_values)
     return {"agg_value": np.mean(es_values), "value_by_index": scores_by_index}
 
+
 @unlearning_metric(name="retain_extraction_strength")
 def retain_extraction_strength(model, **kwargs):
     data = kwargs["data"]
diff --git a/src/trainer/__init__.py b/src/trainer/__init__.py
index be7e5be37..447b2d2dc 100644
--- a/src/trainer/__init__.py
+++ b/src/trainer/__init__.py
@@ -15,7 +15,6 @@
 from trainer.unlearn.satimp import SatImp
 from trainer.unlearn.wga import WGA
 from trainer.unlearn.pdu import PDU
-from trainer.unlearn.eua import EUA
 
 
 import logging
@@ -100,4 +99,3 @@ def load_trainer(
 _register_trainer(SatImp)
 _register_trainer(WGA)
 _register_trainer(PDU)
-_register_trainer(EUA)
diff --git a/src/trainer/unlearn/.DS_Store b/src/trainer/unlearn/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6
GIT binary patch
literal 6148
zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3
zem<@ulZcFPQ@L2!n>{z**<q8>++&mCkOWA81W14cNZ<zv;LbK1Poaz?KmsK2CSc!(
z0ynLxE!0092;Krf2c+FF_Fe*7ECH>lEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ
zLs35+`xjp>T0<F0fCPF1$Cyrb|F7^5{eNG?83~ZUUlGt@xh*qZDeu<Z%US-OSsOPv
j)R!Z4KLME7ReXlK;d!wEw5GODWMKRea10D2@KpjYNUI8I

literal 0
HcmV?d00001

diff --git a/src/trainer/unlearn/eua.py b/src/trainer/unlearn/eua.py
deleted file mode 100644
index cdab20c7a..000000000
--- a/src/trainer/unlearn/eua.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from trainer.unlearn.grad_diff import GradDiff
-import torch
-import torch.nn.functional as F
-from trainer.utils import compute_eua_loss
-
-class EUA(GradDiff):
-    def __init__(
-        self, beta1=0.3, beta2=1.0, gamma=1.0, alpha=0.1, *args, **kwargs
-    ):  # attention, satimp requires two beta!!!!
-        super().__init__(*args, **kwargs)
-        self.beta1 = beta1
-        self.beta2 = beta2
-        self.gamma = gamma
-        self.alpha = alpha
-        if self.ref_model is None:
-            self.ref_model = self._prepare_ref_model(self.model)
-    
-    def compute_loss(self, model, inputs, return_outputs=False):
-        forget_inputs = inputs["forget"]
-        forget_inputs = {
-            "input_ids": forget_inputs["input_ids"],
-            "attention_mask": forget_inputs["attention_mask"],
-            "labels": forget_inputs["labels"],
-        }
-
-        retain_inputs = inputs["retain"]
-        retain_inputs = {
-            "input_ids": retain_inputs["input_ids"],
-            "attention_mask": retain_inputs["attention_mask"],
-            "labels": retain_inputs["labels"],
-        }
-        retain_loss = self.compute_retain_loss(model=model, retain_inputs=retain_inputs)
-        eua_loss, outputs = compute_eua_loss(model=model, forget_inputs=forget_inputs, retain_inputs=retain_inputs, beta1=self.beta1, beta2=self.beta2, ref_model=self.ref_model)
-        loss = self.gamma * eua_loss + self.alpha * retain_loss
-
-        return (loss, outputs) if return_outputs else loss
diff --git a/src/trainer/unlearn/satimp.py b/src/trainer/unlearn/satimp.py
index f42d4acbb..b72be474d 100644
--- a/src/trainer/unlearn/satimp.py
+++ b/src/trainer/unlearn/satimp.py
@@ -4,7 +4,7 @@
 
 class SatImp(GradDiff):
     def __init__(
-        self, beta1=5.0, beta2=1.0, gamma=1.0, alpha=0.1, *args, **kwargs
+        self, beta1=5.0, beta2=0.1, gamma=1.0, alpha=0.05, *args, **kwargs
     ):  # attention, satimp requires two beta!!!!
         super().__init__(*args, **kwargs)
         self.beta1 = beta1
diff --git a/src/trainer/utils.py b/src/trainer/utils.py
index 9da1342e5..5bdb328f4 100644
--- a/src/trainer/utils.py
+++ b/src/trainer/utils.py
@@ -132,59 +132,3 @@ def compute_satimp_loss(model, inputs, beta1, beta2):
         shift_labels.view(-1) != -100
     ].mean()
     return forget_loss, outputs
-
-def compute_eua_loss(model, forget_inputs, retain_inputs,beta1, beta2, ref_model=None):
-    def get_preference_tensors(logits, ratio=0.1):
-        assert 0 < ratio < 1
-        dim = logits.shape[1]
-        k = int(dim * ratio)
-        if k == 0:
-            raise ValueError("ratio too small, leading k=0.")
-
-        # top ratio%
-        topk_values, topk_indices = torch.topk(logits, k, dim=1)
-        preference_positive = torch.zeros_like(logits)
-        preference_positive.scatter_(1, topk_indices, topk_values)
-
-        # bottom ratio%
-        bottomk_values, bottomk_indices = torch.topk(-logits, k, dim=1)
-        preference_negative = torch.zeros_like(logits)
-        preference_negative.scatter_(1, bottomk_indices, logits.gather(1, bottomk_indices))
-
-        return preference_positive, preference_negative
-    #forget
-    outputs = model(**forget_inputs)
-    labels = forget_inputs["labels"]
-    labels = labels.to(outputs.logits.device)
-
-    shift_logits = outputs.logits[..., :-1, :].contiguous()
-    shift_labels = labels[..., 1:].contiguous()
-    en_out = -torch.logsumexp(shift_logits.view(-1, shift_logits.size(-1))/beta2, dim=1)
-
-    #retain
-    retain_outputs = model(**retain_inputs)
-    retain_labels = retain_inputs["labels"]
-    retain_labels = retain_labels.to(retain_outputs.logits.device)
-    
-    shift_retain_logits = retain_outputs.logits[..., :-1, :].contiguous()
-    shift_retain_labels = retain_labels[..., 1:].contiguous()
-    en_in = -torch.logsumexp(shift_retain_logits.view(-1, shift_retain_logits.size(-1))/beta2, dim=1)
-
-    with torch.no_grad():
-        forget_outputs_oracle = ref_model(**forget_inputs)
-        retain_outputs_oracle = ref_model(**retain_inputs)
-        retain_logits_oracle = retain_outputs_oracle.logits[..., :-1, :].contiguous()
-        forget_logits_oracle = forget_outputs_oracle.logits[..., :-1, :].contiguous()
-
-        forget_positive, forget_negative = get_preference_tensors(forget_logits_oracle.view(-1, forget_logits_oracle.size(-1)),ratio=beta1)
-        retain_positive, retain_negative = get_preference_tensors(retain_logits_oracle.view(-1, retain_logits_oracle.size(-1)),ratio=beta1)
-
-        margin_out = -torch.logsumexp(forget_negative/beta2, dim=1)
-        margin_in =  -torch.logsumexp(retain_positive/beta2, dim=1)
-
-    eua_loss = (torch.pow(F.relu(en_in-margin_in), 2)[shift_retain_labels.view(-1) != -100].mean() + torch.pow(F.relu(margin_out-en_out), 2)[shift_labels.view(-1) != -100].mean())
-    return eua_loss, outputs
-
-
-
-

From 87799a6bc70810eed6de3361c7d935fd14e50315 Mon Sep 17 00:00:00 2001
From: Puning97 <114408373+Puning97@users.noreply.github.com>
Date: Fri, 15 May 2026 21:44:51 +0800
Subject: [PATCH 3/4] Delete EUA.yaml

---
 configs/trainer/EUA.yaml | 15 ---------------
 1 file changed, 15 deletions(-)
 delete mode 100644 configs/trainer/EUA.yaml

diff --git a/configs/trainer/EUA.yaml b/configs/trainer/EUA.yaml
deleted file mode 100644
index ad331b269..000000000
--- a/configs/trainer/EUA.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-defaults:
-  - GradDiff
-
-handler: EUA
-
-args: # HuggingFace TrainingArguments
-  learning_rate: 1e-5
-  num_train_epochs: 10
-
-method_args:
-  beta1: 0.1
-  beta2: 1.0
-  alpha: 1.0   #retain_loss
-  gamma: 0.05    #forget_loss
-  retain_loss_type: NLL
\ No newline at end of file

From 19206497b5925483052bf4e2cc4e694f513b4ae2 Mon Sep 17 00:00:00 2001
From: Puning97 <114408373+Puning97@users.noreply.github.com>
Date: Fri, 15 May 2026 21:55:21 +0800
Subject: [PATCH 4/4] Update satimp.py

---
 src/trainer/unlearn/satimp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/trainer/unlearn/satimp.py b/src/trainer/unlearn/satimp.py
index b72be474d..dfd1ed33a 100644
--- a/src/trainer/unlearn/satimp.py
+++ b/src/trainer/unlearn/satimp.py
@@ -4,7 +4,7 @@
 
 class SatImp(GradDiff):
     def __init__(
-        self, beta1=5.0, beta2=0.1, gamma=1.0, alpha=0.05, *args, **kwargs
+        self, beta1=4.0, beta2=0.1, gamma=1.0, alpha=0.1, *args, **kwargs
     ):  # attention, satimp requires two beta!!!!
         super().__init__(*args, **kwargs)
         self.beta1 = beta1