diff --git a/Project.toml b/Project.toml
index a4b06c5..2418b61 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,17 +1,17 @@
 name = "DecisionFocusedLearningAlgorithms"
 uuid = "46d52364-bc3b-4fac-a992-eb1d3ef2de15"
-version = "0.2.0"
 authors = ["Members of JuliaDecisionFocusedLearning and contributors"]
-
-[workspace]
-projects = ["docs", "test"]
+version = "0.2.0"
 
 [deps]
 DecisionFocusedLearningBenchmarks = "2fbe496a-299b-4c81-bab5-c44dfc55cf20"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 InferOpt = "4846b161-c94e-4150-8dac-c7ae193c601f"
+Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -21,12 +21,18 @@ ValueHistories = "98cad3c8-aec3-5f06-8e41-884608649ab7"
 [compat]
 DecisionFocusedLearningBenchmarks = "0.5.0, 0.6"
 DocStringExtensions = "0.9.5"
+Documenter = "1.17.0"
 Flux = "0.16.9"
 InferOpt = "0.7.1"
+Literate = "2.21.0"
 MLUtils = "0.4.8"
+Plots = "1.41.6"
 ProgressMeter = "1.11.0"
 Random = "1.11.0"
 Statistics = "1.11.1"
 UnicodePlots = "3.8.2"
 ValueHistories = "0.5.6"
 julia = "1.11"
+
+[workspace]
+projects = ["docs", "test"]
diff --git a/src/DecisionFocusedLearningAlgorithms.jl b/src/DecisionFocusedLearningAlgorithms.jl
index d7a6250..19fdf70 100644
--- a/src/DecisionFocusedLearningAlgorithms.jl
+++ b/src/DecisionFocusedLearningAlgorithms.jl
@@ -25,6 +25,7 @@ include("algorithms/abstract_algorithm.jl")
 include("algorithms/supervised/fyl.jl")
 include("algorithms/supervised/anticipative_imitation.jl")
 include("algorithms/supervised/dagger.jl")
+include("algorithms/mirror_descent/mirror_descent.jl")
 
 export TrainingContext
 
@@ -41,7 +42,7 @@ export AbstractMetric,
 
 export AbstractAlgorithm, AbstractImitationAlgorithm
 export PerturbedFenchelYoungLossImitation,
-    DAgger, AnticipativeImitation, train_policy!, train_policy
+    DAgger, AnticipativeImitation, train_policy!, train_policy, MirrorDescent
 export AbstractPolicy, DFLPolicy
 
 end
diff --git a/src/algorithms/MirrorDescent/mirror_descent.jl b/src/algorithms/MirrorDescent/mirror_descent.jl
new file mode 100644
index 0000000..2f282e2
--- /dev/null
+++ b/src/algorithms/MirrorDescent/mirror_descent.jl
@@ -0,0 +1,138 @@
+"""
+$TYPEDEF
+
+Mirror Descent algorithm for learning coordinated solutions on stochastic benchmarks.
+
+The supervised training step of every iteration is delegated to `inner_algorithm`.
+
+Reference: <https://arxiv.org/abs/2505.04757>
+
+# Fields
+$TYPEDFIELDS
+"""
+Base.@kwdef struct MirrorDescent{A} <: AbstractImitationAlgorithm
+    "imitation algorithm used for the supervised training step of each iteration"
+    inner_algorithm::A = PerturbedFenchelYoungLossImitation()
+end
+
+"""
+$TYPEDSIGNATURES
+
+Generate a dataset for `benchmark` and train a new `DFLPolicy` on it using the Mirror
+Descent algorithm.
+
+The first iteration is pure imitation learning (targets come from the anticipative
+solver); every later iteration takes its targets from the perturbed parametric
+anticipative solver applied to the model output.
+
+# Keywords
+- `dataset_size`: number of samples in the generated training dataset
+- `epochs`: number of inner training epochs per iteration
+- `iterations`: number of mirror descent iterations
+- `κ`: factor applied to both the model output and the perturbation scale `ε`
+- `metrics`: tuple of metrics forwarded to the inner `train_policy!` call
+- `seed`: seed forwarded to `generate_dataset` and `generate_statistical_model`
+
+Returns `(histories_per_iteration, policy)`, with one history per iteration.
+"""
+function train_policy(
+    algorithm::MirrorDescent,
+    benchmark::ExogenousStochasticBenchmark;
+    dataset_size=30,
+    epochs=10,
+    iterations=10,
+    κ=1.0,
+    metrics::Tuple=(),
+    seed=nothing,
+)
+    train_dataset = generate_dataset(benchmark, dataset_size; seed=seed)
+
+    # Initialize model and create policy
+    model = generate_statistical_model(benchmark; seed=seed)
+    maximizer = generate_maximizer(benchmark)
+    policy = DFLPolicy(model, maximizer)
+
+    anticipative_solver = generate_anticipative_solver(benchmark)
+    parametric_anticipative_solver = generate_parametric_anticipative_solver(benchmark)
+
+    # One training history is stored per mirror descent iteration.
+    histories_per_iteration = MVHistory[]
+
+    for n_it in 1:iterations
+        println("Iteration $n_it / $iterations")
+
+        # Iteration 1 is plain imitation of the unperturbed anticipative solver; "real"
+        # mirror descent steps (perturbed, κ-scaled solver) start at iteration 2.
+        perturb = n_it > 1
+
+        augmented_dataset = augment_dataset(
+            algorithm.inner_algorithm, benchmark, train_dataset, model,
+            anticipative_solver, parametric_anticipative_solver;
+            κ=κ, perturb=perturb,
+        )
+
+        # Train policy on augmented dataset
+        history = train_policy!(
+            algorithm.inner_algorithm,
+            policy,
+            augmented_dataset;
+            epochs=epochs,
+            metrics=metrics,
+            maximizer_kwargs=sample -> sample.context,
+        )
+
+        push!(histories_per_iteration, history)
+    end
+
+    return histories_per_iteration, policy
+end
+
+
+"""
+$TYPEDSIGNATURES
+
+Return a `Vector{DataSample}` holding one new sample per element of `train_dataset`,
+with a recomputed target `y`.
+
+- `perturb=false`: `y = anticipative_solver(sample.scenario; context=sample)`.
+- `perturb=true`: `y` comes from a `PerturbedAdditive` wrapper around
+  `parametric_anticipative_solver` (scale `κ * ε`; `nb_samples`, `seed` and `threaded`
+  are read from `algorithm`), called on `κ * model(sample.x)`, negated when
+  `is_minimization_problem(bench)` is true.
+"""
+function augment_dataset(
+    algorithm::PerturbedFenchelYoungLossImitation,
+    bench::AbstractStochasticBenchmark,
+    train_dataset::AbstractArray,
+    model,
+    anticipative_solver,
+    parametric_anticipative_solver;
+    κ=1.0,
+    perturb=false,
+)
+    if perturb
+        # Only the fields that configure the perturbation are read from `algorithm`.
+        (; nb_samples, ε, threaded, seed) = algorithm
+        perturbed_maximizer = PerturbedAdditive(
+            parametric_anticipative_solver;
+            ε=κ * ε, nb_samples=nb_samples, seed=seed, threaded=threaded,
+        )
+        # Loop-invariant: minimization problems use the negated, κ-scaled model output.
+        signed_κ = is_minimization_problem(bench) ? -κ : κ
+    end
+
+    augmented_dataset = Vector{DataSample}()
+    for sample in train_dataset
+        y = if perturb
+            θ = model(sample.x)
+            perturbed_maximizer(signed_κ * θ; scenario=sample.scenario, context=sample)
+        else
+            anticipative_solver(sample.scenario; context=sample)
+        end
+        push!(
+            augmented_dataset,
+            DataSample(; x=sample.x, y, instance=sample.context, extra=sample.extra),
+        )
+    end
+    return augmented_dataset
+end
\ No newline at end of file
diff --git a/src/algorithms/mirror_descent/mirror_descent.jl b/src/algorithms/mirror_descent/mirror_descent.jl
new file mode 100644
index 0000000..b0847cb
--- /dev/null
+++ b/src/algorithms/mirror_descent/mirror_descent.jl
@@ -0,0 +1,196 @@
+"""
+$TYPEDEF
+
+Mirror Descent algorithm for learning coordinated solutions on stochastic benchmarks.
+
+The supervised training step of every iteration is delegated to `inner_algorithm`.
+
+Reference: <https://arxiv.org/abs/2505.04757>
+
+# Fields
+$TYPEDFIELDS
+"""
+Base.@kwdef struct MirrorDescent{A<:PerturbedFenchelYoungLossImitation} <: AbstractAlgorithm
+    "imitation algorithm used for the supervised training step of each iteration"
+    inner_algorithm::A = PerturbedFenchelYoungLossImitation()
+end
+
+"""
+$TYPEDSIGNATURES
+
+Train `policy` in place with the Mirror Descent algorithm on `train_dataset`, and return
+a vector with one entry per iteration: the value returned by the inner `train_policy!`.
+
+At each iteration the targets `y` are recomputed by `augment_dataset`, using
+`anticipative_solver` when the iteration is unperturbed and
+`perturbed_anticipative_solver` otherwise. Samples that already carry a `y` are
+overwritten in place, so `train_dataset` itself may be modified.
+
+# Keywords
+- `epochs`: number of training epochs per iteration
+- `iterations`: number of mirror descent iterations
+- `κ`: scaling factor applied to the model output before the perturbed solver is called
+- `metrics`: tuple of metrics to track during training
+- `verbose`: if true, prints progress at each iteration
+- `imitation_start`: if true, the first iteration uses pure imitation learning (no perturbation)
+"""
+function train_policy!(
+    benchmark::ExogenousStochasticBenchmark,
+    algorithm::MirrorDescent,
+    policy::DFLPolicy,
+    train_dataset,
+    anticipative_solver,
+    perturbed_anticipative_solver;
+    epochs=10,
+    iterations=10,
+    κ=1.0,
+    metrics::Tuple=(),
+    verbose::Bool=false,
+    imitation_start::Bool=true,
+)
+    augmented_dataset = train_dataset
+    return map(1:iterations) do n_it
+        verbose && println("Iteration $n_it / $iterations")
+        # Only the very first iteration may skip the perturbation.
+        perturb = n_it > 1 || !imitation_start
+        augmented_dataset = augment_dataset(
+            benchmark, augmented_dataset, policy.statistical_model,
+            anticipative_solver, perturbed_anticipative_solver;
+            κ=κ, perturb=perturb,
+        )
+
+        train_policy!(
+            algorithm.inner_algorithm,
+            policy,
+            augmented_dataset;
+            epochs=epochs,
+            metrics=metrics,
+            maximizer_kwargs=sample -> sample.context,
+        )
+    end
+end
+
+"""
+$TYPEDSIGNATURES
+
+Generate a dataset for the provided benchmark and train a DFLPolicy using the Mirror Descent algorithm.
+
+# Benchmark convenience wrapper
+
+This high-level function handles all setup from the benchmark and returns `(histories_per_iteration, policy)`.
+
+# Keywords
+- `dataset_size`: number of samples in the training dataset
+- `epochs`: number of training epochs per iteration
+- `iterations`: number of mirror descent iterations
+- `κ`: scaling factor applied to the model output and to the perturbation scale `ε`
+- `metrics`: tuple of metrics to track during training
+- `seed`: seed for dataset and model generation (the perturbation uses the inner algorithm's `seed`)
+- `verbose`: if true, prints progress at each iteration
+- `imitation_start`: if true, the first iteration uses pure imitation learning (no perturbation)
+- `model_kwargs`: additional keyword arguments passed to `generate_statistical_model`
+- `maximizer_kwargs`: additional keyword arguments passed to `generate_maximizer`
+- `solver_kwargs`: additional keyword arguments passed to `generate_anticipative_solver` and `generate_parametric_anticipative_solver`
+- `nb_scenarios`: number of scenarios per instance
+- `context_per_instance`: number of contexts per instance
+"""
+function train_policy(
+    algorithm::MirrorDescent,
+    benchmark::ExogenousStochasticBenchmark;
+    dataset_size=30,
+    epochs=10,
+    iterations=10,
+    κ=1.0,
+    metrics::Tuple=(),
+    seed=nothing,
+    verbose::Bool=false,
+    imitation_start::Bool=true,
+    model_kwargs=(;),
+    maximizer_kwargs=(;),
+    solver_kwargs=(;),
+    nb_scenarios=1,
+    context_per_instance=1,
+)
+    train_dataset = generate_dataset(
+        benchmark, dataset_size;
+        nb_scenarios=nb_scenarios, contexts_per_instance=context_per_instance, seed=seed,
+    )
+    model = generate_statistical_model(benchmark; seed=seed, model_kwargs...)
+    policy = DFLPolicy(model, generate_maximizer(benchmark; maximizer_kwargs...))
+
+    anticipative_solver = generate_anticipative_solver(benchmark; solver_kwargs...)
+    parametric_solver = generate_parametric_anticipative_solver(benchmark; solver_kwargs...)
+    # Read the perturbation settings without rebinding the `seed` keyword argument.
+    inner = algorithm.inner_algorithm
+    perturbed_anticipative_solver = PerturbedAdditive(
+        (θ; scenario, kwargs...) -> parametric_solver(θ, scenario; kwargs...);
+        ε=κ * inner.ε, nb_samples=inner.nb_samples, seed=inner.seed, threaded=inner.threaded,
+    )
+    histories_per_iteration = train_policy!(
+        benchmark, algorithm, policy, train_dataset,
+        anticipative_solver, perturbed_anticipative_solver;
+        epochs, iterations, κ, metrics, verbose, imitation_start,
+    )
+    return histories_per_iteration, policy
+end
+
+# Recompute the targets of `train_dataset`, picking the `_augment_dataset` method from
+# whether the element type of the dataset declares a `y` field other than `Nothing`.
+function augment_dataset(
+    bench::ExogenousStochasticBenchmark,
+    train_dataset::AbstractArray,
+    model,
+    anticipative_solver,
+    perturbed_anticipative_solver;
+    κ=1.0,
+    perturb=false
+)
+    has_y = Val(fieldtype(eltype(train_dataset), :y) !== Nothing)
+    solvers = (anticipative_solver, perturbed_anticipative_solver)
+    return _augment_dataset(has_y, bench, train_dataset, model, solvers...; κ, perturb)
+end
+
+# Raw dataset (samples carry no target `y`): build and return a new collection in which
+# every sample is re-created through `DataSample(sample; y=...)` with a computed `y`.
+function _augment_dataset(
+    ::Val{false},
+    bench, train_dataset, model, anticipative_solver, perturbed_anticipative_solver;
+    κ=1.0, perturb=false
+)
+    return map(train_dataset) do sample
+        θ = model(sample.x)
+        y = if !perturb
+            # Imitation target: anticipative solver called on the sample's own scenario.
+            anticipative_solver(sample.scenario; sample.context...)
+        else
+            # Minimization problems flip the sign of the κ-scaled model output.
+            signed_κ = is_minimization_problem(bench) ? -κ : κ
+            perturbed_anticipative_solver(signed_κ * θ; scenario=sample.scenario, sample.context...)
+        end
+        DataSample(sample; y=y)
+    end
+end
+
+# Augmented dataset (samples already have y) → recompute y and overwrite each sample in place
+function _augment_dataset(
+    ::Val{true},
+    bench, train_dataset, model, anticipative_solver, perturbed_anticipative_solver;
+    κ=1.0, perturb=false
+)
+    # `eachindex` (rather than `enumerate`) keeps the write-back valid for any index style.
+    for i in eachindex(train_dataset)
+        sample = train_dataset[i]
+        θ = model(sample.x)
+        y = if perturb
+            signed_κ = is_minimization_problem(bench) ? -κ : κ
+            perturbed_anticipative_solver(signed_κ * θ; scenario=sample.scenario, sample.context...)
+        else
+            anticipative_solver(sample.scenario; sample.context...)
+        end
+        # Keep the stored target type: integer targets are rounded before conversion.
+        ET = eltype(sample.y)
+        y_converted = convert(typeof(sample.y), ET <: Integer ? round.(ET, y) : y)
+        train_dataset[i] = DataSample(sample; y=y_converted)
+    end
+    return train_dataset
+end
\ No newline at end of file
diff --git a/test/mirror_descent.jl b/test/mirror_descent.jl
new file mode 100644
index 0000000..0a42cc3
--- /dev/null
+++ b/test/mirror_descent.jl
@@ -0,0 +1,95 @@
+using DecisionFocusedLearningAlgorithms
+using DecisionFocusedLearningBenchmarks
+using Test
+using ValueHistories
+using Statistics: mean
+
+@testset "MirrorDescent Training" begin
+
+    @testset "MirrorDescent - ContextualStochasticArgmax basic" begin
+        # Smallest end-to-end run: 2 iterations of 2 epochs on a 5-sample dataset.
+        bench = ContextualStochasticArgmaxBenchmark()
+        md = MirrorDescent()
+        histories, policy = train_policy(
+            md, bench; dataset_size=5, epochs=2, iterations=2, seed=0
+        )
+
+        # One `MVHistory` with a recorded training loss is expected per iteration.
+        @test policy isa DFLPolicy
+        @test histories isa Vector
+        @test length(histories) == 2
+        @test all(h -> h isa MVHistory, histories)
+        @test all(h -> haskey(h, :training_loss), histories)
+    end
+
+    @testset "MirrorDescent - StochasticVehicleScheduling basic" begin
+        # Same smoke test on the vehicle scheduling benchmark, with a single sample.
+        bench = StochasticVehicleSchedulingBenchmark()
+        md = MirrorDescent()
+        histories, policy = train_policy(
+            md, bench; dataset_size=1, epochs=2, iterations=2, seed=0
+        )
+
+        # One `MVHistory` with a recorded training loss is expected per iteration.
+        @test policy isa DFLPolicy
+        @test histories isa Vector
+        @test length(histories) == 2
+        @test all(h -> h isa MVHistory, histories)
+        @test all(h -> haskey(h, :training_loss), histories)
+    end
+
+    @testset "MirrorDescent - imitation_start=false" begin
+        # Training must also run when the pure-imitation first iteration is disabled.
+        bench = ContextualStochasticArgmaxBenchmark()
+        md = MirrorDescent()
+        histories, policy = train_policy(
+            md, bench;
+            dataset_size=5, epochs=2, iterations=2, seed=0, imitation_start=false,
+        )
+
+        @test policy isa DFLPolicy
+        @test histories isa Vector
+        @test length(histories) == 2
+    end
+
+    @testset "MirrorDescent - performance improves over iterations" begin
+        benchmark = ContextualStochasticArgmaxBenchmark()
+        algorithm = MirrorDescent()
+
+        val_dataset = generate_dataset(benchmark, 100; seed=99)
+
+        # Mean objective value of the policy's decisions on the validation samples.
+        val_metric = FunctionMetric(:val_obj, val_dataset) do ctx, data
+            vals = map(data) do s
+                θ = ctx.policy.statistical_model(s.x)
+                y = ctx.policy.maximizer(θ; s.context...)
+                Float64(DecisionFocusedLearningBenchmarks.objective_value(benchmark, s, y))
+            end
+            (val_obj = mean(vals),)
+        end
+        histories, policy = train_policy(
+            algorithm, benchmark;
+            dataset_size=20, epochs=3, iterations=5, seed=0, metrics=(val_metric,)
+        )
+
+        # Last recorded validation objective of each iteration, whatever their number.
+        val_objs = [last(get(h, :val_obj)[2]) for h in histories]
+        # Coarse check only: iteration 4 must beat iteration 1 (no per-iteration monotonicity).
+        @test val_objs[4] > val_objs[1]
+    end
+
+    @testset "MirrorDescent - with metrics" begin
+        # A user-supplied metric must show up in the history of every iteration.
+        bench = ContextualStochasticArgmaxBenchmark()
+        md = MirrorDescent()
+        epoch_metric = FunctionMetric(ctx -> ctx.epoch, :epoch)
+
+        histories, policy = train_policy(
+            md, bench;
+            dataset_size=5, epochs=2, iterations=2, seed=0, metrics=(epoch_metric,),
+        )
+
+        @test all(h -> haskey(h, :epoch), histories)
+    end
+
+end