TuringLang
diff --git a/‎docs/src/evaluators.md‎
Lines changed: 51 additions & 0 deletions b/‎docs/src/evaluators.md‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎ext/AbstractPPLDifferentiationInterfaceExt.jl‎
Lines changed: 94 additions & 29 deletions b/‎ext/AbstractPPLDifferentiationInterfaceExt.jl‎
Lines changed: 94 additions & 29 deletions
@@ -138,6 +138,56 @@ library invokes the inner callable many times with same-length dual arrays
 derived from a single user-supplied `x`; re-validating on each invocation
 would be redundant work in the hot path.
 
+## Hessian (`order=2`)
+
+Pass `order=2` to `prepare` to build a Hessian-capable evaluator. The
+returned object answers `value_gradient_and_hessian!!`, which returns
+`(value, gradient, hessian)` in a single call. `order=2` requires
+`problem` to be scalar-valued; if evaluating it at `x` does not return a
+scalar, `prepare` throws.
+
+```julia
+using AbstractPPL: prepare, value_gradient_and_hessian!!
+using ADTypes: AutoForwardDiff
+using ForwardDiff, DifferentiationInterface
+
+quadratic(x) = sum(abs2, x)
+prepared = prepare(AutoForwardDiff(), quadratic, zeros(3); order=2)
+val, grad, hess = value_gradient_and_hessian!!(prepared, [1.0, 2.0, 3.0])
+# val == 14.0
+# grad == [2.0, 4.0, 6.0]
+# hess == [2 0 0; 0 2 0; 0 0 2]
+```
+
+Both `context=` and `check_dims=` apply to `order=2` preps with the same
+semantics as for `order=1`. The `!!` aliasing contract also extends: the
+returned gradient and Hessian may alias internal cache buffers of
+`prepared`, so copy before retaining them past the next call. NamedTuple
+inputs are not supported at `order=2`.
+
+For DifferentiationInterface, `adtype` can be either a single backend
+(letting DI pick its own Hessian strategy) or a
+[`DifferentiationInterface.SecondOrder(outer, inner)`](https://juliadiff.org/DifferentiationInterface.jl/stable/api/#DifferentiationInterface.SecondOrder)
+composition that selects the outer differentiator and the inner gradient
+backend independently — typically forward-over-reverse:
+
+```julia
+using DifferentiationInterface: SecondOrder
+using ADTypes: AutoForwardDiff, AutoReverseDiff
+
+adtype = SecondOrder(AutoForwardDiff(), AutoReverseDiff())
+prepared = prepare(adtype, quadratic, zeros(3); order=2)
+```
+
+Because `SecondOrder <: AbstractADType`, the same
+`prepare(adtype, problem, x; order=2)` entry point handles it.
+
+Calling `value_gradient_and_hessian!!` on an `order=1` prep throws an
+`ArgumentError` — re-prepare with `order=2` instead. Likewise, calling
+`value_and_gradient!!` or `value_and_jacobian!!` on an `order=2` prep is
+unsupported and throws; call `value_gradient_and_hessian!!` and discard the
+return values you do not need.
+
 ## Constant context arguments
 
 When the underlying callable naturally takes the form `f(x, context...)` —
@@ -177,4 +227,5 @@ p([1.0, 2.0, 3.0])
 AbstractPPL.prepare
 AbstractPPL.value_and_gradient!!
 AbstractPPL.value_and_jacobian!!
+AbstractPPL.value_gradient_and_hessian!!
 ```
@@ -5,27 +5,39 @@ using AbstractPPL.Evaluators: Evaluators, Prepared, VectorEvaluator, _ad_output_
 using ADTypes: AbstractADType, AutoReverseDiff
 using DifferentiationInterface: DifferentiationInterface as DI
 
-# AD target used by both `DICache` modes. `Vararg{Any,N}` with a free `N`
+# AD target used by every `DICache` mode. `Vararg{Any,N}` with a free `N`
 # forces specialization on the trailing arity (a bare `Vararg{Any}` would
 # skip it). DI invokes this as `_call_evaluator(x, f, c1, …, cN)` on the
 # constants path, and as `_call_evaluator(x, evaluator)` (via `Fix2`) on
 # the closure path — empty `ctx` then makes the splat a no-op.
 @inline _call_evaluator(x, f::F, ctx::Vararg{Any,N}) where {F,N} = f(x, ctx...)
 
 # `Mode` tags the cache shape:
-#   * `:closure`    — compiled-tape ReverseDiff: target is a `Fix2` closure,
-#                     the AD call passes **0** `DI.Constant`s.
-#   * `N::Int`      — constants path: `N == length(evaluator.context)`, the
-#                     AD call passes **N + 1** `DI.Constant`s (`f` plus the
-#                     `N` context values).
-# Encoding `Mode` in the type resolves the dispatch in `_di_value_and_*`
-# at compile time without a runtime branch.
-struct DICache{Mode,F,GP,JP}
+#   * `:closure` — compiled-tape ReverseDiff: target is a `Fix2` closure, the
+#                  AD call passes **0** `DI.Constant`s.
+#   * `N::Int`   — constants path: `N == length(evaluator.context)`, the AD
+#                  call passes **N + 1** `DI.Constant`s (`f` plus the `N`
+#                  context values).
+# Encoding `Mode` in the type resolves the dispatch in `_di_value_and_*` at
+# compile time without a runtime branch.
+#
+# Single cache for every derivative order. At most one of `gradient_prep`,
+# `jacobian_prep`, `hessian_prep` is non-`nothing` in any instance; the hot-path
+# methods discriminate via `=== nothing` checks (folded at compile time since
+# field types are concrete in each instantiation). `grad_buf` / `hess_buf` are
+# set only for non-empty order=2 preps — caller-owned output buffers handed to
+# `DI.value_gradient_and_hessian!`. Returned arrays alias them (`!!` contract).
+struct DICache{Mode,F,GP,JP,HP,G,H}
     target::F
     gradient_prep::GP
     jacobian_prep::JP
-    function DICache{Mode}(target::F, gp::GP, jp::JP) where {Mode,F,GP,JP}
-        return new{Mode,F,GP,JP}(target, gp, jp)
+    hessian_prep::HP
+    grad_buf::G
+    hess_buf::H
+    function DICache{Mode}(
+        target::F, gp::GP, jp::JP, hp::HP, g::G, h::H
+    ) where {Mode,F,GP,JP,HP,G,H}
+        return new{Mode,F,GP,JP,HP,G,H}(target, gp, jp, hp, g, h)
     end
 end
 
@@ -48,18 +60,42 @@ function _prepare_di(prep::F, adtype::AbstractADType, x, evaluator) where {F}
     )
 end
 
-@inline _wrap_cache(target, gp, jp, ::Val{Mode}) where {Mode} =
-    DICache{Mode}(target, gp, jp)
+@inline _wrap_cache(target, gp, jp, ::Val{Mode}) where {Mode} = DICache{Mode}(
+    target, gp, jp, nothing, nothing, nothing
+)
 
 function AbstractPPL.prepare(
     adtype::AbstractADType,
     problem,
     x::AbstractVector{<:Real};
     check_dims::Bool=true,
     context::Tuple=(),
+    order::Int=1,
 )
     evaluator = AbstractPPL.prepare(problem, x; check_dims, context)::VectorEvaluator
     arity = _ad_output_arity(evaluator(x))
+    if order == 2
+        arity === :scalar || Evaluators._throw_hessian_needs_scalar()
+        if length(x) == 0
+            # DI Hessian prep crashes on length-0 input, so skip it: the hot
+            # path returns early on empty `x` without calling DI. `Val(0)` is a
+            # non-`nothing` sentinel for `hessian_prep` so this is still seen
+            # as an order=2 prep (mirrors the order=1 empty-input pattern below).
+            cache = _wrap_hessian_cache(
+                _call_evaluator, Val(0), nothing, nothing, Val(length(context))
+            )
+            return Prepared(adtype, evaluator, cache)
+        end
+        target, hessian_prep, mode = _prepare_di(DI.prepare_hessian, adtype, x, evaluator)
+        # Buffers pre-allocated from `x` (shape and eltype): the hot path is
+        # zero-allocation on the gradient/Hessian outputs, and the returned
+        # arrays alias these slots — copy if you need to retain them.
+        grad_buf = similar(x)
+        hess_buf = similar(x, length(x), length(x))
+        cache = _wrap_hessian_cache(target, hessian_prep, grad_buf, hess_buf, mode)
+        return Prepared(adtype, evaluator, cache)
+    end
+    order == 1 || throw(ArgumentError("`order` must be 1 or 2, got $order."))
     if length(x) == 0
         # DI prep crashes on length-0 input (e.g. ForwardDiff `BoundsError`).
         # `Val(0)` is an arity sentinel for the `gradient_prep === nothing`
@@ -78,36 +114,35 @@ function AbstractPPL.prepare(
     return Prepared(adtype, evaluator, _wrap_cache(target, nothing, jacobian_prep, mode))
 end
 
+@inline _wrap_hessian_cache(target, hp, g, h, ::Val{Mode}) where {Mode} = DICache{Mode}(
+    target, nothing, nothing, hp, g, h
+)
+
 # Hot-path dispatch is by `Mode` (closure vs constants), resolved at compile
 # time. The unconstrained method matches every non-`:closure` `Mode` (i.e.
 # any `Int N`); `:closure` is strictly more specific and wins for compiled
 # tapes. On the constants path we always pass `DI.Constant(eval.f)` plus the
 # `N` context constants — `N == 0` collapses the `map` splat to nothing.
-@inline _di_value_and_gradient(c::DICache{:closure}, ad, x, _) =
-    DI.value_and_gradient(c.target, c.gradient_prep, ad, x)
+@inline _di_value_and_gradient(c::DICache{:closure}, ad, x, _) = DI.value_and_gradient(
+    c.target, c.gradient_prep, ad, x
+)
 @inline _di_value_and_gradient(c::DICache, ad, x, eval) = DI.value_and_gradient(
-    c.target,
-    c.gradient_prep,
-    ad,
-    x,
-    DI.Constant(eval.f),
-    map(DI.Constant, eval.context)...,
+    c.target, c.gradient_prep, ad, x, DI.Constant(eval.f), map(DI.Constant, eval.context)...
 )
 
-@inline _di_value_and_jacobian(c::DICache{:closure}, ad, x, _) =
-    DI.value_and_jacobian(c.target, c.jacobian_prep, ad, x)
+@inline _di_value_and_jacobian(c::DICache{:closure}, ad, x, _) = DI.value_and_jacobian(
+    c.target, c.jacobian_prep, ad, x
+)
 @inline _di_value_and_jacobian(c::DICache, ad, x, eval) = DI.value_and_jacobian(
-    c.target,
-    c.jacobian_prep,
-    ad,
-    x,
-    DI.Constant(eval.f),
-    map(DI.Constant, eval.context)...,
+    c.target, c.jacobian_prep, ad, x, DI.Constant(eval.f), map(DI.Constant, eval.context)...
 )
 
 @inline function AbstractPPL.value_and_gradient!!(
     p::Prepared{<:AbstractADType,<:VectorEvaluator,<:DICache}, x::AbstractVector{T}
 ) where {T<:Real}
+    # Both `=== nothing` branches fold at compile time: each instantiation
+    # has concrete field types, so only the relevant branch survives.
+    p.cache.hessian_prep === nothing || Evaluators._throw_use_value_gradient_and_hessian()
     p.cache.gradient_prep === nothing && Evaluators._throw_gradient_needs_scalar()
     Evaluators._check_ad_input(p.evaluator, x)
     # Bypass DI on length-0 input — DI prep paths fail (e.g. ForwardDiff
@@ -119,6 +154,7 @@ end
 @inline function AbstractPPL.value_and_jacobian!!(
     p::Prepared{<:AbstractADType,<:VectorEvaluator,<:DICache}, x::AbstractVector{T}
 ) where {T<:Real}
+    p.cache.hessian_prep === nothing || Evaluators._throw_use_value_gradient_and_hessian()
     p.cache.jacobian_prep === nothing && Evaluators._throw_jacobian_needs_vector()
     Evaluators._check_ad_input(p.evaluator, x)
     if length(x) == 0
@@ -128,4 +164,33 @@ end
     return _di_value_and_jacobian(p.cache, p.adtype, x, p.evaluator)
 end
 
+# Hessian hot-path dispatch mirrors the gradient/jacobian helpers above:
+# `:closure` (compiled-tape) vs constants `Mode`, resolved at compile time.
+# Uses DI's in-place variant `value_gradient_and_hessian!` with caller-owned
+# buffers; the returned `grad` and `hess` alias `c.grad_buf` / `c.hess_buf`.
+@inline _di_value_gradient_and_hessian(c::DICache{:closure}, ad, x, _) = DI.value_gradient_and_hessian!(
+    c.target, c.grad_buf, c.hess_buf, c.hessian_prep, ad, x
+)
+@inline _di_value_gradient_and_hessian(c::DICache, ad, x, eval) = DI.value_gradient_and_hessian!(
+    c.target,
+    c.grad_buf,
+    c.hess_buf,
+    c.hessian_prep,
+    ad,
+    x,
+    DI.Constant(eval.f),
+    map(DI.Constant, eval.context)...,
+)
+
+@inline function AbstractPPL.value_gradient_and_hessian!!(
+    p::Prepared{<:AbstractADType,<:VectorEvaluator,<:DICache}, x::AbstractVector{T}
+) where {T<:Real}
+    # Order=1 preps have `hessian_prep === nothing` (compile-folded check).
+    p.cache.hessian_prep === nothing && Evaluators._throw_hessian_needs_order_2_prep()
+    Evaluators._check_ad_input(p.evaluator, x)
+    # Empty-input shortcut — same reasoning as the order=1 path.
+    length(x) == 0 && return (p.evaluator(x), T[], similar(x, 0, 0))
+    return _di_value_gradient_and_hessian(p.cache, p.adtype, x, p.evaluator)
+end
+
 end # module