Proximal operator for the entropy of location-scale families (#168)

Red-Portal · github-actions[bot] · sunxd3 · web-flow · commit 8fdff7253c7f · 2025-04-29T11:40:59.000-04:00
* add proximal operator for the entropy of location-scale families

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* improve docstring for zero gradient entropy estimators

* add missing file

* add documentation for proximal operator

* run formatter

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* fix improve type stability

* apply formatter

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* fix typo in docstring

Co-authored-by: Xianda Sun <5433119+sunxd3@users.noreply.github.com>

* fix typo in comment

Co-authored-by: Xianda Sun <5433119+sunxd3@users.noreply.github.com>

* apply code review comments

* bump compat bound for subprojects

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Xianda Sun <5433119+sunxd3@users.noreply.github.com>
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "AdvancedVI"
 uuid = "b5ca4192-6429-45e5-a2d9-87aec30a685c"
-version = "0.3.2"
+version = "0.4.0"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
diff --git a/bench/Project.toml b/bench/Project.toml
@@ -20,7 +20,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
 ADTypes = "1"
-AdvancedVI = "0.3"
+AdvancedVI = "0.3, 0.4"
 BenchmarkTools = "1"
 Bijectors = "0.13, 0.14, 0.15"
 Distributions = "0.25.111"
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -15,7 +15,7 @@ StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
 
 [compat]
 ADTypes = "1"
-AdvancedVI = "0.3, 0.2"
+AdvancedVI = "0.4"
 Bijectors = "0.13.6, 0.14, 0.15"
 Distributions = "0.25"
 Documenter = "1"
diff --git a/docs/src/optimization.md b/docs/src/optimization.md
@@ -33,11 +33,20 @@ For this, an operator acting on the parameters can be supplied via the  `operato
 
 ### [`ClipScale`](@id clipscale)
 
-For the location scale, it is often the case that optimization is stable only when the smallest eigenvalue of the scale matrix is strictly positive[^D2020].
+For the location-scale family, it is often the case that optimization is stable only when the smallest eigenvalue of the scale matrix is strictly positive[^D2020].
 To ensure this, we provide the following projection operator:
 
 ```@docs
 ClipScale
 ```
 
+### [`ProximalLocationScaleEntropy`](@id proximallocationscaleentropy)
+
+ELBO maximization with the location-scale family tends to be unstable when the scale has small eigenvalues or the stepsize is large.
+To remedy this, a proximal operator of the entropy[^D2020] can be used.
+
+```@docs
+ProximalLocationScaleEntropy
+```
+
 [^D2020]: Domke, J. (2020). Provable smoothness guarantees for black-box variational inference. In *International Conference on Machine Learning*.
diff --git a/ext/AdvancedVIBijectorsExt.jl b/ext/AdvancedVIBijectorsExt.jl
@@ -9,6 +9,7 @@ using Random
 function AdvancedVI.apply(
     op::ClipScale,
     ::Type{<:Bijectors.TransformedDistribution{<:AdvancedVI.MvLocationScale}},
+    state,
     params,
     restructure,
 )
@@ -27,6 +28,7 @@ end
 function AdvancedVI.apply(
     op::ClipScale,
     ::Type{<:Bijectors.TransformedDistribution{<:AdvancedVI.MvLocationScaleLowRank}},
+    state,
     params,
     restructure,
 )
@@ -40,6 +42,26 @@ function AdvancedVI.apply(
     return params
 end
 
+function AdvancedVI.apply(
+    ::AdvancedVI.ProximalLocationScaleEntropy,
+    ::Type{<:Bijectors.TransformedDistribution{<:AdvancedVI.MvLocationScale}},
+    leaf::Optimisers.Leaf,
+    params,
+    restructure,
+)
+    q = restructure(params)
+    # An unsupported `leaf.rule` errors here with a message that names the rule.
+    stepsize = AdvancedVI.stepsize_from_optimizer_state(leaf.rule, leaf.state)
+    diag_idx = diagind(q.dist.scale)
+    scale_diag = q.dist.scale[diag_idx]
+    # Divide by the integer 2 rather than scaling by `1 / 2`, which is a `Float64`.
+    @. q.dist.scale[diag_idx] =
+        scale_diag + (sqrt(scale_diag^2 + 4 * stepsize) - scale_diag) / 2
+
+    params, _ = Optimisers.destructure(q)
+    return params
+end
+
 function AdvancedVI.reparam_with_entropy(
     rng::Random.AbstractRNG,
     q::Bijectors.TransformedDistribution,
diff --git a/src/AdvancedVI.jl b/src/AdvancedVI.jl
@@ -177,13 +177,14 @@ function estimate_gradient! end
 abstract type AbstractEntropyEstimator end
 
 """
-    estimate_entropy(entropy_estimator, mc_samples, q)
+    estimate_entropy(entropy_estimator, mc_samples, q, q_stop)
 
 Estimate the entropy of `q`.
 
 # Arguments
 - `entropy_estimator`: Entropy estimation strategy.
 - `q`: Variational approximation.
+- `q_stop`: Variational approximation detached from the automatic differentiation graph.
 - `mc_samples`: Monte Carlo samples used to estimate the entropy. (Only used for Monte Carlo strategies.)
 
 # Returns
@@ -192,7 +193,12 @@ Estimate the entropy of `q`.
 function estimate_entropy end
 
 export RepGradELBO,
-    ScoreGradELBO, ClosedFormEntropy, StickingTheLandingEntropy, MonteCarloEntropy
+    ScoreGradELBO,
+    ClosedFormEntropy,
+    StickingTheLandingEntropy,
+    MonteCarloEntropy,
+    ClosedFormEntropyZeroGradient,
+    StickingTheLandingEntropyZeroGradient
 
 include("objectives/elbo/entropy.jl")
 include("objectives/elbo/repgradelbo.jl")
@@ -259,20 +265,21 @@ export NoAveraging, PolynomialAveraging
 abstract type AbstractOperator end
 
 """
-    apply(op::AbstractOperator, family, params, restructure)
+    apply(op::AbstractOperator, family, opt_state, params, restructure)
 
 Apply operator `op` on the variational parameters `params`. For instance, `op` could be a projection or proximal operator.
 
 # Arguments
 - `op::AbstractOperator`: Operator operating on the parameters `params`.
 - `family::Type`: Type of the variational approximation `restructure(params)`.
+- `opt_state`: State of the optimizer after the update step that produced `params`.
 - `params`: Variational parameters.
 - `restructure`: Function that reconstructs the variational approximation from `params`.
 
 # Returns
 - `oped_params`: Parameters resulting from applying the operator.
 """
-function apply(::AbstractOperator, ::Type, ::Any, ::Any) end
+function apply(::AbstractOperator, ::Type, ::Optimisers.AbstractRule, ::Any, ::Any) end
 
 """
     IdentityOperator()
@@ -281,11 +288,12 @@ Identity operator.
 """
 struct IdentityOperator <: AbstractOperator end
 
-apply(::IdentityOperator, ::Type, params, restructure) = params
+apply(::IdentityOperator, ::Type, ::Any, params, ::Any) = params
 
 include("optimization/clip_scale.jl")
+include("optimization/proximal_location_scale_entropy.jl")
 
-export IdentityOperator, ClipScale
+export IdentityOperator, ClipScale, ProximalLocationScaleEntropy
 
 # Main optimization routine
 function optimize end
diff --git a/src/objectives/elbo/entropy.jl b/src/objectives/elbo/entropy.jl
@@ -1,4 +1,19 @@
 
+"""
+    ClosedFormEntropyZeroGradient()
+
+Use the closed-form expression of the entropy, evaluated on `q_stop`, the variational
+approximation detached from the AD graph, instead of on `q`.
+This is expected to be used only with `ProximalLocationScaleEntropy`.
+
+# Requirements
+- The variational approximation implements `entropy`.
+"""
+struct ClosedFormEntropyZeroGradient <: AbstractEntropyEstimator end
+
+# The samples and the differentiable `q` are ignored; only `q_stop` is used.
+estimate_entropy(::ClosedFormEntropyZeroGradient, ::Any, ::Any, q_stop) = entropy(q_stop)
+
 """
     ClosedFormEntropy()
 
@@ -9,12 +24,27 @@ Use closed-form expression of entropy[^TL2014][^KTRGB2017].
 """
 struct ClosedFormEntropy <: AbstractEntropyEstimator end
 
-maybe_stop_entropy_score(::AbstractEntropyEstimator, q, q_stop) = q
-
-function estimate_entropy(::ClosedFormEntropy, ::Any, q)
+function estimate_entropy(::ClosedFormEntropy, ::Any, q, q_stop)
     return entropy(q)
 end
 
+"""
+    MonteCarloEntropy()
+
+Estimate the entropy as the sample average of `-logpdf(q, η)` over the Monte Carlo samples `η`.
+
+# Requirements
+- The variational approximation `q` implements `logpdf`.
+- `logpdf(q, η)` must be differentiable by the selected AD framework.
+"""
+struct MonteCarloEntropy <: AbstractEntropyEstimator end
+
+function estimate_entropy(::MonteCarloEntropy, mc_samples::AbstractMatrix, q, ::Any)
+    # One sample per column; the detached approximation (last argument) is not needed.
+    negative_logdensity = sample -> -logpdf(q, sample)
+    return mean(negative_logdensity, eachcol(mc_samples))
+end
+
 """
     StickingTheLandingEntropy()
 
@@ -26,14 +56,35 @@ The "sticking the landing" entropy estimator[^RWD2017].
 """
 struct StickingTheLandingEntropy <: AbstractEntropyEstimator end
 
-struct MonteCarloEntropy <: AbstractEntropyEstimator end
+function estimate_entropy(
+    ::StickingTheLandingEntropy, mc_samples::AbstractMatrix, ::Any, q_stop
+)
+    # The density is evaluated on `q_stop` only; the differentiable `q` is unused here.
+    negative_logdensity = sample -> -logpdf(q_stop, sample)
+    return mean(negative_logdensity, eachcol(mc_samples))
+end
 
-maybe_stop_entropy_score(::StickingTheLandingEntropy, q, q_stop) = q_stop
+"""
+    StickingTheLandingEntropyZeroGradient()
+
+The "sticking the landing" entropy estimator[^RWD2017] but modified to have a gradient of mean zero.
+This is expected to be used only with `ProximalLocationScaleEntropy`.
+
+# Requirements
+- The variational approximation `q` implements `logpdf`.
+- `logpdf(q, η)` must be differentiable by the selected AD framework.
+- The variational approximation implements `entropy`.
+"""
+struct StickingTheLandingEntropyZeroGradient <: AbstractEntropyEstimator end
 
 function estimate_entropy(
-    ::Union{MonteCarloEntropy,StickingTheLandingEntropy}, mc_samples::AbstractMatrix, q
+    ::StickingTheLandingEntropyZeroGradient, mc_samples::AbstractMatrix, q, q_stop
 )
-    mean(eachcol(mc_samples)) do mc_sample
-        -logpdf(q, mc_sample)
+    # Sticking-the-landing term: the density is evaluated on the detached `q_stop`.
+    entropy_stl = mean(eachcol(mc_samples)) do mc_sample
+        -logpdf(q_stop, mc_sample)
     end
+    # `entropy(q)` and `entropy(q_stop)` are the same closed form on `q` and on its detached
+    # copy `q_stop`; only the former is seen by AD (the zero-mean-gradient correction).
+    return entropy_stl - entropy(q) + entropy(q_stop)
 end
diff --git a/src/objectives/elbo/repgradelbo.jl b/src/objectives/elbo/repgradelbo.jl
@@ -67,13 +67,6 @@ function Base.show(io::IO, obj::RepGradELBO)
     return print(io, ")")
 end
 
-function estimate_entropy_maybe_stl(
-    entropy_estimator::AbstractEntropyEstimator, samples, q, q_stop
-)
-    q_maybe_stop = maybe_stop_entropy_score(entropy_estimator, q, q_stop)
-    return estimate_entropy(entropy_estimator, samples, q_maybe_stop)
-end
-
 function estimate_energy_with_samples(prob, samples)
     return mean(Base.Fix1(LogDensityProblems.logdensity, prob), eachsample(samples))
 end
@@ -98,7 +91,7 @@ function reparam_with_entropy(
     rng::Random.AbstractRNG, q, q_stop, n_samples::Int, ent_est::AbstractEntropyEstimator
 )
     samples = rand(rng, q, n_samples)
-    entropy = estimate_entropy_maybe_stl(ent_est, samples, q, q_stop)
+    entropy = estimate_entropy(ent_est, samples, q, q_stop)
     return samples, entropy
 end
 
diff --git a/src/optimization/clip_scale.jl b/src/optimization/clip_scale.jl
@@ -9,11 +9,11 @@ Optimisers.@def struct ClipScale <: AbstractOperator
     epsilon = 1e-5
 end
 
-function apply(::ClipScale, family::Type, params, restructure)
+function apply(::ClipScale, family::Type, state, params, restructure)
     return error("`ClipScale` is not defined for the variational family of type $(family).")
 end
 
-function apply(op::ClipScale, ::Type{<:MvLocationScale}, params, restructure)
+function apply(op::ClipScale, ::Type{<:MvLocationScale}, state, params, restructure)
     q = restructure(params)
     ϵ = convert(eltype(params), op.epsilon)
 
@@ -26,7 +26,7 @@ function apply(op::ClipScale, ::Type{<:MvLocationScale}, params, restructure)
     return params
 end
 
-function apply(op::ClipScale, ::Type{<:MvLocationScaleLowRank}, params, restructure)
+function apply(op::ClipScale, ::Type{<:MvLocationScaleLowRank}, state, params, restructure)
     q = restructure(params)
     ϵ = convert(eltype(params), op.epsilon)
 
diff --git a/src/optimization/proximal_location_scale_entropy.jl b/src/optimization/proximal_location_scale_entropy.jl
@@ -0,0 +1,61 @@
+
+"""
+    ProximalLocationScaleEntropy()
+
+Proximal operator for the entropy of a location-scale distribution, which is defined as
+```math
+    \\mathrm{prox}(\\lambda) = \\argmin_{\\lambda^{\\prime}} - \\mathbb{H}(q_{\\lambda^{\\prime}}) + \\frac{1}{2 \\gamma_t} \\left\\lVert \\lambda - \\lambda^{\\prime} \\right\\rVert_2^2 ,
+```
+where \$\\gamma_t\$ is the stepsize of the optimizer used with the proximal operator.
+This assumes the variational family is `<:MvLocationScale` and the optimizer is one of the following:
+- `DoG`
+- `DoWG`
+- `Descent`
+
+For ELBO maximization, since this proximal operator handles the entropy, the gradient estimator for the ELBO must ignore the entropy term.
+That is, the `entropy` keyword argument of `RepGradELBO` must be one of the following:
+- `ClosedFormEntropyZeroGradient`
+- `StickingTheLandingEntropyZeroGradient`
+"""
+struct ProximalLocationScaleEntropy <: AbstractOperator end
+
+function apply(::ProximalLocationScaleEntropy, ::Any, ::Any, ::Any, ::Any)
+    return error("`ProximalLocationScaleEntropy` only supports `<:MvLocationScale`.")
+end
+
+# Stepsize used by an optimizer rule, recovered from the rule and its state.
+# Rules without a dedicated method are rejected with an error naming the rule type.
+function stepsize_from_optimizer_state(rule::Optimisers.AbstractRule, ::Any)
+    return error(
+        "`ProximalLocationScaleEntropy` does not support optimization rule $(typeof(rule))."
+    )
+end
+
+# The stepsize is read from the rule's `eta` field; the state is not consulted.
+stepsize_from_optimizer_state(rule::Descent, ::Any) = rule.eta
+
+# NOTE(review): both rules are assumed to keep a three-element state whose second and
+# third entries are the `v` and `r` used below; the `DoG`/`DoWG` definitions are not
+# visible here, so confirm the layout against them.
+stepsize_from_optimizer_state(::DoG, (_, v, r)) = r / sqrt(v)
+
+stepsize_from_optimizer_state(::DoWG, (_, v, r)) = r * r / sqrt(v)
+
+function apply(
+    ::ProximalLocationScaleEntropy,
+    ::Type{<:MvLocationScale},
+    leaf::Optimisers.Leaf,
+    params,
+    restructure,
+)
+    q = restructure(params)
+    # Any `Leaf` is accepted: an unsupported rule then fails in the next call with an
+    # error naming the rule, not in the fallback method that blames the family.
+    stepsize = stepsize_from_optimizer_state(leaf.rule, leaf.state)
+    diag_idx = diagind(q.scale)
+    scale_diag = q.scale[diag_idx]
+    # Each diagonal entry s becomes the positive root of x^2 - s * x - stepsize = 0.
+    @. q.scale[diag_idx] = scale_diag + (sqrt(scale_diag^2 + 4 * stepsize) - scale_diag) / 2
+    params, _ = Optimisers.destructure(q)
+    return params
+end
diff --git a/src/optimize.jl b/src/optimize.jl
@@ -92,7 +92,7 @@ function optimize(
 
         grad = DiffResults.gradient(grad_buf)
         opt_st, params = Optimisers.update!(opt_st, params, grad)
-        params = apply(operator, typeof(q_init), params, restructure)
+        params = apply(operator, typeof(q_init), opt_st, params, restructure)
         avg_st = apply(averager, avg_st, params)
 
         if !isnothing(callback)
diff --git a/test/inference/repgradelbo_distributionsad.jl b/test/inference/repgradelbo_distributionsad.jl
@@ -1,3 +1,4 @@
+
 AD_repgradelbo_distributionsad = if TEST_GROUP == "Enzyme"
     Dict(
         :Enzyme => AutoEnzyme(;
diff --git a/test/inference/repgradelbo_proximal_locationscale.jl b/test/inference/repgradelbo_proximal_locationscale.jl
diff --git a/test/inference/repgradelbo_proximal_locationscale_bijectors.jl b/test/inference/repgradelbo_proximal_locationscale_bijectors.jl
diff --git a/test/interface/clip_scale.jl b/test/interface/clip_scale.jl
diff --git a/test/interface/proximal_location_scale_entropy.jl b/test/interface/proximal_location_scale_entropy.jl
diff --git a/test/runtests.jl b/test/runtests.jl