Gaussian Process important derivations

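The KL divergence between the variational Gaussian \(q = \mathcal{N}(\mu, \Sigma)\) and the Gaussian \(p = \mathcal{N}(\mu_0, K)\), both of dimension \(d\), is

\[ \text{KL}(q||p) = \frac{1}{2}\left[\log|K| - \log|\Sigma| - d + \text{tr} (K^{-1}\Sigma) + (\mu_0 - \mu)^\top K^{-1}(\mu_0 - \mu)\right]. \]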
using KernelFunctions, LinearAlgebra, ForwardDiff, Plots, LaTeXStrings
# Dimension and randomly drawn parameters of the two Gaussians.
D = 3
μ, μ₀ = randn(D), randn(D)
# A * A' + I is symmetric positive definite for any real square A.
Σ = let A = rand(D, D)
    A * A' + I
end
K = let B = rand(D, D)
    B * B' + I
end
L = cholesky(Σ).L # Lower-triangular Cholesky factor of Σ
"""
    KL(μ, Σ::AbstractMatrix, μ₀, K)

Return the KL divergence evaluated from the mean `μ` and covariance `Σ` of `q` and the
mean `μ₀` and covariance `K` of `p`:

    0.5 * (log|K| - log|Σ| - d + tr(K⁻¹Σ) + (μ₀ - μ)ᵀ K⁻¹ (μ₀ - μ))

where `d = length(μ)`.
"""
function KL(μ, Σ::AbstractMatrix, μ₀, K)
    δ = μ₀ - μ
    # Linear solves against K avoid forming inv(K) explicitly.
    return 0.5 * (logdet(K) - logdet(Σ) - length(μ) + tr(K \ Σ) + dot(δ, K \ δ))
end
# Print the divergence for the parameters drawn above (covariance form).
@show KL(μ, Σ, μ₀, K)
KL(μ, Σ, μ₀, K) = 3.167850463946593

Using instead the Cholesky factor \(L\), where \(LL^\top = \Sigma\), the divergence becomes:

\[\begin{aligned} \text{KL}(q||p) = \frac{1}{2}\left[\log|K| - 2 \log|L| - d + \text{tr} (L^\top K^{-1}L) + (\mu_0 - \mu)^T K^{-1}(\mu_0 - \mu)\right]. \end{aligned}\]
# Lower-triangular Cholesky factor of Σ, i.e. L * L' ≈ Σ.
L = cholesky(Σ).L
"""
    KL(μ, L::LowerTriangular, μ₀, K)

Return the KL divergence evaluated from the mean `μ` and the lower-triangular factor `L`
(with `L * L'` the covariance of `q`) and the mean `μ₀` and covariance `K` of `p`:

    0.5 * (log|K| - 2 log|L| - d + tr(Lᵀ K⁻¹ L) + (μ₀ - μ)ᵀ K⁻¹ (μ₀ - μ))

where `d = length(μ)`.
"""
function KL(μ, L::LowerTriangular, μ₀, K)
    δ = μ₀ - μ
    Lm = Matrix(L)
    # tr(Lᵀ K⁻¹ L) is the elementwise inner product of L with K⁻¹L; solving against K
    # avoids forming inv(K) explicitly.
    return 0.5 * (logdet(K) - 2logdet(L) - length(μ) + dot(Lm, K \ Lm) + dot(δ, K \ δ))
end
# Print the divergence again, this time through the Cholesky-factor form.
@show KL(μ, L, μ₀, K)
KL(μ, L, μ₀, K) = 3.167850463946593

Derivatives of the KL divergence with respect to the variational parameters

Gradients

Gradients with respect to \(\mu\), \(\Sigma\) and \(L\):

\[\begin{aligned} \frac{d\text{KL}}{d\mu} =& K^{-1}(\mu-\mu_0)\\ \frac{d\text{KL}}{d\Sigma} =& \frac{1}{2}\left(-\Sigma^{-1} + K^{-1}\right)\\ \frac{d\text{KL}}{dL} =& \frac{1}{2}\left(-2\text{diag}(L)^{-1} + 2K^{-1}L\right) \end{aligned}\]
"""
    dKL_dμ(K, μ, μ₀)

Analytic gradient of the KL divergence with respect to `μ`, i.e. `K⁻¹(μ - μ₀)`.
Computed with a linear solve instead of an explicit inverse of `K`.
"""
dKL_dμ(K, μ, μ₀) = K \ (μ - μ₀) # Analytic Formulation
# Closed-form gradient with respect to μ.
analytic = dKL_dμ(K, μ, μ₀)
# The same gradient obtained through ForwardDiff.
autodiff = ForwardDiff.gradient(x -> KL(x, Σ, μ₀, K), μ)

"""
    dKL_dΣ(K, Σ)

Analytic gradient of the KL divergence with respect to `Σ`: `(K⁻¹ - Σ⁻¹) / 2`.
"""
function dKL_dΣ(K, Σ)
    return (inv(K) - inv(Σ)) / 2
end
# Closed-form gradient with respect to Σ.
analytic = dKL_dΣ(K, Σ)
# The same gradient obtained through ForwardDiff.
autodiff = ForwardDiff.gradient(x -> KL(μ, x, μ₀, K), Σ)
"""
    dKL_dL(K, L)

Analytic gradient of the KL divergence with respect to the factor `L`:
`-diag(L)⁻¹ + K⁻¹L`, returned as a `LowerTriangular` matrix.
"""
function dKL_dL(K, L)
    # K \\ L replaces inv(K) * L; the 0.5 and 2 factors of the written formula cancel.
    grad = K \ Matrix(L) - inv(Diagonal(diag(L)))
    # Keep only the lower triangle, as in the original formulation.
    return LowerTriangular(grad)
end
# Closed-form gradient with respect to L.
analytic = dKL_dL(K, L)
# The same gradient obtained through ForwardDiff.
autodiff = ForwardDiff.gradient(x -> KL(μ, x, μ₀, K), L)

Hessians

Hessians with respect to \(\mu\), \(\Sigma\) and \(L\):

\[\begin{aligned} \frac{d^2KL}{d\mu\mu} =& K^{-1}\\ \frac{d^2KL}{d\mu d\Sigma} =& 0\\ \frac{d^2KL}{d\Sigma\Sigma} =& \frac{1}{2} \Sigma^{-1}\otimes \Sigma^{-1}\\ \frac{d^2KL}{d\mu dL} =& 0\\ \frac{d^2KL}{dLL} =& \text{diag}\left(\text{vec}\left(\text{diag}(L)^{-2}\right)\right) + I\otimes K^{-1} \end{aligned}\]

where \(\otimes\) denotes the Kronecker product (`kron` in the code below).

"""
    d2KL_dμμ(K)

Analytic Hessian of the KL divergence with respect to `μ`, which is `K⁻¹`.
"""
function d2KL_dμμ(K)
    return inv(K)
end
# Closed-form Hessian with respect to μ.
analytic = d2KL_dμμ(K)
# The same Hessian obtained through ForwardDiff.
autodiff = ForwardDiff.hessian(x -> KL(x, Σ, μ₀, K), μ)
"""
    d2KL_dSS(Σ)

Analytic Hessian of the KL divergence with respect to `Σ`, written as the Kronecker
product `0.5 * kron(Σ⁻¹, Σ⁻¹)`.
"""
function d2KL_dSS(Σ)
    Σinv = inv(Σ) # invert once and reuse for both Kronecker factors
    return 0.5 * kron(Σinv, Σinv)
end
# Closed-form Hessian with respect to Σ.
analytic = d2KL_dSS(Σ)
# Hessian of the same objective obtained through ForwardDiff.
autodiff = ForwardDiff.hessian(x -> KL(μ, x, μ₀, K), Σ)