Gaussian Process Regression (for large data)

Loading necessary packages

using AugmentedGaussianProcesses
const AGP = AugmentedGaussianProcesses
using Distributions
using Plots

We create a toy dataset with X ∈ [-20, 20] and y = 5 * sinc(X) plus a little Gaussian noise (Julia's sinc is the normalized sinc function, sinc(x) = sin(πx)/(πx))

N = 1000
X = reshape((sort(rand(N)) .- 0.5) * 40.0, N, 1)
σ = 0.01

function latent(x)
    return 5.0 * sinc.(x)
end
Y = vec(latent(X) + σ * randn(N));

Visualization of the data:

scatter(X, Y; lab="")

Gaussian noise

In this first example we are going to look at the effect of using inducing points compared to the true Gaussian process. For simplicity, we will keep all inducing point locations and kernel parameters fixed.

Ms = [4, 8, 16, 32, 64];
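
To build some intuition for why a small number of inducing points can summarize the whole dataset, here is a minimal sketch of the low-rank (Nyström-style) kernel approximation that underlies sparse GPs. It calls KernelFunctions.jl directly (the kernel library AGP builds on); this is illustration only, not part of the SVGP training path, and the variable names are ours, not AGP's API.

using KernelFunctions, LinearAlgebra

k = SqExponentialKernel()
x = collect(range(-20, 20; length=200)) # "data" locations
z = collect(range(-20, 20; length=16))  # M = 16 inducing locations
Kmm = kernelmatrix(k, z)                # M × M kernel matrix between inducing points
Knm = kernelmatrix(k, x, z)             # N × M cross-kernel matrix
Kapprox = Knm * (Kmm \ Knm')            # rank-M approximation of the N × N kernel matrix
opnorm(kernelmatrix(k, x) - Kapprox)    # approximation error; it shrinks as M grows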

Create an empty array of GPs

models = Vector{AbstractGP}(undef, length(Ms) + 1);

Choose a kernel

kernel = SqExponentialKernel(); # + PeriodicKernel()
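
The commented-out + PeriodicKernel() hints that kernels compose algebraically. As a small aside (assuming the KernelFunctions.jl API, which AGP re-exports), you could for instance sum two kernels or set an explicit lengthscale:

using KernelFunctions

k_sum = SqExponentialKernel() + PeriodicKernel()        # sum of two kernels
k_scaled = with_lengthscale(SqExponentialKernel(), 2.0) # squared-exponential with lengthscale 2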

And run sparse regression with an increasing number of inducing points

for (index, num_inducing) in enumerate(Ms)
    @info "Training with $(num_inducing) points"
    m = SVGP(
        X,
        Y, # First arguments are the input and output
        kernel, # Kernel
        GaussianLikelihood(σ), # Likelihood used
        AnalyticVI(), # Inference used to solve the problem
        num_inducing; # Number of inducing points used
        optimiser=false, # Keep kernel parameters fixed
        Zoptimiser=false, # Keep inducing points locations fixed
    )
    @time train!(m, 100) # Train the model for 100 iterations
    models[index] = m # Save the model in the array
end
[ Info: Training with 4 points
  0.012984 seconds (9.38 k allocations: 15.641 MiB)
[ Info: Training with 8 points
  0.033302 seconds (9.38 k allocations: 25.678 MiB, 52.86% gc time)
[ Info: Training with 16 points
  0.035638 seconds (9.38 k allocations: 46.377 MiB, 34.82% gc time)
[ Info: Training with 32 points
  0.050051 seconds (9.38 k allocations: 90.406 MiB, 26.90% gc time)
[ Info: Training with 64 points
  0.108230 seconds (10.29 k allocations: 189.085 MiB, 23.96% gc time)

Train the model without any inducing points (no approximation)

@info "Training with full model"
mfull = GP(
    X,
    Y,
    kernel;
    noise=σ,
    opt_noise=false, # Keep the noise value fixed
    optimiser=false, # Keep kernel parameters fixed
)
@time train!(mfull, 5);
models[end] = mfull;
[ Info: Training with full model
  0.763739 seconds (238 allocations: 366.450 MiB, 6.79% gc time)

Create a grid and compute predictions on it

function compute_grid(model, n_grid=50)
    mins = -20
    maxs = 20
    x_grid = range(mins, maxs; length=n_grid) # Create a grid
    y_grid, sig_y_grid = proba_y(model, reshape(x_grid, :, 1)) # Predict the mean and variance on the grid
    return y_grid, sig_y_grid, x_grid
end;
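
As a quick sanity check (illustrative only; the model and data names come from this tutorial), proba_y can also be called on a handful of test inputs directly, returning the predictive mean and variance of y:

x_test = reshape([-10.0, 0.0, 10.0], :, 1)
μ_sparse, σ²_sparse = proba_y(models[1], x_test) # sparsest model (M = 4)
μ_full, σ²_full = proba_y(models[end], x_test)   # exact GP
μ_sparse, μ_full # compare the two predictive means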

Plot the data as a scatter plot

function plotdata(X, Y)
    return Plots.scatter(X, Y; alpha=0.33, msw=0.0, lab="", size=(300, 500))
end

function plot_model(model, X, Y, title=nothing)
    n_grid = 100
    y_grid, sig_y_grid, x_grid = compute_grid(model, n_grid)
    title = if isnothing(title)
        (model isa SVGP ? "M = $(dim(model.f[1]))" : "full")
    else
        title
    end

    p = plotdata(X, Y)
    Plots.plot!(
        p,
        x_grid,
        y_grid;
        ribbon=2 * sqrt.(sig_y_grid), # Plot 2 std deviations
        title=title,
        color="red",
        lab="",
        linewidth=3.0,
    )
    if model isa SVGP # Plot the inducing points as well
        Plots.plot!(
            p,
            vec(model.f[1].Z),
            zeros(dim(model.f[1]));
            msize=2.0,
            color="black",
            t=:scatter,
            lab="",
        )
    end
    return p
end;

Plots.plot(
    plot_model.(models, Ref(X), Ref(Y))...; layout=(1, length(models)), size=(1000, 200)
) # Plot all models and combine the plots
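
As a follow-up, one way to quantify what the plots show is to compare each model's predictive mean against the noise-free latent function. This is a sketch built from the helpers defined above (the RMSE loop is ours, not part of the AGP API):

using Statistics

x_test = collect(range(-20, 20; length=200))
f_true = latent(x_test)
for (m, name) in zip(models, [string.("M = ", Ms); "full"])
    μ, _ = proba_y(m, reshape(x_test, :, 1)) # predictive mean on the test grid
    println(name, ": RMSE = ", sqrt(mean(abs2, μ - f_true)))
end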